LLM Router and Gateway: Provider Switching, Fallbacks, and Cost Control
·12 min read
An LLM gateway sits between your application and LLM providers. It handles routing, fallbacks, caching, rate limiting, and observability — letting you switch models or providers without changing application code.
Why build a gateway?
| Problem | Gateway solution |
|---|---|
| Provider outages | Automatic failover to backup provider |
| Cost spikes | Route simple queries to cheaper models |
| Repeated prompts | Semantic cache at gateway level |
| Rate limits | Queue and retry with backoff |
| No observability | Central logging for all LLM calls |
| Provider lock-in | Unified interface across OpenAI/Anthropic/Gemini |
Unified LLM interface
// Normalize different provider APIs to a single interface
/**
 * Provider-neutral chat request handed to every adapter.
 * Only `messages` is required; the other fields are optional tuning knobs.
 */
interface LLMRequest {
  /** Conversation turns in order; each turn carries a role and plain-text content. */
  messages: Array<{
    role: 'system' | 'user' | 'assistant';
    content: string;
  }>;
  /** Optional cap on the number of tokens to generate. */
  maxTokens?: number;
  /** Optional sampling temperature. */
  temperature?: number;
  /** Optional output format selector: plain text or JSON. */
  responseFormat?: 'text' | 'json';
}
/**
 * Provider-neutral completion result produced by the adapters.
 */
interface LLMResponse {
  /** Generated text. */
  content: string;
  /** Model identifier reported back by the provider. */
  model: string;
  /** Name of the provider that served the request (e.g. 'openai', 'anthropic'). */
  provider: string;
  /** Tokens consumed by the prompt. */
  inputTokens: number;
  /** Tokens produced in the completion. */
  outputTokens: number;
  /** Wall-clock duration of the provider call, in milliseconds. */
  latencyMs: number;
  /** Whether the response was served from a cache rather than a live provider call. */
  cached: boolean;
}
// Provider adapters
/**
 * Calls the OpenAI Chat Completions API and normalizes the result to `LLMResponse`.
 *
 * @param req - Provider-neutral request; `responseFormat: 'json'` enables JSON-object mode.
 * @param model - OpenAI model identifier to invoke.
 * @returns The normalized response. `content` is `''` when the API returns no message
 *   text, and token counts are `0` when the API omits usage data.
 * @throws Whatever the OpenAI client rejects with (network, auth, rate-limit errors).
 */
async function callOpenAI(req: LLMRequest, model: string): Promise<LLMResponse> {
  const start = Date.now();
  const response = await openai.chat.completions.create({
    model,
    messages: req.messages,
    max_tokens: req.maxTokens,
    temperature: req.temperature,
    response_format: req.responseFormat === 'json' ? { type: 'json_object' } : undefined,
  });

  // `message.content` is nullable (e.g. refusals or tool calls) and `usage` is optional,
  // so fall back to neutral values instead of asserting non-null.
  const content = response.choices[0]?.message?.content ?? '';

  return {
    content,
    model: response.model,
    provider: 'openai',
    inputTokens: response.usage?.prompt_tokens ?? 0,
    outputTokens: response.usage?.completion_tokens ?? 0,
    latencyMs: Date.now() - start,
    cached: false,
  };
}
/**
 * Calls the Anthropic Messages API and normalizes the result to `LLMResponse`.
 *
 * Anthropic takes the system prompt as a separate `system` parameter, so system
 * messages are pulled out of the conversation and joined into one string.
 * `req.responseFormat` is not forwarded; this adapter has no JSON-mode switch.
 *
 * @param req - Provider-neutral request.
 * @param model - Anthropic model identifier to invoke.
 * @returns The normalized response. `content` is the concatenation of all text blocks,
 *   or `''` when the reply contains none.
 * @throws Whatever the Anthropic client rejects with.
 */
async function callAnthropic(req: LLMRequest, model: string): Promise<LLMResponse> {
  const start = Date.now();

  // Honor every system message, not just the first one.
  const systemParts = req.messages.filter(m => m.role === 'system').map(m => m.content);
  const system = systemParts.length > 0 ? systemParts.join('\n\n') : undefined;
  const messages = req.messages.filter(m => m.role !== 'system');

  // Anthropic accepts temperature in [0, 1]; clamp so a value tuned for a wider
  // range does not get the request rejected.
  const temperature =
    req.temperature === undefined ? undefined : Math.min(Math.max(req.temperature, 0), 1);

  const response = await anthropic.messages.create({
    model,
    max_tokens: req.maxTokens ?? 1024,
    system,
    temperature,
    messages: messages as Anthropic.MessageParam[],
  });

  // A reply may contain several content blocks; keep all the text ones.
  let content = '';
  for (const block of response.content) {
    if (block.type === 'text') {
      content += block.text;
    }
  }

  return {
    content,
    model: response.model,
    provider: 'anthropic',
    inputTokens: response.usage.input_tokens,
    outputTokens: response.usage.output_tokens,
    latencyMs: Date.now() - start,
    cached: false,
  };
}
Routing logic
/**
 * One entry in the routing table: a predicate plus the target to use when it matches.
 */
interface RoutingRule {
  /** Returns true when this rule applies to the given request and metadata. */
  condition: (req: LLMRequest, meta: RequestMeta) => boolean;
  /** Model identifier to route to. */
  model: string;
  /** Provider that hosts `model`. */
  provider: 'openai' | 'anthropic' | 'gemini';
  /** Short label explaining why the rule fired. */
  reason: string;
}
/** Builds a rule predicate that matches requests tagged with the given feature. */
const matchesFeature =
  (feature: string): RoutingRule['condition'] =>
  (_req, meta) =>
    meta.feature === feature;

/**
 * Routing table. Rules are listed in priority order and the last entry is a
 * catch-all, so the list always yields a match.
 */
const routingRules: RoutingRule[] = [
  {
    // Long documents → Claude (larger context, better recall)
    reason: 'long-context',
    provider: 'anthropic',
    model: 'claude-sonnet-4-5',
    condition: (request) => countTokens(request.messages) > 50_000,
  },
  {
    // Simple classification → cheap model
    reason: 'cost-optimization',
    provider: 'openai',
    model: 'gpt-4o-mini',
    condition: matchesFeature('classification'),
  },
  {
    // Code generation → GPT-4o
    reason: 'code-quality',
    provider: 'openai',
    model: 'gpt-4o',
    condition: matchesFeature('code-generation'),
  },
  {
    // Default
    reason: 'default',
    provider: 'openai',
    model: 'gpt-4o-mini',
    condition: () => true,
  },
];
/**
 * Picks the model for a request by returning the first routing rule whose
 * condition matches.
 *
 * @param req - The request being routed.
 * @param meta - Request metadata passed to each rule's condition.
 * @returns The matched rule's model, provider, and reason.
 * @throws Error if no rule matches, which means `routingRules` has lost its catch-all entry.
 */
function selectModel(req: LLMRequest, meta: RequestMeta): { model: string; provider: string; reason: string } {
  const rule = routingRules.find(r => r.condition(req, meta));
  // Fail with a clear message instead of a TypeError from dereferencing undefined.
  if (rule === undefined) {
    throw new Error('No routing rule matched the request; routingRules must end with a catch-all rule');
  }
  const { model, provider, reason } = rule;
  return { model, provider, reason };
}
Fallback chain
/** A model/provider pair that can serve a request when an earlier one fails. */
type FallbackTarget = {
  model: string;
  provider: 'openai' | 'anthropic';
};

/** Ordered list of fallback targets: OpenAI models first, then Anthropic models. */
const FALLBACK_CHAIN: FallbackTarget[] = [
  // OpenAI
  { provider: 'openai', model: 'gpt-4o' },
  { provider: 'openai', model: 'gpt-4o-mini' },
  // Anthropic
  { provider: 'anthropic', model: 'claude-sonnet-4-5' },
  { provider: 'anthropic', model: 'claude-haiku-4-5' },
];
/**
 * Calls the routed model and, on failure, walks the remainder of `FALLBACK_CHAIN`.
 *
 * If the routed model appears in the chain, attempts start at its position.
 * If it does not, the routed model is tried first and the whole chain follows.
 *
 * @param req - The request to send.
 * @param meta - Request metadata used for routing.
 * @returns The first successful response.
 * @throws Error('All LLM providers failed') once every candidate has failed.
 */
async function callWithFallback(req: LLMRequest, meta: RequestMeta): Promise<LLMResponse> {
  // Start at the routed model
  const { model: primaryModel, provider: primaryProvider } = selectModel(req, meta);
  const startIndex = FALLBACK_CHAIN.findIndex(f => f.model === primaryModel);

  // findIndex returns -1 for a model outside the chain; slice(-1) would then keep only
  // the LAST entry and the routed model would never be tried.
  const chain: Array<{ model: string; provider: string }> =
    startIndex === -1
      ? [{ model: primaryModel, provider: primaryProvider }, ...FALLBACK_CHAIN]
      : FALLBACK_CHAIN.slice(startIndex);

  for (const { model, provider } of chain) {
    try {
      let response: LLMResponse;
      if (provider === 'openai') {
        response = await callOpenAI(req, model);
      } else if (provider === 'anthropic') {
        response = await callAnthropic(req, model);
      } else {
        // Do not send a model to the wrong vendor; treat it as a failed attempt.
        throw new Error(`No adapter available for provider '${provider}'`);
      }
      logger.info({ event: 'llm_success', model, provider, latencyMs: response.latencyMs });
      return response;
    } catch (err) {
      logger.warn({ event: 'llm_failure', model, provider, error: String(err) });
      // Continue to next in chain
    }
  }
  throw new Error('All LLM providers failed');
}
Semantic caching at gateway
/**
 * Serves a request from the semantic cache when a close match exists, otherwise
 * calls the LLM through the fallback chain and stores the result.
 *
 * The cache key is the embedding of the last user message only.
 * NOTE(review): the key ignores the system prompt, earlier turns, `meta.feature`, and
 * the user, so different contexts can share one entry. Confirm this is acceptable, or
 * scope the cache per feature/tenant.
 *
 * @param req - The request to answer.
 * @param meta - Request metadata forwarded to routing.
 * @returns A cached response (`cached: true`, `latencyMs: 0`) or a fresh one.
 * @throws Propagates errors from embedding, cache lookup, and `callWithFallback`.
 */
async function cachedCall(req: LLMRequest, meta: RequestMeta): Promise<LLMResponse> {
  // Generate semantic cache key from last user message
  const lastUserMessage = [...req.messages].reverse().find(m => m.role === 'user')?.content ?? '';

  // Without user text there is nothing meaningful to key on; embedding '' would make
  // unrelated requests collide, so bypass the cache entirely.
  if (lastUserMessage.trim() === '') {
    return callWithFallback(req, meta);
  }

  const queryEmbedding = await embedText(lastUserMessage);

  // Check semantic cache (cosine similarity >= 0.97)
  const cacheHit = await semanticCache.findSimilar(queryEmbedding, 0.97);
  if (cacheHit) {
    return { ...cacheHit.response, cached: true, latencyMs: 0 };
  }

  // Cache miss — call LLM
  const response = await callWithFallback(req, meta);

  // Store in semantic cache. A failed write must not discard a response we already have.
  // Assumes `store` takes an options object and the TTL is in seconds — TODO confirm
  // against the cache client.
  try {
    await semanticCache.store(queryEmbedding, response, { ttl: 3600 });
  } catch (err) {
    logger.warn({ event: 'cache_store_failure', error: String(err) });
  }
  return response;
}
Gateway as an Express middleware
import express from 'express';
import { authMiddleware } from './auth';
import { rateLimitMiddleware } from './rate-limit';
// Gateway HTTP server: JSON body parsing, then auth, then rate limiting, then the chat route.
const app = express();
app.use(express.json());
app.use(authMiddleware);
app.use(rateLimitMiddleware);

/**
 * POST /v1/chat
 * Body: { messages, maxTokens?, feature? }.
 * Responds 400 on a malformed body, 503 when the LLM call fails, and otherwise
 * returns the content, model, provider, cache flag, and token usage.
 */
app.post('/v1/chat', async (req, res) => {
  // The body is untrusted input; tolerate it being absent.
  const { messages, maxTokens, feature } = req.body ?? {};
  const userId = req.user.id;

  // Reject malformed requests as client errors instead of letting them surface as 503.
  if (!Array.isArray(messages) || messages.length === 0) {
    res.status(400).json({ error: 'messages must be a non-empty array' });
    return;
  }

  try {
    const response = await cachedCall(
      { messages, maxTokens },
      { feature, userId }
    );

    // Log for observability. A logging failure must not fail a successful completion.
    try {
      await logLLMCall({ userId, feature, ...response });
    } catch (logErr) {
      logger.warn({ event: 'llm_log_failure', userId, feature, error: String(logErr) });
    }

    res.json({
      content: response.content,
      model: response.model,
      provider: response.provider,
      cached: response.cached,
      usage: {
        inputTokens: response.inputTokens,
        outputTokens: response.outputTokens,
      },
    });
  } catch (err) {
    logger.error({ event: 'gateway_error', userId, feature, error: String(err) });
    res.status(503).json({ error: 'AI service temporarily unavailable' });
  }
});

app.listen(4000, () => console.log('LLM Gateway running on :4000'));
Managed LLM gateway options
| Service | Highlights |
|---|---|
| LiteLLM | Open-source, 100+ model providers, OpenAI-compatible API |
| Portkey | Managed, semantic cache, fallbacks, observability |
| Helicone | Observability-focused, proxy-based, free tier |
| OpenRouter | Unified billing across providers, large model selection |
| AWS Bedrock | AWS-native, IAM auth, Claude + Titan + Llama |
Takeaway
Start with LiteLLM for a self-hosted OpenAI-compatible proxy with multi-provider support. Build custom routing logic only when you need fine-grained control over model selection by feature type. The semantic cache at the gateway level is the single highest-ROI component for cost reduction at scale.