LLM Router and Gateway: Provider Switching, Fallbacks, and Cost Control

An LLM gateway sits between your application and LLM providers. It handles routing, fallbacks, caching, rate limiting, and observability — letting you switch models or providers without changing application code.

Why build a gateway?

Problem	Gateway solution
Provider outages	Automatic failover to backup provider
Cost spikes	Route cheap queries to cheaper models
Repeated prompts	Semantic cache at gateway level
Rate limits	Queue and retry with backoff
No observability	Central logging for all LLM calls
Provider lock-in	Unified interface across OpenAI/Anthropic/Gemini

Unified LLM interface

/**
 * Provider-agnostic chat request: one shape that is translated into each
 * provider's own API format.
 */
interface LLMRequest {
  /** Conversation turns in order. */
  messages: Array<{
    role: 'system' | 'user' | 'assistant';
    content: string;
  }>;
  /** Optional cap on the number of generated tokens. */
  maxTokens?: number;
  /** Optional sampling temperature. */
  temperature?: number;
  /** Optional output format; 'json' requests JSON output. */
  responseFormat?: 'text' | 'json';
}

/** Normalized completion result, independent of which provider produced it. */
interface LLMResponse {
  /** Generated text. */
  content: string;

  // Where the answer came from
  /** Model identifier reported for the completion. */
  model: string;
  /** Provider name, e.g. 'openai' or 'anthropic'. */
  provider: string;

  // Accounting
  /** Tokens consumed by the prompt. */
  inputTokens: number;
  /** Tokens produced in the completion. */
  outputTokens: number;
  /** Elapsed time for the call, in milliseconds. */
  latencyMs: number;

  /** True when the response was served from a cache rather than a provider. */
  cached: boolean;
}

// Provider adapters
/**
 * Sends a normalized request to the OpenAI chat-completions API and maps the
 * reply to the gateway's LLMResponse shape.
 *
 * @param req   Provider-agnostic request.
 * @param model OpenAI model identifier to invoke.
 * @returns The first choice's text, token usage, and measured latency.
 * @throws Error when the API returns no choices; SDK errors propagate as-is.
 */
async function callOpenAI(req: LLMRequest, model: string): Promise<LLMResponse> {
  const start = Date.now();
  const response = await openai.chat.completions.create({
    model,
    messages: req.messages,
    max_tokens: req.maxTokens,
    temperature: req.temperature,
    response_format: req.responseFormat === 'json' ? { type: 'json_object' } : undefined,
  });

  // Fail loudly instead of crashing on `undefined.message` further down.
  const choice = response.choices[0];
  if (!choice) {
    throw new Error(`OpenAI returned no choices for model ${model}`);
  }

  return {
    // `content` is null for tool-call / refusal replies; normalize to '' so the
    // declared `string` type holds at runtime.
    content:      choice.message.content ?? '',
    model:        response.model,
    provider:     'openai',
    // `usage` is optional in the response; report 0 rather than throwing.
    inputTokens:  response.usage?.prompt_tokens ?? 0,
    outputTokens: response.usage?.completion_tokens ?? 0,
    latencyMs:    Date.now() - start,
    cached:       false,
  };
}

/**
 * Sends a normalized request to the Anthropic Messages API and maps the reply
 * to the gateway's LLMResponse shape.
 *
 * System messages are lifted out of the message list into the top-level
 * `system` parameter. `req.responseFormat` is not forwarded by this adapter.
 *
 * @param req   Provider-agnostic request.
 * @param model Anthropic model identifier to invoke.
 * @returns Concatenated text blocks, token usage, and measured latency.
 */
async function callAnthropic(req: LLMRequest, model: string): Promise<LLMResponse> {
  const start = Date.now();

  // Forward every system message, not just the first one.
  const systemParts = req.messages.filter(m => m.role === 'system').map(m => m.content);
  const system = systemParts.length > 0 ? systemParts.join('\n\n') : undefined;
  const messages = req.messages.filter(m => m.role !== 'system');

  // Anthropic accepts temperature in [0, 1]; the shared request type may carry
  // OpenAI-style values up to 2, so clamp instead of letting the call 400.
  const temperature =
    req.temperature === undefined ? undefined : Math.min(Math.max(req.temperature, 0), 1);

  const response = await anthropic.messages.create({
    model,
    max_tokens: req.maxTokens ?? 1024,
    temperature,
    system,
    messages: messages as Anthropic.MessageParam[],
  });

  // A reply may contain several text blocks; keep all of them.
  let content = '';
  for (const block of response.content) {
    if (block.type === 'text') {
      content += block.text;
    }
  }

  return {
    content,
    model:        response.model,
    provider:     'anthropic',
    inputTokens:  response.usage.input_tokens,
    outputTokens: response.usage.output_tokens,
    latencyMs:    Date.now() - start,
    cached:       false,
  };
}

Routing logic

/** One entry in the routing table: a predicate plus the target it selects. */
interface RoutingRule {
  /** Returns true when this rule applies to the given request and metadata. */
  condition: (req: LLMRequest, meta: RequestMeta) => boolean;

  // Target selected when the condition holds
  /** Model identifier to use. */
  model: string;
  /** Provider that serves the model. */
  provider: 'openai' | 'anthropic' | 'gemini';

  /** Short label explaining why the rule was chosen. */
  reason: string;
}

/**
 * Routing table. Order matters: put more specific rules first and keep the
 * catch-all entry last.
 */
const routingRules: RoutingRule[] = [
  {
    // Prompts above 50k tokens go to Claude (larger context, better recall).
    condition: ({ messages }) => countTokens(messages) > 50_000,
    model: 'claude-sonnet-4-5',
    provider: 'anthropic',
    reason: 'long-context',
  },
  {
    // Classification is simple enough for the cheap model.
    condition: (_req, { feature }) => feature === 'classification',
    model: 'gpt-4o-mini',
    provider: 'openai',
    reason: 'cost-optimization',
  },
  {
    // Code generation is routed to GPT-4o.
    condition: (_req, { feature }) => feature === 'code-generation',
    model: 'gpt-4o',
    provider: 'openai',
    reason: 'code-quality',
  },
  {
    // Catch-all: always matches.
    condition: () => true,
    model: 'gpt-4o-mini',
    provider: 'openai',
    reason: 'default',
  },
];

/**
 * Picks the model for a request by returning the first routing rule whose
 * condition matches.
 *
 * @param req  The normalized request.
 * @param meta Request metadata passed to each rule's condition.
 * @returns The matched rule's model, provider, and reason.
 * @throws Error when no rule matches (i.e. the table has no catch-all entry).
 */
function selectModel(req: LLMRequest, meta: RequestMeta): { model: string; provider: string; reason: string } {
  const rule = routingRules.find(r => r.condition(req, meta));
  if (!rule) {
    // Explicit failure instead of an opaque "cannot read properties of undefined".
    throw new Error('No routing rule matched the request; add a default rule');
  }
  const { model, provider, reason } = rule;
  return { model, provider, reason };
}

Fallback chain

/** Ordered model/provider pairs; later entries are the fallbacks for earlier ones. */
const FALLBACK_CHAIN: { model: string; provider: 'openai' | 'anthropic' }[] = [
  // OpenAI models
  { model: 'gpt-4o', provider: 'openai' },
  { model: 'gpt-4o-mini', provider: 'openai' },
  // Anthropic models
  { model: 'claude-sonnet-4-5', provider: 'anthropic' },
  { model: 'claude-haiku-4-5', provider: 'anthropic' },
];

/**
 * Calls the routed model and, on failure, walks the rest of the fallback chain
 * until one call succeeds.
 *
 * @param req  The normalized request.
 * @param meta Request metadata used for routing.
 * @returns The first successful response.
 * @throws Error('All LLM providers failed') when every attempt fails.
 */
async function callWithFallback(req: LLMRequest, meta: RequestMeta): Promise<LLMResponse> {
  // Start at the routed model
  const { model: primaryModel, provider: primaryProvider } = selectModel(req, meta);
  const startIndex = FALLBACK_CHAIN.findIndex(f => f.model === primaryModel);

  // findIndex returns -1 for a model that is not in the chain; slice(-1) would
  // then try only the LAST entry and never the routed model. In that case try
  // the routed model first, followed by the whole chain.
  const chain: Array<{ model: string; provider: string }> =
    startIndex >= 0
      ? FALLBACK_CHAIN.slice(startIndex)
      : [{ model: primaryModel, provider: primaryProvider }, ...FALLBACK_CHAIN];

  for (const { model, provider } of chain) {
    try {
      let response: LLMResponse;
      if (provider === 'openai') {
        response = await callOpenAI(req, model);
      } else if (provider === 'anthropic') {
        response = await callAnthropic(req, model);
      } else {
        // Don't send another provider's model name to the Anthropic adapter;
        // treat it as a failed attempt so the loop moves on.
        throw new Error(`No adapter available for provider "${provider}"`);
      }
      logger.info({ event: 'llm_success', model, provider, latencyMs: response.latencyMs });
      return response;
    } catch (err) {
      logger.warn({ event: 'llm_failure', model, provider, error: String(err) });
      // Continue to next in chain
    }
  }

  throw new Error('All LLM providers failed');
}

Semantic caching at gateway

/**
 * Serves a request from the semantic cache when a sufficiently similar prompt
 * has been seen, otherwise calls the LLM (with fallbacks) and caches the result.
 *
 * NOTE(review): the cache key is derived from the last user message only, so
 * requests that differ in system prompt or earlier turns can share an entry —
 * confirm this is acceptable for every feature using the gateway.
 *
 * @param req  The normalized request.
 * @param meta Request metadata forwarded to routing.
 * @returns A cached response (cached: true, latencyMs: 0) or a fresh one.
 */
async function cachedCall(req: LLMRequest, meta: RequestMeta): Promise<LLMResponse> {
  // Generate semantic cache key from last user message
  const lastUserMessage = [...req.messages].reverse().find(m => m.role === 'user')?.content ?? '';

  // With no user text there is nothing meaningful to key on; every such request
  // would collide on the embedding of '', so bypass the cache entirely.
  if (lastUserMessage === '') {
    return callWithFallback(req, meta);
  }

  const queryEmbedding = await embedText(lastUserMessage);

  // Check semantic cache (cosine similarity >= 0.97)
  const cacheHit = await semanticCache.findSimilar(queryEmbedding, 0.97);
  if (cacheHit) {
    return { ...cacheHit.response, cached: true, latencyMs: 0 };
  }

  // Cache miss — call LLM
  const response = await callWithFallback(req, meta);

  // Store in semantic cache for one hour. The original `ttl: 3600` as a bare
  // argument is not valid syntax; an options object is assumed here —
  // TODO confirm against the semanticCache.store signature.
  await semanticCache.store(queryEmbedding, response, { ttl: 3600 });
  return response;
}

Gateway as a standalone Express service

import express from 'express';
import { authMiddleware } from './auth';
import { rateLimitMiddleware } from './rate-limit';

// HTTP front end for the gateway: JSON body parsing, then auth and rate
// limiting, then a single chat endpoint.
const app = express();
app.use(express.json());
app.use(authMiddleware);
app.use(rateLimitMiddleware);

/**
 * POST /v1/chat — runs a chat request through the cache and fallback chain.
 * Responds 400 on a malformed body and 503 when the LLM call itself fails.
 */
app.post('/v1/chat', async (req, res) => {
  // Guard against a missing body so destructuring cannot throw.
  const { messages, maxTokens, feature } = req.body ?? {};
  const userId = req.user.id;

  // Reject bad input up front; otherwise it surfaces as a misleading 503.
  if (!Array.isArray(messages) || messages.length === 0) {
    res.status(400).json({ error: 'messages must be a non-empty array' });
    return;
  }

  try {
    const response = await cachedCall(
      { messages, maxTokens },
      { feature, userId }
    );

    // Log for observability. A logging failure must not turn a successful
    // completion into an error response, so it is caught separately.
    try {
      await logLLMCall({ userId, feature, ...response });
    } catch (logErr) {
      logger.warn({ event: 'llm_log_failure', userId, feature, error: String(logErr) });
    }

    res.json({
      content:      response.content,
      model:        response.model,
      provider:     response.provider,
      cached:       response.cached,
      usage: {
        inputTokens:  response.inputTokens,
        outputTokens: response.outputTokens,
      },
    });
  } catch (err) {
    logger.error({ event: 'gateway_error', userId, feature, error: String(err) });
    res.status(503).json({ error: 'AI service temporarily unavailable' });
  }
});

app.listen(4000, () => console.log('LLM Gateway running on :4000'));

Managed and open-source LLM gateway options

Service	Highlights
LiteLLM	Open-source, 100+ model providers, OpenAI-compatible API
Portkey	Managed, semantic cache, fallbacks, observability
Helicone	Observability-focused, proxy-based, free tier
OpenRouter	Unified billing across providers, large model selection
AWS Bedrock	AWS-native, IAM auth, Claude + Titan + Llama

Takeaway

Start with LiteLLM for a self-hosted OpenAI-compatible proxy with multi-provider support. Build custom routing logic only when you need fine-grained control over model selection by feature type. The semantic cache at the gateway level is the single highest-ROI component for cost reduction at scale.