AI Chatbot Architecture: From Prototype to Production
·13 min read
A prototype chatbot is 10 lines of code. A production chatbot handles thousands of concurrent users, maintains conversation state, integrates with your data, and degrades gracefully under load. This guide covers the architecture gap between the two.
Architecture overview
// Production chatbot layers:
//
// 1. Channel layer: Web, mobile, Slack, WhatsApp, voice
// 2. Gateway layer: Auth, rate limiting, session routing
// 3. Orchestration: Context assembly, tool routing, LLM selection
// 4. Memory layer: Session state, long-term user memory, entity store
// 5. Knowledge layer: RAG vector store, knowledge base, product catalog
// 6. LLM layer: Model selection, fallback, caching
// 7. Integration layer: CRM, ticketing, database, external APIs
Session and state management
/**
 * State carried by a single conversation: every conversation is one session
 * with its own state.
 */
interface ChatSession {
  /** Identifier of this conversation. */
  sessionId: string;
  /** Identifier of the user the conversation belongs to. */
  userId: string;
  /** Channel the conversation takes place on. */
  channel: 'web' | 'mobile' | 'slack' | 'api';
  /** Messages exchanged so far. */
  messages: Message[];
  /** Compressed form of the conversation history, when one exists. */
  summary?: string;
  /** User profile and preferences. */
  userContext: UserContext;
  /** Facts extracted from the conversation, keyed by fact name. */
  entityMemory: Record<string, string>;
  /** When the session was created. */
  createdAt: Date;
  /** When the session was last active. */
  lastActiveAt: Date;
}
// Redis for active sessions (fast, ephemeral)
/**
 * Returns the active session stored under `session:<sessionId>` in Redis, or
 * creates, caches and returns a new one when no entry exists.
 *
 * On a cache hit the entry's TTL is extended to 3600 seconds. On a miss the
 * user's context is loaded via `loadUserContext` and the new session is cached
 * with a 3600-second TTL.
 *
 * @param sessionId - Identifier of the conversation.
 * @param userId - Identifier of the requesting user.
 * @param channel - Channel recorded on a newly created session. Defaults to
 *   `'web'`; ignored when an existing session is found.
 * @returns The cached or newly created session, with `createdAt` and
 *   `lastActiveAt` as real `Date` instances.
 * @throws Error when the cached session belongs to a different user.
 */
async function getOrCreateSession(
  sessionId: string,
  userId: string,
  channel: ChatSession['channel'] = 'web',
): Promise<ChatSession> {
  const key = `session:${sessionId}`;
  const cached = await redis.get<ChatSession | string>(key);

  if (cached) {
    // Sessions are written with JSON.stringify. Depending on the Redis client
    // the value may come back already parsed or as raw JSON text; accept both.
    const stored: ChatSession = typeof cached === 'string' ? JSON.parse(cached) : cached;

    // Never hand one user's conversation to another user who happens to know
    // (or guess) the session id.
    if (stored.userId !== userId) {
      throw new Error(`Session ${sessionId} does not belong to the requesting user`);
    }

    await redis.expire(key, 3600); // extend TTL on access

    // JSON serialisation turns Date fields into strings; restore them so the
    // value matches the ChatSession type.
    return {
      ...stored,
      createdAt: new Date(stored.createdAt),
      lastActiveAt: new Date(stored.lastActiveAt),
    };
  }

  const now = new Date();
  const session: ChatSession = {
    sessionId,
    userId,
    channel,
    messages: [],
    userContext: await loadUserContext(userId),
    entityMemory: {},
    createdAt: now,
    lastActiveAt: new Date(now),
  };
  await redis.setex(key, 3600, JSON.stringify(session));
  return session;
}
/**
 * Stamps the session as active now, refreshes its Redis entry (3600-second
 * TTL) and upserts it into the database for long-term history.
 *
 * Note: mutates `session.lastActiveAt`.
 *
 * @param session - The session to persist.
 */
async function saveSession(session: ChatSession): Promise<void> {
  session.lastActiveAt = new Date();
  await redis.setex(`session:${session.sessionId}`, 3600, JSON.stringify(session));

  // Persist to database for long-term history
  await db.chatSessions.upsert({
    where: { sessionId: session.sessionId },
    update: {
      messages: session.messages,
      // The summary and extracted facts change as the conversation goes on;
      // without them here they would only ever be stored by the initial create.
      summary: session.summary,
      entityMemory: session.entityMemory,
      lastActiveAt: session.lastActiveAt,
    },
    create: session,
  });
}

// Context assembly pipeline
/**
 * Builds the message list sent to the model for one user turn.
 *
 * The result is: one system message (system prompt, then any retrieved
 * knowledge, then any known user facts), followed by the windowed message
 * history, followed by the new user message.
 *
 * @param session - Session supplying user context, entity memory and history.
 * @param userMessage - The message the user just sent.
 * @returns The messages to send, in order.
 */
async function assembleContext(session: ChatSession, userMessage: string): Promise<Message[]> {
  // 1. System prompt with user context injection
  const systemPrompt = buildSystemPrompt(session.userContext);

  // 2. Retrieve relevant knowledge (RAG)
  const relevantDocs = await retrieveRelevantDocs(userMessage, session.userId);

  // 3. Inject entity memory, one "key: value" fact per line
  const entityContext = Object.entries(session.entityMemory)
    .map(([key, value]) => `${key}: ${value}`)
    .join('\n');

  // 4. Assemble system message. Empty sections are dropped entirely so the
  //    prompt never contains dangling blank lines.
  const sections = [
    systemPrompt,
    relevantDocs.length ? `Relevant knowledge:\n${relevantDocs.join('\n')}` : '',
    entityContext ? `Known user facts:\n${entityContext}` : '',
  ].filter((section) => section !== '');

  const systemMessage: Message = {
    role: 'system',
    content: sections.join('\n\n').trim(),
  };

  // 5. Build message history (sliding window).
  //    NOTE(review): 60_000 is presumably a token budget — confirm against slidingWindow.
  const history = slidingWindow(session.messages, 60_000);

  return [systemMessage, ...history, { role: 'user', content: userMessage }];
}

// Entity extraction from conversation
// Extract and store facts mentioned in conversation
/**
 * Asks the model (`gpt-4o-mini`, JSON response format, at most 200 tokens) for
 * any new user facts contained in `userMessage`, given the facts already held
 * in `session.entityMemory`.
 *
 * Extraction is best-effort: an empty reply, a reply that is not valid JSON
 * (for example one cut off by the token limit), or a reply that is not a JSON
 * object all yield `{}` instead of throwing.
 *
 * @param userMessage - The message to extract facts from.
 * @param session - Session whose existing facts are shown to the model.
 * @returns Newly extracted facts. String values are kept as-is, numbers and
 *   booleans are converted to strings, and any other value is dropped so the
 *   result really is a `Record<string, string>`.
 */
async function extractEntities(
  userMessage: string,
  session: ChatSession
): Promise<Record<string, string>> {
  const prompt = [
    'Extract any new user facts from this message. Return JSON with fact key-value pairs.',
    'If no new facts, return {}.',
    `Existing facts: ${JSON.stringify(session.entityMemory)}`,
    `New message: "${userMessage}"`,
  ].join('\n');

  const response = await openai.chat.completions.create({
    model: 'gpt-4o-mini',
    messages: [{ role: 'user', content: prompt }],
    response_format: { type: 'json_object' },
    max_tokens: 200,
  });

  const raw = response.choices[0]?.message?.content;
  if (!raw) {
    return {};
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(raw);
  } catch {
    // Malformed or truncated JSON: treat as "no new facts".
    return {};
  }
  if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
    return {};
  }

  const facts: Record<string, string> = {};
  for (const [key, value] of Object.entries(parsed)) {
    if (typeof value === 'string') {
      facts[key] = value;
    } else if (typeof value === 'number' || typeof value === 'boolean') {
      facts[key] = String(value);
    }
  }
  return facts;
}
// Example extractions:
// "My name is Alice" → { name: "Alice" }
// "I'm on the Pro plan" → { plan: "Pro" }
// "I'm based in Tokyo" → { location: "Tokyo" }
Intent routing
// Route different intents to specialized handlers
type Intent = 'billing' | 'technical' | 'cancellation' | 'general';

/**
 * Asks the model (`gpt-4o-mini`, temperature 0, at most 5 tokens) to label a
 * message with one of the known intents.
 *
 * The reply is free text, so it is validated rather than trusted: the first
 * word of the reply that names a known intent wins (case-insensitive,
 * punctuation ignored). An empty reply or one naming no known intent resolves
 * to `'general'`.
 *
 * @param message - The user message to classify.
 * @returns A valid intent, never an arbitrary string.
 */
async function classifyIntent(message: string): Promise<Intent> {
  const knownIntents: readonly Intent[] = ['billing', 'technical', 'cancellation', 'general'];

  const response = await openai.chat.completions.create({
    model: 'gpt-4o-mini',
    messages: [{
      role: 'user',
      content: `Classify this message intent. Reply with one word: billing, technical, cancellation, or general.\nMessage: "${message}"`,
    }],
    max_tokens: 5,
    temperature: 0,
  });

  const reply = response.choices[0]?.message?.content ?? '';
  const words: string[] = reply.toLowerCase().match(/[a-z]+/g) ?? [];
  for (const word of words) {
    const matched = knownIntents.find((intent) => intent === word);
    if (matched !== undefined) {
      return matched;
    }
  }
  return 'general';
}
/** Signature shared by every intent-specific handler: session and message in, reply text out. */
type IntentHandler = (session: ChatSession, msg: string) => Promise<string>;

/** Lookup table from each intent to the handler that serves it. */
const intentHandlers: Record<Intent, IntentHandler> = {
  billing: handleBillingQuery,
  technical: handleTechnicalQuery,
  cancellation: handleCancellationFlow,
  general: handleGeneralQuery,
};
// Route based on intent
const intent = await classifyIntent(userMessage);
// Defensive lookup: if the classifier produced a label with no registered
// handler, serve the message with the general handler instead of failing
// with "intentHandlers[intent] is not a function".
const handler = intentHandlers[intent] ?? intentHandlers.general;
const response = await handler(session, userMessage);

// Rate limiting per user
// Fixed-window rate limiter: one counter per user per calendar minute and one
// per calendar hour, each stored under a key that embeds the window number.
const LIMITS = {
  messagesPerMinute: 10,
  messagesPerHour: 100,
  // NOTE(review): declared here but not enforced by checkRateLimit below.
  tokensPerDay: 500_000,
};

/**
 * Decides whether `userId` may send another message right now and, if so,
 * counts that message against the current minute and hour windows.
 *
 * @param userId - The user sending the message.
 * @returns `{ allowed: true }` when the message is within both limits;
 *   otherwise `{ allowed: false, retryAfter }` where `retryAfter` is the number
 *   of seconds until the exhausted window (minute first, then hour) rolls over.
 */
async function checkRateLimit(userId: string): Promise<{ allowed: boolean; retryAfter?: number }> {
  const now = Date.now();
  const minuteKey = `rl:min:${userId}:${Math.floor(now / 60_000)}`;
  const hourKey = `rl:hour:${userId}:${Math.floor(now / 3_600_000)}`;
  const secondsLeftInMinute = 60 - Math.floor((now % 60_000) / 1000);
  const secondsLeftInHour = 3600 - Math.floor((now % 3_600_000) / 1000);

  // Read-only pre-check: a user who is already over a limit is rejected
  // without touching the counters.
  const [perMin, perHour] = await redis.mget<number>(minuteKey, hourKey);
  if (Number(perMin ?? 0) >= LIMITS.messagesPerMinute) {
    return { allowed: false, retryAfter: secondsLeftInMinute };
  }
  if (Number(perHour ?? 0) >= LIMITS.messagesPerHour) {
    return { allowed: false, retryAfter: secondsLeftInHour };
  }

  // The pre-check alone is racy: several concurrent requests can all read the
  // same count and all pass. INCR is atomic and returns the post-increment
  // value, so the value it returns is what finally decides.
  const [minuteCount, hourCount] = await Promise.all([
    redis.incr(minuteKey),
    redis.incr(hourKey),
  ]);
  await Promise.all([redis.expire(minuteKey, 60), redis.expire(hourKey, 3600)]);

  if (minuteCount > LIMITS.messagesPerMinute) {
    return { allowed: false, retryAfter: secondsLeftInMinute };
  }
  if (hourCount > LIMITS.messagesPerHour) {
    return { allowed: false, retryAfter: secondsLeftInHour };
  }
  return { allowed: true };
}

// Fallback and graceful degradation
// Graceful degradation layers
/**
 * Produces a reply for the given context by trying each model layer in turn
 * and returning the first one that succeeds:
 *
 *   1. gpt-4o (primary)
 *   2. gpt-4o-mini (fallback)
 *   3. claude-haiku-4-5 (alternative provider)
 *   4. a static apology message when every layer has failed
 *
 * Each failure is logged (warn for layers 1–2, error for layer 3) and the
 * next layer is tried; this function itself never rejects because a model
 * call failed.
 *
 * @param context - Messages to send to the model.
 * @returns The first successful model reply, or the static fallback text.
 */
async function generateResponse(context: Message[]): Promise<string> {
  const maxTokens = 1024;

  const layers = [
    {
      call: () => callOpenAI('gpt-4o', context, maxTokens),
      report: (cause: unknown) => logger.warn('Primary model failed, trying fallback', cause),
    },
    {
      call: () => callOpenAI('gpt-4o-mini', context, maxTokens),
      report: (cause: unknown) => logger.warn('Fallback model failed, trying Claude', cause),
    },
    {
      call: () => callClaude('claude-haiku-4-5', context, maxTokens),
      report: (cause: unknown) => logger.error('All LLM providers failed', cause),
    },
  ];

  for (const layer of layers) {
    try {
      // `await` inside the try so a rejected call is caught here.
      return await layer.call();
    } catch (err) {
      layer.report(err);
    }
  }

  // Every model layer failed: static fallback.
  return "I'm having trouble processing your request right now. Please try again in a moment or contact support.";
}

// Production checklist
- Store sessions in Redis with 1-hour TTL; persist to DB for history.
- Implement per-user rate limiting (messages/minute, tokens/day).
- Classify intents before assembling context to select the right knowledge base.
- Extract entities from every conversation for personalization.
- Use sliding window + summarization to manage context length.
- Implement multi-provider fallback for reliability (OpenAI → Anthropic).
- Log every turn with sessionId, userId, latency, tokens, and model.
- Add webhook delivery for async channels (Slack, email, SMS).
Takeaway
The hardest part of production chatbots is not the LLM call — it is session state, context assembly, and graceful degradation. Build the session layer first, then add RAG, intent routing, and multi-provider fallback incrementally.