AI Streaming Guide: SSE, ReadableStream, and Edge Deployment for LLM Apps
·12 min read
Streaming is the difference between a UI that feels instant and one that forces users to wait 8 seconds for a response. This guide covers every layer of LLM streaming from API to browser.
Why streaming matters
| Metric | Non-streaming | Streaming |
|---|---|---|
| Time to first token | 3–8 seconds | <500ms |
| Perceived responsiveness | Low | High |
| Abandonment rate | High (>8s) | Low (immediate feedback) |
| Server memory per connection | Higher (buffered) | Lower (chunked) |
OpenAI streaming basics
import OpenAI from 'openai';
const openai = new OpenAI();
// Stream to stdout
const stream = await openai.chat.completions.create({
  model: 'gpt-4o',
  messages: [{ role: 'user', content: 'Explain quantum entanglement simply.' }],
  stream: true,
  // Ask the API to append one extra chunk that carries token usage for the request.
  stream_options: { include_usage: true },
});
// The object returned by `create({ stream: true })` is a one-shot async iterable of
// chunks; it has no `finalChatCompletion()` helper, so usage is captured while iterating.
let totalTokens: number | undefined;
for await (const chunk of stream) {
  // The usage-only chunk has an empty `choices` array, hence the optional chaining.
  const text = chunk.choices[0]?.delta?.content ?? '';
  process.stdout.write(text);
  if (chunk.usage) {
    totalTokens = chunk.usage.total_tokens;
  }
}
// Get final usage stats after streaming
console.log('\nTokens used:', totalTokens);
Next.js App Router: streaming API route
// app/api/chat/route.ts
import OpenAI from 'openai';
import { NextRequest } from 'next/server';
const openai = new OpenAI();
export const runtime = 'edge'; // deploy to edge for lowest latency
/**
 * Streams a chat completion to the client as Server-Sent Events.
 *
 * Each text delta is sent as `data: {"text": "..."}` and the stream ends with
 * `data: [DONE]`. Every event is terminated by a blank line (`\n\n`), which is
 * what marks the end of an SSE event.
 *
 * @param req - Request whose JSON body contains a `messages` array.
 * @returns A `text/event-stream` response backed by a ReadableStream.
 */
export async function POST(req: NextRequest) {
  const { messages } = await req.json();
  const stream = await openai.chat.completions.create({
    model: 'gpt-4o',
    messages,
    stream: true,
    max_tokens: 1024,
  });
  const encoder = new TextEncoder();
  return new Response(
    new ReadableStream({
      async start(controller) {
        try {
          for await (const chunk of stream) {
            const text = chunk.choices[0]?.delta?.content ?? '';
            if (text) {
              controller.enqueue(encoder.encode(`data: ${JSON.stringify({ text })}\n\n`));
            }
          }
          controller.enqueue(encoder.encode('data: [DONE]\n\n'));
          controller.close();
        } catch (err) {
          // Surface upstream failures to the reader instead of ending the stream as
          // if it had completed. If the consumer already cancelled, this is a no-op.
          controller.error(err);
        }
      },
      cancel() {
        // The client went away: stop the upstream completion so it does not keep running.
        stream.controller.abort();
      },
    }),
    {
      headers: {
        'Content-Type': 'text/event-stream',
        'Cache-Control': 'no-cache, no-transform',
        'X-Accel-Buffering': 'no', // disable nginx buffering
      },
    }
  );
}
Client-side SSE consumption (React)
'use client';
import { useState, useCallback } from 'react';
/**
 * Minimal chat UI that POSTs one user message to `/api/chat` and renders the
 * SSE response incrementally, showing a cursor while the stream is active.
 */
export function ChatInterface() {
  const [response, setResponse] = useState('');
  const [isStreaming, setIsStreaming] = useState(false);
  const sendMessage = useCallback(async (userMessage: string) => {
    setResponse('');
    setIsStreaming(true);
    try {
      const res = await fetch('/api/chat', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ messages: [{ role: 'user', content: userMessage }] }),
      });
      if (!res.ok || !res.body) return;
      const reader = res.body.getReader();
      const decoder = new TextDecoder();
      // Network reads do not align with SSE events: a read may end in the middle of a
      // line (or of a multi-byte character), so keep the unfinished tail for the next read.
      let pending = '';
      let finished = false;
      while (!finished) {
        const { done, value } = await reader.read();
        if (done) break;
        pending += decoder.decode(value, { stream: true });
        const lines = pending.split('\n');
        pending = lines.pop() ?? '';
        for (const line of lines) {
          if (!line.startsWith('data: ')) continue;
          const data = line.slice(6);
          if (data === '[DONE]') {
            // Stop reading entirely, not just this batch of lines.
            finished = true;
            break;
          }
          try {
            const { text } = JSON.parse(data);
            setResponse(prev => prev + text);
          } catch { /* skip malformed chunks */ }
        }
      }
    } finally {
      // Always clear the streaming indicator, including on early return or failure.
      setIsStreaming(false);
    }
  }, []);
  return (
    <div>
      <button onClick={() => sendMessage('Tell me a joke')}>Ask</button>
      <p>{response}{isStreaming && <span className="animate-pulse">▋</span>}</p>
    </div>
  );
}
Vercel AI SDK (recommended abstraction)
npm install ai @ai-sdk/openai
// app/api/chat/route.ts
import { openai } from '@ai-sdk/openai';
import { streamText } from 'ai';
export const runtime = 'edge';
/**
 * Chat route built on the AI SDK: reads `messages` from the JSON body, starts a
 * `gpt-4o` text stream capped at 1024 tokens, and returns it as a data-stream response.
 */
export async function POST(req: Request) {
  const body = await req.json();
  return streamText({
    model: openai('gpt-4o'),
    messages: body.messages,
    maxTokens: 1024,
  }).toDataStreamResponse();
}
// app/page.tsx — client component
'use client';
import { useChat } from 'ai/react';
/**
 * Chat page driven by the `useChat` hook: renders the message list and a form
 * whose input and submit handler come from the hook.
 */
export default function ChatPage() {
  const chat = useChat();
  const transcript = chat.messages.map(({ id, role, content }) => (
    <div key={id}><strong>{role}:</strong> {content}</div>
  ));
  return (
    <div>
      {transcript}
      <form onSubmit={chat.handleSubmit}>
        <input value={chat.input} onChange={chat.handleInputChange} placeholder="Ask anything..." />
        <button type="submit" disabled={chat.isLoading}>Send</button>
      </form>
    </div>
  );
}
Stream with tool calls
// Tool calls interrupt the text stream — handle both
const stream = await openai.chat.completions.create({
  model: 'gpt-4o',
  messages,
  tools,
  stream: true,
});
// Keyed by the tool call's `index`: only the first fragment of a call carries its `id`
// and function name, while later fragments carry just `index` plus more argument text.
// Keying by index keeps parallel tool calls from being mixed together.
const toolCallBuffer = new Map<number, { id: string; name: string; args: string }>();
for await (const chunk of stream) {
  const choice = chunk.choices[0];
  const delta = choice?.delta;
  // Text token
  if (delta?.content) {
    process.stdout.write(delta.content);
  }
  // Tool call accumulation (args arrive in fragments)
  for (const tc of delta?.tool_calls ?? []) {
    let entry = toolCallBuffer.get(tc.index);
    if (!entry) {
      entry = { id: '', name: '', args: '' };
      toolCallBuffer.set(tc.index, entry);
    }
    if (tc.id) entry.id = tc.id;
    if (tc.function?.name) entry.name = tc.function.name;
    entry.args += tc.function?.arguments ?? '';
  }
  if (choice?.finish_reason === 'tool_calls') {
    const calls = [...toolCallBuffer.values()];
    // The assistant message announcing the tool calls must precede the tool results
    // in the history, otherwise the follow-up request is rejected.
    messages.push({
      role: 'assistant',
      content: null,
      tool_calls: calls.map(({ id, name, args }) => ({
        id,
        type: 'function' as const,
        function: { name, arguments: args },
      })),
    });
    for (const { id, name, args } of calls) {
      // Tolerate an empty argument string for tools that take no parameters.
      const result = await executeTool(name, JSON.parse(args || '{}'));
      messages.push({ role: 'tool', tool_call_id: id, content: JSON.stringify(result) });
    }
    // Continue with second pass after tool execution: send `messages` in a new request
    toolCallBuffer.clear();
  }
}
Abort / cancellation
// Client: cancel in-flight stream
const controller = new AbortController();
// Cancel when user clicks stop. Attach the handler BEFORE awaiting fetch so the
// request can also be cancelled while it is still waiting for response headers.
stopButton.onclick = () => controller.abort();
// Note: after abort(), the pending fetch (or body read) rejects with an AbortError.
const res = await fetch('/api/chat', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({ messages }),
  signal: controller.signal, // attach abort signal
});
// Server: detect abort and clean up
export async function POST(req: NextRequest) {
  const stream = await openai.chat.completions.create({ ... });
  return new Response(
    new ReadableStream({
      async start(controller) {
        // On client abort, only stop the upstream call here. The response stream is
        // closed in one place below, so the loop can never enqueue into a closed controller.
        const onAbort = () => stream.controller.abort();
        req.signal.addEventListener('abort', onAbort, { once: true });
        try {
          for await (const chunk of stream) {
            if (req.signal.aborted) break;
            // ... enqueue chunks
          }
        } catch (err) {
          if (!req.signal.aborted) {
            // Genuine upstream failure: report it to the reader.
            controller.error(err);
            return;
          }
        } finally {
          req.signal.removeEventListener('abort', onAbort);
        }
        try {
          // Close on normal completion too; without this the response never ends.
          controller.close();
        } catch {
          /* stream was already cancelled by the consumer */
        }
      },
      cancel() {
        // The consumer stopped reading: stop the upstream call as well.
        stream.controller.abort();
      },
    })
  );
}
Edge deployment checklist
- Set `export const runtime = 'edge'` in every streaming route.
- Disable proxy buffering with the `X-Accel-Buffering: no` header.
- Set `Cache-Control: no-cache, no-transform` to prevent CDN buffering.
- Handle abort signals on both client and server to avoid orphaned LLM calls.
- Cap `max_tokens` — unbounded streams can run for minutes.
- Add a connection timeout (30–60s) at the load balancer level.
Takeaway
Use the Vercel AI SDK for new projects — it handles SSE, streaming tool calls, and React hooks with minimal boilerplate. Roll your own only if you need fine-grained control over the SSE protocol or a non-React frontend.