AI Streaming Guide: SSE, ReadableStream, and Edge Deployment for LLM Apps

Streaming is the difference between a UI that feels instant and one that forces users to wait up to 8 seconds for a response. This guide covers every layer of LLM streaming, from the API to the browser.

Why streaming matters

Metric	Non-streaming	Streaming
Time to first token	3–8 seconds	<500ms
Perceived responsiveness	Low	High
Abandonment rate	High (>8s)	Low (immediate feedback)
Server memory per connection	Higher (buffered)	Lower (chunked)

OpenAI streaming basics

import OpenAI from 'openai';

const openai = new OpenAI();

// Stream to stdout.
// `stream_options.include_usage` asks the API to append one extra chunk at the
// end of the stream whose `choices` array is empty and whose `usage` field
// holds the token counts for the whole request.
const stream = await openai.chat.completions.create({
  model: 'gpt-4o',
  messages: [{ role: 'user', content: 'Explain quantum entanglement simply.' }],
  stream: true,
  stream_options: { include_usage: true },
});

// Filled in from the usage-only chunk; stays undefined if the API sends none.
let totalTokens: number | undefined;

for await (const chunk of stream) {
  // `choices[0]` is missing on the usage-only chunk, hence the optional chaining.
  const text = chunk.choices[0]?.delta?.content ?? '';
  process.stdout.write(text);

  if (chunk.usage) {
    totalTokens = chunk.usage.total_tokens;
  }
}

// Usage stats are only known once the stream has been fully consumed.
// (The plain `Stream` returned by `create({ stream: true })` has no
// `finalChatCompletion()` helper, so usage is read from the chunks instead.)
console.log('\nTokens used:', totalTokens);

Next.js App Router: streaming API route

// app/api/chat/route.ts
import OpenAI from 'openai';
import { NextRequest } from 'next/server';

const openai = new OpenAI();

export const runtime = 'edge';  // deploy to edge for lowest latency

/**
 * Streams a chat completion to the caller as Server-Sent Events.
 *
 * Reads `{ messages }` from the JSON request body, forwards it to the Chat
 * Completions API with `stream: true`, and re-emits every non-empty text delta
 * as an SSE frame of the form `data: {"text":"..."}`. A final `data: [DONE]`
 * frame marks normal completion.
 *
 * @param req Incoming request whose JSON body contains a `messages` array.
 * @returns A `text/event-stream` response backed by a `ReadableStream`.
 */
export async function POST(req: NextRequest) {
  const { messages } = await req.json();

  const stream = await openai.chat.completions.create({
    model: 'gpt-4o',
    messages,
    stream: true,
    max_tokens: 1024,
  });

  const encoder = new TextEncoder();
  // An SSE frame is a `data:` line terminated by a blank line (two newlines).
  const frame = (payload: string) => encoder.encode(`data: ${payload}\n\n`);

  return new Response(
    new ReadableStream<Uint8Array>({
      async start(controller) {
        try {
          for await (const chunk of stream) {
            const text = chunk.choices[0]?.delta?.content ?? '';
            if (text) {
              controller.enqueue(frame(JSON.stringify({ text })));
            }
          }
          controller.enqueue(frame('[DONE]'));
          controller.close();
        } catch (err) {
          // Report upstream failures to the reader instead of closing the
          // stream as if it had completed normally. This is a no-op when the
          // stream has already been cancelled by the client.
          controller.error(err);
        }
      },
      cancel() {
        // The reader went away: stop the upstream LLM call so it does not keep
        // generating (and billing) tokens nobody will read.
        stream.controller.abort();
      },
    }),
    {
      headers: {
        'Content-Type': 'text/event-stream',
        'Cache-Control': 'no-cache, no-transform',
        'X-Accel-Buffering': 'no',  // disable nginx buffering
      },
    }
  );
}

Client-side SSE consumption (React)

'use client';
import { useState, useCallback } from 'react';

/**
 * Minimal chat UI: POSTs a fixed prompt to `/api/chat` and renders the SSE
 * response incrementally while it streams in.
 */
export function ChatInterface() {
  const [response, setResponse] = useState('');
  const [isStreaming, setIsStreaming] = useState(false);

  const sendMessage = useCallback(async (userMessage: string) => {
    setResponse('');
    setIsStreaming(true);

    try {
      const res = await fetch('/api/chat', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ messages: [{ role: 'user', content: userMessage }] }),
      });

      // An error response is not an SSE stream; there is nothing to render.
      if (!res.ok || !res.body) return;

      const reader = res.body.getReader();
      const decoder = new TextDecoder();
      // Network chunks do not align with SSE lines, so the trailing partial
      // line is carried over and completed by the next chunk.
      let pending = '';
      let finished = false;

      while (!finished) {
        const { done, value } = await reader.read();
        if (done) break;

        // `stream: true` keeps a multi-byte character that straddles two
        // chunks intact instead of decoding it as replacement characters.
        pending += decoder.decode(value, { stream: true });
        const lines = pending.split('\n');
        pending = lines.pop() ?? '';

        for (const line of lines) {
          if (!line.startsWith('data: ')) continue;
          const data = line.slice(6);
          if (data === '[DONE]') {
            // Leave the outer read loop too, not just this `for`.
            finished = true;
            break;
          }

          try {
            const { text } = JSON.parse(data) as { text?: unknown };
            if (typeof text === 'string') {
              setResponse(prev => prev + text);
            }
          } catch { /* skip malformed chunks */ }
        }
      }
    } finally {
      // Runs on every exit path (early return, error, normal end) so the
      // cursor never gets stuck in the "streaming" state.
      setIsStreaming(false);
    }
  }, []);

  return (
    <div>
      <button onClick={() => sendMessage('Tell me a joke')}>Ask</button>
      <p>{response}{isStreaming && <span className="animate-pulse">▋</span>}</p>
    </div>
  );
}

Vercel AI SDK (recommended abstraction)

npm install ai @ai-sdk/openai

// app/api/chat/route.ts
import { openai } from '@ai-sdk/openai';
import { streamText } from 'ai';

export const runtime = 'edge';

/**
 * Chat endpoint built on the Vercel AI SDK.
 *
 * Takes the `messages` field from the JSON request body, hands it to
 * `streamText` with the `gpt-4o` model and a 1024-token cap, and returns the
 * SDK's data-stream response.
 *
 * @param req Incoming request whose JSON body contains `messages`.
 */
export async function POST(req: Request) {
  const body = await req.json();

  return streamText({
    model: openai('gpt-4o'),
    messages: body.messages,
    maxTokens: 1024,
  }).toDataStreamResponse();
}

// app/page.tsx — client component
'use client';
import { useChat } from 'ai/react';

/**
 * Chat page driven entirely by the `useChat` hook: renders the message
 * history and a form bound to the hook's input state and submit handler.
 */
export default function ChatPage() {
  const chat = useChat();

  // One row per message, keyed by the id supplied by the hook.
  const transcript = chat.messages.map(message => (
    <div key={message.id}><strong>{message.role}:</strong> {message.content}</div>
  ));

  return (
    <div>
      {transcript}
      <form onSubmit={chat.handleSubmit}>
        <input value={chat.input} onChange={chat.handleInputChange} placeholder="Ask anything..." />
        {/* Block re-submission while a response is still loading. */}
        <button type="submit" disabled={chat.isLoading}>Send</button>
      </form>
    </div>
  );
}

Stream with tool calls

// Tool calls interrupt the text stream — handle both
const stream = await openai.chat.completions.create({
  model: 'gpt-4o',
  messages,
  tools,
  stream: true,
});

// Streamed tool calls are keyed by `index`: only the first fragment of a call
// carries `id` and `function.name`; later fragments carry just the index and
// another slice of the JSON argument string.
let toolCallBuffer: Record<number, { id: string; name: string; args: string }> = {};
// Text the assistant emitted before asking for tools; replayed in its message.
let assistantText = '';

for await (const chunk of stream) {
  const choice = chunk.choices[0];
  const delta = choice?.delta;

  // Text token
  if (delta?.content) {
    process.stdout.write(delta.content);
    assistantText += delta.content;
  }

  // Tool call accumulation (args arrive in fragments)
  if (delta?.tool_calls) {
    for (const tc of delta.tool_calls) {
      const entry = (toolCallBuffer[tc.index] ??= { id: '', name: '', args: '' });
      if (tc.id) entry.id = tc.id;
      if (tc.function?.name) entry.name = tc.function.name;
      entry.args += tc.function?.arguments ?? '';
    }
  }

  if (choice?.finish_reason === 'tool_calls') {
    const calls = Object.values(toolCallBuffer);

    // Each `tool` message must answer a `tool_calls` entry of a preceding
    // assistant message, so record that assistant turn first.
    messages.push({
      role: 'assistant',
      content: assistantText || null,
      tool_calls: calls.map(({ id, name, args }) => ({
        id,
        type: 'function' as const,
        function: { name, arguments: args },
      })),
    });

    for (const { id, name, args } of calls) {
      // A tool without parameters may stream an empty argument string,
      // which is not valid JSON on its own.
      const result = await executeTool(name, JSON.parse(args || '{}'));
      messages.push({ role: 'tool', tool_call_id: id, content: JSON.stringify(result) });
    }
    // Continue with second pass after tool execution: issue another `create`
    // call with the updated `messages` to get the model's final answer.
    toolCallBuffer = {};
    assistantText = '';
  }
}

Abort / cancellation

// Client: cancel in-flight stream
const controller = new AbortController();

// Cancel when user clicks stop.
// Attach the handler BEFORE awaiting the request: otherwise the button does
// nothing while the request is still waiting for response headers.
stopButton.onclick = () => controller.abort();

// After abort(), this fetch (or any pending read of `res.body`) rejects with
// an error named 'AbortError' — treat that as an expected outcome, not a failure.
const res = await fetch('/api/chat', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({ messages }),
  signal: controller.signal,  // attach abort signal
});

// Server: detect abort and clean up
/**
 * Abort-aware variant of the streaming route.
 *
 * Streams text deltas as SSE frames and stops the upstream LLM call when the
 * client disconnects, detected either through `req.signal` or through the
 * response stream's `cancel()` hook.
 *
 * @param req Incoming request whose JSON body contains a `messages` array.
 * @returns A `text/event-stream` response backed by a `ReadableStream`.
 */
export async function POST(req: NextRequest) {
  const { messages } = await req.json();

  const stream = await openai.chat.completions.create({
    model: 'gpt-4o',
    messages,
    stream: true,
    max_tokens: 1024,
  });

  const encoder = new TextEncoder();

  return new Response(
    new ReadableStream<Uint8Array>({
      async start(controller) {
        // Only abort the upstream call here; the stream itself is closed in
        // exactly one place (the `finally` below) to avoid a double close or
        // an enqueue on an already-closed controller.
        const onAbort = () => stream.controller.abort();
        req.signal.addEventListener('abort', onAbort, { once: true });

        try {
          for await (const chunk of stream) {
            if (req.signal.aborted) break;
            const text = chunk.choices[0]?.delta?.content ?? '';
            if (text) {
              controller.enqueue(encoder.encode(`data: ${JSON.stringify({ text })}\n\n`));
            }
          }
          if (!req.signal.aborted) {
            controller.enqueue(encoder.encode('data: [DONE]\n\n'));
          }
        } catch (err) {
          // An abort may surface as a thrown error from the iterator; only
          // genuine failures are reported to the reader.
          if (!req.signal.aborted) controller.error(err);
        } finally {
          req.signal.removeEventListener('abort', onAbort);
          // close() throws if the stream was already cancelled or errored.
          try {
            controller.close();
          } catch { /* already closed, cancelled, or errored */ }
        }
      },
      cancel() {
        // The reader cancelled the response body: stop generating tokens.
        stream.controller.abort();
      },
    }),
    {
      headers: {
        'Content-Type': 'text/event-stream',
        'Cache-Control': 'no-cache, no-transform',
        'X-Accel-Buffering': 'no',  // disable nginx buffering
      },
    }
  );
}

Edge deployment checklist

Set export const runtime = 'edge' in every streaming route.
Disable proxy buffering: X-Accel-Buffering: no header.
Set Cache-Control: no-cache, no-transform to prevent CDN buffering.
Handle abort signals on both client and server to avoid orphaned LLM calls.
Cap max_tokens — unbounded streams can run for minutes.
Add a connection timeout (30–60s) at the load balancer level.

Takeaway

Use the Vercel AI SDK for new projects — it handles SSE, streaming tool calls, and React hooks with minimal boilerplate. Roll your own only if you need fine-grained control over the SSE protocol or a non-React frontend.