AI Testing Guide: Unit Tests, Integration Tests, Evals, and Mocking LLMs
·12 min read
Testing AI applications is different from testing traditional software. LLM outputs are probabilistic, which means you need a layered testing strategy: deterministic unit tests around your application logic, mocked LLM responses for speed, and statistical evals for quality.
Testing pyramid for AI applications
// Layer 1: Unit tests (fast, deterministic, no LLM calls)
// - Test prompt construction
// - Test output parsing and validation
// - Test tool logic (separate from LLM)
// - Test retry/fallback logic
// Layer 2: Integration tests (mocked LLM, test request/response flow)
// - Test end-to-end flow with known LLM responses
// - Test error handling for API failures
// Layer 3: Evals (real LLM calls, quality metrics)
// - Test output quality on representative dataset
// - Track quality over time (regression detection)
// - Run on CI for prompt changes
Unit test: prompt construction
// prompts/support.ts
/**
 * Builds the chat messages for a support request.
 *
 * The system message carries tier-specific handling instructions; the user's
 * text is forwarded unchanged as the second message.
 *
 * @param userMessage - The customer's message, passed through verbatim.
 * @param customerTier - Tier that selects which instructions go into the system message.
 * @returns A two-element array: the system message followed by the user message.
 */
export function buildSupportPrompt(userMessage: string, customerTier: 'free' | 'pro'): Message[] {
  let tierContext: string;
  if (customerTier === 'pro') {
    tierContext = 'This is a Pro customer — prioritize their issue and offer expedited solutions.';
  } else {
    tierContext = 'This is a Free customer — follow standard support procedures.';
  }

  const systemMessage: Message = {
    role: 'system',
    content: `You are a helpful support agent. ${tierContext}`,
  };
  const userTurn: Message = { role: 'user', content: userMessage };

  return [systemMessage, userTurn];
}
// prompts/support.test.ts
import { describe, it, expect } from 'vitest';
import { buildSupportPrompt } from './support';
// Checks the tier-specific system instructions, the verbatim user message,
// and the overall shape of the message list.
describe('buildSupportPrompt', () => {
  it('includes Pro context for pro customers', () => {
    const [systemMessage] = buildSupportPrompt('My payment failed', 'pro');
    expect(systemMessage.content).toContain('Pro customer');
    expect(systemMessage.content).toContain('expedited');
  });

  it('includes standard context for free customers', () => {
    const [systemMessage] = buildSupportPrompt('My payment failed', 'free');
    expect(systemMessage.content).toContain('standard support');
  });

  it('preserves user message verbatim', () => {
    const msg = 'How do I export my data?';
    const [, userTurn] = buildSupportPrompt(msg, 'free');
    expect(userTurn.content).toBe(msg);
  });

  it('always returns exactly 2 messages', () => {
    expect(buildSupportPrompt('test', 'pro')).toHaveLength(2);
  });
});
Unit test: output parsing
// parsers/sentiment.ts
import { z } from 'zod';
// Allowed sentiment labels.
const sentimentLabel = z.enum(['positive', 'negative', 'neutral']);
// Confidence score, constrained to the closed range [0, 1].
const confidenceScore = z.number().min(0).max(1);
// Summary text; must contain at least one character.
const summaryText = z.string().min(1);

/** Schema for the sentiment payload: a label, a confidence score, and a summary. */
export const SentimentSchema = z.object({
  sentiment: sentimentLabel,
  confidence: confidenceScore,
  summary: summaryText,
});

/** Static type inferred from `SentimentSchema`. */
export type Sentiment = z.infer<typeof SentimentSchema>;
/**
 * Parses a raw JSON string and validates it against `SentimentSchema`.
 *
 * @param raw - JSON text expected to describe a sentiment payload.
 * @returns The validated sentiment object.
 * @throws If `raw` is not valid JSON (from `JSON.parse`) or if the parsed
 *   value does not satisfy the schema (from `SentimentSchema.parse`).
 */
export function parseSentiment(raw: string): Sentiment {
  // Keep the decoded value as `unknown` until the schema has vetted it.
  const candidate: unknown = JSON.parse(raw);
  return SentimentSchema.parse(candidate);
}
// parsers/sentiment.test.ts
// Covers the happy path plus the three rejection paths: bad enum value,
// out-of-range confidence, and input that is not JSON at all.
describe('parseSentiment', () => {
  it('parses valid sentiment JSON', () => {
    const raw = '{"sentiment":"positive","confidence":0.9,"summary":"Great product"}';
    const { sentiment, confidence } = parseSentiment(raw);
    expect(sentiment).toBe('positive');
    expect(confidence).toBe(0.9);
  });

  it('throws on invalid sentiment value', () => {
    const raw = '{"sentiment":"amazing","confidence":0.9,"summary":"x"}';
    expect(() => parseSentiment(raw)).toThrow();
  });

  it('throws on out-of-range confidence', () => {
    const raw = '{"sentiment":"positive","confidence":1.5,"summary":"x"}';
    expect(() => parseSentiment(raw)).toThrow();
  });

  it('throws on malformed JSON', () => {
    const raw = 'not json';
    expect(() => parseSentiment(raw)).toThrow();
  });
});
Mocking the OpenAI client in tests
// __mocks__/openai.ts (Vitest/Jest auto-mock)
import { vi } from 'vitest';
/**
 * Shared spy behind `chat.completions.create`. Every mocked client instance
 * returns this same function, so tests can queue responses and inspect calls
 * without holding a reference to the client itself.
 */
const mockCreate = vi.fn();

/**
 * Stand-in for the OpenAI client class.
 *
 * The implementation uses the `function` keyword rather than an arrow
 * function: the SDK client is normally created with `new OpenAI()`, and an
 * arrow function cannot be used as a constructor (newer Vitest versions
 * reject arrow implementations when the mock is called with `new`).
 * Returning an object from a constructor call makes that object the result
 * of `new`, so callers receive the stubbed client below.
 */
const OpenAI = vi.fn().mockImplementation(function () {
  return {
    chat: { completions: { create: mockCreate } },
    // Fresh spy per instance; embeddings are stubbed but not shared.
    embeddings: { create: vi.fn() },
  };
});

export default OpenAI;
export { mockCreate };
// In your test file
import { describe, it, expect, beforeEach, vi } from 'vitest';
import { mockCreate } from '../__mocks__/openai';
import { classifySentiment } from './classifier';

// Vitest does not apply a `__mocks__` file for a package on its own: the
// module has to be mocked explicitly. Called without a factory, `vi.mock`
// uses `__mocks__/openai.ts`. Without this line the real client would be
// used and `mockCreate` would never be called.
vi.mock('openai');

// Exercises the classifier against canned completions; no network traffic.
describe('classifySentiment', () => {
  // mockReset clears recorded calls AND any queued `...Once` values, so a
  // response left unused by one test cannot leak into the next.
  beforeEach(() => mockCreate.mockReset());

  it('returns positive for happy messages', async () => {
    mockCreate.mockResolvedValueOnce({
      choices: [{
        message: { content: '{"sentiment":"positive","confidence":0.95,"summary":"User is happy"}' },
        finish_reason: 'stop',
      }],
      usage: { prompt_tokens: 50, completion_tokens: 20 },
    });

    const result = await classifySentiment('I love this product!');

    expect(result.sentiment).toBe('positive');
    expect(mockCreate).toHaveBeenCalledOnce();
  });

  it('handles API errors gracefully', async () => {
    mockCreate.mockRejectedValueOnce(new Error('Rate limit exceeded'));
    await expect(classifySentiment('test')).rejects.toThrow('Rate limit exceeded');
  });
});
MSW for HTTP-level mocking
// For integration tests that go through the HTTP layer
import { setupServer } from 'msw/node';
import { http, HttpResponse } from 'msw';
import { afterAll, afterEach, beforeAll } from 'vitest';

/**
 * MSW server that intercepts the chat-completions endpoint and replies with
 * a fixed, well-formed completion whose content is a valid sentiment payload.
 */
export const server = setupServer(
  http.post('https://api.openai.com/v1/chat/completions', () => {
    return HttpResponse.json({
      id: 'chatcmpl-mock',
      choices: [{
        message: { role: 'assistant', content: '{"sentiment":"positive","confidence":0.9,"summary":"Good"}' },
        finish_reason: 'stop',
        index: 0,
      }],
      usage: { prompt_tokens: 50, completion_tokens: 20, total_tokens: 70 },
    });
  })
);

// Fail the test on any request without a handler. The default only warns and
// lets the request through, which could reach the real API from a test run.
beforeAll(() => server.listen({ onUnhandledRequest: 'error' }));
// Drop per-test handler overrides so tests stay independent.
afterEach(() => server.resetHandlers());
afterAll(() => server.close());
Eval framework
// evals/sentiment-eval.ts
// Labels an eval case may expect.
type ExpectedLabel = 'positive' | 'negative' | 'neutral';

/** One labelled example for the sentiment eval. */
interface EvalCase {
  /** Text handed to the classifier. */
  input: string;
  /** Label the classifier is expected to return. */
  expectedSentiment: ExpectedLabel;
  /** Optional lower bound on the reported confidence. */
  minConfidence?: number;
}

// Small labelled dataset: clear positives/negatives carry a confidence floor,
// the neutral examples only check the label.
const testCases: EvalCase[] = [
  {
    input: 'I absolutely love this feature!',
    expectedSentiment: 'positive',
    minConfidence: 0.8,
  },
  {
    input: 'This is broken and useless.',
    expectedSentiment: 'negative',
    minConfidence: 0.8,
  },
  {
    input: 'The package arrived.',
    expectedSentiment: 'neutral',
  },
  {
    input: 'Not bad, not great either.',
    expectedSentiment: 'neutral',
  },
  {
    input: 'Terrible experience, would not recommend.',
    expectedSentiment: 'negative',
    minConfidence: 0.9,
  },
];
// Minimum fraction of passing cases required for the eval run to succeed.
const MIN_ACCURACY = 0.9;

/**
 * Runs every case in `testCases` through `classifySentiment`, prints the
 * accuracy and each failure, and exits with status 1 when accuracy is below
 * `MIN_ACCURACY`.
 *
 * A case passes when the label matches and, if the case sets
 * `minConfidence`, the reported confidence is at least that value. A call
 * that throws is recorded as a failed case so the remaining cases still run.
 */
async function runEval(): Promise<void> {
  if (testCases.length === 0) {
    // 0 / 0 is NaN and `NaN < MIN_ACCURACY` is false, so an empty dataset
    // would otherwise be reported as a pass.
    console.error('No eval cases defined');
    process.exit(1);
  }

  let correct = 0;
  const failures: string[] = [];

  // Cases run one at a time, as in the original loop.
  for (const tc of testCases) {
    try {
      const result = await classifySentiment(tc.input);
      const sentimentOk = result.sentiment === tc.expectedSentiment;
      // Compare against undefined explicitly; a truthiness check would treat
      // a threshold of 0 as "not set".
      const confidenceOk =
        tc.minConfidence === undefined || result.confidence >= tc.minConfidence;

      if (sentimentOk && confidenceOk) {
        correct++;
        continue;
      }

      // Include the threshold so a confidence-only failure is not reported
      // as "got positive, expected positive".
      const expected =
        tc.minConfidence === undefined
          ? tc.expectedSentiment
          : `${tc.expectedSentiment}(>=${tc.minConfidence.toFixed(2)})`;
      failures.push(
        `FAIL: "${tc.input}" → got ${result.sentiment}(${result.confidence.toFixed(2)}), expected ${expected}`
      );
    } catch (err: unknown) {
      const reason = err instanceof Error ? err.message : String(err);
      failures.push(`ERROR: "${tc.input}" → ${reason}`);
    }
  }

  const accuracy = correct / testCases.length;
  console.log(`Accuracy: ${(accuracy * 100).toFixed(1)}% (${correct}/${testCases.length})`);
  failures.forEach(f => console.error(f));

  if (accuracy < MIN_ACCURACY) {
    process.exit(1); // fail CI if below 90% accuracy
  }
}

// Handle the promise explicitly so an unexpected rejection is logged and
// fails the run instead of being left floating.
runEval().catch((err: unknown) => {
  console.error(err);
  process.exit(1);
});
Snapshot testing for prompts
// Catch unexpected prompt changes that could affect quality
import { describe, it, expect } from 'vitest';
// Guards the exact prompt text: any change to the generated messages fails
// the test until the stored snapshot is deliberately refreshed.
describe('prompt snapshots', () => {
  it('support prompt matches snapshot', () => {
    const messages = buildSupportPrompt('test message', 'pro');
    // Snapshot stored in __snapshots__/support.test.ts.snap
    // Any change requires an explicit snapshot update: vitest --update (or vitest -u)
    expect(messages).toMatchSnapshot();
  });
});
CI integration
# .github/workflows/ai-tests.yml
name: AI Tests
on: [pull_request]
jobs:
  unit-tests:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - run: npm ci
      - run: npm test # unit + integration (mocked, fast)
  evals:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0 # full history so the PR can be diffed against its base branch
      # Only run evals when prompts or AI logic changes.
      # `github.event.pull_request.changed_files` is the NUMBER of changed
      # files, not a list of paths, so it cannot be tested with contains().
      # Diff against the base branch and look for the relevant directories.
      - id: changes
        env:
          BASE_REF: ${{ github.base_ref }} # passed via env to avoid shell injection
        run: |
          if git diff --name-only "origin/${BASE_REF}...HEAD" | grep -qE '(^|/)(prompts|lib/ai)/'; then
            echo "ai=true" >> "$GITHUB_OUTPUT"
          else
            echo "ai=false" >> "$GITHUB_OUTPUT"
          fi
      - if: steps.changes.outputs.ai == 'true'
        run: npm ci
      - if: steps.changes.outputs.ai == 'true'
        run: npm run eval # real LLM calls, quality checks
        env:
          # Expose the key only to the step that needs it.
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
Takeaway
Separate concerns: unit-test prompt construction and output parsing without LLM calls. Mock the API client for integration tests to keep CI fast. Run real-LLM evals only when prompt files change. This strategy keeps your test suite fast while still catching quality regressions.