AI Testing Guide: Unit Tests, Integration Tests, Evals, and Mocking LLMs

Testing AI applications is different from testing traditional software. LLM outputs are probabilistic, which means you need a layered testing strategy: deterministic unit tests around your application logic, mocked LLM responses for speed, and statistical evals for quality.

Testing pyramid for AI applications

// Layer 1: Unit tests (fast, deterministic, no LLM calls)
//   - Test prompt construction
//   - Test output parsing and validation
//   - Test tool logic (separate from LLM)
//   - Test retry/fallback logic

// Layer 2: Integration tests (mocked LLM, test request/response flow)
//   - Test end-to-end flow with known LLM responses
//   - Test error handling for API failures

// Layer 3: Evals (real LLM calls, quality metrics)
//   - Test output quality on representative dataset
//   - Track quality over time (regression detection)
//   - Run on CI for prompt changes

Unit test: prompt construction

// prompts/support.ts
/**
 * Builds the two-message chat prompt for a support request.
 *
 * @param userMessage - The customer's message, passed through unchanged.
 * @param customerTier - Selects which tier guidance is appended to the system prompt.
 * @returns A system message followed by the user message.
 */
export function buildSupportPrompt(userMessage: string, customerTier: 'free' | 'pro'): Message[] {
  // Anything other than 'pro' gets the standard (Free) guidance.
  let tierContext = 'This is a Free customer — follow standard support procedures.';
  if (customerTier === 'pro') {
    tierContext = 'This is a Pro customer — prioritize their issue and offer expedited solutions.';
  }

  const systemMessage = {
    role: 'system',
    content: 'You are a helpful support agent. ' + tierContext,
  };
  const customerMessage = { role: 'user', content: userMessage };

  return [systemMessage, customerMessage];
}

// prompts/support.test.ts
import { describe, it, expect } from 'vitest';
import { buildSupportPrompt } from './support';

// Deterministic checks on prompt construction; no LLM call is involved.
describe('buildSupportPrompt', () => {
  it('includes Pro context for pro customers', () => {
    const [systemMessage] = buildSupportPrompt('My payment failed', 'pro');
    const systemText = systemMessage.content;
    expect(systemText).toContain('Pro customer');
    expect(systemText).toContain('expedited');
  });

  it('includes standard context for free customers', () => {
    const [systemMessage] = buildSupportPrompt('My payment failed', 'free');
    expect(systemMessage.content).toContain('standard support');
  });

  it('preserves user message verbatim', () => {
    const question = 'How do I export my data?';
    // Second element is the user turn.
    const [, userTurn] = buildSupportPrompt(question, 'free');
    expect(userTurn.content).toBe(question);
  });

  it('always returns exactly 2 messages', () => {
    expect(buildSupportPrompt('test', 'pro')).toHaveLength(2);
  });
});

Unit test: output parsing

// parsers/sentiment.ts
import { z } from 'zod';

/** The three labels the classifier is allowed to return. */
const sentimentLabel = z.enum(['positive', 'negative', 'neutral']);

/** Confidence must be a number in the closed range [0, 1]. */
const confidenceScore = z.number().min(0).max(1);

/** Shape the model's JSON answer is validated against. */
export const SentimentSchema = z.object({
  sentiment: sentimentLabel,
  confidence: confidenceScore,
  // An empty summary is rejected.
  summary: z.string().min(1),
});

/** Static type derived from the schema, so the two cannot drift apart. */
export type Sentiment = z.infer<typeof SentimentSchema>;

/**
 * Decodes a raw model response and validates it against SentimentSchema.
 *
 * @param raw - JSON text as returned by the model.
 * @returns The validated sentiment object.
 * @throws If `raw` is not valid JSON or does not satisfy the schema.
 */
export function parseSentiment(raw: string): Sentiment {
  // Keep the decoded value as `unknown` until the schema has vetted it.
  const decoded: unknown = JSON.parse(raw);
  return SentimentSchema.parse(decoded);
}

// parsers/sentiment.test.ts
// Vitest does not inject describe/it/expect as globals by default, so import
// them explicitly, along with the parser under test.
import { describe, it, expect } from 'vitest';
import { parseSentiment } from './sentiment';

// Deterministic checks on output parsing; no LLM call is involved.
describe('parseSentiment', () => {
  it('parses valid sentiment JSON', () => {
    const result = parseSentiment('{"sentiment":"positive","confidence":0.9,"summary":"Great product"}');
    // Compare the whole object so `summary` is verified too.
    expect(result).toEqual({
      sentiment: 'positive',
      confidence: 0.9,
      summary: 'Great product',
    });
  });

  it('throws on invalid sentiment value', () => {
    expect(() =>
      parseSentiment('{"sentiment":"amazing","confidence":0.9,"summary":"x"}')
    ).toThrow();
  });

  it('throws on out-of-range confidence', () => {
    expect(() =>
      parseSentiment('{"sentiment":"positive","confidence":1.5,"summary":"x"}')
    ).toThrow();
  });

  // The schema requires a non-empty summary (min length 1).
  it('throws on empty summary', () => {
    expect(() =>
      parseSentiment('{"sentiment":"positive","confidence":0.9,"summary":""}')
    ).toThrow();
  });

  it('throws on malformed JSON', () => {
    expect(() => parseSentiment('not json')).toThrow();
  });
});

Mocking the OpenAI client in tests

// __mocks__/openai.ts
// Manual mock for the `openai` package. Vitest does not apply files in
// __mocks__ automatically: a test file must call vi.mock('openai') for this
// module to replace the real client.
import { vi } from 'vitest';

/** Shared spy behind `chat.completions.create`; tests queue responses on it. */
const mockCreate = vi.fn();

// Implemented with a `function` expression rather than an arrow function so
// the mock stays constructible when the code under test calls `new OpenAI()`.
const OpenAI = vi.fn(function () {
  return {
    chat: { completions: { create: mockCreate } },
    embeddings: { create: vi.fn() },
  };
});

export default OpenAI;
export { mockCreate };

// In your test file
import { describe, it, expect, beforeEach, vi } from 'vitest';
import { mockCreate } from '../__mocks__/openai';
import { classifySentiment } from './classifier';

// Swap the real `openai` package for the manual mock in __mocks__/openai.ts.
// Without this call Vitest loads the real client.
vi.mock('openai');

describe('classifySentiment', () => {
  // mockReset (not mockClear) also discards queued *Once responses, so a
  // response left unconsumed by one test cannot leak into the next.
  // The braces matter: mockReset() returns the mock itself, and Vitest treats
  // a function returned from beforeEach as a cleanup callback to invoke.
  beforeEach(() => {
    mockCreate.mockReset();
  });

  it('returns positive for happy messages', async () => {
    mockCreate.mockResolvedValueOnce({
      choices: [{
        message: { content: '{"sentiment":"positive","confidence":0.95,"summary":"User is happy"}' },
        finish_reason: 'stop',
      }],
      usage: { prompt_tokens: 50, completion_tokens: 20 },
    });

    const result = await classifySentiment('I love this product!');
    expect(result.sentiment).toBe('positive');
    expect(mockCreate).toHaveBeenCalledOnce();
  });

  it('handles API errors gracefully', async () => {
    mockCreate.mockRejectedValueOnce(new Error('Rate limit exceeded'));
    await expect(classifySentiment('test')).rejects.toThrow('Rate limit exceeded');
  });
});

MSW for HTTP-level mocking

// For integration tests that go through the HTTP layer
import { afterAll, afterEach, beforeAll } from 'vitest';
import { setupServer } from 'msw/node';
import { http, HttpResponse } from 'msw';

/**
 * MSW server that intercepts chat-completion requests at the HTTP layer and
 * answers with a fixed, well-formed completion payload.
 */
export const server = setupServer(
  http.post('https://api.openai.com/v1/chat/completions', () => {
    return HttpResponse.json({
      id: 'chatcmpl-mock',
      choices: [{
        message: { role: 'assistant', content: '{"sentiment":"positive","confidence":0.9,"summary":"Good"}' },
        finish_reason: 'stop',
        index: 0,
      }],
      usage: { prompt_tokens: 50, completion_tokens: 20, total_tokens: 70 },
    });
  })
);

// Fail on any request that has no handler, so a missing mock surfaces as a
// test error instead of a real network call.
beforeAll(() => server.listen({ onUnhandledRequest: 'error' }));
// Drop per-test handler overrides so tests stay independent.
afterEach(() => server.resetHandlers());
afterAll(() => server.close());

Eval framework

// evals/sentiment-eval.ts
/** One labelled example in the sentiment eval dataset. */
interface EvalCase {
  /** Text sent to the classifier. */
  input: string;
  /** Label the classifier is expected to return. */
  expectedSentiment: 'positive' | 'negative' | 'neutral';
  /** Optional lower bound on the reported confidence. */
  minConfidence?: number;
}

/** Representative dataset: clear-cut cases carry a confidence floor, neutral ones do not. */
const testCases: EvalCase[] = [
  {
    input: 'I absolutely love this feature!',
    expectedSentiment: 'positive',
    minConfidence: 0.8,
  },
  {
    input: 'This is broken and useless.',
    expectedSentiment: 'negative',
    minConfidence: 0.8,
  },
  {
    input: 'The package arrived.',
    expectedSentiment: 'neutral',
  },
  {
    input: 'Not bad, not great either.',
    expectedSentiment: 'neutral',
  },
  {
    input: 'Terrible experience, would not recommend.',
    expectedSentiment: 'negative',
    minConfidence: 0.9,
  },
];

/**
 * Runs each eval case through the real classifier, prints accuracy and the
 * individual failures, and exits with code 1 when the run does not meet the bar.
 *
 * @param cases - Labelled examples to evaluate; defaults to `testCases`.
 * @param minAccuracy - Minimum fraction of passing cases; defaults to 0.9.
 */
async function runEval(
  cases: readonly EvalCase[] = testCases,
  minAccuracy = 0.9,
): Promise<void> {
  if (cases.length === 0) {
    // 0/0 is NaN and `NaN < minAccuracy` is false, so an empty dataset would
    // otherwise pass CI without evaluating anything.
    console.error('No eval cases to run.');
    process.exit(1);
  }

  let correct = 0;
  const failures: string[] = [];

  // Cases run one at a time, in order, so the report matches the dataset order.
  for (const tc of cases) {
    try {
      const result = await classifySentiment(tc.input);

      const sentimentOk = result.sentiment === tc.expectedSentiment;
      // Compare against undefined explicitly: a truthiness check would treat
      // a floor of 0 as "no floor".
      const confidenceOk =
        tc.minConfidence === undefined || result.confidence >= tc.minConfidence;

      if (sentimentOk && confidenceOk) {
        correct++;
        continue;
      }

      // Show the confidence floor so a confidence-only failure is explainable.
      const expected =
        tc.minConfidence === undefined
          ? tc.expectedSentiment
          : `${tc.expectedSentiment}(>=${tc.minConfidence.toFixed(2)})`;
      failures.push(
        `FAIL: "${tc.input}" → got ${result.sentiment}(${result.confidence.toFixed(2)}), expected ${expected}`
      );
    } catch (err: unknown) {
      // A failed call counts as a failed case; the remaining cases still run
      // so the report is complete.
      const reason = err instanceof Error ? err.message : String(err);
      failures.push(`ERROR: "${tc.input}" → ${reason}`);
    }
  }

  const accuracy = correct / cases.length;
  console.log(`Accuracy: ${(accuracy * 100).toFixed(1)}% (${correct}/${cases.length})`);
  failures.forEach(f => console.error(f));

  if (accuracy < minAccuracy) {
    process.exit(1);  // fail CI if below the accuracy bar (90% by default)
  }
}

// Surface unexpected failures and make sure they fail the CI step.
runEval().catch((err: unknown) => {
  console.error(err);
  process.exit(1);
});

Snapshot testing for prompts

// Catch unexpected prompt changes that could affect quality
import { describe, it, expect } from 'vitest';

// Pins the exact messages built for a fixed input; the test fails when the
// built prompt no longer matches the stored snapshot.
describe('prompt snapshots', () => {
  it('support prompt matches snapshot', () => {
    const messages = buildSupportPrompt('test message', 'pro');
    expect(messages).toMatchSnapshot();
    // Snapshot stored in __snapshots__/support.test.ts.snap
    // Any change requires an explicit snapshot update: vitest -u (long form: vitest --update)
  });
});

CI integration

# .github/workflows/ai-tests.yml
name: AI Tests

on: [pull_request]

jobs:
  unit-tests:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - run: npm ci
      - run: npm test  # unit + integration (mocked, fast)

  # Detects whether the PR touches prompts or AI logic.
  # github.event.pull_request.changed_files is only the NUMBER of changed
  # files, not a list of paths, so the paths are computed with git instead.
  changes:
    runs-on: ubuntu-latest
    outputs:
      ai: ${{ steps.diff.outputs.ai }}
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0  # full history so the base branch can be diffed against
      - id: diff
        env:
          # Passed through env rather than interpolated into the script
          BASE_REF: ${{ github.base_ref }}
        run: |
          changed="$(git diff --name-only "origin/${BASE_REF}...HEAD")"
          if grep -qE '^(prompts|lib/ai)/' <<< "$changed"; then
            echo "ai=true" >> "$GITHUB_OUTPUT"
          else
            echo "ai=false" >> "$GITHUB_OUTPUT"
          fi

  evals:
    runs-on: ubuntu-latest
    needs: changes
    # Only run evals when prompts or AI logic changes
    if: needs.changes.outputs.ai == 'true'
    env:
      # Note: secrets are not provided to workflows triggered from forks
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
    steps:
      - uses: actions/checkout@v4
      - run: npm ci
      - run: npm run eval  # real LLM calls, quality checks

Takeaway

Separate concerns: unit-test prompt construction and output parsing without LLM calls. Mock the API client for integration tests to keep CI fast. Run real-LLM evals only when prompts or AI logic change. This strategy keeps your test suite fast while still catching quality regressions.