myougaTheAxo

LLM Testing Strategy: Mocks, Evaluation, and Regression Testing for AI Systems

The Challenge: Testing Non-Deterministic Systems

Unlike conventional code, an LLM can return different outputs for identical inputs. That rules out simple "does it match the expected value?" assertions, and it is the fundamental difficulty of LLM integration testing.

But systems you can't test can't be trusted. LLM-powered systems need their own testing strategy.

Test categories:

  1. Business logic: Code independent of prompts (normal unit tests)
  2. Prompt integration: LLM interface (mocks/stubs)
  3. LLM behavior quality: Output quality and safety (evaluation frameworks)
  4. End-to-end: Full system behavior (integration tests with real LLM)
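Category 1 needs no special tooling: any prompt-building or parsing helper is deterministic and can be tested with exact assertions. A minimal sketch (`build_summary_prompt` is a hypothetical helper, not from the code below):

```python
# Hypothetical prompt-builder: pure business logic, no LLM call, so an
# ordinary unit test with exact assertions works.
def build_summary_prompt(text: str, max_chars: int = 2000) -> str:
    truncated = text[:max_chars]
    return f"Summarize the following text:\n\n{truncated}"

def test_truncates_long_input():
    prompt = build_summary_prompt("x" * 5000, max_chars=100)
    # Instruction prefix plus at most max_chars of input.
    assert prompt.startswith("Summarize")
    assert len(prompt) <= 100 + len("Summarize the following text:\n\n")

test_truncates_long_input()
```

Categories 2-4 are where it gets interesting, starting with mocks.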

LLM Mock Strategy

from unittest.mock import AsyncMock, patch
import pytest
from anthropic.types import Message, TextBlock, Usage

def create_mock_message(text: str) -> Message:
    return Message(
        id="msg_test_123",
        type="message",
        role="assistant",
        content=[TextBlock(type="text", text=text)],
        model="claude-opus-4-5",
        stop_reason="end_turn",
        stop_sequence=None,
        usage=Usage(input_tokens=100, output_tokens=50),
    )

@pytest.fixture
def mock_anthropic():
    # Patch where the client is looked up. If your code does
    # `from anthropic import AsyncAnthropic` at import time, patch that
    # module's reference instead (e.g. "myapp.llm.AsyncAnthropic").
    with patch("anthropic.AsyncAnthropic") as MockClient:
        instance = AsyncMock()
        MockClient.return_value = instance
        instance.messages.create.return_value = create_mock_message("Mock response")
        yield instance

@pytest.mark.asyncio
async def test_summarize_calls_correct_model(mock_anthropic):
    # summarize_text is the code under test (import omitted here).
    await summarize_text("Long text...")
    call_kwargs = mock_anthropic.messages.create.call_args.kwargs
    assert call_kwargs["model"] == "claude-opus-4-5"
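The mock also makes failure paths testable: set `side_effect` to simulate a transient error and assert the retry behavior. A sketch with a hypothetical `call_with_retry` helper, using `RuntimeError` as a stand-in for the SDK's error types:

```python
import asyncio
from types import SimpleNamespace
from unittest.mock import AsyncMock

# Hypothetical retry helper: attempt the call up to `attempts` times.
async def call_with_retry(client, prompt: str, attempts: int = 2):
    last_err = None
    for _ in range(attempts):
        try:
            msg = await client.messages.create(
                model="claude-opus-4-5",
                max_tokens=256,
                messages=[{"role": "user", "content": prompt}],
            )
            return msg.content[0].text
        except RuntimeError as err:  # stand-in for anthropic.APIError
            last_err = err
    raise last_err

# First call raises, second call returns a minimal message-shaped object.
ok_message = SimpleNamespace(content=[SimpleNamespace(text="Summary.")])
client = AsyncMock()
client.messages.create.side_effect = [RuntimeError("overloaded"), ok_message]

result = asyncio.run(call_with_retry(client, "Long text..."))
assert result == "Summary."
assert client.messages.create.call_count == 2
```

Exceptions in a `side_effect` list are raised rather than returned, which makes one-line simulation of flaky API behavior possible.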

Scenario-Based Tests

async def run_scenario_test(scenario: dict, llm_fn) -> dict:
    output = await llm_fn(scenario["input"])
    result = {"name": scenario["name"], "passed": True, "failures": []}

    if "expected_label" in scenario:
        if output.get("label") != scenario["expected_label"]:
            result["passed"] = False
            result["failures"].append(f"Label: expected {scenario['expected_label']}, got {output.get('label')}")

    if "min_confidence" in scenario:
        conf = output.get("confidence", 0)
        if conf < scenario["min_confidence"]:
            result["passed"] = False
            result["failures"].append(f"Confidence too low: {conf} < {scenario['min_confidence']}")

    return result
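A usage sketch: hypothetical scenarios for a sentiment classifier, with a deterministic stub standing in for the real LLM call (`fake_classifier` and the scenario data are illustrative, not from a real test suite):

```python
import asyncio

# Hypothetical scenario definitions; the keys match what
# run_scenario_test checks (expected_label, min_confidence).
SCENARIOS = [
    {"name": "clear_positive", "input": "I love this product!",
     "expected_label": "positive", "min_confidence": 0.8},
    {"name": "factual_neutral", "input": "It arrived on Tuesday.",
     "expected_label": "neutral"},
]

async def fake_classifier(text: str) -> dict:
    # Deterministic stand-in for the real LLM call, useful for testing
    # the scenario harness itself before pointing it at live outputs.
    return {"label": "positive" if "love" in text else "neutral",
            "confidence": 0.9}

# With the real harness:
#   results = [await run_scenario_test(s, fake_classifier) for s in SCENARIOS]
output = asyncio.run(fake_classifier(SCENARIOS[0]["input"]))
assert output["label"] == "positive"
```

Keeping scenarios as plain dicts means they can live in JSON/YAML files and be reviewed alongside prompt changes.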

Prompt Regression Testing

import hashlib
import json
from pathlib import Path

class PromptRegressionTracker:
    def __init__(self, baseline_dir: Path):
        self.baseline_dir = baseline_dir
        self.baseline_dir.mkdir(parents=True, exist_ok=True)

    def _prompt_hash(self, prompt: str) -> str:
        return hashlib.sha256(prompt.encode()).hexdigest()[:8]

    def save_baseline(self, prompt: str, outputs: list[str]) -> str:
        prompt_id = self._prompt_hash(prompt)
        baseline = {"prompt_hash": prompt_id, "outputs": outputs}
        path = self.baseline_dir / f"{prompt_id}.json"
        path.write_text(json.dumps(baseline, ensure_ascii=False, indent=2))
        return prompt_id

    def compare_to_baseline(self, prompt: str, new_outputs: list[str], evaluator) -> dict:
        prompt_id = self._prompt_hash(prompt)
        path = self.baseline_dir / f"{prompt_id}.json"

        if not path.exists():
            return {"status": "no_baseline"}

        baseline = json.loads(path.read_text())
        scores = [evaluator(new, old) for new, old in zip(new_outputs, baseline["outputs"])]
        avg_score = sum(scores) / len(scores) if scores else 0

        return {
            "status": "regression" if avg_score < 0.7 else "ok",
            "similarity_score": avg_score,
        }
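`compare_to_baseline` is agnostic about the evaluator: anything with the shape `evaluator(new, old) -> float` in `[0, 1]` works. A minimal stand-in using `difflib` (real pipelines might prefer embedding similarity or the LLM judge in the next section):

```python
import difflib

# Naive string-similarity evaluator: 1.0 for identical outputs, lower as
# they diverge. Enough to exercise the tracker's 0.7 regression threshold.
def similarity(new: str, old: str) -> float:
    return difflib.SequenceMatcher(None, new, old).ratio()

# usage: tracker.compare_to_baseline(prompt, new_outputs, similarity)
```

Raw string similarity is strict for paraphrase-heavy outputs, so tune the 0.7 threshold (or swap the evaluator) per task.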

LLM-as-Judge Evaluation

import json

from anthropic import AsyncAnthropic

class LLMEvaluator:
    def __init__(self):
        self.client = AsyncAnthropic()

    async def evaluate_factuality(self, question: str, answer: str, reference: str) -> dict:
        prompt = f"""Evaluate the following question, answer, and reference information.

Question: {question}
Answer: {answer}
Reference: {reference}

Rate 1-5 on each criterion and return as JSON:
- factuality: accuracy of facts (does it match reference?)
- relevance: relevance to the question
- completeness: completeness of answer

Format: {{"factuality": N, "relevance": N, "completeness": N, "reason": "..."}}"""

        response = await self.client.messages.create(
            model="claude-haiku-4-5",  # Use cheap model for evaluation
            max_tokens=256,
            messages=[{"role": "user", "content": prompt}],
        )
        return json.loads(response.content[0].text)
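In practice the judge's reply is not always bare JSON: models sometimes wrap it in prose or a markdown fence, and a raw `json.loads` then raises. A defensive parsing sketch (the regex simply grabs the outermost `{...}` span before parsing):

```python
import json
import re

# Extract the first {...} block from judge output before parsing, so
# surrounding prose or a ```json fence doesn't break the evaluator.
def parse_judge_json(text: str) -> dict:
    match = re.search(r"\{.*\}", text, re.DOTALL)
    if match is None:
        raise ValueError(f"No JSON object found in judge output: {text!r}")
    return json.loads(match.group(0))

scores = parse_judge_json(
    'Here is my rating:\n'
    '{"factuality": 4, "relevance": 5, "completeness": 3, "reason": "mostly accurate"}'
)
assert scores["factuality"] == 4
```

Swapping the final `json.loads(...)` line of `evaluate_factuality` for `parse_judge_json(...)` makes the evaluator tolerant of chatty judge models.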

CI/CD Integration

name: LLM Integration Tests
on:
  push:
    paths: ['src/prompts/**', 'src/llm/**']

jobs:
  llm-tests:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Run unit tests (mocked LLM)
        run: pytest tests/unit/ -v
        env:
          ANTHROPIC_API_KEY: "dummy"
      - name: Run prompt scenario tests
        run: pytest tests/scenarios/ -v --timeout=60
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
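Live-LLM tests should also degrade gracefully when no real key is present (note the workflow injects a `dummy` key for the mocked unit tests). One way is a small guard used with `pytest.mark.skipif`; a sketch, where the env-var handling mirrors the workflow above:

```python
import os

def live_llm_available(env=None) -> bool:
    """True only when a real API key is set (the CI 'dummy' key doesn't count)."""
    env = os.environ if env is None else env
    key = env.get("ANTHROPIC_API_KEY", "")
    return bool(key) and key != "dummy"

# usage:
#   @pytest.mark.skipif(not live_llm_available(), reason="no real API key")
#   async def test_live_summary_smoke(): ...
```

This keeps local runs green without a key while CI still exercises the live scenario tests.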

LLM testing will never be perfect, but layering these checks into a continuous quality-monitoring setup lets you catch regressions from prompt changes or model updates early.


This article is from the Claude Code Complete Guide (7 chapters) on note.com.
myouga (@myougatheaxo) - VTuber axolotl. Sharing practical AI development tips.
