The Challenge: Testing Non-Deterministic Systems
Unlike ordinary code, an LLM can return a different output each time it is given the same input, so you can't rely on simple "does it match the expected value?" assertions. This non-determinism is the fundamental difficulty of LLM integration testing.
But systems you can't test can't be trusted. LLM-powered systems need their own testing strategy.
Test categories:
- Business logic: Code independent of prompts (normal unit tests)
- Prompt integration: LLM interface (mocks/stubs)
- LLM behavior quality: Output quality and safety (evaluation frameworks)
- End-to-end: Full system behavior (integration tests with real LLM)
LLM Mock Strategy
from unittest.mock import AsyncMock, patch
import pytest
from anthropic.types import Message, TextBlock, Usage
def create_mock_message(text: str) -> Message:
    """Build a canned Anthropic ``Message`` whose single text block holds *text*.

    Stands in for a real API response in tests; the id, model name, and
    token counts are fixed placeholder values.
    """
    content_block = TextBlock(type="text", text=text)
    return Message(
        id="msg_test_123",
        type="message",
        role="assistant",
        content=[content_block],
        model="claude-opus-4-5",
        stop_reason="end_turn",
        stop_sequence=None,
        usage=Usage(input_tokens=100, output_tokens=50),
    )
@pytest.fixture
def mock_anthropic():
    """Patch ``anthropic.AsyncAnthropic`` and yield the mocked client instance.

    The mock's ``messages.create`` resolves to a fixed canned message, so
    tests can inspect call arguments without touching the real API.
    """
    with patch("anthropic.AsyncAnthropic") as patched_cls:
        fake_client = AsyncMock()
        fake_client.messages.create.return_value = create_mock_message("Mock response")
        patched_cls.return_value = fake_client
        yield fake_client
@pytest.mark.asyncio
async def test_summarize_calls_correct_model(mock_anthropic):
    """The summarizer must target the expected model when calling the API."""
    await summarize_text("Long text...")
    kwargs_used = mock_anthropic.messages.create.call_args.kwargs
    assert kwargs_used["model"] == "claude-opus-4-5"
Scenario-Based Tests
async def run_scenario_test(scenario: dict, llm_fn) -> dict:
    """Run *llm_fn* on one scenario and report which expectations failed.

    Recognized scenario keys:
      - "input": passed directly to llm_fn
      - "name": copied into the result
      - "expected_label" (optional): exact label the output must carry
      - "min_confidence" (optional): lower bound on the output's confidence

    Returns a dict of the form {"name": ..., "passed": bool, "failures": [...]}.
    """
    output = await llm_fn(scenario["input"])
    failures: list[str] = []

    if "expected_label" in scenario and output.get("label") != scenario["expected_label"]:
        failures.append(f"Label: expected {scenario['expected_label']}, got {output.get('label')}")

    if "min_confidence" in scenario:
        # Missing confidence is treated as 0, i.e. it always fails the bound.
        conf = output.get("confidence", 0)
        if conf < scenario["min_confidence"]:
            failures.append(f"Confidence too low: {conf} < {scenario['min_confidence']}")

    return {"name": scenario["name"], "passed": not failures, "failures": failures}
Prompt Regression Testing
import hashlib
import json
from pathlib import Path
class PromptRegressionTracker:
    """Persist baseline LLM outputs per prompt and detect quality regressions.

    Baselines are stored as JSON files named by a short hash of the prompt
    text, so any change to the prompt automatically gets a fresh baseline.
    """

    def __init__(self, baseline_dir: Path):
        """Create the tracker, ensuring *baseline_dir* exists."""
        self.baseline_dir = baseline_dir
        self.baseline_dir.mkdir(parents=True, exist_ok=True)

    def _prompt_hash(self, prompt: str) -> str:
        """Return a short stable id for *prompt*: first 8 hex chars of its SHA-256."""
        return hashlib.sha256(prompt.encode()).hexdigest()[:8]

    def save_baseline(self, prompt: str, outputs: list[str]) -> str:
        """Write *outputs* as the baseline for *prompt* and return the prompt id."""
        prompt_id = self._prompt_hash(prompt)
        baseline = {"prompt_hash": prompt_id, "outputs": outputs}
        path = self.baseline_dir / f"{prompt_id}.json"
        path.write_text(json.dumps(baseline, ensure_ascii=False, indent=2))
        return prompt_id

    def compare_to_baseline(
        self, prompt: str, new_outputs: list[str], evaluator, threshold: float = 0.7
    ) -> dict:
        """Score *new_outputs* against the stored baseline for *prompt*.

        ``evaluator(new, old)`` must return a similarity score; pairs beyond
        the shorter of the two output lists are ignored (zip semantics).
        *threshold* generalizes the previously hard-coded 0.7 cutoff: an
        average similarity below it is reported as a regression. The default
        preserves the original behavior.

        Returns ``{"status": "no_baseline"}`` when no baseline file exists,
        otherwise ``{"status": "ok" | "regression", "similarity_score": avg}``.
        Note: an empty comparison (no overlapping outputs) averages to 0 and
        is therefore reported as a regression, matching the original code.
        """
        prompt_id = self._prompt_hash(prompt)
        path = self.baseline_dir / f"{prompt_id}.json"
        if not path.exists():
            return {"status": "no_baseline"}
        baseline = json.loads(path.read_text())
        scores = [evaluator(new, old) for new, old in zip(new_outputs, baseline["outputs"])]
        avg_score = sum(scores) / len(scores) if scores else 0
        return {
            "status": "regression" if avg_score < threshold else "ok",
            "similarity_score": avg_score,
        }
LLM-as-Judge Evaluation
from anthropic import AsyncAnthropic
class LLMEvaluator:
    """LLM-as-judge: rates an answer against reference material using a cheap model."""

    def __init__(self):
        # No-arg constructor: the SDK reads ANTHROPIC_API_KEY from the
        # environment by convention — TODO confirm deployment provides it.
        self.client = AsyncAnthropic()

    async def evaluate_factuality(self, question: str, answer: str, reference: str) -> dict:
        """Ask the judge model to rate *answer* 1-5 on three criteria.

        Returns the parsed JSON dict, expected shape:
        {"factuality": N, "relevance": N, "completeness": N, "reason": "..."}.

        NOTE(review): json.loads raises ValueError if the model wraps the JSON
        in extra prose or a code fence — consider extracting the JSON span
        before parsing, or retrying on parse failure.
        """
        prompt = f"""Evaluate the following question, answer, and reference information.
Question: {question}
Answer: {answer}
Reference: {reference}
Rate 1-5 on each criterion and return as JSON:
- factuality: accuracy of facts (does it match reference?)
- relevance: relevance to the question
- completeness: completeness of answer
Format: {{"factuality": N, "relevance": N, "completeness": N, "reason": "..."}}"""
        response = await self.client.messages.create(
            model="claude-haiku-4-5",  # Use cheap model for evaluation
            max_tokens=256,
            messages=[{"role": "user", "content": prompt}],
        )
        # Assumes the first content block is the text block holding the JSON.
        return json.loads(response.content[0].text)
CI/CD Integration
# GitHub Actions workflow: run the LLM test tiers only when prompt or LLM
# integration code changes, to limit real-API cost.
name: LLM Integration Tests
on:
  push:
    # Only trigger on paths that can affect LLM behavior.
    paths: ['src/prompts/**', 'src/llm/**']
jobs:
  llm-tests:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      # Tier 1: fast unit tests against a mocked client — no real key needed,
      # but the SDK requires the variable to be set, hence the dummy value.
      - name: Run unit tests (mocked LLM)
        run: pytest tests/unit/ -v
        env:
          ANTHROPIC_API_KEY: "dummy"
      # Tier 2: scenario tests hit the real API; bounded by a per-test timeout.
      - name: Run prompt scenario tests
        run: pytest tests/scenarios/ -v --timeout=60
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
LLM testing won't be perfect, but building continuous quality monitoring systems lets you catch regressions from prompt changes or model updates early.
This article is from the Claude Code Complete Guide (7 chapters) on note.com.
myouga (@myougatheaxo) - VTuber axolotl. Sharing practical AI development tips.
Top comments (0)