Originally published at claudeguide.io/claude-agent-testing-eval
Testing and Evaluating Claude Agents: A Production Guide
Most Claude agents ship without any automated tests — and most teams regret it after a prompt change silently breaks a production workflow. In 2026, a complete agent testing strategy has three layers: unit tests for tool call logic, integration tests for multi-turn conversation flows, and an eval harness that measures output quality on a fixed dataset before every deploy. This guide covers the full testing stack for production Claude agents.
Why Agent Testing Is Different
Standard software testing verifies deterministic behavior: input A always produces output B. Agent testing has a different challenge: LLM outputs are probabilistic. You can't assert exact string equality — you need to assert properties of the output.
The testing hierarchy for agents (the three layers, plus regression runs that reuse the eval harness):
- Unit tests: Test your tool implementations independently (deterministic, easy)
- Integration tests: Test the full agent loop with mocked or real API calls (semi-deterministic)
- Eval harness: Measure output quality on a representative dataset (probabilistic, scored)
- Regression tests: Run the eval before every deploy, alert on quality drops (ongoing)
Layer 1: Unit Testing Tool Implementations
Tool implementations are regular functions — test them like any other code.
import pytest
from unittest.mock import patch, MagicMock
from your_agent.tools import search_database, format_invoice, validate_input
class TestSearchDatabaseTool:
    """Test the tool implementation independently of Claude.

    Every test patches ``your_agent.tools.db`` so that no test in this class
    can reach a real database.
    """

    def test_returns_results_for_valid_query(self):
        """Rows returned by the patched ``db.execute`` are expected back."""
        with patch("your_agent.tools.db") as mock_db:
            mock_db.execute.return_value = [
                {"id": 1, "name": "Test User", "email": "test@example.com"}
            ]
            result = search_database(query="test", limit=10)

        assert len(result) == 1
        assert result[0]["name"] == "Test User"

    def test_returns_empty_list_for_no_results(self):
        """An empty result set from the database yields an empty list."""
        with patch("your_agent.tools.db") as mock_db:
            mock_db.execute.return_value = []
            result = search_database(query="nonexistent", limit=10)

        assert result == []

    def test_raises_on_invalid_limit(self):
        """A non-positive ``limit`` is rejected with ``ValueError``."""
        # Patched for isolation only: the test must not depend on a live
        # database even if validation happens after the DB handle is touched.
        with patch("your_agent.tools.db"):
            with pytest.raises(ValueError, match="limit must be positive"):
                search_database(query="test", limit=-1)

    def test_sanitizes_sql_injection_attempt(self):
        """Tool should handle malicious input gracefully."""
        # Patch the DB handle: a unit test must never send a DROP TABLE
        # payload to a real database.
        with patch("your_agent.tools.db") as mock_db:
            mock_db.execute.return_value = []
            result = search_database(query="'; DROP TABLE users; --", limit=10)

        # Should not raise, should return empty or sanitized results
        assert isinstance(result, list)
        # NOTE(review): to actually prove sanitization, also inspect
        # mock_db.execute.call_args and check the payload is passed as a bound
        # parameter rather than interpolated -- depends on the db API in use.
class TestFormatInvoiceTool:
    """Unit tests for the ``format_invoice`` tool."""

    def test_formats_standard_invoice(self):
        """A fully populated invoice should mention its vendor and amount."""
        line_item = {"description": "Consulting", "qty": 10, "price": 150.0}
        payload = {
            "vendor": "Acme Corp",
            "amount": 1500.00,
            "date": "2026-04-28",
            "items": [line_item],
        }

        rendered = format_invoice(payload)

        assert "Acme Corp" in rendered
        # Accept either the currency-formatted or the raw amount.
        assert any(amount in rendered for amount in ("$1,500.00", "1500"))

    def test_handles_missing_optional_fields(self):
        """An invoice without ``items`` should still format without raising."""
        rendered = format_invoice(
            {"vendor": "Test", "amount": 100.0, "date": "2026-04-28"}
        )

        assert rendered is not None
Layer 2: Integration Tests for the Agent Loop
Integration tests verify that the agent orchestrates tools correctly across a multi-turn conversation. Use recorded responses or a mock client to make tests deterministic.
Approach A: Mock the Anthropic client
import anthropic
from unittest.mock import MagicMock, patch
from your_agent.agent import run_agent
def make_mock_response(text=None, tool_name=None, tool_input=None, stop_reason="end_turn"):
    """Build a mock anthropic.Message object.

    The response's ``content`` list holds a text block, a tool_use block, or
    both (text first), mirroring a turn where the model writes some text and
    then requests a tool call.

    Args:
        text: Text for the ``text`` content block. With no ``tool_name`` and
            no text, the block falls back to ``"Done."``.
        tool_name: When given, a ``tool_use`` block for this tool is included.
        tool_input: Arguments of the tool call; defaults to ``{}``.
        stop_reason: Value stored on ``response.stop_reason``. It is not
            derived from the content, so pass ``"tool_use"`` explicitly for
            tool-call turns.

    Returns:
        A ``MagicMock`` with ``stop_reason``, ``usage`` and ``content`` set.
    """
    response = MagicMock()
    response.stop_reason = stop_reason
    response.usage = MagicMock(input_tokens=100, output_tokens=50)

    content = []

    # A text block is emitted for text-only responses (with the "Done."
    # fallback) and, when text is supplied, ahead of the tool_use block
    # instead of being silently dropped.
    if text or not tool_name:
        text_block = MagicMock()
        text_block.type = "text"
        text_block.text = text or "Done."
        content.append(text_block)

    if tool_name:
        tool_block = MagicMock()
        tool_block.type = "tool_use"
        # Assigned after construction: MagicMock(name=...) would name the
        # mock itself rather than set a ``name`` attribute.
        tool_block.name = tool_name
        tool_block.id = "tool_abc123"
        tool_block.input = tool_input or {}
        content.append(tool_block)

    response.content = content
    return response
class TestAgentOrchestration:
    """Integration tests for the agent loop against a mocked Anthropic client.

    Each test patches ``your_agent.agent.anthropic.Anthropic`` so the client
    is a ``MagicMock`` whose ``messages.create`` replays scripted responses,
    and patches ``your_agent.agent.search_database`` to control the tool.
    """

    @patch("your_agent.agent.anthropic.Anthropic")
    def test_agent_calls_search_tool_when_asked(self, mock_anthropic_class):
        """Agent should call search_database tool for search requests."""
        mock_client = MagicMock()
        mock_anthropic_class.return_value = mock_client

        # First call: Claude decides to use search tool
        # Second call: Claude synthesizes the result
        mock_client.messages.create.side_effect = [
            make_mock_response(
                tool_name="search_database",
                tool_input={"query": "active users", "limit": 10},
                stop_reason="tool_use",
            ),
            make_mock_response(text="I found 3 active users matching your query."),
        ]

        with patch("your_agent.agent.search_database") as mock_search:
            mock_search.return_value = [
                {"id": 1, "name": "Alice"},
                {"id": 2, "name": "Bob"},
                {"id": 3, "name": "Carol"},
            ]
            result = run_agent("Find active users")

        # Verify search was called
        mock_search.assert_called_once_with(query="active users", limit=10)
        assert "found" in result.lower() or "3" in result

    @patch("your_agent.agent.anthropic.Anthropic")
    def test_agent_handles_tool_error_gracefully(self, mock_anthropic_class):
        """Agent should recover when a tool raises an exception."""
        mock_client = MagicMock()
        mock_anthropic_class.return_value = mock_client

        mock_client.messages.create.side_effect = [
            make_mock_response(
                tool_name="search_database",
                tool_input={"query": "test"},
                stop_reason="tool_use",
            ),
            make_mock_response(text="I wasn't able to search the database. Please try again."),
        ]

        with patch("your_agent.agent.search_database") as mock_search:
            mock_search.side_effect = Exception("Database connection failed")
            result = run_agent("Search for test")

        # The failing tool must actually have been invoked; otherwise the
        # error-handling path was never exercised and the test proves nothing.
        mock_search.assert_called()
        # Agent should respond gracefully, not crash
        assert result is not None
        assert isinstance(result, str)

    @patch("your_agent.agent.anthropic.Anthropic")
    def test_agent_stops_before_turn_limit(self, mock_anthropic_class):
        """Agent should not loop indefinitely."""
        mock_client = MagicMock()
        mock_anthropic_class.return_value = mock_client

        # Return tool_use indefinitely
        mock_client.messages.create.return_value = make_mock_response(
            tool_name="search_database",
            tool_input={"query": "loop"},
            stop_reason="tool_use",
        )

        with patch("your_agent.agent.search_database", return_value=[]):
            run_agent("Keep searching", max_turns=5)

        # Should stop at max_turns, not loop forever. The lower bound keeps
        # the check from passing vacuously if the agent never calls the API.
        assert 1 <= mock_client.messages.create.call_count <= 5
Approach B: Record and replay real API responses
python
import json
from pathlib import Path
class RecordedAnthropicClient:
"""Client that records real API calls and can replay them."""
def __init__(self, record_path: str, mode: str = "replay"):
self.record_path = Path(record_path)
self.mode = mode # "record" or "replay"
self._calls = []
self._index = 0
if mode == "replay" and self.record_path.exists():
self._calls = json.loads(self.record_path.read_text())
def messages_create(self, **kwargs):
if self.mode == "record":
import anthropic
client = anthropic.Anthropic()
response = client.messages.create(**kwargs)
self._calls.append(response.model_dump())
return response
else:
if self._index
[→ Get the Agent SDK Cookbook — $49](https://shoutfirst.gumroad.com/l/ogxhmy?utm_source=claudeguide&utm_medium=article&utm_campaign=claude-agent-testing-eval)
*30-day money-back guarantee. Instant download.*
Top comments (0)