Testing and Evaluating Claude Agents: A Production Guide

#testing #evaluation #evals #production

Originally published at claudeguide.io/claude-agent-testing-eval

Testing and Evaluating Claude Agents: A Production Guide

Most Claude agents ship without any automated tests — and most teams regret it after a prompt change silently breaks a production workflow. A complete agent testing strategy has three layers: unit tests for tool call logic, integration tests for multi-turn conversation flows, and an eval harness that measures output quality on a fixed dataset before every deploy. This guide covers the full testing stack for production Claude agents as of 2026.

Why Agent Testing Is Different

Standard software testing verifies deterministic behavior: input A always produces output B. Agent testing has a different challenge: LLM outputs are probabilistic. You can't assert exact string equality — you need to assert properties of the output.

The testing hierarchy for agents:

1. Unit tests: Test your tool implementations independently (deterministic, easy)
2. Integration tests: Test the full agent loop with mocked or real API calls (semi-deterministic)
3. Eval harness: Measure output quality on a representative dataset (probabilistic, scored)
4. Regression tests: Run the eval before every deploy, alert on quality drops (ongoing)

Layer 1: Unit Testing Tool Implementations

Tool implementations are regular functions — test them like any other code.

import pytest
from unittest.mock import patch, MagicMock
from your_agent.tools import search_database, format_invoice, validate_input


class TestSearchDatabaseTool:
    """Test the tool implementation independently of Claude.

    The tests that run a query patch ``your_agent.tools.db`` and script
    ``db.execute``'s return value, so no real database is needed.
    """

    def test_returns_results_for_valid_query(self):
        """A row returned by ``db.execute`` is passed back to the caller."""
        with patch("your_agent.tools.db") as mock_db:
            mock_db.execute.return_value = [
                {"id": 1, "name": "Test User", "email": "test@example.com"}
            ]
            result = search_database(query="test", limit=10)

        assert len(result) == 1
        assert result[0]["name"] == "Test User"

    def test_returns_empty_list_for_no_results(self):
        """An empty result set from ``db.execute`` yields an empty list."""
        with patch("your_agent.tools.db") as mock_db:
            mock_db.execute.return_value = []
            result = search_database(query="nonexistent", limit=10)

        assert result == []

    def test_raises_on_invalid_limit(self):
        """A negative ``limit`` is rejected with a ``ValueError``."""
        # NOTE(review): db is not patched here; presumably the limit check
        # runs before any query is issued - confirm against the tool.
        with pytest.raises(ValueError, match="limit must be positive"):
            search_database(query="test", limit=-1)

    def test_sanitizes_sql_injection_attempt(self):
        """Tool should handle malicious input gracefully."""
        # Patched like the other query tests: a unit test must never send a
        # DROP TABLE payload to a live database.
        with patch("your_agent.tools.db") as mock_db:
            mock_db.execute.return_value = []
            result = search_database(query="'; DROP TABLE users; --", limit=10)

        # Should not raise, should return empty or sanitized results
        assert isinstance(result, list)


class TestFormatInvoiceTool:
    """Checks on the value produced by ``format_invoice``."""

    def test_formats_standard_invoice(self):
        """A fully populated invoice renders the vendor and the amount."""
        line_item = {"description": "Consulting", "qty": 10, "price": 150.0}
        rendered = format_invoice(
            {
                "vendor": "Acme Corp",
                "amount": 1500.00,
                "date": "2026-04-28",
                "items": [line_item],
            }
        )

        assert "Acme Corp" in rendered
        # Either the currency-formatted amount or the bare digits is accepted.
        assert any(token in rendered for token in ("$1,500.00", "1500"))

    def test_handles_missing_optional_fields(self):
        """An invoice without an ``items`` entry still produces a result."""
        # Should not raise
        rendered = format_invoice(
            {"vendor": "Test", "amount": 100.0, "date": "2026-04-28"}
        )
        assert rendered is not None

Layer 2: Integration Tests for the Agent Loop

Integration tests verify that the agent orchestrates tools correctly across a multi-turn conversation. Use recorded responses or a mock client to make tests deterministic.

Approach A: Mock the Anthropic client

import anthropic
from unittest.mock import MagicMock, patch
from your_agent.agent import run_agent


def make_mock_response(text=None, tool_name=None, tool_input=None, stop_reason="end_turn"):
    """Build a MagicMock that stands in for an ``anthropic.Message``.

    The mock carries ``stop_reason``, a ``usage`` mock (100 input / 50 output
    tokens) and a one-element ``content`` list.

    Args:
        text: Text for the single text block; a falsy value becomes "Done.".
            Ignored when ``tool_name`` is given.
        tool_name: When truthy, the content is a single ``tool_use`` block
            with this name instead of a text block.
        tool_input: Input payload for the tool block; a falsy value becomes
            an empty dict.
        stop_reason: Value exposed as ``response.stop_reason``.

    Returns:
        The configured MagicMock.
    """
    block = MagicMock()
    if tool_name:
        block.type = "tool_use"
        # Assigned after construction: passing name= to the MagicMock
        # constructor would label the mock itself rather than set .name.
        block.name = tool_name
        block.id = "tool_abc123"
        block.input = tool_input if tool_input else {}
    else:
        block.type = "text"
        block.text = text if text else "Done."

    message = MagicMock()
    message.usage = MagicMock(input_tokens=100, output_tokens=50)
    message.stop_reason = stop_reason
    message.content = [block]
    return message


class TestAgentOrchestration:
    """Agent-loop tests that run against a mocked Anthropic client."""

    @staticmethod
    def _fake_client(anthropic_cls):
        """Make the patched ``Anthropic`` class return a fresh MagicMock client."""
        client = MagicMock()
        anthropic_cls.return_value = client
        return client

    @patch("your_agent.agent.anthropic.Anthropic")
    def test_agent_calls_search_tool_when_asked(self, mock_anthropic_class):
        """Agent should call search_database tool for search requests."""
        client = self._fake_client(mock_anthropic_class)

        # Scripted turns: first Claude decides to use the search tool,
        # then it synthesizes the result.
        tool_request = make_mock_response(
            tool_name="search_database",
            tool_input={"query": "active users", "limit": 10},
            stop_reason="tool_use",
        )
        final_answer = make_mock_response(
            text="I found 3 active users matching your query."
        )
        client.messages.create.side_effect = [tool_request, final_answer]

        rows = [
            {"id": 1, "name": "Alice"},
            {"id": 2, "name": "Bob"},
            {"id": 3, "name": "Carol"},
        ]
        with patch("your_agent.agent.search_database", return_value=rows) as search_stub:
            reply = run_agent("Find active users")

        # The tool must have been invoked exactly once with Claude's arguments.
        search_stub.assert_called_once_with(query="active users", limit=10)
        assert "found" in reply.lower() or "3" in reply

    @patch("your_agent.agent.anthropic.Anthropic")
    def test_agent_handles_tool_error_gracefully(self, mock_anthropic_class):
        """Agent should recover when a tool raises an exception."""
        client = self._fake_client(mock_anthropic_class)

        tool_request = make_mock_response(
            tool_name="search_database",
            tool_input={"query": "test"},
            stop_reason="tool_use",
        )
        apology = make_mock_response(
            text="I wasn't able to search the database. Please try again."
        )
        client.messages.create.side_effect = [tool_request, apology]

        with patch(
            "your_agent.agent.search_database",
            side_effect=Exception("Database connection failed"),
        ):
            reply = run_agent("Search for test")

        # Agent should respond gracefully, not crash
        assert reply is not None
        assert isinstance(reply, str)

    @patch("your_agent.agent.anthropic.Anthropic")
    def test_agent_stops_before_turn_limit(self, mock_anthropic_class):
        """Agent should not loop indefinitely."""
        client = self._fake_client(mock_anthropic_class)

        # Every API call answers with another tool_use request, so only the
        # agent's own turn limit can end the loop.
        client.messages.create.return_value = make_mock_response(
            tool_name="search_database",
            tool_input={"query": "loop"},
            stop_reason="tool_use",
        )

        with patch("your_agent.agent.search_database", return_value=[]):
            run_agent("Keep searching", max_turns=5)

        # Should stop at max_turns, not loop forever
        assert client.messages.create.call_count <= 5

Approach B: Record and replay real API responses


python
import json
from pathlib import Path


class RecordedAnthropicClient:
    """Client that records real API calls and can replay them."""

    def __init__(self, record_path: str, mode: str = "replay"):
        self.record_path = Path(record_path)
        self.mode = mode  # "record" or "replay"
        self._calls = []
        self._index = 0

        if mode == "replay" and self.record_path.exists():
            self._calls = json.loads(self.record_path.read_text())

    def messages_create(self, **kwargs):
        if self.mode == "record":
            import anthropic
            client = anthropic.Anthropic()
            response = client.messages.create(**kwargs)
            self._calls.append(response.model_dump())
            return response
        else:
            if self._index 

[→ Get the Agent SDK Cookbook — $49](https://shoutfirst.gumroad.com/l/ogxhmy?utm_source=claudeguide&utm_medium=article&utm_campaign=claude-agent-testing-eval)

*30-day money-back guarantee. Instant download.*