DEV Community

Nebula
Nebula

Posted on

How to Test AI Agent Tool Calls with Pytest

Your AI agent calls the right tool in development. Then it picks the wrong one in production, sends a Slack message instead of querying your database, and you have no idea why.

The problem: LLM responses are non-deterministic. You can't write a traditional test that says "given this input, expect this exact output." So most developers skip testing their agents entirely.

The fix: don't test the LLM. Mock it. Test everything around it — the tool routing, the argument extraction, the result handling — deterministically with pytest.

Here's how in under 5 minutes.

The Agent You're Testing

Let's say you have a simple agent that takes a user message, sends it to an LLM with a list of tools, and executes whichever tool the LLM picks:

# agent.py
import json
from openai import OpenAI

# Tool registry: maps each tool name the model may select to its
# implementation. Keyword-argument names must match the schema properties,
# because run_agent invokes these with **kwargs parsed from the LLM output.
def _search_docs(query):
    return f"Results for: {query}"


def _create_ticket(title, priority="medium"):
    return f"Ticket created: {title} [{priority}]"


def _send_notification(message, channel):
    return f"Sent to {channel}: {message}"


TOOLS = {
    "search_docs": _search_docs,
    "create_ticket": _create_ticket,
    "send_notification": _send_notification,
}

# JSON-schema tool definitions passed to the model via the `tools` parameter.
# NOTE(review): send_notification exists in TOOLS but has no schema here, so
# the model cannot select it — confirm whether that is intentional.
TOOL_SCHEMAS = [
    {
        "type": "function",
        "function": {
            "name": "search_docs",
            "description": "Search the documentation",
            "parameters": {
                "type": "object",
                "properties": {"query": {"type": "string"}},
                "required": ["query"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "create_ticket",
            "description": "Create a support ticket",
            "parameters": {
                "type": "object",
                "properties": {
                    "title": {"type": "string"},
                    # priority is optional (absent from `required`); the
                    # implementation in TOOLS defaults it to "medium".
                    "priority": {"type": "string", "enum": ["low", "medium", "high"]},
                },
                "required": ["title"],
            },
        },
    },
]


def run_agent(user_message: str, client: OpenAI | None = None) -> str:
    """Send *user_message* to the LLM and execute whichever tool it selects.

    Args:
        user_message: Raw user input, forwarded as a single user message.
        client: Optional OpenAI client; a default one is constructed when
            omitted. Injecting a mock here is the seam used by the tests.

    Returns:
        The selected tool's result string, the model's plain-text reply when
        no tool was chosen, or a human-readable error string for unknown
        tools and malformed/mismatched arguments.
    """
    client = client or OpenAI()
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": user_message}],
        tools=TOOL_SCHEMAS,
        tool_choice="auto",
    )

    message = response.choices[0].message

    # No tool selected: return the plain-text answer ("" if content is None).
    if not message.tool_calls:
        return message.content or ""

    # Only the first tool call is executed; parallel calls are ignored here.
    tool_call = message.tool_calls[0]
    func_name = tool_call.function.name

    # Check the tool name before parsing arguments, so an unknown tool with
    # garbage arguments still yields the "Unknown tool" message.
    if func_name not in TOOLS:
        return f"Unknown tool: {func_name}"

    # The model emits arguments as a JSON string, which can be malformed.
    try:
        func_args = json.loads(tool_call.function.arguments)
    except json.JSONDecodeError:
        return f"Invalid arguments for tool: {func_name}"

    # Guard against hallucinated or missing parameters: a **kwargs mismatch
    # raises TypeError, which would otherwise crash the agent.
    try:
        return TOOLS[func_name](**func_args)
    except TypeError:
        return f"Bad arguments for tool: {func_name}"
Enter fullscreen mode Exit fullscreen mode

Nothing fancy. No framework. Just a function that calls OpenAI, checks if a tool was selected, and runs it.

The question is: how do you test that run_agent("search for auth docs") actually calls search_docs with the right query — without making a real API call?

Mock the LLM, Test the Routing

The trick: create a mock that returns a fake OpenAI response with the exact tool call you want to test. Then verify your agent handles it correctly.

# test_agent.py
import json
import pytest
from unittest.mock import MagicMock, patch
from agent import run_agent


def make_tool_call_response(tool_name: str, arguments: dict):
    """Build a fake OpenAI response whose first choice selects *tool_name*."""
    # 'name' is reserved in the MagicMock constructor, so assign it afterwards.
    fake_function = MagicMock()
    fake_function.name = tool_name
    fake_function.arguments = json.dumps(arguments)

    fake_message = MagicMock(content=None)
    fake_message.tool_calls = [MagicMock(function=fake_function)]

    fake_response = MagicMock()
    fake_response.choices = [MagicMock(message=fake_message)]
    return fake_response


def make_text_response(content: str):
    """Build a fake OpenAI response carrying plain text and no tool calls."""
    fake_message = MagicMock(content=content)
    fake_message.tool_calls = None

    fake_response = MagicMock()
    fake_response.choices = [MagicMock(message=fake_message)]
    return fake_response
Enter fullscreen mode Exit fullscreen mode

Two helper functions. make_tool_call_response simulates the LLM choosing a tool. make_text_response simulates a plain answer. That's your entire test infrastructure.

Write Your First Tool Call Test

Now test that your agent routes to search_docs with the right arguments:

def test_agent_routes_to_search():
    # Simulate the LLM choosing search_docs, then check the routed result.
    fake_llm = MagicMock()
    fake_llm.chat.completions.create.return_value = make_tool_call_response(
        "search_docs", {"query": "authentication guide"}
    )

    output = run_agent("find docs about authentication", client=fake_llm)

    assert output == "Results for: authentication guide"
    fake_llm.chat.completions.create.assert_called_once()
Enter fullscreen mode Exit fullscreen mode

This test is deterministic. It runs in milliseconds. It costs zero API calls. And it proves that when the LLM selects search_docs with {"query": "authentication guide"}, your agent executes it correctly and returns the right result.

Test Every Routing Path

One test isn't enough. Cover the paths that matter:

def test_agent_routes_to_create_ticket():
    # LLM picks create_ticket with an explicit priority argument.
    fake_llm = MagicMock()
    fake_llm.chat.completions.create.return_value = make_tool_call_response(
        "create_ticket", {"title": "Login broken", "priority": "high"}
    )

    output = run_agent("create a high priority ticket for login broken", client=fake_llm)

    assert output == "Ticket created: Login broken [high]"


def test_agent_handles_text_response():
    # No tool call: the agent should pass the model's text straight through.
    fake_llm = MagicMock()
    fake_llm.chat.completions.create.return_value = make_text_response(
        "I can help you with that. What would you like to search for?"
    )

    output = run_agent("hello", client=fake_llm)

    assert "help you" in output


def test_agent_handles_unknown_tool():
    # A tool name outside the registry must produce an error string, not a crash.
    fake_llm = MagicMock()
    fake_llm.chat.completions.create.return_value = make_tool_call_response(
        "nonexistent_tool", {"arg": "value"}
    )

    output = run_agent("do something weird", client=fake_llm)

    assert output == "Unknown tool: nonexistent_tool"
Enter fullscreen mode Exit fullscreen mode

Three scenarios, three deterministic tests:

  1. Correct tool with arguments — verifies the happy path
  2. No tool call — verifies your agent handles plain text gracefully
  3. Unknown tool — verifies your agent doesn't crash on unexpected tool names

Run them:

$ pytest test_agent.py -v

test_agent.py::test_agent_routes_to_search PASSED
test_agent.py::test_agent_routes_to_create_ticket PASSED
test_agent.py::test_agent_handles_text_response PASSED
test_agent.py::test_agent_handles_unknown_tool PASSED

4 passed in 0.03s
Enter fullscreen mode Exit fullscreen mode

Four tests, 30 milliseconds, zero dollars.

Parametrize for Coverage

If you have many tools, use pytest.mark.parametrize to avoid writing repetitive test functions:

@pytest.mark.parametrize("tool_name, args, expected", [
    ("search_docs", {"query": "API reference"}, "Results for: API reference"),
    ("create_ticket", {"title": "Bug"}, "Ticket created: Bug [medium]"),
    ("create_ticket", {"title": "Outage", "priority": "high"}, "Ticket created: Outage [high]"),
])
def test_agent_tool_routing(tool_name, args, expected):
    # Each row simulates one LLM tool selection and checks the routed result.
    fake_llm = MagicMock()
    fake_llm.chat.completions.create.return_value = make_tool_call_response(tool_name, args)

    assert run_agent("test input", client=fake_llm) == expected
Enter fullscreen mode Exit fullscreen mode

Three tool-argument combinations tested with one function. Add rows to the list as you add tools.

What You're Actually Testing

Let's be clear about what this approach covers and what it doesn't:

| What this tests | What this doesn't test |
| --- | --- |
| Tool routing logic | Whether the LLM picks the right tool |
| Argument parsing from JSON | Prompt quality or tool descriptions |
| Error handling for bad tool names | LLM reasoning or accuracy |
| Return value formatting | Non-deterministic LLM behavior |

This is the right split. The deterministic parts (routing, parsing, error handling) belong in unit tests. The non-deterministic parts (does the LLM pick the right tool?) belong in evals — a separate concern with different tooling.

Your unit tests catch code bugs. Evals catch prompt bugs. Don't mix them.

The Full Test File

Here's everything in one copy-paste block:

# test_agent.py
import json
import pytest
from unittest.mock import MagicMock
from agent import run_agent


def make_tool_call_response(tool_name: str, arguments: dict):
    """Fake OpenAI response whose first choice selects *tool_name*."""
    func = MagicMock()
    func.name = tool_name  # 'name' is reserved in the MagicMock constructor
    func.arguments = json.dumps(arguments)
    msg = MagicMock(content=None)
    msg.tool_calls = [MagicMock(function=func)]
    resp = MagicMock()
    resp.choices = [MagicMock(message=msg)]
    return resp


def make_text_response(content: str):
    """Fake OpenAI response carrying plain text and no tool calls."""
    msg = MagicMock(content=content)
    msg.tool_calls = None
    resp = MagicMock()
    resp.choices = [MagicMock(message=msg)]
    return resp


def test_agent_routes_to_search():
    # Happy path: search_docs is selected and executed with its query.
    fake_llm = MagicMock()
    fake_llm.chat.completions.create.return_value = make_tool_call_response(
        "search_docs", {"query": "authentication guide"}
    )
    assert run_agent("find docs about auth", client=fake_llm) == "Results for: authentication guide"


def test_agent_routes_to_create_ticket():
    # Routing with both required and optional arguments supplied.
    fake_llm = MagicMock()
    fake_llm.chat.completions.create.return_value = make_tool_call_response(
        "create_ticket", {"title": "Login broken", "priority": "high"}
    )
    assert run_agent("ticket for login broken", client=fake_llm) == "Ticket created: Login broken [high]"


def test_agent_handles_text_response():
    # Plain-text reply with no tool call is returned unchanged.
    fake_llm = MagicMock()
    fake_llm.chat.completions.create.return_value = make_text_response("How can I help?")
    assert run_agent("hello", client=fake_llm) == "How can I help?"


def test_agent_handles_unknown_tool():
    # A tool name outside the registry yields an error string, not a crash.
    fake_llm = MagicMock()
    fake_llm.chat.completions.create.return_value = make_tool_call_response("nonexistent_tool", {})
    assert run_agent("do something", client=fake_llm) == "Unknown tool: nonexistent_tool"


@pytest.mark.parametrize("tool_name, args, expected", [
    ("search_docs", {"query": "API ref"}, "Results for: API ref"),
    ("create_ticket", {"title": "Bug"}, "Ticket created: Bug [medium]"),
    ("create_ticket", {"title": "Outage", "priority": "high"}, "Ticket created: Outage [high]"),
])
def test_tool_routing_parametrized(tool_name, args, expected):
    # One row per tool/argument combination; extend the list as tools grow.
    fake_llm = MagicMock()
    fake_llm.chat.completions.create.return_value = make_tool_call_response(tool_name, args)
    assert run_agent("test", client=fake_llm) == expected
Enter fullscreen mode Exit fullscreen mode

No frameworks. No API keys. No flaky tests. Just pytest and unittest.mock.

The pattern works for any agent, regardless of which LLM provider you use — swap out the OpenAI mock structure for Anthropic or any other SDK. The principle is the same: mock the non-deterministic layer, test the deterministic code around it.

Tools like Nebula handle tool routing and retry logic for you automatically, but when you're building custom agents, this testing pattern is the fastest way to catch routing bugs before they hit production.


This is part of the AI Agent Quick Tips series — short, practical tutorials for building production AI agents.

Top comments (0)