Question
This is the biggest single technical question you might have while using pydantic-ai:
Will pydantic-ai's `output_type` be respected on a round trip through Ollama's `/v1/chat/completions` endpoint, cleanly? If yes, then you can rely on structured output.
If it can't, then you have to resort to manual validation & enforcement in your application — post-response. But fortunately that is not the case 😅.
Stack
- llama3.2:1b
- Ollama 0.17.7
- Python: 3.12
Findings
Imagine you have something like the following:
from __future__ import annotations

from pydantic import BaseModel, Field
from pydantic_ai import Agent
from pydantic_ai.models.openai import OpenAIChatModel
from pydantic_ai.output import NativeOutput
from pydantic_ai.providers.openai import OpenAIProvider
# NOTE(review): pydantic is expected to surface the class docstring and every
# field `description` in the generated JSON schema, so treat them as prompt
# text — confirm before rewording any of them.
class WordExplanation(BaseModel):
    """Structured explanation of one word as it is used in a specific context."""

    # Required free-text fields.
    meaning: str = Field(
        description="Concise dictionary-style definition of the word as used in context.",
    )
    simplified_explanation: str = Field(
        description="Same idea expressed for a younger or non-native reader.",
    )

    # Optional lists: each falls back to a fresh empty list when not supplied.
    synonyms: list[str] = Field(
        description="Up to five words meaning roughly the same thing.",
        default_factory=list,
    )
    antonyms: list[str] = Field(
        description="Up to five words meaning roughly the opposite.",
        default_factory=list,
    )
# Point pydantic-ai's OpenAI provider at Ollama's OpenAI-compatible `/v1` route.
provider = OpenAIProvider(
    base_url="http://ollama:11434/v1",
    api_key="some-key",
)
model = OpenAIChatModel("llama3.2:1b", provider=provider)

# NOTE(review): `resolve_model_settings` and `settings` are not defined in this
# snippet — presumably application-level helpers; substitute your own model
# settings (or drop the argument) when trying this out.
# The agent is bound to a name so it can actually be used afterwards.
agent = Agent(
    model,
    # NativeOutput asks for the model's native JSON-schema structured output.
    output_type=NativeOutput(WordExplanation),
    model_settings=resolve_model_settings(settings),
)
`NativeOutput` works: pydantic-ai sends `response_format={"type": "json_schema", ...}`, Ollama's OpenAI-compatible layer honours it, and the run returned a valid `WordExplanation`.
Script I Used to Verify It
from __future__ import annotations
import asyncio
import os
import subprocess
import sys
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Any
import httpx
from pydantic import BaseModel, Field
from pydantic_ai import Agent
from pydantic_ai.models.openai import OpenAIChatModel
from pydantic_ai.output import NativeOutput, PromptedOutput, ToolOutput
from pydantic_ai.providers.openai import OpenAIProvider
# Endpoint, credentials and model; every value can be overridden via the environment.
LLM_BASE_URL = os.getenv("LLM_BASE_URL", "http://localhost:11434/v1")
LLM_API_KEY = os.getenv("LLM_API_KEY", "ollama")
LLM_MODEL = os.getenv("LLM_MODEL", "llama3.2:1b")

# The word under test and the sentence it appears in.
SPIKE_WORD = os.getenv("SPIKE_WORD", "ephemeral")
SPIKE_CONTEXT = os.getenv(
    "SPIKE_CONTEXT",
    "The graffiti was ephemeral, washed away by the first rain of the season.",
)

# Only "1", "true" or "yes" (case-insensitive) enable auto-start; anything else disables it.
SPIKE_AUTO_START = os.getenv("SPIKE_AUTO_START", "true").lower() in {"1", "true", "yes"}

# Two directory levels above this file; used as the working directory for `docker compose`.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# NOTE(review): pydantic is expected to surface the class docstring and every
# field `description` in the generated JSON schema, so treat them as prompt
# text — confirm before rewording any of them.
class WordExplanation(BaseModel):
    """Structured explanation of a single word in context."""

    # Required free-text fields.
    meaning: str = Field(
        description="Concise dictionary-style definition of the word as used in context.",
    )
    simplified_explanation: str = Field(
        description="Same idea expressed for a younger or non-native reader.",
    )

    # Optional lists: each falls back to a fresh empty list when not supplied.
    synonyms: list[str] = Field(
        description="Up to five words meaning roughly the same thing.",
        default_factory=list,
    )
    antonyms: list[str] = Field(
        description="Up to five words meaning roughly the opposite.",
        default_factory=list,
    )
# System prompt shared by every output mode; one sentence per instruction.
SYSTEM_PROMPT = (
    "You are a dictionary. "
    "Given a WORD and its CONTEXT, return the WordExplanation matching how "
    "the word is used in that specific sentence. "
    "Prefer short, plain language for `simplified_explanation`. "
    "Limit synonyms and antonyms to five each."
)
def build_user_prompt(word: str, context: str) -> str:
    """Render the user message: the word and its context on two labelled lines."""
    labelled_lines = (f"WORD: {word}", f"CONTEXT: {context}")
    return "\n".join(labelled_lines)
# ---------------------------------------------------------------------------
# Ollama plumbing
# ---------------------------------------------------------------------------
def _log(msg: str) -> None:
print(f"[spike] {msg}", flush=True)
def _endpoint_reachable(base_url: str, timeout: float = 2.0) -> bool:
    """Report whether Ollama's native ``/api/tags`` route answers with HTTP 200.

    Args:
        base_url: The OpenAI-style base URL, normally ending in ``/v1``.
        timeout: Timeout, in seconds, handed to ``httpx.get``.

    Returns:
        True on a 200 response; False on any other status or any ``httpx.HTTPError``.
    """
    # `base_url` carries the OpenAI `/v1` suffix, but the probe lives on the
    # native API root, so strip the suffix (and stray slashes) first.
    native_root = base_url.rstrip("/").removesuffix("/v1").rstrip("/")
    try:
        status = httpx.get(f"{native_root}/api/tags", timeout=timeout).status_code
    except httpx.HTTPError:
        return False
    return status == 200
def _wait_for_endpoint(base_url: str, deadline_s: float) -> None:
    """Poll the endpoint every two seconds until it answers or the deadline passes.

    Args:
        base_url: Base URL handed to ``_endpoint_reachable``.
        deadline_s: How long to keep polling, in seconds.

    Raises:
        TimeoutError: If the endpoint never became reachable within ``deadline_s``.
    """
    began = time.monotonic()
    while time.monotonic() - began < deadline_s:
        if not _endpoint_reachable(base_url):
            time.sleep(2)
            continue
        waited = time.monotonic() - began
        _log(f"Ollama reachable at {base_url} after {waited:.1f}s")
        return
    raise TimeoutError(f"Ollama did not become reachable at {base_url} within {deadline_s}s")
def _start_ollama_via_compose() -> None:
    """Best-effort ``docker compose up -d ollama`` from the project root.

    Raises:
        RuntimeError: If the ``docker`` command cannot be launched at all, or
            if it exits with a non-zero status.
    """
    _log("Ollama not reachable — running `docker compose up -d ollama`...")
    try:
        result = subprocess.run(
            ["docker", "compose", "up", "-d", "ollama"],
            cwd=PROJECT_ROOT,
            capture_output=True,
            text=True,
            check=False,  # the return code is inspected below so stderr can be logged first
        )
    except OSError as exc:
        # Raised before the command even runs, e.g. the docker binary is not
        # installed or PROJECT_ROOT does not exist. Report it the same way as
        # a failing compose run instead of leaking a raw OSError.
        _log(f"could not launch docker compose: {exc}")
        raise RuntimeError("Failed to start ollama via docker compose") from exc
    if result.returncode != 0:
        _log(f"docker compose stderr:\n{result.stderr}")
        raise RuntimeError("Failed to start ollama via docker compose")
    _log("docker compose returned successfully; waiting for the endpoint...")
def ensure_ollama_ready() -> None:
    """Make sure the Ollama endpoint answers, starting it via compose if allowed.

    Returns immediately when the endpoint is already reachable. Otherwise it
    starts the ``ollama`` compose service and waits for it to come up.

    Raises:
        RuntimeError: If the endpoint is down and ``SPIKE_AUTO_START`` is
            disabled, or if starting the compose service fails.
        TimeoutError: If the endpoint does not come up within the wait deadline.
    """
    if _endpoint_reachable(LLM_BASE_URL):
        _log(f"Ollama already reachable at {LLM_BASE_URL}")
        return
    if not SPIKE_AUTO_START:
        raise RuntimeError(
            f"Ollama not reachable at {LLM_BASE_URL} and SPIKE_AUTO_START=false. "
            "Start it manually with `docker compose up -d ollama`.",
        )
    _start_ollama_via_compose()
    # The compose ollama image pre-pulls the model at build time, so the
    # daemon only needs to boot — the 180s deadline is deliberately generous.
    _wait_for_endpoint(LLM_BASE_URL, deadline_s=180.0)
# ---------------------------------------------------------------------------
# Spike runs
# ---------------------------------------------------------------------------
@dataclass
class RunOutcome:
    """Result of one spike run against a single output mode."""

    # Human-readable name of the output mode that was exercised.
    label: str
    # True only when the run produced a WordExplanation.
    ok: bool
    # Wall-clock duration of the run, in seconds.
    latency_s: float
    # The validated model output on success, otherwise None.
    parsed: WordExplanation | None
    # Failure description on error, otherwise None.
    error: str | None
def _build_model() -> OpenAIChatModel:
    """Create a chat model bound to the configured OpenAI-compatible endpoint."""
    return OpenAIChatModel(
        LLM_MODEL,
        provider=OpenAIProvider(base_url=LLM_BASE_URL, api_key=LLM_API_KEY),
    )
async def _run_once(
    label: str,
    output_spec: Any,
    per_call_timeout_s: float = 120.0,
) -> RunOutcome:
    """Run the agent once with *output_spec* and report the result without raising.

    Args:
        label: Human-readable name of the output mode, used in logs and the outcome.
        output_spec: Value passed straight to ``Agent(output_type=...)``.
        per_call_timeout_s: Upper bound, in seconds, for the single ``agent.run`` call.

    Returns:
        A ``RunOutcome``. ``ok`` is True only when the run completed and its
        output is a ``WordExplanation``; timeouts, exceptions and unexpected
        output types are all reported as failed outcomes.
    """
    _log(f"--- Running mode: {label} ---")
    agent = Agent(
        _build_model(),
        output_type=output_spec,
        system_prompt=SYSTEM_PROMPT,
    )
    start = time.monotonic()

    def failure(latency: float, error: str) -> RunOutcome:
        # All failure paths share the same shape; only latency and message differ.
        return RunOutcome(label=label, ok=False, latency_s=latency, parsed=None, error=error)

    try:
        result = await asyncio.wait_for(
            agent.run(build_user_prompt(SPIKE_WORD, SPIKE_CONTEXT)),
            timeout=per_call_timeout_s,
        )
    except asyncio.TimeoutError:
        # `wait_for` raises asyncio.TimeoutError. It is an alias of the builtin
        # TimeoutError only on Python 3.11+, so name the asyncio class to keep
        # timeouts classified correctly on older interpreters too.
        latency = time.monotonic() - start
        _log(f" timed out after {latency:.1f}s")
        return failure(latency, f"timeout after {per_call_timeout_s:.0f}s")
    except Exception as exc:
        # Deliberately broad: the spike wants a verdict per mode, not a crash.
        latency = time.monotonic() - start
        _log(f" failed after {latency:.1f}s: {type(exc).__name__}")
        return failure(latency, f"{type(exc).__name__}: {exc}")

    latency = time.monotonic() - start
    parsed = result.output
    if not isinstance(parsed, WordExplanation):
        _log(f" completed but returned {type(parsed).__name__}")
        return failure(latency, f"unexpected output type: {type(parsed).__name__}")

    _log(f" ok in {latency:.1f}s")
    return RunOutcome(
        label=label,
        ok=True,
        latency_s=latency,
        parsed=parsed,
        error=None,
    )
async def run_all() -> list[RunOutcome]:
    """Run the spike once per output mode, one after another, in a fixed order."""
    # (label, output wrapper) pairs; each wrapper is applied just before its run.
    modes = (
        ("ToolOutput (default)", ToolOutput),
        ("NativeOutput (JSON schema)", NativeOutput),
        ("PromptedOutput (fallback)", PromptedOutput),
    )
    return [await _run_once(label, wrap(WordExplanation)) for label, wrap in modes]
def _print_outcome(outcome: RunOutcome) -> None:
header = f"[{outcome.label}] {'OK' if outcome.ok else 'FAIL'} ({outcome.latency_s:.2f}s)"
print(header)
if outcome.ok and outcome.parsed is not None:
for field in ("meaning", "simplified_explanation"):
print(f" {field:<24} {getattr(outcome.parsed, field)}")
print(f" synonyms {outcome.parsed.synonyms}")
print(f" antonyms {outcome.parsed.antonyms}")
else:
print(f" error: {outcome.error}")
print()
def main() -> int:
    """Run the whole spike and return a process exit code.

    Returns:
        0 when at least one output mode produced a valid result, otherwise 1.
    """
    # Echo the effective configuration before doing anything slow.
    banner = (
        f"Target model : {LLM_MODEL}",
        f"Target endpoint: {LLM_BASE_URL}",
        f"Prompt word : {SPIKE_WORD!r}",
        f"Prompt context: {SPIKE_CONTEXT!r}",
    )
    for line in banner:
        _log(line)
    print()

    ensure_ollama_ready()
    outcomes = asyncio.run(run_all())

    rule = "=" * 72
    print(rule)
    print("SPIKE RESULTS")
    print(rule)
    for outcome in outcomes:
        _print_outcome(outcome)

    if any(o.ok for o in outcomes):
        print("VERDICT:", "at least one mode works")
        return 0
    print("VERDICT:", "no mode worked — fallback needed")
    return 1
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
And the compose.yml:
services:
  ollama:
    # Built from ./ollama/Dockerfile; the model to bake in is passed as a build arg.
    build:
      context: ollama
      dockerfile: Dockerfile
      args:
        OLLAMA_MODEL: ${OLLAMA_MODEL:-llama3.2:1b}
    ports:
      # Expose the OpenAI-compatible endpoint on the host for host-run scripts
      # (e.g. scripts/spike_pydantic_ai_ollama.py).
      - "11434:11434"
    healthcheck:
      test:
        - CMD-SHELL
        - /usr/local/bin/healthcheck.sh
      interval: 30s
      timeout: 5s
      retries: 5
      start_period: 45s
You also need to create an `ollama/Dockerfile`:
# Ollama image with the model baked in at build time (see the prepull-model.sh
# step below), so developers do not have to download the model over and over again.
FROM ollama/ollama:0.17.7
# Model to pre-pull. Overridable as a build arg (compose.yml passes OLLAMA_MODEL)
# and re-exported as ENV so prepull-model.sh and healthcheck.sh can read it.
ARG OLLAMA_MODEL=llama3.2:1b
ENV OLLAMA_MODEL=${OLLAMA_MODEL}
# CAUTION: 0.0.0.0 means Ollama listens on every network interface, so the
# container is reachable from a sibling container. healthcheck.sh and
# prepull-model.sh also read OLLAMA_HOST / OLLAMA_PORT as the address they curl.
ENV OLLAMA_HOST=0.0.0.0
ENV OLLAMA_PORT=11434
# curl is used by the build-time readiness check and by the runtime healthcheck;
# the apt package lists are removed in the same layer.
RUN apt-get update && \
apt-get install -y --no-install-recommends ca-certificates curl && \
rm -rf /var/lib/apt/lists/*
# Start a temporary daemon during the build, pull the model, then stop the daemon.
COPY --chmod=755 prepull-model.sh /usr/local/bin/prepull-model.sh
RUN /usr/local/bin/prepull-model.sh
# Same probe script and timings as the healthcheck declared in compose.yml.
COPY --chmod=755 healthcheck.sh /usr/local/bin/healthcheck.sh
HEALTHCHECK --interval=30s --timeout=5s --retries=5 --start-period=45s CMD ["/usr/local/bin/healthcheck.sh"]
# IMPORTANT: prepull-model.sh stops the daemon when it's done. We start a fresh one here so the container becomes usable.
# NOTE(review): assumes the base image's ENTRYPOINT is the `ollama` binary, so this runs `ollama serve` — confirm.
CMD ["serve"]
And ollama/healthcheck.sh:
#!/bin/sh
# Container healthcheck for the Ollama image.
# Exits 0 only when the API answers, the configured model is listed locally,
# and /api/generate returns HTTP 200; any other outcome exits non-zero.
set -eu

HOST="${OLLAMA_HOST:-127.0.0.1}"
PORT="${OLLAMA_PORT:-11434}"
MODEL="${OLLAMA_MODEL:-llama3.2:1b}"

# 1) API up?
if ! curl -fsS "http://${HOST}:${PORT}/api/version" >/dev/null; then
  exit 1
fi

# 2) Model registered locally?
#    -F matches the name literally; without it the '.' in e.g. "llama3.2:1b"
#    is a regex wildcard and could match a different model name.
if ! curl -fsS "http://${HOST}:${PORT}/api/tags" | grep -qF "\"name\":\"${MODEL}\""; then
  exit 1
fi

# 3) /api/generate usable? The request is kept as small as possible
#    (num_predict=1 caps generation at a single token), but note that it
#    still asks the model to generate, so it is not a purely passive probe.
CODE="$(curl -sS -o /dev/null -w '%{http_code}' \
  -H 'Content-Type: application/json' \
  -X POST "http://${HOST}:${PORT}/api/generate" \
  -d "{\"model\":\"${MODEL}\",\"prompt\":\"ping\",\"stream\":false,\"options\":{\"num_predict\":1}}")"

# The script's exit status is the result of this final comparison.
[ "$CODE" -eq 200 ]
Lastly ollama/prepull-model.sh:
#!/bin/sh
# Build-time helper: start a temporary Ollama daemon, wait until its API
# answers, pull the configured model, then stop the daemon again.
set -eu

HOST="${OLLAMA_HOST:-127.0.0.1}"
PORT="${OLLAMA_PORT:-11434}"
MODEL="${OLLAMA_MODEL:-llama3.2:1b}"
# Readiness polling: number of attempts and seconds to sleep between them.
READY_TRIES="${OLLAMA_READY_TRIES:-60}"
READY_SLEEP="${OLLAMA_READY_SLEEP:-1}"

# Log to stderr so messages stay separate from command output.
log() { printf '%s\n' "$*" >&2; }

# Stop the temporary daemon if it was started and is still running.
cleanup() {
  if [ -n "${PID:-}" ] && kill -0 "$PID" 2>/dev/null; then
    kill -TERM "$PID" 2>/dev/null || true
    wait "$PID" 2>/dev/null || true
  fi
}
# cleanup runs from the EXIT trap only. INT/TERM are turned into an explicit
# exit: a handler that merely returns would let the script resume after the
# signal with the daemon already killed.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

log "Starting ollama daemon for build-time model pull..."
ollama serve >/tmp/ollama-build.log 2>&1 &
PID=$!

i=0
while [ "$i" -lt "$READY_TRIES" ]; do
  if curl -fsS "http://${HOST}:${PORT}/api/version" >/dev/null; then
    break
  fi
  # The daemon has already exited: stop polling, the check below reports it.
  if ! kill -0 "$PID" 2>/dev/null; then
    break
  fi
  i=$((i+1))
  sleep "$READY_SLEEP"
done

# Final verdict, whether the loop ended by success, exhaustion or daemon death.
if ! curl -fsS "http://${HOST}:${PORT}/api/version" >/dev/null; then
  log "Ollama daemon did not become ready during build."
  log "Last log lines:"; tail -n 120 /tmp/ollama-build.log || true
  exit 1
fi

log "Daemon is ready. Pulling model: ${MODEL}"
ollama pull "${MODEL}"
log "Listing models:"; ollama list || true
log "Done."
Top comments (0)