This article was originally published on AI Study Room. For the full version with working code examples and related articles, visit the original post.
AI API Gateway: Load Balancing, Fallback, Cost Tracking, Observability
Introduction
As organizations adopt multiple LLM providers (Anthropic, OpenAI, Google, self-hosted open-source models), calling each provider's SDK directly from application code becomes unsustainable. An AI API gateway provides a unified interface for routing requests across providers, handling failures, tracking costs, and monitoring usage. This article covers the design of a production-grade AI gateway.
Unified API Layer
The gateway presents a single API that abstracts provider differences:
import asyncio
import time
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Optional
@dataclass
class LLMRequest:
    """Provider-agnostic description of a single completion request."""

    # Model identifier; the provider adapters pass it to the SDK unchanged.
    model: str
    # Chat messages; forwarded as-is as the SDK's `messages` argument.
    messages: list[dict]
    # Upper bound on generated tokens, forwarded as `max_tokens`.
    max_tokens: int = 1024
    # Sampling temperature, forwarded as `temperature`.
    temperature: float = 0.7
    # NOTE(review): the Anthropic/OpenAI adapters in this article do not
    # forward this flag to the SDK call.
    stream: bool = False
@dataclass
class LLMResponse:
    """Normalized result of a completion, independent of the provider."""

    # Generated text extracted from the provider's response object.
    content: str
    # Model name, echoed from the originating request.
    model: str
    # Short provider tag set by the adapter (e.g. "anthropic", "openai").
    provider: str
    # Elapsed time of the provider call, in milliseconds.
    latency_ms: float
    # Prompt-side token count reported by the provider's usage data.
    tokens_in: int
    # Completion-side token count reported by the provider's usage data.
    tokens_out: int
    # Cost computed by the adapter from the usage data.
    # NOTE(review): currency/units are not specified in this excerpt.
    cost: float
class LLMProvider(ABC):
    """Interface every provider adapter behind the gateway must implement."""

    @abstractmethod
    async def complete(self, request: LLMRequest) -> LLMResponse:
        """Send `request` to the provider and return the normalized response."""

    @abstractmethod
    def get_cost_per_token(self, model: str) -> tuple[float, float]:
        """Return a pair of per-token prices for `model`.

        The load balancer's cheapest-first strategy orders providers by the
        first element of this pair.
        NOTE(review): presumably (input price, output price) -- confirm.
        """
class AnthropicProvider(LLMProvider):
    """Adapter that serves gateway requests through the Anthropic Messages API.

    NOTE(review): `get_cost_per_token` and `_calculate_cost` are not part of
    this excerpt; both must be defined before the class can be instantiated
    and `complete` can build its response.
    """

    def __init__(self, api_key: str):
        # `Anthropic` is the SDK's synchronous client.
        self.client = Anthropic(api_key=api_key)

    async def complete(self, request: LLMRequest) -> LLMResponse:
        """Run one non-streaming completion and normalize the result.

        Only `model`, `max_tokens`, `temperature` and `messages` are
        forwarded; `request.stream` is not used.
        """
        # perf_counter is monotonic, so the measurement is immune to
        # wall-clock adjustments.
        start = time.perf_counter()
        # The sync client's `create` returns a plain object, not an
        # awaitable, and would block the event loop if called inline.
        # Running it in a worker thread keeps this coroutine awaitable.
        # (Constructing the SDK's async client instead is an alternative.)
        response = await asyncio.to_thread(
            self.client.messages.create,
            model=request.model,
            max_tokens=request.max_tokens,
            temperature=request.temperature,
            messages=request.messages,
        )
        latency = (time.perf_counter() - start) * 1000
        return LLMResponse(
            content=response.content[0].text,
            model=request.model,
            provider="anthropic",
            latency_ms=latency,
            tokens_in=response.usage.input_tokens,
            tokens_out=response.usage.output_tokens,
            cost=self._calculate_cost(response.usage),
        )
class OpenAIProvider(LLMProvider):
    """Adapter that serves gateway requests through OpenAI chat completions.

    NOTE(review): `get_cost_per_token` and `_calculate_cost` are not part of
    this excerpt; both must be defined before the class can be instantiated
    and `complete` can build its response.
    """

    def __init__(self, api_key: str):
        # `OpenAI` is the SDK's synchronous client.
        self.client = OpenAI(api_key=api_key)

    async def complete(self, request: LLMRequest) -> LLMResponse:
        """Run one non-streaming completion and normalize the result.

        Only `model`, `max_tokens`, `temperature` and `messages` are
        forwarded; `request.stream` is not used.
        """
        # perf_counter is monotonic, so the measurement is immune to
        # wall-clock adjustments.
        start = time.perf_counter()
        # The sync client's `create` returns a plain object, not an
        # awaitable, and would block the event loop if called inline.
        # Running it in a worker thread keeps this coroutine awaitable.
        # (Constructing the SDK's async client instead is an alternative.)
        response = await asyncio.to_thread(
            self.client.chat.completions.create,
            model=request.model,
            max_tokens=request.max_tokens,
            temperature=request.temperature,
            messages=request.messages,
        )
        latency = (time.perf_counter() - start) * 1000
        return LLMResponse(
            # The SDK types message content as optional; LLMResponse.content
            # is declared `str`, so normalize a missing value to "".
            content=response.choices[0].message.content or "",
            model=request.model,
            provider="openai",
            latency_ms=latency,
            tokens_in=response.usage.prompt_tokens,
            tokens_out=response.usage.completion_tokens,
            cost=self._calculate_cost(response.usage),
        )
Load Balancing
Distribute requests across providers according to a selectable routing strategy:
class LoadBalancer:
    """Route completion requests across named providers by strategy.

    NOTE(review): `route` dispatches to `_route_fastest` and
    `_route_round_robin`, and the failure path raises
    `AllProvidersExhausted`; none of these are defined in this excerpt.
    """

    def __init__(self, providers: dict[str, LLMProvider]):
        """Store the provider registry, keyed by provider name."""
        self.providers = providers
        self.round_robin_index = 0

    async def route(self, request: LLMRequest, strategy: str = "priority") -> LLMResponse:
        """Serve `request` using the named strategy.

        Recognized strategies are "cheapest", "fastest" and "round_robin";
        any other value (including the default) uses priority routing.
        """
        # Kept as an explicit chain so each strategy method is only looked
        # up when that strategy is actually requested.
        if strategy == "cheapest":
            return await self._route_cheapest(request)
        elif strategy == "fastest":
            return await self._route_fastest(request)
        elif strategy == "round_robin":
            return await self._route_round_robin(request)
        else:
            return await self._route_priority(request)

    async def _try_in_order(self, request: LLMRequest, candidates) -> LLMResponse:
        """Return the first successful completion from `candidates`, in order.

        Raises:
            AllProvidersExhausted: if every candidate fails or there are no
                candidates. The last provider error, if any, is attached as
                the cause so the failure remains diagnosable.
        """
        last_error = None
        for provider in candidates:
            try:
                return await provider.complete(request)
            except Exception as exc:
                # Any provider failure triggers fallback to the next one;
                # remember it instead of discarding it.
                last_error = exc
        raise AllProvidersExhausted("All providers failed") from last_error

    async def _route_priority(self, request: LLMRequest) -> LLMResponse:
        """Try "primary", then "secondary", then "fallback".

        Names missing from the registry are skipped.
        """
        ordered = [
            self.providers[name]
            for name in ("primary", "secondary", "fallback")
            if name in self.providers
        ]
        return await self._try_in_order(request, ordered)

    async def _route_cheapest(self, request: LLMRequest) -> LLMResponse:
        """Try providers in ascending order of cost for `request.model`.

        Ordering uses the first element of each provider's
        `get_cost_per_token(request.model)` pair; ties keep registry order.
        """
        by_cost = sorted(
            self.providers.values(),
            key=lambda provider: provider.get_cost_per_token(request.model)[0],
        )
        return await self._try_in_order(request, by_cost)
Read the full article on AI Study Room for complete code examples, comparison tables, and related resources.
Found this useful? Check out more developer guides and tool comparisons on AI Study Room.
Top comments (0)