AI API Gateway: Load Balancing, Fallback, Cost Tracking, Observability

#ai #machinelearning #llm

This article was originally published on AI Study Room. For the full version with working code examples and related articles, visit the original post.

AI API Gateway: Load Balancing, Fallback, Cost Tracking, Observability

Introduction

As organizations adopt multiple LLM providers (Anthropic, OpenAI, Google, and self-hosted open-source models), calling each provider directly from application code becomes unsustainable. An AI API gateway provides a single, unified interface for routing requests across providers, handling failures, tracking costs, and monitoring usage. This article covers the design of a production-grade AI gateway.

Unified API Layer

The gateway presents a single API that abstracts provider differences:

from abc import ABC, abstractmethod

from dataclasses import dataclass

from typing import Optional

import time

@dataclass
class LLMRequest:
    """Provider-agnostic description of a single completion request."""

    # Model identifier handed to the provider.
    model: str
    # Conversation messages, one dict per message.
    messages: list[dict]
    # Upper bound on generated tokens; defaults to 1024.
    max_tokens: int = 1024
    # Sampling temperature; defaults to 0.7.
    temperature: float = 0.7
    # Streaming flag; defaults to False.
    stream: bool = False

@dataclass
class LLMResponse:
    """Normalized result of one completion call, independent of the provider."""

    # Generated text.
    content: str
    # Model identifier the response is attributed to.
    model: str
    # Short provider label, e.g. "anthropic" or "openai".
    provider: str
    # Request latency in milliseconds.
    latency_ms: float
    # Input (prompt) token count.
    tokens_in: int
    # Output (completion) token count.
    tokens_out: int
    # Cost attributed to this call; the unit is not specified here.
    cost: float

class LLMProvider(ABC):
    """Abstract interface that every provider adapter has to implement."""

    @abstractmethod
    async def complete(self, request: LLMRequest) -> LLMResponse:
        """Execute ``request`` against the provider and return its response."""

    @abstractmethod
    def get_cost_per_token(self, model: str) -> tuple[float, float]:
        """Return a pair of per-token prices for ``model``.

        NOTE(review): presumably ``(input_price, output_price)`` -- confirm
        the element order and unit with the concrete implementations.
        """

class AnthropicProvider(LLMProvider):
    """Adapter that serves an ``LLMRequest`` through the Anthropic client.

    NOTE(review): this class does not override the abstract
    ``get_cost_per_token`` declared on ``LLMProvider``, and ``_calculate_cost``
    (called in ``complete``) is not defined in the visible code. Both have to
    be supplied before the class can be instantiated and used.
    """

    def __init__(self, api_key: str):
        """Create the SDK client.

        Args:
            api_key: Credential passed to the ``Anthropic`` constructor.
        """
        # NOTE(review): ``complete`` awaits ``messages.create``. If
        # ``Anthropic`` is the SDK's synchronous client, the asynchronous
        # client is needed here instead -- confirm against the file's imports.
        self.client = Anthropic(api_key=api_key)

    async def complete(self, request: LLMRequest) -> LLMResponse:
        """Send ``request`` to the messages endpoint and normalize the reply.

        ``request.stream`` is not forwarded to the client.

        Args:
            request: Model, messages and sampling settings to send.

        Returns:
            An ``LLMResponse`` labelled ``provider="anthropic"`` carrying the
            text of the first content block, token usage, latency and cost.
        """
        # perf_counter() is monotonic, so the measured interval cannot be
        # skewed by wall-clock adjustments the way time.time() can.
        started = time.perf_counter()
        response = await self.client.messages.create(
            model=request.model,
            max_tokens=request.max_tokens,
            temperature=request.temperature,
            messages=request.messages,
        )
        latency_ms = (time.perf_counter() - started) * 1000

        usage = response.usage
        return LLMResponse(
            content=response.content[0].text,
            model=request.model,
            provider="anthropic",
            latency_ms=latency_ms,
            tokens_in=usage.input_tokens,
            tokens_out=usage.output_tokens,
            cost=self._calculate_cost(usage),
        )

class OpenAIProvider(LLMProvider):
    """Adapter that serves an ``LLMRequest`` through the OpenAI client.

    NOTE(review): this class does not override the abstract
    ``get_cost_per_token`` declared on ``LLMProvider``, and ``_calculate_cost``
    (called in ``complete``) is not defined in the visible code. Both have to
    be supplied before the class can be instantiated and used.
    """

    def __init__(self, api_key: str):
        """Create the SDK client.

        Args:
            api_key: Credential passed to the ``OpenAI`` constructor.
        """
        # NOTE(review): ``complete`` awaits ``chat.completions.create``. If
        # ``OpenAI`` is the SDK's synchronous client, the asynchronous client
        # is needed here instead -- confirm against the file's imports.
        self.client = OpenAI(api_key=api_key)

    async def complete(self, request: LLMRequest) -> LLMResponse:
        """Send ``request`` to the chat-completions endpoint and normalize the reply.

        ``request.stream`` is not forwarded to the client.

        Args:
            request: Model, messages and sampling settings to send.

        Returns:
            An ``LLMResponse`` labelled ``provider="openai"`` carrying the
            first choice's message content, token usage, latency and cost.
        """
        # perf_counter() is monotonic, so the measured interval cannot be
        # skewed by wall-clock adjustments the way time.time() can.
        started = time.perf_counter()
        response = await self.client.chat.completions.create(
            model=request.model,
            max_tokens=request.max_tokens,
            temperature=request.temperature,
            messages=request.messages,
        )
        latency_ms = (time.perf_counter() - started) * 1000

        usage = response.usage
        return LLMResponse(
            content=response.choices[0].message.content,
            model=request.model,
            provider="openai",
            latency_ms=latency_ms,
            tokens_in=usage.prompt_tokens,
            tokens_out=usage.completion_tokens,
            cost=self._calculate_cost(usage),
        )

Load Balancing

Distribute requests across providers according to the selected routing strategy:

class LoadBalancer:

    def __init__(self, providers: dict[str, LLMProvider]):

        self.providers = providers

        self.round_robin_index = 0

    async def route(self, request: LLMRequest, strategy: str = "priority") -> LLMResponse:

        if strategy == "cheapest":

            return await self._route_cheapest(request)

        elif strategy == "fastest":

            return await self._route_fastest(request)

        elif strategy == "round_robin":

            return await self._route_round_robin(request)

        else:

            return await self._route_priority(request)

    async def _route_priority(self, request: LLMRequest) -> LLMResponse:

        # Try primary provider first, fall back to secondary

        for name in ["primary", "secondary", "fallback"]:

            if name in self.providers:

                try:

                    return await self.providers[name].complete(request)

                except Exception:

                    continue

        raise AllProvidersExhausted("All providers failed")

    async def _route_cheapest(self, request: LLMRequest) -> LLMResponse:

        # Route to the provider with lowest cost for comparable quality

        cheap_providers = sorted(

            self.providers.items(),

            key=lambda p: p[1].get_cost_per_token(request.model)[0],

        )

        for name, provider in cheap_providers:

            try:

                return await provider.complete(request)

            except Exception:

                continue

        raise AllProv

Read the full article on AI Study Room for complete code examples, comparison tables, and related resources.

Found this useful? Check out more developer guides and tool comparisons on AI Study Room.