DEV Community

myougaTheAxo
myougaTheAxo

Posted on

AI API Rate Limiting and Cost Management: Practical Patterns with Claude Code

Why AI API Cost Management Matters

LLM API pricing is usage-based. Bad design leads to surprise bills at month-end. In production, cost management and rate limiting must be designed in from the start.

Claude API pricing (2026):

  • Claude Opus: $15/MTok input, $75/MTok output
  • Claude Sonnet: $3/MTok input, $15/MTok output
  • Claude Haiku: $0.25/MTok input, $1.25/MTok output

"Just use Opus for everything" is the most common failure. Using models appropriate to the task changes costs by 10-60x.

Token Usage Measurement

from collections import defaultdict
from dataclasses import dataclass, field
from datetime import date, datetime, timezone

@dataclass
class TokenUsage:
    """One API call's token consumption and its computed cost in USD."""

    model: str          # model identifier, e.g. "claude-sonnet-4-5"
    input_tokens: int   # prompt tokens billed at the model's input rate
    output_tokens: int  # completion tokens billed at the output rate
    cost_usd: float     # cost of this single call, in US dollars
    # datetime.utcnow() is deprecated (Python 3.12+) and returns a *naive*
    # datetime; record a timezone-aware UTC timestamp instead.
    timestamp: datetime = field(default_factory=lambda: datetime.now(timezone.utc))

# Per-model prices in USD per million tokens (input / output).
MODEL_PRICING = {
    "claude-opus-4-5":   {"input": 15.0,  "output": 75.0},
    "claude-sonnet-4-5": {"input": 3.0,   "output": 15.0},
    "claude-haiku-4-5":  {"input": 0.25,  "output": 1.25},
}

def calculate_cost(model: str, input_tokens: int, output_tokens: int) -> float:
    """Return the USD cost of a call given its token counts.

    Unknown model names fall back to Sonnet-tier rates rather than failing.
    """
    rates = MODEL_PRICING.get(model)
    if rates is None:
        rates = {"input": 3.0, "output": 15.0}
    input_cost = input_tokens * rates["input"] / 1_000_000
    output_cost = output_tokens * rates["output"] / 1_000_000
    return input_cost + output_cost

class CostTracker:
    """Accumulates per-day usage records plus a running total spend."""

    def __init__(self):
        # Usage records grouped by local calendar date.
        self._daily: dict[date, list[TokenUsage]] = defaultdict(list)
        self._total_cost: float = 0.0

    def record(self, model: str, input_tokens: int, output_tokens: int) -> float:
        """Log one API call under today's date and return its cost in USD."""
        cost = calculate_cost(model, input_tokens, output_tokens)
        usage = TokenUsage(model, input_tokens, output_tokens, cost)
        self._daily[date.today()].append(usage)
        self._total_cost += cost
        return cost

    def daily_cost(self) -> float:
        """Total USD spent today (local date; empty day yields 0)."""
        entries = self._daily.get(date.today(), [])
        return sum(entry.cost_usd for entry in entries)

Token Bucket Rate Limiter

import asyncio
import time
from dataclasses import dataclass, field

@dataclass
class RateLimiter:
    """Async token-bucket limiter enforcing both requests/min and tokens/min.

    Two buckets refill continuously at their per-minute rates; `acquire`
    blocks until one request slot *and* the estimated token budget are free.
    """

    requests_per_minute: int
    tokens_per_minute: int
    _request_tokens: float = field(init=False)  # remaining request slots
    _token_tokens: float = field(init=False)    # remaining token budget
    _last_refill: float = field(init=False)     # monotonic time of last refill
    _lock: asyncio.Lock = field(init=False)

    def __post_init__(self):
        # Start with both buckets full.
        self._request_tokens = float(self.requests_per_minute)
        self._token_tokens = float(self.tokens_per_minute)
        self._last_refill = time.monotonic()
        self._lock = asyncio.Lock()

    def _refill(self):
        """Top up both buckets in proportion to elapsed time, capped at capacity."""
        now = time.monotonic()
        elapsed = now - self._last_refill
        self._last_refill = now
        self._request_tokens = min(self.requests_per_minute,
            self._request_tokens + elapsed * self.requests_per_minute / 60)
        self._token_tokens = min(self.tokens_per_minute,
            self._token_tokens + elapsed * self.tokens_per_minute / 60)

    async def acquire(self, estimated_tokens: int = 1000) -> None:
        """Block until capacity for one request of `estimated_tokens` is free.

        Raises:
            ValueError: if estimated_tokens exceeds the bucket capacity —
                such a request can never be satisfied (the bucket is capped
                at tokens_per_minute), and the original code would have
                spun in the wait loop forever.
        """
        if estimated_tokens > self.tokens_per_minute:
            raise ValueError(
                f"estimated_tokens ({estimated_tokens}) exceeds "
                f"tokens_per_minute ({self.tokens_per_minute}); "
                "request can never be admitted"
            )
        async with self._lock:
            # Holding the lock while waiting serializes waiters, so a late
            # arrival cannot race an earlier caller for refilled capacity.
            while True:
                self._refill()
                if self._request_tokens >= 1 and self._token_tokens >= estimated_tokens:
                    self._request_tokens -= 1
                    self._token_tokens -= estimated_tokens
                    return
                # Sleep roughly until enough has refilled, but re-check at
                # least once per second.
                wait_time = max(
                    (1 - self._request_tokens) * 60 / self.requests_per_minute,
                    (estimated_tokens - self._token_tokens) * 60 / self.tokens_per_minute,
                )
                await asyncio.sleep(min(wait_time, 1.0))

Cache and Model Selection Optimization

import hashlib

class LLMCache:
    def __init__(self, redis_client=None):
        self._memory: dict[str, str] = {}
        self.redis = redis_client

    def _cache_key(self, model: str, messages: list[dict]) -> str:
        content = json.dumps({"model": model, "messages": messages}, sort_keys=True)
        return hashlib.sha256(content.encode()).hexdigest()

    async def get(self, model: str, messages: list[dict]) -> str | None:
        key = self._cache_key(model, messages)
        return self._memory.get(key)

    async def set(self, model: str, messages: list[dict], response: str) -> None:
        key = self._cache_key(model, messages)
        self._memory[key] = response

def select_model_by_complexity(prompt: str) -> str:
    """Route to the cheapest model tier that fits the prompt's length.

    Under 500 chars -> Haiku, under 2000 -> Sonnet, otherwise Opus.
    """
    tiers = (
        (500, "claude-haiku-4-5"),
        (2000, "claude-sonnet-4-5"),
    )
    size = len(prompt)
    for limit, model in tiers:
        if size < limit:
            return model
    return "claude-opus-4-5"

Budget Alerts

class BudgetGuard:
    """Enforces a daily spend ceiling, with an optional warning callback.

    Fixes two defects in the original: `check` read an undefined
    module-level `tracker` global (the tracker is now injectable via
    `cost_tracker`, with the global as a fallback for compatibility), and
    `alert_threshold` was stored but never acted on (crossing it now
    awaits `on_alert`).
    """

    def __init__(self, daily_limit_usd: float, alert_threshold: float = 0.8,
                 on_exceeded=None, on_alert=None, cost_tracker=None):
        self.daily_limit = daily_limit_usd
        self.alert_threshold = alert_threshold  # fraction of limit that triggers on_alert
        self.on_exceeded = on_exceeded          # async callback(current, limit) on overrun
        self.on_alert = on_alert                # async callback(current, limit) on warning
        self.tracker = cost_tracker             # object with daily_cost(); falls back to global

    async def check(self) -> None:
        """Raise if today's spend is at/over the limit; warn at the threshold.

        Raises:
            Exception: when daily spend >= daily_limit (after awaiting
                on_exceeded, if set).
        """
        source = self.tracker if self.tracker is not None else tracker
        current = source.daily_cost()
        ratio = current / self.daily_limit

        if ratio >= 1.0:
            if self.on_exceeded:
                await self.on_exceeded(current, self.daily_limit)
            raise Exception(f"Daily budget exceeded: ${current:.2f} / ${self.daily_limit:.2f}")

        if ratio >= self.alert_threshold and self.on_alert:
            await self.on_alert(current, self.daily_limit)

budget_guard = BudgetGuard(daily_limit_usd=10.0)

Cost management implementation is unglamorous but essential for production. Early adoption prevents billing shock at scale.


This article is from the Claude Code Complete Guide (7 chapters) on note.com.
myouga (@myougatheaxo) - VTuber axolotl. Sharing practical AI development tips.

Top comments (0)