Why AI API Cost Management Matters
LLM API pricing is usage-based. Bad design leads to surprise bills at month-end. In production, cost management and rate limiting must be designed in from the start.
Claude API pricing (2026):
- Claude Opus: $15/MTok input, $75/MTok output
- Claude Sonnet: $3/MTok input, $15/MTok output
- Claude Haiku: $0.25/MTok input, $1.25/MTok output
"Just use Opus for everything" is the most common failure. Using models appropriate to the task changes costs by 10-60x.
Token Usage Measurement
from collections import defaultdict
from dataclasses import dataclass, field
from datetime import date, datetime, timezone
@dataclass
class TokenUsage:
    """One API call's token counts and its computed cost in USD."""

    model: str            # model identifier, e.g. "claude-sonnet-4-5"
    input_tokens: int     # prompt-side tokens billed
    output_tokens: int    # completion-side tokens billed
    cost_usd: float       # cost of this single call
    # Timezone-aware UTC timestamp. datetime.utcnow() is deprecated since
    # Python 3.12 and returns a *naive* datetime, which is easy to misuse
    # in comparisons; datetime.now(timezone.utc) is the supported form.
    timestamp: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
# USD per million tokens ("MTok"), keyed by model identifier.
MODEL_PRICING = {
    "claude-opus-4-5": {"input": 15.0, "output": 75.0},
    "claude-sonnet-4-5": {"input": 3.0, "output": 15.0},
    "claude-haiku-4-5": {"input": 0.25, "output": 1.25},
}


def calculate_cost(model: str, input_tokens: int, output_tokens: int) -> float:
    """Return the USD cost of one call at the given model's per-MTok rates.

    Unknown model names fall back to Sonnet-tier pricing rather than raising.
    """
    rates = MODEL_PRICING.get(model, {"input": 3.0, "output": 15.0})
    input_cost = input_tokens * rates["input"] / 1_000_000
    output_cost = output_tokens * rates["output"] / 1_000_000
    return input_cost + output_cost
class CostTracker:
    """Accumulates per-day TokenUsage records and a running total cost."""

    def __init__(self) -> None:
        # Usage records bucketed by local calendar date.
        self._daily: dict[date, list[TokenUsage]] = defaultdict(list)
        self._total_cost: float = 0.0

    def record(self, model: str, input_tokens: int, output_tokens: int) -> float:
        """Record one API call under today's date; return its cost in USD."""
        call_cost = calculate_cost(model, input_tokens, output_tokens)
        entry = TokenUsage(model, input_tokens, output_tokens, call_cost)
        self._daily[date.today()].append(entry)
        self._total_cost += call_cost
        return call_cost

    def daily_cost(self) -> float:
        """Total USD spent on calls recorded today (local date)."""
        todays_entries = self._daily.get(date.today(), [])
        return sum(entry.cost_usd for entry in todays_entries)
Token Bucket Rate Limiter
import asyncio
import time
from dataclasses import dataclass, field
@dataclass
class RateLimiter:
    """Dual token-bucket limiter: caps both requests/minute and tokens/minute.

    Both buckets refill continuously at their per-minute rates (capped at one
    minute's budget); ``acquire`` cooperatively blocks until both buckets can
    cover the call.
    """

    requests_per_minute: int
    tokens_per_minute: int
    _request_tokens: float = field(init=False)  # remaining request budget
    _token_tokens: float = field(init=False)    # remaining token budget
    _last_refill: float = field(init=False)     # monotonic time of last refill
    _lock: asyncio.Lock = field(init=False)     # serializes acquirers

    def __post_init__(self) -> None:
        # Zero/negative rates would make acquire() divide by zero in its
        # wait-time estimate and could never be satisfied — reject early.
        if self.requests_per_minute <= 0 or self.tokens_per_minute <= 0:
            raise ValueError("requests_per_minute and tokens_per_minute must be positive")
        # Start with full buckets so an initial burst is not throttled.
        self._request_tokens = float(self.requests_per_minute)
        self._token_tokens = float(self.tokens_per_minute)
        self._last_refill = time.monotonic()
        self._lock = asyncio.Lock()

    def _refill(self) -> None:
        """Top up both buckets proportionally to elapsed time, capped at one minute's budget."""
        now = time.monotonic()
        elapsed = now - self._last_refill
        self._last_refill = now
        self._request_tokens = min(
            self.requests_per_minute,
            self._request_tokens + elapsed * self.requests_per_minute / 60,
        )
        self._token_tokens = min(
            self.tokens_per_minute,
            self._token_tokens + elapsed * self.tokens_per_minute / 60,
        )

    async def acquire(self, estimated_tokens: int = 1000) -> None:
        """Block until one request slot and ``estimated_tokens`` tokens are available.

        Raises:
            ValueError: if ``estimated_tokens`` exceeds ``tokens_per_minute``.
                The bucket caps at one minute's budget, so such a request could
                never be satisfied and the original code looped forever.
        """
        if estimated_tokens > self.tokens_per_minute:
            raise ValueError(
                f"estimated_tokens ({estimated_tokens}) exceeds tokens_per_minute "
                f"({self.tokens_per_minute}); request can never be satisfied"
            )
        async with self._lock:
            while True:
                self._refill()
                if self._request_tokens >= 1 and self._token_tokens >= estimated_tokens:
                    self._request_tokens -= 1
                    self._token_tokens -= estimated_tokens
                    return
                # Estimate time until both buckets suffice; sleep in slices of
                # at most 1s so the refill estimate stays reasonably accurate.
                wait_time = max(
                    (1 - self._request_tokens) * 60 / self.requests_per_minute,
                    (estimated_tokens - self._token_tokens) * 60 / self.tokens_per_minute,
                )
                await asyncio.sleep(min(wait_time, 1.0))
Cache and Model Selection Optimization
import hashlib
class LLMCache:
def __init__(self, redis_client=None):
self._memory: dict[str, str] = {}
self.redis = redis_client
def _cache_key(self, model: str, messages: list[dict]) -> str:
content = json.dumps({"model": model, "messages": messages}, sort_keys=True)
return hashlib.sha256(content.encode()).hexdigest()
async def get(self, model: str, messages: list[dict]) -> str | None:
key = self._cache_key(model, messages)
return self._memory.get(key)
async def set(self, model: str, messages: list[dict], response: str) -> None:
key = self._cache_key(model, messages)
self._memory[key] = response
def select_model_by_complexity(prompt: str) -> str:
    """Route to the cheapest model tier that fits the prompt.

    Prompt length is used as a crude complexity proxy: under 500 chars goes
    to Haiku, under 2000 to Sonnet, and everything longer to Opus.
    """
    tiers = (
        (500, "claude-haiku-4-5"),    # simple tasks
        (2000, "claude-sonnet-4-5"),  # medium tasks
    )
    prompt_len = len(prompt)
    for upper_bound, model in tiers:
        if prompt_len < upper_bound:
            return model
    return "claude-opus-4-5"  # complex tasks
Budget Alerts
class BudgetExceededError(RuntimeError):
    """Raised when today's spend reaches the configured daily limit."""


class BudgetGuard:
    """Checks daily spend against a hard limit, with a soft alert threshold.

    Args:
        daily_limit_usd: hard cap; check() raises once spend reaches it.
        alert_threshold: fraction of the limit (0-1) at which on_alert fires.
            (Was stored but never used in the original implementation.)
        on_exceeded: optional async callback (current, limit) awaited before
            raising when the limit is hit.
        on_alert: optional async callback (current, limit) awaited when spend
            crosses the soft threshold but is still under the limit.
        tracker: object exposing daily_cost() -> float. Defaults to the
            module-level ``tracker`` global for backward compatibility, but
            injecting it makes the guard testable.
    """

    def __init__(self, daily_limit_usd: float, alert_threshold: float = 0.8,
                 on_exceeded=None, on_alert=None, tracker=None):
        self.daily_limit = daily_limit_usd
        self.alert_threshold = alert_threshold
        self.on_exceeded = on_exceeded
        self.on_alert = on_alert
        self.tracker = tracker

    async def check(self) -> None:
        """Raise BudgetExceededError if today's spend has hit the daily limit.

        BudgetExceededError subclasses RuntimeError, so existing callers
        catching Exception continue to work.
        """
        # Fall back to the module-level global, matching the original code.
        source = self.tracker if self.tracker is not None else tracker
        current = source.daily_cost()
        ratio = current / self.daily_limit
        if ratio >= 1.0:
            if self.on_exceeded:
                await self.on_exceeded(current, self.daily_limit)
            raise BudgetExceededError(
                f"Daily budget exceeded: ${current:.2f} / ${self.daily_limit:.2f}"
            )
        if ratio >= self.alert_threshold and self.on_alert:
            # Soft warning: approaching, but not yet over, the limit.
            await self.on_alert(current, self.daily_limit)
# Module-level guard instance: $10/day hard cap, default 80% alert threshold.
budget_guard = BudgetGuard(daily_limit_usd=10.0)
Implementing cost management is unglamorous but essential for production. Building it in early prevents billing shock once usage scales.
This article is from the Claude Code Complete Guide (7 chapters) on note.com.
myouga (@myougatheaxo) - VTuber axolotl. Sharing practical AI development tips.
Top comments (0)