One of the most common engineering problems I see when teams scale AI from pilot to production: the cloud bill is 5–10× what the pilot suggested it would be.
This isn't a coincidence or a billing error. It's the predictable result of pilot conditions not reflecting production conditions — cleaner inputs, smaller volumes, no monitoring overhead, no retry logic, no agent orchestration multiplication.
Here are the 4 implementation patterns that fix it. All of them can be implemented without changing the AI output quality for the user.
Pattern 1: Semantic Caching
The cheapest LLM call is one you don't make. Semantic caching returns cached responses for queries that are semantically similar to ones you've already answered.
For enterprise AI tools where users ask similar questions repeatedly (HR policy lookup, product specs, internal knowledge queries), cache hit rates of 20–30% are achievable. Each hit is zero cost.
import numpy as np
from openai import OpenAI
from typing import Optional
import hashlib
import json
# NOTE(review): hashlib and json are imported but not used anywhere in the
# visible code — candidates for removal if the full file confirms.
# Shared OpenAI client used by every pattern below (embeddings + chat).
client = OpenAI()
class SemanticCache:
    """Cache keyed on embedding similarity rather than exact text match.

    Stores (query, embedding, response) entries and serves a cached response
    for any new query whose embedding is cosine-similar enough to a stored
    query's embedding.
    """

    def __init__(self, similarity_threshold: float = 0.92):
        # Minimum cosine similarity to count as a hit; 0.92 is conservative —
        # lower it to trade answer accuracy for hit rate.
        self.threshold = similarity_threshold
        self.cache: list[dict] = []  # In production: use Redis or Pinecone

    def _embed(self, text: str) -> list[float]:
        """Return the embedding vector for *text* (one embeddings API call)."""
        response = client.embeddings.create(
            model="text-embedding-3-small",
            input=text
        )
        return response.data[0].embedding

    def _cosine_similarity(self, a: list[float], b: list[float]) -> float:
        """Cosine similarity of two vectors; 0.0 if either has zero norm."""
        a, b = np.array(a), np.array(b)
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        if denom == 0.0:
            # Guard: a degenerate (all-zero) vector can never match anything;
            # the original code divided by zero here.
            return 0.0
        return float(np.dot(a, b) / denom)

    def get(self, query: str) -> Optional[str]:
        """Return the cached response of the MOST similar stored query at or
        above the threshold, or None on a miss.

        Fix over the original: scans all entries and returns the best match
        instead of the first entry that happens to clear the threshold.
        Note: linear scan — one similarity computation per cached entry.
        """
        if not self.cache:
            return None
        query_embedding = self._embed(query)
        best_entry = None
        best_similarity = -1.0
        for entry in self.cache:
            similarity = self._cosine_similarity(query_embedding, entry["embedding"])
            if similarity >= self.threshold and similarity > best_similarity:
                best_entry, best_similarity = entry, similarity
        if best_entry is not None:
            return best_entry["response"]  # Cache hit — zero LLM cost
        return None

    def set(self, query: str, response: str) -> None:
        """Store a query/response pair along with the query's embedding."""
        self.cache.append({
            "query": query,
            "embedding": self._embed(query),
            "response": response
        })
# Usage
# Single shared cache instance used by cached_llm_call below.
cache = SemanticCache(similarity_threshold=0.92)
def cached_llm_call(query: str, system_prompt: str) -> dict:
    """Answer *query* via the semantic cache when possible, else call gpt-4o.

    Returns a dict:
        response — the answer text
        source   — "cache" (zero marginal cost) or "llm"
        cost     — approximate USD cost of this call
    """
    cached = cache.get(query)
    # `is not None`, not truthiness: a legitimately empty cached response
    # must still count as a hit instead of triggering a paid LLM call.
    if cached is not None:
        return {"response": cached, "source": "cache", "cost": 0.0}
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": query}
        ]
    )
    # message.content is Optional in the SDK; normalize None to "" so we
    # never store None in the cache.
    answer = response.choices[0].message.content or ""
    cache.set(query, answer)
    # Approximate cost — GPT-4o pricing: $2.50 / 1M input tokens,
    # $10.00 / 1M output tokens (update alongside AIcostMonitor.MODEL_COSTS).
    input_cost_per_token = 0.0000025
    output_cost_per_token = 0.00001
    cost = (response.usage.prompt_tokens * input_cost_per_token
            + response.usage.completion_tokens * output_cost_per_token)
    return {"response": answer, "source": "llm", "cost": cost}
Expected savings: 20–30% of LLM calls, depending on query repetition rate — each cache hit is one call you never pay for, so savings track the hit rate above.
Pattern 2: Query-Complexity-Based Model Routing
Not every query needs GPT-4o. Simple classification, template filling, and routine Q&A can run on smaller models with no quality difference for the user.
from enum import Enum
from dataclasses import dataclass
class QueryComplexity(Enum):
    """Coarse difficulty tiers used to pick an inference model per query."""

    SIMPLE = "simple"        # routed to gpt-4o-mini or similar (~10x cheaper)
    MODERATE = "moderate"    # routed to gpt-4o
    COMPLEX = "complex"      # routed to gpt-4o or o1-preview for reasoning
@dataclass
class RoutingDecision:
    """Result of routing a single query.

    Carries the assigned complexity tier, the model name to call, and a
    human-readable rationale for logging and debugging routing behavior.
    """

    complexity: QueryComplexity
    model: str
    rationale: str
class ModelRouter:
    """Route each query to the cheapest model that can plausibly handle it.

    Pure keyword heuristics — zero added latency or cost. Replace with a
    trained classifier only if routing mistakes prove expensive.
    """

    # Signals that suggest higher complexity.
    # Fix over the original: include US spellings alongside UK ones —
    # "analyze"/"synthesize" queries previously never matched.
    COMPLEX_SIGNALS = [
        "analyse", "analyze", "compare", "synthesise", "synthesize",
        "evaluate", "reason", "explain why", "what are the implications",
        "trade-off", "multi-step", "infer", "hypothesis"
    ]
    SIMPLE_SIGNALS = [
        "what is", "define", "list", "when", "who", "where",
        "how many", "summarise this", "summarize this", "translate", "format"
    ]

    def route(self, query: str, context_length: int = 0) -> RoutingDecision:
        """Classify *query* and pick a model.

        Args:
            query: the user's question.
            context_length: characters of retrieved context that will be sent
                with the query (long context pushes toward the stronger model).

        Returns:
            A RoutingDecision with tier, model name, and rationale.
        """
        query_lower = query.lower()
        complex_score = sum(1 for s in self.COMPLEX_SIGNALS if s in query_lower)
        simple_score = sum(1 for s in self.SIMPLE_SIGNALS if s in query_lower)
        # Long context = more likely to need the powerful model.
        if context_length > 8000 or complex_score >= 2:
            return RoutingDecision(
                complexity=QueryComplexity.COMPLEX,
                model="gpt-4o",
                rationale=f"Complex signals: {complex_score}, context: {context_length}"
            )
        elif simple_score >= 2 and complex_score == 0 and context_length < 2000:
            return RoutingDecision(
                complexity=QueryComplexity.SIMPLE,
                model="gpt-4o-mini",
                rationale=f"Simple signals: {simple_score}"
            )
        else:
            return RoutingDecision(
                complexity=QueryComplexity.MODERATE,
                model="gpt-4o-mini",  # Default to cheaper model for moderate
                rationale="Moderate complexity"
            )
def routed_llm_call(query: str, context: str, system_prompt: str) -> dict:
    """Send *query* (plus *context*) to a complexity-appropriate model.

    Returns the answer together with the routing metadata so callers can
    log which model handled the query and why.
    """
    decision = ModelRouter().route(query, len(context))
    user_message = f"{context}\n\nQuery: {query}"
    completion = client.chat.completions.create(
        model=decision.model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message}
        ]
    )
    return {
        "response": completion.choices[0].message.content,
        "model_used": decision.model,
        "routing_rationale": decision.rationale,
        "tokens_used": completion.usage.total_tokens
    }
Expected savings: 30–40% of inference costs for mixed-complexity query distributions.
Pattern 3: Prompt Compression Measurement
Before compressing, measure. The goal is to identify which parts of your prompts are high-information vs redundant.
def measure_prompt_efficiency(
    original_prompt: str,
    compressed_prompt: str,
    test_queries: list[str],
    system_prompt: str
) -> dict:
    """
    Measure: does the compressed prompt produce equivalent outputs?
    Run BEFORE switching to compressed prompt in production.

    Raises:
        ValueError: if original_prompt is empty (reduction is undefined).
    """
    # Rough token estimate: ~1.3 tokens per whitespace-separated word.
    original_tokens = len(original_prompt.split()) * 1.3
    compressed_tokens = len(compressed_prompt.split()) * 1.3
    if original_tokens == 0:
        # Fix over the original: fail loudly instead of ZeroDivisionError.
        raise ValueError("original_prompt must not be empty")
    token_reduction = (original_tokens - compressed_tokens) / original_tokens
    quality_scores = []
    for query in test_queries[:20]:  # sample evaluation
        orig_response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": original_prompt + "\n\n" + query}
            ]
        ).choices[0].message.content
        comp_response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": compressed_prompt + "\n\n" + query}
            ]
        ).choices[0].message.content
        # Simple length/structure similarity as proxy — replace with semantic
        # similarity in production evaluation.
        length_ratio = len(comp_response) / max(len(orig_response), 1)
        if length_ratio == 0:
            # Fix: an empty compressed response is total quality loss,
            # not a ZeroDivisionError from 1/length_ratio.
            quality_scores.append(0.0)
        else:
            quality_scores.append(min(length_ratio, 1 / length_ratio))
    # Fix: no test queries -> no quality evidence; score 0 forces a review
    # instead of dividing by zero.
    avg_quality = (sum(quality_scores) / len(quality_scores)) if quality_scores else 0.0
    return {
        "token_reduction_pct": f"{token_reduction:.1%}",
        "estimated_monthly_savings_at_1M_calls": f"${token_reduction * 1000 * 0.0000025 * 1_000_000:.0f}",
        "avg_quality_retention": f"{avg_quality:.2%}",
        "recommendation": "SAFE TO DEPLOY" if avg_quality > 0.85 else "REVIEW BEFORE DEPLOYING"
    }
Expected savings: 20–30% of token costs with well-compressed prompts. Run evaluation before switching.
Pattern 4: Cost Monitoring Dashboard (Catch Spiral Early)
The most important pattern: know your cost per query, per model, per feature — in real time.
from collections import defaultdict
from datetime import datetime, timezone
class AIcostMonitor:
    """In-memory per-call cost logger with a daily spend alert.

    Attributes per-call spend to model and feature so a cost spiral is
    visible the day it starts. In production, persist `calls` to a metrics
    store instead of a Python list.
    """

    # Pricing per 1K tokens in USD (update with current pricing)
    MODEL_COSTS = {
        "gpt-4o": {"input": 0.0025, "output": 0.01},
        "gpt-4o-mini": {"input": 0.00015, "output": 0.0006},
        "text-embedding-3-small": {"input": 0.00002, "output": 0.0}
    }
    # Fallback for unknown models — priced above gpt-4o so unrecognized
    # models surface in the summary rather than hide as cheap.
    _DEFAULT_PRICING = {"input": 0.003, "output": 0.012}

    def __init__(self, alert_threshold_daily_usd: float = 100.0):
        self.calls: list[dict] = []
        self.alert_threshold = alert_threshold_daily_usd

    def log_call(self, model: str, input_tokens: int,
                 output_tokens: int, feature: str) -> None:
        """Record one LLM call with its computed USD cost."""
        pricing = self.MODEL_COSTS.get(model, self._DEFAULT_PRICING)
        cost = (input_tokens / 1000 * pricing["input"]) + \
               (output_tokens / 1000 * pricing["output"])
        self.calls.append({
            # Timezone-aware UTC; datetime.utcnow() is deprecated (3.12+)
            # and returned a naive datetime.
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "model": model,
            "feature": feature,
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "cost_usd": cost
        })

    def daily_summary(self) -> dict:
        """Aggregate today's (UTC) spend by feature and model, with alerting."""
        today = datetime.now(timezone.utc).date().isoformat()
        # ISO timestamps start with the date, so a prefix match selects today.
        today_calls = [c for c in self.calls if c["timestamp"].startswith(today)]
        by_feature = defaultdict(float)
        by_model = defaultdict(float)
        total = sum(c["cost_usd"] for c in today_calls)
        for call in today_calls:
            by_feature[call["feature"]] += call["cost_usd"]
            by_model[call["model"]] += call["cost_usd"]
        alert = total > self.alert_threshold
        return {
            "date": today,
            "total_cost_usd": round(total, 4),
            "alert": alert,
            "alert_message": f"ALERT: Daily cost ${total:.2f} exceeds threshold ${self.alert_threshold}" if alert else None,
            # Sorted descending by spend so the biggest line items come first.
            "by_feature": dict(sorted(by_feature.items(), key=lambda x: -x[1])),
            "by_model": dict(sorted(by_model.items(), key=lambda x: -x[1])),
            # Naive linear projection: today's spend x 30.
            "projected_monthly": round(total * 30, 2)
        }
The Decision Tree
Query arrives
│
├── Check semantic cache
│ ├── HIT → return cached response (cost: $0)
│ └── MISS → continue
│
├── Route by complexity
│ ├── SIMPLE → gpt-4o-mini (10x cheaper)
│ ├── MODERATE → gpt-4o-mini
│ └── COMPLEX → gpt-4o
│
├── Apply compressed prompt template
│
├── Call LLM
│
├── Log to cost monitor
│
└── Store in cache for future hits
Implement these 4 patterns in sequence. Start with the monitor (you can't optimise what you can't measure), then caching (biggest immediate impact), then routing, then prompt compression.
What's your current cost per 1,000 queries in production? Curious what numbers people are seeing across different use cases.
Sunil — CEO, Ailoitte. We build cost-optimised AI architectures for funded startups. ailoitte.com
Top comments (0)