One of the most common engineering problems I see when teams scale AI from pilot to production: the cloud bill is 5–10× what the pilot suggested it would be.
This isn't a coincidence or a billing error. It's the predictable result of pilot conditions not reflecting production conditions — cleaner inputs, smaller volumes, no monitoring overhead, no retry logic, no agent orchestration multiplication.
Here are the 4 implementation patterns that fix it. All of them can be implemented without changing the AI output quality for the user.
Pattern 1: Semantic Caching
The cheapest LLM call is one you don't make. Semantic caching returns cached responses for queries that are semantically similar to ones you've already answered.
For enterprise AI tools where users ask similar questions repeatedly (HR policy lookup, product specs, internal knowledge queries), cache hit rates of 20–30% are achievable. Each hit is zero cost.
import numpy as np
from openai import OpenAI
from typing import Optional
import hashlib
import json
# NOTE(review): hashlib and json are imported but not used anywhere in the
# visible code — candidates for removal if the full file confirms.
# Shared OpenAI client used by every pattern below (embeddings + chat).
client = OpenAI()
class SemanticCache:
    """Cache keyed on embedding similarity rather than exact text match.

    Stores (query, embedding, response) entries and serves a cached response
    for any new query whose embedding is cosine-similar enough to a stored
    query's embedding.
    """

    def __init__(self, similarity_threshold: float = 0.92):
        # Minimum cosine similarity to count as a hit; 0.92 is conservative —
        # lower it to trade answer accuracy for hit rate.
        self.threshold = similarity_threshold
        self.cache: list[dict] = []  # In production: use Redis or Pinecone

    def _embed(self, text: str) -> list[float]:
        """Return the embedding vector for *text* (one embeddings API call)."""
        response = client.embeddings.create(
            model="text-embedding-3-small",
            input=text
        )
        return response.data[0].embedding

    def _cosine_similarity(self, a: list[float], b: list[float]) -> float:
        """Cosine similarity of two vectors; 0.0 if either has zero norm."""
        a, b = np.array(a), np.array(b)
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        if denom == 0.0:
            # Guard: a degenerate (all-zero) vector can never match anything;
            # the original code divided by zero here.
            return 0.0
        return float(np.dot(a, b) / denom)

    def get(self, query: str) -> Optional[str]:
        """Return the cached response of the MOST similar stored query at or
        above the threshold, or None on a miss.

        Fix over the original: scans all entries and returns the best match
        instead of the first entry that happens to clear the threshold.
        Note: linear scan — one similarity computation per cached entry.
        """
        if not self.cache:
            return None
        query_embedding = self._embed(query)
        best_entry = None
        best_similarity = -1.0
        for entry in self.cache:
            similarity = self._cosine_similarity(query_embedding, entry["embedding"])
            if similarity >= self.threshold and similarity > best_similarity:
                best_entry, best_similarity = entry, similarity
        if best_entry is not None:
            return best_entry["response"]  # Cache hit — zero LLM cost
        return None

    def set(self, query: str, response: str) -> None:
        """Store a query/response pair along with the query's embedding."""
        self.cache.append({
            "query": query,
            "embedding": self._embed(query),
            "response": response
        })
# Usage
# Single shared cache instance used by cached_llm_call below.
cache = SemanticCache(similarity_threshold=0.92)
def cached_llm_call(query: str, system_prompt: str) -> dict:
    """Answer *query* via the semantic cache when possible, else call gpt-4o.

    Returns a dict:
        response — the answer text
        source   — "cache" (zero marginal cost) or "llm"
        cost     — approximate USD cost of this call
    """
    cached = cache.get(query)
    # `is not None`, not truthiness: a legitimately empty cached response
    # must still count as a hit instead of triggering a paid LLM call.
    if cached is not None:
        return {"response": cached, "source": "cache", "cost": 0.0}
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": query}
        ]
    )
    # message.content is Optional in the SDK; normalize None to "" so we
    # never store None in the cache.
    answer = response.choices[0].message.content or ""
    cache.set(query, answer)
    # Approximate cost — GPT-4o pricing: $2.50 / 1M input tokens,
    # $10.00 / 1M output tokens (update alongside AIcostMonitor.MODEL_COSTS).
    input_cost_per_token = 0.0000025
    output_cost_per_token = 0.00001
    cost = (response.usage.prompt_tokens * input_cost_per_token
            + response.usage.completion_tokens * output_cost_per_token)
    return {"response": answer, "source": "llm", "cost": cost}
Expected savings: 20–30% of LLM calls, depending on query repetition rate — each cache hit is one call you never pay for, so savings track the hit rate above.
Pattern 2: Query-Complexity-Based Model Routing
Not every query needs GPT-4o. Simple classification, template filling, and routine Q&A can run on smaller models with no quality difference for the user.
from enum import Enum
from dataclasses import dataclass
class QueryComplexity(Enum):
    """Coarse difficulty tiers used to pick an inference model per query."""

    SIMPLE = "simple"        # routed to gpt-4o-mini or similar (~10x cheaper)
    MODERATE = "moderate"    # routed to gpt-4o
    COMPLEX = "complex"      # routed to gpt-4o or o1-preview for reasoning
@dataclass
class RoutingDecision:
    """Result of routing a single query.

    Carries the assigned complexity tier, the model name to call, and a
    human-readable rationale for logging and debugging routing behavior.
    """

    complexity: QueryComplexity
    model: str
    rationale: str
class ModelRouter:
    """Route each query to the cheapest model that can plausibly handle it.

    Pure keyword heuristics — zero added latency or cost. Replace with a
    trained classifier only if routing mistakes prove expensive.
    """

    # Signals that suggest higher complexity.
    # Fix over the original: include US spellings alongside UK ones —
    # "analyze"/"synthesize" queries previously never matched.
    COMPLEX_SIGNALS = [
        "analyse", "analyze", "compare", "synthesise", "synthesize",
        "evaluate", "reason", "explain why", "what are the implications",
        "trade-off", "multi-step", "infer", "hypothesis"
    ]
    SIMPLE_SIGNALS = [
        "what is", "define", "list", "when", "who", "where",
        "how many", "summarise this", "summarize this", "translate", "format"
    ]

    def route(self, query: str, context_length: int = 0) -> RoutingDecision:
        """Classify *query* and pick a model.

        Args:
            query: the user's question.
            context_length: characters of retrieved context that will be sent
                with the query (long context pushes toward the stronger model).

        Returns:
            A RoutingDecision with tier, model name, and rationale.
        """
        query_lower = query.lower()
        complex_score = sum(1 for s in self.COMPLEX_SIGNALS if s in query_lower)
        simple_score = sum(1 for s in self.SIMPLE_SIGNALS if s in query_lower)
        # Long context = more likely to need the powerful model.
        if context_length > 8000 or complex_score >= 2:
            return RoutingDecision(
                complexity=QueryComplexity.COMPLEX,
                model="gpt-4o",
                rationale=f"Complex signals: {complex_score}, context: {context_length}"
            )
        elif simple_score >= 2 and complex_score == 0 and context_length < 2000:
            return RoutingDecision(
                complexity=QueryComplexity.SIMPLE,
                model="gpt-4o-mini",
                rationale=f"Simple signals: {simple_score}"
            )
        else:
            return RoutingDecision(
                complexity=QueryComplexity.MODERATE,
                model="gpt-4o-mini",  # Default to cheaper model for moderate
                rationale="Moderate complexity"
            )
def routed_llm_call(query: str, context: str, system_prompt: str) -> dict:
    """Send *query* (plus *context*) to a complexity-appropriate model.

    Returns the answer together with the routing metadata so callers can
    log which model handled the query and why.
    """
    decision = ModelRouter().route(query, len(context))
    user_message = f"{context}\n\nQuery: {query}"
    completion = client.chat.completions.create(
        model=decision.model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message}
        ]
    )
    return {
        "response": completion.choices[0].message.content,
        "model_used": decision.model,
        "routing_rationale": decision.rationale,
        "tokens_used": completion.usage.total_tokens
    }
Expected savings: 30–40% of inference costs for mixed-complexity query distributions.
Pattern 3: Prompt Compression Measurement
Before compressing, measure. The goal is to identify which parts of your prompts are high-information vs redundant.
def measure_prompt_efficiency(
    original_prompt: str,
    compressed_prompt: str,
    test_queries: list[str],
    system_prompt: str
) -> dict:
    """
    Measure: does the compressed prompt produce equivalent outputs?
    Run BEFORE switching to compressed prompt in production.

    Raises:
        ValueError: if original_prompt is empty (reduction is undefined).
    """
    # Rough token estimate: ~1.3 tokens per whitespace-separated word.
    original_tokens = len(original_prompt.split()) * 1.3
    compressed_tokens = len(compressed_prompt.split()) * 1.3
    if original_tokens == 0:
        # Fix over the original: fail loudly instead of ZeroDivisionError.
        raise ValueError("original_prompt must not be empty")
    token_reduction = (original_tokens - compressed_tokens) / original_tokens
    quality_scores = []
    for query in test_queries[:20]:  # sample evaluation
        orig_response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": original_prompt + "\n\n" + query}
            ]
        ).choices[0].message.content
        comp_response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": compressed_prompt + "\n\n" + query}
            ]
        ).choices[0].message.content
        # Simple length/structure similarity as proxy — replace with semantic
        # similarity in production evaluation.
        length_ratio = len(comp_response) / max(len(orig_response), 1)
        if length_ratio == 0:
            # Fix: an empty compressed response is total quality loss,
            # not a ZeroDivisionError from 1/length_ratio.
            quality_scores.append(0.0)
        else:
            quality_scores.append(min(length_ratio, 1 / length_ratio))
    # Fix: no test queries -> no quality evidence; score 0 forces a review
    # instead of dividing by zero.
    avg_quality = (sum(quality_scores) / len(quality_scores)) if quality_scores else 0.0
    return {
        "token_reduction_pct": f"{token_reduction:.1%}",
        "estimated_monthly_savings_at_1M_calls": f"${token_reduction * 1000 * 0.0000025 * 1_000_000:.0f}",
        "avg_quality_retention": f"{avg_quality:.2%}",
        "recommendation": "SAFE TO DEPLOY" if avg_quality > 0.85 else "REVIEW BEFORE DEPLOYING"
    }
Expected savings: 20–30% of token costs with well-compressed prompts. Run evaluation before switching.
Pattern 4: Cost Monitoring Dashboard (Catch Spiral Early)
The most important pattern: know your cost per query, per model, per feature — in real time.
from collections import defaultdict
from datetime import datetime, timezone
class AIcostMonitor:
    """In-memory per-call cost logger with a daily spend alert.

    Attributes per-call spend to model and feature so a cost spiral is
    visible the day it starts. In production, persist `calls` to a metrics
    store instead of a Python list.
    """

    # Pricing per 1K tokens in USD (update with current pricing)
    MODEL_COSTS = {
        "gpt-4o": {"input": 0.0025, "output": 0.01},
        "gpt-4o-mini": {"input": 0.00015, "output": 0.0006},
        "text-embedding-3-small": {"input": 0.00002, "output": 0.0}
    }
    # Fallback for unknown models — priced above gpt-4o so unrecognized
    # models surface in the summary rather than hide as cheap.
    _DEFAULT_PRICING = {"input": 0.003, "output": 0.012}

    def __init__(self, alert_threshold_daily_usd: float = 100.0):
        self.calls: list[dict] = []
        self.alert_threshold = alert_threshold_daily_usd

    def log_call(self, model: str, input_tokens: int,
                 output_tokens: int, feature: str) -> None:
        """Record one LLM call with its computed USD cost."""
        pricing = self.MODEL_COSTS.get(model, self._DEFAULT_PRICING)
        cost = (input_tokens / 1000 * pricing["input"]) + \
               (output_tokens / 1000 * pricing["output"])
        self.calls.append({
            # Timezone-aware UTC; datetime.utcnow() is deprecated (3.12+)
            # and returned a naive datetime.
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "model": model,
            "feature": feature,
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "cost_usd": cost
        })

    def daily_summary(self) -> dict:
        """Aggregate today's (UTC) spend by feature and model, with alerting."""
        today = datetime.now(timezone.utc).date().isoformat()
        # ISO timestamps start with the date, so a prefix match selects today.
        today_calls = [c for c in self.calls if c["timestamp"].startswith(today)]
        by_feature = defaultdict(float)
        by_model = defaultdict(float)
        total = sum(c["cost_usd"] for c in today_calls)
        for call in today_calls:
            by_feature[call["feature"]] += call["cost_usd"]
            by_model[call["model"]] += call["cost_usd"]
        alert = total > self.alert_threshold
        return {
            "date": today,
            "total_cost_usd": round(total, 4),
            "alert": alert,
            "alert_message": f"ALERT: Daily cost ${total:.2f} exceeds threshold ${self.alert_threshold}" if alert else None,
            # Sorted descending by spend so the biggest line items come first.
            "by_feature": dict(sorted(by_feature.items(), key=lambda x: -x[1])),
            "by_model": dict(sorted(by_model.items(), key=lambda x: -x[1])),
            # Naive linear projection: today's spend x 30.
            "projected_monthly": round(total * 30, 2)
        }
The Decision Tree
Query arrives
│
├── Check semantic cache
│ ├── HIT → return cached response (cost: $0)
│ └── MISS → continue
│
├── Route by complexity
│ ├── SIMPLE → gpt-4o-mini (10x cheaper)
│ ├── MODERATE → gpt-4o-mini
│ └── COMPLEX → gpt-4o
│
├── Apply compressed prompt template
│
├── Call LLM
│
├── Log to cost monitor
│
└── Store in cache for future hits
Implement these 4 patterns in sequence. Start with the monitor (you can't optimise what you can't measure), then caching (biggest immediate impact), then routing, then prompt compression.
What's your current cost per 1,000 queries in production? Curious what numbers people are seeing across different use cases.
Sunil — CEO, Ailoitte. We build cost-optimised AI architectures for funded startups. ailoitte.com
Top comments (0)