
Redis Caching for AI Applications: Reducing Latency and Cost

AI API calls are expensive and slow. Redis caching dramatically reduces both by storing AI responses for reuse. Here's a complete implementation guide.

Why Cache AI Responses?

| Without Cache | With Cache |
| --- | --- |
| Every request hits the AI API | Cache hits return immediately |
| 1-3 s latency per request | < 10 ms for cache hits |
| Full API cost per request | Pay only for cache misses |
| Rate limit pressure | Rate limit relief |

Semantic Caching vs Exact Match

# Exact match caching (simple)
cache_key = hash(messages)  # Only matches identical prompts

# Semantic caching (smart)
cache_key = generate_embedding_hash(user_prompt)  # Matches similar prompts

This guide builds out exact-match caching in full; semantic caching additionally needs an embedding model and a similarity search, and the lookup side of it is sketched below.
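
A minimal sketch of that semantic lookup, assuming you compute embeddings with whatever model you already use (the SemanticIndex class, its 0.95 threshold, and the in-process dict are illustrative only; production code would use Redis's vector search or a dedicated vector store):

import math
from typing import Optional

def cosine(a: list[float], b: list[float]) -> float:
    """Cosine similarity between two embedding vectors."""
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(x * x for x in b))
    return dot / (norm_a * norm_b) if norm_a and norm_b else 0.0

class SemanticIndex:
    """Maps cached Redis keys to prompt embeddings for nearest-match lookup."""

    def __init__(self, threshold: float = 0.95):
        self.threshold = threshold  # minimum similarity to count as a hit
        self.entries: dict[str, list[float]] = {}

    def add(self, redis_key: str, embedding: list[float]) -> None:
        self.entries[redis_key] = embedding

    def find(self, embedding: list[float]) -> Optional[str]:
        """Return the Redis key of the closest cached prompt, if similar enough."""
        best_key, best_score = None, 0.0
        for key, emb in self.entries.items():
            score = cosine(embedding, emb)
            if score > best_score:
                best_key, best_score = key, score
        return best_key if best_score >= self.threshold else None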

Redis Setup

import hashlib
import json
from typing import Optional

import redis

class AICache:
    def __init__(self, redis_url: str = "redis://localhost:6379/0"):
        self.redis = redis.from_url(redis_url, decode_responses=True)

    def _make_key(self, messages: list[dict]) -> str:
        """Create cache key from messages."""
        # Sort keys so the same messages always produce the same key
        content = json.dumps(messages, sort_keys=True)
        return f"ai:response:{hashlib.sha256(content.encode()).hexdigest()}"

    def get(self, messages: list[dict]) -> Optional[str]:
        """Get cached response if exists."""
        key = self._make_key(messages)
        cached = self.redis.get(key)
        if cached:
            # Move to the front of the recency list (set() pushes new keys to the front)
            self.redis.lrem("ai:recent", 1, key)
            self.redis.lpush("ai:recent", key)
            return cached
        return None

    def set(self, messages: list[dict], response: str, ttl: int = 86400):
        """Cache a response."""
        key = self._make_key(messages)
        pipe = self.redis.pipeline()
        pipe.set(key, response, ex=ttl)
        pipe.lpush("ai:recent", key)
        pipe.ltrim("ai:recent", 0, 999)  # Keep last 1000 keys
        pipe.execute()

    def invalidate(self, messages: list[dict]):
        key = self._make_key(messages)
        self.redis.delete(key)
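
A quick round trip, assuming a Redis server is reachable at localhost:6379:

cache = AICache()
messages = [{"role": "user", "content": "Explain async/await in Python"}]

cache.set(messages, "async/await lets a coroutine pause at await points...", ttl=3600)
print(cache.get(messages))   # cache hit: returns the stored response
cache.invalidate(messages)
print(cache.get(messages))   # None: the entry is gone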

Using the Cache with AI Client

class CachedAIClient:
    def __init__(self, api_key: str, cache: AICache):
        # AsyncAIClient stands in for your AI provider's async chat client
        self.client = AsyncAIClient(api_key)
        self.cache = cache
        self.cache_ttl = 86400  # 24 hours

    async def chat(self, messages: list[dict], use_cache: bool = True) -> str:
        # Try cache first
        if use_cache:
            cached = self.cache.get(messages)
            if cached:
                print("Cache hit!")
                return cached

        # Cache miss - call API
        response = await self.client.chat(messages)

        # Store in cache
        if use_cache:
            self.cache.set(messages, response, self.cache_ttl)

        return response
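
Wiring it together (again, AsyncAIClient is a stand-in; swap in your provider's client):

import asyncio

async def main():
    client = CachedAIClient(api_key="YOUR_KEY", cache=AICache())
    messages = [{"role": "user", "content": "What is a context manager?"}]

    first = await client.chat(messages)    # miss: calls the API, fills the cache
    second = await client.chat(messages)   # hit: served from Redis in milliseconds
    assert first == second

asyncio.run(main())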

Intelligent Cache Invalidation

class IntelligentCache(AICache):
    """
    Cache that handles:
    1. TTL expiration
    2. Manual invalidation
    3. LRU eviction
    4. Cache statistics
    """

    def __init__(self, redis_url: str, max_size: int = 10000):
        super().__init__(redis_url)
        self.max_size = max_size
        self._check_size()

    def set(self, messages: list[dict], response: str, ttl: int = 86400):
        # Enforce the size cap on every write, not just at startup
        super().set(messages, response, ttl)
        self._check_size()

    def _check_size(self):
        """Evict the oldest entries when the cache exceeds max_size."""
        size = self.redis.llen("ai:recent")
        if size >= self.max_size:
            # Oldest keys sit at the tail of the recency list; evict the oldest 10%
            to_remove = max(self.max_size // 10, 1)
            oldest = self.redis.lrange("ai:recent", -to_remove, -1)
            pipe = self.redis.pipeline()
            for key in oldest:
                pipe.delete(key)  # keys in the list are already fully qualified
            pipe.ltrim("ai:recent", 0, -(to_remove + 1))  # drop the evicted tail
            pipe.execute()

    def get_stats(self) -> dict:
        """Return cache statistics (keyspace hits/misses are server-wide counters)."""
        info = self.redis.info("stats")
        hits = info.get("keyspace_hits", 0)
        misses = info.get("keyspace_misses", 0)
        return {
            "total_keys": self.redis.llen("ai:recent"),
            "hits": hits,
            "misses": misses,
            "hit_rate": hits / max(hits + misses, 1) * 100,
        }
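
Checking in on the cache, again assuming a local Redis instance:

cache = IntelligentCache("redis://localhost:6379/0", max_size=5000)
stats = cache.get_stats()
print(f"{stats['total_keys']} keys, {stats['hit_rate']:.1f}% hit rate")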

Cache Warming Strategy

async def warm_cache(client: CachedAIClient, common_prompts: list[str]):
    """
    Pre-populate cache with common prompts.
    Run at startup or during off-peak hours.
    """
    print(f"Warming cache with {len(common_prompts)} prompts...")

    for i, prompt in enumerate(common_prompts):
        try:
            await client.chat(
                [{"role": "user", "content": prompt}],
                use_cache=True
            )
            if (i + 1) % 10 == 0:
                print(f"  Warmed {i + 1}/{len(common_prompts)}")
        except Exception as e:
            print(f"  Failed on prompt {i}: {e}")

    print("Cache warming complete!")

# Example common prompts
COMMON_PROMPTS = [
    "Explain async/await in Python",
    "How do I use list comprehensions?",
    "What is a context manager?",
    # ... your most common queries
]
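
To warm the cache at startup without blocking traffic, kick it off as a background task. A sketch assuming a FastAPI app (the app and client names here are illustrative):

import asyncio

from fastapi import FastAPI

app = FastAPI()
client = CachedAIClient(api_key="YOUR_KEY", cache=AICache())

@app.on_event("startup")
async def warm_on_startup():
    # Fire and forget: the API starts serving while the cache warms
    asyncio.create_task(warm_cache(client, COMMON_PROMPTS))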

Production Deployment

# docker-compose.yml
version: '3.8'
services:
  api:
    build: .
    depends_on:
      - redis
    environment:
      - REDIS_URL=redis://redis:6379/0
      - OFOX_API_KEY=${OFOX_API_KEY}

  redis:
    image: redis:7-alpine
    volumes:
      - redis-data:/data
    command: redis-server --maxmemory 256mb --maxmemory-policy allkeys-lru

volumes:
  redis-data:
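
Inside the container, the app picks up the Redis URL from the environment; a minimal sketch:

import os

redis_url = os.environ.get("REDIS_URL", "redis://localhost:6379/0")
cache = AICache(redis_url)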

Cache Hit Rate Monitoring

import time

from fastapi import Request
from prometheus_client import Counter, Gauge

cache_hits = Counter('ai_cache_hits', 'Number of cache hits')
cache_misses = Counter('ai_cache_misses', 'Number of cache misses')
cache_latency = Gauge('ai_cache_latency_seconds', 'Latest /chat request latency')

@app.middleware("http")  # `app` is the FastAPI instance from the warming sketch
async def cache_metrics_middleware(request: Request, call_next):
    if "/chat" in str(request.url):
        start = time.time()
        response = await call_next(request)
        cache_latency.set(time.time() - start)
        return response
    return await call_next(request)
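
The counters still need to be incremented where hits and misses actually happen. One sketch, as a subclass of CachedAIClient (the extra GET used to detect a hit is the price of keeping metrics out of the base class):

class MeteredAIClient(CachedAIClient):
    async def chat(self, messages: list[dict], use_cache: bool = True) -> str:
        # Peek at the cache to record the metric, then delegate
        if use_cache and self.cache.get(messages) is not None:
            cache_hits.inc()
        else:
            cache_misses.inc()
        return await super().chat(messages, use_cache=use_cache)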

Cost Savings Calculator

def calculate_savings(
    requests_per_day: int,
    cache_hit_rate: float,
    api_cost_per_1k_tokens: float = 0.003,
    avg_tokens_per_request: int = 500
):
    daily_requests = requests_per_day
    cache_hits = int(daily_requests * cache_hit_rate)
    cache_misses = daily_requests - cache_hits

    # Full cost for misses, free for hits
    total_cost = (cache_misses * avg_tokens_per_request / 1000) * api_cost_per_1k_tokens

    no_cache_cost = (daily_requests * avg_tokens_per_request / 1000) * api_cost_per_1k_tokens

    savings = no_cache_cost - total_cost

    return {
        "requests": daily_requests,
        "cache_hit_rate": f"{cache_hit_rate * 100:.1f}%",
        "daily_cost": f"${total_cost:.2f}",
        "daily_savings": f"${savings:.2f}",
        "monthly_savings": f"${savings * 30:.2f}"
    }

# Example: 10000 requests/day, 70% cache hit rate
print(calculate_savings(10000, 0.70))
# {'requests': 10000, 'cache_hit_rate': '70.0%', 
#  'daily_cost': '$4.50', 'daily_savings': '$10.50',
#  'monthly_savings': '$315.00'}

Getting Started

Implement Redis caching for your AI applications with ofox.ai. Their reliable API makes caching easy, since responses are consistent and deterministic.

👉 Get started with ofox.ai


This article contains affiliate links.


Tags: redis,caching,ai,programming,developer
Canonical URL: https://dev.to/zny10289
