# Redis Caching for AI Applications: Reducing Latency and Cost
AI API calls are expensive and slow. Redis caching dramatically reduces both by storing AI responses for reuse. Here's a complete implementation guide.
## Why Cache AI Responses?
| Without Cache | With Cache |
|---|---|
| Every request → AI API | Cache hit → Return immediately |
| 1-3s latency per request | < 10ms for cache hits |
| Full API cost per request | Pay only for cache misses |
| Rate limit pressure | Rate limit relief |
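Those latency numbers are easy to sanity-check against your own Redis instance; here is a minimal timing sketch (assuming a local Redis and a throwaway key):

```python
import time
import redis

r = redis.from_url("redis://localhost:6379/0", decode_responses=True)
r.set("ai:demo", "cached response")

start = time.perf_counter()
r.get("ai:demo")
print(f"Cache read: {(time.perf_counter() - start) * 1000:.2f} ms")
# Typically well under 10 ms against a local or same-region Redis
```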
## Semantic Caching vs. Exact Match
```python
# Exact-match caching (simple): only identical prompts hit the cache
cache_key = hash(messages)

# Semantic caching (smart): similar prompts map to the same entry
cache_key = generate_embedding_hash(user_prompt)
```
This guide covers exact-match caching first, then sketches a semantic approach.
## Redis Setup
```python
import hashlib
import json
from typing import Optional

import redis


class AICache:
    def __init__(self, redis_url: str = "redis://localhost:6379/0"):
        self.redis = redis.from_url(redis_url, decode_responses=True)

    def _make_key(self, messages: list[dict]) -> str:
        """Create a deterministic cache key from messages."""
        # sort_keys ensures the same messages always produce the same key
        content = json.dumps(messages, sort_keys=True)
        return f"ai:response:{hashlib.sha256(content.encode()).hexdigest()}"

    def get(self, messages: list[dict]) -> Optional[str]:
        """Return the cached response, or None on a miss."""
        key = self._make_key(messages)
        cached = self.redis.get(key)
        if cached:
            # Move to the front of the recency list (LRU-like behavior)
            self.redis.lrem("ai:recent", 1, key)
            self.redis.lpush("ai:recent", key)
            return cached
        return None

    def set(self, messages: list[dict], response: str, ttl: int = 86400):
        """Cache a response with a TTL (default 24 hours)."""
        key = self._make_key(messages)
        pipe = self.redis.pipeline()
        pipe.set(key, response, ex=ttl)
        pipe.lpush("ai:recent", key)
        pipe.ltrim("ai:recent", 0, 999)  # Keep the 1000 most recent keys
        pipe.execute()

    def invalidate(self, messages: list[dict]):
        """Remove a cached response."""
        key = self._make_key(messages)
        self.redis.delete(key)
```
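A quick round-trip to sanity-check the class (assumes a Redis instance on localhost):

```python
cache = AICache()
messages = [{"role": "user", "content": "Explain async/await in Python"}]

assert cache.get(messages) is None              # cold cache: miss
cache.set(messages, "async/await lets you ...")  # store a response
assert cache.get(messages) is not None           # same messages: hit
```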
## Using the Cache with an AI Client
```python
class CachedAIClient:
    def __init__(self, api_key: str, cache: AICache):
        # AsyncAIClient stands in for your provider's async chat client
        self.client = AsyncAIClient(api_key)
        self.cache = cache
        self.cache_ttl = 86400  # 24 hours

    async def chat(self, messages: list[dict], use_cache: bool = True) -> str:
        # Try the cache first
        if use_cache:
            cached = self.cache.get(messages)
            if cached:
                print("Cache hit!")
                return cached
        # Cache miss: call the API
        response = await self.client.chat(messages)
        # Store the fresh response
        if use_cache:
            self.cache.set(messages, response, self.cache_ttl)
        return response
```
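That covers the exact-match path. For the semantic side, the sketch below embeds each prompt and reuses a stored response when cosine similarity clears a threshold. `embed_fn` is an assumption here (any embedding model that returns a 1-D NumPy vector works), and a production setup would keep the vectors in Redis itself, for example via RediSearch vector search or the RedisVL library, rather than in a Python list.

```python
import numpy as np


def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))


class SemanticCache:
    """Minimal in-process sketch of semantic caching."""

    def __init__(self, embed_fn, threshold: float = 0.95):
        self.embed = embed_fn        # assumed: text -> 1-D numpy vector
        self.threshold = threshold   # tune per embedding model
        self.entries: list[tuple[np.ndarray, str]] = []

    def get(self, prompt: str) -> str | None:
        query = self.embed(prompt)
        for embedding, response in self.entries:
            if cosine_similarity(query, embedding) >= self.threshold:
                return response      # similar enough: treat as a hit
        return None

    def set(self, prompt: str, response: str):
        self.entries.append((self.embed(prompt), response))
```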
## Intelligent Cache Invalidation
```python
class IntelligentCache(AICache):
    """
    Cache that handles:
    1. TTL expiration
    2. Manual invalidation
    3. Size-based eviction (oldest entries first)
    4. Cache statistics
    """

    def __init__(self, redis_url: str, max_size: int = 10000):
        super().__init__(redis_url)
        self.max_size = max_size
        self._check_size()

    def set(self, messages: list[dict], response: str, ttl: int = 86400):
        """Cache a response and track it for size-based eviction."""
        super().set(messages, response, ttl)
        key = self._make_key(messages)
        pipe = self.redis.pipeline()
        pipe.sadd("ai:cache:keys", key)
        pipe.rpush("ai:cache:order", key)  # oldest keys sit at the head
        pipe.execute()
        self._check_size()

    def _check_size(self):
        """Evict the oldest 10% of entries once max_size is reached."""
        size = self.redis.scard("ai:cache:keys")
        if size >= self.max_size:
            to_remove = self.max_size // 10
            oldest = self.redis.lrange("ai:cache:order", 0, to_remove - 1)
            pipe = self.redis.pipeline()
            for key in oldest:
                pipe.delete(key)
                pipe.srem("ai:cache:keys", key)
            pipe.ltrim("ai:cache:order", to_remove, -1)  # drop the evicted head
            pipe.execute()

    def get_stats(self) -> dict:
        """Return cache statistics.

        Note: keyspace_hits/keyspace_misses are server-wide Redis counters,
        so run the cache on a dedicated Redis instance for accurate numbers.
        """
        info = self.redis.info("stats")
        hits = info.get("keyspace_hits", 0)
        misses = info.get("keyspace_misses", 0)
        return {
            "total_keys": self.redis.scard("ai:cache:keys"),
            "hits": hits,
            "misses": misses,
            "hit_rate": hits / max(hits + misses, 1) * 100,
        }
```
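Wiring it up looks the same as before, and `get_stats()` gives you a quick health check (the values below are illustrative):

```python
cache = IntelligentCache("redis://localhost:6379/0", max_size=5000)
print(cache.get_stats())
# e.g. {'total_keys': 1234, 'hits': 8700, 'misses': 1300, 'hit_rate': 87.0}
```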
## Cache Warming Strategy
```python
async def warm_cache(client: CachedAIClient, common_prompts: list[str]):
    """
    Pre-populate the cache with common prompts.
    Run at startup or during off-peak hours.
    """
    print(f"Warming cache with {len(common_prompts)} prompts...")
    for i, prompt in enumerate(common_prompts):
        try:
            await client.chat(
                [{"role": "user", "content": prompt}],
                use_cache=True
            )
            if (i + 1) % 10 == 0:
                print(f"  Warmed {i + 1}/{len(common_prompts)}")
        except Exception as e:
            print(f"  Failed on prompt {i}: {e}")
    print("Cache warming complete!")


# Example common prompts
COMMON_PROMPTS = [
    "Explain async/await in Python",
    "How do I use list comprehensions?",
    "What is a context manager?",
    # ... your most common queries
]
```
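A typical way to trigger warming at startup (assuming `client` is a configured `CachedAIClient`):

```python
import asyncio

client = CachedAIClient(api_key="...", cache=AICache())
asyncio.run(warm_cache(client, COMMON_PROMPTS))
```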
## Production Deployment
```yaml
# docker-compose.yml
version: '3.8'
services:
  api:
    build: .
    depends_on:
      - redis
    environment:
      - REDIS_URL=redis://redis:6379/0
      - OFOX_API_KEY=${OFOX_API_KEY}
  redis:
    image: redis:7-alpine
    volumes:
      - redis-data:/data
    command: redis-server --maxmemory 256mb --maxmemory-policy allkeys-lru
volumes:
  redis-data:
```
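`allkeys-lru` tells Redis to evict the least recently used keys once `maxmemory` is reached, a sensible default for a pure cache. Once the stack is up, you can confirm the settings took effect from Python:

```python
import redis

r = redis.from_url("redis://localhost:6379/0")
# Confirm the eviction settings from docker-compose
print(r.config_get("maxmemory-policy"))  # expect {'maxmemory-policy': 'allkeys-lru'}
print(r.config_get("maxmemory"))
```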
## Cache Hit Rate Monitoring
```python
import time

from fastapi import FastAPI, Request
from prometheus_client import Counter, Gauge

app = FastAPI()

cache_hits = Counter('ai_cache_hits', 'Number of cache hits')
cache_misses = Counter('ai_cache_misses', 'Number of cache misses')
cache_latency = Gauge('ai_cache_latency_seconds', 'Cache operation latency')


@app.middleware("http")
async def cache_metrics_middleware(request: Request, call_next):
    if "/chat" in str(request.url):
        start = time.time()
        response = await call_next(request)
        cache_latency.set(time.time() - start)
        return response
    return await call_next(request)
```
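The middleware only records latency; the hit and miss counters should be incremented wherever the cache is actually consulted. One way to wire them into `CachedAIClient.chat` (a sketch, not the only option):

```python
# Inside CachedAIClient.chat, replacing the plain cache check:
if use_cache:
    cached = self.cache.get(messages)
    if cached:
        cache_hits.inc()    # Prometheus counter defined above
        return cached
    cache_misses.inc()
```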
## Cost Savings Calculator
```python
def calculate_savings(
    requests_per_day: int,
    cache_hit_rate: float,
    api_cost_per_1k_tokens: float = 0.003,
    avg_tokens_per_request: int = 500
):
    cache_hits = int(requests_per_day * cache_hit_rate)
    cache_misses = requests_per_day - cache_hits
    # Full cost for misses, zero marginal cost for hits
    total_cost = (cache_misses * avg_tokens_per_request / 1000) * api_cost_per_1k_tokens
    no_cache_cost = (requests_per_day * avg_tokens_per_request / 1000) * api_cost_per_1k_tokens
    savings = no_cache_cost - total_cost
    return {
        "requests": requests_per_day,
        "cache_hit_rate": f"{cache_hit_rate * 100:.1f}%",
        "daily_cost": f"${total_cost:.2f}",
        "daily_savings": f"${savings:.2f}",
        "monthly_savings": f"${savings * 30:.2f}"
    }


# Example: 10,000 requests/day, 70% cache hit rate
print(calculate_savings(10000, 0.70))
# {'requests': 10000, 'cache_hit_rate': '70.0%',
#  'daily_cost': '$4.50', 'daily_savings': '$10.50',
#  'monthly_savings': '$315.00'}
```
## Getting Started
Implement Redis caching for your AI applications with ofox.ai. Their reliable API makes caching straightforward, since responses are consistent and deterministic.
This article contains affiliate links.