AI API calls are expensive and slow. Redis caching dramatically reduces both by storing AI responses for reuse. Here's a complete implementation guide.
## Why Cache AI Responses?
| Without Cache | With Cache |
|---|---|
| Every request → AI API | Cache hit → Return immediately |
| 1-3s latency per request | < 10ms for cache hits |
| Full API cost per request | Pay only for cache misses |
| Rate limit pressure | Rate limit relief |
## Semantic Caching vs Exact Match
```python
# Exact match caching (simple)
cache_key = hash(messages)  # Only matches identical prompts

# Semantic caching (smart)
cache_key = generate_embedding_hash(user_prompt)  # Matches similar prompts
```
This guide implements exact-match caching; the sketch below previews the semantic approach.
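For reference, here is a minimal sketch of what semantic matching involves. Everything in it is illustrative: `embed_fn` stands in for whatever embedding call you use, the `0.95` threshold is an arbitrary assumption, and the entries live in a plain Python list rather than Redis.

```python
import numpy as np


def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Cosine similarity between two embedding vectors."""
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))


class SemanticCache:
    """Match prompts by embedding similarity instead of exact equality."""

    def __init__(self, embed_fn, threshold: float = 0.95):
        self.embed = embed_fn        # any function: str -> np.ndarray
        self.threshold = threshold   # how similar counts as "the same question"
        self.entries: list[tuple[np.ndarray, str]] = []

    def get(self, prompt: str) -> str | None:
        query = self.embed(prompt)
        for vector, response in self.entries:
            if cosine_similarity(query, vector) >= self.threshold:
                return response   # close enough - reuse the cached answer
        return None

    def set(self, prompt: str, response: str):
        self.entries.append((self.embed(prompt), response))
```

A production version would keep the vectors in Redis too (for example via the Redis Stack vector search features) instead of scanning a list, but the matching idea is the same.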
## Redis Setup
```python
import hashlib
import json
from typing import Optional

import redis


class AICache:
    def __init__(self, redis_url: str = "redis://localhost:6379/0"):
        self.redis = redis.from_url(redis_url, decode_responses=True)

    def _make_key(self, messages: list[dict]) -> str:
        """Create a cache key from messages."""
        # Sort keys so the same messages always produce the same key
        content = json.dumps(messages, sort_keys=True)
        return f"ai:response:{hashlib.sha256(content.encode()).hexdigest()}"

    def get(self, messages: list[dict]) -> Optional[str]:
        """Get the cached response if it exists."""
        key = self._make_key(messages)
        cached = self.redis.get(key)
        if cached:
            # Move to front (LRU-like behavior)
            self.redis.lrem("ai:recent", 1, key)
            self.redis.rpush("ai:recent", key)
            return cached
        return None

    def set(self, messages: list[dict], response: str, ttl: int = 86400):
        """Cache a response."""
        key = self._make_key(messages)
        pipe = self.redis.pipeline()
        pipe.set(key, response, ex=ttl)
        pipe.lpush("ai:recent", key)
        pipe.ltrim("ai:recent", 0, 999)  # Keep the 1000 most recent keys
        pipe.execute()

    def invalidate(self, messages: list[dict]):
        """Remove a cached response."""
        key = self._make_key(messages)
        self.redis.delete(key)
```
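With a Redis server running locally, a quick round-trip looks like this (the cached string is just a placeholder):

```python
cache = AICache()
messages = [{"role": "user", "content": "Explain async/await in Python"}]

print(cache.get(messages))   # None - cold cache
cache.set(messages, "async/await lets coroutines pause and resume...")
print(cache.get(messages))   # the stored response, straight from Redis
cache.invalidate(messages)   # remove it again
```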
## Using the Cache with AI Client
```python
class CachedAIClient:
    def __init__(self, api_key: str, cache: AICache):
        self.client = AsyncAIClient(api_key)
        self.cache = cache
        self.cache_ttl = 86400  # 24 hours

    async def chat(self, messages: list[dict], use_cache: bool = True) -> str:
        # Try the cache first
        if use_cache:
            cached = self.cache.get(messages)
            if cached:
                print("Cache hit!")
                return cached

        # Cache miss - call the API
        response = await self.client.chat(messages)

        # Store in cache
        if use_cache:
            self.cache.set(messages, response, self.cache_ttl)
        return response
```
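Assuming `AsyncAIClient` is your async API wrapper (as above), usage looks like:

```python
import asyncio


async def main():
    client = CachedAIClient(api_key="...", cache=AICache())
    messages = [{"role": "user", "content": "What is a context manager?"}]
    await client.chat(messages)   # miss: calls the API, stores the result
    await client.chat(messages)   # hit: prints "Cache hit!" and returns instantly

asyncio.run(main())
```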
## Intelligent Cache Invalidation
```python
class IntelligentCache(AICache):
    """
    Cache that handles:
    - TTL expiration
    - Manual invalidation
    - LRU eviction
    - Cache statistics
    """

    def __init__(self, redis_url: str, max_size: int = 10000):
        super().__init__(redis_url)
        self.max_size = max_size
        self._check_size()

    def set(self, messages: list[dict], response: str, ttl: int = 86400):
        """Cache a response and track it for eviction."""
        super().set(messages, response, ttl)
        key = self._make_key(messages)
        # Track every key in a set (for size/stats) and a list (for age order)
        self.redis.sadd("ai:cache:keys", key)
        self.redis.rpush("ai:cache:order", key)
        self._check_size()

    def _check_size(self):
        """Ensure the cache doesn't exceed max size."""
        size = self.redis.scard("ai:cache:keys")
        if size >= self.max_size:
            # Evict the oldest 10%
            to_remove = self.max_size // 10
            oldest = self.redis.lrange("ai:cache:order", 0, to_remove - 1)
            pipe = self.redis.pipeline()
            for key in oldest:
                pipe.delete(key)
                pipe.srem("ai:cache:keys", key)
            # Drop the evicted entries from the front of the order list
            pipe.ltrim("ai:cache:order", to_remove, -1)
            pipe.execute()

    def get_stats(self) -> dict:
        """Return cache statistics."""
        info = self.redis.info("stats")
        hits = info.get("keyspace_hits", 0)
        misses = info.get("keyspace_misses", 0)
        return {
            "total_keys": self.redis.scard("ai:cache:keys"),
            "hits": hits,
            "misses": misses,
            "hit_rate": hits / max(hits + misses, 1) * 100,
        }
```
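One caveat: `keyspace_hits` and `keyspace_misses` from `INFO stats` count every key lookup on the Redis server, not just this cache, so treat the hit rate as an approximation. Quick usage:

```python
cache = IntelligentCache("redis://localhost:6379/0", max_size=10000)
stats = cache.get_stats()
print(f"{stats['total_keys']} keys, ~{stats['hit_rate']:.1f}% hit rate")
```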
## Cache Warming Strategy
```python
async def warm_cache(client: CachedAIClient, common_prompts: list[str]):
    """
    Pre-populate the cache with common prompts.
    Run at startup or during off-peak hours.
    """
    print(f"Warming cache with {len(common_prompts)} prompts...")
    for i, prompt in enumerate(common_prompts):
        try:
            await client.chat(
                [{"role": "user", "content": prompt}],
                use_cache=True
            )
            if (i + 1) % 10 == 0:
                print(f"  Warmed {i + 1}/{len(common_prompts)}")
        except Exception as e:
            print(f"  Failed on prompt {i}: {e}")
    print("Cache warming complete!")


# Example common prompts
COMMON_PROMPTS = [
    "Explain async/await in Python",
    "How do I use list comprehensions?",
    "What is a context manager?",
    # ... your most common queries
]
```
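To warm the cache at startup, you might run something like:

```python
import asyncio

if __name__ == "__main__":
    client = CachedAIClient(api_key="...", cache=AICache())
    asyncio.run(warm_cache(client, COMMON_PROMPTS))
```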
## Production Deployment
```yaml
# docker-compose.yml
version: '3.8'

services:
  api:
    build: .
    depends_on:
      - redis
    environment:
      - REDIS_URL=redis://redis:6379/0
      - OFOX_API_KEY=${OFOX_API_KEY}

  redis:
    image: redis:7-alpine
    volumes:
      - redis-data:/data
    command: redis-server --maxmemory 256mb --maxmemory-policy allkeys-lru

volumes:
  redis-data:
```
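The app then picks the Redis URL up from the environment; a minimal wiring sketch:

```python
import os

# Falls back to localhost for local development outside Docker
redis_url = os.environ.get("REDIS_URL", "redis://localhost:6379/0")
cache = AICache(redis_url)
```

Note that `--maxmemory-policy allkeys-lru` tells Redis itself to evict the least recently used keys once it reaches 256 MB, a server-side backstop to the application-level eviction above.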
## Cache Hit Rate Monitoring
```python
import time

from fastapi import FastAPI, Request
from prometheus_client import Counter, Gauge

app = FastAPI()

cache_hits = Counter('ai_cache_hits', 'Number of cache hits')
cache_misses = Counter('ai_cache_misses', 'Number of cache misses')
cache_latency = Gauge('ai_cache_latency_seconds', 'Cache operation latency')


@app.middleware("http")
async def cache_metrics_middleware(request: Request, call_next):
    # Only time the chat endpoint; everything else passes straight through
    if "/chat" in str(request.url):
        start = time.time()
        response = await call_next(request)
        cache_latency.set(time.time() - start)
        return response
    return await call_next(request)
```
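The middleware above only records latency; the hit/miss counters still need to be incremented where the cache lookup actually happens. One way, as a sketch, is inside `CachedAIClient.chat`:

```python
async def chat(self, messages: list[dict], use_cache: bool = True) -> str:
    if use_cache:
        cached = self.cache.get(messages)
        if cached:
            cache_hits.inc()    # served from Redis
            return cached
        cache_misses.inc()      # falling through to the API

    response = await self.client.chat(messages)
    if use_cache:
        self.cache.set(messages, response, self.cache_ttl)
    return response
```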
## Cost Savings Calculator
```python
def calculate_savings(
    requests_per_day: int,
    cache_hit_rate: float,
    api_cost_per_1k_tokens: float = 0.003,
    avg_tokens_per_request: int = 500
):
    daily_requests = requests_per_day
    cache_hits = int(daily_requests * cache_hit_rate)
    cache_misses = daily_requests - cache_hits

    # Full cost for misses, free for hits
    total_cost = (cache_misses * avg_tokens_per_request / 1000) * api_cost_per_1k_tokens
    no_cache_cost = (daily_requests * avg_tokens_per_request / 1000) * api_cost_per_1k_tokens
    savings = no_cache_cost - total_cost

    return {
        "requests": daily_requests,
        "cache_hit_rate": f"{cache_hit_rate * 100:.1f}%",
        "daily_cost": f"${total_cost:.2f}",
        "daily_savings": f"${savings:.2f}",
        "monthly_savings": f"${savings * 30:.2f}"
    }


# Example: 10000 requests/day, 70% cache hit rate
print(calculate_savings(10000, 0.70))
# {'requests': 10000, 'cache_hit_rate': '70.0%',
#  'daily_cost': '$4.50', 'daily_savings': '$10.50',
#  'monthly_savings': '$315.00'}
```
## Getting Started
Implement Redis caching for your AI applications with ofox.ai. Their reliable API makes caching straightforward, since responses are consistent and deterministic.
👉 Get started with ofox.ai
This article contains affiliate links.
Tags: redis,caching,ai,programming,developer