5 AI Agent Memory Patterns That Actually Work (With Python Code)
Every AI agent starts stateless. Each request is a blank slate. That works for simple tasks, but the moment you need context — "what did the user ask yesterday?" or "what files did I already review?" — you need memory.
Here are 5 memory patterns, ordered from simplest to most powerful. Each one includes working code.
Pattern 1: Conversation Buffer (The Default)
Most frameworks give you this for free. Store the full conversation history and pass it back every time.
class BufferMemory:
    """Full-history conversation memory, capped at a fixed message count."""

    def __init__(self, max_messages: int = 50):
        self.messages: list[dict] = []
        self.max_messages = max_messages

    def add(self, role: str, content: str):
        """Append one message; oldest messages are dropped past the cap."""
        self.messages.append({"role": role, "content": content})
        # Keep only the newest max_messages entries.
        if len(self.messages) > self.max_messages:
            del self.messages[:-self.max_messages]

    def get_context(self) -> list[dict]:
        """Return a shallow copy so callers can't mutate internal state."""
        return list(self.messages)
When to use: Simple chatbots, single-session tools.
The problem: Token cost scales linearly. At 50 messages, you're burning thousands of tokens per call just on context. And when the session ends, everything disappears.
Pattern 2: Sliding Window + Summary
Keep recent messages verbatim, but compress older ones into a running summary.
import json
class SlidingWindowMemory:
    """Keep the last N messages verbatim; compress older ones into a summary."""

    def __init__(self, window_size: int = 10):
        self.recent: list[dict] = []
        self.summary: str = ""
        self.window_size = window_size

    def add(self, role: str, content: str):
        """Record a message; anything pushed out of the window is summarized."""
        self.recent.append({"role": role, "content": content})
        if len(self.recent) > self.window_size:
            overflow = self.recent[:-self.window_size]
            self.recent = self.recent[-self.window_size:]
            self._update_summary(overflow)

    def _update_summary(self, old_messages: list[dict]):
        """Fold evicted messages into the running summary (LLM-call stub)."""
        # In production, use an LLM call here
        conversation = "\n".join(
            f"{m['role']}: {m['content']}" for m in old_messages
        )
        # Simplified — replace with actual LLM summarization
        self.summary += (
            f"\n[Summarized: {len(old_messages)} messages about "
            f"{old_messages[0]['content'][:50]}...]"
        )

    def get_context(self) -> list[dict]:
        """Summary (when present) as a system message, then the recent window."""
        if not self.summary:
            return list(self.recent)
        header = {
            "role": "system",
            "content": f"Previous conversation summary:\n{self.summary}",
        }
        return [header, *self.recent]
When to use: Long conversations, customer support agents, coding assistants.
The key insight: Summaries are lossy. You trade precision for cost. The last 10 messages are exact; older ones are compressed. For most agents, this is the right tradeoff.
Pattern 3: File-Based Persistent Memory
The simplest form of persistence. Write memories to disk, load them next session.
import json
from datetime import datetime, timedelta
from pathlib import Path
class FileMemory:
    """Persist agent memories as plain files under one directory.

    Daily notes append to ``YYYY-MM-DD.md``; per-entity facts merge into
    ``entities.json``. Files stay human-readable, editable, and diffable.
    """

    def __init__(self, memory_dir: str = "./memory"):
        self.dir = Path(memory_dir)
        # parents=True so nested paths like "data/agent/memory" also work.
        self.dir.mkdir(parents=True, exist_ok=True)

    def save_daily(self, content: str):
        """Append a timestamped note to today's memory file."""
        today = datetime.now().strftime("%Y-%m-%d")
        path = self.dir / f"{today}.md"
        # Explicit encoding: default is platform-dependent on some systems.
        with open(path, "a", encoding="utf-8") as f:
            timestamp = datetime.now().strftime("%H:%M")
            f.write(f"\n## {timestamp}\n{content}\n")

    def save_entity(self, entity: str, facts: dict):
        """Merge *facts* into the record for *entity* (user, project, etc).

        Existing keys are overwritten; an ``_updated`` ISO timestamp is
        stamped on every write.
        """
        path = self.dir / "entities.json"
        entities = {}
        if path.exists():
            entities = json.loads(path.read_text(encoding="utf-8"))
        entities.setdefault(entity, {}).update(facts)
        entities[entity]["_updated"] = datetime.now().isoformat()
        path.write_text(json.dumps(entities, indent=2), encoding="utf-8")

    def load_recent(self, days: int = 3) -> str:
        """Concatenate daily notes from the last *days* days, newest first.

        Requires ``timedelta`` from ``datetime`` (missing from the original
        snippet's imports — now included in the module header).
        """
        memories = []
        for i in range(days):
            date = datetime.now() - timedelta(days=i)
            path = self.dir / f"{date.strftime('%Y-%m-%d')}.md"
            if path.exists():
                memories.append(path.read_text(encoding="utf-8"))
        return "\n---\n".join(memories)

    def search(self, keyword: str) -> list[str]:
        """Case-insensitive keyword search across all .md memory files."""
        results = []
        for path in self.dir.glob("*.md"):
            content = path.read_text(encoding="utf-8")
            if keyword.lower() in content.lower():
                results.append(f"**{path.name}:**\n{content}")
        return results
When to use: Personal assistants, project-specific agents, any agent that needs to "remember" across sessions.
Why this works: Files are debuggable. You can read them, edit them, version-control them. No database setup, no dependencies. This is how many production AI assistants actually work.
Pattern 4: Structured Memory with Decay
Not all memories are equal. Recent ones matter more. Frequently-accessed ones matter more. This pattern adds relevance scoring.
from dataclasses import dataclass, field
from datetime import datetime, timedelta
import math
import json
@dataclass
class Memory:
    """One stored memory plus the metadata used to rank its relevance."""

    content: str
    created: datetime = field(default_factory=datetime.now)
    last_accessed: datetime = field(default_factory=datetime.now)
    access_count: int = 0
    importance: float = 0.5  # 0-1 scale

    def relevance_score(self) -> float:
        """Combine recency, frequency, and importance."""
        elapsed = datetime.now() - self.last_accessed
        hours_since_access = elapsed.total_seconds() / 3600
        # Exponential decay: half-life of 24 hours
        recency = math.exp(-0.693 * hours_since_access / 24)
        # Log-scale frequency boost
        frequency = math.log(self.access_count + 1) / 5
        # Weighted combination
        return 0.4 * recency + 0.2 * frequency + 0.4 * self.importance
class DecayMemory:
    """Capacity-bounded memory store that forgets its weakest items."""

    def __init__(self, max_memories: int = 1000):
        self.memories: list[Memory] = []
        self.max_memories = max_memories

    def add(self, content: str, importance: float = 0.5):
        """Store a new memory, then evict low scorers if over capacity."""
        self.memories.append(Memory(content=content, importance=importance))
        self._prune()

    def recall(self, top_k: int = 10) -> list[str]:
        """Return the most relevant memories."""
        ranked = sorted(
            self.memories, key=lambda m: m.relevance_score(), reverse=True
        )
        top = ranked[:top_k]
        # Recalling a memory reinforces it: bump recency and frequency.
        for mem in top:
            mem.last_accessed = datetime.now()
            mem.access_count += 1
        return [mem.content for mem in top]

    def _prune(self):
        """Keep only the max_memories highest-relevance memories."""
        if len(self.memories) > self.max_memories:
            self.memories = sorted(
                self.memories, key=lambda m: m.relevance_score()
            )[-self.max_memories:]
When to use: Long-running agents that accumulate thousands of memories. Agents that need to "forget" gracefully.
The math: The relevance score combines three signals — recency (exponential decay with 24h half-life), access frequency (log-scaled), and importance (set when memory is created). Tweak the weights for your use case.
Pattern 5: Vector-Backed Semantic Memory
When keyword search isn't enough. Store memories as embeddings, retrieve by meaning.
import numpy as np
from openai import OpenAI
class VectorMemory:
    """Embedding-backed memory: store text as vectors, retrieve by meaning."""

    def __init__(self):
        self.client = OpenAI()
        self.memories: list[dict] = []  # {text, embedding, metadata}

    def _embed(self, text: str) -> list[float]:
        """Embed *text* via OpenAI's small embedding model (network call)."""
        response = self.client.embeddings.create(
            model="text-embedding-3-small",
            input=text
        )
        return response.data[0].embedding

    def add(self, text: str, metadata: dict = None):
        """Embed and store one memory with optional metadata."""
        # Local import: this snippet's module header only imports numpy and
        # OpenAI, so datetime was previously an unresolved name at runtime.
        from datetime import datetime

        embedding = self._embed(text)
        self.memories.append({
            "text": text,
            "embedding": embedding,
            "metadata": metadata or {},
            "created": datetime.now().isoformat(),
        })

    def search(self, query: str, top_k: int = 5) -> list[str]:
        """Find memories most similar to the query.

        Brute-force cosine scan — fine up to ~10K memories; beyond that,
        use a vector database (see article text).
        """
        query_embedding = self._embed(query)
        scored = []
        for mem in self.memories:
            similarity = self._cosine_similarity(
                query_embedding, mem["embedding"]
            )
            scored.append((similarity, mem["text"]))
        # Sorts by similarity descending; equal scores tie-break on text.
        scored.sort(reverse=True)
        return [text for _, text in scored[:top_k]]

    @staticmethod
    def _cosine_similarity(a: list[float], b: list[float]) -> float:
        """Cosine similarity of two equal-length, nonzero vectors."""
        a, b = np.array(a), np.array(b)
        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
When to use: Knowledge bases, RAG systems, agents that need to find relevant info across thousands of unstructured memories.
Production tip: For anything beyond 10K memories, use a vector database (Chroma, Pinecone, pgvector) instead of in-memory cosine similarity. The pattern stays the same — only the storage layer changes.
Choosing the Right Pattern
| Pattern | Persistence | Token Cost | Complexity | Best For |
|---|---|---|---|---|
| Buffer | None | High (linear) | Trivial | Quick prototypes |
| Sliding Window | None | Medium (capped) | Low | Long conversations |
| File-Based | Disk | Low (load on demand) | Low | Personal assistants |
| Structured Decay | Disk/DB | Low (top-K only) | Medium | Long-running agents |
| Vector-Backed | DB | Low (top-K only) | Higher | Knowledge-heavy agents |
Start with Pattern 3 (file-based). It's simple, debuggable, and handles most use cases. Add vector search only when keyword matching fails.
The Hybrid Approach
Production agents typically combine patterns:
class HybridMemory:
    """Layered memory: sliding-window buffer + daily files + vector recall."""

    def __init__(self):
        self.buffer = SlidingWindowMemory(window_size=10)
        self.files = FileMemory("./memory")
        self.vectors = VectorMemory()

    def get_context(self, query: str = None) -> list[dict]:
        """Assemble context messages from all three memory layers."""
        # Layer 1: Recent conversation (always included)
        context = list(self.buffer.get_context())

        # Layer 2: Today's notes (file-based)
        daily = self.files.load_recent(days=1)
        if daily:
            context.append({
                "role": "system",
                "content": f"Today's notes:\n{daily}",
            })

        # Layer 3: Semantic recall (if we have a query)
        if query:
            hits = self.vectors.search(query, top_k=3)
            if hits:
                bullets = "\n".join(f"- {m}" for m in hits)
                context.append({
                    "role": "system",
                    "content": "Relevant memories:\n" + bullets,
                })
        return context
Short-term buffer for the current conversation. Files for structured daily/entity data. Vectors for semantic search across everything.
Key Takeaways
- Stateless is a bug, not a feature — agents without memory repeat mistakes and lose context
- Files beat databases for most agent memory needs — simpler, more debuggable
- Decay matters — not all memories deserve equal weight
- Hybrid wins — combine patterns for different time horizons
- Start simple — you can always add vector search later
This is part of the "AI Engineering in Practice" series — practical guides for developers building with AI. Follow for more.
Top comments (0)