Give Your AI Agent Long-Term Memory with SQLite and Ollama
Every local LLM has the same problem: it forgets everything when the session ends.
You build a useful AI assistant, have a great conversation, close the terminal — and next time you open it, the model has no idea who you are or what you discussed. The context window is a goldfish bowl.
The fix is a memory layer: a persistent store that sits between your agent and the LLM, injecting relevant history into each new conversation. Here's how to build one with Python, SQLite, and Ollama — running entirely on local hardware.
What We're Building
An AI agent that:
- Remembers past conversations in SQLite
- Summarizes old sessions to compress memory efficiently
- Retrieves relevant past context before each response
- Updates memory after each exchange
- Runs 100% locally via Ollama
Prerequisites
- Python 3.10+
- Ollama installed with
`qwen2.5:14b` pulled (setup guide)
- SQLite (built into Python — no install needed)
pip install ollama
That's it. No external dependencies beyond Ollama.
Step 1: Memory Schema
SQLite gives us a fast, local, zero-config database. We'll use two tables: raw conversation history and compressed summaries.
import sqlite3
from datetime import datetime, timezone
from pathlib import Path
# Single shared on-disk database file in the user's home directory.
DB_PATH = Path.home() / ".agent_memory.db"


def init_db(db_path: Path = DB_PATH) -> sqlite3.Connection:
    """Connect to the memory database, creating the schema on first use.

    Three tables back the memory layer: raw per-turn history
    (``conversations``), compressed session summaries (``summaries``),
    and explicit key/value facts (``facts``).
    """
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row  # dict-style access to columns
    conn.executescript("""
        CREATE TABLE IF NOT EXISTS conversations (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            session_id TEXT NOT NULL,
            role TEXT NOT NULL,              -- 'user' or 'assistant'
            content TEXT NOT NULL,
            timestamp TEXT NOT NULL,
            tokens_est INTEGER DEFAULT 0
        );
        CREATE TABLE IF NOT EXISTS summaries (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            session_id TEXT NOT NULL,
            summary TEXT NOT NULL,
            turn_range TEXT NOT NULL,        -- e.g. "turns 1-20"
            created_at TEXT NOT NULL
        );
        CREATE TABLE IF NOT EXISTS facts (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            key TEXT UNIQUE NOT NULL,        -- e.g. "user_name", "preferred_language"
            value TEXT NOT NULL,
            updated_at TEXT NOT NULL
        );
        CREATE INDEX IF NOT EXISTS idx_conv_session
            ON conversations(session_id, timestamp);
    """)
    conn.commit()
    return conn
def save_turn(conn: sqlite3.Connection, session_id: str,
              role: str, content: str):
    """Persist a single conversation turn.

    Stores a timezone-aware UTC ISO-8601 timestamp and a rough token
    estimate (whitespace-separated word count) alongside the text.
    """
    conn.execute(
        "INSERT INTO conversations (session_id, role, content, timestamp, tokens_est) "
        "VALUES (?, ?, ?, ?, ?)",
        # datetime.utcnow() is deprecated since Python 3.12 and returns a
        # naive datetime; use an aware UTC timestamp instead.
        (session_id, role, content, datetime.now(timezone.utc).isoformat(),
         len(content.split()))  # Rough token estimate
    )
    conn.commit()
def get_recent_turns(conn: sqlite3.Connection, session_id: str,
                     limit: int = 20) -> list[dict]:
    """Return up to *limit* most recent turns of a session, oldest first."""
    cursor = conn.execute(
        "SELECT role, content, timestamp FROM conversations "
        "WHERE session_id = ? ORDER BY id DESC LIMIT ?",
        (session_id, limit),
    )
    # The query yields newest-first; flip back to chronological order.
    newest_first = cursor.fetchall()
    newest_first.reverse()
    return [dict(row) for row in newest_first]
Step 2: Cross-Session Memory Retrieval
The key insight: don't just replay the last session verbatim — that wastes context window. Instead, retrieve summaries from old sessions and key facts the agent has learned.
def get_memory_context(conn: sqlite3.Connection,
                       current_session: str,
                       max_summaries: int = 3) -> str:
    """
    Assemble the memory block injected ahead of each response.

    Combines up to 20 stored facts with summaries of past sessions
    (the current session is excluded). Returns the empty string when
    there is nothing to inject.
    """
    blocks: list[str] = []

    # Explicitly remembered key/value facts, most recently updated first.
    fact_rows = conn.execute(
        "SELECT key, value FROM facts ORDER BY updated_at DESC LIMIT 20"
    ).fetchall()
    if fact_rows:
        fact_lines = [f"- {row['key']}: {row['value']}" for row in fact_rows]
        blocks.append("KNOWN FACTS ABOUT THE USER:\n" + "\n".join(fact_lines))

    # Compressed summaries from other sessions only.
    summary_rows = conn.execute(
        "SELECT session_id, summary, created_at FROM summaries "
        "WHERE session_id != ? ORDER BY created_at DESC LIMIT ?",
        (current_session, max_summaries)
    ).fetchall()
    if summary_rows:
        entries = []
        for row in summary_rows:
            header = f"[Session {row['session_id'][:8]}... on {row['created_at'][:10]}]"
            entries.append(header + "\n" + row['summary'])
        blocks.append("PAST CONVERSATION SUMMARIES:\n" + "\n\n".join(entries))

    if not blocks:
        return ""
    return "=== MEMORY ===\n" + "\n\n".join(blocks) + "\n=== END MEMORY ===\n\n"
def upsert_fact(conn: sqlite3.Connection, key: str, value: str):
    """Store or update a key fact the agent has learned.

    Uses SQLite's ``ON CONFLICT ... DO UPDATE`` upsert so repeated keys
    overwrite the previous value and refresh the timestamp.
    """
    conn.execute(
        "INSERT INTO facts (key, value, updated_at) VALUES (?, ?, ?) "
        "ON CONFLICT(key) DO UPDATE SET value=excluded.value, updated_at=excluded.updated_at",
        # Aware UTC timestamp; datetime.utcnow() is deprecated since 3.12.
        (key, value, datetime.now(timezone.utc).isoformat())
    )
    conn.commit()
Step 3: Session Summarization
When a session ends (or gets too long), compress it into a summary. This keeps memory compact while preserving meaning.
import ollama
def summarize_session(conn: sqlite3.Connection, session_id: str,
model: str = "qwen2.5:14b") -> str | None:
"""
Summarize a completed session and store it.
Returns summary text or None if nothing to summarize.
"""
turns = conn.execute(
"SELECT role, content FROM conversations "
"WHERE session_id = ? ORDER BY id",
(session_id,)
).fetchall()
if len(turns) < 4:
return None # Too short to summarize
# Format conversation for summarization
convo = "\n".join(f"{r['role'].upper()}: {r['content']}" for r in turns)
prompt = f"""Summarize this conversation concisely. Focus on:
1. What the user was trying to accomplish
2. Key decisions or solutions reached
3. Any personal details mentioned (name, preferences, context)
4. Unresolved items or follow-ups needed
Keep it under 200 words.
CONVERSATION:
{convo}
SUMMARY:"""
response = ollama.generate(model=model, prompt=prompt,
options={"temperature": 0.2})
summary = response['response'].strip()
turn_count = len(turns)
conn.execute(
"INSERT INTO summaries (session_id, summary, turn_range, created_at) "
"VALUES (?, ?, ?, ?)",
(session_id, summary, f"turns 1-{turn_count}",
datetime.utcnow().isoformat())
)
conn.commit()
print(f"[memory] Session {session_id[:8]} summarized ({turn_count} turns → {len(summary)} chars)")
return summary
Step 4: The Agent
Now wire it all together into a stateful agent:
import uuid
# Base system prompt; MemoryAgent.chat appends the injected memory block to it
# on every request.
SYSTEM_PROMPT = """You are a helpful, persistent AI assistant with memory across sessions.
You have access to summaries of past conversations and key facts about the user.
Use this context naturally — don't announce that you're using memory, just apply it.
If the user tells you something important about themselves, remember it.
Be concise, direct, and useful."""
class MemoryAgent:
    """Stateful chat agent backed by SQLite long-term memory.

    Each instance serves one session. Before every reply it injects key
    facts and summaries of *other* sessions into the system prompt, and it
    periodically compresses the current session into a summary.
    """

    def __init__(self, session_id: str | None = None, model: str = "qwen2.5:14b",
                 db_path: Path = DB_PATH):
        """Start a new session, or resume one if *session_id* is given.

        Note: ``session_id`` was previously annotated ``str`` despite its
        ``None`` default; ``str | None`` is the correct (PEP 484) type.
        """
        self.model = model
        self.session_id = session_id or str(uuid.uuid4())
        self.conn = init_db(db_path)
        # Counts turns handled by *this* process only, so a resumed session
        # restarts the auto-summarize cadence.
        self.turn_count = 0
        print(f"[agent] Session: {self.session_id[:8]}...")

    def chat(self, user_message: str) -> str:
        """Process a user message and return the assistant's response.

        Persists both sides of the exchange and auto-summarizes every 30
        turns to keep the context window manageable.
        """
        # Save the user turn first so it is never lost if generation fails.
        save_turn(self.conn, self.session_id, "user", user_message)
        # Build context: cross-session memory + recent history of this session.
        memory_context = get_memory_context(self.conn, self.session_id)
        recent = get_recent_turns(self.conn, self.session_id, limit=10)
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT + "\n\n" + memory_context}
        ]
        for turn in recent[:-1]:  # Exclude the turn we just saved
            messages.append({"role": turn["role"], "content": turn["content"]})
        messages.append({"role": "user", "content": user_message})
        response = ollama.chat(model=self.model, messages=messages,
                               options={"temperature": 0.7})
        assistant_reply = response['message']['content'].strip()
        save_turn(self.conn, self.session_id, "assistant", assistant_reply)
        self.turn_count += 1
        # Auto-summarize every 30 turns to keep context manageable.
        if self.turn_count % 30 == 0:
            summarize_session(self.conn, self.session_id, model=self.model)
        return assistant_reply

    def remember(self, key: str, value: str):
        """Explicitly store a fact for future sessions."""
        upsert_fact(self.conn, key, value)
        print(f"[memory] Stored: {key} = {value}")

    def end_session(self):
        """Summarize the session into long-term memory and close the DB."""
        summarize_session(self.conn, self.session_id, model=self.model)
        self.conn.close()
        print(f"[agent] Session {self.session_id[:8]} saved to memory.")
# --- CLI Interface ---
if __name__ == "__main__":
    import sys

    # An optional first argument resumes an existing session by its ID.
    resume_id = sys.argv[1] if len(sys.argv) > 1 else None
    agent = MemoryAgent(session_id=resume_id)
    print("Memory agent ready. Type 'quit' to end, 'remember KEY=VALUE' to store facts.\n")
    try:
        while True:
            user_input = input("You: ").strip()
            if not user_input:
                continue
            lowered = user_input.lower()
            if lowered in ('quit', 'exit', 'q'):
                break
            # "remember KEY=VALUE" stores a fact instead of chatting.
            if lowered.startswith("remember ") and "=" in user_input:
                _, kv = user_input.split(" ", 1)
                k, v = kv.split("=", 1)
                agent.remember(k.strip(), v.strip())
                continue
            print(f"\nAgent: {agent.chat(user_input)}\n")
    except KeyboardInterrupt:
        pass
    finally:
        # Always summarize and close the DB, even on Ctrl-C.
        agent.end_session()
Usage
# Start a new session
python memory_agent.py
# Resume a specific session (pass the session ID)
python memory_agent.py 3f7a1b2c
# Example interaction:
# You: My name is Alex and I'm building a Rust CLI tool
# Agent: Nice to meet you, Alex! What are you working on with the Rust CLI?
# You: remember user_name=Alex
# [memory] Stored: user_name = Alex
# You: quit
# [memory] Session 3f7a1b2c summarized (6 turns → 142 chars)
# [agent] Session 3f7a1b2c saved to memory.
# Next session — agent remembers Alex
# python memory_agent.py
# You: Hey, what were we working on?
# Agent: Welcome back, Alex! Last time you were building a Rust CLI tool...
Memory Architecture Summary
Session Start
↓
Load facts + past summaries → inject into system prompt
↓
User Message → save to SQLite → build messages → Ollama → response
↓
Save response to SQLite
↓
Every 30 turns: auto-summarize → compress into summaries table
↓
Session End: final summarize → close DB
Extending This
A few natural next steps:
-
Semantic retrieval: instead of fetching summaries by recency, embed them with
`nomic-embed-text` and retrieve by relevance (see my RAG guide) -
Fact extraction: have the LLM automatically extract facts from conversation and call
`upsert_fact()` without user prompting -
Multiple agents: each agent gets its own
`session_id` namespace — they don't share memory unless you want them to - Web UI: wrap the agent in a FastAPI endpoint + simple HTML chat interface
The SQLite file lives at ~/.agent_memory.db — back it up and your agent's memory travels with it.
This is part of a local AI stack series:
Top comments (0)