Serhii Kalyna

Posted on • Originally published at kalyna.pro

How to Build a RAG Chatbot with Python

A RAG (Retrieval-Augmented Generation) chatbot answers questions based on your own documents — not just its training data. This guide builds one from scratch using Python, ChromaDB, and Claude.

What Is RAG?

RAG combines two things:

  • Retrieval: search your documents for relevant chunks
  • Generation: use an LLM to write an answer based on those chunks

Without RAG, Claude can only answer questions based on its training data. With RAG, you inject relevant context directly into the prompt.
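Before wiring in a real vector database, the retrieval half can be illustrated with a dependency-free toy: score each chunk by word overlap with the question and keep the best match. This is only a stand-in — the embeddings added below capture meaning rather than exact word matches — but it shows the shape of the step:

```python
def overlap_score(question: str, chunk: str) -> float:
    # Toy relevance score: fraction of question words that appear in the chunk.
    # Stand-in for the embedding similarity ChromaDB computes later.
    q_words = set(question.lower().split())
    c_words = set(chunk.lower().split())
    return len(q_words & c_words) / (len(q_words) or 1)


chunks = [
    "the cat sat on the mat",
    "pricing is $10 per month",
    "refunds require a receipt",
]
question = "how much is the pricing per month"
best = max(chunks, key=lambda ch: overlap_score(question, ch))
print(best)  # the pricing chunk wins on word overlap
```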

Architecture

  1. Indexing: load docs → split into chunks → embed → store in vector DB
  2. Querying: embed question → find similar chunks → send to Claude → return answer

Step 1: Install Dependencies

pip install anthropic chromadb sentence-transformers pypdf2

Step 2: Load and Chunk Documents

import PyPDF2
from pathlib import Path


def load_pdf(path: str) -> str:
    with open(path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        return "\n".join(page.extract_text() for page in reader.pages)


def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str]:
    words = text.split()
    chunks = []
    i = 0
    while i < len(words):
        chunks.append(" ".join(words[i : i + chunk_size]))
        i += chunk_size - overlap
    return chunks
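A quick sanity check on the chunker's overlap behavior (`chunk_text` is copied from above so the snippet runs standalone):

```python
def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str]:
    words = text.split()
    chunks = []
    i = 0
    while i < len(words):
        chunks.append(" ".join(words[i : i + chunk_size]))
        i += chunk_size - overlap
    return chunks


text = " ".join(f"w{n}" for n in range(1200))  # 1200 dummy words
chunks = chunk_text(text)

print(len(chunks))  # 3 chunks: words 0-499, 450-949, 900-1199
# The last 50 words of each chunk repeat as the first 50 of the next,
# so a sentence cut at a chunk boundary still appears whole in one chunk.
```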

Step 3: Build the Vector Index

import chromadb
from chromadb.utils import embedding_functions

embed_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2"
)
client = chromadb.PersistentClient(path="./chroma_db")


def build_index(documents: list[dict], collection_name: str = "docs"):
    collection = client.get_or_create_collection(
        name=collection_name,
        embedding_function=embed_fn,
    )
    texts, ids, metadatas = [], [], []
    for doc in documents:
        for j, chunk in enumerate(chunk_text(doc["text"])):
            texts.append(chunk)
            ids.append(f"{doc['source']}_chunk_{j}")
            metadatas.append({"source": doc["source"]})

    collection.add(documents=texts, ids=ids, metadatas=metadatas)
    return collection

Step 4: Query + Generate Answer

import anthropic

claude = anthropic.Anthropic()  # reads ANTHROPIC_API_KEY from the environment


def retrieve(question: str, collection, n_results: int = 5) -> list[str]:
    return collection.query(query_texts=[question], n_results=n_results)["documents"][0]


def generate_answer(question: str, chunks: list[str]) -> str:
    context = "\n\n---\n\n".join(chunks)
    response = claude.messages.create(
        model="claude-sonnet-4-5",
        max_tokens=1024,
        messages=[{
            "role": "user",
            "content": f"Answer using ONLY the context below.\n\nContext:\n{context}\n\nQuestion: {question}"
        }],
    )
    return response.content[0].text

Step 5: Full Chatbot Script

import anthropic
import chromadb
from chromadb.utils import embedding_functions
import PyPDF2
from pathlib import Path

embed_fn = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
chroma = chromadb.PersistentClient(path="./chroma_db")
claude = anthropic.Anthropic()


def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str]:
    words = text.split()
    chunks = []
    i = 0
    while i < len(words):
        chunks.append(" ".join(words[i : i + chunk_size]))
        i += chunk_size - overlap
    return chunks


def index_files(file_paths: list[str]) -> None:
    collection = chroma.get_or_create_collection("docs", embedding_function=embed_fn)
    texts, ids, metas = [], [], []
    for path in file_paths:
        p = Path(path)
        if p.suffix == ".pdf":
            with open(p, "rb") as f:
                text = "\n".join(page.extract_text() for page in PyPDF2.PdfReader(f).pages)
        else:
            text = p.read_text()
        for j, chunk in enumerate(chunk_text(text)):
            texts.append(chunk)
            ids.append(f"{p.name}_{j}")
            metas.append({"source": path})
    collection.add(documents=texts, ids=ids, metadatas=metas)
    print(f"Indexed {len(texts)} chunks")


def ask(question: str) -> str:
    collection = chroma.get_collection("docs", embedding_function=embed_fn)
    chunks = collection.query(query_texts=[question], n_results=5)["documents"][0]
    context = "\n\n---\n\n".join(chunks)
    response = claude.messages.create(
        model="claude-sonnet-4-5",
        max_tokens=1024,
        messages=[{"role": "user", "content": f"Answer using ONLY the context below.\n\nContext:\n{context}\n\nQuestion: {question}"}],
    )
    return response.content[0].text


if __name__ == "__main__":
    import sys
    if len(sys.argv) > 1:
        index_files(sys.argv[1:])
    else:
        print("RAG Chatbot ready. Type 'quit' to exit.")
        while True:
            q = input("You: ").strip()
            if q.lower() in ("quit", "exit"): break
            print("Bot:", ask(q), "\n")

Usage

# Index documents
python chatbot.py company_handbook.pdf product_docs.txt

# Start chatting
python chatbot.py

Performance Tips

  • Chunk size: 300–600 words is a good starting range
  • Retrieval depth: increase n_results for complex questions
  • Metadata filtering: pass where={'source': 'specific.pdf'} to the query to restrict search to one document
  • Reranking: re-score retrieved chunks with a cross-encoder reranker for production quality
  • Caching: cache embeddings so re-indexing unchanged documents stays fast
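The caching tip can be sketched with nothing but the standard library: derive chunk IDs from a content hash instead of a running index, then skip IDs that already exist in the collection before calling collection.add. The chunk_id helper below is a hypothetical addition, not part of the script above:

```python
import hashlib


def chunk_id(source: str, chunk: str) -> str:
    # Hypothetical helper: content-addressed chunk IDs. An unchanged chunk
    # hashes to the same ID on every run, so a re-index can skip chunks
    # whose IDs are already in the collection instead of re-embedding them.
    digest = hashlib.sha256(chunk.encode("utf-8")).hexdigest()[:16]
    return f"{source}:{digest}"


# Same content -> same ID; edited content -> new ID.
a = chunk_id("handbook.pdf", "Employees accrue 20 vacation days.")
b = chunk_id("handbook.pdf", "Employees accrue 20 vacation days.")
c = chunk_id("handbook.pdf", "Employees accrue 25 vacation days.")
```

Checking existing IDs (e.g. via collection.get) before adding lets re-indexing touch only the chunks that actually changed.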

Summary

You built a RAG chatbot that:

  • Loads PDF and text documents
  • Splits into overlapping chunks
  • Embeds and stores in ChromaDB
  • Retrieves relevant chunks per question
  • Generates grounded answers with Claude
