What is RAG and Why It Matters Now
RAG (Retrieval-Augmented Generation) is an architectural pattern that compensates for the knowledge limitations of LLMs by grounding their responses in external documents. When building production systems with Claude Code, it's an essential technique for leveraging internal documentation, codebases, and real-time data.
The basic flow:
- Split documents into chunks
- Convert each chunk into vector embeddings
- Store in a vector database
- At query time, retrieve similar chunks
- Pass retrieved chunks as context to the LLM
Document Chunking Strategy
from langchain.text_splitter import RecursiveCharacterTextSplitter
def create_chunks(
    text: str,
    chunk_size: int = 512,
    chunk_overlap: int = 64,
) -> list[str]:
    """Split *text* into overlapping chunks suitable for embedding.

    Args:
        text: Raw document text.
        chunk_size: Maximum characters per chunk (default 512, as before).
        chunk_overlap: Characters shared between adjacent chunks so that
            context spanning a chunk boundary is not lost (default 64).

    Returns:
        List of chunk strings.
    """
    # Separators are tried in order: paragraph, line, sentence, word,
    # and finally character-level as a last resort.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", " ", ""],
    )
    return splitter.split_text(text)
For code files, splitting by function/class boundaries gives more semantically accurate results:
import ast
def split_python_file(source: str) -> list[dict]:
    """Split Python source into one chunk per top-level function or class.

    Fixes vs. the naive version:
    - Iterates ``tree.body`` instead of ``ast.walk``, which emitted every
      method twice (once inside its class chunk, once standalone).
    - Recognizes ``async def`` (``ast.AsyncFunctionDef``).
    - Includes decorator lines in the chunk.
    - Splits the source into lines once, not once per node.

    Args:
        source: Python source code; must parse (``ast.parse`` raises
            ``SyntaxError`` otherwise).

    Returns:
        List of dicts with keys ``content`` (the chunk text), ``type``
        (AST node class name), and ``name`` (the def/class identifier).
    """
    tree = ast.parse(source)
    lines = source.split("\n")  # hoisted: split once, not per node
    chunks: list[dict] = []
    for node in tree.body:
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
            # Start at the first decorator (if any) so chunks are complete.
            if node.decorator_list:
                start = node.decorator_list[0].lineno - 1
            else:
                start = node.lineno - 1
            end = node.end_lineno
            chunks.append({
                "content": "\n".join(lines[start:end]),
                "type": type(node).__name__,
                "name": node.name,
            })
    return chunks
Embedding Model Selection
from openai import AsyncOpenAI
client = AsyncOpenAI()
async def embed_texts(texts: list[str]) -> list[list[float]]:
    """Embed *texts* via OpenAI's ``text-embedding-3-small`` model.

    Returns one embedding vector per input string, in input order.
    """
    result = await client.embeddings.create(
        model="text-embedding-3-small",
        input=texts,
        encoding_format="float",
    )
    # API preserves input order; unpack vectors in the same order.
    return [record.embedding for record in result.data]
For zero-cost local embedding:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("intfloat/multilingual-e5-small")
def embed_local(texts: list[str], prefix: str = "query"):
    """Embed *texts* locally with the multilingual-e5-small model.

    E5 models require an instruction prefix on every input: ``"query"``
    for search queries and ``"passage"`` for documents being indexed.
    The previous version hard-coded ``"query"`` for everything; pass
    ``prefix="passage"`` when embedding corpus documents. The default
    keeps the old behavior.

    Args:
        texts: Strings to embed.
        prefix: E5 instruction prefix, ``"query"`` or ``"passage"``.

    Returns:
        Embedding matrix from ``model.encode`` (one row per input);
        normalized, so dot product equals cosine similarity.
    """
    prefixed = [f"{prefix}: {t}" for t in texts]
    return model.encode(prefixed, normalize_embeddings=True)
Hybrid Search with pgvector
async def hybrid_search(conn, query_text, query_embedding, limit=5, semantic_weight=0.7):
    """Hybrid semantic + keyword search over the ``documents`` table.

    Fixes vs. the naive version:
    - The keyword CTE had ``LIMIT 20`` with no ``ORDER BY``, so it kept
      20 *arbitrary* matching rows; it now keeps the 20 best-ranked.
    - ``LEFT JOIN`` silently dropped documents found only by keyword
      search; ``FULL OUTER JOIN`` keeps hits from either side.

    NOTE(review): ts_rank and cosine similarity are on different scales,
    so the weighted sum is a rough heuristic — consider normalizing both
    scores (e.g. reciprocal rank fusion) before weighting.

    Args:
        conn: asyncpg-style connection exposing ``await conn.fetch(...)``.
        query_text: Plain-text query for full-text search.
        query_embedding: Query vector (pgvector-compatible).
        limit: Number of results to return.
        semantic_weight: Weight of the semantic score in [0, 1]; the
            keyword score gets ``1 - semantic_weight``.

    Returns:
        List of row dicts ordered by combined score (descending).
    """
    rows = await conn.fetch("""
        WITH semantic AS (
            SELECT id, content, metadata,
                   1 - (embedding <=> $1::vector) AS sem_score
            FROM documents
            ORDER BY embedding <=> $1::vector
            LIMIT 20
        ),
        keyword AS (
            SELECT id, content, metadata,
                   ts_rank(to_tsvector('english', content),
                           plainto_tsquery('english', $2)) AS kw_score
            FROM documents
            WHERE to_tsvector('english', content) @@ plainto_tsquery('english', $2)
            ORDER BY kw_score DESC
            LIMIT 20
        )
        SELECT id,
               COALESCE(s.content, k.content) AS content,
               COALESCE(s.metadata, k.metadata) AS metadata,
               ($3 * COALESCE(s.sem_score, 0) + (1 - $3) * COALESCE(k.kw_score, 0)) AS score
        FROM semantic s
        FULL OUTER JOIN keyword k USING (id)
        ORDER BY score DESC
        LIMIT $4
    """, query_embedding, query_text, semantic_weight, limit)
    return [dict(row) for row in rows]
Context Injection into LLM
import anthropic
async def rag_query(query: str, retrieved_docs: list[dict]) -> str:
    """Answer *query* with Claude, grounded in the retrieved documents.

    Each document is labeled with its 1-based index and its source (from
    ``metadata["source"]``, defaulting to "unknown") so the model can
    refer back to where an answer came from.
    """
    client = anthropic.AsyncAnthropic()
    labeled = [
        f"[Document {idx}] (source: {doc.get('metadata', {}).get('source', 'unknown')})\n{doc['content']}"
        for idx, doc in enumerate(retrieved_docs, 1)
    ]
    context = "\n\n---\n\n".join(labeled)
    prompt = f"""Answer based on the reference documents below.
<context>
{context}
</context>
<question>{query}</question>
If information is not in the documents, please state so clearly."""
    response = await client.messages.create(
        model="claude-opus-4-5",
        max_tokens=2048,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.content[0].text
Evaluating RAG Quality
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall
from datasets import Dataset
def evaluate_rag(test_cases: list[dict]) -> dict:
    """Run the standard ragas metric suite over *test_cases*.

    Args:
        test_cases: Records in the column format ragas expects
            (question/answer/contexts/ground truth).

    Returns:
        The ragas evaluation result with one score per metric.
    """
    metric_suite = [
        faithfulness,
        answer_relevancy,
        context_precision,
        context_recall,
    ]
    return evaluate(Dataset.from_list(test_cases), metrics=metric_suite)
RAG systems require continuous monitoring of retrieval logs, score distributions, and user feedback to keep improving chunk strategy and model selection.
This article is from the Claude Code Complete Guide (7 chapters) on note.com.
myouga (@myougatheaxo) - VTuber axolotl. Sharing practical AI development tips.
Top comments (0)