This article contains affiliate links. I may earn a commission at no extra cost to you.
title: "Build a RAG System from Scratch: Create an AI That Answers Questions About Your Codebase"
published: true
description: "Learn to build a Retrieval-Augmented Generation system that can answer questions about your code using vector embeddings and OpenAI's API"
tags: ai, rag, tutorial, python, openai
cover_image:
Build a RAG System from Scratch: Create an AI That Answers Questions About Your Codebase
Ever wished you had an AI assistant that actually understands your codebase? One that could answer questions like "How does the authentication system work?" or "Where is the database connection configured?" without hallucinating answers?
That's exactly what we're building today. We'll create a Retrieval-Augmented Generation (RAG) system that ingests your code and documentation, then provides accurate, contextual answers based on your actual files.
What is RAG and Why Should You Care?
RAG combines the power of large language models with your own data. Instead of relying solely on the model's training data, RAG retrieves relevant information from your documents first, then uses that context to generate answers. This dramatically reduces hallucinations and ensures responses are grounded in your actual codebase.
The architecture is straightforward:
- Ingest: Convert your code/docs into searchable embeddings
- Store: Save embeddings in a vector database
- Retrieve: Find relevant snippets based on user queries
- Generate: Use retrieved context to create informed answers
Setting Up Your Environment
First, let's install the required dependencies:
pip install openai chromadb langchain tiktoken flask python-dotenv
Create a .env file for your API keys:
OPENAI_API_KEY=your_openai_api_key_here
Building the Document Ingestion Pipeline
Our ingestion pipeline needs to read code files, split them into manageable chunks, and create embeddings. Here's the core implementation:
import os
import tiktoken
from pathlib import Path
from typing import List, Dict
from openai import OpenAI
import chromadb
from dotenv import load_dotenv
load_dotenv()
class CodebaseIngester:
    """Read a codebase from disk, chunk it, embed it, and store it in ChromaDB.

    Pipeline: walk a directory, read each text file, split it into
    token-bounded chunks, embed every chunk with OpenAI, then batch-insert
    everything into a single "codebase" collection.
    """

    def __init__(self, openai_api_key: str):
        self.client = OpenAI(api_key=openai_api_key)
        self.chroma_client = chromadb.Client()
        # get_or_create avoids a crash when this runs twice against the
        # same (e.g. persistent) Chroma store and the collection exists.
        self.collection = self.chroma_client.get_or_create_collection(
            name="codebase",
            metadata={"hnsw:space": "cosine"}
        )
        # cl100k_base is the tokenizer matching text-embedding-ada-002.
        self.encoding = tiktoken.get_encoding("cl100k_base")

    def read_file(self, file_path: Path) -> str:
        """Return the file's text, or "" if it is binary or unreadable."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        except (UnicodeDecodeError, OSError):
            # Skip binary files and files we cannot open (permissions etc.)
            return ""

    def chunk_text(self, text: str, max_tokens: int = 500) -> List[str]:
        """Split text into chunks of at most max_tokens tokens each."""
        tokens = self.encoding.encode(text)
        return [
            self.encoding.decode(tokens[i:i + max_tokens])
            for i in range(0, len(tokens), max_tokens)
        ]

    def create_embedding(self, text: str) -> List[float]:
        """Generate an embedding vector for text using OpenAI's API."""
        response = self.client.embeddings.create(
            model="text-embedding-ada-002",
            input=text
        )
        return response.data[0].embedding

    def ingest_directory(self, directory_path: str, file_extensions: List[str] = None):
        """Ingest all files under directory_path whose suffix matches.

        Each qualifying chunk is embedded and added to the collection with
        metadata recording its file path and chunk index.
        """
        if file_extensions is None:
            file_extensions = ['.py', '.js', '.ts', '.md', '.txt', '.yml', '.yaml']
        directory = Path(directory_path)
        documents = []
        metadatas = []
        embeddings = []
        ids = []
        for file_path in directory.rglob('*'):
            if file_path.suffix in file_extensions and file_path.is_file():
                content = self.read_file(file_path)
                if not content.strip():
                    continue
                chunks = self.chunk_text(content)
                for i, chunk in enumerate(chunks):
                    if len(chunk.strip()) < 50:  # Skip very small chunks
                        continue
                    # BUG FIX: build the id from the full path, not just the
                    # file name, so same-named files in different directories
                    # don't produce duplicate ChromaDB ids (add() rejects them).
                    doc_id = f"{file_path}_{i}"
                    embedding = self.create_embedding(chunk)
                    documents.append(chunk)
                    metadatas.append({
                        "file_path": str(file_path),
                        "file_name": file_path.name,
                        "chunk_index": i
                    })
                    embeddings.append(embedding)
                    ids.append(doc_id)
                    print(f"Processed: {file_path.name} (chunk {i})")
        # Batch insert into ChromaDB; an empty add() raises, so guard it.
        if documents:
            self.collection.add(
                documents=documents,
                metadatas=metadatas,
                embeddings=embeddings,
                ids=ids
            )
        print(f"Ingested {len(documents)} chunks from {directory_path}")
Implementing the Retrieval Logic
Now let's build the retrieval component that finds relevant code snippets:
class CodebaseRetriever:
    """Semantic search over an already-ingested ChromaDB collection."""

    def __init__(self, openai_api_key: str, collection_name: str = "codebase"):
        self.client = OpenAI(api_key=openai_api_key)
        self.chroma_client = chromadb.Client()
        self.collection = self.chroma_client.get_collection(name=collection_name)

    def create_query_embedding(self, query: str) -> List[float]:
        """Embed the user's query with the same model used at ingest time."""
        embedding_response = self.client.embeddings.create(
            model="text-embedding-ada-002",
            input=query,
        )
        return embedding_response.data[0].embedding

    def retrieve_relevant_chunks(self, query: str, n_results: int = 5) -> Dict:
        """Return the n_results chunks nearest to the query.

        The result maps "documents", "metadatas" and "distances" to flat
        lists (one entry per retrieved chunk).
        """
        query_vector = self.create_query_embedding(query)
        raw_results = self.collection.query(
            query_embeddings=[query_vector],
            n_results=n_results,
            include=["documents", "metadatas", "distances"],
        )
        # We queried a single embedding, so unwrap the outer batch list.
        return {
            field: raw_results[field][0]
            for field in ("documents", "metadatas", "distances")
        }
Creating the Answer Generation Component
Now we'll integrate with OpenAI's API to generate contextual answers:
class RAGSystem:
    """Retrieve relevant code chunks and generate a grounded answer."""

    def __init__(self, openai_api_key: str):
        self.client = OpenAI(api_key=openai_api_key)
        self.retriever = CodebaseRetriever(openai_api_key)

    def _build_context(self, retrieved: Dict, max_context_length: int) -> tuple:
        """Concatenate retrieved chunks up to max_context_length characters.

        Returns (context_string, metadatas_of_chunks_actually_included).
        """
        context_parts = []
        total_length = 0
        for doc, metadata in zip(retrieved["documents"], retrieved["metadatas"]):
            if total_length + len(doc) > max_context_length:
                break
            context_parts.append(f"File: {metadata['file_name']}\n{doc}\n---")
            total_length += len(doc)
        used_metas = retrieved["metadatas"][:len(context_parts)]
        return "\n".join(context_parts), used_metas

    def generate_answer(self, query: str, max_context_length: int = 3000) -> Dict:
        """Generate an answer for query using retrieved codebase context.

        Returns a dict with the answer text, the source file names that
        actually contributed context, and the number of chunks used.
        """
        # Retrieve relevant chunks
        retrieved = self.retriever.retrieve_relevant_chunks(query, n_results=5)
        context, used_metas = self._build_context(retrieved, max_context_length)
        # Create prompt
        prompt = f"""You are a helpful assistant that answers questions about a codebase.
Use the following code snippets and documentation to answer the user's question.
If you cannot find relevant information in the provided context, say so.
Context:
{context}
Question: {query}
Answer:"""
        # Generate response; low temperature keeps answers close to the context.
        response = self.client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "user", "content": prompt}
            ],
            temperature=0.1,
            max_tokens=500
        )
        # BUG FIX: report only sources whose chunks made it into the context
        # (the original listed all retrieved files, even truncated ones),
        # and deduplicate while preserving order.
        sources = list(dict.fromkeys(meta["file_name"] for meta in used_metas))
        return {
            "answer": response.choices[0].message.content,
            "sources": sources,
            "context_used": len(used_metas)
        }
Building a Simple Web Interface
Let's create a Flask web interface to interact with our RAG system:
from flask import Flask, render_template, request, jsonify

app = Flask(__name__)
# One shared RAG system for the whole app, built once at import time.
rag_system = RAGSystem(os.getenv("OPENAI_API_KEY"))


@app.route('/')
def index():
    """Serve the single-page chat UI."""
    return render_template('index.html')


@app.route('/ask', methods=['POST'])
def ask_question():
    """Answer a question posted as JSON: {"query": "..."}.

    Responds with the RAG result, or a JSON error with status 400/500.
    """
    # BUG FIX: get_json() raises a framework 400/415 on a missing or
    # non-JSON body; silent=True yields None instead, so every failure
    # mode returns this endpoint's consistent JSON error shape.
    data = request.get_json(silent=True) or {}
    query = data.get('query', '')
    if not query:
        return jsonify({'error': 'No query provided'}), 400
    try:
        result = rag_system.generate_answer(query)
        return jsonify(result)
    except Exception as e:
        # NOTE(review): str(e) can leak internals to the client; acceptable
        # for a local demo, replace with a generic message before deploying.
        return jsonify({'error': str(e)}), 500


if __name__ == '__main__':
    app.run(debug=True)
Create templates/index.html:
<!DOCTYPE html>
<html>
<head>
<title>Codebase AI Assistant</title>
<style>
body { font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; }
.chat-container { border: 1px solid #ddd; height: 400px; overflow-y: auto; padding: 10px; margin: 20px 0; }
.query-input { width: 70%; padding: 10px; }
.ask-button { padding: 10px 20px; background: #007cba; color: white; border: none; cursor: pointer; }
.message { margin: 10px 0; padding: 10px; border-radius: 5px; }
.user-message { background: #e3f2fd; }
.ai-message { background: #f5f5f5; }
.sources { font-size: 0.8em; color: #666; margin-top: 5px; }
</style>
</head>
<body>
<h1>Codebase AI Assistant</h1>
<div class="chat-container" id="chat"></div>
<div>
<input type="text" id="queryInput" class="query-input" placeholder="Ask about your codebase...">
<button onclick="askQuestion()" class="ask-button">Ask</button>
</div>
<script>
function askQuestion() {
    const query = document.getElementById('queryInput').value;
    if (!query) return;
    addMessage(query, 'user');
    document.getElementById('queryInput').value = '';
    fetch('/ask', {
        method: 'POST',
        headers: {'Content-Type': 'application/json'},
        body: JSON.stringify({query: query})
    })
    .then(response => response.json())
    .then(data => {
        if (data.error) {
            addMessage('Error: ' + data.error, 'ai');
        } else {
            addMessage(data.answer, 'ai', data.sources);
        }
    });
}

// Render a chat bubble. SECURITY FIX: the original assigned user input and
// model output via innerHTML, letting any markup in a query or answer
// execute in the page (XSS). textContent renders it inert, and the
// sources line is built as a DOM node instead of an HTML string.
function addMessage(text, sender, sources) {
    const chat = document.getElementById('chat');
    const message = document.createElement('div');
    message.className = `message ${sender}-message`;
    message.textContent = text;
    if (sources && sources.length > 0) {
        const sourcesDiv = document.createElement('div');
        sourcesDiv.className = 'sources';
        sourcesDiv.textContent = 'Sources: ' + sources.join(', ');
        message.appendChild(sourcesDiv);
    }
    chat.appendChild(message);
    chat.scrollTop = chat.scrollHeight;
}

document.getElementById('queryInput').addEventListener('keypress', function(e) {
    if (e.key === 'Enter') askQuestion();
});
</script>
</body>
</html>
Putting It All Together
Here's how to use your RAG system:
python
# main.py
import os
from dotenv import load_dotenv
load_dotenv()
def main():
api_key = os.getenv("OPENAI_API_KEY")
# Step 1: Ingest yo
---
**Tools mentioned:**
- [Amazon](https://www.amazon.com/?tag=practicalai06-20)
Top comments (0)