RAG 시스템 실전 구축 (v5)
Practical Guide for ML Engineers & Backend Developers
Product: $5 RAG Pipeline Starter Kit with 500+ Code Examples
1. RAG Fundamentals: The Retrieval-Augmentation-Generation Loop
Retrieval-Augmented Generation (RAG) systems work in three phases:
# Basic RAG workflow
def rag_pipeline(query, vector_db, llm, top_k=5):
    """Answer ``query`` with one retrieve -> augment -> generate pass.

    Args:
        query: The user's question; used for retrieval and for the prompt.
        vector_db: Object exposing ``search(query, top_k=...)``.
        llm: Object exposing ``generate(prompt)``.
        top_k: Number of documents to retrieve. Defaults to 5, the value
            that used to be hard-coded.

    Returns:
        Whatever ``llm.generate`` returns for the augmented prompt.
    """
    # 1. RETRIEVAL: find relevant documents
    relevant_docs = vector_db.search(query, top_k=top_k)
    # 2. AUGMENTATION: combine the retrieved context with the query
    # NOTE(review): format_prompt is not defined in the visible source;
    # it must be provided by the surrounding module.
    augmented_prompt = format_prompt(query, relevant_docs)
    # 3. GENERATION: produce the response
    return llm.generate(augmented_prompt)
Key components:
- Retriever: Finds relevant documents (vector similarity search)
- Augmenter: Formats context into prompt
- Generator: LLM produces final answer
2. Chunking Strategies: Breaking Down Documents Efficiently
Semantic Chunking (Best for code understanding):
import tiktoken
from sentence_transformers import SentenceTransformer
def _count_tokens(text, model):
    """Return the number of tokens in ``text`` according to ``model``.

    Tries, in order: ``model.tokenizer.tokenize(text)``, then
    ``model.encode(text)``, then a whitespace split when ``model`` offers
    neither (for example ``None``).
    """
    tokenizer = getattr(model, "tokenizer", None)
    if tokenizer is not None and hasattr(tokenizer, "tokenize"):
        return len(tokenizer.tokenize(text))
    if hasattr(model, "encode"):
        # NOTE(review): assumes a tiktoken-style ``encode`` that returns a
        # token sequence; an embedding model without a ``tokenizer``
        # attribute would give a misleading length here - confirm.
        return len(model.encode(text))
    return len(text.split())


def semantic_chunking(text, model, max_tokens=512):
    """Pack consecutive sentences of ``text`` into token-bounded chunks.

    The text is split on ``'. '`` and sentences are accumulated greedily
    until adding the next one would push the chunk past ``max_tokens``.

    Args:
        text: The document to split.
        model: Tokenizer source used for counting (see ``_count_tokens``).
        max_tokens: Upper bound on tokens per chunk. A single sentence that
            is longer than the bound is kept whole as its own chunk.

    Returns:
        A list of non-empty chunk strings; ``[]`` for empty or blank text.
    """
    pieces = text.split('. ')
    last_index = len(pieces) - 1

    sentences = []
    for index, piece in enumerate(pieces):
        sentence = piece.strip()
        if not sentence:
            continue
        # split() consumed the ". " separators, so every sentence except
        # the final one gets its period back. The final one keeps whatever
        # punctuation it already had (no doubled or invented period).
        if index != last_index:
            sentence += '.'
        sentences.append(sentence)

    chunks = []
    current = []
    for sentence in sentences:
        candidate = ' '.join(current + [sentence])
        # Only flush when there is something to flush; otherwise an
        # oversized first sentence would produce an empty chunk.
        if current and _count_tokens(candidate, model) > max_tokens:
            chunks.append(' '.join(current))
            current = [sentence]
        else:
            current.append(sentence)

    if current:
        chunks.append(' '.join(current))
    return chunks
Recursive Chunking (For structured data):
def recursive_chunking(text, chunk_size=1000):
    """Split ``text`` by progressively finer delimiters.

    Delimiters are tried in order: blank line, newline, sentence break,
    space. Only chunks longer than ``chunk_size`` are split further. If a
    chunk is still too long after splitting on spaces, it is cut into
    fixed-width slices of ``chunk_size`` characters.

    Args:
        text: The text to split.
        chunk_size: Maximum chunk length in characters.

    Returns:
        The resulting chunks, excluding fragments of 10 characters or
        fewer (those are discarded, as in the original implementation).

    Raises:
        ValueError: If fixed-width slicing is needed but ``chunk_size`` is
            less than 1.
    """
    delimiters = ['\n\n', '\n', '. ', ' ', '']
    chunks = [text]
    for delim in delimiters:
        if all(len(chunk) <= chunk_size for chunk in chunks):
            break
        new_chunks = []
        for chunk in chunks:
            if len(chunk) <= chunk_size:
                new_chunks.append(chunk)
            elif delim:
                new_chunks.extend(chunk.split(delim))
            else:
                # str.split('') raises "empty separator", so the last
                # resort is a hard cut every chunk_size characters.
                if chunk_size < 1:
                    raise ValueError("chunk_size must be at least 1")
                new_chunks.extend(
                    chunk[start:start + chunk_size]
                    for start in range(0, len(chunk), chunk_size)
                )
        chunks = new_chunks
    return [chunk for chunk in chunks if len(chunk) > 10]
3. Embedding Model Selection and Comparison
Top 5 Models for Code RAG:
# Model comparison benchmark
# Per-model traits keyed by model name: embedding width ("dimensions"),
# approximate size in MB, and qualitative speed / code-sensitivity labels.
# NOTE(review): the figures are the guide's own and are not verified here.
models = {
    "all-MiniLM-L6-v2": dict(
        dimensions=384,
        size_mb=100,
        speed="fast",
        code_sensitivity="high",
    ),
    "Sentence-BERT": dict(
        dimensions=768,
        size_mb=400,
        speed="medium",
        code_sensitivity="very_high",
    ),
    "CodeBERT": dict(
        dimensions=768,
        size_mb=1000,
        speed="medium",
        code_sensitivity="excellent",
    ),
    "MiniLM": dict(
        dimensions=384,
        size_mb=50,
        speed="fast",
        code_sensitivity="medium",
    ),
    "MPNet": dict(
        dimensions=768,
        size_mb=400,
        speed="slow",
        code_sensitivity="high",
    ),
}
# Implementation example
class EmbeddingService:
    """Small facade over a ``SentenceTransformer`` model."""

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        """Load the sentence-transformer identified by ``model_name``."""
        self.model = SentenceTransformer(model_name)

    def encode(self, texts):
        """Return the model's embeddings for ``texts``."""
        vectors = self.model.encode(texts)
        return vectors

    def get_dimensions(self):
        """Return the embedding dimension reported by the model."""
        dimension = self.model.get_sentence_embedding_dimension()
        return dimension
# Benchmark function
def benchmark_embeddings():
    """Time how long the default ``EmbeddingService`` takes on 1000 texts.

    Prints the elapsed time and returns the embeddings produced.
    """
    import time

    service = EmbeddingService()
    texts = ["Sample code snippet"] * 1000

    # perf_counter is a monotonic, high-resolution clock; time.time() can
    # jump when the system clock is adjusted and would skew the measurement.
    start = time.perf_counter()
    embeddings = service.encode(texts)
    elapsed = time.perf_counter() - start

    # Report the real workload size so the message cannot drift from it.
    print(f"Time for {len(texts)} embeddings: {elapsed:.2f}s")
    return embeddings
4. Vector Database Comparison
| Database | Speed | Memory | Cost | Best For |
|---|---|---|---|---|
| Chroma | Fast | Low | Free | Development |
| Qdrant | Fast | Medium | Free | Production |
| pgvector | Medium | High | Free | PostgreSQL users |
| Milvus | Fast | High | Free | Large-scale |
# Chroma implementation
import chromadb
class ChromaVectorStore:
    """Minimal document store backed by a Chroma collection."""

    def __init__(self, collection_name="code_docs"):
        """Create a Chroma client and open (or create) ``collection_name``."""
        self.client = chromadb.Client()
        self.collection = self.client.get_or_create_collection(collection_name)

    def add_documents(self, documents, embeddings):
        """Add ``documents`` with their precomputed ``embeddings``.

        Ids are sequential integers rendered as strings. Numbering continues
        from the collection's current size, so a second call no longer reuses
        the ids ("0", "1", ...) handed out by the first one.

        Args:
            documents: Texts to store.
            embeddings: One embedding per document, in the same order.
        """
        if not documents:
            # Nothing to add; avoid sending empty lists to the collection.
            return
        # NOTE(review): continuing from count() assumes entries are never
        # deleted from the collection; after deletions ids could repeat.
        first_id = self.collection.count()
        self.collection.add(
            documents=documents,
            embeddings=embeddings,
            ids=[str(first_id + offset) for offset in range(len(documents))],
        )

    def search(self, query, top_k=5):
        """Return the stored texts of the ``top_k`` best matches for ``query``.

        NOTE(review): the query is passed as text (``query_texts``), so the
        collection presumably embeds it itself; that may not match the
        model that produced the vectors given to ``add_documents`` - confirm.
        """
        result = self.collection.query(
            query_texts=[query],
            n_results=top_k,
        )
        return result['documents'][0]
# Qdrant implementation
from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition
class QdrantVectorStore:
    """Read-only search wrapper around a Qdrant collection."""

    def __init__(self, host="localhost", port=6333, collection_name="code_docs"):
        """Connect to Qdrant at ``host``:``port`` and remember the collection name."""
        self.collection_name = collection_name
        self.client = QdrantClient(host=host, port=port)

    def search(self, query_vector, top_k=5):
        """Return the ``'text'`` payload field of the ``top_k`` nearest hits.

        Args:
            query_vector: Embedding of the query.
            top_k: Maximum number of hits to request.

        Returns:
            A list with ``hit.payload['text']`` for every hit returned.
        """
        hits = self.client.search(
            collection_name=self.collection_name,
            query_vector=query_vector,
            limit=top_k,
        )
        texts = []
        for hit in hits:
            texts.append(hit.payload['text'])
        return texts
5. Full RAG Pipeline from Scratch
import os
import json
from pathlib import Path
from typing import List, Dict
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb import Client
class CodeRAGPipeline:
    """End-to-end RAG pipeline over source files, stored in Chroma.

    Steps: split a file into chunks (``preprocess_code_files``), embed and
    persist them (``embed_and_store``), retrieve by similarity (``query``)
    and build an answer (``generate_response``).
    """

    def __init__(self, embedding_model="all-MiniLM-L6-v2", db_path="./chroma_db"):
        """Load the embedding model and open the persistent collection.

        Args:
            embedding_model: Name passed to ``SentenceTransformer``.
            db_path: Directory used by ``chromadb.PersistentClient``.
        """
        self.embedding_model = SentenceTransformer(embedding_model)
        self.client = chromadb.PersistentClient(path=db_path)
        self.collection = self.client.get_or_create_collection("code_documents")
        # Not read by any method in this class as shown.
        self.chunk_size = 1000

    def preprocess_code_files(self, file_path: str) -> List[str]:
        """Split the file at ``file_path`` into chunks at def/class lines.

        A new chunk starts at every line whose stripped text begins with
        ``'def '`` or ``'class '`` (nested definitions included). Anything
        before the first such line forms its own leading chunk.

        Returns:
            The chunks in file order, each a newline-joined string.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        chunks = []
        current_chunk = []
        for line in content.split('\n'):
            starts_definition = line.strip().startswith(('def ', 'class '))
            if starts_definition and current_chunk:
                chunks.append('\n'.join(current_chunk))
                current_chunk = [line]
            else:
                current_chunk.append(line)
        if current_chunk:
            chunks.append('\n'.join(current_chunk))
        return chunks

    def embed_and_store(self, documents: List[str], file_path: str):
        """Embed ``documents`` and store them under ids derived from ``file_path``.

        Ids have the form ``"{file_path}_{index}"``. Because they are
        deterministic, chunks are upserted: indexing the same file again
        replaces the stored entries instead of re-adding existing ids.

        Args:
            documents: Chunks to embed; an empty list is a no-op.
            file_path: Source path used as the id prefix.
        """
        if not documents:
            # Nothing to embed; avoid sending empty lists to the collection.
            return
        embeddings = self.embedding_model.encode(documents)
        # NOTE(review): if a file shrinks, entries with higher indexes from
        # an earlier run stay in the collection; delete them separately.
        self.collection.upsert(
            documents=documents,
            embeddings=embeddings.tolist(),
            ids=[f"{file_path}_{i}" for i in range(len(documents))],
        )

    def query(self, query: str, top_k: int = 5) -> List[Dict]:
        """Return up to ``top_k`` stored chunks most similar to ``query``.

        Returns:
            A list of ``{'text': ..., 'distance': ...}`` dicts in the order
            the collection returned them.
        """
        query_embedding = self.embedding_model.encode([query])
        results = self.collection.query(
            query_embeddings=query_embedding.tolist(),
            n_results=top_k,
        )
        return [
            {'text': doc, 'distance': dist}
            for doc, dist in zip(results['documents'][0], results['distances'][0])
        ]

    def generate_response(self, query: str, retrieved_docs: List[Dict]) -> str:
        """Build the LLM prompt and return a placeholder answer.

        The prompt uses the text of the first three retrieved documents as
        context. No LLM is called yet: the prompt is discarded and a fixed
        message mentioning ``len(retrieved_docs)`` is returned.
        """
        context = "\n\n".join(doc['text'] for doc in retrieved_docs[:3])
        # Kept so the integration point is obvious: this is the text to
        # send to the LLM (e.g., llama.cpp) once one is wired in.
        prompt = f"\nContext: {context}\nQuestion: {query}\nAnswer:\n"
        return f"Based on {len(retrieved_docs)} documents, the answer is..."
6. Advanced Techniques: Query Transformation & Hybrid Search
Query Transformation:
python
def transform_query(original_query: str) -> List[str]:
    """Return the original query followed by four rephrased variants.

    The variants prepend a fixed lead-in ("Explain", "How to implement",
    "Code example for", "Best practices for") to ``original_query``.
    """
    lead_ins = (
        "Explain ",
        "How to implement ",
        "Code example for ",
        "Best practices for ",
    )
    variants = [original_query]
    for lead_in in lead_ins:
        variants.append(f"{lead_in}{original_query}")
    return variants
class HybridSearchRAG:
    """Runs a vector search and a keyword search for the same query."""

    def __init__(self):
        # Vector side: the Chroma-backed store.
        self.vector_store = ChromaVectorStore()
        # Keyword side. NOTE(review): TextSearchEngine is not defined in the
        # visible source; it must be provided elsewhere.
        self.text_store = TextSearchEngine()

    def hybrid_search(self, query: str, top_k: int = 5):
        """Combine vector and keyword search"""
        # Vector search
        vector_results = self.vector_store.search(query, top_k)
        # Keyword search (simplified)
        keyword_results = self.text_store.search(query, top_k)
        # Combine results with weighted scoring
        # NOTE(review): the combination step is not shown; as written,
        # both result lists are discarded and the method returns None.
---
📥 **Get the full guide on Gumroad**: https://gumroad.com/l/auto ($7)
Top comments (0)