RAG 시스템 실전 구축 (v38)
Real-World RAG Implementation Guide for ML Engineers
1. RAG Fundamentals: The Core Loop
Retrieval-Augmented Generation (RAG) is a powerful pattern that combines information retrieval with language generation. The core loop consists of three phases:
- Retrieval: Find relevant documents from a knowledge base
- Augmentation: Inject retrieved context into prompts
- Generation: Generate responses using the augmented prompt
# Simplified RAG Loop
class BasicRAG:
    """Minimal retrieve -> augment -> generate loop.

    The three collaborators are duck-typed:
    ``embedding_model.encode(text)``, ``vector_db.search(embedding, k=...)``
    returning objects with a ``content`` attribute, and ``llm.generate(prompt)``.
    """

    def __init__(self, vector_db, embedding_model, llm):
        self.vector_db = vector_db
        self.embedding_model = embedding_model
        self.llm = llm

    def query(self, user_query, k=5):
        """Answer ``user_query`` using the ``k`` best-matching documents as context.

        Args:
            user_query: The question text; it is embedded for retrieval and
                repeated verbatim in the prompt.
            k: Number of documents requested from the vector store
                (defaults to 5, the value that used to be hard-coded).

        Returns:
            Whatever ``llm.generate`` returns for the augmented prompt.
        """
        # 1. Retrieve relevant documents
        query_embedding = self.embedding_model.encode(user_query)
        relevant_docs = self.vector_db.search(query_embedding, k=k)

        # 2. Augment prompt with context
        context = "\n".join(doc.content for doc in relevant_docs)
        augmented_prompt = f"Context: {context}\n\nQuestion: {user_query}"

        # 3. Generate response
        return self.llm.generate(augmented_prompt)
2. Chunking Strategies
Effective document chunking is critical for retrieval quality. Here are the main approaches:
Semantic Chunking
import numpy as np
from sentence_transformers import SentenceTransformer
class SemanticChunker:
    """Group consecutive sentences into chunks while they stay on the same topic."""

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)

    def chunk_by_semantic(
        self,
        text,
        max_tokens=512,
        similarity_threshold=0.7,
        max_sentences_per_chunk=21,
    ):
        """Split ``text`` into chunks of semantically similar, adjacent sentences.

        Sentences are obtained by splitting on ``'. '``. A sentence starts a
        new chunk when its cosine similarity to the centroid (mean embedding)
        of the current chunk drops below ``similarity_threshold``, or when
        adding it would exceed one of the size limits.

        Args:
            text: Input text.
            max_tokens: Approximate per-chunk budget, counted as
                whitespace-separated words (not model tokens). A single
                sentence longer than the budget still becomes its own chunk.
            similarity_threshold: Minimum cosine similarity for a sentence to
                join the current chunk.
            max_sentences_per_chunk: Upper bound on sentences per chunk.

        Returns:
            A list of chunk strings; empty when ``text`` has no sentences.
        """
        pieces = text.split('. ')
        last_index = len(pieces) - 1
        # split() consumed the ". " separators, so put the period back on
        # every piece except the last (which keeps whatever ending it had).
        sentences = [
            piece if index == last_index else piece + '.'
            for index, piece in enumerate(pieces)
            if piece.strip()
        ]
        if not sentences:
            return []

        embeddings = self.model.encode(sentences)

        chunks = []
        current_chunk = []
        embedding_sum = None  # running sum, so the centroid is a true mean
        word_count = 0

        for sentence, embedding in zip(sentences, embeddings):
            embedding = np.asarray(embedding, dtype=float)
            sentence_words = len(sentence.split())

            if current_chunk:
                centroid = embedding_sum / len(current_chunk)
                denominator = np.linalg.norm(centroid) * np.linalg.norm(embedding)
                # A zero-length vector has no direction; in that case the
                # similarity is undefined and does not trigger a split.
                off_topic = (
                    denominator > 0
                    and np.dot(centroid, embedding) / denominator < similarity_threshold
                )
                too_large = (
                    len(current_chunk) >= max_sentences_per_chunk
                    or word_count + sentence_words > max_tokens
                )
                if off_topic or too_large:
                    chunks.append(' '.join(current_chunk))
                    current_chunk = []
                    embedding_sum = None
                    word_count = 0

            current_chunk.append(sentence)
            embedding_sum = embedding if embedding_sum is None else embedding_sum + embedding
            word_count += sentence_words

        if current_chunk:
            chunks.append(' '.join(current_chunk))
        return chunks
Recursive Chunking
class RecursiveChunker:
    """Cut text into overlapping, size-bounded chunks, preferring sentence ends."""

    def __init__(self, max_chunk_size=512, overlap=50):
        self.max_chunk_size = max_chunk_size
        self.overlap = overlap

    def chunk_recursive(self, text):
        """Split ``text`` into chunks of at most ``max_chunk_size`` characters.

        Each cut is placed just after the last ``'. '`` sentence boundary
        inside the current window when one exists, otherwise at the window
        edge. The next chunk starts ``overlap`` characters before the cut so
        neighbouring chunks share context. The whole text is always covered.

        Args:
            text: Input text.

        Returns:
            A list of chunk strings in document order; empty for empty text.

        Raises:
            ValueError: If ``max_chunk_size`` is not positive.
        """
        size = self.max_chunk_size
        if size <= 0:
            raise ValueError("max_chunk_size must be positive")
        if not text:
            return []

        overlap = max(0, self.overlap)
        chunks = []
        start = 0

        # Iterative on purpose: long documents would otherwise hit a depth
        # limit long before the text is consumed.
        while len(text) - start > size:
            window_end = start + size
            # Only accept boundaries beyond the overlap region; a cut closer
            # to `start` would make the next window begin at or before it.
            boundary = text.rfind('. ', start + overlap, window_end)
            # Cut after the period so the sentence keeps its terminator.
            end = boundary + 1 if boundary != -1 else window_end
            chunks.append(text[start:end])

            next_start = end - overlap
            # If the overlap would swallow the whole step, drop it for this
            # chunk rather than stall.
            start = next_start if next_start > start else end

        chunks.append(text[start:])
        return chunks
3. Embedding Model Selection
Choosing the right embedding model affects both performance and cost:
# Model comparison benchmark
import time
from sentence_transformers import SentenceTransformer
def benchmark_embeddings():
    """Time a small encode call for several embedding models and print the results.

    For each model this loads it with ``SentenceTransformer``, encodes three
    fixed sentences, and prints the elapsed time, the dimension of the
    vectors actually produced, and the approximate size from the table below.
    Model loading is not included in the timing.
    """
    # Reference figures for the article; only `size_mb` is printed, the
    # dimension shown at runtime is measured from the model output.
    models = {
        "all-MiniLM-L6-v2": {
            "dimensions": 384,
            "size_mb": 80,
            "speed": "fast"
        },
        "all-mpnet-base-v2": {
            "dimensions": 768,
            "size_mb": 400,
            "speed": "medium"
        },
        "BAAI/bge-small-en": {
            # bge-small-en produces 384-dimensional vectors (not 512).
            "dimensions": 384,
            "size_mb": 120,
            "speed": "fast"
        }
    }
    test_sentences = [
        "The quick brown fox jumps over the lazy dog",
        "Machine learning models require large datasets",
        "Natural language processing enables human-like interactions"
    ]
    for name, config in models.items():
        model = SentenceTransformer(name)
        # perf_counter is monotonic and high-resolution, unlike wall-clock time().
        start = time.perf_counter()
        embeddings = model.encode(test_sentences)
        elapsed = time.perf_counter() - start
        measured_dimensions = len(embeddings[0])
        print(f"{name}: {elapsed:.2f}s for {len(test_sentences)} sentences")
        print(f" Dimensions: {measured_dimensions}, Size: {config['size_mb']}MB")
# Example benchmark output (illustrative; timings vary with hardware and warm-up):
# all-MiniLM-L6-v2: 0.15s for 3 sentences
# all-mpnet-base-v2: 0.35s for 3 sentences
# BAAI/bge-small-en: 0.20s for 3 sentences
4. Vector Database Comparison
| Database | Pros | Cons | Best For |
|---|---|---|---|
| Chroma | Easy setup, Python native, good for dev | Limited scalability | Local/development |
| Qdrant | High performance, advanced filtering | Complex setup | Production |
| pgvector | PostgreSQL integration, ACID | Requires PostgreSQL | Existing SQL systems |
| Milvus | Scalable, distributed | Steep learning curve | Large deployments |
# Example implementation with different vector DBs
class VectorDBFactory:
    """Build a handle for one of the supported vector stores."""

    @staticmethod
    def create_vector_db(db_type, **kwargs):
        """Return a client, connection, or collection for ``db_type``.

        Backend packages are imported inside each branch, so only the
        selected backend has to be installed.

        Args:
            db_type: One of ``"chroma"``, ``"qdrant"``, ``"pgvector"``,
                ``"milvus"``.
            **kwargs: Forwarded to the backend:
                chroma -> ``client.get_or_create_collection`` (needs ``name``),
                qdrant -> ``QdrantClient``,
                pgvector -> ``psycopg2.connect``,
                milvus -> ``pymilvus.Collection``.

        Returns:
            A Chroma collection, a ``QdrantClient``, a psycopg2 connection,
            or a Milvus ``Collection``, depending on ``db_type``.

        Raises:
            ValueError: If ``db_type`` is not one of the supported names.
        """
        if db_type == "chroma":
            import chromadb
            client = chromadb.Client()
            # Collections are obtained from the client; they are not meant to
            # be constructed directly.
            return client.get_or_create_collection(**kwargs)
        if db_type == "qdrant":
            from qdrant_client import QdrantClient
            return QdrantClient(**kwargs)
        if db_type == "pgvector":
            import psycopg2
            return psycopg2.connect(**kwargs)
        if db_type == "milvus":
            from pymilvus import Collection
            # NOTE(review): pymilvus presumably needs an active connection
            # (connections.connect) before this call - confirm with the caller.
            return Collection(**kwargs)
        raise ValueError(
            f"Unsupported vector database type: {db_type!r} "
            "(expected 'chroma', 'qdrant', 'pgvector' or 'milvus')"
        )
5. Full RAG Pipeline Implementation
import os
from sentence_transformers import SentenceTransformer
from chromadb import Client
from chromadb.config import Settings
from typing import List, Dict
import json
class CompleteRAGPipeline:
    """End-to-end example: embed documents into Chroma, retrieve, and prompt an LLM."""

    def __init__(self, model_name="all-MiniLM-L6-v2", db_path="./chroma_db"):
        """Load the embedding model and open the ``documents`` collection.

        Args:
            model_name: Sentence-Transformers model used for both documents
                and queries.
            db_path: Directory where Chroma stores its data.
        """
        # Imported here so the on-disk client is available without touching
        # the file-level imports.
        from chromadb import PersistentClient

        self.embedding_model = SentenceTransformer(model_name)
        # PersistentClient writes to db_path; a plain Client is in-memory.
        self.vector_client = PersistentClient(path=db_path)
        self.collection = self.vector_client.get_or_create_collection("documents")
        # Simple LLM placeholder (replace with actual implementation)
        self.llm = self._simple_llm_response

    def _simple_llm_response(self, prompt):
        """Stand-in for a real LLM call: echo the first 50 characters of the prompt."""
        return f"Generated response to: {prompt[:50]}..."

    @staticmethod
    def _as_plain_lists(embeddings):
        """Convert an array-like of vectors into nested lists of Python floats."""
        return [[float(value) for value in vector] for vector in embeddings]

    def add_documents(self, documents: List[Dict]):
        """Embed ``documents`` and add them to the collection.

        Args:
            documents: Dicts with required ``'id'`` and ``'content'`` keys and
                an optional ``'metadata'`` dict. An empty list is a no-op.
        """
        if not documents:
            return

        contents = [doc['content'] for doc in documents]
        embeddings = self._as_plain_lists(self.embedding_model.encode(contents))
        # Chroma rejects empty metadata dicts, so missing/empty becomes None.
        metadatas = [doc.get('metadata') or None for doc in documents]
        has_metadata = any(meta is not None for meta in metadatas)

        self.collection.add(
            embeddings=embeddings,
            documents=contents,
            metadatas=metadatas if has_metadata else None,
            ids=[doc['id'] for doc in documents],
        )

    def search_and_generate(self, query: str, top_k: int = 5):
        """Run the full retrieve -> augment -> generate workflow for ``query``.

        Args:
            query: The user question.
            top_k: Number of documents requested from the collection.

        Returns:
            A dict with keys ``'query'``, ``'context'`` (retrieved texts joined
            by ``---`` separator lines), ``'response'`` (the LLM output) and
            ``'retrieved_docs'`` (the list of retrieved texts).
        """
        # 1. Retrieve
        query_embedding = self._as_plain_lists(self.embedding_model.encode([query]))
        results = self.collection.query(
            query_embeddings=query_embedding,
            n_results=top_k,
            include=['documents', 'metadatas'],
        )

        # 2. Augment - results are grouped per query; there is exactly one query.
        retrieved_docs = results['documents'][0]
        context = "\n---\n".join(retrieved_docs)
        augmented_prompt = f"\nContext: {context}\nQuestion: {query}\nAnswer:"

        # 3. Generate
        response = self.llm(augmented_prompt)
        return {
            "query": query,
            "context": context,
            "response": response,
            "retrieved_docs": retrieved_docs,
        }
# Usage example
# Two tiny documents from different sources so retrieval has something to rank.
sample_docs = [
    dict(
        id="1",
        content="The capital of France is Paris. Paris is known for the Eiffel Tower.",
        metadata=dict(source="wiki"),
    ),
    dict(
        id="2",
        content="Machine learning is a subset of artificial intelligence that focuses on algorithms.",
        metadata=dict(source="tech_blog"),
    ),
]

# Build the pipeline with its defaults and index the samples.
pipeline = CompleteRAGPipeline()
pipeline.add_documents(sample_docs)

# Ask one question and pretty-print the full result (non-ASCII kept readable).
result = pipeline.search_and_generate("What is the capital of France?")
print(json.dumps(result, indent=2, ensure_ascii=False))
6. Advanced Techniques
Query Transformation
python
class QueryTransformer:
def __init__(self):
self.transformations = [
self.expand_query,
self.rephrase_query,
---
📥 **Get the full guide on Gumroad**: https://gumroad.com/l/auto ($7)
Top comments (0)