RAG 시스템 실전 구축 (v24)
1. RAG Fundamentals: The Three-Step Loop
Retrieval-Augmented Generation (RAG) combines the strengths of retrieval systems and generative models. The core loop consists of:
- Retrieval: Find relevant documents from a knowledge base
- Augmentation: Combine retrieved documents with the query
- Generation: Produce a response using the augmented context
# Basic RAG pipeline structure
class BasicRAG:
    """Minimal retrieve-augment-generate pipeline built from three collaborators.

    The collaborators are duck-typed; this class only relies on the calls it
    makes on them:

    * ``embedder.embed(list_of_texts)`` returns an indexable sequence with one
      embedding per input text.
    * ``vector_db.search(embedding, k=...)`` returns an iterable of mappings
      that each have a ``'content'`` key.
    * ``generator.generate(query, context)`` returns the final response.
    """

    def __init__(self, embedder, vector_db, generator):
        self.embedder = embedder
        self.vector_db = vector_db
        self.generator = generator

    def process_query(self, query, k=5):
        """Answer ``query`` using the ``k`` best-matching documents as context.

        Args:
            query: The user question.
            k: Number of documents to request from the vector store.
                Defaults to 5, the value that used to be hard-coded.

        Returns:
            Whatever ``generator.generate`` returns for the query and the
            space-joined contents of the retrieved documents.
        """
        # Step 1: embed the query. The embedder works on batches, so wrap the
        # single query in a list and take the only result back out.
        query_embedding = self.embedder.embed([query])[0]

        # Step 2: retrieve the k nearest documents.
        retrieved_docs = self.vector_db.search(query_embedding, k=k)

        # Step 3: augment and generate. Documents are concatenated in the
        # order the vector store returned them.
        context = " ".join(doc['content'] for doc in retrieved_docs)
        return self.generator.generate(query, context)
2. Chunking Strategies: Finding the Right Balance
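Semantic Chunking: Embeds each sentence with a sentence-transformers model, clusters the embeddings with KMeans, and starts a new chunk wherever consecutive sentences fall into different clusters.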
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import numpy as np
class SemanticChunker:
    """Split text into chunks of consecutive, semantically similar sentences.

    Sentences are embedded with a SentenceTransformer model and clustered
    with KMeans; a new chunk starts wherever two adjacent sentences receive
    different cluster labels.
    """

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)

    def chunk_text(self, text, min_chunk_size=100):
        """Return the chunks of ``text`` that are at least ``min_chunk_size`` characters long.

        Args:
            text: Input text. Sentences are delimited by ``". "``.
            min_chunk_size: Minimum chunk length in characters. Shorter
                chunks are dropped from the result, not merged into a
                neighbour.

        Returns:
            A list of chunk strings in document order (possibly empty).
        """
        pieces = text.split('. ')
        # str.split() consumes the ". " separator. Put the period back on
        # every piece except the last so the chunks keep their sentence
        # boundaries instead of running sentences together.
        sentences = [piece + '.' for piece in pieces[:-1]] + pieces[-1:]
        sentences = [sentence for sentence in sentences if sentence.strip()]
        if not sentences:
            return []

        # Roughly three sentences per cluster, but never fewer than one cluster.
        n_clusters = max(1, len(sentences) // 3)
        if n_clusters == 1:
            # A single cluster labels every sentence 0, so the embedding pass
            # and the KMeans fit cannot change the outcome; skip both.
            clusters = [0] * len(sentences)
        else:
            embeddings = self.model.encode(sentences)
            # random_state is fixed so repeated runs cluster identically.
            kmeans = KMeans(n_clusters=n_clusters, random_state=42)
            clusters = kmeans.fit_predict(embeddings)

        chunks = []
        current_chunk = []
        current_cluster = clusters[0]
        for sentence, cluster in zip(sentences, clusters):
            # A label change between neighbouring sentences closes the
            # current chunk and opens the next one.
            if cluster != current_cluster and current_chunk:
                chunks.append(' '.join(current_chunk))
                current_chunk = []
                current_cluster = cluster
            current_chunk.append(sentence)
        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return [chunk for chunk in chunks if len(chunk) >= min_chunk_size]
# Example usage
# Build a chunker with the default model name ("all-MiniLM-L6-v2").
chunker = SemanticChunker()
# Three-sentence sample; with fewer than six sentences the chunker asks
# KMeans for a single cluster (max(1, n // 3)).
text = "Large language models are powerful AI systems. They can understand context and generate human-like text. Training these models requires significant computational resources."
# min_chunk_size is left at its default of 100 characters.
chunks = chunker.chunk_text(text)
print(f"Created {len(chunks)} chunks")
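Recursive Chunking: Splits text into fixed-size character windows in which each window overlaps the end of the previous one. (Despite the name, the implementation below is an iterative sliding window rather than a recursive splitter.)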
class RecursiveChunker:
    """Split text into fixed-size character windows that overlap their predecessor.

    Args:
        chunk_size: Maximum number of characters per chunk. Must be positive.
        overlap: Number of trailing characters of one chunk that are repeated
            at the start of the next. Must be smaller than ``chunk_size``.

    Raises:
        ValueError: If the window could never advance
            (``chunk_size <= 0`` or ``overlap >= chunk_size``).
    """

    def __init__(self, chunk_size=512, overlap=64):
        self.chunk_size = chunk_size
        self.overlap = overlap
        # Fail fast on a configuration that can never make progress.
        self._stride()

    def _stride(self):
        """Return how far the window start advances per chunk, validating the settings."""
        stride = self.chunk_size - self.overlap
        if self.chunk_size <= 0 or stride <= 0:
            raise ValueError(
                "chunk_size must be positive and overlap must be smaller than chunk_size"
            )
        return stride

    def chunk_text(self, text):
        """Return the overlapping chunks of ``text`` in order.

        An empty ``text`` yields an empty list. Text no longer than
        ``chunk_size`` yields a single chunk.

        Raises:
            ValueError: If ``chunk_size`` / ``overlap`` were changed to values
                with which the window cannot advance.
        """
        # Re-validate in case the attributes were modified after construction.
        self._stride()

        chunks = []
        text_length = len(text)
        start = 0
        while start < text_length:
            end = min(start + self.chunk_size, text_length)
            chunks.append(text[start:end])
            if end == text_length:
                # The final window reached the end of the text. Stepping back
                # by the overlap from here would re-emit the same tail
                # forever, so stop.
                break
            start = end - self.overlap
        return chunks
# Benchmark comparison
def benchmark_chunking_methods():
    """Chunk one sample paragraph with both chunkers and print the chunk counts.

    Both chunkers are built with their default settings. Nothing is returned;
    the two counts are written to stdout.
    """
    sample_text = "This is a test document. It contains multiple sentences. Each sentence should be properly chunked for optimal retrieval performance. The chunking method should balance context preservation with token efficiency."

    # Construct both chunkers first, then run them in the same order.
    chunkers = (SemanticChunker(), RecursiveChunker())
    semantic_chunks, recursive_chunks = (
        chunker.chunk_text(sample_text) for chunker in chunkers
    )

    print(f"Semantic chunks: {len(semantic_chunks)}")
    print(f"Recursive chunks: {len(recursive_chunks)}")
3. Embedding Model Selection and Comparison
Choosing the right embedding model is crucial for RAG performance.
from sentence_transformers import SentenceTransformer
import time
class EmbeddingBenchmark:
    """Time sentence-transformers models on a small batch of sentences."""

    def __init__(self):
        # Model name -> coarse speed label. The names must be published
        # sentence-transformers checkpoints, because benchmark_model() passes
        # them straight to SentenceTransformer(...).
        self.models = {
            'all-MiniLM-L6-v2': 'fast',
            'all-mpnet-base-v2': 'medium',
            'multi-qa-MiniLM-L6-cos-v1': 'fast',
            'paraphrase-multilingual-MiniLM-L12-v2': 'medium',
        }

    def benchmark_model(self, model_name, test_sentences, batch_size=32):
        """Encode ``test_sentences`` with ``model_name`` and report timing and size.

        Args:
            model_name: Name passed to ``SentenceTransformer``.
            test_sentences: Non-empty sequence of strings to encode.
            batch_size: Batch size forwarded to ``model.encode``.

        Returns:
            A dict with the keys ``'model'``, ``'avg_time_per_sentence'``
            (seconds), ``'embedding_dim'`` and ``'memory_usage'`` (bytes,
            computed on the assumption of 4-byte float32 components).

        Raises:
            IndexError: If ``test_sentences`` is empty (the warm-up call
                indexes the first sentence).
        """
        model = SentenceTransformer(model_name)

        # Warm-up: run one encode so one-time setup cost is not included in
        # the measured interval.
        _ = model.encode([test_sentences[0]])

        # perf_counter() is monotonic and high-resolution, unlike time.time(),
        # which can jump when the system clock is adjusted.
        start_time = time.perf_counter()
        embeddings = model.encode(test_sentences, batch_size=batch_size)
        end_time = time.perf_counter()

        avg_time = (end_time - start_time) / len(test_sentences)
        return {
            'model': model_name,
            'avg_time_per_sentence': avg_time,
            'embedding_dim': len(embeddings[0]),
            'memory_usage': len(embeddings) * len(embeddings[0]) * 4  # float32
        }
# Quick benchmark: run every registered model over three short sentences and
# print the average encode time as each model finishes.
benchmark = EmbeddingBenchmark()
test_sentences = [
    "This is the first test sentence.",
    "This is the second test sentence.",
    "This is the third test sentence.",
]

# One result dict per model, in registry order.
results = []
for model_name in benchmark.models:
    result = benchmark.benchmark_model(model_name, test_sentences)
    results.append(result)
    print(f"{result['model']}: {result['avg_time_per_sentence']:.4f}s per sentence")
4. Vector Database Comparison
# Chroma Implementation
import chromadb
from chromadb.utils import embedding_functions
class ChromaVectorDB:
    """Thin wrapper around one Chroma collection.

    Documents are stored as raw text and embedded by the collection's own
    default embedding function; queries are made with an embedding supplied
    by the caller.
    """

    def __init__(self, collection_name="rag_collection"):
        self.client = chromadb.Client()
        default_embedder = embedding_functions.DefaultEmbeddingFunction()
        self.collection = self.client.get_or_create_collection(
            name=collection_name,
            embedding_function=default_embedder,
        )

    def add_documents(self, documents, ids):
        """Store ``documents`` under the matching ``ids``."""
        self.collection.add(documents=documents, ids=ids)

    def search(self, query_embedding, k=5):
        """Return up to ``k`` hits for ``query_embedding``.

        Each hit is a dict with ``'content'`` (the document text) and
        ``'score'``. Note that ``'score'`` carries the value Chroma reports
        under ``'distances'``.
        """
        response = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=k,
        )
        # Only one query embedding was sent, so read the first result row.
        texts = response['documents'][0]
        distances = response['distances'][0]

        hits = []
        for text, distance in zip(texts, distances):
            hits.append({'content': text, 'score': distance})
        return hits
# Qdrant Implementation
from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition, MatchValue
class QdrantVectorDB:
    """Thin wrapper around one Qdrant collection that embeds documents on insert.

    Args:
        host: Qdrant server host.
        port: Qdrant server port.
        collection_name: Collection to use; created if it cannot be fetched.
        embedder: Optional object exposing ``encode(list_of_texts)`` and
            returning one vector per text (the SentenceTransformer
            interface). When omitted, a ``SentenceTransformer`` for
            ``"all-MiniLM-L6-v2"`` is loaded on the first insert.
        vector_size: Dimensionality used when the collection is created.
            Defaults to 384, the previously hard-coded value. It must match
            the vectors the embedder produces.
    """

    def __init__(self, host="localhost", port=6333, collection_name="rag_collection",
                 embedder=None, vector_size=384):
        self.client = QdrantClient(host=host, port=port)
        self.collection_name = collection_name
        self.embedder = embedder

        # Create the collection if it doesn't exist.
        try:
            self.client.get_collection(collection_name)
        except Exception:
            # NOTE(review): the concrete "collection not found" exception
            # type was not confirmed here, so this catches Exception. Unlike
            # a bare ``except:`` it lets KeyboardInterrupt and SystemExit
            # propagate. A real connection failure resurfaces from
            # create_collection below.
            self.client.create_collection(
                collection_name=collection_name,
                vectors_config={"size": vector_size, "distance": "Cosine"}
            )

    def _get_embedding(self, text):
        """Return the embedding of ``text`` as a plain list of numbers."""
        if self.embedder is None:
            # Imported lazily so the class can be used with a caller-supplied
            # embedder without loading sentence-transformers.
            from sentence_transformers import SentenceTransformer
            self.embedder = SentenceTransformer("all-MiniLM-L6-v2")
        vector = self.embedder.encode([text])[0]
        # Array-like results expose tolist(); plain sequences are copied.
        return vector.tolist() if hasattr(vector, "tolist") else list(vector)

    def add_documents(self, documents, ids):
        """Embed each document and upsert it under the matching id.

        NOTE(review): Qdrant is understood to restrict point ids to unsigned
        integers or UUIDs - confirm before passing arbitrary strings.
        """
        points = [
            {
                "id": point_id,
                "vector": self._get_embedding(doc),
                "payload": {"content": doc}
            }
            for point_id, doc in zip(ids, documents)
        ]
        self.client.upsert(
            collection_name=self.collection_name,
            points=points
        )

    def search(self, query_embedding, k=5):
        """Return up to ``k`` hits as dicts with ``'content'`` and ``'score'``."""
        results = self.client.search(
            collection_name=self.collection_name,
            query_vector=query_embedding,
            limit=k
        )
        return [{'content': point.payload['content'], 'score': point.score}
                for point in results]
# Performance benchmark
def compare_vector_dbs():
    """Load four sample documents into a Chroma collection and print a completion line.

    Only the Chroma path is exercised. The Qdrant path is left commented out
    because it requires a running Qdrant server.
    """
    # Sample corpus.
    documents = [
        "Machine learning models require large datasets for training.",
        "Deep learning uses neural networks with multiple layers.",
        "Natural language processing helps computers understand text.",
        "Computer vision enables machines to interpret visual information.",
    ]

    # Chroma: one id per document, "doc_0" .. "doc_3".
    chroma_db = ChromaVectorDB("chroma_test")
    doc_ids = [f"doc_{i}" for i, _ in enumerate(documents)]
    chroma_db.add_documents(documents, doc_ids)

    # Qdrant (requires Qdrant server running):
    # qdrant_db = QdrantVectorDB()
    # qdrant_db.add_documents(documents, [f"doc_{i}" for i in range(len(documents))])

    print("Vector DB comparison completed")
5. Full RAG Pipeline from Scratch
python
from sentence_transformers import SentenceTransformer
from chromadb import Client
from chromadb.utils import embedding_functions
import json
class CompleteRAGPipeline:
def __init__(self, model_name="all-MiniLM-L6-v2"):
# Initialize components
self.embedder = SentenceTransformer(model_name)
self.client = Client()
self.collection = self.client.get_or_create_collection(
name="rag_knowledge_base",
embedding_function=embedding_functions.DefaultEmbeddingFunction()
)
self.context_window = 2048 # Max tokens for context
def add_documents(self, documents, ids, metadata=None):
"""Add documents to the knowledge base"""
self.collection.add(
documents=documents,
ids=ids,
metadatas=metadata or [{}] * len(documents)
)
def retrieve_documents(self, query, k=5):
"""Retrieve relevant documents"""
query_embedding = self.embed
---
📥 **Get the full guide on Gumroad**: https://gumroad.com/l/auto ($7)
Top comments (0)