RAG 시스템 실전 구축 (v7)
Practical Guide for ML Engineers & Backend Developers
1. RAG Fundamentals: The Retrieve-Augment-Generate Loop
Retrieval-Augmented Generation (RAG) systems work in a three-step loop:
- Retrieval: Find relevant documents from a knowledge base
- Augmentation: Combine retrieved documents with the original query
- Generation: Use an LLM to produce the final response from the augmented prompt
The core loop looks like this:
def rag_pipeline(query, vector_db, llm, k=5):
    """Run one retrieve -> augment -> generate pass and return the LLM output.

    Args:
        query: The user question; passed unchanged to the vector store and
            to ``format_prompt``.
        vector_db: Object exposing ``search(query, k=...)``.
        llm: Object exposing ``generate(prompt)``.
        k: Number of documents to request from ``vector_db`` (default 5,
            the value that was previously hard-coded).

    Returns:
        Whatever ``llm.generate`` returns for the augmented prompt.
    """
    # 1. Retrieval
    relevant_docs = vector_db.search(query, k=k)
    # 2. Augmentation
    augmented_prompt = format_prompt(query, relevant_docs)
    # 3. Generation
    return llm.generate(augmented_prompt)
With proper chunking and embedding strategies, this simple structure handles most real-world applications.
2. Chunking Strategies
Semantic Chunking (Recommended for most use cases)
import tiktoken
from sentence_transformers import SentenceTransformer
from typing import List, Tuple
def semantic_chunking(text: str, model: SentenceTransformer, max_tokens: int = 512) -> List[str]:
    """Greedily pack paragraphs of ``text`` into chunks of at most ``max_tokens`` tokens.

    The text is split on blank lines, and consecutive paragraphs are joined
    with a single space until adding the next paragraph would exceed the
    token budget. Token counts come from the tiktoken encoding for "gpt-4".

    Args:
        text: Document to chunk.
        model: Accepted for interface compatibility; the current
            implementation does not use it.
        max_tokens: Token budget per chunk.

    Returns:
        Chunks in document order. A single paragraph that is longer than
        ``max_tokens`` is emitted as its own oversized chunk (it is not
        split further). Returns an empty list when ``text`` contains no
        non-blank paragraph.
    """
    # Resolve the tokenizer once instead of once per paragraph.
    encoder = tiktoken.encoding_for_model("gpt-4")

    chunks = []
    current_chunk = []
    current_length = 0
    for para in text.split('\n\n'):
        # Blank paragraphs (empty input, runs of 3+ newlines) carry no
        # content; packing them would only add stray spaces or emit an
        # empty chunk that later gets embedded.
        if not para.strip():
            continue
        para_tokens = len(encoder.encode(para))
        if current_chunk and current_length + para_tokens > max_tokens:
            # Budget exceeded: flush what we have and start a new chunk.
            chunks.append(' '.join(current_chunk))
            current_chunk = [para]
            current_length = para_tokens
        else:
            current_chunk.append(para)
            current_length += para_tokens
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks
Recursive Chunking (for documents with clear structure)
def recursive_chunking(text: str, max_size: int = 1000) -> List[str]:
    """Split ``text`` into stripped chunks using progressively finer separators.

    The text is always split on blank lines first. Any piece that is still
    longer than ``max_size`` characters is split on single newlines, then on
    sentence boundaries (whitespace following ``.``, ``!`` or ``?``), and as a
    last resort is cut into fixed ``max_size``-character slices.

    Args:
        text: Text to split.
        max_size: Maximum chunk length in characters; must be at least 1.

    Returns:
        Non-empty, whitespace-stripped chunks in document order. Pieces are
        never merged, so chunks may be much shorter than ``max_size``.

    Raises:
        ValueError: If ``max_size`` is smaller than 1.
    """
    import re

    if max_size < 1:
        raise ValueError(f"max_size must be >= 1, got {max_size}")

    # Ordered from coarsest to finest. Each recursion level moves to the next
    # separator, which guarantees termination (re-splitting an oversized piece
    # on the same separator would return the piece unchanged forever).
    separators = (r'\n\s*\n', r'\n', r'(?<=[.!?])\s+')

    def _split(piece: str, level: int) -> List[str]:
        """Split ``piece`` with ``separators[level]``, recursing on oversized parts."""
        if level == len(separators):
            # No separator left: fall back to fixed-width slices.
            slices = (piece[i:i + max_size].strip() for i in range(0, len(piece), max_size))
            return [s for s in slices if s]

        out = []
        for part in re.split(separators[level], piece):
            part = part.strip()
            if not part:
                continue
            if len(part) > max_size:
                out.extend(_split(part, level + 1))
            else:
                out.append(part)
        return out

    return _split(text, 0)
3. Embedding Model Selection and Comparison
from sentence_transformers import SentenceTransformer
from InstructorEmbedding import INSTRUCTOR
import numpy as np
class EmbeddingModel:
    """Thin wrapper that loads one of a fixed set of embedding models by short name.

    Supported names are listed in ``SUPPORTED_MODELS``.
    """

    # Short names accepted by ``__init__``.
    SUPPORTED_MODELS = ("all-MiniLM-L6-v2", "instructor-large", "gte-small")

    def __init__(self, model_name: str):
        """Load the model identified by ``model_name``.

        Raises:
            ValueError: If ``model_name`` is not one of ``SUPPORTED_MODELS``.
                Failing here avoids a later ``AttributeError`` on
                ``self.model`` when ``encode`` is first called.
        """
        if model_name == "all-MiniLM-L6-v2":
            self.model = SentenceTransformer("all-MiniLM-L6-v2")
        elif model_name == "instructor-large":
            # NOTE(review): INSTRUCTOR models are normally fed
            # [instruction, text] pairs; confirm plain strings are intended.
            self.model = INSTRUCTOR("hkunlp/instructor-large")
        elif model_name == "gte-small":
            # NOTE(review): verify this hub id resolves; GTE-small is
            # commonly published as "thenlper/gte-small".
            self.model = SentenceTransformer("sentence-transformers/gte-small")
        else:
            raise ValueError(
                f"Unsupported embedding model {model_name!r}; "
                f"expected one of {self.SUPPORTED_MODELS}"
            )

    def encode(self, texts: List[str]) -> np.ndarray:
        """Return the embeddings produced by the wrapped model for ``texts``."""
        return self.model.encode(texts)

    def get_dimension(self) -> int:
        """Return the wrapped model's sentence-embedding dimension."""
        return self.model.get_sentence_embedding_dimension()
# Benchmark comparison
def benchmark_embeddings(models: List[str], test_texts: List[str]):
    """Time ``EmbeddingModel.encode`` on ``test_texts`` for each model name.

    Args:
        models: Model names accepted by ``EmbeddingModel``.
        test_texts: Texts to embed with every model.

    Returns:
        Dict mapping each model name to a dict with:
        ``"time"`` (seconds spent in ``encode``; model loading is excluded),
        ``"dimension"`` (embedding dimension reported by the model), and
        ``"size_mb"`` (estimated embedding size in MiB).
    """
    # Local import: ``time`` is not imported in the visible part of the file.
    import time

    results = {}
    for model_name in models:
        model = EmbeddingModel(model_name)
        # perf_counter is monotonic and high-resolution, unlike time.time().
        start_time = time.perf_counter()
        embeddings = model.encode(test_texts)
        elapsed = time.perf_counter() - start_time
        dimension = model.get_dimension()
        results[model_name] = {
            "time": elapsed,
            "dimension": dimension,
            # 4 bytes per value: assumes float32 embeddings.
            "size_mb": len(embeddings) * dimension * 4 / (1024 * 1024),
        }
    return results
Model Comparison (for 1000 texts; sizes assume float32 embeddings):
- all-MiniLM-L6-v2: Fast (0.5s), 384d, ~1.5MB
- gte-small: Balanced (0.8s), 384d, ~1.5MB
- instructor-large: Slow (1.5s), 768d, ~2.9MB
Recommendation: Use all-MiniLM-L6-v2 for most applications; it offers a good balance of speed and quality.
4. Vector Database Comparison
# Chroma Client
import chromadb
from chromadb import Client
class ChromaVectorDB:
    """Chroma-backed document store that embeds text with a pluggable embedder."""

    def __init__(self, path: str, embedder=None):
        """Open (or create) the on-disk Chroma store at ``path``.

        Args:
            path: Directory where Chroma persists its data.
            embedder: Object exposing ``encode(list_of_texts)``. Defaults to
                ``SentenceTransformer("all-MiniLM-L6-v2")``. The attribute was
                previously never assigned, so every call to ``add_documents``
                or ``search`` raised ``AttributeError``.
        """
        # ``chromadb.Client`` takes a Settings object, not a path;
        # PersistentClient is the constructor that accepts a directory.
        self.client = chromadb.PersistentClient(path=path)
        self.collection = self.client.get_or_create_collection("documents")
        if embedder is None:
            embedder = SentenceTransformer("all-MiniLM-L6-v2")
        self.embedder = embedder

    def add_documents(self, documents: List[str], ids: List[str]):
        """Embed ``documents`` and add them to the collection under ``ids``."""
        embeddings = self.embedder.encode(documents)
        self.collection.add(
            embeddings=embeddings,
            documents=documents,
            ids=ids,
        )

    def search(self, query: str, k: int = 5) -> List[str]:
        """Return the text of the ``k`` stored documents closest to ``query``."""
        query_embedding = self.embedder.encode([query])[0]
        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=k,
            include=['documents', 'distances'],
        )
        # Results are grouped per query embedding; only one was sent.
        return results['documents'][0]
# Qdrant Client
from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition
class QdrantVectorDB:
    """Qdrant-backed document store that embeds text with a pluggable embedder.

    NOTE(review): nothing in this class creates the "documents" collection;
    presumably it is provisioned elsewhere - confirm.
    """

    def __init__(self, host: str, port: int, embedder=None):
        """Connect to Qdrant at ``host:port``.

        Args:
            host: Qdrant server host.
            port: Qdrant server port.
            embedder: Object exposing ``encode(list_of_texts)``. Defaults to
                ``SentenceTransformer("all-MiniLM-L6-v2")``. The attribute was
                previously never assigned, so every call to ``add_documents``
                or ``search`` raised ``AttributeError``.
        """
        self.client = QdrantClient(host=host, port=port)
        self.collection_name = "documents"
        if embedder is None:
            embedder = SentenceTransformer("all-MiniLM-L6-v2")
        self.embedder = embedder

    def add_documents(self, documents: List[str], ids: List[str]):
        """Embed ``documents`` and upsert one point per document.

        Each point id is a UUID derived deterministically from the matching
        entry of ``ids``, so re-adding the same id updates that point, and
        separate calls with different ids no longer overwrite each other
        (the previous code ignored ``ids`` and always wrote points 0..n-1).
        The caller-supplied id is kept in the payload under ``"doc_id"``.

        Raises:
            ValueError: If ``documents`` and ``ids`` differ in length.
        """
        import uuid

        from qdrant_client.models import PointStruct

        if len(documents) != len(ids):
            raise ValueError(
                f"documents and ids must have the same length "
                f"({len(documents)} != {len(ids)})"
            )
        if not documents:
            return

        # One batched encode call instead of one call per document.
        vectors = self.embedder.encode(documents)
        points = [
            PointStruct(
                # Qdrant ids must be unsigned ints or UUIDs, not free-form strings.
                id=str(uuid.uuid5(uuid.NAMESPACE_URL, str(doc_id))),
                vector=[float(v) for v in vector],
                payload={"text": doc, "doc_id": doc_id},
            )
            for doc_id, doc, vector in zip(ids, documents, vectors)
        ]
        self.client.upsert(
            collection_name=self.collection_name,
            points=points,
        )

    def search(self, query: str, k: int = 5) -> List[str]:
        """Return the stored text of the ``k`` points closest to ``query``."""
        query_vector = [float(v) for v in self.embedder.encode([query])[0]]
        results = self.client.search(
            collection_name=self.collection_name,
            query_vector=query_vector,
            limit=k,
        )
        return [hit.payload['text'] for hit in results]
# pgvector with PostgreSQL
import psycopg2
from psycopg2.extras import Json
class PgVectorDB:
    """PostgreSQL + pgvector document store with a pluggable embedder.

    NOTE(review): the ``VECTOR`` type requires the pgvector extension to be
    installed in the target database; this class does not create it.
    """

    def __init__(self, connection_string: str, embedder=None, dimension: int = 384):
        """Connect and make sure the ``documents`` table and its index exist.

        Args:
            connection_string: libpq connection string for ``psycopg2.connect``.
            embedder: Object exposing ``encode(list_of_texts)``. Defaults to
                ``SentenceTransformer("all-MiniLM-L6-v2")``. The attribute was
                previously never assigned, so every call to ``add_documents``
                or ``search`` raised ``AttributeError``.
            dimension: Width of the ``embedding`` column. Defaults to 384,
                the value that was previously hard-coded; it must match the
                embedder's output size.

        Raises:
            ValueError: If ``dimension`` is smaller than 1.
        """
        dimension = int(dimension)
        if dimension < 1:
            raise ValueError(f"dimension must be >= 1, got {dimension}")
        self.dimension = dimension
        self.conn = psycopg2.connect(connection_string)
        if embedder is None:
            embedder = SentenceTransformer("all-MiniLM-L6-v2")
        self.embedder = embedder
        self.create_table()

    def create_table(self):
        """Create the ``documents`` table and its ivfflat L2 index if missing."""
        # ``with self.conn`` commits on success and rolls back on error, so a
        # failed statement does not leave the connection in an aborted
        # transaction.
        with self.conn, self.conn.cursor() as cur:
            # DDL cannot take bound parameters; self.dimension is a
            # validated int, so interpolating it is safe.
            cur.execute(f"""
                CREATE TABLE IF NOT EXISTS documents (
                    id SERIAL PRIMARY KEY,
                    content TEXT,
                    embedding VECTOR({self.dimension}),
                    metadata JSONB
                )
            """)
            cur.execute("""
                CREATE INDEX IF NOT EXISTS idx_embedding
                ON documents USING ivfflat (embedding vector_l2_ops)
            """)

    def add_documents(self, documents: List[str], metadata: List[dict]):
        """Embed ``documents`` and insert each with its metadata dict.

        Raises:
            ValueError: If ``documents`` and ``metadata`` differ in length
                (``zip`` would otherwise silently drop the extras).
        """
        if len(documents) != len(metadata):
            raise ValueError(
                f"documents and metadata must have the same length "
                f"({len(documents)} != {len(metadata)})"
            )
        if not documents:
            return

        # One batched encode call instead of one call per document.
        vectors = self.embedder.encode(documents)
        with self.conn, self.conn.cursor() as cur:
            for doc, meta, vector in zip(documents, metadata, vectors):
                cur.execute(
                    "INSERT INTO documents (content, embedding, metadata) "
                    "VALUES (%s, %s::vector, %s)",
                    (doc, [float(v) for v in vector], Json(meta)),
                )

    def search(self, query: str, k: int = 5) -> List[str]:
        """Return the content of the ``k`` rows nearest to ``query`` by L2 distance."""
        query_embedding = [float(v) for v in self.embedder.encode([query])[0]]
        with self.conn, self.conn.cursor() as cur:
            # psycopg2 sends a Python list as a SQL array; the explicit
            # ::vector cast is required for the <-> operator to resolve.
            cur.execute(
                """
                SELECT content FROM documents
                ORDER BY embedding <-> %s::vector
                LIMIT %s
                """,
                (query_embedding, k),
            )
            rows = cur.fetchall()
        return [row[0] for row in rows]
# Comparison summary (qualitative, apart from the Chroma timing):
# Chroma: Fastest (0.1s), local-only, good for development
# Qdrant: Best for production; distributed, good performance
# PgVector: Most scalable, best for large datasets
# Milvus: Highest performance, best for enterprise (no client code is shown for it in this section)
5. Full RAG Pipeline Code from Scratch
python
import os
import json
from typing import List, Dict, Any
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb import Client
class RAGSystem:
def __init__(self, embedding_model_name: str = "all-MiniLM-L6-v2"):
self.embedding_model = SentenceTransformer(embedding_model_name)
self.client = Client()
self.collection = self.client.get_or_create_collection("docs")
def add_documents(self, documents: List[Dict[str, Any]]):
"""Add documents to the system"""
texts = [doc['content'] for doc in documents]
ids = [doc['id'] for doc in documents]
# Chunk documents
chunked_texts = []
chunked_ids = []
for doc, doc_id in zip(documents, ids):
chunks = semantic_chunking(doc['content'], self.embedding_model)
for i, chunk in enumerate(chunks):
chunked_texts.append(chunk)
chunked_ids.append(f"{doc_id}_chunk_{i}")
# Create embeddings
embeddings = self.embedding_model.encode(chunked_texts)
# Store in vector DB
self.collection.add(
embeddings
---
📥 **Get the full guide on Gumroad**: https://gumroad.com/l/auto ($7)
Top comments (0)