RAG 시스템 실전 구축 (v31)
개발자를 위한 실용적인 RAG 시스템 구축 가이드
1. RAG 기초 개념: 검색 → 보완 → 생성 루프
RAG(Retrieval-Augmented Generation)는 대규모 언어 모델(LLM)이 학습 데이터에 포함되지 않은 최신 정보나 내부 문서를 알지 못한다는 한계를 극복하기 위한 아키텍처입니다. 다음 3단계로 구성됩니다:
- 검색 (Retrieval): 질문과 관련된 문서 조각을 벡터 데이터베이스에서 찾습니다.
- 보완 (Augmentation): 검색된 문서를 프롬프트에 포함시켜 LLM이 더 정확한 응답을 생성할 수 있도록 합니다.
- 생성 (Generation): LLM은 보완된 프롬프트를 기반으로 최종 응답을 생성합니다.
# Basic RAG loop implementation
class BasicRAG:
    """Minimal retrieve -> augment -> generate pipeline.

    The three collaborators are duck-typed; only the calls made in this
    class are required of them:

    * ``embedding_model.encode(list_of_str)`` returns the query embedding(s)
    * ``vector_db.search(query_embedding, k=...)`` returns an iterable of
      mappings that each expose a ``'text'`` key
    * ``llm.generate(prompt)`` returns the final response
    """

    def __init__(self, embedding_model, vector_db, llm):
        """Store the embedding model, vector store and LLM used by the loop."""
        self.embedding_model = embedding_model
        self.vector_db = vector_db
        self.llm = llm

    def retrieve_and_generate(self, query, k=5):
        """Answer ``query`` using retrieved context.

        Args:
            query: The user question as a string.
            k: Number of documents requested from the vector store
                (defaults to 5, the value that used to be hard-coded).

        Returns:
            Whatever ``llm.generate`` returns for the assembled prompt.
        """
        # 1. Embed the question. The query is wrapped in a list, so the
        #    encoder receives a batch of one.
        query_embedding = self.embedding_model.encode([query])

        # 2. Retrieve the k most relevant documents.
        relevant_docs = self.vector_db.search(query_embedding, k=k)

        # 3. Build the prompt: one retrieved text per line as context.
        context = "\n".join(doc['text'] for doc in relevant_docs)
        prompt = f"질문: {query}\n문맥: {context}\n답변:"

        # 4. Generate the answer.
        return self.llm.generate(prompt)
2. 청킹 전략: 의미적, 재귀적, 에이전트 기반
2.1 의미적 청킹 (Semantic Chunking)
문맥을 고려하여 의미가 이어지는 단위로 문서를 분할합니다. 아래 예시는 단락·줄·공백 순서의 구분자로 나누는 단순화된 구현이며, 임베딩 유사도는 아직 사용하지 않습니다.
from langchain_text_splitters import RecursiveCharacterTextSplitter
import numpy as np
class SemanticChunker:
    """Split text into chunks with a ``RecursiveCharacterTextSplitter``.

    The splitter tries the separators in order: blank line, newline,
    space, and finally the empty string.

    NOTE(review): ``embedding_model`` is stored but never used by
    ``chunk_document``; splitting is purely separator/size based. Confirm
    whether an embedding-based merge step was intended.
    """

    def __init__(self, embedding_model, chunk_size=500, chunk_overlap=50):
        """Configure the underlying splitter.

        Args:
            embedding_model: Kept on the instance; not used for splitting.
            chunk_size: Forwarded to the splitter as ``chunk_size``.
            chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
        """
        self.embedding_model = embedding_model
        self.chunker = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", " ", ""],
        )

    def chunk_document(self, text):
        """Return the chunks produced by the splitter's ``split_text``."""
        return self.chunker.split_text(text)
# Usage example
# NOTE(review): `embedding_model` is not defined in the visible snippet;
# presumably it is created beforehand -- confirm.
chunker = SemanticChunker(embedding_model)
chunks = chunker.chunk_document("문서 내용...")
2.2 재귀적 청킹 (Recursive Chunking)
문서 구조를 고려하여 여러 계층으로 분할합니다.
class RecursiveChunker:
    """Slice text into fixed-size pieces at several granularities.

    For every configured size that is smaller than the text, the whole
    text is cut into consecutive slices of that size. The slices of all
    levels are returned in one flat list, in the order of ``chunk_sizes``.
    """

    def __init__(self, chunk_sizes=None):
        """Store the slice sizes.

        Args:
            chunk_sizes: Iterable of slice lengths in characters. Defaults
                to ``[1000, 500, 250]``. A fresh list is stored per
                instance so that instances never share a mutable default.
        """
        if chunk_sizes is None:
            chunk_sizes = [1000, 500, 250]
        self.chunk_sizes = list(chunk_sizes)

    def recursive_chunk(self, text):
        """Return the slices of ``text`` for every applicable size.

        Args:
            text: The string to split.

        Returns:
            A list of substrings. Text that is not longer than any
            configured size is returned as a single chunk instead of being
            dropped; empty text yields an empty list.
        """
        chunks = []
        for size in self.chunk_sizes:
            # A level only applies when the text does not already fit in it.
            if len(text) > size:
                chunks.extend(
                    text[start:start + size]
                    for start in range(0, len(text), size)
                )
        if not chunks and text:
            # No level applied: keep the short text rather than losing it.
            chunks.append(text)
        return chunks
2.3 에이전트 기반 청킹 (Agentic Chunking)
문서의 주제별로 자동 분할합니다.
class AgentChunker:
    """Group consecutive paragraphs of a document by predicted topic.

    ``topic_model`` is duck-typed: it must offer ``fit_predict(list_of_str)``
    returning one label per input string.
    NOTE(review): the exact contract of ``fit_predict`` is not visible
    here -- confirm that it accepts raw strings and returns per-item labels.
    """

    def __init__(self, topic_model):
        """Store the topic model used to label paragraphs."""
        self.topic_model = topic_model

    def chunk_by_topics(self, text, num_topics=3):
        """Split ``text`` into chunks of adjacent same-topic paragraphs.

        The text is first cut into paragraphs at blank lines, because a
        topic model cannot separate topics when it is given the whole
        document as a single sample.

        Args:
            text: The document to split.
            num_topics: Kept for interface compatibility; it is not
                forwarded because the model's configuration API is not
                known here. TODO: confirm how it should be applied.

        Returns:
            A list of chunk strings in document order; ``[]`` for text
            that contains no non-blank paragraph.

        Raises:
            ValueError: If the model returns a different number of labels
                than there are paragraphs.
        """
        import re

        segments = [
            part.strip()
            for part in re.split(r"\n\s*\n", text)
            if part.strip()
        ]
        if not segments:
            return []
        # One label per paragraph.
        topics = self.topic_model.fit_predict(segments)
        return self._create_chunks_by_topic(segments, topics)

    def _create_chunks_by_topic(self, segments, topics):
        """Merge neighbouring segments that share a topic label."""
        if len(topics) != len(segments):
            raise ValueError(
                "topic model returned "
                f"{len(topics)} labels for {len(segments)} segments"
            )
        groups = []
        previous_label = None
        for label, segment in zip(topics, segments):
            if groups and label == previous_label:
                groups[-1].append(segment)
            else:
                groups.append([segment])
            previous_label = label
        return ["\n\n".join(group) for group in groups]
3. 임베딩 모델 선택과 비교
3.1 다양한 임베딩 모델 비교
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
class EmbeddingComparison:
    """Load several sentence-embedding models and compare their outputs."""

    def __init__(self):
        """Instantiate every model up front and key it by a short name."""
        self.models = {
            'all-MiniLM-L6-v2': SentenceTransformer('all-MiniLM-L6-v2'),
            'all-mpnet-base-v2': SentenceTransformer('all-mpnet-base-v2'),
            'bge-small-en': SentenceTransformer('BAAI/bge-small-en'),
            'gte-small': SentenceTransformer('thenlper/gte-small'),
        }

    def compare_models(self, texts,
                       model_names=('all-MiniLM-L6-v2', 'bge-small-en')):
        """Encode ``texts`` with each named model and summarise the result.

        Args:
            texts: Sequence of strings handed to each model's ``encode``.
            model_names: Keys of ``self.models`` to evaluate. The default
                is an immutable tuple so it cannot be mutated between calls.

        Returns:
            A dict mapping each model name to
            ``{'embedding_shape': ..., 'avg_norm': ...}``, where
            ``avg_norm`` is the mean L2 norm taken along axis 1 of the
            embeddings.

        Raises:
            KeyError: If a name is not a key of ``self.models``.
        """
        results = {}
        for name in model_names:
            try:
                model = self.models[name]
            except KeyError:
                raise KeyError(
                    f"unknown model {name!r}; available: {sorted(self.models)}"
                ) from None
            embeddings = model.encode(texts)
            results[name] = {
                'embedding_shape': embeddings.shape,
                'avg_norm': np.linalg.norm(embeddings, axis=1).mean(),
            }
        return results
# Comparison example
comparison = EmbeddingComparison()
texts = ["문서 1 내용", "문서 2 내용"]
# No model_names argument is passed, so compare_models uses its default names.
results = comparison.compare_models(texts)
print(results)
3.2 성능 기준: 속도 vs 정확도
import time
def benchmark_embedding_model(model, texts, iterations=10):
    """Time repeated ``model.encode(texts)`` calls.

    Args:
        model: Object exposing ``encode(texts)`` whose result has a
            ``shape`` attribute.
        texts: Batch passed unchanged to every ``encode`` call.
        iterations: Number of timed runs; must be at least 1.

    Returns:
        A dict with ``'avg_time'`` (mean seconds per call),
        ``'throughput'`` (``len(texts) / avg_time``) and
        ``'embedding_size'`` (the ``shape`` of the last result).

    Raises:
        ValueError: If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    times = []
    embeddings = None
    for _ in range(iterations):
        # perf_counter is monotonic and high-resolution, unlike the wall
        # clock, so short runs are measured reliably.
        start = time.perf_counter()
        embeddings = model.encode(texts)
        times.append(time.perf_counter() - start)

    avg_time = sum(times) / len(times)
    # Guard against a zero reading from a very fast call.
    throughput = len(texts) / avg_time if avg_time > 0 else float("inf")
    return {
        'avg_time': avg_time,
        'throughput': throughput,
        'embedding_size': embeddings.shape,
    }
# Fast model vs. accurate model comparison
fast_model = SentenceTransformer('all-MiniLM-L6-v2')
accurate_model = SentenceTransformer('all-mpnet-base-v2')
# `texts` is not defined in this snippet; presumably the sample list from
# the earlier comparison example is reused -- confirm.
fast_benchmark = benchmark_embedding_model(fast_model, texts)
accurate_benchmark = benchmark_embedding_model(accurate_model, texts)
4. 벡터 데이터베이스 비교: Chroma vs Qdrant vs pgvector vs Milvus
4.1 Chroma (로컬용)
import chromadb
from chromadb import Client
class ChromaVectorDB:
    """Thin wrapper around a local, on-disk Chroma collection."""

    def __init__(self, path="./chroma_db"):
        """Open (or create) the ``rag_docs`` collection stored under ``path``.

        ``chromadb.PersistentClient`` is used because the generic
        ``Client`` factory takes a settings object, not a directory path.
        """
        self.client = chromadb.PersistentClient(path=path)
        self.collection = self.client.get_or_create_collection("rag_docs")

    def add_documents(self, documents, embeddings, ids):
        """Add documents with their embeddings and ids to the collection.

        The three arguments are forwarded as-is to ``collection.add``.
        """
        self.collection.add(
            embeddings=embeddings,
            documents=documents,
            ids=ids,
        )

    def search(self, query_embedding, k=5):
        """Return up to ``k`` matches for the first query embedding.

        Args:
            query_embedding: Passed to the collection as ``query_embeddings``.
            k: Number of results requested.

        Returns:
            A list of ``{"text": document, "score": distance}`` dicts.
            Only the first query's results (index 0) are read, and
            ``"score"`` carries the value the collection reports under
            ``'distances'``.
        """
        results = self.collection.query(
            query_embeddings=query_embedding,
            n_results=k,
        )
        documents = results['documents'][0]
        distances = results['distances'][0]
        return [
            {"text": doc, "score": score}
            for doc, score in zip(documents, distances)
        ]
4.2 Qdrant
from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition, MatchValue
class QdrantVectorDB:
    """Thin wrapper around a Qdrant collection named ``rag_docs``."""

    def __init__(self, host="localhost", port=6333):
        """Create a client for the Qdrant server at ``host``:``port``."""
        self.client = QdrantClient(host=host, port=port)
        self.collection_name = "rag_docs"

    def create_collection(self, vector_size=384):
        """(Re)create the collection with cosine distance.

        Args:
            vector_size: Dimensionality of the stored vectors.

        NOTE(review): ``recreate_collection`` presumably replaces an
        existing collection of the same name -- confirm before calling
        this against live data.
        """
        # Local import: the file only imports the filter models at the top.
        from qdrant_client.models import Distance, VectorParams

        self.client.recreate_collection(
            collection_name=self.collection_name,
            vectors_config=VectorParams(
                size=vector_size,
                distance=Distance.COSINE,
            ),
        )

    def search(self, query_vector, k=5):
        """Return up to ``k`` nearest points for ``query_vector``.

        Returns:
            A list of dicts with ``"id"``, ``"score"`` and ``"payload"``
            taken from each hit, plus ``"text"`` (the payload's ``"text"``
            entry, or ``None``) so the result has the same ``'text'`` key
            that ``ChromaVectorDB.search`` returns and ``BasicRAG`` reads.
        """
        hits = self.client.search(
            collection_name=self.collection_name,
            query_vector=query_vector,
            limit=k,
        )
        return [
            {
                "id": hit.id,
                "score": hit.score,
                "payload": hit.payload,
                # The payload may be None when a point stores none.
                "text": (hit.payload or {}).get("text"),
            }
            for hit in hits
        ]
4.3 pgvector (PostgreSQL 확장)
import psycopg2
from psycopg2.extras import Json
class PGVectorDB:
    """Vector store backed by PostgreSQL with the pgvector extension."""

    def __init__(self, connection_string):
        """Connect to PostgreSQL and make sure the table and index exist."""
        self.conn = psycopg2.connect(connection_string)
        self.create_table()

    @staticmethod
    def _to_vector_literal(vector):
        """Format a flat sequence of numbers as a pgvector text literal."""
        return "[" + ",".join(str(float(value)) for value in vector) + "]"

    def create_table(self):
        """Create the extension, the ``rag_documents`` table and its index.

        Every statement uses ``IF NOT EXISTS``, so calling this repeatedly
        is harmless.
        """
        with self.conn.cursor() as cur:
            # The VECTOR column type only exists once the extension is on.
            cur.execute("CREATE EXTENSION IF NOT EXISTS vector")
            cur.execute("""
                CREATE TABLE IF NOT EXISTS rag_documents (
                    id UUID PRIMARY KEY,
                    text TEXT,
                    embedding VECTOR(384),
                    metadata JSONB
                )
            """)
            cur.execute("""
                CREATE INDEX IF NOT EXISTS idx_embedding ON rag_documents
                USING ivfflat (embedding vector_cosine_ops)
            """)
        self.conn.commit()

    def search(self, query_vector, k=5):
        """Return the ``k`` rows closest to ``query_vector`` by cosine distance.

        Args:
            query_vector: Flat sequence of numbers (one vector, not a batch).
            k: Maximum number of rows to return.

        Returns:
            The rows from ``cursor.fetchall()`` as
            ``(id, text, embedding, metadata, similarity)``, most similar
            first, where ``similarity`` is ``1 - cosine_distance``.
        """
        vector_literal = self._to_vector_literal(query_vector)
        with self.conn.cursor() as cur:
            # `<=>` is pgvector's cosine-distance operator and matches the
            # vector_cosine_ops index; `<->` would be L2 distance. Ordering
            # by the raw distance expression lets the index be used.
            cur.execute("""
                SELECT id, text, embedding, metadata,
                       1 - (embedding <=> %s::vector) AS similarity
                FROM rag_documents
                ORDER BY embedding <=> %s::vector
                LIMIT %s
            """, (vector_literal, vector_literal, k))
            return cur.fetchall()
4.4 Milvus (분산 벡터 DB)
python
from pymilvus import Collection, FieldSchema, DataType, connections
class MilvusVectorDB:
def __init__(self, host="localhost", port=19530):
connections.connect("default", host=host, port=
---
📥 **Get the full guide on Gumroad**: https://gumroad.com/l/auto ($7)
Top comments (0)