RAG 시스템 실전 구축 (v16)
1. RAG 시스템 기초 개념
RAG (Retrieval-Augmented Generation) 시스템은 질의마다 Retrieval → Augmentation → Generation의 세 단계를 순서대로 수행합니다:
# RAG 기본 루프 구현
class RAGPipeline:
    """Minimal retrieve -> augment -> generate pipeline.

    The three collaborators are duck-typed. This class relies only on the
    calls it makes itself: ``embedding_model.encode(query)``,
    ``vector_db.search(embedding, k=...)`` and
    ``llm.generate(context, query)``.
    """

    def __init__(self, embedding_model, vector_db, llm):
        """Store the embedding model, vector store and LLM client."""
        self.embedding_model = embedding_model
        self.vector_db = vector_db
        self.llm = llm

    def process_query(self, query):
        """Answer *query* using retrieved documents as context.

        Args:
            query: The user question, passed to the embedding model and
                forwarded unchanged to the LLM.

        Returns:
            Whatever ``llm.generate(context, query)`` returns.
        """
        # 1. Retrieval: embed the query and fetch the 5 nearest documents.
        query_embedding = self.embedding_model.encode(query)
        relevant_docs = self.vector_db.search(query_embedding, k=5)
        # 2. Augmentation: merge the retrieved documents into one context.
        augmented_context = self._augment_context(query, relevant_docs)
        # 3. Generation: hand the context and the question to the LLM.
        return self.llm.generate(augmented_context, query)

    def _augment_context(self, query, relevant_docs):
        """Join the retrieved documents into a single context string.

        The original class called this method without defining it, so every
        query raised ``AttributeError``. Each document is rendered with
        ``str()`` because the result type of ``vector_db.search`` is not
        fixed by this class; override this method for richer formatting.

        Args:
            query: The user question. Unused here (the query reaches the LLM
                as a separate argument) but kept so overrides can use it.
            relevant_docs: Iterable of retrieved documents.

        Returns:
            The documents separated by blank lines; ``""`` when none were
            retrieved.
        """
        return "\n\n".join(str(doc) for doc in relevant_docs)
2. 청킹 전략 비교
2.1 Semantic Chunking (권장)
from sentence_transformers import SentenceTransformer
import numpy as np
class SemanticChunker:
    """Group sentences into chunks bounded by a word budget.

    Sentences are found by splitting on ``'. '``. By default chunks are
    closed only when the word budget would be exceeded. When a
    ``similarity_threshold`` is given, a chunk is also closed wherever the
    cosine similarity between two neighbouring sentence embeddings drops
    below that threshold.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """Load the sentence-embedding model named *model_name*."""
        self.model = SentenceTransformer(model_name)

    @staticmethod
    def _cosine(left, right):
        """Return the cosine similarity of two vectors (0.0 if either is zero)."""
        denominator = float(np.linalg.norm(left) * np.linalg.norm(right))
        if denominator == 0.0:
            return 0.0
        return float(np.dot(left, right) / denominator)

    @staticmethod
    def _split_sentences(text):
        """Split *text* on ``'. '`` while keeping each sentence's period."""
        pieces = text.split('. ')
        last_index = len(pieces) - 1
        sentences = []
        for index, piece in enumerate(pieces):
            if not piece.strip():
                # Blank fragments (empty text, doubled separators) carry no
                # content and would otherwise become empty chunks.
                continue
            # split() consumed the '. ' separator; restore the period on
            # every sentence except the last, which keeps its own ending.
            sentences.append(piece + '.' if index < last_index else piece)
        return sentences

    def chunk_semantic(self, text, max_chunk_size=512, similarity_threshold=None):
        """Split *text* into chunks of whole sentences.

        Args:
            text: The text to chunk.
            max_chunk_size: Maximum number of whitespace-separated words per
                chunk. A single sentence longer than this becomes its own
                chunk rather than being cut.
            similarity_threshold: Optional cosine-similarity cut-off. When
                ``None`` (default) the embedding model is not called and
                only the word budget applies.

        Returns:
            A list of chunk strings; empty when *text* has no sentences.
        """
        sentences = self._split_sentences(text)
        if not sentences:
            return []

        embeddings = None
        if similarity_threshold is not None:
            # Only pay for encoding when the embeddings are actually used.
            embeddings = np.asarray(self.model.encode(sentences), dtype=float)

        chunks = []
        current_chunk = []
        current_length = 0
        for index, sentence in enumerate(sentences):
            word_count = len(sentence.split())
            over_budget = current_length + word_count > max_chunk_size
            topic_shift = (
                embeddings is not None
                and index > 0
                and self._cosine(embeddings[index - 1], embeddings[index])
                < similarity_threshold
            )
            # Never flush an empty chunk: an oversize first sentence used to
            # emit a spurious '' entry.
            if current_chunk and (over_budget or topic_shift):
                chunks.append(' '.join(current_chunk))
                current_chunk = []
                current_length = 0
            current_chunk.append(sentence)
            current_length += word_count

        chunks.append(' '.join(current_chunk))
        return chunks
# Usage example: chunk a short two-sentence passage and print the result.
chunker = SemanticChunker()
text = (
    "RAG 시스템은 정보 검색과 생성을 결합합니다. "
    "이 시스템은 문서에서 관련 정보를 검색하고, 이를 기반으로 LLM이 응답을 생성합니다."
)
chunks = chunker.chunk_semantic(text)
print(chunks)
2.2 Recursive Chunking
class RecursiveChunker:
    """Cut text into fixed-size character windows that overlap.

    Despite the name, the splitting is a plain sliding window over
    characters: each chunk starts ``chunk_size - overlap`` characters after
    the previous one.
    """

    def __init__(self, chunk_size=512, overlap=50):
        """Store the window settings.

        Args:
            chunk_size: Maximum number of characters per chunk (> 0).
            overlap: Characters shared by consecutive chunks
                (0 <= overlap < chunk_size).

        Raises:
            ValueError: If the settings could never advance the window.
        """
        self.chunk_size = chunk_size
        self.overlap = overlap
        self._check_window()

    def _check_window(self):
        """Reject settings under which the window would not move forward."""
        if self.chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if not 0 <= self.overlap < self.chunk_size:
            raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    def chunk_recursive(self, text):
        """Return the overlapping character chunks of *text*.

        Args:
            text: The string to split.

        Returns:
            A list of substrings, each at most ``chunk_size`` long; empty
            for empty *text*. The final chunk ends at the end of *text*.

        Raises:
            ValueError: If ``chunk_size``/``overlap`` were changed to
                invalid values after construction.
        """
        # Attributes are public and mutable, so re-check before looping.
        self._check_window()
        step = self.chunk_size - self.overlap
        chunks = []
        for start in range(0, len(text), step):
            end = start + self.chunk_size
            chunks.append(text[start:end])
            # Stop once a chunk reaches the end of the text. The original
            # loop rewound to ``end - overlap`` here and never terminated.
            if end >= len(text):
                break
        return chunks
3. 임베딩 모델 선택 가이드
# 임베딩 모델 비교 코드
import time
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
class EmbeddingBenchmark:
    """Time several sentence-embedding models on the same inputs."""

    # NOTE(review): confirm that every name below resolves to a published
    # model before running; a name that does not will fail in __init__.
    DEFAULT_MODEL_NAMES = (
        'all-MiniLM-L6-v2',
        'all-mpnet-base-v2',
        'sentence-t5-encoder',
        'bert-base-nli-mean-tokens',
    )

    def __init__(self, model_names=None):
        """Load one ``SentenceTransformer`` per model name.

        Args:
            model_names: Optional iterable of model names to benchmark.
                Defaults to ``DEFAULT_MODEL_NAMES``.
        """
        names = self.DEFAULT_MODEL_NAMES if model_names is None else model_names
        self.models = {name: SentenceTransformer(name) for name in names}

    def benchmark_models(self, texts):
        """Encode *texts* with every model and collect basic metrics.

        Args:
            texts: Sequence of strings to embed.

        Returns:
            A dict keyed by model name. Each value holds ``'latency'``
            (seconds for one ``encode`` call), ``'memory_usage'`` (bytes,
            assuming 4-byte float32 values) and ``'dimensions'`` (length of
            one embedding, or 0 when *texts* is empty).
        """
        results = {}
        for name, model in self.models.items():
            # perf_counter is monotonic and high-resolution; time.time() can
            # jump when the system clock is adjusted.
            started = time.perf_counter()
            embeddings = model.encode(texts)
            elapsed = time.perf_counter() - started

            count = len(embeddings)
            # Guard the empty case: indexing embeddings[0] would raise.
            dimensions = len(embeddings[0]) if count else 0
            results[name] = {
                'latency': elapsed,
                'memory_usage': count * dimensions * 4,  # float32 assumed
                'dimensions': dimensions,
            }
        return results
# Compare the configured models on two short sample sentences.
benchmark = EmbeddingBenchmark()
sample_texts = [
    "RAG 시스템은 검색 기반 생성 시스템입니다.",
    "이 시스템은 문서 정보를 활용해 응답을 생성합니다.",
]
results = benchmark.benchmark_models(sample_texts)
권장 모델: all-MiniLM-L6-v2 (속도-정확도 균형 최적)
4. 벡터 데이터베이스 비교
# Chroma vs Qdrant vs pgvector vs Milvus
import chromadb
from qdrant_client import QdrantClient
import psycopg2
class VectorDBComparison:
    """Hold clients for Chroma, Qdrant and PostgreSQL and insert into the first two."""

    def __init__(self):
        """Create the Chroma collection and open the Qdrant and PostgreSQL clients."""
        # Chroma (local)
        self.chroma_client = chromadb.Client()
        self.chroma_collection = self.chroma_client.get_or_create_collection("rag_test")
        # Qdrant (cloud/local)
        self.qdrant_client = QdrantClient(host="localhost", port=6333)
        # pgvector (PostgreSQL extension)
        # NOTE(review): credentials are hard-coded for the demo; load them
        # from configuration or the environment before using this anywhere
        # other than a local sandbox.
        self.pg_conn = psycopg2.connect(
            host="localhost",
            database="rag_db",
            user="postgres",
            password="password"
        )

    def close(self):
        """Close the PostgreSQL connection opened by ``__init__``."""
        self.pg_conn.close()

    @staticmethod
    def _to_plain_list(vector):
        """Return *vector* as a plain list, whether it is an array or a sequence."""
        tolist = getattr(vector, "tolist", None)
        return tolist() if callable(tolist) else list(vector)

    def chroma_insert(self, documents, embeddings):
        """Add *documents* with their *embeddings* to the Chroma collection.

        IDs are ``doc_0 .. doc_{n-1}`` in input order. *embeddings* may be
        ``None``, which is forwarded unchanged.
        """
        vectors = None
        if embeddings is not None:
            vectors = [self._to_plain_list(vector) for vector in embeddings]
        self.chroma_collection.add(
            documents=documents,
            embeddings=vectors,
            ids=[f"doc_{index}" for index, _ in enumerate(documents)],
        )

    def qdrant_insert(self, documents, embeddings):
        """Upsert *documents* with their *embeddings* into ``rag_test``.

        Raises:
            ValueError: If the two inputs differ in length; ``zip`` would
                otherwise drop the surplus items silently.
        """
        if len(documents) != len(embeddings):
            raise ValueError("documents and embeddings must have the same length")
        # NOTE(review): points are sent as plain dicts, as in the original;
        # confirm the installed qdrant-client accepts dicts here.
        points = [
            {
                "id": index,
                # Works for arrays and plain sequences; the original called
                # .tolist() unconditionally and failed on list inputs.
                "vector": self._to_plain_list(vector),
                "payload": {"text": document},
            }
            for index, (document, vector) in enumerate(zip(documents, embeddings))
        ]
        self.qdrant_client.upsert(collection_name="rag_test", points=points)
# 성능 비교 테스트
def benchmark_vector_dbs():
    """Build the synthetic workload for a vector-store comparison.

    Creates 3000 sample documents and one random 384-dimensional vector per
    document. No database is contacted and nothing is returned; the timings
    listed below are figures quoted by the article, not measurements taken
    by this function.
    """
    base_docs = ["문서 내용 1", "문서 내용 2", "문서 내용 3"]
    sample_docs = base_docs * 1000
    embeddings = [np.random.rand(384) for _ in sample_docs]
    # Quoted per-operation timings for each store:
    #   Chroma:   20ms/insert, 5ms/search
    #   Qdrant:   15ms/insert, 3ms/search
    #   pgvector: 25ms/insert, 7ms/search
    #   Milvus:   18ms/insert, 4ms/search
권장: Chroma (로컬 개발), Qdrant (프로덕션 환경)
5. 전체 RAG 파이프라인 구현
python
# 완전한 RAG 파이프라인
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from typing import List, Dict
class FullRAGPipeline:
    """End-to-end RAG example: index documents, retrieve, and answer.

    Documents are embedded with a ``SentenceTransformer`` model and stored
    in a Chroma collection named ``"documents"``. Responses are memoised per
    query string until new documents are added.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """Load the embedding model and open the Chroma collection."""
        self.embedding_model = SentenceTransformer(model_name)
        self.vector_db = chromadb.Client()
        self.collection = self.vector_db.get_or_create_collection("documents")
        self.cache = {}

    def add_documents(self, documents: List[str]):
        """Embed *documents* and store them in the collection.

        IDs continue from the current collection size. The original code
        restarted at ``doc_0`` on every call, so a second call reused the
        IDs of the first. Adding documents also clears the response cache,
        because cached answers were built from the old index.

        Args:
            documents: Texts to index. An empty list is a no-op.
        """
        if not documents:
            return
        embeddings = self.embedding_model.encode(documents)
        first_id = self.collection.count()
        self.collection.add(
            documents=documents,
            embeddings=embeddings.tolist(),
            ids=[f"doc_{i}" for i in range(first_id, first_id + len(documents))],
        )
        self.cache.clear()

    def search_documents(self, query: str, k: int = 5) -> List[Dict]:
        """Return up to *k* stored documents closest to *query*.

        Args:
            query: The search text.
            k: Maximum number of results to return.

        Returns:
            A list of ``{'document': ..., 'distance': ...}`` dicts in the
            order the collection returns them; empty when nothing has been
            indexed.
        """
        available = self.collection.count()
        if available == 0:
            return []
        query_embedding = self.embedding_model.encode([query])
        results = self.collection.query(
            query_embeddings=query_embedding.tolist(),
            # Never ask for more neighbours than the collection holds.
            n_results=min(k, available),
            include=['documents', 'distances'],
        )
        # Results are nested per query; index 0 is our single query.
        return [
            {'document': doc, 'distance': dist}
            for doc, dist in zip(results['documents'][0], results['distances'][0])
        ]

    def generate_response(self, query: str, context: str) -> str:
        """Return a placeholder answer that echoes *query* and *context*.

        A real implementation would call an LLM API here.
        """
        return f"질문: {query}\n문맥: {context}\n응답: 답변 내용"

    def process_query(self, query: str) -> str:
        """Run retrieval and generation for *query*, using the cache.

        Args:
            query: The user question; also the cache key.

        Returns:
            The generated (or previously cached) response string.
        """
        # Serve repeated questions from the cache.
        if query in self.cache:
            return self.cache[query]
        # Retrieve the top 3 documents and join them into the context.
        relevant_docs = self.search_documents(query, k=3)
        context = "\n".join(doc['document'] for doc in relevant_docs)
        response = self.generate_response(query, context)
        self.cache[query] = response
        return response
# Usage example: build the pipeline and index three sample documents.
rag = FullRAGPipeline()
rag.add_documents(
    [
        "RAG 시스템은 검색 기반 생성 방식입니다.",
        "문서 검색은 임베딩 기반 유사도 비교로 수행됩니다.",
        "LLM은 검색된 문서를 기반으로 응답을 생성합니다.",
    ]
)
response = rag.process_query("
---
📥 **Get the full guide on Gumroad**: https://gumroad.com/l/auto ($7)
Top comments (0)