RAG 시스템 실전 구축 (v27)
1. RAG 기초 개념
Retrieval-Augmented Generation (RAG)는 정보 검색과 생성 모델을 결합하여, 기존 지식 베이스를 활용해 정확한 답변을 생성하는 시스템입니다. RAG의 핵심 루프는 다음과 같습니다:
- 검색 (Retrieval): 사용자 쿼리와 유사한 문서 조각 찾기
- 보강 (Augmentation): 검색된 문서를 프롬프트에 통합
- 생성 (Generation): LLM이 보강된 프롬프트를 기반으로 답변 생성
# 간단한 RAG 루프 구현
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
class SimpleRAG:
    """Minimal retrieve-then-generate loop.

    Args:
        embeddings: Object exposing ``encode(list_of_texts)``; the first
            element of its result is used as the query embedding.
        documents: Indexable collection of document strings. The values
            returned by ``vector_db.search`` are used as keys into it.
        vector_db: Object exposing ``search(query_embedding, k)``.
        llm: Object exposing ``generate(prompt)``. Optional so existing
            three-argument callers keep working, but it must be supplied
            (or assigned to ``self.llm``) before ``generate`` is called.
    """

    def __init__(self, embeddings, documents, vector_db, llm=None):
        self.embeddings = embeddings
        self.documents = documents
        self.vector_db = vector_db
        # The original never stored an LLM, so generate() always failed with
        # AttributeError; keep the attribute defined even when unset.
        self.llm = llm

    def retrieve(self, query, k=3):
        """Return the documents selected by the vector store for ``query``.

        Args:
            query: Query text to embed.
            k: Number of results requested from ``vector_db.search``.

        Returns:
            list: ``self.documents[i]`` for each ``i`` returned by the search.
        """
        query_embedding = self.embeddings.encode([query])[0]
        retrieved_indices = self.vector_db.search(query_embedding, k)
        return [self.documents[i] for i in retrieved_indices]

    def generate(self, query, retrieved_docs):
        """Build a prompt from the retrieved documents and ask the LLM.

        Args:
            query: The user's question.
            retrieved_docs: Iterable of document strings used as context.

        Returns:
            Whatever ``self.llm.generate(prompt)`` returns.

        Raises:
            RuntimeError: If no LLM has been configured.
        """
        if self.llm is None:
            raise RuntimeError(
                "SimpleRAG.generate() requires an LLM; pass llm=... to the constructor"
            )
        # Simple prompt layout: question, newline-joined context, answer cue.
        context = "\n".join(retrieved_docs)
        prompt = f"질문: {query}\n문맥: {context}\n답변:"
        return self.llm.generate(prompt)
2. Chunking 전략
문서를 적절한 크기로 나누는 것이 중요합니다. 세 가지 주요 전략:
Semantic Chunking
의미 기반으로 문서 분할
from sentence_transformers import SentenceTransformer
import numpy as np
class SemanticChunker:
    """Group consecutive sentences into chunks by embedding similarity.

    Args:
        model_name: Name passed to ``SentenceTransformer`` to load the
            sentence-embedding model.
    """

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)

    @staticmethod
    def _cosine(a, b):
        """Return the cosine similarity of two 1-D vectors (0.0 if either is all zeros)."""
        denom = float(np.linalg.norm(a) * np.linalg.norm(b))
        if denom == 0.0:
            return 0.0
        return float(np.dot(a, b) / denom)

    def chunk_by_semantic(self, text, threshold=0.75):
        """Split ``text`` on '.' and merge neighbouring sentences that are similar.

        Each sentence is compared with the FIRST sentence of the chunk
        currently being built; when the similarity is not above
        ``threshold`` the chunk is closed and a new one starts.

        Args:
            text: Input text; sentences are delimited by '.'.
            threshold: Similarity a sentence must exceed to stay in the
                current chunk.

        Returns:
            list[str]: Chunks with their sentences re-joined by '.', and
            surrounding whitespace removed. Empty when ``text`` contains no
            non-blank sentence.
        """
        # Blank fragments (e.g. the '' after a trailing '.') would otherwise be
        # embedded and could end up as empty chunks.
        sentences = [s for s in text.split('.') if s.strip()]
        if not sentences:
            return []

        embeddings = np.asarray(self.model.encode(sentences), dtype=float)

        chunks = []
        current_chunk = [sentences[0]]
        anchor_embedding = embeddings[0]
        for sentence, embedding in zip(sentences[1:], embeddings[1:]):
            if self._cosine(anchor_embedding, embedding) > threshold:
                current_chunk.append(sentence)
            else:
                chunks.append('.'.join(current_chunk).strip())
                current_chunk = [sentence]
                # The anchor only moves when a new chunk starts.
                anchor_embedding = embedding
        chunks.append('.'.join(current_chunk).strip())
        return chunks
Recursive Chunking
재귀적으로 작은 조각으로 분할 (아래 예시 코드는 재귀 호출 대신, 고정 크기 윈도우를 overlap만큼 겹치며 이동하는 단순화된 방식입니다)
def recursive_chunking(text, max_chunk_size=512, overlap=50):
    """Split ``text`` into fixed-size windows that overlap their predecessor.

    Despite the name, this is an iterative sliding window: each chunk is at
    most ``max_chunk_size`` characters and starts ``overlap`` characters
    before the previous chunk ended.

    Args:
        text: String to split.
        max_chunk_size: Maximum number of characters per chunk.
        overlap: Number of characters shared between consecutive chunks;
            must be smaller than ``max_chunk_size``.

    Returns:
        list[str]: ``[text]`` when it already fits in one chunk, otherwise
        the overlapping chunks in order; the last chunk ends at the end of
        ``text``.

    Raises:
        ValueError: If ``text`` needs splitting and ``max_chunk_size`` is not
            positive or ``overlap >= max_chunk_size`` (the window could
            never advance).
    """
    if len(text) <= max_chunk_size:
        return [text]
    if max_chunk_size <= 0:
        raise ValueError("max_chunk_size must be positive")
    if overlap >= max_chunk_size:
        raise ValueError("overlap must be smaller than max_chunk_size")

    chunks = []
    start = 0
    while True:
        end = min(start + max_chunk_size, len(text))
        chunks.append(text[start:end])
        # Stop once the window reaches the end of the text. Stepping back by
        # `overlap` from here would keep `start < len(text)` forever, which is
        # the infinite loop the original had for any overlap > 0.
        if end >= len(text):
            break
        start = end - overlap
    return chunks
Agentic Chunking
LLM 기반 자동 분할
from openai import OpenAI
class AgenticChunker:
    """Ask a chat-completion model to split text into chunks.

    Args:
        client: Object exposing ``chat.completions.create(...)`` (the file
            imports ``OpenAI`` for this purpose).
        model: Model name sent with each request; defaults to the
            previously hard-coded ``"gpt-4o"``.
    """

    def __init__(self, client, model="gpt-4o"):
        self.client = client
        self.model = model

    def chunk_with_llm(self, text, max_tokens=200):
        """Return the chunks proposed by the model, one per non-blank line.

        Args:
            text: Text to split.
            max_tokens: Per-chunk token budget mentioned in the prompt. It
                is NOT the API ``max_tokens`` value, which stays fixed at
                1000 for the whole response.

        Returns:
            list[str]: Stripped, non-empty lines of the model's reply; empty
            when the reply has no text content.
        """
        prompt = (
            "\n"
            f"다음 텍스트를 {max_tokens} 토큰 이내로 분할해주세요:\n"
            f"{text}\n"
            "분할된 각 조각은 의미가 완전한 문장이어야 하며,\n"
            "최대한 의미를 유지하면서 분할해주세요.\n"
        )
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=1000,
        )
        # The message content can be None (e.g. no text in the reply), and
        # the reply usually contains blank separator lines; neither should
        # surface as a chunk.
        content = response.choices[0].message.content or ""
        return [line.strip() for line in content.splitlines() if line.strip()]
3. 임베딩 모델 선택 및 비교
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
import torch
class EmbeddingBenchmark:
    """Compare embedding models on size, memory footprint and throughput.

    Args:
        models: Optional mapping of display name -> model object exposing
            ``encode(texts)``. When omitted, every name in
            ``DEFAULT_MODEL_NAMES`` is loaded with ``SentenceTransformer``.
    """

    # NOTE(review): 'sentence-t5-encoder' does not look like a published
    # checkpoint name -- confirm it resolves before relying on the default set.
    DEFAULT_MODEL_NAMES = (
        'all-MiniLM-L6-v2',
        'all-mpnet-base-v2',
        'BAAI/bge-small-en',
        'sentence-t5-encoder',
    )

    def __init__(self, models=None):
        if models is None:
            models = {
                name: SentenceTransformer(name)
                for name in self.DEFAULT_MODEL_NAMES
            }
        self.models = dict(models)

    def evaluate_models(self, texts):
        """Encode ``texts`` with every model and collect basic statistics.

        Args:
            texts: List of strings to embed.

        Returns:
            dict: ``{model_name: {'embedding_size', 'memory_usage', 'speed'}}``
            where ``embedding_size`` is ``embeddings.shape[1]``,
            ``memory_usage`` is the embedding array size in MB, and
            ``speed`` is the measured throughput in texts per second
            (previously a fixed placeholder of ``len(texts) / 1000``).
        """
        import time  # local import: only this method needs a timer

        results = {}
        for name, model in self.models.items():
            started = time.perf_counter()
            embeddings = model.encode(texts)
            elapsed = time.perf_counter() - started
            results[name] = {
                'embedding_size': embeddings.shape[1],
                'memory_usage': embeddings.nbytes / (1024**2),  # MB
                # Guard against a timer resolution too coarse to measure.
                'speed': len(texts) / elapsed if elapsed > 0 else float('inf'),
            }
        return results
# Usage example. These statements run at module level: they construct the
# benchmark and evaluate it on three sample texts, leaving `benchmark`,
# `texts` and `results` as module-level names.
benchmark = EmbeddingBenchmark()
texts = ["문서 1 내용", "문서 2 내용", "문서 3 내용"]
results = benchmark.evaluate_models(texts)
4. 벡터 데이터베이스 비교
Chroma
import chromadb
from chromadb import Client
class ChromaVectorDB:
    """Thin wrapper around a single Chroma collection.

    Args:
        collection_name: Name handed to ``get_or_create_collection``.
    """

    def __init__(self, collection_name="rag_collection"):
        chroma_client = Client()
        self.client = chroma_client
        self.collection = chroma_client.get_or_create_collection(collection_name)

    def add_documents(self, documents, embeddings, ids):
        """Forward documents, their embeddings and ids to ``collection.add``."""
        record = dict(documents=documents, embeddings=embeddings, ids=ids)
        self.collection.add(**record)

    def search(self, query_embedding, n_results=5):
        """Query the collection with one embedding and return the matching ids.

        Args:
            query_embedding: Single query vector; it is wrapped in a list
                before being sent.
            n_results: Number of results requested.

        Returns:
            The first entry of the response's ``'ids'`` field (presumably one
            entry per query embedding -- only one is sent here).
        """
        response = self.collection.query(
            n_results=n_results,
            query_embeddings=[query_embedding],
        )
        ids_per_query = response['ids']
        return ids_per_query[0]
Qdrant
from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition, MatchValue
class QdrantVectorDB:
    """Thin wrapper around a single Qdrant collection.

    Args:
        host: Qdrant server host.
        port: Qdrant server port.
        collection_name: Collection to (re)create and use.
        vector_size: Dimensionality of the stored vectors. Defaults to the
            previously hard-coded 384; set it to match the embedding model.
    """

    def __init__(self, host="localhost", port=6333,
                 collection_name="rag_collection", vector_size=384):
        self.client = QdrantClient(host=host, port=port)
        self.collection_name = collection_name
        # Create the collection.
        # NOTE(review): recreate_collection runs on every construction, which
        # presumably drops any data already stored under this name -- confirm
        # that is intended before pointing this at a shared server.
        self.client.recreate_collection(
            collection_name=collection_name,
            vectors_config={"size": vector_size, "distance": "Cosine"},
        )

    @staticmethod
    def _to_list(vector):
        """Return ``vector`` as a plain list (accepts ndarrays and sequences)."""
        to_list = getattr(vector, "tolist", None)
        return to_list() if callable(to_list) else list(vector)

    def add_documents(self, documents, embeddings, ids):
        """Upsert one point per (document, embedding) pair.

        Args:
            documents: Document texts, stored as payload ``{"text": doc}``.
            embeddings: One vector per document (ndarray rows or lists).
            ids: Accepted for signature parity with the other vector-store
                wrapper but NOT used: point ids are the positional index of
                each document within this call.
        """
        points = [
            {
                "id": idx,
                "vector": self._to_list(embedding),
                "payload": {"text": doc},
            }
            for idx, (doc, embedding) in enumerate(zip(documents, embeddings))
        ]
        self.client.upsert(collection_name=self.collection_name, points=points)

    def search(self, query_embedding, n_results=5):
        """Return the ids of the points returned for ``query_embedding``.

        Args:
            query_embedding: Query vector (ndarray or list).
            n_results: Maximum number of hits requested.

        Returns:
            list: ``point.id`` for each hit, in the order returned.
        """
        results = self.client.search(
            collection_name=self.collection_name,
            query_vector=self._to_list(query_embedding),
            limit=n_results,
        )
        return [point.id for point in results]
5. 완전한 RAG 파이프라인 구현
python
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb import Client
class CompleteRAGPipeline:
def __init__(self, model_name="all-MiniLM-L6-v2", llm_name="llama-2-7b",
             collection_name="rag_docs"):
    """Load the embedding model, open the vector store and load the LLM.

    Args:
        model_name: SentenceTransformer model used for embeddings.
        llm_name: Checkpoint passed to ``from_pretrained`` for both the
            tokenizer and the causal LM. Defaults to the previously
            hard-coded value.
        collection_name: Chroma collection holding the documents.
            Defaults to the previously hard-coded value.
    """
    # 1. Embedding model.
    self.embedding_model = SentenceTransformer(model_name)
    # 2. Vector database.
    self.client = Client()
    self.collection = self.client.get_or_create_collection(collection_name)
    # 3. LLM (LLaMA used as an example). Imported here so that transformers
    #    is only required when a pipeline is actually constructed.
    # NOTE(review): "llama-2-7b" does not look like a full Hub repository id;
    # confirm it resolves (or points at a local directory) in your setup.
    from transformers import LlamaTokenizer, LlamaForCausalLM
    self.tokenizer = LlamaTokenizer.from_pretrained(llm_name)
    self.model = LlamaForCausalLM.from_pretrained(llm_name)
def setup_document_store(self, documents):
    """Embed ``documents`` and add them to the collection.

    Ids are generated as ``doc_0``, ``doc_1``, ... from each document's
    position in ``documents``. A confirmation line is printed afterwards.
    """
    vectors = self.embedding_model.encode(documents)
    doc_count = len(documents)
    # Assemble the whole record first, then store it in a single call.
    record = {
        "documents": documents,
        "embeddings": vectors.tolist(),
        "ids": [f"doc_{n}" for n in range(doc_count)],
    }
    self.collection.add(**record)
    print(f"총 {doc_count}개 문서가 저장되었습니다.")
def retrieve(self, query, k=3):
    """Return the stored documents retrieved for ``query``.

    Args:
        query: Query text to embed.
        k: Number of results requested from the collection.

    Returns:
        The first entry of the response's ``'documents'`` field (one query
        is sent, so this is that query's result list).
    """
    query_embedding = self.embedding_model.encode([query])[0]
    # Send a plain list, as setup_document_store does with `.tolist()`;
    # presumably not every Chroma version accepts raw ndarrays here.
    to_list = getattr(query_embedding, "tolist", None)
    query_vector = to_list() if callable(to_list) else list(query_embedding)
    results = self.collection.query(
        query_embeddings=[query_vector],
        n_results=k,
    )
    return results['documents'][0]
def generate_response(self, query, retrieved_docs):
"""답변 생성"""
context = "\n".join(retrieved_docs)
prompt = f"""
질문: {query}
문맥: {context}
답변:
"""
# 토크나이저로 프롬프트 인
---
📥 **Get the full guide on Gumroad**: https://gumroad.com/l/auto ($7)
Top comments (0)