RAG 시스템 실전 구축 (v44)
1. RAG 시스템의 핵심 구성 요소
RAG(Retrieval-Augmented Generation) 시스템은 검색 기반 생성을 위한 핵심 아키텍처입니다. 이 시스템은 세 가지 주요 단계로 구성됩니다:
- 검색 (Retrieval): 질문과 관련된 문서 또는 문단을 검색합니다
- 증강 (Augmentation): 검색된 정보를 프롬프트에 통합합니다
- 생성 (Generation): 증강된 프롬프트를 기반으로 답변을 생성합니다
이러한 세 단계를 거쳐 LLM은 외부 지식을 활용하여 정확하고 최신의 정보를 제공할 수 있습니다.
2. 청킹 전략 (Chunking Strategies)
2.1 의미적 청킹 (Semantic Chunking)
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import numpy as np
class SemanticChunker:
    """Group the sentences of a text into chunks by clustering their embeddings.

    Sentences are embedded with a SentenceTransformer model and clustered
    with KMeans; the sentences of each cluster are joined into one chunk.
    """

    def __init__(self, model_name="all-MiniLM-L6-v2", random_state=None):
        """Load the embedding model.

        Args:
            model_name: SentenceTransformer model identifier.
            random_state: Seed forwarded to KMeans. ``None`` (the default)
                keeps KMeans' own random initialisation; pass an int to get
                reproducible chunks.
        """
        self.model = SentenceTransformer(model_name)
        self.random_state = random_state

    def chunk_by_semantic_similarity(self, text, threshold=0.7):
        """Split ``text`` into chunks of semantically related sentences.

        Args:
            text: Input text; sentences are separated on ``'. '``.
            threshold: Currently unused. Kept so existing callers that pass
                it keep working.

        Returns:
            A list of chunk strings, one per cluster, ordered by cluster
            label. Within a chunk the sentences keep their original order.
            Empty or whitespace-only text yields an empty list.
        """
        # Drop empty fragments so they are neither embedded nor clustered.
        sentences = [part for part in text.split('. ') if part.strip()]

        # Zero or one sentence needs no embedding or clustering at all.
        if len(sentences) <= 1:
            return sentences

        embeddings = self.model.encode(sentences)

        kmeans = KMeans(
            n_clusters=max(1, len(sentences) // 2),
            random_state=self.random_state,
        )
        kmeans.fit(embeddings)

        # Group by the labels that actually occur. KMeans can return fewer
        # distinct labels than n_clusters (e.g. duplicate sentences), so
        # iterating over range(number_of_labels) could emit empty chunks and
        # silently drop sentences carrying a higher label.
        grouped = {}
        for sentence, label in zip(sentences, kmeans.labels_):
            grouped.setdefault(int(label), []).append(sentence)

        return ['. '.join(grouped[label]) for label in sorted(grouped)]
2.2 재귀적 청킹 (Recursive Chunking)
class RecursiveChunker:
    """Split text into overlapping chunks that end on sentence boundaries."""

    def __init__(self, chunk_size=512, overlap=50):
        """Configure the chunker.

        Args:
            chunk_size: Target number of characters per chunk (must be > 0).
            overlap: Number of trailing characters of a chunk that are
                repeated at the start of the next one (must be >= 0).

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``overlap`` is
                negative.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if overlap < 0:
            raise ValueError("overlap must be non-negative")
        self.chunk_size = chunk_size
        self.overlap = overlap

    def recursive_split(self, text):
        """Return the list of chunks for ``text``.

        Each chunk spans ``chunk_size`` characters; if it would stop in the
        middle of a sentence it is extended up to and including the next
        ``'.'`` (when one exists). Consecutive chunks overlap by ``overlap``
        characters. Empty text yields an empty list.
        """
        chunks = []
        text_len = len(text)
        start = 0
        while start < text_len:
            end = min(start + self.chunk_size, text_len)

            # Extend to the end of the sentence when the chunk's last
            # character is not already a period.
            if end < text_len and text[end - 1] != '.':
                sentence_end = text.find('.', end)
                if sentence_end != -1:
                    end = sentence_end + 1

            chunks.append(text[start:end])

            # The whole text is consumed; stepping back by `overlap` here
            # would re-emit the tail forever.
            if end >= text_len:
                break

            # Step back by the overlap, but always move forward: if the
            # overlap is at least as large as the chunk, drop it.
            next_start = end - self.overlap
            start = next_start if next_start > start else end
        return chunks
3. 임베딩 모델 선택 및 비교
3.1 모델 성능 비교
from sentence_transformers import SentenceTransformer
import time
import numpy as np
class EmbeddingBenchmark:
    """Time how long several SentenceTransformer models take to embed texts."""

    def __init__(self):
        """Load every model to benchmark, keyed by a short display name."""
        self.models = {
            "all-MiniLM-L6-v2": SentenceTransformer("all-MiniLM-L6-v2"),
            "all-mpnet-base-v2": SentenceTransformer("all-mpnet-base-v2"),
            # gte-small is published under the "thenlper" organisation.
            "gte-small": SentenceTransformer("thenlper/gte-small"),
        }

    def benchmark_model(self, model_name, texts):
        """Embed ``texts`` with one model and report timing statistics.

        Args:
            model_name: Key into ``self.models``.
            texts: Sized collection of strings to embed.

        Returns:
            A dict with the keys ``model``, ``time`` (total seconds),
            ``avg_time_per_text`` (seconds; ``0.0`` when ``texts`` is empty)
            and ``embedding_dim`` (``0`` when the result is not 2-D).

        Raises:
            KeyError: If ``model_name`` is not a registered model.
        """
        model = self.models[model_name]

        # perf_counter is monotonic and high-resolution, unlike time.time().
        started = time.perf_counter()
        embeddings = model.encode(texts)
        elapsed = time.perf_counter() - started

        text_count = len(texts)
        shape = getattr(embeddings, "shape", ())
        return {
            "model": model_name,
            "time": elapsed,
            # Guard against dividing by zero on an empty workload.
            "avg_time_per_text": elapsed / text_count if text_count else 0.0,
            "embedding_dim": shape[1] if len(shape) > 1 else 0,
        }
# Usage example: run every registered model over the same 200-text workload
# and print the total and per-text embedding time for each one.
benchmark = EmbeddingBenchmark()
texts = ["This is a sample text for embedding", "Another example text"] * 100
results = [
    benchmark.benchmark_model(name, texts)
    for name in benchmark.models
]
for result in results:
    print(f"{result['model']}: {result['time']:.2f}s ({result['avg_time_per_text']:.4f}s/text)")
4. 벡터 데이터베이스 비교
4.1 Chroma vs Qdrant vs pgvector
# Chroma 예시
import chromadb
from chromadb.config import Settings
class ChromaVectorDB:
    """Thin wrapper around a single Chroma collection."""

    def __init__(self, collection_name="rag_collection", persist_directory="chroma_db"):
        """Open (or create) the collection.

        Args:
            collection_name: Name of the Chroma collection to use.
            persist_directory: Directory handed to Chroma for its storage.
        """
        if hasattr(chromadb, "PersistentClient"):
            # Chroma >= 0.4 rejects the legacy Settings(chroma_db_impl=...)
            # configuration; PersistentClient is its replacement.
            self.client = chromadb.PersistentClient(path=persist_directory)
        else:
            # Older Chroma releases: keep the original configuration.
            self.client = chromadb.Client(Settings(chroma_db_impl="duckdb",
                                                   persist_directory=persist_directory))
        self.collection = self.client.get_or_create_collection(collection_name)

    def add_documents(self, documents, embeddings, ids):
        """Store documents with their precomputed embeddings and ids."""
        self.collection.add(
            documents=documents,
            embeddings=embeddings,
            ids=ids,
        )

    def search(self, query_embedding, n_results=5):
        """Return Chroma's raw query result for a single query embedding.

        Args:
            query_embedding: One embedding vector.
            n_results: Maximum number of neighbours to return.
        """
        return self.collection.query(
            query_embeddings=[query_embedding],
            n_results=n_results,
        )
# Qdrant 예시
from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition, MatchValue
class QdrantVectorDB:
    """Thin wrapper around a single Qdrant collection using cosine distance."""

    def __init__(self, host="localhost", port=6333, collection_name="rag_collection",
                 vector_size=384, recreate=True):
        """Connect to Qdrant and prepare the collection.

        Args:
            host: Qdrant server host.
            port: Qdrant server port.
            collection_name: Name of the collection to use.
            vector_size: Dimensionality of the stored vectors.
            recreate: When True (the default, matching the previous
                behaviour) the collection is dropped and recreated, which
                deletes any existing points. Pass False to reuse an existing
                collection and only create it when it is missing.
        """
        from qdrant_client.models import Distance, VectorParams

        self.client = QdrantClient(host=host, port=port)
        self.collection_name = collection_name

        vectors_config = VectorParams(size=vector_size, distance=Distance.COSINE)
        if recreate:
            self.client.recreate_collection(
                collection_name=self.collection_name,
                vectors_config=vectors_config,
            )
        elif not self.client.collection_exists(self.collection_name):
            self.client.create_collection(
                collection_name=self.collection_name,
                vectors_config=vectors_config,
            )

    @staticmethod
    def _to_point_id(raw_id):
        """Map a caller-supplied id to a valid Qdrant point id.

        Qdrant accepts unsigned integers and UUIDs. Non-negative ints pass
        through, UUID strings are normalised, and any other value is mapped
        to a deterministic UUIDv5 so the same id always hits the same point.
        """
        import uuid

        if isinstance(raw_id, int) and not isinstance(raw_id, bool) and raw_id >= 0:
            return raw_id
        try:
            return str(uuid.UUID(str(raw_id)))
        except ValueError:
            return str(uuid.uuid5(uuid.NAMESPACE_OID, str(raw_id)))

    def add_documents(self, documents, embeddings, ids):
        """Upsert documents with their embeddings under the given ids.

        Args:
            documents: Document texts, stored in the payload under ``"text"``.
            embeddings: One vector per document; each must provide
                ``tolist()`` (e.g. a NumPy array).
            ids: One id per document. Previously this argument was ignored
                and points were numbered from 0, so every call overwrote the
                previous batch.
        """
        from qdrant_client.models import PointStruct

        points = [
            PointStruct(
                id=self._to_point_id(raw_id),
                vector=emb.tolist(),
                payload={"text": doc},
            )
            for raw_id, doc, emb in zip(ids, documents, embeddings)
        ]
        if not points:
            return
        self.client.upsert(
            collection_name=self.collection_name,
            points=points,
        )

    def search(self, query_embedding, limit=5):
        """Return the texts of the ``limit`` points nearest to the query.

        Args:
            query_embedding: Query vector providing ``tolist()``.
            limit: Maximum number of hits.
        """
        hits = self.client.search(
            collection_name=self.collection_name,
            query_vector=query_embedding.tolist(),
            limit=limit,
        )
        return [hit.payload["text"] for hit in hits]
# pgvector 예시
import psycopg2
from psycopg2.extras import Json
class PGVectorDB:
    """Store and search text embeddings in PostgreSQL through pgvector."""

    def __init__(self, connection_string, dim=384):
        """Connect to PostgreSQL and make sure the schema exists.

        Args:
            connection_string: psycopg2 connection string / DSN.
            dim: Dimensionality of the ``embedding`` column used when the
                table has to be created.
        """
        self.dim = int(dim)
        self.conn = psycopg2.connect(connection_string)
        self.create_table()

    def create_table(self):
        """Create the extension, the ``embeddings`` table and its index if missing."""
        # `with self.conn` commits on success and rolls back on error, so a
        # failed statement does not leave the connection in an aborted
        # transaction.
        with self.conn, self.conn.cursor() as cur:
            # The VECTOR type only exists once the extension is installed.
            cur.execute("CREATE EXTENSION IF NOT EXISTS vector")
            # int() keeps the interpolated dimension a plain number.
            cur.execute(f"""
                CREATE TABLE IF NOT EXISTS embeddings (
                    id SERIAL PRIMARY KEY,
                    text TEXT,
                    embedding VECTOR({int(self.dim)})
                )
            """)
            cur.execute("""
                CREATE INDEX IF NOT EXISTS idx_embedding ON embeddings USING ivfflat (embedding vector_cosine_ops)
            """)

    def add_documents(self, documents, embeddings):
        """Insert documents together with their embeddings.

        Args:
            documents: Document texts.
            embeddings: One vector per document; each must provide
                ``tolist()`` (e.g. a NumPy array).
        """
        rows = [(doc, emb.tolist()) for doc, emb in zip(documents, embeddings)]
        if not rows:
            return
        with self.conn, self.conn.cursor() as cur:
            cur.executemany(
                "INSERT INTO embeddings (text, embedding) VALUES (%s, %s::vector)",
                rows,
            )

    def search(self, query_embedding, limit=5):
        """Return the texts of the ``limit`` rows closest to the query.

        Uses cosine distance (``<=>``) so the ordering matches the
        ``vector_cosine_ops`` index created in ``create_table``; the
        parameter is cast explicitly because psycopg2 sends a Python list as
        a SQL array, for which pgvector defines no distance operator.

        Args:
            query_embedding: Query vector providing ``tolist()``.
            limit: Maximum number of rows.
        """
        with self.conn.cursor() as cur:
            cur.execute("""
                SELECT text FROM embeddings
                ORDER BY embedding <=> %s::vector
                LIMIT %s
            """, (query_embedding.tolist(), limit))
            return [row[0] for row in cur.fetchall()]
5. RAG 파이프라인 완전 구현
python
import os
from sentence_transformers import SentenceTransformer
from chromadb import Client
import numpy as np
class SimpleRAGPipeline:
    """Minimal retrieve-then-generate pipeline backed by a Chroma collection."""

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        """Load the embedding model and open the ``documents`` collection.

        Args:
            model_name: SentenceTransformer model identifier.
        """
        self.embedder = SentenceTransformer(model_name)
        self.chromadb_client = Client()
        self.collection = self.chromadb_client.get_or_create_collection("documents")

    def setup_pipeline(self, documents):
        """Embed the documents and store them in the vector database.

        Ids continue from the current collection size, so calling this more
        than once appends new documents instead of colliding with the ids of
        an earlier batch. An empty ``documents`` is a no-op.
        """
        documents = list(documents)
        if not documents:
            return
        embeddings = self.embedder.encode(documents)
        # NOTE: count-based ids assume documents are never deleted from the
        # collection in between calls.
        offset = self.collection.count()
        self.collection.add(
            documents=documents,
            embeddings=embeddings.tolist(),
            ids=[str(offset + i) for i in range(len(documents))],
        )

    def retrieve(self, query, top_k=3):
        """Return up to ``top_k`` stored documents most relevant to ``query``.

        The request is clamped to the number of stored documents; an empty
        collection yields an empty list without querying.
        """
        n_results = min(top_k, self.collection.count())
        if n_results <= 0:
            return []
        query_embedding = self.embedder.encode([query])
        results = self.collection.query(
            query_embeddings=query_embedding.tolist(),
            n_results=n_results,
        )
        documents = results['documents']
        return documents[0] if documents else []

    def generate_response(self, query, retrieved_docs):
        """Build the answer for ``query`` from the retrieved documents (placeholder).

        No LLM is called yet: the method returns the query together with the
        first 100 characters of the joined context.
        """
        context = "\n\n".join(retrieved_docs)
        # Prompt that would be sent to the LLM once a real call is wired in,
        # e.g. a HuggingFace pipeline:
        #   response = pipeline(prompt, max_length=200, do_sample=False)
        #   return response[0]['generated_text']
        prompt = f"다음 문맥을 바탕으로 질문에 답하세요:\n\n{context}\n\n질문: {query}"
        return f"질문: {query}\n검색된 문맥: {context[:100]}..."
# 사용
---
📥 **Get the full guide on Gumroad**: https://gumroad.com/l/auto ($7)
Top comments (0)