RAG 시스템 실전 구축 (v4)
1. RAG 기초 개념: 검색 → 보완 → 생성 루프
Retrieval-Augmented Generation(RAG)은 외부 문서를 검색해 프롬프트에 함께 제공함으로써 대규모 언어 모델(LLM)의 지식 범위를 확장하는 아키텍처입니다. RAG는 다음 세 가지 핵심 단계로 이루어집니다:
- 검색 (Retrieval): 사용자의 질문과 관련된 문서 또는 텍스트 조각을 찾습니다
- 보완 (Augmentation): 검색된 정보를 질문과 함께 LLM 입력으로 제공합니다
- 생성 (Generation): LLM이 검색된 컨텍스트와 질문을 기반으로 답변을 생성합니다
# 간단한 RAG 루프 구현
class SimpleRAG:
    """Minimal retrieve -> augment -> generate loop.

    Args:
        embedding_model: Object exposing ``encode(query)``.
        vector_db: Object exposing ``search(query_embedding, k)`` and
            returning an iterable of document strings.
        llm: Object exposing ``generate(prompt)``. Optional so existing
            two-argument construction keeps working; it can also be
            assigned later via ``rag.llm = ...``.
    """

    def __init__(self, embedding_model, vector_db, llm=None):
        self.embedding_model = embedding_model
        self.vector_db = vector_db
        # The attribute was previously never assigned, so generate() could
        # only ever fail with AttributeError.
        self.llm = llm

    def retrieve(self, query, k=5):
        """Return the ``k`` documents the vector store ranks closest to ``query``."""
        query_embedding = self.embedding_model.encode(query)
        # Pass the result count positionally: the store wrappers in this
        # file name the parameter ``top_k``, so ``k=k`` raised TypeError.
        return self.vector_db.search(query_embedding, k)

    def generate(self, query, retrieved_docs):
        """Build the prompt from ``retrieved_docs`` and ask the LLM for an answer.

        Raises:
            RuntimeError: If no LLM has been configured.
        """
        if self.llm is None:
            raise RuntimeError(
                "SimpleRAG has no LLM configured; pass llm=... to the constructor"
            )
        context = "\n".join(retrieved_docs)
        prompt = f"질문: {query}\n컨텍스트: {context}\n답변:"
        return self.llm.generate(prompt)

    def run(self, query):
        """Retrieve context for ``query`` and return the generated answer."""
        docs = self.retrieve(query)
        return self.generate(query, docs)
2. 청킹 전략: 의미론적, 재귀적, 에이전트 기반
청킹은 대용량 텍스트를 모델이 처리할 수 있는 조각으로 나누는 과정입니다:
의미론적 청킹 (Semantic Chunking)
문서의 의미적 단위를 기준으로 청킹합니다. 인접한 두 문장의 임베딩 코사인 유사도가 임계값(threshold)보다 낮아지는 지점을 의미적 경계로 보고 청크를 나눕니다:
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.cluster import KMeans
def semantic_chunking(text, model, threshold=0.75):
    """Split ``text`` into chunks at points where adjacent sentences diverge.

    The text is split on ``'. '``. Consecutive sentences stay in the same
    chunk while the cosine similarity of their embeddings is at least
    ``threshold``; a lower similarity starts a new chunk.

    Args:
        text: Input text.
        model: Object exposing ``encode(list_of_sentences)`` and returning
            one embedding vector per sentence.
        threshold: Minimum cosine similarity for two adjacent sentences to
            share a chunk.

    Returns:
        List of chunk strings; empty when ``text`` has no non-blank sentence.
    """
    separator = '. '
    sentences = [s for s in text.split(separator) if s.strip()]
    if not sentences:
        return []

    embeddings = np.asarray(model.encode(sentences), dtype=float)
    # Only adjacent pairs are compared, so the full pairwise similarity
    # matrix is never needed; per-sentence norms are enough.
    norms = np.linalg.norm(embeddings, axis=1)

    chunks = []
    current_chunk = [sentences[0]]
    for i, sentence in enumerate(sentences[1:], start=1):
        denom = norms[i - 1] * norms[i]
        # A zero-length embedding has no direction; treat it as dissimilar
        # instead of producing NaN from a division by zero.
        if denom:
            similarity = float(np.dot(embeddings[i - 1], embeddings[i]) / denom)
        else:
            similarity = 0.0

        if similarity < threshold:
            chunks.append(separator.join(current_chunk))
            current_chunk = [sentence]
        else:
            current_chunk.append(sentence)

    # Re-join with the separator that split() removed so sentence
    # boundaries survive inside a chunk.
    chunks.append(separator.join(current_chunk))
    return chunks
재귀적 청킹 (Recursive Chunking)
계층적으로 청킹을 수행합니다. 아래 구현은 텍스트가 max_chunk_size를 넘을 때 문장 단위로 나눈 뒤, 최대 길이를 넘지 않는 범위에서 문장을 차례로 묶어 청크를 만듭니다:
def recursive_chunking(text, max_chunk_size=512):
    """Pack sentences of ``text`` into chunks of at most ``max_chunk_size`` characters.

    Text that already fits is returned as a single chunk. Otherwise the text
    is split on ``'. '`` and sentences are greedily packed, re-joined with
    the same separator. A single sentence longer than ``max_chunk_size`` is
    cut into fixed-size slices so that no returned chunk exceeds the limit.

    Args:
        text: Input text.
        max_chunk_size: Maximum chunk length in characters; must be positive.

    Returns:
        List of chunk strings.

    Raises:
        ValueError: If ``max_chunk_size`` is not positive.
    """
    if max_chunk_size <= 0:
        raise ValueError("max_chunk_size must be positive")
    if len(text) <= max_chunk_size:
        return [text]

    separator = '. '
    chunks = []
    current_chunk = []
    current_length = 0  # exact length of separator.join(current_chunk)

    for sentence in text.split(separator):
        if len(sentence) > max_chunk_size:
            # The sentence cannot fit in any chunk: flush what we have and
            # fall back to hard slicing.
            if current_chunk:
                chunks.append(separator.join(current_chunk))
                current_chunk = []
                current_length = 0
            chunks.extend(
                sentence[start:start + max_chunk_size]
                for start in range(0, len(sentence), max_chunk_size)
            )
            continue

        # Count the real separator width; counting one character per
        # sentence let joined chunks overshoot the limit.
        added = len(sentence) + (len(separator) if current_chunk else 0)
        if current_length + added <= max_chunk_size:
            current_chunk.append(sentence)
            current_length += added
        else:
            chunks.append(separator.join(current_chunk))
            current_chunk = [sentence]
            current_length = len(sentence)

    if current_chunk:
        chunks.append(separator.join(current_chunk))
    return chunks
3. 임베딩 모델 선택 및 비교
여러 임베딩 모델의 인코딩 시간, 임베딩 차원, 메모리 사용량을 비교하여 용도에 맞는 모델을 선택할 수 있도록 합니다:
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
class EmbeddingBenchmark:
    """Compare sentence-embedding models on encode time and output size.

    Args:
        model_names: Optional iterable of SentenceTransformer model names to
            load. Defaults to ``DEFAULT_MODEL_NAMES``, which matches the
            previously hard-coded set.
    """

    DEFAULT_MODEL_NAMES = (
        'all-MiniLM-L6-v2',
        'all-mpnet-base-v2',
        'sentence-t5-xxl',
    )

    def __init__(self, model_names=None):
        names = self.DEFAULT_MODEL_NAMES if model_names is None else model_names
        self.models = {name: SentenceTransformer(name) for name in names}

    def benchmark_models(self, test_sentences, repeats=1):
        """Benchmark every loaded model on ``test_sentences``.

        Args:
            test_sentences: Sentences to encode.
            repeats: Number of timed encode passes averaged into ``avg_time``.

        Returns:
            Dict mapping model name to a dict with ``avg_time`` (seconds per
            pass), ``embedding_size`` and ``memory_usage`` (bytes of the
            embedding array).
        """
        results = {}
        for name, model in self.models.items():
            # This untimed pass supplies the embeddings for the size
            # statistics and runs before the timed passes below.
            embeddings = np.asarray(model.encode(test_sentences))
            avg_time = self._time_encoding(model, test_sentences, repeats)
            results[name] = {
                'avg_time': avg_time,
                'embedding_size': embeddings.shape[1],
                'memory_usage': embeddings.nbytes,
            }
        return results

    def _time_encoding(self, model, sentences, repeats=1):
        """Return the mean seconds per ``model.encode(sentences)`` over ``repeats`` runs.

        Raises:
            ValueError: If ``repeats`` is less than 1.
        """
        import time

        if repeats < 1:
            raise ValueError("repeats must be at least 1")
        # perf_counter is monotonic and high-resolution; time.time() can jump
        # with wall-clock adjustments.
        start = time.perf_counter()
        for _ in range(repeats):
            model.encode(sentences)
        return (time.perf_counter() - start) / repeats
# Usage example: benchmark the loaded models on a few sample sentences.
test_data = [
    "Hello world",
    "Machine learning is powerful",
    "Natural language processing",
]
benchmark = EmbeddingBenchmark()
results = benchmark.benchmark_models(test_data)
print(results)
4. 벡터 데이터베이스 비교: Chroma vs Qdrant vs pgvector vs Milvus
각 벡터 데이터베이스의 장단점을 비교합니다:
# Chroma (간단한 로컬 벡터 DB)
import chromadb
from chromadb import Client
class ChromaDB:
    """Thin wrapper around a persistent Chroma collection named ``documents``.

    Args:
        path: Directory where Chroma stores its data.
    """

    def __init__(self, path="chroma_db"):
        # chromadb.Client() expects a Settings object, not a filesystem path;
        # PersistentClient is the on-disk client that takes ``path``.
        self.client = chromadb.PersistentClient(path=path)
        self.collection = self.client.get_or_create_collection("documents")

    def add_documents(self, documents, embeddings):
        """Add ``documents`` with their ``embeddings`` to the collection.

        IDs continue from the current collection size, so repeated calls do
        not reuse "0", "1", ... and collide with earlier documents.
        """
        if not documents:
            return
        first_id = self.collection.count()
        self.collection.add(
            documents=documents,
            embeddings=embeddings,
            ids=[str(first_id + offset) for offset in range(len(documents))],
        )

    def search(self, query_embedding, top_k=5):
        """Return the texts of the ``top_k`` nearest documents."""
        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=top_k,
        )
        # One query embedding was sent, so take the first result list.
        return results['documents'][0]
# Qdrant (고성능 분산 벡터 DB)
from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition, MatchValue
class QdrantDB:
    """Thin wrapper around a Qdrant collection named ``documents``.

    Args:
        host: Qdrant server host.
        port: Qdrant server port.
    """

    def __init__(self, host="localhost", port=6333):
        self.client = QdrantClient(host=host, port=port)
        self.collection_name = "documents"

    @staticmethod
    def _as_list(vector):
        """Convert a numpy array (or any sequence) into a plain list of numbers."""
        return vector.tolist() if hasattr(vector, "tolist") else list(vector)

    def create_collection(self, vector_size=384):
        """Create the collection with cosine distance.

        Args:
            vector_size: Embedding dimension; 384 matches all-MiniLM-L6-v2.
        """
        from qdrant_client.models import Distance, VectorParams

        # A plain dict here is interpreted as a mapping of *named* vectors,
        # so the single-vector config must be a VectorParams instance.
        self.client.create_collection(
            collection_name=self.collection_name,
            vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
        )

    def add_documents(self, documents, embeddings):
        """Upsert ``documents`` with their ``embeddings``.

        NOTE(review): IDs are the positions within this call, so a second
        call overwrites points 0..n-1 — confirm that is the intended
        upsert behaviour.
        """
        from qdrant_client.models import PointStruct

        points = [
            PointStruct(
                id=index,
                vector=self._as_list(embedding),
                payload={"text": doc},
            )
            for index, (doc, embedding) in enumerate(zip(documents, embeddings))
        ]
        if not points:
            return
        self.client.upsert(collection_name=self.collection_name, points=points)

    def search(self, query_embedding, top_k=5):
        """Return the stored texts of the ``top_k`` nearest points."""
        results = self.client.search(
            collection_name=self.collection_name,
            query_vector=self._as_list(query_embedding),
            limit=top_k,
        )
        return [hit.payload['text'] for hit in results]
# pgvector (PostgreSQL 확장)
import psycopg2
import numpy as np
class PostgresVectorDB:
    """Document store backed by PostgreSQL with the pgvector extension.

    Args:
        connection_string: DSN passed to ``psycopg2.connect``.
        dim: Embedding dimension of the ``embedding`` column; 384 matches
            the previously hard-coded value.
    """

    def __init__(self, connection_string, dim=384):
        self.dim = int(dim)
        self.conn = psycopg2.connect(connection_string)
        self._setup_table()

    @staticmethod
    def _vector_literal(embedding):
        """Format ``embedding`` in pgvector's text form, e.g. ``[0.1,0.2]``.

        Accepts numpy arrays and plain sequences alike. Sending text plus an
        explicit ``::vector`` cast avoids psycopg2 adapting a Python list to
        a Postgres array, which has no distance operator against ``vector``.
        """
        return "[" + ",".join(repr(float(value)) for value in embedding) + "]"

    def _setup_table(self):
        """Create the extension, table and cosine index if they do not exist."""
        with self.conn.cursor() as cur:
            # The VECTOR type only exists once the extension is installed.
            cur.execute("CREATE EXTENSION IF NOT EXISTS vector")
            # ``self.dim`` is coerced to int in __init__, so interpolating it
            # into the DDL is safe (type modifiers cannot be bound parameters).
            cur.execute(f"""
                CREATE TABLE IF NOT EXISTS documents (
                    id SERIAL PRIMARY KEY,
                    text TEXT,
                    embedding VECTOR({self.dim})
                )
            """)
            cur.execute("CREATE INDEX IF NOT EXISTS idx_embedding ON documents USING ivfflat (embedding vector_cosine_ops)")
        self.conn.commit()

    def add_documents(self, documents, embeddings):
        """Insert ``documents`` with their ``embeddings`` and commit."""
        with self.conn.cursor() as cur:
            for doc, embedding in zip(documents, embeddings):
                cur.execute(
                    "INSERT INTO documents (text, embedding) VALUES (%s, %s::vector)",
                    (doc, self._vector_literal(embedding)),
                )
        self.conn.commit()

    def search(self, query_embedding, top_k=5):
        """Return the texts of the ``top_k`` documents closest by cosine distance."""
        with self.conn.cursor() as cur:
            # ``<=>`` is cosine distance and matches the vector_cosine_ops
            # index; ``<->`` is L2 distance and would not match that index.
            cur.execute("""
                SELECT text FROM documents
                ORDER BY embedding <=> %s::vector
                LIMIT %s
            """, (self._vector_literal(query_embedding), top_k))
            return [row[0] for row in cur.fetchall()]

    def close(self):
        """Close the underlying database connection."""
        self.conn.close()
5. 전체 RAG 파이프라인 코드
다음은 완전한 RAG 파이프라인 구현입니다:
python
import os
from sentence_transformers import SentenceTransformer
from chromadb import Client
import numpy as np
class FullRAGPipeline:
def __init__(self, embedding_model_name="all-MiniLM-L6-v2"):
self.embedding_model = SentenceTransformer(embedding_model_name)
self.chroma_client = Client()
self.collection = self.chroma_client.get_or_create_collection("documents")
self.chunk_size = 512
self.overlap = 50
def chunk_document(self, text):
"""문서 청킹"""
# 간단한 재귀적 청킹
sentences = text.split('. ')
chunks = []
current_chunk = []
current_length = 0
---
📥 **Get the full guide on Gumroad**: https://gumroad.com/l/auto ($7)
Top comments (0)