RAG 시스템 실전 구축 (v11)
1. RAG 시스템의 핵심 구성 요소
RAG (Retrieval-Augmented Generation) 시스템은 세 가지 핵심 단계로 구성됩니다:
Retrieval → Augmentation → Generation 루프
Input Query → Retrieval → Context Augmentation → LLM Generation → Output
- Retrieval: 문서에서 관련 정보 검색
- Augmentation: 검색된 정보를 프롬프트에 추가
- Generation: LLM이 증강된 컨텍스트를 기반으로 응답 생성
2. 청킹 전략
2.1 Semantic Chunking (의미 기반 청킹)
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.cluster import KMeans
class SemanticChunker:
    """Group the sentences of a text into chunks of semantically similar sentences.

    Sentences are embedded with a SentenceTransformer model and clustered with
    K-means; the sentences that fall into one cluster are joined into chunks.
    """

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        """Load the sentence-embedding model named ``model_name``."""
        self.model = SentenceTransformer(model_name)

    def chunk_by_semantic(self, text, max_chunk_size=512):
        """Split ``text`` into chunks of sentences that cluster together.

        Args:
            text: Input text; sentences are separated on ``'. '``.
            max_chunk_size: Upper bound, in characters, for a chunk built from
                several sentences. A single sentence longer than this is kept
                whole as its own chunk.

        Returns:
            A list of chunk strings. Within a cluster the original sentence
            order is kept; clusters appear in order of their first sentence.
            Empty or whitespace-only text yields an empty list.
        """
        # Split into sentences and drop blank fragments so the encoder never
        # receives empty strings.
        sentences = [part.strip() for part in text.split('. ')]
        sentences = [sentence for sentence in sentences if sentence]
        if not sentences:
            return []

        embeddings = self.model.encode(sentences)

        # Roughly three sentences per cluster; fixed seed so that the same
        # text always produces the same chunks.
        n_clusters = max(1, len(sentences) // 3)
        kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
        labels = kmeans.fit_predict(embeddings)

        # Collect the sentences of each cluster (dicts keep insertion order).
        grouped = {}
        for sentence, label in zip(sentences, labels):
            grouped.setdefault(int(label), []).append(sentence)

        # Join each cluster's sentences, starting a new chunk whenever adding
        # the next sentence would exceed max_chunk_size.
        chunks = []
        for members in grouped.values():
            current = ""
            for sentence in members:
                candidate = f"{current}. {sentence}" if current else sentence
                if current and len(candidate) > max_chunk_size:
                    chunks.append(current)
                    current = sentence
                else:
                    current = candidate
            if current:
                chunks.append(current)
        return chunks
2.2 Recursive Chunking (재귀적 청킹)
class RecursiveChunker:
    """Split text into fixed-size character windows that overlap.

    Despite the name, the implementation is an iterative sliding window:
    each chunk starts ``chunk_size - overlap`` characters after the previous one.
    """

    def __init__(self, chunk_size=512, overlap=50):
        """Store the window parameters.

        Args:
            chunk_size: Maximum number of characters per chunk (must be > 0).
            overlap: Number of characters shared by consecutive chunks
                (must satisfy ``0 <= overlap < chunk_size``).

        Raises:
            ValueError: If the parameters would prevent the window from advancing.
        """
        self.chunk_size = chunk_size
        self.overlap = overlap
        self._step()  # fail early on unusable parameters

    def _step(self):
        """Return the distance between consecutive chunk starts, validating it."""
        if self.chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if not 0 <= self.overlap < self.chunk_size:
            raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")
        return self.chunk_size - self.overlap

    def chunk_recursive(self, text):
        """Return the overlapping chunks of ``text`` in order.

        Args:
            text: The string to split.

        Returns:
            A list of substrings, each at most ``chunk_size`` characters long.
            Empty text yields an empty list.
        """
        step = self._step()
        chunks = []
        start = 0
        while start < len(text):
            end = min(start + self.chunk_size, len(text))
            chunks.append(text[start:end])
            # Stop once the end of the text is covered; stepping back by
            # ``overlap`` from here would re-emit the same tail forever.
            if end == len(text):
                break
            start += step
        return chunks
2.3 Agentic Chunking (엔티티 기반 청킹)
import re
from typing import List, Tuple
class AgenticChunker:
    """Chunk text around code entities found inside fenced code blocks."""

    def __init__(self):
        """Define the regular expressions used to recognise code entities."""
        self.patterns = {
            # A fenced block: opening fence, anything (non-greedy), closing fence.
            'code_blocks': r'```.*?```',
            'functions': r'def\s+\w+\s*\([^)]*\)',
            'classes': r'class\s+\w+',
            'variables': r'\w+\s*=\s*.*?[^a-zA-Z0-9_]',
        }

    def chunk_by_entities(self, text: str) -> List[str]:
        """Return chunks built from the fenced code blocks in ``text``.

        Blocks longer than 200 characters are split at function/class
        definitions; shorter blocks are kept whole as a single chunk.
        Text outside fenced code blocks is not included in the result.

        Args:
            text: Document text that may contain fenced code blocks.

        Returns:
            The code chunks in document order; empty if there are no blocks.
        """
        # DOTALL lets '.' span newlines so multi-line blocks are matched.
        code_blocks = re.findall(self.patterns['code_blocks'], text, re.DOTALL)
        chunks = []
        for block in code_blocks:
            if len(block) > 200:  # only long blocks are worth splitting further
                chunks.extend(self._chunk_code_block(block))
            else:
                chunks.append(block)
        return chunks

    def _chunk_code_block(self, block: str) -> List[str]:
        """Split one code block at lines that begin a function or class.

        The pieces concatenate back to ``block``; the fence markers therefore
        stay attached to the first and last piece. Whitespace-only pieces are
        dropped.
        """
        boundary = re.compile(
            rf"^[ \t]*(?:{self.patterns['functions']}|{self.patterns['classes']})",
            re.MULTILINE,
        )
        starts = [match.start() for match in boundary.finditer(block)]
        # Keep whatever precedes the first definition (fence, imports, ...).
        if not starts or starts[0] != 0:
            starts.insert(0, 0)
        ends = starts[1:] + [len(block)]
        pieces = [block[begin:end] for begin, end in zip(starts, ends)]
        return [piece for piece in pieces if piece.strip()]
3. 임베딩 모델 선택과 비교
3.1 모델 성능 비교
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
import numpy as np
class EmbeddingBenchmark:
    """Compare several sentence-embedding models on speed, size and dimension."""

    def __init__(self):
        """Load every model to be benchmarked, keyed by model name."""
        self.models = {
            'all-MiniLM-L6-v2': SentenceTransformer('all-MiniLM-L6-v2'),
            'all-mpnet-base-v2': SentenceTransformer('all-mpnet-base-v2'),
            'BAAI/bge-small-en': SentenceTransformer('BAAI/bge-small-en'),
            'sentence-t5-XXL': SentenceTransformer('sentence-t5-XXL')
        }

    def benchmark_models(self, test_sentences):
        """Encode ``test_sentences`` with each model and collect metrics.

        Args:
            test_sentences: Non-empty sequence of sentences to encode.

        Returns:
            A dict mapping model name to a dict with
            ``avg_time`` (seconds per sentence), ``memory_usage`` (size of the
            returned embedding array in MB) and ``dimension`` (embedding width).

        Raises:
            ValueError: If ``test_sentences`` is empty.
        """
        # Imported here because the surrounding file never imports ``time``.
        import time

        sentence_count = len(test_sentences)
        if sentence_count == 0:
            raise ValueError("test_sentences must not be empty")

        results = {}
        for name, model in self.models.items():
            # perf_counter is monotonic and high-resolution, unlike wall-clock time.
            start_time = time.perf_counter()
            embeddings = model.encode(test_sentences)
            elapsed = time.perf_counter() - start_time

            results[name] = {
                'avg_time': elapsed / sentence_count,
                'memory_usage': embeddings.nbytes / 1024 / 1024,  # MB
                'dimension': embeddings.shape[1]
            }
        return results
# Usage example: run every registered model over a few sample sentences.
test_sentences = [
    "RAG 시스템은 검색 기반 생성 기술입니다.",
    "이 시스템은 문서와 LLM을 연결합니다.",
    "검색된 컨텍스트를 기반으로 응답을 생성합니다.",
]
# Constructing the benchmark loads all models, so this step can be slow.
benchmark = EmbeddingBenchmark()
results = benchmark.benchmark_models(test_sentences)
3.2 성능 기준
| 모델 | 평균 응답 시간 | 메모리 사용량 | 차원 수 |
|---|---|---|---|
| all-MiniLM-L6-v2 | 0.012s | 150MB | 384 |
| all-mpnet-base-v2 | 0.021s | 300MB | 768 |
| BAAI/bge-small-en | 0.015s | 180MB | 384 |
| sentence-t5-XXL | 0.045s | 800MB | 768 |
4. 벡터 데이터베이스 비교
4.1 Chroma (가장 인기 있는 오픈소스)
import chromadb
from chromadb.config import Settings
class ChromaVectorDB:
    """Small wrapper around one Chroma collection stored on local disk."""

    def __init__(self, collection_name="rag_collection", persist_directory="./chroma_db"):
        """Open (or create) the collection ``collection_name``.

        Args:
            collection_name: Name of the Chroma collection to use.
            persist_directory: Directory in which Chroma keeps its data.
        """
        if hasattr(chromadb, "PersistentClient"):
            # chromadb >= 0.4 rejects the legacy Settings(chroma_db_impl=...)
            # configuration; PersistentClient is its replacement.
            self.client = chromadb.PersistentClient(path=persist_directory)
        else:
            # Older chromadb releases: keep the original configuration.
            self.client = chromadb.Client(Settings(
                chroma_db_impl="duckdb",
                persist_directory=persist_directory
            ))
        self.collection = self.client.get_or_create_collection(collection_name)

    def add_documents(self, documents, embeddings, metadata=None, ids=None):
        """Add documents with their embeddings to the collection.

        Args:
            documents: List of document strings.
            embeddings: One embedding per document, in the same order.
            metadata: Optional list of metadata dicts, one per document.
            ids: Optional list of unique string ids, one per document.
                Random UUIDs are generated when omitted, because
                ``Collection.add`` requires an id for every record.
        """
        if not documents:
            return  # nothing to add
        if ids is None:
            import uuid

            ids = [str(uuid.uuid4()) for _ in documents]
        self.collection.add(
            ids=ids,
            documents=documents,
            embeddings=embeddings,
            metadatas=metadata
        )

    def search(self, query_embedding, limit=5):
        """Return Chroma's query result for the ``limit`` nearest records.

        Args:
            query_embedding: A single query vector.
            limit: Maximum number of results to return.
        """
        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=limit
        )
        return results
4.2 Qdrant
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Filter, FieldCondition, MatchValue
class QdrantVectorDB:
    """Thin wrapper around a Qdrant client bound to one fixed collection."""

    def __init__(self, host="localhost", port=6333):
        """Connect to the Qdrant server at ``host``:``port``."""
        self.collection_name = "rag_collection"
        self.client = QdrantClient(host=host, port=port)

    def create_collection(self, vector_size=384):
        """Create the collection with cosine-distance vectors of ``vector_size`` dims."""
        cosine_vectors = VectorParams(size=vector_size, distance="Cosine")
        self.client.create_collection(
            collection_name=self.collection_name,
            vectors_config=cosine_vectors,
        )

    def search(self, query_vector, limit=5):
        """Return the client's search result for ``query_vector``, at most ``limit`` hits."""
        return self.client.search(
            collection_name=self.collection_name,
            query_vector=query_vector,
            limit=limit,
        )
4.3 pgvector (PostgreSQL 확장)
import psycopg2
from psycopg2.extras import Json
class PGVectorDB:
    """Store and search document embeddings in PostgreSQL via pgvector."""

    def __init__(self, connection_string):
        """Connect to PostgreSQL and make sure the table and index exist.

        Args:
            connection_string: A psycopg2 connection string / DSN.
        """
        self.conn = psycopg2.connect(connection_string)
        self.create_table()

    def create_table(self):
        """Create the ``rag_documents`` table and its cosine ivfflat index if missing."""
        with self.conn.cursor() as cur:
            cur.execute("""
                CREATE TABLE IF NOT EXISTS rag_documents (
                    id UUID PRIMARY KEY,
                    content TEXT,
                    embedding VECTOR(384),
                    metadata JSONB
                )
            """)
            cur.execute("CREATE INDEX IF NOT EXISTS idx_embedding ON rag_documents USING ivfflat (embedding vector_cosine_ops)")
        self.conn.commit()

    def search(self, query_embedding, limit=5):
        """Return the ``limit`` rows closest to ``query_embedding`` by cosine distance.

        Args:
            query_embedding: Iterable of numbers matching the column dimension.
            limit: Maximum number of rows to return.

        Returns:
            A list of ``(content, metadata, distance)`` tuples, nearest first.
        """
        # psycopg2 sends a Python list as a SQL array, so plain floats plus an
        # explicit ::vector cast make the distance operator resolve.
        vector = [float(value) for value in query_embedding]
        with self.conn.cursor() as cur:
            # <=> is cosine distance, matching the index's vector_cosine_ops
            # operator class (<-> would be L2 distance).
            cur.execute("""
                SELECT content, metadata, embedding <=> %s::vector AS distance
                FROM rag_documents
                ORDER BY distance ASC
                LIMIT %s
            """, (vector, limit))
            return cur.fetchall()
5. 완전한 RAG 파이프라인 코드
python
import numpy as np
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
import json
class RAGPipeline:
def __init__(self, embedding_model="all-MiniLM-L6-v2"):
# 1. 임베딩 모델 초기화
self.embedding_model = SentenceTransformer(embedding_model)
# 2. 벡터 DB 초기화
self.vector_db = QdrantClient(host="localhost", port=6333)
self.collection_name = "rag_documents"
# 3. 문서 청킹 전략
self.chunker = AgenticChunker()
def index_document(self, document_id, content, metadata=None):
"""문서 인덱싱"""
# 1. 청킹
chunks
---
📥 **Get the full guide on Gumroad**: https://gumroad.com/l/auto ($7)
Top comments (0)