DEV Community

matias yoon
matias yoon

Posted on

RAG 시스템 실전 구축 (v4)

RAG 시스템 실전 구축 (v4)

1. RAG 기초 개념: 검색 → 보완 → 생성 루프

Retrieval-Augmented Generation(RAG)은 외부 문서를 검색해 프롬프트에 함께 넣어 줌으로써 대규모 언어 모델(LLM)의 지식 범위를 확장하는 아키텍처입니다. RAG는 다음 세 단계로 이루어집니다:

  1. 검색 (Retrieval): 사용자의 질문과 관련된 문서 또는 텍스트 조각을 찾습니다
  2. 보완 (Augmentation): 검색된 정보를 질문과 함께 LLM 입력으로 제공합니다
  3. 생성 (Generation): LLM이 검색된 컨텍스트와 질문을 기반으로 답변을 생성합니다
# 간단한 RAG 루프 구현
class SimpleRAG:
    """Minimal retrieve -> augment -> generate loop.

    Args:
        embedding_model: Object exposing ``encode(text)`` that returns the
            query embedding.
        vector_db: Object exposing ``search(query_embedding, k)`` that returns
            an iterable of document strings.
        llm: Object exposing ``generate(prompt)``. Optional so existing
            two-argument call sites keep working; it must be set (here or by
            assigning ``rag.llm`` later) before ``generate``/``run`` is called.
    """

    def __init__(self, embedding_model, vector_db, llm=None):
        self.embedding_model = embedding_model
        self.vector_db = vector_db
        # The original never assigned ``self.llm``, so ``generate`` always
        # failed with AttributeError.
        self.llm = llm

    def retrieve(self, query, k=5):
        """Return the ``k`` documents the vector DB ranks closest to ``query``."""
        query_embedding = self.embedding_model.encode(query)
        # Pass the limit positionally: the vector-DB wrappers in this file
        # name the parameter ``top_k``, so ``k=k`` would raise TypeError.
        return self.vector_db.search(query_embedding, k)

    def generate(self, query, retrieved_docs):
        """Build the prompt from ``query`` and ``retrieved_docs`` and call the LLM.

        Raises:
            RuntimeError: If no LLM has been configured.
        """
        if self.llm is None:
            raise RuntimeError(
                "SimpleRAG has no LLM configured; pass llm=... to the constructor"
            )
        context = "\n".join(retrieved_docs)
        prompt = f"질문: {query}\n컨텍스트: {context}\n답변:"
        return self.llm.generate(prompt)

    def run(self, query):
        """Retrieve context for ``query`` and return the generated answer."""
        docs = self.retrieve(query)
        return self.generate(query, docs)
Enter fullscreen mode Exit fullscreen mode

2. 청킹 전략: 의미론적, 재귀적, 에이전트 기반

청킹은 대용량 텍스트를 모델이 처리할 수 있는 조각으로 나누는 과정입니다:

의미론적 청킹 (Semantic Chunking)

인접한 두 문장의 임베딩 코사인 유사도가 임계값(threshold) 아래로 떨어지는 지점을 의미 단위의 경계로 보고 청크를 나눕니다:

from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.cluster import KMeans

def semantic_chunking(text, model, threshold=0.75):
    """Split ``text`` into chunks at points where adjacent sentences diverge.

    The text is split on ``'. '``; each sentence is embedded with
    ``model.encode`` and a new chunk is started whenever the cosine similarity
    between a sentence and the one immediately before it is below
    ``threshold``.

    Args:
        text: Input text.
        model: Object exposing ``encode(list_of_sentences)`` that returns one
            embedding row per sentence.
        threshold: Minimum cosine similarity for two neighbouring sentences
            to stay in the same chunk.

    Returns:
        List of chunk strings; empty when ``text`` has no non-blank sentence.
    """
    separator = '. '
    sentences = [part for part in text.split(separator) if part.strip()]
    if not sentences:
        # Nothing to embed; avoid calling the model with empty input.
        return []

    embeddings = np.asarray(model.encode(sentences))
    # Norms are computed once instead of twice per comparison. The original
    # also built a full n x n similarity matrix that was never read.
    norms = np.linalg.norm(embeddings, axis=1)

    chunks = []
    current_chunk = [sentences[0]]
    for idx, sentence in enumerate(sentences[1:], start=1):
        denominator = norms[idx - 1] * norms[idx]
        if denominator > 0:
            similarity = float(
                np.dot(embeddings[idx - 1], embeddings[idx]) / denominator
            )
        else:
            # A zero vector has no direction; treat it as unrelated rather
            # than producing NaN (which would silently never split).
            similarity = 0.0

        if similarity < threshold:
            chunks.append(separator.join(current_chunk))
            current_chunk = [sentence]
        else:
            current_chunk.append(sentence)

    # Re-join with the delimiter the text was split on so sentence boundaries
    # inside a chunk are preserved.
    chunks.append(separator.join(current_chunk))
    return chunks
Enter fullscreen mode Exit fullscreen mode

재귀적 청킹 (Recursive Chunking)

텍스트를 문장 단위로 나눈 뒤, 최대 청크 크기를 넘지 않는 범위에서 문장을 순서대로 묶어 청크를 만듭니다:

def recursive_chunking(text, max_chunk_size=512):
    """Pack sentences of ``text`` into chunks of at most ``max_chunk_size`` chars.

    Despite the name, this is a single greedy pass: the text is split on
    ``'. '`` and consecutive sentences are joined back together until adding
    the next one would exceed the limit. A sentence that is longer than the
    limit on its own is cut into fixed-width slices so that no returned chunk
    is ever longer than ``max_chunk_size``.

    Args:
        text: Input text.
        max_chunk_size: Maximum chunk length in characters (must be >= 1).

    Returns:
        List of chunk strings. Text that already fits is returned unchanged
        as a single-element list.

    Raises:
        ValueError: If ``max_chunk_size`` is smaller than 1.
    """
    if max_chunk_size < 1:
        raise ValueError("max_chunk_size must be at least 1")
    if len(text) <= max_chunk_size:
        return [text]

    separator = '. '
    chunks = []
    current_chunk = []
    current_length = 0  # exact length of separator.join(current_chunk)

    for sentence in text.split(separator):
        if len(sentence) > max_chunk_size:
            # Flush what we have, then hard-split the oversized sentence.
            if current_chunk:
                chunks.append(separator.join(current_chunk))
                current_chunk = []
                current_length = 0
            chunks.extend(
                sentence[start:start + max_chunk_size]
                for start in range(0, len(sentence), max_chunk_size)
            )
            continue

        # Count the real separator width (2 chars), and only between
        # sentences; the original added 1 per sentence and could overshoot.
        added_length = len(sentence) + (len(separator) if current_chunk else 0)
        if current_length + added_length <= max_chunk_size:
            current_chunk.append(sentence)
            current_length += added_length
        else:
            chunks.append(separator.join(current_chunk))
            current_chunk = [sentence]
            current_length = len(sentence)

    if current_chunk:
        chunks.append(separator.join(current_chunk))

    return chunks
Enter fullscreen mode Exit fullscreen mode

3. 임베딩 모델 선택 및 비교

여러 임베딩 모델의 인코딩 시간, 임베딩 차원, 메모리 사용량을 비교해 용도에 맞는 모델을 고를 수 있도록 합니다:

from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

class EmbeddingBenchmark:
    """Compare embedding models on encoding time, dimension and output size.

    Args:
        models: Optional mapping of display name -> model object exposing
            ``encode(sentences)``. When omitted, the three default
            SentenceTransformer models are loaded, as before.
    """

    DEFAULT_MODEL_NAMES = (
        'all-MiniLM-L6-v2',
        'all-mpnet-base-v2',
        'sentence-t5-xxl',
    )

    def __init__(self, models=None):
        if models is None:
            # NOTE: this loads every default model eagerly; pass ``models`` to
            # benchmark a smaller or different set.
            models = {
                name: SentenceTransformer(name)
                for name in self.DEFAULT_MODEL_NAMES
            }
        self.models = dict(models)

    def benchmark_models(self, test_sentences, repeats=1):
        """Benchmark every configured model on ``test_sentences``.

        Args:
            test_sentences: Sentences passed to each model's ``encode``.
            repeats: Number of timed encode runs averaged into ``avg_time``.

        Returns:
            Dict mapping model name to a dict with ``avg_time`` (seconds per
            encode call), ``embedding_size`` (second dimension of the encoded
            array) and ``memory_usage`` (``nbytes`` of the encoded array).
        """
        results = {}
        for name, model in self.models.items():
            # This untimed call supplies the array for the size metrics and
            # runs before the timed calls.
            embeddings = model.encode(test_sentences)
            avg_time = self._time_encoding(model, test_sentences, repeats=repeats)
            results[name] = {
                'avg_time': avg_time,
                'embedding_size': embeddings.shape[1],
                'memory_usage': embeddings.nbytes,
            }
        return results

    def _time_encoding(self, model, sentences, repeats=1):
        """Return the mean seconds per ``model.encode(sentences)`` call.

        Raises:
            ValueError: If ``repeats`` is smaller than 1.
        """
        import time

        if repeats < 1:
            raise ValueError("repeats must be at least 1")
        # perf_counter is monotonic and high-resolution; time.time() can jump
        # with system clock adjustments and is too coarse for short runs.
        start = time.perf_counter()
        for _ in range(repeats):
            model.encode(sentences)
        return (time.perf_counter() - start) / repeats

# Usage example: run the benchmark on three short sentences and print the
# resulting per-model dict.
# NOTE: EmbeddingBenchmark() with no arguments loads every model listed in
# its __init__, so this runs at import time and may be slow.
benchmark = EmbeddingBenchmark()
test_data = ["Hello world", "Machine learning is powerful", "Natural language processing"]
results = benchmark.benchmark_models(test_data)
print(results)
Enter fullscreen mode Exit fullscreen mode

4. 벡터 데이터베이스 비교: Chroma vs Qdrant vs pgvector vs Milvus

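아래는 Chroma, Qdrant, pgvector를 동일한 인터페이스(add_documents / search)로 감싼 예제입니다. 각 벡터 데이터베이스의 사용 방식을 비교해 볼 수 있습니다(이 글에는 Milvus 예제 코드가 포함되어 있지 않습니다):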

# Chroma (간단한 로컬 벡터 DB)
import chromadb
from chromadb import Client

class ChromaDB:
    """Thin wrapper around a persistent local Chroma collection.

    Args:
        path: Directory where Chroma stores its data.
    """

    def __init__(self, path="chroma_db"):
        # chromadb.Client() expects a Settings object, not a path; the
        # on-disk client is created with PersistentClient(path=...).
        self.client = chromadb.PersistentClient(path=path)
        self.collection = self.client.get_or_create_collection("documents")

    @staticmethod
    def _to_list(vector):
        """Return ``vector`` as a plain list (accepts NumPy arrays or sequences)."""
        return vector.tolist() if hasattr(vector, "tolist") else list(vector)

    def add_documents(self, documents, embeddings):
        """Store ``documents`` with their ``embeddings``.

        Each document gets a fresh UUID, so repeated calls append instead of
        colliding on the ids "0".."n-1" as the original did.
        """
        import uuid

        documents = list(documents)
        if not documents:
            # Nothing to store; skip the collection call entirely.
            return
        self.collection.add(
            documents=documents,
            embeddings=[self._to_list(vector) for vector in embeddings],
            ids=[uuid.uuid4().hex for _ in documents],
        )

    def search(self, query_embedding, top_k=5):
        """Return the text of the ``top_k`` documents returned for ``query_embedding``."""
        results = self.collection.query(
            query_embeddings=[self._to_list(query_embedding)],
            n_results=top_k,
        )
        # query() returns one result list per query embedding; we sent one.
        return results['documents'][0]

# Qdrant (고성능 분산 벡터 DB)
from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition, MatchValue

class QdrantDB:
    """Thin wrapper around a Qdrant collection named ``documents``.

    Args:
        host: Qdrant server host.
        port: Qdrant server port.
        vector_size: Dimensionality of the stored vectors; must match the
            embedding model (384 was previously hard-coded).
    """

    def __init__(self, host="localhost", port=6333, vector_size=384):
        self.client = QdrantClient(host=host, port=port)
        self.collection_name = "documents"
        self.vector_size = vector_size

    @staticmethod
    def _to_list(vector):
        """Return ``vector`` as a plain list (accepts NumPy arrays or sequences)."""
        return vector.tolist() if hasattr(vector, "tolist") else list(vector)

    def create_collection(self):
        """Create the collection with cosine distance and ``vector_size`` dims."""
        from qdrant_client.models import Distance, VectorParams

        # Use the typed config: a bare dict is also the shape the client uses
        # for *named* vectors, so it is ambiguous at best.
        self.client.create_collection(
            collection_name=self.collection_name,
            vectors_config=VectorParams(
                size=self.vector_size,
                distance=Distance.COSINE,
            ),
        )

    def add_documents(self, documents, embeddings):
        """Upsert ``documents`` with their ``embeddings``; text goes in the payload.

        Point ids are fresh UUIDs. The original used 0..n-1 from enumerate,
        so a second call overwrote the points written by the first.
        """
        import uuid

        from qdrant_client.models import PointStruct

        points = [
            PointStruct(
                id=str(uuid.uuid4()),
                vector=self._to_list(embedding),
                payload={"text": doc},
            )
            for doc, embedding in zip(documents, embeddings)
        ]
        if not points:
            return
        self.client.upsert(collection_name=self.collection_name, points=points)

    def search(self, query_embedding, top_k=5):
        """Return the payload text of the ``top_k`` hits for ``query_embedding``."""
        # NOTE(review): newer qdrant-client releases steer users toward
        # query_points(); confirm search() is available in the pinned version.
        results = self.client.search(
            collection_name=self.collection_name,
            query_vector=self._to_list(query_embedding),
            limit=top_k,
        )
        return [hit.payload['text'] for hit in results]

# pgvector (PostgreSQL 확장)
import psycopg2
import numpy as np

class PostgresVectorDB:
    """Document store backed by PostgreSQL with the pgvector extension.

    Args:
        connection_string: DSN passed to ``psycopg2.connect``.
        dimension: Dimensionality of the ``embedding`` column; must match the
            embedding model (384 was previously hard-coded).
    """

    def __init__(self, connection_string, dimension=384):
        # int() guards the value before it is interpolated into DDL below.
        self.dimension = int(dimension)
        self.conn = psycopg2.connect(connection_string)
        self._setup_table()

    @staticmethod
    def _to_vector_literal(embedding):
        """Format ``embedding`` as pgvector's text form, e.g. ``[0.1,0.2]``.

        Accepts NumPy arrays or plain sequences, so callers no longer need to
        pass an object with ``.tolist()``.
        """
        values = embedding.tolist() if hasattr(embedding, "tolist") else embedding
        return "[" + ",".join(repr(float(value)) for value in values) + "]"

    def _setup_table(self):
        """Create the extension, the ``documents`` table and its index if missing."""
        with self.conn.cursor() as cur:
            # The VECTOR type does not exist until the extension is enabled.
            cur.execute("CREATE EXTENSION IF NOT EXISTS vector")
            cur.execute(f"""
                CREATE TABLE IF NOT EXISTS documents (
                    id SERIAL PRIMARY KEY,
                    text TEXT,
                    embedding VECTOR({self.dimension})
                )
            """)
            # NOTE(review): pgvector's docs suggest building ivfflat indexes
            # after data is loaded; confirm whether creating it here is wanted.
            cur.execute("CREATE INDEX IF NOT EXISTS idx_embedding ON documents USING ivfflat (embedding vector_cosine_ops)")
        self.conn.commit()

    def add_documents(self, documents, embeddings):
        """Insert each document with its embedding, then commit once."""
        with self.conn.cursor() as cur:
            for doc, embedding in zip(documents, embeddings):
                cur.execute(
                    "INSERT INTO documents (text, embedding) VALUES (%s, %s::vector)",
                    (doc, self._to_vector_literal(embedding)),
                )
        self.conn.commit()

    def search(self, query_embedding, top_k=5):
        """Return the text of the ``top_k`` rows closest to ``query_embedding``.

        Orders by cosine distance (``<=>``) so the query matches the
        ``vector_cosine_ops`` index; the original used ``<->`` (L2 distance),
        which that index cannot serve. The parameter is cast to ``vector``
        explicitly because the operator is not defined for SQL arrays.
        """
        with self.conn.cursor() as cur:
            cur.execute("""
                SELECT text FROM documents
                ORDER BY embedding <=> %s::vector
                LIMIT %s
            """, (self._to_vector_literal(query_embedding), top_k))
            return [row[0] for row in cur.fetchall()]
Enter fullscreen mode Exit fullscreen mode

5. 전체 RAG 파이프라인 코드

다음은 완전한 RAG 파이프라인 구현입니다:


python
import os
from sentence_transformers import SentenceTransformer
from chromadb import Client
import numpy as np

class FullRAGPipeline:
    def __init__(self, embedding_model_name="all-MiniLM-L6-v2"):
        self.embedding_model = SentenceTransformer(embedding_model_name)
        self.chroma_client = Client()
        self.collection = self.chroma_client.get_or_create_collection("documents")
        self.chunk_size = 512
        self.overlap = 50

    def chunk_document(self, text):
        """문서 청킹"""
        # 간단한 재귀적 청킹
        sentences = text.split('. ')
        chunks = []
        current_chunk = []
        current_length = 0



---

📥 **Get the full guide on Gumroad**: https://gumroad.com/l/auto ($7)
Enter fullscreen mode Exit fullscreen mode

Top comments (0)