DEV Community

matias yoon
matias yoon

Posted on

RAG 시스템 실전 구축 (v6)

RAG 시스템 실전 구축 (v6)

1. RAG 기초 개념

Retrieval-Augmented Generation (RAG)은 대규모 언어 모델(LLM)의 지식 범위를 확장하는 효율적인 방법입니다. RAG는 다음 세 가지 주요 단계로 구성됩니다:

  1. 검색 (Retrieval): 질문과 관련된 문서 조각을 벡터 데이터베이스에서 검색합니다.
  2. 증강 (Augmentation): 검색된 문서를 프롬프트에 포함하여 LLM의 입력을 향상시킵니다.
  3. 생성 (Generation): 증강된 입력을 기반으로 답변을 생성합니다.
# Minimal RAG loop implementation
class SimpleRAG:
    """Minimal retrieve -> augment -> generate pipeline.

    ``embedding_model`` must expose ``encode(text)`` and ``vector_db`` must
    expose ``search(query_embedding, k=...)``.  Text generation is delegated
    to :meth:`generate_answer`, which subclasses are expected to override.
    """

    def __init__(self, embedding_model, vector_db):
        self.embedding_model = embedding_model
        self.vector_db = vector_db

    def query(self, question):
        """Answer *question* using the five best-matching documents as context.

        Returns whatever :meth:`generate_answer` returns for the built prompt.
        """
        # 1. Embed the question
        query_embedding = self.embedding_model.encode(question)

        # 2. Retrieve
        relevant_docs = self.vector_db.search(query_embedding, k=5)

        # 3. Augment and generate
        context = "\n".join(self._doc_text(doc) for doc in relevant_docs)
        prompt = f"질문: {question}\n문맥: {context}"

        return self.generate_answer(prompt)

    def generate_answer(self, prompt):
        """Produce the final answer for *prompt*.

        Raises:
            NotImplementedError: always, until a subclass plugs in an LLM call.
        """
        raise NotImplementedError(
            "SimpleRAG.generate_answer must be overridden to call an LLM"
        )

    @staticmethod
    def _doc_text(doc):
        """Return the text of a search hit.

        Search backends may return plain strings or objects carrying the text
        in a ``content`` attribute; both are accepted.
        """
        return doc if isinstance(doc, str) else doc.content
Enter fullscreen mode Exit fullscreen mode

2. 청킹 전략

2.1 의미적 청킹 (Semantic Chunking)

문장 임베딩 간 유사도를 기준으로 인접한 문장을 묶어, 의미 단위가 유지되도록 청킹합니다.

from sentence_transformers import SentenceTransformer
import numpy as np

class SemanticChunker:
    """Group consecutive sentences into chunks by embedding similarity."""

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)

    def chunk_semantic(self, text, max_tokens=512,
                       similarity_threshold=0.8, max_sentences=10):
        """Split *text* into chunks of semantically similar sentences.

        The text is split on ``'. '``.  Each sentence is compared (cosine
        similarity) with the first sentence of the chunk being built; it
        joins that chunk when the similarity exceeds *similarity_threshold*
        and the chunk holds fewer than *max_sentences* sentences, otherwise
        it starts a new chunk.

        Args:
            text: Input text.
            max_tokens: Accepted for backward compatibility; not enforced.
            similarity_threshold: Minimum cosine similarity to extend a chunk.
            max_sentences: Maximum number of sentences per chunk.

        Returns:
            List of chunk strings; empty when *text* has no sentences.
        """
        # Skip blank fragments so empty strings are never encoded.
        sentences = [s for s in text.split('. ') if s.strip()]
        if not sentences:
            return []

        embeddings = self.model.encode(sentences)

        chunks = []
        current_chunk = [sentences[0]]
        # The anchor is the first sentence of the current chunk, not a
        # running average: every later sentence is compared against it.
        anchor = embeddings[0]

        for sentence, embedding in zip(sentences[1:], embeddings[1:]):
            similar = self._cosine(anchor, embedding) > similarity_threshold
            if similar and len(current_chunk) < max_sentences:
                current_chunk.append(sentence)
            else:
                chunks.append('. '.join(current_chunk))
                current_chunk = [sentence]
                anchor = embedding

        chunks.append('. '.join(current_chunk))
        return chunks

    @staticmethod
    def _cosine(a, b):
        """Return the cosine similarity of *a* and *b* (0.0 for a zero vector)."""
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        if denom == 0:
            return 0.0
        return float(np.dot(a, b) / denom)
Enter fullscreen mode Exit fullscreen mode

2.2 재귀적 청킹 (Recursive Chunking)

문서를 고정 크기의 윈도우로 순차 분할하되, 인접한 청크 사이에 오버랩을 두어 경계에서 문맥이 끊기지 않도록 합니다.

class RecursiveChunker:
    """Fixed-size sliding-window chunker with overlap between neighbours.

    Despite the name, the text is cut iteratively: each chunk holds up to
    ``chunk_size`` characters and shares ``overlap`` characters with the
    previous chunk.
    """

    def __init__(self, chunk_size=1024, overlap=128):
        self.chunk_size = chunk_size
        self.overlap = overlap
        self._validate()

    def _validate(self):
        """Reject settings for which the window could not advance."""
        if self.chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if not 0 <= self.overlap < self.chunk_size:
            raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    def chunk_recursive(self, text):
        """Split *text* into overlapping chunks of at most ``chunk_size`` chars.

        Returns:
            ``[text]`` when the text already fits in one chunk, otherwise the
            list of windows in order.

        Raises:
            ValueError: if ``chunk_size``/``overlap`` were changed to values
                that prevent the window from advancing.
        """
        self._validate()
        if len(text) <= self.chunk_size:
            return [text]

        # Each window starts `step` characters after the previous one, so
        # consecutive windows share exactly `overlap` characters.
        step = self.chunk_size - self.overlap
        chunks = []
        for start in range(0, len(text), step):
            chunks.append(text[start:start + self.chunk_size])
            # Stop once a window reaches the end; later starts would only
            # repeat the tail already emitted.
            if start + self.chunk_size >= len(text):
                break
        return chunks
Enter fullscreen mode Exit fullscreen mode

2.3 에이전트 기반 청킹 (Agentic Chunking)

마크다운 헤더(#, ##)와 같은 문서의 구조적 특징을 기준으로 섹션 단위로 청킹합니다.

import re

class AgenticChunker:
    """Split Markdown-style text into one chunk per ``#``/``##`` section."""

    # Split on the newline that precedes "# " or "## ".  The lookahead leaves
    # the heading marker attached to the section it introduces.
    _SECTION_BREAK = re.compile(r'\n(?=##? )')

    def __init__(self):
        # Not used by chunk_by_structure; kept as a public attribute.
        self.section_patterns = [
            r'##\s+(.+)',
            r'#\s+(.+)',
            r'\*\*\s+(.+)\s+\*\*'
        ]

    def chunk_by_structure(self, text):
        """Return the sections of *text*, each starting at its heading.

        Text before the first heading becomes its own chunk.  Every chunk is
        whitespace-stripped, and chunks that are empty after stripping are
        dropped.
        """
        stripped = (piece.strip() for piece in self._SECTION_BREAK.split(text))
        return [piece for piece in stripped if piece]
Enter fullscreen mode Exit fullscreen mode

3. 임베딩 모델 선택 및 비교

3.1 모델 비교

from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
import torch

class EmbeddingBenchmark:
    """Compare embedding models on encode latency, accuracy and vector size."""

    def __init__(self, models=None):
        """Create the benchmark.

        Args:
            models: Optional mapping of name -> model exposing ``encode`` and
                ``get_sentence_embedding_dimension``.  When omitted, the three
                default SentenceTransformer models are loaded.
        """
        if models is None:
            models = {
                "all-MiniLM-L6-v2": SentenceTransformer("all-MiniLM-L6-v2"),
                "all-mpnet-base-v2": SentenceTransformer("all-mpnet-base-v2"),
                "sentence-t5-xxl": SentenceTransformer("sentence-t5-xxl")
            }
        self.models = models

    def compare_models(self, texts, benchmark_dataset):
        """Encode *texts* with every model and collect per-model metrics.

        Returns:
            Dict mapping model name to ``{"latency", "accuracy", "size"}``,
            where latency is the wall-clock seconds spent in ``encode`` and
            size is the model's embedding dimension.
        """
        # Imported here so the method does not depend on a module-level
        # `import time` appearing before this class.
        import time

        results = {}

        for model_name, model in self.models.items():
            # Measure encode time with a monotonic clock
            started = time.perf_counter()
            embeddings = model.encode(texts)
            latency = time.perf_counter() - started

            # Accuracy evaluation is delegated to evaluate_accuracy
            accuracy = self.evaluate_accuracy(embeddings, benchmark_dataset)

            results[model_name] = {
                "latency": latency,
                "accuracy": accuracy,
                "size": model.get_sentence_embedding_dimension()
            }

        return results

    def evaluate_accuracy(self, embeddings, benchmark_dataset):
        """Score *embeddings* against *benchmark_dataset*.

        Raises:
            NotImplementedError: always, until a subclass supplies a metric.
        """
        raise NotImplementedError(
            "EmbeddingBenchmark.evaluate_accuracy must be overridden"
        )

# Usage example
# Constructing EmbeddingBenchmark with no arguments builds its default models.
benchmark = EmbeddingBenchmark()
texts = ["Python은 강력한 프로그래밍 언어입니다.", "Machine Learning은 AI의 한 분야입니다."]
# The second argument is forwarded to evaluate_accuracy; a placeholder string
# is passed here rather than a real dataset.
results = benchmark.compare_models(texts, "benchmark_dataset")
Enter fullscreen mode Exit fullscreen mode

4. 벡터 데이터베이스 비교

4.1 Chroma vs Qdrant vs pgvector vs Milvus

# Chroma 구현
import chromadb
from chromadb import Client

class ChromaVectorDB:
    """Thin wrapper around a Chroma collection named ``rag_docs``."""

    def __init__(self):
        self.client = Client()
        self.collection = self.client.get_or_create_collection("rag_docs")

    def add_documents(self, documents, embeddings):
        """Append *documents* and their *embeddings* to the collection.

        Ids continue from the current collection size, so repeated calls do
        not reuse ``doc_0``, ``doc_1``, ... from an earlier batch.  An empty
        batch is a no-op.
        """
        documents = list(documents)
        if not documents:
            return

        first = self.collection.count()
        self.collection.add(
            documents=documents,
            embeddings=embeddings,
            ids=[f"doc_{i}" for i in range(first, first + len(documents))],
        )

    def search(self, query_embedding, k=5):
        """Return the texts of the *k* nearest documents to *query_embedding*."""
        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=k
        )
        # One query was sent, so take the single result list.
        return results["documents"][0]

# Qdrant 구현
from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition, MatchValue

class QdrantVectorDB:
    """Thin wrapper around a Qdrant collection named ``rag_docs``.

    The collection is not created here; it is expected to exist already on
    the server at ``localhost:6333``.
    """

    def __init__(self):
        self.client = QdrantClient(host="localhost", port=6333)
        self.collection_name = "rag_docs"

    def add_documents(self, documents, embeddings):
        """Append *documents* and their *embeddings* to the collection.

        Every point gets a fresh UUID id, so a later call never overwrites
        points written by an earlier one.  An empty batch is a no-op.
        """
        import uuid

        points = [
            {
                "id": str(uuid.uuid4()),
                "vector": self._as_list(embedding),
                "payload": {"text": doc},
            }
            for doc, embedding in zip(documents, embeddings)
        ]
        if not points:
            return

        # NOTE(review): points are sent as plain dicts, as before; confirm the
        # installed qdrant-client accepts dicts in place of PointStruct.
        self.client.upsert(
            collection_name=self.collection_name,
            points=points
        )

    def search(self, query_embedding, k=5):
        """Return the texts of the *k* nearest points to *query_embedding*."""
        hits = self.client.search(
            collection_name=self.collection_name,
            query_vector=self._as_list(query_embedding),
            limit=k
        )
        return [hit.payload["text"] for hit in hits]

    @staticmethod
    def _as_list(vector):
        """Convert an array-like (e.g. a numpy array) to a plain list."""
        return vector.tolist() if hasattr(vector, "tolist") else list(vector)

# pgvector 구현
import psycopg2
from psycopg2.extras import Json

class PgVectorDB:
    """Store and search embeddings in PostgreSQL using the pgvector type.

    The ``vector`` extension must already be enabled in the target database.
    """

    def __init__(self, connection_string):
        self.conn = psycopg2.connect(connection_string)

    def create_table(self, dimension=768):
        """Create ``rag_documents`` if it does not exist.

        Args:
            dimension: Length of the stored vectors; must match the embedding
                model in use.

        Raises:
            ValueError: if *dimension* is not a positive integer.
        """
        dimension = int(dimension)
        if dimension <= 0:
            raise ValueError("dimension must be a positive integer")

        # The dimension is part of the column type and cannot be bound as a
        # parameter; it has been coerced to int above before interpolation.
        ddl = f"""
                CREATE TABLE IF NOT EXISTS rag_documents (
                    id SERIAL PRIMARY KEY,
                    content TEXT,
                    embedding VECTOR({dimension})
                )
            """
        try:
            with self.conn.cursor() as cur:
                cur.execute(ddl)
            self.conn.commit()
        except Exception:
            # Leave the connection usable instead of in an aborted transaction.
            self.conn.rollback()
            raise

    def add_documents(self, documents, embeddings):
        """Insert *documents* with their *embeddings* in one transaction."""
        insert_sql = (
            "INSERT INTO rag_documents (content, embedding) "
            "VALUES (%s, %s::vector)"
        )
        try:
            with self.conn.cursor() as cur:
                for doc, embedding in zip(documents, embeddings):
                    cur.execute(
                        insert_sql,
                        (doc, self._vector_literal(embedding))
                    )
            self.conn.commit()
        except Exception:
            self.conn.rollback()
            raise

    def search(self, query_embedding, k=5):
        """Return the contents of the *k* rows ranked first by ``<#>``.

        ``<#>`` is pgvector's negative inner product, so ascending order puts
        the largest inner product first.
        """
        try:
            with self.conn.cursor() as cur:
                cur.execute("""
                SELECT content FROM rag_documents
                ORDER BY embedding <#> %s::vector
                LIMIT %s
            """, (self._vector_literal(query_embedding), k))
                rows = cur.fetchall()
            # End the read transaction so the connection does not sit idle
            # inside it.
            self.conn.commit()
        except Exception:
            self.conn.rollback()
            raise
        return [row[0] for row in rows]

    @staticmethod
    def _vector_literal(embedding):
        """Format *embedding* as pgvector text input, e.g. ``[0.1,0.2]``.

        A string plus an explicit ``::vector`` cast is used because a Python
        list would be sent as an SQL array, and numpy arrays are not adapted
        by psycopg2 at all.
        """
        return "[" + ",".join(str(float(x)) for x in embedding) + "]"
Enter fullscreen mode Exit fullscreen mode

4.2 성능 비교


python
import time

def benchmark_vector_dbs(documents, embeddings, queries):
    dbs = {
        "Chroma": ChromaVectorDB(),
        "Qdrant": QdrantVectorDB(),
        "pgvector": PgVectorDB("postgresql://user:pass@

---

📥 **Get the full guide on Gumroad**: https://gumroad.com/l/auto ($7)
Enter fullscreen mode Exit fullscreen mode

Top comments (0)