DEV Community

matias yoon
matias yoon

Posted on

RAG 시스템 실전 구축 (v31)

RAG 시스템 실전 구축 (v31)

개발자를 위한 실용적인 RAG 시스템 구축 가이드

1. RAG 기초 개념: 검색 → 보완 → 생성 루프

RAG(Retrieval-Augmented Generation)는 대규모 언어 모델(LLM)이 학습 데이터에 포함되지 않은 최신 정보나 도메인 지식을 알지 못한다는 한계를 극복하기 위한 아키텍처입니다. 다음 3단계로 구성됩니다:

  1. 검색 (Retrieval): 질문과 관련된 문서 조각을 벡터 데이터베이스에서 찾습니다.
  2. 보완 (Augmentation): 검색된 문서를 프롬프트에 포함시켜 LLM이 더 정확한 응답을 생성할 수 있도록 합니다.
  3. 생성 (Generation): LLM은 보완된 프롬프트를 기반으로 최종 응답을 생성합니다.
# Basic RAG loop implementation
class BasicRAG:
    """Minimal retrieve -> augment -> generate pipeline.

    The three collaborators are duck-typed; this class relies only on the
    calls it makes:

    * ``embedding_model.encode(list_of_str)``
    * ``vector_db.search(query_embedding, k=...)`` returning an iterable of
      mappings that each expose a ``'text'`` key
    * ``llm.generate(prompt)``
    """

    def __init__(self, embedding_model, vector_db, llm):
        self.embedding_model = embedding_model
        self.vector_db = vector_db
        self.llm = llm

    def retrieve_and_generate(self, query, k=5):
        """Answer *query* using the top-*k* retrieved documents as context.

        Args:
            query: The user question.
            k: Number of documents requested from the vector store.
                Defaults to 5, the value that was previously hard-coded.

        Returns:
            Whatever ``llm.generate`` returns for the assembled prompt.
        """
        # 1. Embed the question; ``encode`` receives a one-element batch.
        query_embedding = self.embedding_model.encode([query])

        # 2. Retrieve the k closest documents.
        relevant_docs = self.vector_db.search(query_embedding, k=k)

        # 3. Build the prompt: retrieved texts joined one per line.
        context = "\n".join(doc['text'] for doc in relevant_docs)
        prompt = f"질문: {query}\n문맥: {context}\n답변:"

        # 4. Generate the final answer.
        return self.llm.generate(prompt)
Enter fullscreen mode Exit fullscreen mode

2. 청킹 전략: 의미적, 재귀적, 에이전트 기반

2.1 의미적 청킹 (Semantic Chunking)

문맥을 고려한 의미 단위로 분할합니다.

from langchain_text_splitters import RecursiveCharacterTextSplitter
import numpy as np

class SemanticChunker:
    """Split text into overlapping chunks with ``RecursiveCharacterTextSplitter``.

    NOTE(review): the embedding model is stored on the instance but is not
    consulted by any method of this class; splitting is driven only by the
    separator list and the size/overlap limits.
    """

    def __init__(self, embedding_model, chunk_size=500, chunk_overlap=50):
        """Store the embedding model and build the underlying splitter.

        Args:
            embedding_model: Kept as ``self.embedding_model``.
            chunk_size: Forwarded to the splitter as ``chunk_size``.
            chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
        """
        splitter_options = {
            "chunk_size": chunk_size,
            "chunk_overlap": chunk_overlap,
            # Tried in order: paragraph, line, word, then single character.
            "separators": ["\n\n", "\n", " ", ""],
        }
        self.embedding_model = embedding_model
        self.chunker = RecursiveCharacterTextSplitter(**splitter_options)

    def chunk_document(self, text):
        """Return the splitter's ``split_text`` result for *text*."""
        return self.chunker.split_text(text)

# Usage example
# NOTE(review): `embedding_model` is not defined in this snippet; it has to
# be created before this line runs.
chunker = SemanticChunker(embedding_model)
chunks = chunker.chunk_document("문서 내용...")
Enter fullscreen mode Exit fullscreen mode

2.2 재귀적 청킹 (Recursive Chunking)

문서 구조를 고려하여 여러 계층으로 분할합니다.

class RecursiveChunker:
    """Produce fixed-size chunks of a text at several granularities.

    For every configured size that the text exceeds, the text is cut into
    consecutive slices of that size and all slices are collected in one
    flat list (coarsest level first when sizes are given in descending
    order).
    """

    DEFAULT_CHUNK_SIZES = (1000, 500, 250)

    def __init__(self, chunk_sizes=None):
        """Configure the chunk sizes.

        Args:
            chunk_sizes: Iterable of slice lengths. ``None`` selects
                ``DEFAULT_CHUNK_SIZES``. The values are copied so that
                instances never share (or alias the caller's) list.
        """
        if chunk_sizes is None:
            chunk_sizes = self.DEFAULT_CHUNK_SIZES
        self.chunk_sizes = list(chunk_sizes)

    def recursive_chunk(self, text):
        """Return the slices of *text* for every size it exceeds.

        Args:
            text: The string to split.

        Returns:
            A list of substrings. Text that is not longer than any
            configured size is returned as a single chunk instead of being
            dropped; empty text yields an empty list.
        """
        chunks = []
        for size in self.chunk_sizes:
            if len(text) > size:
                # Cut into consecutive slices of ``size`` characters.
                chunks.extend(
                    text[start:start + size]
                    for start in range(0, len(text), size)
                )
        if not chunks and text:
            # No level applied: keep the short text rather than lose it.
            chunks.append(text)
        return chunks
Enter fullscreen mode Exit fullscreen mode

2.3 에이전트 기반 청킹 (Agentic Chunking)

문서의 주제별로 자동 분할합니다.

class AgentChunker:
    """Group the paragraphs of a document into chunks by topic label.

    ``topic_model`` is duck-typed: it must provide
    ``fit_predict(list_of_str)`` returning one label per input string.
    """

    def __init__(self, topic_model):
        self.topic_model = topic_model

    def chunk_by_topics(self, text, num_topics=3):
        """Split *text* into chunks of adjacent paragraphs sharing a topic.

        The text is first cut into paragraphs (blank-line separated); the
        topic model labels each paragraph; neighbouring paragraphs with the
        same label are merged into one chunk.

        Args:
            text: The document to split.
            num_topics: Kept for interface compatibility; it is not
                forwarded to the topic model.

        Returns:
            A list of chunk strings in document order. Empty or
            whitespace-only text yields an empty list.

        Raises:
            ValueError: If the model returns a different number of labels
                than there are paragraphs.
        """
        segments = self._split_segments(text)
        if not segments:
            return []
        # Label every paragraph; a single whole-document label could not
        # partition anything.
        topics = self.topic_model.fit_predict(segments)
        return self._create_chunks_by_topic(segments, topics)

    @staticmethod
    def _split_segments(text):
        """Return the non-empty, stripped paragraphs of *text*."""
        return [part.strip() for part in text.split("\n\n") if part.strip()]

    @staticmethod
    def _create_chunks_by_topic(segments, topics):
        """Merge adjacent segments that carry the same topic label."""
        labels = list(topics)
        if len(labels) != len(segments):
            raise ValueError(
                f"topic model returned {len(labels)} labels "
                f"for {len(segments)} segments"
            )
        chunks = []
        previous = None
        for label, segment in zip(labels, segments):
            if chunks and label == previous:
                chunks[-1] = f"{chunks[-1]}\n\n{segment}"
            else:
                chunks.append(segment)
            previous = label
        return chunks
Enter fullscreen mode Exit fullscreen mode

3. 임베딩 모델 선택과 비교

3.1 다양한 임베딩 모델 비교

from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

class EmbeddingComparison:
    """Load several sentence-embedding models and report basic statistics."""

    # Models compared when the caller does not name any.
    DEFAULT_MODEL_NAMES = ('all-MiniLM-L6-v2', 'bge-small-en')

    def __init__(self):
        # All four models are instantiated eagerly, keyed by a short name.
        self.models = {
            'all-MiniLM-L6-v2': SentenceTransformer('all-MiniLM-L6-v2'),
            'all-mpnet-base-v2': SentenceTransformer('all-mpnet-base-v2'),
            'bge-small-en': SentenceTransformer('BAAI/bge-small-en'),
            'gte-small': SentenceTransformer('thenlper/gte-small')
        }

    def compare_models(self, texts, model_names=None):
        """Encode *texts* with each named model and summarise the result.

        Args:
            texts: Strings passed to each model's ``encode``.
            model_names: Keys of ``self.models`` to evaluate. ``None``
                selects ``DEFAULT_MODEL_NAMES``.

        Returns:
            A dict mapping each model name to a dict with
            ``'embedding_shape'`` (the ``.shape`` of the encoded array) and
            ``'avg_norm'`` (mean L2 norm across rows).

        Raises:
            KeyError: If a name is not a key of ``self.models``.
        """
        if model_names is None:
            model_names = self.DEFAULT_MODEL_NAMES

        results = {}
        for name in model_names:
            embeddings = self.models[name].encode(texts)
            results[name] = {
                'embedding_shape': embeddings.shape,
                # Row-wise norms (axis=1), averaged over all texts.
                'avg_norm': np.linalg.norm(embeddings, axis=1).mean()
            }
        return results

# Comparison example
# Constructing EmbeddingComparison instantiates every model in its table.
comparison = EmbeddingComparison()
texts = ["문서 1 내용", "문서 2 내용"]
# No model_names given, so the method's default selection is compared.
results = comparison.compare_models(texts)
print(results)
Enter fullscreen mode Exit fullscreen mode

3.2 성능 기준: 속도 vs 정확도

import time

def benchmark_embedding_model(model, texts, iterations=10):
    """Time ``model.encode(texts)`` over several runs.

    Args:
        model: Object exposing ``encode(texts)`` whose result has a
            ``.shape`` attribute.
        texts: Batch of inputs encoded on every iteration.
        iterations: Number of timed runs; must be at least 1.

    Returns:
        A dict with ``'avg_time'`` (mean seconds per run),
        ``'throughput'`` (texts per second, ``inf`` if the measured time is
        zero) and ``'embedding_size'`` (``.shape`` of the last result).

    Raises:
        ValueError: If *iterations* is smaller than 1.
    """
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    times = []
    for _ in range(iterations):
        # perf_counter is monotonic and high-resolution, unlike time.time.
        start = time.perf_counter()
        embeddings = model.encode(texts)
        times.append(time.perf_counter() - start)

    avg_time = sum(times) / len(times)
    # Guard against a zero reading from a very fast encode.
    throughput = len(texts) / avg_time if avg_time > 0 else float("inf")
    return {
        'avg_time': avg_time,
        'throughput': throughput,
        'embedding_size': embeddings.shape
    }

# Compare a fast model against a more accurate one
fast_model = SentenceTransformer('all-MiniLM-L6-v2')
accurate_model = SentenceTransformer('all-mpnet-base-v2')

# NOTE(review): `texts` is not defined in this snippet; presumably it is the
# list from the earlier comparison example - confirm before running.
fast_benchmark = benchmark_embedding_model(fast_model, texts)
accurate_benchmark = benchmark_embedding_model(accurate_model, texts)
Enter fullscreen mode Exit fullscreen mode

4. 벡터 데이터베이스 비교: Chroma vs Qdrant vs pgvector vs Milvus

4.1 Chroma (로컬용)

import chromadb
from chromadb import Client

class ChromaVectorDB:
    """Thin wrapper around a Chroma collection named ``rag_docs``."""

    def __init__(self, path="./chroma_db"):
        """Open (or create) an on-disk Chroma store at *path*.

        ``chromadb.Client`` takes a settings object rather than a directory,
        so the persistent client is used to honour *path*.
        """
        self.client = chromadb.PersistentClient(path=path)
        self.collection = self.client.get_or_create_collection("rag_docs")

    def add_documents(self, documents, embeddings, ids):
        """Add documents with their precomputed embeddings and ids.

        The three arguments are forwarded unchanged to ``collection.add``.
        """
        self.collection.add(
            embeddings=embeddings,
            documents=documents,
            ids=ids
        )

    def search(self, query_embedding, k=5):
        """Return the *k* nearest documents for the query.

        Args:
            query_embedding: Forwarded as ``query_embeddings``; only the
                results of the first query are returned.
            k: Number of results requested.

        Returns:
            A list of ``{"text": ..., "score": ...}`` dicts, where
            ``score`` is the value Chroma reports under ``'distances'``.
        """
        results = self.collection.query(
            query_embeddings=query_embedding,
            n_results=k
        )
        # Index 0 selects the result list belonging to the first query.
        documents = results['documents'][0]
        distances = results['distances'][0]
        return [{"text": doc, "score": score}
                for doc, score in zip(documents, distances)]
Enter fullscreen mode Exit fullscreen mode

4.2 Qdrant

from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition, MatchValue

class QdrantVectorDB:
    """Small helper around a Qdrant collection named ``rag_docs``."""

    def __init__(self, host="localhost", port=6333):
        """Connect to the Qdrant server at *host*:*port*."""
        self.client = QdrantClient(host=host, port=port)
        self.collection_name = "rag_docs"

    def create_collection(self, vector_size=384):
        """(Re)create the collection with cosine distance.

        Args:
            vector_size: Dimensionality of the stored vectors.
        """
        vector_settings = {"size": vector_size, "distance": "Cosine"}
        self.client.recreate_collection(
            collection_name=self.collection_name,
            vectors_config=vector_settings,
        )

    def search(self, query_vector, k=5):
        """Return up to *k* hits for *query_vector*.

        Returns:
            A list of dicts with the ``id``, ``score`` and ``payload`` of
            each hit, in the order the client returned them.
        """
        hits = self.client.search(
            collection_name=self.collection_name,
            query_vector=query_vector,
            limit=k,
        )
        formatted = []
        for hit in hits:
            formatted.append(
                {"id": hit.id, "score": hit.score, "payload": hit.payload}
            )
        return formatted
Enter fullscreen mode Exit fullscreen mode

4.3 pgvector (PostgreSQL 확장)

import psycopg2
from psycopg2.extras import Json

class PGVectorDB:
    """Store and search RAG documents in PostgreSQL via pgvector."""

    def __init__(self, connection_string):
        """Connect with *connection_string* and make sure the schema exists."""
        self.conn = psycopg2.connect(connection_string)
        self.create_table()

    def create_table(self):
        """Create the extension, table and cosine index if they are missing."""
        with self.conn.cursor() as cur:
            # The VECTOR column type only exists once the extension is loaded.
            cur.execute("CREATE EXTENSION IF NOT EXISTS vector")
            cur.execute("""
                CREATE TABLE IF NOT EXISTS rag_documents (
                    id UUID PRIMARY KEY,
                    text TEXT,
                    embedding VECTOR(384),
                    metadata JSONB
                )
            """)
            cur.execute("""
                CREATE INDEX IF NOT EXISTS idx_embedding ON rag_documents 
                USING ivfflat (embedding vector_cosine_ops)
            """)
        self.conn.commit()

    @staticmethod
    def _to_vector_literal(query_vector):
        """Render *query_vector* in pgvector's text form, e.g. ``[0.1,0.2]``.

        Accepts a flat sequence, an object with ``tolist()``, or a
        single-row batch such as the result of ``encode([query])``.

        Raises:
            ValueError: If a nested input holds more than one vector.
        """
        if hasattr(query_vector, "tolist"):
            values = query_vector.tolist()
        else:
            values = list(query_vector)
        if values and isinstance(values[0], (list, tuple)):
            if len(values) != 1:
                raise ValueError("query_vector must contain exactly one vector")
            values = values[0]
        return "[" + ",".join(repr(float(v)) for v in values) + "]"

    def search(self, query_vector, k=5):
        """Return the *k* rows most similar to *query_vector*.

        Uses the cosine-distance operator ``<=>`` so the ranking matches the
        ``vector_cosine_ops`` index, and orders by the distance expression
        itself (ascending) so that index can serve the query.

        Args:
            query_vector: The query embedding (see ``_to_vector_literal``).
            k: Maximum number of rows.

        Returns:
            Rows of ``(id, text, embedding, metadata, similarity)`` where
            ``similarity`` is ``1 - cosine distance``, best match first.
        """
        vector_literal = self._to_vector_literal(query_vector)
        with self.conn.cursor() as cur:
            cur.execute("""
                SELECT id, text, embedding, metadata, 
                       1 - (embedding <=> %s::vector) AS similarity
                FROM rag_documents
                ORDER BY embedding <=> %s::vector
                LIMIT %s
            """, (vector_literal, vector_literal, k))
            return cur.fetchall()
Enter fullscreen mode Exit fullscreen mode

4.4 Milvus (분산 벡터 DB)


python
from pymilvus import Collection, FieldSchema, DataType, connections

class MilvusVectorDB:
    def __init__(self, host="localhost", port=19530):
        connections.connect("default", host=host, port=

---

📥 **Get the full guide on Gumroad**: https://gumroad.com/l/auto ($7)
Enter fullscreen mode Exit fullscreen mode

Top comments (0)