DEV Community

matias yoon
matias yoon

Posted on

RAG 시스템 실전 구축 (v26)

RAG 시스템 실전 구축 (v26)

개요

이 가이드는 실전에서 RAG(Retrieval-Augmented Generation) 시스템을 구축하는 데 필요한 모든 단계를 다룹니다. 개발자들은 단순한 RAG 시스템을 구현하는 것에서 벗어나 실제 운영 환경에서의 성능, 비용, 유지보수를 고려한 완전한 솔루션을 만들고자 합니다.

1. RAG 기초: 검색 → 보완 → 생성 루프

RAG 시스템은 세 가지 핵심 단계로 구성됩니다:

  1. 검색(Retrieval): 사용자의 질문과 관련된 문서 조각들을 데이터베이스에서 찾습니다.
  2. 보완(Augmentation): 검색된 문서를 프롬프트에 추가하여 생성 모델이 더 많은 컨텍스트를 갖도록 합니다.
  3. 생성(Generation): 보완된 프롬프트를 기반으로 답변을 생성합니다.
# 기본 RAG 루프 구현
class BasicRAG:
    """Minimal retrieve -> augment -> generate loop.

    Args:
        embedding_model: Object exposing ``encode(list_of_texts)``.
        vector_db: Object exposing ``search(query_embedding, top_k)``.
        llm: Object exposing ``generate(prompt)``. Optional so the original
            two-argument constructor call keeps working; it can also be
            assigned later through the ``llm`` attribute.
    """

    def __init__(self, embedding_model, vector_db, llm=None):
        self.embedding_model = embedding_model
        self.vector_db = vector_db
        # The original never assigned ``self.llm`` although ``generate``
        # reads it, so every call to ``run`` failed with AttributeError.
        self.llm = llm

    def retrieve(self, query, top_k=5):
        """Embed ``query`` and return whatever ``vector_db.search`` yields."""
        query_embedding = self.embedding_model.encode([query])
        return self.vector_db.search(query_embedding, top_k)

    def generate(self, query, context):
        """Build the augmented prompt and return ``llm.generate(prompt)``.

        ``context`` is interpolated with ``str()`` formatting, so a list of
        hits appears in the prompt in its Python ``repr`` form.

        Raises:
            RuntimeError: If no LLM has been configured.
        """
        if self.llm is None:
            raise RuntimeError(
                "BasicRAG has no LLM configured; pass llm=... to the "
                "constructor or assign the 'llm' attribute before generating."
            )
        prompt = f"Context: {context}\n\nQuestion: {query}\n\nAnswer:"
        return self.llm.generate(prompt)

    def run(self, query):
        """Retrieve context for ``query`` (default ``top_k``) and answer it."""
        context = self.retrieve(query)
        return self.generate(query, context)
Enter fullscreen mode Exit fullscreen mode

2. 청킹 전략: 의미적, 재귀적, 에이전트 기반

청킹 전략은 문서를 모델이 이해할 수 있는 단위로 분할하는 방법입니다.

의미적 청킹 (Semantic Chunking)

from sentence_transformers import SentenceTransformer
import numpy as np

class SemanticChunker:
    """Group sentences into chunks bounded by word count and, optionally,
    by drops in embedding similarity between neighbouring sentences."""

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)

    def chunk(self, text, max_chunk_size=512, similarity_threshold=None):
        """Split ``text`` into chunks of whole sentences.

        Sentences are obtained by splitting on ``'.'``; the periods themselves
        are not kept. Fragments are stripped and empty ones dropped, so the
        result never contains empty chunks or stray padding.

        Args:
            text: Input text.
            max_chunk_size: Maximum number of whitespace-separated words per
                chunk. A single sentence longer than this becomes its own
                chunk rather than being cut.
            similarity_threshold: When given, a new chunk also starts wherever
                the cosine similarity between two consecutive sentence
                embeddings is below this value. When ``None`` (default) the
                model is not invoked and only the size limit applies.

        Returns:
            List of chunk strings (sentences joined with single spaces).
        """
        sentences = [part.strip() for part in text.split('.')]
        sentences = [sentence for sentence in sentences if sentence]
        if not sentences:
            return []

        topic_breaks = self._topic_breaks(sentences, similarity_threshold)

        chunks = []
        current_chunk = []
        current_length = 0
        for index, sentence in enumerate(sentences):
            word_count = len(sentence.split())
            over_budget = current_length + word_count > max_chunk_size
            # Flush only a non-empty chunk: the original appended '' when the
            # very first sentence already exceeded the size limit.
            if current_chunk and (over_budget or index in topic_breaks):
                chunks.append(' '.join(current_chunk))
                current_chunk = []
                current_length = 0
            current_chunk.append(sentence)
            current_length += word_count

        chunks.append(' '.join(current_chunk))
        return chunks

    def _topic_breaks(self, sentences, similarity_threshold):
        """Return indices of sentences that should start a new chunk because
        they are dissimilar to the sentence immediately before them."""
        if similarity_threshold is None or len(sentences) < 2:
            return set()

        embeddings = np.asarray(self.model.encode(sentences), dtype=float)
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        norms[norms == 0] = 1.0  # avoid division by zero for null vectors
        unit = embeddings / norms
        # Cosine similarity of each sentence with its successor.
        similarities = np.sum(unit[:-1] * unit[1:], axis=1)
        return {
            position + 1
            for position, similarity in enumerate(similarities)
            if similarity < similarity_threshold
        }
Enter fullscreen mode Exit fullscreen mode

재귀적 청킹 (Recursive Chunking)

class RecursiveChunker:
    """Fixed-size character windows in which each chunk repeats the last
    ``overlap`` characters of the previous one.

    Args:
        chunk_size: Maximum number of characters per chunk (must be > 0).
        overlap: Characters shared between consecutive chunks
            (``0 <= overlap < chunk_size``).

    Raises:
        ValueError: If the parameters would prevent the window from advancing.
    """

    def __init__(self, chunk_size=512, overlap=50):
        self._validate(chunk_size, overlap)
        self.chunk_size = chunk_size
        self.overlap = overlap

    @staticmethod
    def _validate(chunk_size, overlap):
        """Reject settings for which the window could not move forward."""
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if not 0 <= overlap < chunk_size:
            raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    def chunk(self, text):
        """Return the list of overlapping chunks covering ``text``.

        An empty ``text`` yields ``[]``; text no longer than ``chunk_size``
        yields a single chunk.
        """
        # Re-check in case the attributes were changed after construction.
        self._validate(self.chunk_size, self.overlap)

        step = self.chunk_size - self.overlap
        chunks = []
        start = 0
        while start < len(text):
            end = start + self.chunk_size
            chunks.append(text[start:end])
            # Stop once the window reaches the end of the text. The original
            # reset ``start`` to ``end - overlap`` here, which never reaches
            # ``len(text)`` when overlap > 0 and therefore looped forever.
            if end >= len(text):
                break
            start += step

        return chunks
Enter fullscreen mode Exit fullscreen mode

3. 임베딩 모델 선택과 비교

모델 비교 테스트

import time
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
import torch

class EmbeddingBenchmark:
    """Time ``encode`` and estimate embedding size for several models."""

    def __init__(self):
        # All models are loaded eagerly at construction time.
        # NOTE(review): 'sentence-tiny' does not look like a published
        # sentence-transformers checkpoint name -- confirm it resolves.
        self.models = {
            'all-MiniLM-L6-v2': SentenceTransformer('all-MiniLM-L6-v2'),
            'all-mpnet-base-v2': SentenceTransformer('all-mpnet-base-v2'),
            'sentence-tiny': SentenceTransformer('sentence-tiny')
        }

    def benchmark(self, texts):
        """Encode ``texts`` with every model and collect timing and size.

        Args:
            texts: Sequence of strings to encode; may be empty.

        Returns:
            Dict mapping model name to ``{'time': seconds, 'memory': bytes}``.
            ``memory`` is the result's own ``nbytes`` when it exposes one;
            otherwise it is estimated as rows * columns * 4 (float32).
        """
        results = {}
        for name, model in self.models.items():
            # perf_counter is monotonic and high resolution, unlike time.time.
            start_time = time.perf_counter()
            embeddings = model.encode(texts)
            elapsed = time.perf_counter() - start_time

            results[name] = {
                'time': elapsed,
                'memory': self._embedding_bytes(embeddings),
            }
        return results

    @staticmethod
    def _embedding_bytes(embeddings):
        """Return the size in bytes of an embedding matrix, 0 when empty."""
        nbytes = getattr(embeddings, 'nbytes', None)
        if nbytes is not None:
            return int(nbytes)
        row_count = len(embeddings)
        # Guard the empty case: the original indexed embeddings[0] blindly.
        dimension = len(embeddings[0]) if row_count else 0
        return row_count * dimension * 4  # float32

# Usage example: constructing EmbeddingBenchmark loads every model listed in
# its __init__, then benchmark() encodes the three sample strings with each.
benchmark = EmbeddingBenchmark()
test_texts = ["This is test text 1", "This is test text 2", "Another sample text"]
results = benchmark.benchmark(test_texts)
Enter fullscreen mode Exit fullscreen mode

성능 기준 정리

  • all-MiniLM-L6-v2: 최적의 균형 (384차원, 50MB)
  • all-mpnet-base-v2: 높은 정확도 (768차원, 100MB)
  • sentence-tiny: 빠른 속도 (128차원, 20MB)

4. 벡터 데이터베이스 비교

Chroma vs Qdrant vs pgvector vs Milvus

# Chroma 예제
import chromadb
from chromadb.utils import embedding_functions

class ChromaRAG:
    """Thin wrapper around a Chroma collection named ``rag_collection``."""

    def __init__(self):
        """Create a Chroma client and get or create the collection, using
        Chroma's default embedding function."""
        self.client = chromadb.Client()
        default_embedder = embedding_functions.DefaultEmbeddingFunction()
        self.collection = self.client.get_or_create_collection(
            name="rag_collection", embedding_function=default_embedder
        )

    def add_documents(self, documents, ids):
        """Add ``documents`` to the collection under the given ``ids``."""
        payload = {"documents": documents, "ids": ids}
        self.collection.add(**payload)

    def search(self, query, top_k=5):
        """Query the collection with the text ``query`` and return the
        documents matched for it (at most ``top_k`` are requested)."""
        response = self.collection.query(query_texts=[query], n_results=top_k)
        # One result list per query text; a single query was sent.
        documents_per_query = response['documents']
        return documents_per_query[0]

# Qdrant 예제
from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition, MatchValue

class QdrantRAG:
    """Wrapper around a Qdrant collection named ``rag_collection``."""

    def __init__(self):
        """Connect to Qdrant at localhost:6333."""
        self.client = QdrantClient(host="localhost", port=6333)
        self.collection_name = "rag_collection"

    def create_collection(self):
        """Recreate the collection with 384-dimensional cosine vectors."""
        vector_settings = {"size": 384, "distance": "Cosine"}
        self.client.recreate_collection(
            collection_name=self.collection_name,
            vectors_config=vector_settings,
        )

    def search(self, query, top_k=5):
        """Return the ``text`` payload of each hit for ``query``.

        ``query`` is forwarded to the client as ``query_vector``; at most
        ``top_k`` hits are requested.
        """
        hits = self.client.search(
            collection_name=self.collection_name,
            query_vector=query,
            limit=top_k,
        )
        texts = []
        for hit in hits:
            texts.append(hit.payload['text'])
        return texts

# pgvector 예제
import psycopg2
from psycopg2.extras import RealDictCursor

class PGVectorRAG:
    """Similarity search over a ``documents`` table using pgvector."""

    def __init__(self, connection_string):
        """Open a psycopg2 connection from ``connection_string``."""
        self.conn = psycopg2.connect(connection_string)

    def search(self, query_embedding, top_k=5):
        """Return the ``top_k`` rows closest to ``query_embedding``.

        Args:
            query_embedding: Query vector, bound as a parameter and cast to
                ``vector`` inside the SQL statement.
            top_k: Maximum number of rows to return.

        Returns:
            Whatever ``cursor.fetchall()`` yields; each row carries ``text``
            and ``similarity`` (cosine similarity, higher is closer).
        """
        with self.conn.cursor(cursor_factory=RealDictCursor) as cursor:
            # `<=>` is pgvector's cosine-distance operator, so 1 - distance is
            # a real cosine similarity (the original used `<->`, L2 distance).
            # Ordering by the distance expression ascending, rather than by
            # the alias descending, is the form pgvector indexes can serve.
            # The explicit ::vector cast lets a plain Python list be bound.
            cursor.execute("""
                SELECT text, 1 - (embedding <=> %s::vector) AS similarity
                FROM documents
                ORDER BY embedding <=> %s::vector
                LIMIT %s
            """, (query_embedding, query_embedding, top_k))
            return cursor.fetchall()
Enter fullscreen mode Exit fullscreen mode

성능 비교

| 데이터베이스 | 성능 | 메모리 사용량 | 설치 복잡도 |
| --- | --- | --- | --- |
| Chroma | 빠름 | 낮음 | 쉬움 |
| Qdrant | 빠름 | 중간 | 중간 |
| pgvector | 중간 | 높음 | 어려움 |
| Milvus | 빠름 | 높음 | 어려움 |

5. 전체 RAG 파이프라인 코드


python
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.utils import embedding_functions
import logging

class CompleteRAGPipeline:
    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """Load the embedding model, open a Chroma client, get or create the
        ``documents`` collection and attach a module-named logger."""
        self.embedding_model = SentenceTransformer(model_name)
        self.client = chromadb.Client()
        default_embedder = embedding_functions.DefaultEmbeddingFunction()
        self.collection = self.client.get_or_create_collection(
            name="documents", embedding_function=default_embedder
        )
        self.logger = logging.getLogger(__name__)

    def setup(self, documents, ids):
        """문서 추가 및 임베딩"""
        try:
            embeddings = self.embedding_model.encode(documents)
            self.collection.add(
                documents=documents,
                embeddings=embeddings.tolist(),
                ids=ids
            )
            self.logger.info(f"Added {len(documents)} documents")
        except Exception as e:
            self.logger.error(f"Error adding documents: {e}")

    def retrieve_context(self, query, top_k=5):
        """Retrieve the documents closest to ``query``.

        The query is embedded with ``self.embedding_model`` and sent as
        ``query_embeddings``. The original computed that embedding but then
        queried with ``query_texts``, which makes the collection embed the
        query with its own embedding function instead.

        Args:
            query: Query text.
            top_k: Number of results requested.

        Returns:
            Tuple ``(documents, distances)`` for the single query.
        """
        query_embedding = self.embedding_model.encode([query])
        results = self.collection.query(
            query_embeddings=query_embedding.tolist(),
            n_results=top_k,
            include=['documents', 'distances']
        )
        return results['documents'][0], results['distances'][0]

    def generate_answer(self, query, context):
        """답변 생성 (모델 사용)"""
        prompt = f"""
        사용자 질문: {query}


---

📥 **Get the full guide on Gumroad**: https://gumroad.com/l/auto ($7)
Enter fullscreen mode Exit fullscreen mode

Top comments (0)