DEV Community

matias yoon
matias yoon

Posted on

RAG 시스템 실전 구축 (v23)

RAG 시스템 실전 구축 (v23)

Real-world RAG Implementation Guide for ML Engineers

1. RAG 기본 개념: 검색 → 보강 → 생성 루프

RAG (Retrieval-Augmented Generation) 시스템은 정보 검색과 언어 생성을 결합하여 정확한 답변을 생성하는 아키텍처입니다.

RAG 루프 구조:

  1. 검색 (Retrieval): 질문에 관련된 문서 조각 검색
  2. 보강 (Augmentation): 검색된 문서를 프롬프트에 추가
  3. 생성 (Generation): LLM이 보강된 프롬프트를 기반으로 답변 생성

2. Chunking 전략 비교

2.1 Semantic Chunking (권장)

from sentence_transformers import SentenceTransformer
import numpy as np

class SemanticChunker:
    def __init__(self, model_name='all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)

    def chunk_by_semantic(self, text, max_chunk_size=512):
        # 텍스트를 문장 단위로 분할
        sentences = text.split('. ')
        embeddings = self.model.encode(sentences)

        # 임베딩 기반 유사도를 사용하여 chunk 분할
        chunks = []
        current_chunk = []
        current_length = 0

        for i, (sentence, emb) in enumerate(zip(sentences, embeddings)):
            if current_length + len(sentence) > max_chunk_size:
                chunks.append(' '.join(current_chunk))
                current_chunk = [sentence]
                current_length = len(sentence)
            else:
                current_chunk.append(sentence)
                current_length += len(sentence)

        if current_chunk:
            chunks.append(' '.join(current_chunk))
        return chunks

# 사용 예시
chunker = SemanticChunker()
text = "RAG 시스템은 검색과 생성을 결합한 아키텍처입니다. 이 시스템은 기존 문서에서 관련 정보를 검색하고, 그 정보를 기반으로 정확한 답변을 생성합니다."
chunks = chunker.chunk_by_semantic(text)
Enter fullscreen mode Exit fullscreen mode

2.2 Recursive Chunking

import re

class RecursiveChunker:
    def __init__(self, max_chunk_size=512):
        self.max_chunk_size = max_chunk_size

    def chunk_recursive(self, text):
        # 헤딩 단위로 분할
        headings = re.findall(r'^(#{1,6})\s+(.*)', text, re.MULTILINE)
        chunks = []

        # 각 heading의 내용을 chunk로 분할
        for i, (level, title) in enumerate(headings):
            start = text.find(title)
            end = text.find('\n\n', start) if i < len(headings) - 1 else len(text)
            chunk_content = text[start:end].strip()

            if len(chunk_content) > self.max_chunk_size:
                # 재귀적으로 chunk 분할
                sub_chunks = self._split_chunk(chunk_content)
                chunks.extend(sub_chunks)
            else:
                chunks.append(chunk_content)

        return chunks

    def _split_chunk(self, chunk_text):
        # 긴 chunk를 최대 길이로 나눔
        chunks = []
        while len(chunk_text) > self.max_chunk_size:
            split_point = chunk_text.rfind(' ', 0, self.max_chunk_size)
            if split_point == -1:
                split_point = self.max_chunk_size
            chunks.append(chunk_text[:split_point])
            chunk_text = chunk_text[split_point:].lstrip()
        chunks.append(chunk_text)
        return chunks
Enter fullscreen mode Exit fullscreen mode

3. 임베딩 모델 선택과 비교

3.1 모델 성능 비교 스크립트

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import time

class EmbeddingBenchmark:
    def __init__(self):
        self.models = {
            'all-MiniLM-L6-v2': SentenceTransformer('all-MiniLM-L6-v2'),
            'all-mpnet-base-v2': SentenceTransformer('all-mpnet-base-v2'),
            'sentence-t5-base': SentenceTransformer('sentence-t5-base')
        }

    def compare_models(self, query, documents):
        results = {}

        for name, model in self.models.items():
            # 임베딩 생성
            start_time = time.time()
            query_embedding = model.encode([query])
            doc_embeddings = model.encode(documents)
            end_time = time.time()

            # 유사도 계산
            similarities = cosine_similarity(query_embedding, doc_embeddings)[0]

            results[name] = {
                'time': end_time - start_time,
                'similarities': similarities,
                'top_matches': np.argsort(similarities)[-3:][::-1]
            }

        return results

# 테스트
benchmark = EmbeddingBenchmark()
documents = [
    "RAG 시스템은 검색과 생성을 결합한 아키텍처입니다.",
    "대규모 언어 모델은 자연어 처리에서 중요한 역할을 합니다.",
    "임베딩 모델은 문장을 벡터로 변환합니다."
]
results = benchmark.compare_models("RAG 시스템과 임베딩 모델", documents)
Enter fullscreen mode Exit fullscreen mode

3.2 추천 설정

  • Production: all-mpnet-base-v2 (정확성과 성능의 균형)
  • Dev: all-MiniLM-L6-v2 (속도 우선)
  • Local: sentence-t5-base (작은 크기로 로컬 실행 가능)

4. Vector Database 비교

4.1 Chroma vs Qdrant vs pgvector vs Milvus

# Chroma 설정
import chromadb
from chromadb import Client

class ChromaDB:
    def __init__(self, collection_name="rag_collection"):
        self.client = Client()
        self.collection = self.client.get_or_create_collection(collection_name)

    def add_documents(self, documents, embeddings, metadatas=None):
        self.collection.add(
            documents=documents,
            embeddings=embeddings,
            metadatas=metadatas
        )

    def search(self, query_embedding, limit=5):
        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=limit
        )
        return results

# Qdrant 설정
from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition, MatchValue

class QdrantDB:
    def __init__(self, host="localhost", port=6333, collection_name="rag_collection"):
        self.client = QdrantClient(host=host, port=port)
        self.collection_name = collection_name

    def create_collection(self, vector_size=768):
        self.client.recreate_collection(
            collection_name=self.collection_name,
            vectors_config={"size": vector_size, "distance": "Cosine"}
        )

    def search(self, query_vector, limit=5):
        results = self.client.search(
            collection_name=self.collection_name,
            query_vector=query_vector,
            limit=limit
        )
        return results

# pgvector (PostgreSQL 확장)
import psycopg2
from psycopg2.extras import Json

class PGVectorDB:
    def __init__(self, connection_string):
        self.conn = psycopg2.connect(connection_string)
        self.create_table()

    def create_table(self):
        with self.conn.cursor() as cur:
            cur.execute("""
                CREATE TABLE IF NOT EXISTS rag_documents (
                    id SERIAL PRIMARY KEY,
                    content TEXT,
                    embedding VECTOR(768),
                    metadata JSONB
                )
            """)
            cur.execute("""
                CREATE INDEX IF NOT EXISTS idx_embedding ON rag_documents 
                USING ivfflat (embedding vector_cosine_ops)
            """)
        self.conn.commit()

    def search(self, query_embedding, limit=5):
        with self.conn.cursor() as cur:
            cur.execute("""
                SELECT content, metadata, 
                       1 - (embedding <-> %s) as similarity
                FROM rag_documents
                ORDER BY similarity DESC
                LIMIT %s
            """, (query_embedding, limit))
            return cur.fetchall()
Enter fullscreen mode Exit fullscreen mode

성능 비교 (100K 문서 기준):

DB 검색 시간 (1000회) 메모리 사용량 설치 복잡도
Chroma 15-20ms 500MB 간단
Qdrant 12-18ms 800MB 간단
pgvector 20-25ms 1GB 복잡
Milvus 8-15ms 1.5GB 복잡

5. 완전한 RAG 파이프라인 코드


python
import os
import numpy as np
from sentence_transformers import SentenceTransformer
from chromadb import Client
import json
from typing import List, Dict, Tuple

class RAGPipeline:
    def __init__(self, 
                 embedding_model_name='all-mpnet-base-v2',
                 vector_db_path='./chroma_db'):
        self.embedding_model = SentenceTransformer(embed

---

📥 **Get the full guide on Gumroad**: https://gumroad.com/l/auto ($7)
Enter fullscreen mode Exit fullscreen mode

Top comments (0)