matias yoon

Posted on May 25

RAG 시스템 실전 구축 (v23)

#ai #llm #developers #tutorial

RAG 시스템 실전 구축 (v23)

Real-world RAG Implementation Guide for ML Engineers

1. RAG 기본 개념: 검색 → 보강 → 생성 루프

RAG (Retrieval-Augmented Generation) 시스템은 정보 검색과 언어 생성을 결합하여 정확한 답변을 생성하는 아키텍처입니다.

RAG 루프 구조:

검색 (Retrieval): 질문에 관련된 문서 조각 검색
보강 (Augmentation): 검색된 문서를 프롬프트에 추가
생성 (Generation): LLM이 보강된 프롬프트를 기반으로 답변 생성

2. Chunking 전략 비교

2.1 Semantic Chunking (권장)

from sentence_transformers import SentenceTransformer
import numpy as np

class SemanticChunker:
    def __init__(self, model_name='all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)

    def chunk_by_semantic(self, text, max_chunk_size=512):
        # 텍스트를 문장 단위로 분할
        sentences = text.split('. ')
        embeddings = self.model.encode(sentences)

        # 임베딩 기반 유사도를 사용하여 chunk 분할
        chunks = []
        current_chunk = []
        current_length = 0

        for i, (sentence, emb) in enumerate(zip(sentences, embeddings)):
            if current_length + len(sentence) > max_chunk_size:
                chunks.append(' '.join(current_chunk))
                current_chunk = [sentence]
                current_length = len(sentence)
            else:
                current_chunk.append(sentence)
                current_length += len(sentence)

        if current_chunk:
            chunks.append(' '.join(current_chunk))
        return chunks

# 사용 예시
chunker = SemanticChunker()
text = "RAG 시스템은 검색과 생성을 결합한 아키텍처입니다. 이 시스템은 기존 문서에서 관련 정보를 검색하고, 그 정보를 기반으로 정확한 답변을 생성합니다."
chunks = chunker.chunk_by_semantic(text)

2.2 Recursive Chunking

import re

class RecursiveChunker:
    def __init__(self, max_chunk_size=512):
        self.max_chunk_size = max_chunk_size

    def chunk_recursive(self, text):
        # 헤딩 단위로 분할
        headings = re.findall(r'^(#{1,6})\s+(.*)', text, re.MULTILINE)
        chunks = []

        # 각 heading의 내용을 chunk로 분할
        for i, (level, title) in enumerate(headings):
            start = text.find(title)
            end = text.find('\n\n', start) if i < len(headings) - 1 else len(text)
            chunk_content = text[start:end].strip()

            if len(chunk_content) > self.max_chunk_size:
                # 재귀적으로 chunk 분할
                sub_chunks = self._split_chunk(chunk_content)
                chunks.extend(sub_chunks)
            else:
                chunks.append(chunk_content)

        return chunks

    def _split_chunk(self, chunk_text):
        # 긴 chunk를 최대 길이로 나눔
        chunks = []
        while len(chunk_text) > self.max_chunk_size:
            split_point = chunk_text.rfind(' ', 0, self.max_chunk_size)
            if split_point == -1:
                split_point = self.max_chunk_size
            chunks.append(chunk_text[:split_point])
            chunk_text = chunk_text[split_point:].lstrip()
        chunks.append(chunk_text)
        return chunks

3. 임베딩 모델 선택과 비교

3.1 모델 성능 비교 스크립트

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import time

class EmbeddingBenchmark:
    def __init__(self):
        self.models = {
            'all-MiniLM-L6-v2': SentenceTransformer('all-MiniLM-L6-v2'),
            'all-mpnet-base-v2': SentenceTransformer('all-mpnet-base-v2'),
            'sentence-t5-base': SentenceTransformer('sentence-t5-base')
        }

    def compare_models(self, query, documents):
        results = {}

        for name, model in self.models.items():
            # 임베딩 생성
            start_time = time.time()
            query_embedding = model.encode([query])
            doc_embeddings = model.encode(documents)
            end_time = time.time()

            # 유사도 계산
            similarities = cosine_similarity(query_embedding, doc_embeddings)[0]

            results[name] = {
                'time': end_time - start_time,
                'similarities': similarities,
                'top_matches': np.argsort(similarities)[-3:][::-1]
            }

        return results

# 테스트
benchmark = EmbeddingBenchmark()
documents = [
    "RAG 시스템은 검색과 생성을 결합한 아키텍처입니다.",
    "대규모 언어 모델은 자연어 처리에서 중요한 역할을 합니다.",
    "임베딩 모델은 문장을 벡터로 변환합니다."
]
results = benchmark.compare_models("RAG 시스템과 임베딩 모델", documents)

3.2 추천 설정

Production: all-mpnet-base-v2 (정확성과 성능의 균형)
Dev: all-MiniLM-L6-v2 (속도 우선)
Local: sentence-t5-base (작은 크기로 로컬 실행 가능)

4. Vector Database 비교

4.1 Chroma vs Qdrant vs pgvector vs Milvus

# Chroma 설정
import chromadb
from chromadb import Client

class ChromaDB:
    def __init__(self, collection_name="rag_collection"):
        self.client = Client()
        self.collection = self.client.get_or_create_collection(collection_name)

    def add_documents(self, documents, embeddings, metadatas=None):
        self.collection.add(
            documents=documents,
            embeddings=embeddings,
            metadatas=metadatas
        )

    def search(self, query_embedding, limit=5):
        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=limit
        )
        return results

# Qdrant 설정
from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition, MatchValue

class QdrantDB:
    def __init__(self, host="localhost", port=6333, collection_name="rag_collection"):
        self.client = QdrantClient(host=host, port=port)
        self.collection_name = collection_name

    def create_collection(self, vector_size=768):
        self.client.recreate_collection(
            collection_name=self.collection_name,
            vectors_config={"size": vector_size, "distance": "Cosine"}
        )

    def search(self, query_vector, limit=5):
        results = self.client.search(
            collection_name=self.collection_name,
            query_vector=query_vector,
            limit=limit
        )
        return results

# pgvector (PostgreSQL 확장)
import psycopg2
from psycopg2.extras import Json

class PGVectorDB:
    def __init__(self, connection_string):
        self.conn = psycopg2.connect(connection_string)
        self.create_table()

    def create_table(self):
        with self.conn.cursor() as cur:
            cur.execute("""
                CREATE TABLE IF NOT EXISTS rag_documents (
                    id SERIAL PRIMARY KEY,
                    content TEXT,
                    embedding VECTOR(768),
                    metadata JSONB
                )
            """)
            cur.execute("""
                CREATE INDEX IF NOT EXISTS idx_embedding ON rag_documents 
                USING ivfflat (embedding vector_cosine_ops)
            """)
        self.conn.commit()

    def search(self, query_embedding, limit=5):
        with self.conn.cursor() as cur:
            cur.execute("""
                SELECT content, metadata, 
                       1 - (embedding <-> %s) as similarity
                FROM rag_documents
                ORDER BY similarity DESC
                LIMIT %s
            """, (query_embedding, limit))
            return cur.fetchall()

성능 비교 (100K 문서 기준):

DB	검색 시간 (1000회)	메모리 사용량	설치 복잡도
Chroma	15-20ms	500MB	간단
Qdrant	12-18ms	800MB	간단
pgvector	20-25ms	1GB	복잡
Milvus	8-15ms	1.5GB	복잡

5. 완전한 RAG 파이프라인 코드


python
import os
import numpy as np
from sentence_transformers import SentenceTransformer
from chromadb import Client
import json
from typing import List, Dict, Tuple

class RAGPipeline:
    def __init__(self, 
                 embedding_model_name='all-mpnet-base-v2',
                 vector_db_path='./chroma_db'):
        self.embedding_model = SentenceTransformer(embed

---

📥 **Get the full guide on Gumroad**: https://gumroad.com/l/auto ($7)

DEV Community

RAG 시스템 실전 구축 (v23)

RAG 시스템 실전 구축 (v23)

1. RAG 기본 개념: 검색 → 보강 → 생성 루프

2. Chunking 전략 비교

2.1 Semantic Chunking (권장)

2.2 Recursive Chunking

3. 임베딩 모델 선택과 비교

3.1 모델 성능 비교 스크립트

3.2 추천 설정

4. Vector Database 비교

4.1 Chroma vs Qdrant vs pgvector vs Milvus

5. 완전한 RAG 파이프라인 코드

Top comments (0)