DEV Community

matias yoon
matias yoon

Posted on

RAG 시스템 실전 구축 (v11)

RAG 시스템 실전 구축 (v11)

1. RAG 시스템의 핵심 구성 요소

RAG (Retrieval-Augmented Generation) 시스템은 세 가지 핵심 단계로 구성됩니다:

Retrieval → Augmentation → Generation 루프

Input Query → Retrieval → Context Augmentation → LLM Generation → Output
Enter fullscreen mode Exit fullscreen mode
  1. Retrieval: 문서에서 관련 정보 검색
  2. Augmentation: 검색된 정보를 프롬프트에 추가
  3. Generation: LLM이 증강된 컨텍스트를 기반으로 응답 생성

2. 청킹 전략

2.1 Semantic Chunking (의미 기반 청킹)

from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.cluster import KMeans

class SemanticChunker:
    """Group adjacent, semantically similar sentences into chunks.

    Sentences are embedded with a SentenceTransformer model and clustered
    with K-means. Consecutive sentences that fall into the same cluster are
    merged into a single chunk, so chunks keep the original reading order.
    """

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        """Load the sentence-embedding model identified by ``model_name``."""
        self.model = SentenceTransformer(model_name)

    def chunk_by_semantic(self, text, max_chunk_size=512):
        """Split ``text`` into semantically coherent chunks.

        Args:
            text: Input text. Sentences are delimited by ``'. '``.
            max_chunk_size: Upper bound, in characters, for a merged chunk.
                A single sentence longer than this is emitted unsplit.

        Returns:
            A list of chunk strings in document order. Empty input yields
            an empty list.
        """
        # Split on sentence boundaries and drop whitespace-only fragments.
        sentences = [part.strip() for part in text.split('. ')]
        sentences = [sentence for sentence in sentences if sentence]
        if not sentences:
            return []
        if len(sentences) == 1:
            # Nothing to cluster; avoid loading embeddings for one sentence.
            return sentences

        embeddings = self.model.encode(sentences)

        # Roughly three sentences per cluster. The fixed seed keeps the
        # chunking reproducible between runs.
        n_clusters = max(1, len(sentences) // 3)
        kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
        labels = kmeans.fit_predict(embeddings)

        # Merge neighbouring sentences that share a cluster label, starting
        # a new chunk when the label changes or the size cap would be hit.
        chunks = []
        current = sentences[0]
        for prev_label, label, sentence in zip(labels, labels[1:], sentences[1:]):
            # Re-insert the '. ' separator that split() removed.
            merged = f"{current}. {sentence}"
            if label == prev_label and len(merged) <= max_chunk_size:
                current = merged
            else:
                chunks.append(current)
                current = sentence
        chunks.append(current)

        return chunks
Enter fullscreen mode Exit fullscreen mode

2.2 Recursive Chunking (재귀적 청킹)

class RecursiveChunker:
    """Split text into fixed-size character windows that overlap.

    Despite the name, the implementation is an iterative sliding window:
    each chunk starts ``chunk_size - overlap`` characters after the
    previous one.
    """

    def __init__(self, chunk_size=512, overlap=50):
        """Store the window size and the overlap, both in characters."""
        self.chunk_size = chunk_size
        self.overlap = overlap

    def chunk_recursive(self, text):
        """Return the overlapping chunks of ``text``.

        Args:
            text: The string to split.

        Returns:
            A list of substrings, each at most ``chunk_size`` characters.
            Empty input yields an empty list.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``overlap`` is
                not in ``[0, chunk_size)``; such settings cannot make
                forward progress.
        """
        if self.chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if not 0 <= self.overlap < self.chunk_size:
            raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

        step = self.chunk_size - self.overlap
        chunks = []
        for start in range(0, len(text), step):
            end = start + self.chunk_size
            chunks.append(text[start:end])
            # Stop once a window reaches the end of the text; otherwise the
            # overlap would keep re-emitting the tail forever.
            if end >= len(text):
                break

        return chunks
Enter fullscreen mode Exit fullscreen mode

2.3 Agentic Chunking (엔티티 기반 청킹)

import re
from typing import List, Tuple

class AgenticChunker:
    """Extract fenced code blocks from text and split them by definition.

    Only triple-backtick fenced blocks are considered; text outside the
    fences is not returned.
    """

    def __init__(self, min_block_length: int = 200):
        """Set up the entity patterns.

        Args:
            min_block_length: Fenced blocks whose length (fences included)
                does not exceed this many characters are skipped.
        """
        self.min_block_length = min_block_length
        # Entity pattern definitions.
        self.patterns = {
            'code_blocks': r'```.*?```',
            'functions': r'def\s+\w+\s*\([^)]*\)',
            'classes': r'class\s+\w+',
            'variables': r'\w+\s*=\s*.*?[^a-zA-Z0-9_]',
        }

    def chunk_by_entities(self, text: str) -> List[str]:
        """Return chunks taken from the fenced code blocks in ``text``.

        Args:
            text: Markdown-like text that may contain fenced code blocks.

        Returns:
            Chunks from every sufficiently long block, in document order.
        """
        # DOTALL lets '.' span newlines so multi-line fences match.
        code_blocks = re.findall(self.patterns['code_blocks'], text, re.DOTALL)
        chunks: List[str] = []

        for block in code_blocks:
            # NOTE(review): short blocks are dropped rather than kept whole,
            # matching the original behaviour - confirm this is intended.
            if len(block) > self.min_block_length:
                chunks.extend(self._chunk_code_block(block))

        return chunks

    def _chunk_code_block(self, block: str) -> List[str]:
        """Split one fenced block at top-level ``def``/``class`` lines."""
        body = block
        if body.startswith('```'):
            body = body[3:]
        if body.endswith('```'):
            body = body[:-3]

        # The rest of the opening fence line is the info string (for
        # example "python"), not code.
        _info, newline, remainder = body.partition('\n')
        if newline:
            body = remainder

        # Zero-width split points in front of definitions that start at
        # column 0; indented methods stay with their class.
        boundary = re.compile(
            f"^(?=(?:{self.patterns['functions']})|(?:{self.patterns['classes']}))",
            re.MULTILINE,
        )
        pieces = boundary.split(body)
        return [piece.strip('\n') for piece in pieces if piece.strip()]
Enter fullscreen mode Exit fullscreen mode

3. 임베딩 모델 선택과 비교

3.1 모델 성능 비교

import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
import numpy as np

class EmbeddingBenchmark:
    """Compare encoding time, output size and dimensionality across models."""

    def __init__(self):
        """Instantiate every SentenceTransformer model to be compared."""
        self.models = {
            'all-MiniLM-L6-v2': SentenceTransformer('all-MiniLM-L6-v2'),
            'all-mpnet-base-v2': SentenceTransformer('all-mpnet-base-v2'),
            'BAAI/bge-small-en': SentenceTransformer('BAAI/bge-small-en'),
            'sentence-t5-XXL': SentenceTransformer('sentence-t5-XXL')
        }

    def benchmark_models(self, test_sentences):
        """Encode ``test_sentences`` with each model and collect metrics.

        Args:
            test_sentences: Non-empty sequence of strings to encode.

        Returns:
            A dict keyed by model name. Each value holds ``avg_time``
            (seconds per sentence), ``memory_usage`` (size of the returned
            embedding array in MiB, not the model's footprint) and
            ``dimension`` (second axis of the embedding array).

        Raises:
            ValueError: If ``test_sentences`` is empty.
        """
        # Local import: `time` is not among the imports visible above.
        import time

        if len(test_sentences) == 0:
            raise ValueError("test_sentences must not be empty")

        results = {}

        for name, model in self.models.items():
            # perf_counter is monotonic and has the best available
            # resolution, so it suits short timings.
            start_time = time.perf_counter()
            embeddings = model.encode(test_sentences)
            elapsed = time.perf_counter() - start_time

            results[name] = {
                'avg_time': elapsed / len(test_sentences),
                'memory_usage': embeddings.nbytes / 1024 / 1024,  # MiB
                'dimension': embeddings.shape[1],
            }

        return results

# Usage example.
# Constructing EmbeddingBenchmark instantiates every model listed in its
# __init__, so this runs at import time of the module.
benchmark = EmbeddingBenchmark()
test_sentences = [
    "RAG 시스템은 검색 기반 생성 기술입니다.",
    "이 시스템은 문서와 LLM을 연결합니다.",
    "검색된 컨텍스트를 기반으로 응답을 생성합니다."
]
# Maps each model name to its avg_time / memory_usage / dimension metrics.
results = benchmark.benchmark_models(test_sentences)
Enter fullscreen mode Exit fullscreen mode

3.2 성능 기준

| 모델 | 평균 응답 시간 | 메모리 사용량 | 차원 수 |
|---|---|---|---|
| all-MiniLM-L6-v2 | 0.012s | 150MB | 384 |
| all-mpnet-base-v2 | 0.021s | 300MB | 768 |
| BAAI/bge-small-en | 0.015s | 180MB | 384 |
| sentence-t5-XXL | 0.045s | 800MB | 768 |

4. 벡터 데이터베이스 비교

4.1 Chroma (가장 인기 있는 오픈소스)

import chromadb
from chromadb.config import Settings

class ChromaVectorDB:
    """Small wrapper around a single Chroma collection."""

    def __init__(self, collection_name="rag_collection", persist_directory="./chroma_db"):
        """Open (or create) the collection.

        Args:
            collection_name: Name of the collection to get or create.
            persist_directory: Directory used for on-disk storage.
        """
        if hasattr(chromadb, "PersistentClient"):
            # Chroma 0.4+ rejects the legacy `chroma_db_impl` setting and
            # provides PersistentClient for on-disk storage instead.
            self.client = chromadb.PersistentClient(path=persist_directory)
        else:
            # Older releases: keep the original configuration.
            self.client = chromadb.Client(Settings(
                chroma_db_impl="duckdb",
                persist_directory=persist_directory
            ))
        self.collection = self.client.get_or_create_collection(collection_name)

    def add_documents(self, documents, embeddings, metadata=None, ids=None):
        """Add documents with their embeddings to the collection.

        Args:
            documents: List of document strings.
            embeddings: One embedding per document.
            metadata: Optional list of metadata dicts, one per document.
            ids: Optional list of unique ids, one per document. Random
                UUIDs are generated when omitted, because Chroma requires
                an id for every record.

        Returns:
            The list of ids that were stored.

        Raises:
            ValueError: If ``ids`` is given with a different length than
                ``documents``.
        """
        import uuid

        if ids is None:
            ids = [str(uuid.uuid4()) for _ in documents]
        elif len(ids) != len(documents):
            raise ValueError("ids and documents must have the same length")

        self.collection.add(
            ids=ids,
            documents=documents,
            embeddings=embeddings,
            metadatas=metadata
        )
        return ids

    def search(self, query_embedding, limit=5):
        """Return Chroma's query result for the ``limit`` nearest records.

        Args:
            query_embedding: A single query vector.
            limit: Maximum number of results to return.
        """
        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=limit
        )
        return results
Enter fullscreen mode Exit fullscreen mode

4.2 Qdrant

from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Filter, FieldCondition, MatchValue

class QdrantVectorDB:
    """Wrapper around a Qdrant client bound to one fixed collection."""

    def __init__(self, host="localhost", port=6333):
        """Connect to the Qdrant server at ``host``:``port``."""
        self.collection_name = "rag_collection"
        self.client = QdrantClient(host=host, port=port)

    def create_collection(self, vector_size=384):
        """Create the collection with cosine distance.

        Args:
            vector_size: Dimensionality of the stored vectors.
        """
        vector_params = VectorParams(size=vector_size, distance="Cosine")
        self.client.create_collection(
            collection_name=self.collection_name,
            vectors_config=vector_params,
        )

    def search(self, query_vector, limit=5):
        """Return the client's search result for ``query_vector``.

        Args:
            query_vector: The vector to search with.
            limit: Maximum number of hits to request.
        """
        return self.client.search(
            collection_name=self.collection_name,
            query_vector=query_vector,
            limit=limit,
        )
Enter fullscreen mode Exit fullscreen mode

4.3 pgvector (PostgreSQL 확장)

import psycopg2
from psycopg2.extras import Json

class PGVectorDB:
    """Vector store backed by a PostgreSQL table with a pgvector column.

    NOTE(review): assumes the ``vector`` extension is already installed in
    the target database - confirm, or add a CREATE EXTENSION step.
    """

    def __init__(self, connection_string):
        """Connect with ``connection_string`` and ensure the table exists."""
        self.conn = psycopg2.connect(connection_string)
        self.create_table()

    @staticmethod
    def _to_vector_literal(embedding):
        """Format ``embedding`` as a pgvector text literal such as ``[1.0,2.0]``.

        A string is returned unchanged so callers may pass a ready literal.
        """
        if isinstance(embedding, str):
            return embedding
        return "[" + ",".join(repr(float(value)) for value in embedding) + "]"

    def create_table(self):
        """Create the ``rag_documents`` table and its cosine index if missing."""
        # The connection context manager commits on success and rolls back
        # on error, so a failed DDL statement does not leave the session in
        # an aborted transaction.
        with self.conn, self.conn.cursor() as cur:
            cur.execute("""
                CREATE TABLE IF NOT EXISTS rag_documents (
                    id UUID PRIMARY KEY,
                    content TEXT,
                    embedding VECTOR(384),
                    metadata JSONB
                )
            """)
            cur.execute("CREATE INDEX IF NOT EXISTS idx_embedding ON rag_documents USING ivfflat (embedding vector_cosine_ops)")

    def search(self, query_embedding, limit=5):
        """Return the ``limit`` rows closest to ``query_embedding``.

        Args:
            query_embedding: Sequence of numbers (or a pgvector literal
                string) with the same dimension as the stored vectors.
            limit: Maximum number of rows to return.

        Returns:
            A list of ``(content, metadata, distance)`` tuples ordered by
            ascending cosine distance.
        """
        # psycopg2 would adapt a Python list to a SQL array, which has no
        # distance operator against `vector`; send a text literal and cast.
        vector_literal = self._to_vector_literal(query_embedding)
        with self.conn.cursor() as cur:
            # `<=>` is cosine distance, matching the vector_cosine_ops index.
            cur.execute("""
                SELECT content, metadata, embedding <=> %s::vector AS distance
                FROM rag_documents
                ORDER BY distance ASC
                LIMIT %s
            """, (vector_literal, limit))
            return cur.fetchall()
Enter fullscreen mode Exit fullscreen mode

5. 완전한 RAG 파이프라인 코드


python
import numpy as np
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
import json

class RAGPipeline:
    """Wire together an embedding model, a Qdrant client and a chunker.

    NOTE(review): the snippet is truncated in the source; only ``__init__``
    and the first line of ``index_document`` are present.
    """

    def __init__(self, embedding_model="all-MiniLM-L6-v2"):
        # 1. Initialize the embedding model
        self.embedding_model = SentenceTransformer(embedding_model)

        # 2. Initialize the vector DB
        self.vector_db = QdrantClient(host="localhost", port=6333)
        self.collection_name = "rag_documents"

        # 3. Document chunking strategy
        self.chunker = AgenticChunker()

    def index_document(self, document_id, content, metadata=None):
        """Index a document."""
        # 1. Chunking
        # NOTE(review): the body stops here. `chunks` is never assigned, so
        # this bare name raises NameError when the method is called.
        chunks

---

📥 **Get the full guide on Gumroad**: https://gumroad.com/l/auto ($7)
Enter fullscreen mode Exit fullscreen mode

Top comments (0)