DEV Community

matias yoon
matias yoon

Posted on

RAG 시스템 실전 구축 (v30)

RAG 시스템 실전 구축 (v30)

1. RAG 시스템 기본 구조

RAG(Retrieval-Augmented Generation, 검색 증강 생성)는 검색과 생성을 결합한 기법으로, 대규모 언어 모델(LLM)이 외부 지식을 활용해 더 정확하고 최신의 정보를 담은 응답을 생성할 수 있게 해줍니다.

핵심 루프 구성

# RAG 기본 루프 구현
def rag_pipeline(query, vector_db, llm):
    """Run one retrieve -> augment -> generate pass for *query*.

    Args:
        query: The user question; passed to the vector store and embedded
            in the prompt.
        vector_db: Object exposing ``search(query, k=...)``.
        llm: Object exposing ``generate(prompt)``.

    Returns:
        Whatever ``llm.generate`` returns for the augmented prompt.
    """
    # Retrieval: fetch the five best matches for the query.
    hits = vector_db.search(query, k=5)

    # Augmentation: prepend the formatted hits to the question.
    # NOTE(review): format_context is not defined in the visible source.
    prompt = f"Context: {format_context(hits)}\n\nQuestion: {query}"

    # Generation: hand the augmented prompt to the language model.
    return llm.generate(prompt)
Enter fullscreen mode Exit fullscreen mode

2. 청킹 전략 비교

2.1 의미적 청킹 (Semantic Chunking)

from sentence_transformers import SentenceTransformer
import numpy as np

def semantic_chunking(text, model, threshold=0.7):
    """Group consecutive sentences of *text* into semantically similar chunks.

    The text is split on ``'. '`` and every sentence is embedded with
    ``model.encode``. Each sentence is compared (cosine similarity) with the
    embedding of the *first* sentence of the chunk currently being built: if
    the similarity is above ``threshold`` it joins that chunk, otherwise it
    starts a new chunk and becomes the new comparison anchor.

    Args:
        text: Input string to split.
        model: Object exposing ``encode(list_of_sentences)`` that returns one
            vector per sentence.
        threshold: Minimum cosine similarity (exclusive) for a sentence to
            stay in the current chunk.

    Returns:
        A list of chunk strings; sentences within a chunk are joined with a
        single space. Returns ``[]`` when *text* has no non-blank sentence.
    """
    # Blank fragments carry no meaning and would only produce empty chunks.
    sentences = [s for s in text.split('. ') if s.strip()]
    if not sentences:
        return []

    embeddings = model.encode(sentences)

    def _cosine(vec_a, vec_b):
        """Return the cosine similarity of two 1-D vectors (0.0 if degenerate)."""
        vec_a = np.asarray(vec_a, dtype=float)
        vec_b = np.asarray(vec_b, dtype=float)
        denom = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
        # A zero-norm vector has no direction; treat it as dissimilar
        # instead of dividing by zero.
        return float(np.dot(vec_a, vec_b) / denom) if denom else 0.0

    chunks = []
    current_chunk = [sentences[0]]
    anchor_embedding = embeddings[0]

    for sentence, embedding in zip(sentences[1:], embeddings[1:]):
        if _cosine(anchor_embedding, embedding) > threshold:
            current_chunk.append(sentence)
        else:
            chunks.append(' '.join(current_chunk))
            current_chunk = [sentence]
            anchor_embedding = embedding

    chunks.append(' '.join(current_chunk))
    return chunks
Enter fullscreen mode Exit fullscreen mode

2.2 재귀적 청킹 (Recursive Chunking)

def recursive_chunking(text, chunk_size=500, overlap=50):
    """Split *text* into fixed-size character windows that overlap.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.
    The final chunk may be shorter than ``chunk_size``.

    Args:
        text: String to split.
        chunk_size: Maximum number of characters per chunk; must be > 0.
        overlap: Number of characters shared between neighbouring chunks;
            must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        List of chunk strings; ``[]`` for empty *text*.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range (such
            values would prevent the window from advancing).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(text), step):
        chunks.append(text[start:start + chunk_size])
        # Stop once a window reaches the end of the text; stepping back by
        # `overlap` from there would re-emit the tail forever.
        if start + chunk_size >= len(text):
            break
    return chunks
Enter fullscreen mode Exit fullscreen mode

2.3 에이전트 기반 청킹

class AgentBasedChunker:
    """Chunker that splits a document along its section structure.

    NOTE(review): ``identify_sections`` and ``split_section`` are called on
    ``self`` but are not defined in the visible class body; they must be
    supplied elsewhere.
    """

    def __init__(self, model):
        """Store *model* on the instance."""
        self.model = model

    def chunk_with_structure(self, text):
        """Return the chunks of *text*, one or more per detected section.

        Each section is a mapping with a ``'content'`` entry. Content longer
        than 1000 characters is passed to ``split_section`` and the pieces
        it returns are added individually; shorter content becomes a single
        chunk as-is.
        """
        pieces = []
        for part in self.identify_sections(text):
            body = part['content']
            if len(body) <= 1000:
                pieces.append(body)
                continue
            # Oversized section: break it down further.
            pieces.extend(self.split_section(body))
        return pieces
Enter fullscreen mode Exit fullscreen mode

3. 임베딩 모델 선택 및 비교

3.1 모델 선택 기준

import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel

class EmbeddingBenchmark:
    """Compare several sentence-embedding models on the same inputs.

    NOTE(review): ``calculate_similarities``, ``get_model_size`` and
    ``benchmark_inference`` are called on ``self`` but are not defined in the
    visible class body.
    """

    def __init__(self):
        """Instantiate every candidate model, keyed by a display label."""
        checkpoints = {
            'all-MiniLM-L6-v2': 'all-MiniLM-L6-v2',
            'all-mpnet-base-v2': 'all-mpnet-base-v2',
            'BAAI/bge-small-en': 'BAAI/bge-small-en',
            'sentence-bert': 'paraphrase-multilingual-MiniLM-L12-v2',
        }
        self.models = {
            label: SentenceTransformer(checkpoint)
            for label, checkpoint in checkpoints.items()
        }

    def evaluate_models(self, texts, reference_embeddings):
        """Score each model against *reference_embeddings*.

        Args:
            texts: Inputs handed to each model's ``encode``.
            reference_embeddings: Forwarded to ``calculate_similarities``
                together with the freshly computed embeddings.

        Returns:
            Dict mapping model label to a dict with ``avg_similarity`` (mean
            of the similarity values), ``model_size`` and ``inference_time``.
        """
        report = {}
        for label, encoder in self.models.items():
            vectors = encoder.encode(texts)
            scores = self.calculate_similarities(vectors, reference_embeddings)
            mean_score = np.mean(scores)
            report[label] = {
                'avg_similarity': mean_score,
                'model_size': self.get_model_size(encoder),
                'inference_time': self.benchmark_inference(encoder, texts),
            }
        return report

# Example: run the model comparison.
# NOTE(review): sample_texts and reference_embeddings are not defined in the
# visible source; they must exist before these lines execute.
benchmark = EmbeddingBenchmark()
results = benchmark.evaluate_models(sample_texts, reference_embeddings)
Enter fullscreen mode Exit fullscreen mode

3.2 최적화된 임베딩 생성

class OptimizedEmbedder:
    """Wrap a SentenceTransformer and encode texts in fixed-size batches."""

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """Load *model_name* and set its ``max_seq_length`` to 512."""
        self.model = SentenceTransformer(model_name)
        self.model.max_seq_length = 512

    def embed_batch(self, texts, batch_size=32):
        """Encode *texts* batch by batch and return one stacked tensor.

        Args:
            texts: Sequence of strings to embed.
            batch_size: Number of texts sent to ``encode`` per call; must
                be >= 1.

        Returns:
            A tensor with one row per input text, in input order. For empty
            *texts* an empty ``(0, dim)`` tensor is returned, where ``dim``
            is the model's reported embedding dimension.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be >= 1")

        if len(texts) == 0:
            # torch.cat() rejects an empty list, so build the empty result
            # explicitly with the right width.
            dim = self.model.get_sentence_embedding_dimension() or 0
            return torch.empty((0, dim), device=getattr(self.model, "device", None))

        embeddings = [
            self.model.encode(
                texts[i:i + batch_size],
                show_progress_bar=False,
                convert_to_tensor=True,
            )
            for i in range(0, len(texts), batch_size)
        ]
        return torch.cat(embeddings, dim=0)
Enter fullscreen mode Exit fullscreen mode

4. 벡터 데이터베이스 비교

4.1 Chroma vs Qdrant vs pgvector vs Milvus

# Chroma 구현
import chromadb
from chromadb import Client

class ChromaVectorDB:
    """Thin wrapper around a single Chroma collection."""

    def __init__(self, collection_name="rag_collection"):
        """Create a client and fetch (or create) the named collection.

        The collection is requested with ``{"hnsw:space": "cosine"}`` as its
        metadata.
        """
        client = Client()
        self.client = client
        self.collection = client.get_or_create_collection(
            name=collection_name,
            metadata={"hnsw:space": "cosine"},
        )

    def add_documents(self, documents, embeddings, ids):
        """Add documents, their embeddings and ids to the collection."""
        payload = {
            'documents': documents,
            'embeddings': embeddings,
            'ids': ids,
        }
        self.collection.add(**payload)

    def search(self, query_embedding, n_results=5):
        """Return the documents matched for a single query embedding.

        The query is submitted as a one-element batch, and the first entry
        of the ``'documents'`` field of the response is returned.
        """
        response = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=n_results,
        )
        per_query_documents = response['documents']
        return per_query_documents[0]

# Qdrant 구현
from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition, MatchValue

class QdrantVectorDB:
    """Thin wrapper around a Qdrant client connection."""

    def __init__(self, host="localhost", port=6333):
        """Connect to the Qdrant server at *host*:*port*."""
        self.client = QdrantClient(host=host, port=port)

    def create_collection(self, collection_name, vector_size):
        """(Re)create *collection_name* for vectors of length *vector_size*.

        Uses ``recreate_collection`` with ``"Cosine"`` as the distance.
        """
        vector_settings = {"size": vector_size, "distance": "Cosine"}
        self.client.recreate_collection(
            collection_name=collection_name,
            vectors_config=vector_settings,
        )

    def search(self, query_vector, collection_name, limit=5):
        """Return the payload of each hit for *query_vector*.

        Args:
            query_vector: Vector to search with.
            collection_name: Collection to query.
            limit: Maximum number of hits requested.
        """
        hits = self.client.search(
            collection_name=collection_name,
            query_vector=query_vector,
            limit=limit,
        )
        payloads = []
        for hit in hits:
            payloads.append(hit.payload)
        return payloads

# pgvector 구현
import psycopg2
from psycopg2.extras import execute_values

class PGVectorDB:
    """Nearest-neighbour search over a PostgreSQL table using pgvector."""

    def __init__(self, connection_string):
        """Open a psycopg2 connection using *connection_string*."""
        self.conn = psycopg2.connect(connection_string)

    def search(self, query_vector, limit=5):
        """Return the *limit* rows of ``documents`` closest to *query_vector*.

        The distance is computed in the query itself with pgvector's ``<->``
        operator and returned alongside the content, rather than read from a
        stored ``distance`` column (a per-query distance cannot be stored in
        the table).

        Args:
            query_vector: Query embedding. It is cast to ``vector`` in SQL.
                NOTE(review): assumes the value is something PostgreSQL can
                cast to ``vector`` (e.g. a list of floats or a
                ``'[1,2,3]'`` string) — confirm against callers.
            limit: Maximum number of rows to return.

        Returns:
            List of ``(content, distance)`` tuples ordered by ascending
            distance.
        """
        # The ORDER BY repeats the expression (instead of using the alias)
        # so it stays in the same form as the original ordering clause.
        sql = """
            SELECT content, embedding <-> %(vec)s::vector AS distance
            FROM documents
            ORDER BY embedding <-> %(vec)s::vector
            LIMIT %(limit)s
        """
        with self.conn.cursor() as cursor:
            cursor.execute(sql, {"vec": query_vector, "limit": limit})
            return cursor.fetchall()
Enter fullscreen mode Exit fullscreen mode

4.2 성능 비교 테스트

import time
import numpy as np

def benchmark_vector_dbs(documents, query_embeddings, k=5):
    """Time up to 100 searches against Chroma and Qdrant.

    Args:
        documents: Accepted for interface compatibility; not used by this
            function (nothing is indexed here).
        query_embeddings: Sequence of query vectors; only the first 100 are
            used.
        k: Number of results requested per query.

    Returns:
        Dict with the keys ``'chroma'`` and ``'qdrant'`` mapping to the
        elapsed seconds for running all queries against that backend.
    """
    queries = query_embeddings[:100]

    def _elapsed(run_query):
        """Return the seconds spent calling *run_query* on every query."""
        # perf_counter is monotonic and high-resolution, unlike time.time(),
        # which can jump when the system clock is adjusted.
        started = time.perf_counter()
        for query in queries:
            run_query(query)
        return time.perf_counter() - started

    results = {}

    # Client construction is kept outside the timed region.
    chroma_db = ChromaVectorDB()
    results['chroma'] = _elapsed(lambda q: chroma_db.search(q, k))

    qdrant_db = QdrantVectorDB()
    results['qdrant'] = _elapsed(
        lambda q: qdrant_db.search(q, "rag_collection", k)
    )

    return results
Enter fullscreen mode Exit fullscreen mode

5. 전체 RAG 파이프라인 구현


python
import asyncio
from typing import List, Dict, Any
from sentence_transformers import SentenceTransformer
import numpy as np

class FullRAGPipeline:
    def __init__(self, 
                 embedding_model_name='all-MiniLM-L6-v2',
                 vector_db_type='chroma',
                 llm_model='gpt-3.5-turbo'):

        # 임베딩 모델
        self.embedder = SentenceTransformer(embedding_model_name)

        # 벡터 DB
        if vector_db_type == 'chroma':
            self.vector_db = ChromaVectorDB()
        elif vector_db_type == 'qdrant

---

📥 **Get the full guide on Gumroad**: https://gumroad.com/l/auto ($7)
Enter fullscreen mode Exit fullscreen mode

Top comments (0)