DEV Community

matias yoon
matias yoon

Posted on

RAG 시스템 실전 구축 (v39)

RAG 시스템 실전 구축 (v39)

실제 프로덕션(운영) 환경에서 RAG 시스템을 구축하고 운영하기 위한 실전 가이드

1. RAG 기본 개념: 검색 → 증강 → 생성 루프

RAG(Retrieval-Augmented Generation)는 검색으로 찾은 관련 문서를 프롬프트에 추가(증강)하여 LLM이 더 정확한 응답을 생성하도록 하는 아키텍처입니다.

사용자 질문 → 검색 엔진 → 관련 문서 → 프롬프트 증강 → LLM → 생성된 응답
Enter fullscreen mode Exit fullscreen mode

기본적인 RAG 루프 구조는 다음과 같습니다:

class BasicRAGPipeline:
    """Minimal retrieve-then-generate pipeline.

    The pipeline embeds the query, asks the vector store for the closest
    documents, and hands both the query and the retrieved context to an LLM.

    Args:
        embedding_model: Object exposing ``encode(list_of_texts)`` that
            returns one embedding per input text.
        vector_db: Object exposing ``search(query_embedding, k=...)``.
        llm: Object exposing ``generate(prompt)``. Optional so that existing
            two-argument callers keep working; it may also be assigned to
            ``self.llm`` after construction.
    """

    def __init__(self, embedding_model, vector_db, llm=None):
        self.embedding_model = embedding_model
        self.vector_db = vector_db
        # Previously never assigned, which made generate() fail with an
        # AttributeError on every call.
        self.llm = llm

    def retrieve(self, query, k=5):
        """Return the ``k`` nearest results for ``query`` from the vector store."""
        # encode() takes a batch; we send a batch of one and unwrap it.
        query_embedding = self.embedding_model.encode([query])[0]
        return self.vector_db.search(query_embedding, k=k)

    def generate(self, query, context):
        """Build the prompt from ``query`` and ``context`` and call the LLM.

        Raises:
            RuntimeError: If no LLM has been configured.
        """
        if self.llm is None:
            raise RuntimeError(
                "BasicRAGPipeline has no LLM configured; pass llm=... to the "
                "constructor or assign pipeline.llm before calling generate()."
            )
        # A list of retrieved documents is joined line by line so the prompt
        # does not contain the Python repr of a list.
        if isinstance(context, (list, tuple)):
            context = "\n".join(map(str, context))
        prompt = f"질문: {query}\n문맥: {context}\n답변:"
        return self.llm.generate(prompt)

    def process(self, query):
        """Run retrieval followed by generation for ``query``."""
        context = self.retrieve(query)
        return self.generate(query, context)
Enter fullscreen mode Exit fullscreen mode

2. 청킹 전략: 의미, 재귀, 에이전트 기반

청킹 전략은 문서를 적절한 크기로 분할하여 임베딩 및 검색에 최적화합니다:

import tiktoken
from typing import List

class ChunkingStrategies:
    """Static helpers that split documents into chunks for embedding."""

    @staticmethod
    def semantic_chunking(documents: List[str], chunk_size: int = 500) -> List[str]:
        """Pack whole sentences into chunks of at most ``chunk_size`` characters.

        Sentences are detected by splitting on ``'. '``. Consecutive sentences
        of one document are joined with a single space until adding the next
        one would exceed ``chunk_size``. A single sentence longer than
        ``chunk_size`` is emitted as its own (oversized) chunk. Chunks never
        span two documents.

        Args:
            documents: Texts to split.
            chunk_size: Maximum chunk length in characters.

        Returns:
            The chunks of all documents, in order. Empty or whitespace-only
            documents contribute nothing.
        """
        chunks = []
        for doc in documents:
            pieces = doc.split('. ')
            last_index = len(pieces) - 1
            current = []      # sentences of the chunk being built
            current_len = 0   # len(" ".join(current))

            for index, piece in enumerate(pieces):
                sentence = piece.strip()
                if not sentence:
                    continue
                # split('. ') removed the period of every piece except the
                # last one; restore it only there so the final sentence does
                # not end up with a doubled (or invented) period.
                if index != last_index:
                    sentence += "."

                added_len = len(sentence) + (1 if current else 0)
                if current and current_len + added_len > chunk_size:
                    chunks.append(" ".join(current))
                    current = []
                    current_len = 0
                    added_len = len(sentence)
                current.append(sentence)
                current_len += added_len

            if current:
                chunks.append(" ".join(current))
        return chunks

    @staticmethod
    def recursive_chunking(documents: List[str], chunk_size: int = 500) -> List[str]:
        """Split each document into consecutive windows of ``chunk_size`` tokens.

        Despite the name, the split is a flat fixed-size window over the
        ``cl100k_base`` token sequence, with no overlap between windows.

        Args:
            documents: Texts to split.
            chunk_size: Number of tokens per chunk; must be positive.

        Returns:
            The decoded text of every token window, in order.

        Raises:
            ValueError: If ``chunk_size`` is not positive.
        """
        # A negative step would make range() yield nothing and silently drop
        # every document, so reject bad sizes up front.
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")

        tokenizer = tiktoken.get_encoding("cl100k_base")
        chunks = []

        for doc in documents:
            tokens = tokenizer.encode(doc)
            # NOTE(review): a window boundary can fall inside a multi-byte
            # character (likely for Korean text); confirm how decode() renders
            # such a partial character before relying on exact chunk text.
            chunks.extend(
                tokenizer.decode(tokens[start:start + chunk_size])
                for start in range(0, len(tokens), chunk_size)
            )
        return chunks

# Usage example: run both chunking strategies over the same sample documents.
strategies = ChunkingStrategies()
docs = ["문서 내용..."]
semantic_chunks = strategies.semantic_chunking(docs)
recursive_chunks = strategies.recursive_chunking(docs)
Enter fullscreen mode Exit fullscreen mode

3. 임베딩 모델 선택과 비교

임베딩 품질은 RAG 성능에 직접적인 영향을 미칩니다:

from sentence_transformers import SentenceTransformer
import numpy as np

class EmbeddingModelComparison:
    """Load several sentence-embedding models and compare them side by side.

    All models are loaded eagerly in the constructor and stored in
    ``self.models`` keyed by a short display name.
    """

    def __init__(self):
        # The dictionary keys are kept as-is for existing callers; only the
        # checkpoint identifiers were corrected to published model names.
        self.models = {
            'all-MiniLM-L6-v2': SentenceTransformer('all-MiniLM-L6-v2'),
            'paraphrase-multilingual-MiniLM-v2': SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2'),
            'gte-small': SentenceTransformer('thenlper/gte-small'),
        }

    def compare_embeddings(self, texts: List[str], model_name: str) -> np.ndarray:
        """Encode ``texts`` with the model registered under ``model_name``.

        Raises:
            KeyError: If ``model_name`` is not a key of ``self.models``.
        """
        model = self.models[model_name]
        return model.encode(texts)

    def benchmark_models(self, test_queries: List[str], test_docs: List[str]) -> dict:
        """Time each model while encoding the queries and documents together.

        Args:
            test_queries: Query texts.
            test_docs: Document texts.

        Returns:
            A dict keyed by model name. Each value holds ``embedding_time``
            (wall-clock seconds spent in ``encode``), ``dimension`` (second
            axis of the returned embeddings) and ``num_texts`` (number of
            embeddings returned).
        """
        import time  # local import: the snippet's import header has no `time`

        # Build the batch once instead of once per model.
        texts = list(test_queries) + list(test_docs)

        results = {}
        for name, model in self.models.items():
            started = time.perf_counter()
            embeddings = model.encode(texts)
            elapsed = time.perf_counter() - started
            results[name] = {
                # Previously this stored len(embeddings), not a duration.
                'embedding_time': elapsed,
                'dimension': embeddings.shape[1],
                'num_texts': len(embeddings),
            }
        return results

# Run the comparison: loads every registered model, then benchmarks them on
# two sample queries and two sample documents.
comparator = EmbeddingModelComparison()
benchmark_results = comparator.benchmark_models(["쿼리1", "쿼리2"], ["문서1", "문서2"])
Enter fullscreen mode Exit fullscreen mode

4. 벡터 데이터베이스 비교: Chroma vs Qdrant vs pgvector vs Milvus

| 특성 | Chroma | Qdrant | pgvector | Milvus |
| --- | --- | --- | --- | --- |
| 설치 간편성 | ★★★★★ | ★★★★☆ | ★★★★☆ | ★★★☆☆ |
| 성능 | ★★★★☆ | ★★★★★ | ★★★★☆ | ★★★★★ |
| 비용 | ★★★★☆ | ★★★★☆ | ★★★★★ | ★★★★☆ |
| 확장성 | ★★★☆☆ | ★★★★☆ | ★★★★☆ | ★★★★★ |

# Chroma 벡터 DB 구현
from chromadb import Client
from chromadb.config import Settings

class ChromaVectorDB:
    """Small wrapper around one Chroma collection.

    Args:
        collection_name: Name of the collection to open or create.
    """

    def __init__(self, collection_name="rag_collection"):
        # NOTE(review): `chroma_db_impl="duckdb"` is the legacy settings style;
        # newer chromadb releases appear to expect a persistent client
        # instead. Confirm against the installed chromadb version.
        self.client = Client(Settings(chroma_db_impl="duckdb", persist_directory="./chroma_db"))
        self.collection = self.client.get_or_create_collection(collection_name)

    def add_documents(self, documents: List[str], embeddings: List[List[float]]):
        """Store ``documents`` together with their precomputed ``embeddings``.

        IDs continue from the current collection size, so repeated calls do
        not reuse the IDs of documents added earlier.

        Raises:
            ValueError: If ``documents`` and ``embeddings`` differ in length.
        """
        if len(documents) != len(embeddings):
            raise ValueError(
                f"documents ({len(documents)}) and embeddings "
                f"({len(embeddings)}) must have the same length"
            )
        if not documents:
            return

        # Previously IDs always restarted at "0", so a second call collided
        # with the first batch.
        first_id = self.collection.count()
        ids = [str(first_id + offset) for offset in range(len(documents))]
        self.collection.add(
            ids=ids,
            embeddings=embeddings,
            documents=documents
        )

    def search(self, query_embedding: List[float], k: int = 5):
        """Return up to ``k`` stored documents nearest to ``query_embedding``.

        Returns an empty list when the collection is empty or ``k`` is not
        positive.
        """
        # Never ask for more results than the collection holds.
        limit = min(k, self.collection.count())
        if limit <= 0:
            return []

        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=limit
        )
        # query() is batched; we sent one query, so take the first result row.
        return results['documents'][0]

# Qdrant 구현
from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition, MatchValue

class QdrantVectorDB:
    """Wrapper around a Qdrant client bound to the ``rag_collection`` collection.

    Args:
        host: Host name passed to ``QdrantClient``.
        port: Port passed to ``QdrantClient``.
    """

    def __init__(self, host="localhost", port=6333):
        self.client = QdrantClient(host=host, port=port)
        self.collection_name = "rag_collection"

    def create_collection(self, vector_size: int):
        """Call the client's ``recreate_collection`` for this collection.

        The vector configuration uses ``vector_size`` dimensions and the
        ``"Cosine"`` distance.
        """
        # NOTE(review): the method name suggests an existing collection is
        # replaced; confirm against the qdrant-client documentation.
        vector_settings = {"size": vector_size, "distance": "Cosine"}
        self.client.recreate_collection(
            collection_name=self.collection_name,
            vectors_config=vector_settings,
        )

    def search(self, query_embedding: List[float], k: int = 5):
        """Return the ``text`` payload of up to ``k`` hits for ``query_embedding``."""
        hits = self.client.search(
            collection_name=self.collection_name,
            query_vector=query_embedding,
            limit=k,
        )
        texts = []
        for hit in hits:
            # Each stored point is expected to carry its text under 'text'.
            texts.append(hit.payload['text'])
        return texts
Enter fullscreen mode Exit fullscreen mode

5. 전체 RAG 파이프라인 코드


python
import numpy as np
from sentence_transformers import SentenceTransformer
from chromadb import Client
from chromadb.config import Settings

class RAGPipeline:
    """End-to-end RAG pipeline: embed, store, retrieve, and generate.

    Args:
        model_name: Sentence-Transformers model used for both document and
            query embeddings.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        # 1. Embedding model
        self.embedding_model = SentenceTransformer(model_name)

        # 2. Vector store
        # NOTE(review): `chroma_db_impl="duckdb"` is the legacy settings style;
        # newer chromadb releases appear to expect a persistent client
        # instead. Confirm against the installed chromadb version.
        self.client = Client(Settings(chroma_db_impl="duckdb", persist_directory="./rag_db"))
        self.collection = self.client.get_or_create_collection("documents")

        # 3. LLM (example: Hugging Face text-generation pipeline)
        from transformers import pipeline
        # max_new_tokens bounds only the generated continuation. max_length
        # also counted the prompt, so a prompt carrying retrieved context
        # could use up the whole budget before any answer was produced.
        self.llm = pipeline("text-generation",
                            model="gpt2",
                            max_new_tokens=200)

    def add_documents(self, documents: List[str]):
        """Embed ``documents`` and add them to the collection.

        IDs continue from the current collection size, so repeated calls do
        not reuse the IDs of documents added earlier.

        Returns:
            A status string of the form ``"Added N documents"``.
        """
        if not documents:
            return f"Added {len(documents)} documents"

        embeddings = self.embedding_model.encode(documents)
        # Previously IDs always restarted at "0", so a second call collided
        # with the first batch.
        first_id = self.collection.count()
        ids = [str(first_id + offset) for offset in range(len(documents))]

        self.collection.add(
            ids=ids,
            embeddings=embeddings.tolist(),
            documents=documents
        )
        return f"Added {len(documents)} documents"

    def retrieve(self, query: str, k: int = 5) -> List[str]:
        """Return up to ``k`` stored documents nearest to ``query``.

        Returns an empty list when the collection is empty or ``k`` is not
        positive.
        """
        # Never ask for more results than the collection holds.
        limit = min(k, self.collection.count())
        if limit <= 0:
            return []

        query_embedding = self.embedding_model.encode([query])[0]
        # Pass a plain list, matching what add_documents() stores.
        if hasattr(query_embedding, "tolist"):
            query_embedding = query_embedding.tolist()

        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=limit
        )
        # query() is batched; we sent one query, so take the first result row.
        return results['documents'][0]

    def generate_response(self, query: str, context: List[str]) -> str:
        """Build the prompt from ``query`` and ``context`` and run the LLM.

        Returns:
            The generated continuation (prompt excluded), stripped of
            surrounding whitespace.
        """
        context_text = "\n".join(context)
        prompt = f"""
질문: {query}
문맥: {context_text}
답변:"""

        response = self.llm(prompt, return_full_text=False)[0]['generated_text']
        return response.strip()

    def process_query(self, query: str) -> str:
        """Run retrieval followed by generation for ``query``."""
        context = self.retrieve(query)
        return self.generate_response(query, context)

# Usage example: build the pipeline and index a few sample documents.
pipeline = RAGPipeline()
documents = [
    "Python은 고급 프로그래밍 언어입니다. 객체 지향 프로그래밍을 지원합니다.",
    "JavaScript는 웹 브라우저에서 실행되는 스크립트 언어입니다.",
    "Machine Learning은 데이터를 기반으로 학습하는 알고리즘입니다."
]

# The original call was cut off mid-expression; it is completed here with the
# list defined above.
pipeline.add_documents(documents)

---

📥 **Get the full guide on Gumroad**: https://gumroad.com/l/auto ($7)
Enter fullscreen mode Exit fullscreen mode

Top comments (0)