DEV Community

matias yoon
matias yoon

Posted on

RAG 시스템 실전 구축 (v33)

RAG 시스템 실전 구축 (v33)

1. RAG 기초 개념: 검색 → 보완 → 생성 루프

RAG (Retrieval-Augmented Generation)는 대규모 언어 모델(LLM)이 활용할 수 있는 지식의 범위를 확장하는 핵심 아키텍처입니다. RAG는 세 가지 주요 단계로 구성됩니다:

  1. 검색 (Retrieval): 사용자 질문과 관련된 문서 조각들을 벡터 데이터베이스에서 찾습니다.
  2. 보완 (Augmentation): 검색된 문서를 프롬프트에 통합하여 LLM이 더 정확한 답변을 생성할 수 있도록 합니다.
  3. 생성 (Generation): LLM이 보완된 프롬프트를 기반으로 최종 답변을 생성합니다.

이 루프는 다음과 같은 순서로 실행됩니다:

# Minimal implementation of the RAG loop
class SimpleRAG:
    """Retrieve-augment-generate loop built from three injected components.

    The class only wires the components together:

    * ``embedding_model`` must expose ``encode(query)``.
    * ``vector_db`` must expose ``search(query_vector, k=...)`` and return an
      iterable of mappings that each carry a ``'content'`` entry.
    * ``llm`` must expose ``generate(prompt)``.
    """

    # Number of documents requested from the vector store per query.
    _TOP_K = 5

    def __init__(self, embedding_model, vector_db, llm):
        self.embedding_model = embedding_model
        self.vector_db = vector_db
        self.llm = llm

    def _retrieve(self, query):
        """Embed *query* and return the nearest stored documents."""
        return self.vector_db.search(self.embedding_model.encode(query), k=self._TOP_K)

    @staticmethod
    def _build_prompt(query, docs):
        """Put the question and the newline-joined document contents in one prompt."""
        context = "\n".join(doc['content'] for doc in docs)
        return f"질문: {query}\n문맥: {context}\n답변:"

    def generate(self, query):
        """Answer *query* with retrieved context.

        Runs the three stages in order — retrieval, augmentation, generation —
        and returns whatever ``llm.generate`` returns for the final prompt.
        """
        docs = self._retrieve(query)               # 1. retrieval
        prompt = self._build_prompt(query, docs)   # 2. augmentation
        return self.llm.generate(prompt)           # 3. generation
Enter fullscreen mode Exit fullscreen mode

2. 청킹 전략: 의미론적, 재귀적, 에이전트 기반

청킹은 문서를 의미 단위로 분할하는 과정입니다. 각 전략은 서로 다른 장단점을 가집니다:

의미론적 청킹

from sentence_transformers import SentenceTransformer
import numpy as np

class SemanticChunker:
    """Pack sentences into chunks bounded by a minimum and maximum size.

    A SentenceTransformer model is loaded in ``__init__`` and kept on
    ``self.model``; ``chunk`` itself works purely on character lengths and
    does not consult the model.
    """

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)

    def chunk(self, text, min_chunk_size=200, max_chunk_size=800):
        """Split *text* into sentence-aligned chunks without losing any text.

        Sentences are delimited by ``'. '``. They are accumulated until adding
        the next one would push the buffer past ``max_chunk_size``; the buffer
        is then emitted and a new one started.

        Args:
            text: The text to split.
            min_chunk_size: A buffer shorter than this is never cut on its
                own; the next sentence is merged into it instead, even if the
                result exceeds ``max_chunk_size``. (Discarding the short
                buffer would silently lose content.)
            max_chunk_size: Soft upper bound, in characters, for a chunk.

        Returns:
            A list of stripped chunk strings. Empty or whitespace-only input
            yields an empty list. The trailing remainder is always emitted,
            even when shorter than ``min_chunk_size``.
        """
        if not text or not text.strip():
            return []

        sentences = text.split('. ')
        last_index = len(sentences) - 1
        chunks = []
        current_chunk = ""

        for index, sentence in enumerate(sentences):
            # Restore the delimiter that split() consumed. The final piece had
            # no delimiter after it, so nothing is appended there (this is
            # what used to produce a doubled period at the end).
            piece = sentence if index == last_index else sentence + ". "

            overflow = len(current_chunk) + len(sentence) > max_chunk_size
            if overflow and len(current_chunk) >= min_chunk_size:
                chunks.append(current_chunk.strip())
                current_chunk = piece
            else:
                # Either it still fits, or the buffer is too small to stand
                # alone: keep growing it rather than dropping it.
                current_chunk += piece

        if current_chunk.strip():
            chunks.append(current_chunk.strip())
        return chunks
Enter fullscreen mode Exit fullscreen mode

재귀적 청킹

import re

class RecursiveChunker:
    """Split text by paragraph, then by sentence for oversized paragraphs."""

    def __init__(self, chunk_size=512):
        # Target upper bound, in characters, for a chunk.
        self.chunk_size = chunk_size

    def chunk(self, text):
        """Return the chunks of *text*.

        Paragraphs are separated by a blank line (``'\\n\\n'``). A paragraph
        that fits in ``chunk_size`` is kept as-is; a longer one is split at
        sentence boundaries. Empty or whitespace-only paragraphs are skipped,
        so empty input yields an empty list.
        """
        chunks = []

        for para in text.split('\n\n'):
            if not para.strip():
                continue
            if len(para) > self.chunk_size:
                chunks.extend(self._split_long_paragraph(para))
            else:
                chunks.append(para)
        return chunks

    def _split_long_paragraph(self, para):
        """Pack the sentences of *para* into chunks of at most ``chunk_size``.

        Sentences are split on whitespace that follows ``.``, ``!`` or ``?``,
        so each sentence keeps its own terminating punctuation. Sentences in a
        chunk are joined with a single space. A single sentence longer than
        ``chunk_size`` is emitted as its own (oversized) chunk.
        """
        sentences = [s for s in re.split(r'(?<=[.!?])\s+', para.strip()) if s]
        chunks = []
        current = []
        current_len = 0

        for sentence in sentences:
            # Account for the joining space when the chunk is non-empty.
            added = len(sentence) + (1 if current else 0)
            if current and current_len + added > self.chunk_size:
                chunks.append(" ".join(current))
                current = [sentence]
                current_len = len(sentence)
            else:
                current.append(sentence)
                current_len += added

        if current:
            chunks.append(" ".join(current))
        return chunks
Enter fullscreen mode Exit fullscreen mode

3. 임베딩 모델 선택 및 비교

임베딩 품질은 RAG 성능에 직접적인 영향을 미칩니다. 다음은 주요 모델들 간 비교입니다:

Embedding 모델 비교 코드

from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
import torch

class EmbeddingBenchmark:
    """Load several sentence-embedding models and compare basic statistics."""

    def __init__(self):
        # All models are loaded eagerly when the benchmark is constructed.
        # The 'sentence-t5' key is kept for result compatibility, but the
        # model is loaded under its published id (a bare 'sentence-t5' id is
        # not a published checkpoint).
        self.models = {
            'all-MiniLM-L6-v2': SentenceTransformer('all-MiniLM-L6-v2'),
            'all-mpnet-base-v2': SentenceTransformer('all-mpnet-base-v2'),
            'BAAI/bge-small-en': SentenceTransformer('BAAI/bge-small-en'),
            'sentence-t5': SentenceTransformer('sentence-transformers/sentence-t5-base'),
        }

    def evaluate_models(self, texts):
        """Encode *texts* with every model and summarize the embeddings.

        Args:
            texts: A sequence of strings to embed.

        Returns:
            A dict mapping each model name to
            ``{'avg_norm': float, 'embedding_dim': int}``, where ``avg_norm``
            is the mean L2 norm of the embeddings and ``embedding_dim`` is the
            vector length. An empty ``texts`` returns an empty dict, since
            there is nothing to measure.
        """
        if len(texts) == 0:
            return {}

        results = {}
        for name, model in self.models.items():
            # One row per input text; norms are computed in a single
            # vectorized call instead of a per-vector Python loop.
            embeddings = np.asarray(model.encode(texts))
            results[name] = {
                'avg_norm': float(np.linalg.norm(embeddings, axis=1).mean()),
                'embedding_dim': int(embeddings.shape[1]),
            }
        return results

# Usage example: run the benchmark on three sample sentences and print the
# per-model summary returned by evaluate_models().
benchmark = EmbeddingBenchmark()
texts = [
    "Machine learning is a subset of artificial intelligence",
    "Deep learning models require large amounts of training data",
    "Natural language processing helps computers understand human language"
]

model_performance = benchmark.evaluate_models(texts)
print(model_performance)
Enter fullscreen mode Exit fullscreen mode

4. 벡터 데이터베이스 비교 (Chroma, Qdrant, pgvector, Milvus)

각 벡터 데이터베이스는 성능과 기능 측면에서 차이가 있습니다:

Chroma (가벼운 로컬 솔루션)

import chromadb
from chromadb import Client

# Initialize the Chroma client
client = Client()
# Module-level handle to the "rag_docs" collection used by the helpers below
collection = client.get_or_create_collection("rag_docs")

def chroma_add_documents(documents, embeddings):
    """Store documents and their embeddings in the module-level collection.

    Args:
        documents: Mappings that each provide ``'content'`` and ``'id'``.
        embeddings: Embeddings passed through unchanged to ``collection.add``;
            presumably one per document, in the same order.
    """
    texts = [entry['content'] for entry in documents]
    doc_ids = [entry['id'] for entry in documents]
    collection.add(documents=texts, embeddings=embeddings, ids=doc_ids)

def chroma_search(query_embedding, n_results=5):
    """Query the module-level collection with a single embedding.

    The embedding is wrapped in a one-element batch, so the first (and only)
    entry of the returned ``'documents'`` field is handed back.

    Args:
        query_embedding: Embedding of the query.
        n_results: Number of results to request.
    """
    response = collection.query(
        query_embeddings=[query_embedding],
        n_results=n_results,
    )
    documents_per_query = response['documents']
    return documents_per_query[0]
Enter fullscreen mode Exit fullscreen mode

Qdrant (고성능 분산 솔루션)

from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition, MatchValue

# Module-level Qdrant client (localhost:6333) used by the helpers below
client = QdrantClient(host="localhost", port=6333)

def qdrant_add_documents(documents, embeddings):
    """Upload documents and their vectors to the "rag_docs" collection.

    Each document's ``'content'`` is stored as the point payload and its
    ``'id'`` is used as the point id.

    Args:
        documents: Mappings that each provide ``'content'`` and ``'id'``.
        embeddings: Vectors passed through unchanged; presumably one per
            document, in the same order.
    """
    point_ids = [entry['id'] for entry in documents]
    payloads = [{"content": entry['content']} for entry in documents]
    client.upload_collection(
        collection_name="rag_docs",
        vectors=embeddings,
        ids=point_ids,
        payload=payloads,
    )

def qdrant_search(query_vector, n_results=5):
    """Search "rag_docs" and return the ``'content'`` payload of each hit.

    Args:
        query_vector: Vector to search with.
        n_results: Maximum number of hits to request.

    Returns:
        A list with the ``'content'`` value of every hit, in the order the
        client returned them.
    """
    hits = client.search(
        collection_name="rag_docs",
        query_vector=query_vector,
        limit=n_results,
    )
    contents = []
    for hit in hits:
        contents.append(hit.payload['content'])
    return contents
Enter fullscreen mode Exit fullscreen mode

5. 전체 RAG 파이프라인 코드


python
import numpy as np
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from transformers import pipeline
import os

class RAGPipeline:
    """End-to-end retrieval-augmented generation pipeline.

    Documents are embedded with a SentenceTransformer model, stored and
    searched in a Qdrant collection on ``localhost:6333``, and answers are
    produced by a Hugging Face ``text-generation`` pipeline running GPT-2.
    """

    def __init__(self, embedding_model_name='all-MiniLM-L6-v2',
                 collection_name="rag_docs"):
        """Load the models, connect to Qdrant and ensure the collection exists.

        Args:
            embedding_model_name: SentenceTransformer model to embed with.
            collection_name: Qdrant collection to read from and write to.
        """
        from qdrant_client.models import Distance, VectorParams

        self.collection_name = collection_name
        self.embedding_model = SentenceTransformer(embedding_model_name)
        self.vector_db = QdrantClient(host="localhost", port=6333)
        self.llm = pipeline("text-generation", model="gpt2")

        # Create the collection only when it is missing. Other failures
        # (e.g. the server being unreachable) now surface here instead of
        # being swallowed and reappearing later as confusing errors.
        if not self.vector_db.collection_exists(self.collection_name):
            self.vector_db.create_collection(
                collection_name=self.collection_name,
                vectors_config=VectorParams(
                    # Take the size from the model so that switching the
                    # embedding model does not produce a dimension mismatch.
                    size=self.embedding_model.get_sentence_embedding_dimension(),
                    distance=Distance.COSINE,
                ),
            )

    @staticmethod
    def _point_id(doc_id):
        """Map a document id to a value Qdrant accepts as a point id.

        Qdrant point ids must be unsigned integers or UUIDs. Non-negative
        ints pass through, UUID strings are normalized, and any other value
        is mapped to a deterministic UUIDv5 of its string form.
        """
        import uuid

        if isinstance(doc_id, int) and not isinstance(doc_id, bool) and doc_id >= 0:
            return doc_id
        text = str(doc_id)
        try:
            return str(uuid.UUID(text))
        except ValueError:
            return str(uuid.uuid5(uuid.NAMESPACE_URL, text))

    def add_documents(self, documents):
        """Embed and upload documents.

        Args:
            documents: Iterable of mappings with ``'id'`` and ``'content'``.
                An empty iterable is a no-op.

        The original ``'id'`` is also stored in the payload as ``'doc_id'``
        because the point id may have been converted (see ``_point_id``).
        """
        documents = list(documents)
        if not documents:
            return

        embeddings = self.embedding_model.encode([doc['content'] for doc in documents])
        self.vector_db.upload_collection(
            collection_name=self.collection_name,
            vectors=embeddings,
            ids=[self._point_id(doc['id']) for doc in documents],
            payload=[
                {"content": doc['content'], "doc_id": doc['id']}
                for doc in documents
            ],
        )

    def retrieve(self, query, k=5):
        """Return the ``'content'`` payload of the *k* nearest documents."""
        query_embedding = self.embedding_model.encode([query])
        # NOTE(review): newer qdrant-client releases steer users from
        # `search` to `query_points` — confirm against the installed version.
        results = self.vector_db.search(
            collection_name=self.collection_name,
            query_vector=query_embedding[0],
            limit=k
        )
        return [hit.payload['content'] for hit in results]

    def generate(self, query, context=None):
        """Generate an answer for *query*.

        Args:
            query: The user question.
            context: Optional list of context strings (a single string is
                accepted too). When ``None``, context is fetched with
                ``retrieve(query)``.

        Returns:
            The generated text with the prompt removed and whitespace
            stripped.
        """
        if context is None:
            context = self.retrieve(query)
        elif isinstance(context, str):
            # Joining a bare string would interleave "\n" between characters.
            context = [context]

        context_str = "\n".join(context)
        prompt = f"질문: {query}\n문맥: {context_str}\n답변:"

        response = self.llm(
            prompt,
            # Budget for newly generated tokens only; `max_length` also
            # counts the prompt, which leaves no room once context is added.
            max_new_tokens=200,
            num_return_sequences=1,
            truncation=True,
            # Ask for the continuation only, so no prompt stripping is
            # needed even when the prompt was truncated.
            return_full_text=False,
        )

        text = response[0]['generated_text']
        # Fallback for generators that still echo the prompt.
        if text.startswith(prompt):
            text = text[len(prompt):]
        return text.strip()

# 사용 예시
rag = RAGPipeline()
documents = [
    {"id": "1", "content": "Python은 인터프리터 언어로, 간결하고

---

📥 **Get the full guide on Gumroad**: https://gumroad.com/l/auto ($7)
Enter fullscreen mode Exit fullscreen mode

Top comments (0)