DEV Community

matias yoon
matias yoon

Posted on

RAG 시스템 실전 구축 (v28)

RAG 시스템 실전 구축 (v28)

1. RAG 기초 개념: 검색 → 보완 → 생성 루프

RAG(Retrieval-Augmented Generation)는 LLM의 생성 능력을 외부 지식 소스와 결합하여 더 정확하고 최신의 정보를 제공하는 아키텍처입니다. 다음 세 가지 주요 단계로 구성됩니다:

1.1 검색 단계 (Retrieval)

문서에서 관련 정보를 찾는 단계. 임베딩 벡터와 유사도 검색을 통해 최상위 K개의 문서를 선택합니다.

1.2 보완 단계 (Augmentation)

검색된 문서를 프롬프트에 포함하여 LLM이 생성할 때 보다 풍부한 맥락을 제공합니다.

1.3 생성 단계 (Generation)

LLM이 보완된 프롬프트를 기반으로 응답을 생성합니다.

# 간단한 RAG 루프 구현
class SimpleRAG:
    """Minimal retrieve-then-generate loop.

    Args:
        embedding_model: Object exposing ``encode(list_of_texts)``; used to
            embed the query.
        vector_db: Object exposing ``search(query_embedding, k)``; its return
            value is handed back from :meth:`retrieve` unchanged.
        llm: Optional callable ``llm(prompt) -> str`` that produces the final
            answer. It is optional so existing two-argument construction
            keeps working.
    """

    def __init__(self, embedding_model, vector_db, llm=None):
        self.embedding_model = embedding_model
        self.vector_db = vector_db
        self.llm = llm

    def retrieve(self, query, k=5):
        """Embed ``query`` and return the vector store's top-``k`` results."""
        # The encoder is called with a one-element batch, as in the original.
        query_embedding = self.embedding_model.encode([query])
        return self.vector_db.search(query_embedding, k)

    def generate(self, query, retrieved_docs):
        """Build the augmented prompt and return the LLM's answer.

        Args:
            query: The user question.
            retrieved_docs: Iterable of document strings to place in the prompt.

        Returns:
            Whatever ``self.llm`` returns for the assembled prompt.

        Raises:
            NotImplementedError: If no ``llm`` callable was configured.
        """
        prompt = f"질문: {query}\n참조 문서:\n" + "\n".join(retrieved_docs)
        # The original returned an undefined ``response`` variable (NameError);
        # delegate to the injected callable instead.
        llm = getattr(self, "llm", None)
        if llm is None:
            raise NotImplementedError(
                "SimpleRAG.generate needs an `llm` callable; "
                "pass one as SimpleRAG(embedding_model, vector_db, llm=...)"
            )
        return llm(prompt)
Enter fullscreen mode Exit fullscreen mode

2. 청킹 전략 (Chunking Strategies)

문서를 적절한 크기로 나누는 전략은 RAG 성능에 결정적 영향을 미칩니다.

2.1 의미적 청킹 (Semantic Chunking)

문서를 의미 단위(문장·단락 등)로 분할하여 각 청크가 하나의 일관된 맥락을 유지하도록 합니다. 아래 예제는 이를 단순화하여, 문장 경계를 기준으로 최대 길이 이내에서 문장들을 묶습니다.

# Semantic chunking 예제
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

class SemanticChunker:
    """Group consecutive sentences into size-bounded chunks.

    The embedding model is stored on ``self.model`` but is not used by
    :meth:`chunk_semantically`; splitting is purely sentence/length based.
    """

    def __init__(self, embedding_model):
        self.model = embedding_model

    def chunk_semantically(self, text, min_chunk_size=100, max_chunk_size=500):
        """Split ``text`` on ``'. '`` and pack sentences into chunks.

        Args:
            text: The document text.
            min_chunk_size: Accepted for interface compatibility; currently
                not used by the algorithm.
            max_chunk_size: A chunk is closed before adding a sentence that
                would push its accumulated sentence length above this value.
                A single sentence longer than the limit still becomes its own
                chunk.

        Returns:
            List of chunk strings; an empty list for empty input.
        """
        pieces = text.split('. ')
        # ``split`` drops the period of every sentence except the last one;
        # put it back so chunk text keeps its sentence terminators.
        sentences = [piece + '.' for piece in pieces[:-1]] + pieces[-1:]

        chunks = []
        current_chunk = []
        current_length = 0

        for sentence in sentences:
            if not sentence:
                # Empty input or a trailing '. ' yields an empty piece.
                continue
            sentence_length = len(sentence)
            if current_length + sentence_length > max_chunk_size and current_chunk:
                chunks.append(' '.join(current_chunk))
                current_chunk = [sentence]
                current_length = sentence_length
            else:
                current_chunk.append(sentence)
                # Joining spaces are not counted toward the length budget.
                current_length += sentence_length

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks
Enter fullscreen mode Exit fullscreen mode

2.2 재귀적 청킹 (Recursive Chunking)

문서를 여러 수준의 청크로 분할하는 방식입니다. 헤더와 본문을 분리하여 의미를 유지합니다. 아래 예제는 단순화를 위해, 고정 크기의 청크를 일정 길이만큼 겹치게(overlap) 잘라내는 방식으로 구현했습니다.

class RecursiveChunker:
    """Cut text into fixed-size character windows that overlap.

    Args:
        chunk_size: Maximum number of characters per chunk.
        overlap: Number of trailing characters of one chunk that are repeated
            at the start of the next.
    """

    def __init__(self, chunk_size=512, overlap=50):
        self.chunk_size = chunk_size
        self.overlap = overlap

    def recursive_chunk(self, text):
        """Return the list of overlapping chunks covering ``text``.

        Returns:
            List of substrings in order; an empty list for empty input.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``overlap`` is
                not smaller than ``chunk_size`` (the window would never
                advance).
        """
        if self.chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if self.overlap >= self.chunk_size:
            raise ValueError("overlap must be smaller than chunk_size")

        chunks = []
        text_length = len(text)
        start = 0

        while start < text_length:
            end = min(start + self.chunk_size, text_length)
            chunks.append(text[start:end])
            if end == text_length:
                # Last window reached. Without this stop, stepping back by
                # ``overlap`` keeps ``start`` below the text length forever.
                break
            start = end - self.overlap

        return chunks
Enter fullscreen mode Exit fullscreen mode

3. 임베딩 모델 선택 및 비교

임베딩 품질은 검색 정확도에 직접적인 영향을 줍니다.

# 임베딩 모델 비교
from sentence_transformers import SentenceTransformer
import numpy as np

class EmbeddingBenchmark:
    """Hold several sentence-embedding models and compare their outputs."""

    def __init__(self):
        # Display label -> identifier passed to SentenceTransformer.
        model_ids = {
            'all-MiniLM-L6-v2': 'all-MiniLM-L6-v2',
            'sentence-t5-xxl': 'sentence-t5-xxl',
            'bge-small-en': 'BAAI/bge-small-en',
        }
        # Every model is instantiated up front, in the order listed above.
        self.models = {
            label: SentenceTransformer(identifier)
            for label, identifier in model_ids.items()
        }

    def evaluate_model(self, texts, model_name):
        """Return the embeddings the model registered as ``model_name`` gives for ``texts``."""
        return self.models[model_name].encode(texts)

    def compare_embeddings(self, text_pairs):
        """Return ``{model label: cosine similarity}`` for the first two texts.

        Only elements 0 and 1 of each model's encoded output are compared.
        """

        def cosine(first, second):
            # dot(a, b) / (|a| * |b|)
            return np.dot(first, second) / (
                np.linalg.norm(first) * np.linalg.norm(second)
            )

        scores = {}
        for label, encoder in self.models.items():
            vectors = encoder.encode(text_pairs)
            scores[label] = cosine(vectors[0], vectors[1])
        return scores

# Usage example
# Constructing EmbeddingBenchmark instantiates every model listed in its __init__.
benchmark = EmbeddingBenchmark()
# Despite the plural name this is a single pair: compare_embeddings compares
# element 0 against element 1.
text_pairs = ["This is a test sentence.", "This is another test sentence."]
similarities = benchmark.compare_embeddings(text_pairs)
# Prints a dict mapping each model label to its cosine similarity for the pair.
print(similarities)
Enter fullscreen mode Exit fullscreen mode

4. 벡터 데이터베이스 비교

다양한 벡터 데이터베이스의 성능과 특성 비교:

4.1 Chroma

import chromadb
from chromadb import Client

# Create a Chroma client with default settings.
# NOTE(review): presumably an in-memory, non-persistent client — confirm
# against the installed chromadb version before relying on stored data.
client = Client()
# Fetch the "rag_collection" collection, creating it if it does not exist yet.
collection = client.get_or_create_collection("rag_collection")

def chroma_store_documents(documents):
    """Embed ``documents`` and append them to the module-level ``collection``.

    Relies on two module-level names: ``collection`` (the Chroma collection)
    and ``embedding_model`` (an object with ``encode``), which must be
    defined by the caller's module before this function is used.

    Args:
        documents: Iterable of document strings. Empty input is a no-op.
    """
    documents = list(documents)
    if not documents:
        # Nothing to embed or store; avoid calling the encoder / Chroma
        # with empty batches.
        return

    embeddings = embedding_model.encode(documents)
    # Continue numbering after what is already stored so that a second call
    # does not reuse doc_0, doc_1, ... and collide with earlier documents.
    # (Ids are count-based, so they can repeat if documents were deleted.)
    first_index = collection.count()
    collection.add(
        documents=documents,
        embeddings=embeddings,
        ids=[f"doc_{first_index + offset}" for offset in range(len(documents))]
    )
Enter fullscreen mode Exit fullscreen mode

4.2 Qdrant

from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Filter

# Connect to a Qdrant server expected at localhost:6333.
client = QdrantClient(host="localhost", port=6333)
# (Re)create "rag_collection" with 384-dimensional vectors and cosine distance.
# NOTE(review): recreate_collection presumably drops an existing collection of
# the same name, and newer qdrant-client releases may deprecate it — confirm
# before running against real data.
# NOTE(review): 384 presumably matches the embedding model's output size — confirm.
client.recreate_collection(
    collection_name="rag_collection",
    vectors_config=VectorParams(size=384, distance="Cosine")
)
Enter fullscreen mode Exit fullscreen mode

4.3 pgvector

-- PostgreSQL + pgvector setup
-- Enable the vector extension; IF NOT EXISTS makes this statement safe to re-run.
CREATE EXTENSION IF NOT EXISTS vector;
-- Document table: auto-incrementing id, raw text, and a 384-dimensional embedding.
-- NOTE(review): 384 presumably matches the embedding model's output size — confirm.
-- NOTE(review): unlike the statement above, this one has no IF NOT EXISTS.
CREATE TABLE documents (
    id SERIAL PRIMARY KEY,
    content TEXT,
    embedding VECTOR(384)
);
Enter fullscreen mode Exit fullscreen mode

4.4 Milvus

from pymilvus import Collection, FieldSchema, DataType, CollectionSchema

# Schema fields: a caller-supplied INT64 primary key (auto_id=False) and a
# 384-dimensional float vector. No field for the document text is declared.
fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=False),
    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=384)
]
# The second positional argument is a human-readable description string.
schema = CollectionSchema(fields, "RAG collection")
# NOTE(review): no connection setup is visible in this snippet; pymilvus
# presumably needs connections.connect(...) before Collection(...) — confirm.
collection = Collection("rag_collection", schema)
Enter fullscreen mode Exit fullscreen mode

5. 완전한 RAG 파이프라인 구현

import os
from sentence_transformers import SentenceTransformer
from chromadb import Client
import numpy as np

class CompleteRAGPipeline:
    """End-to-end RAG pipeline: embed, store in Chroma, retrieve, then answer.

    Args:
        model_name: Identifier passed to ``SentenceTransformer`` for the
            embedding model.
    """

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        # Embedding model plus a Chroma collection named "rag_docs".
        self.embedding_model = SentenceTransformer(model_name)
        self.vector_db = Client()
        self.collection = self.vector_db.get_or_create_collection("rag_docs")

    def add_documents(self, documents):
        """Embed ``documents`` and append them to the collection.

        Args:
            documents: Iterable of document strings. Empty input is a no-op.
        """
        documents = list(documents)
        if not documents:
            # Avoid calling the encoder / Chroma with empty batches.
            return

        embeddings = self.embedding_model.encode(documents)
        # Continue numbering after what is already stored so a second call
        # does not reuse doc_0, doc_1, ... and collide with earlier documents.
        # (Ids are count-based, so they can repeat if documents were deleted.)
        first_index = self.collection.count()
        self.collection.add(
            documents=documents,
            embeddings=embeddings,
            ids=[f"doc_{first_index + offset}" for offset in range(len(documents))]
        )

    def search_documents(self, query, k=5):
        """Return the documents the collection reports for ``query``.

        Args:
            query: The search text.
            k: Number of results requested from the collection.

        Returns:
            The document list for the single query (``results['documents'][0]``).
        """
        query_embedding = self.embedding_model.encode([query])
        results = self.collection.query(
            query_embeddings=query_embedding,
            n_results=k,
            include=['documents']
        )
        # One query was sent, so take the first (only) result list.
        return results['documents'][0]

    def generate_response(self, query, context_docs):
        """Ask an OpenAI chat model to answer ``query`` using ``context_docs``.

        Reads the API key from the ``OPENAI_API_KEY`` environment variable.

        Args:
            query: The user question.
            context_docs: Iterable of document strings placed in the prompt.

        Returns:
            The content of the first choice's message.
        """
        import openai

        api_key = os.getenv("OPENAI_API_KEY")
        openai.api_key = api_key

        prompt = f"질문: {query}\n참조 문헌:\n" + "\n".join(context_docs)
        request = {
            "model": "gpt-3.5-turbo",
            "messages": [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt},
            ],
            "max_tokens": 500,
        }

        if hasattr(openai, "OpenAI"):
            # openai>=1.0 removed openai.ChatCompletion; use the client API.
            client = openai.OpenAI(api_key=api_key)
            response = client.chat.completions.create(**request)
        else:
            # Legacy (<1.0) module-level API.
            response = openai.ChatCompletion.create(**request)
        return response.choices[0].message.content

    def rag_query(self, query, k=5):
        """Run the full pipeline: retrieve ``k`` documents, then generate.

        Args:
            query: The user question.
            k: Number of context documents to retrieve (default 5, matching
                the previously hard-coded value).

        Returns:
            The generated answer string.
        """
        # 1. Retrieve
        context_docs = self.search_documents(query, k=k)
        # 2. Generate
        return self.generate_response(query, context_docs)

# Usage example
# Constructing the pipeline instantiates the embedding model and the Chroma client.
pipeline = CompleteRAGPipeline()
pipeline.add_documents(["문서 내용 1", "문서 내용 2"])
# rag_query retrieves context and then calls generate_response, which reads
# OPENAI_API_KEY from the environment and calls the OpenAI API.
result = pipeline.rag_query("질문 내용")
print(result)
Enter fullscreen mode Exit fullscreen mode

6. 고급 기능: 쿼리 변환, 하이브리드 검색, 재순위

6.1 쿼리 변환


python
class QueryTransformer:
    def __init__(self):
        pass

    def transform_query(self, query):
        # 쿼리 최적화
        transformations = [
            lambda q: q.lower(),
            lambda q: q.replace("?", "").replace("!", ""),
            lambda q: self.expand_query_terms(q)
        ]

        for transform in transformations:


---

📥 **Get the full guide on Gumroad**: https://gumroad.com/l/auto ($7)
Enter fullscreen mode Exit fullscreen mode

Top comments (0)