matias yoon

Posted on May 25

RAG 시스템 실전 구축 (v36)

#ai #llm #developers #tutorial

RAG 시스템 실전 구축 (v36)

1. RAG 기초 개념

RAG(Retrieval-Augmented Generation)는 정보 검색과 생성 모델을 결합한 아키텍처로, 대규모 언어 모델(LLM)이 외부 데이터를 근거로 더 정확한 응답을 생성할 수 있도록 합니다. 이 시스템은 다음과 같은 핵심 루프를 따릅니다:

질문 → 검색 → 증강 → 생성

검색 단계에서는 질문과 관련된 문서 청크를 벡터 데이터베이스에서 찾습니다. 증강 단계에서는 검색된 문서를 프롬프트에 포함시켜 LLM에게 추가 정보를 제공합니다. 생성 단계에서는 LLM이 증강된 프롬프트를 기반으로 답변을 생성합니다.

2. 청킹 전략

2.1 Semantic Chunking

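이 절의 구현은 텍스트를 토큰 단위의 고정 크기 윈도우로 나누고, 인접한 청크가 일부 토큰을 공유(오버랩)하도록 하여 청크 경계에서 문맥이 끊기는 것을 줄입니다. 문장의 의미를 분석해 경계를 정하는 본격적인 의미 기반 청킹으로 확장하기 위한 기준선 구현입니다.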

import tiktoken
from typing import List, Dict

class SemanticChunker:
    """Split text into fixed-size, overlapping windows of tokens.

    Despite the name, no semantic analysis is performed: the text is
    tokenized with the ``cl100k_base`` encoding and cut into windows of
    ``chunk_size`` tokens, where consecutive windows share ``overlap``
    tokens.
    """

    def __init__(self, chunk_size: int = 512, overlap: int = 64):
        """Store the window settings and load the tokenizer.

        Args:
            chunk_size: Maximum number of tokens per chunk; must be positive.
            overlap: Number of tokens shared by consecutive chunks; must be
                non-negative and strictly smaller than ``chunk_size``.

        Raises:
            ValueError: If the settings would produce a non-positive stride.
        """
        # Validate before loading the tokenizer so bad settings fail fast.
        # A stride of zero would crash range(); a negative stride would
        # silently produce no chunks at all.
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if not 0 <= overlap < chunk_size:
            raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

        self.chunk_size = chunk_size
        self.overlap = overlap
        self.tokenizer = tiktoken.get_encoding("cl100k_base")

    def chunk_text(self, text: str) -> List[str]:
        """Return the overlapping token windows of *text*, decoded to strings.

        Args:
            text: The document to split.

        Returns:
            The chunks in document order; an empty list when *text* encodes
            to no tokens.
        """
        tokens = self.tokenizer.encode(text)
        stride = self.chunk_size - self.overlap
        chunks = []

        for start in range(0, len(tokens), stride):
            window = tokens[start:start + self.chunk_size]
            chunks.append(self.tokenizer.decode(window))
            # Once a window reaches the end of the document, any further
            # window would only repeat tokens already emitted in this one.
            if start + self.chunk_size >= len(tokens):
                break

        return chunks

# Usage example
document = "..."  # a long document goes here
chunker = SemanticChunker(overlap=64, chunk_size=512)
chunks = chunker.chunk_text(document)

2.2 Recursive Chunking

이 절의 구현은 문서를 문장 단위로 나눈 뒤, 최대 청크 크기를 넘지 않는 범위에서 문장들을 순서대로 이어 붙여 청크를 만듭니다. 본격적인 재귀적 청킹은 여기에 더해 문단 → 문장 → 단어 순으로 구분자를 바꿔 가며 단계적으로 분할합니다.

import re
from typing import List

class RecursiveChunker:
    """Pack whole sentences into chunks of at most ``max_chunk_size`` characters."""

    # A sentence ends at '.', '!' or '?' followed by whitespace. The
    # lookbehind keeps the terminator attached to its sentence, and requiring
    # whitespace avoids splitting inside tokens such as "3.14".
    _SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')

    def __init__(self, max_chunk_size: int = 1024):
        """Store the chunk size limit.

        Args:
            max_chunk_size: Maximum chunk length in characters; must be positive.

        Raises:
            ValueError: If ``max_chunk_size`` is not positive.
        """
        if max_chunk_size <= 0:
            raise ValueError("max_chunk_size must be a positive integer")
        self.max_chunk_size = max_chunk_size

    def chunk_text(self, text: str) -> List[str]:
        """Split *text* into sentence-aligned chunks.

        Sentences keep their original punctuation and are joined with a
        single space. A single sentence longer than ``max_chunk_size`` is
        emitted as its own oversized chunk rather than being cut mid-sentence.

        Args:
            text: The document to split.

        Returns:
            The chunks in document order; an empty list for blank input.
        """
        chunks = []
        current = []      # sentences collected for the chunk being built
        current_len = 0   # length of " ".join(current)

        for raw_sentence in self._SENTENCE_BOUNDARY.split(text):
            sentence = raw_sentence.strip()
            if not sentence:
                continue

            # Account for the joining space when the chunk is non-empty.
            added = len(sentence) + (1 if current else 0)
            if current and current_len + added > self.max_chunk_size:
                chunks.append(" ".join(current))
                current = [sentence]
                current_len = len(sentence)
            else:
                current.append(sentence)
                current_len += added

        if current:
            chunks.append(" ".join(current))

        return chunks

2.3 Agentic Chunking

여기서는 엔티티 기반 청킹을 예로 듭니다. 문서에서 인식된 개체명(사람, 장소, 조직 등)을 기준으로 분할하며, 아래 예시는 spaCy의 영어 모델(en_core_web_sm)을 사용하므로 영어 문서에 적합합니다.

import spacy
from typing import List, Dict

class AgenticChunker:
    """Split text at the positions where named entities begin."""

    def __init__(self):
        """Load the spaCy pipeline used for entity recognition."""
        self.nlp = spacy.load("en_core_web_sm")

    def chunk_by_entities(self, text: str, entity_types: List[str] = None) -> List[str]:
        """Split *text* so that every selected entity starts a new chunk.

        Each chunk runs from the start of one selected entity up to the start
        of the next, so the text surrounding the entities is preserved. Text
        that precedes the first selected entity becomes its own chunk, and a
        document without any selected entity is returned as a single chunk.

        Args:
            text: The document to split.
            entity_types: Entity labels (compared against ``ent.label_``)
                that may start a chunk. ``None`` selects every entity.

        Returns:
            The whitespace-stripped, non-empty chunks in document order.
        """
        doc = self.nlp(text)
        wanted = None if entity_types is None else set(entity_types)

        # Character offsets at which a new chunk begins.
        boundaries = sorted({
            ent.start_char
            for ent in doc.ents
            if wanted is None or ent.label_ in wanted
        })

        chunks = []
        previous = 0
        # The trailing len(text) closes the final chunk.
        for boundary in boundaries + [len(text)]:
            piece = text[previous:boundary].strip()
            if piece:
                chunks.append(piece)
            previous = boundary

        return chunks

3. 임베딩 모델 선택 및 비교

3.1 모델 비교 테스트

from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

class EmbeddingBenchmark:
    """Compare embedding models by mean query-to-text cosine similarity."""

    def __init__(self):
        """Load every model that takes part in the comparison."""
        self.models = {
            'all-MiniLM-L6-v2': SentenceTransformer('all-MiniLM-L6-v2'),
            'sentence-t5-base': SentenceTransformer('sentence-t5-base'),
            'multi-qa-MiniLM-L6-v2': SentenceTransformer('multi-qa-MiniLM-L6-v2')
        }

    def benchmark_models(self, texts: List[str], query: str) -> Dict[str, float]:
        """Score each model by how similar it rates *query* to *texts*.

        Args:
            texts: Candidate passages; must not be empty.
            query: The query string to compare against every passage.

        Returns:
            A mapping from model name to the mean cosine similarity between
            the query embedding and the passage embeddings.

        Raises:
            ValueError: If *texts* is empty.
        """
        if not texts:
            raise ValueError("texts must not be empty")

        results = {}
        for name, model in self.models.items():
            # The query must be embedded by the same model as the passages:
            # vectors from different models live in different spaces (and may
            # have different dimensions), so mixing them is meaningless.
            query_embedding = model.encode([query])
            embeddings = model.encode(texts)
            similarity = cosine_similarity(query_embedding, embeddings)[0]
            results[name] = float(np.mean(similarity))

        return results

# Compare the models on a small sample
query = "artificial intelligence"
texts = ["AI technology", "Machine Learning", "Deep Learning"]
benchmark = EmbeddingBenchmark()
scores = benchmark.benchmark_models(texts, query)
print(scores)

4. 벡터 데이터베이스 비교

4.1 Chroma vs Qdrant vs pgvector

# Chroma 구현
import chromadb
from chromadb.utils import embedding_functions

class ChromaVectorDB:
    """Minimal wrapper exposing add/search on a single Chroma collection."""

    def __init__(self, collection_name: str):
        """Open (or create) the named collection on a default Chroma client.

        Args:
            collection_name: Name of the collection to get or create.
        """
        client = chromadb.Client()
        default_embedder = embedding_functions.DefaultEmbeddingFunction()
        self.client = client
        self.collection = client.get_or_create_collection(
            name=collection_name,
            embedding_function=default_embedder,
        )

    def add_documents(self, documents: List[str], ids: List[str]):
        """Add *documents* to the collection under the given *ids*.

        Args:
            documents: Raw document texts, passed to the collection as-is.
            ids: Identifiers passed alongside the documents.
        """
        self.collection.add(ids=ids, documents=documents)

    def search(self, query: str, limit: int = 5):
        """Query the collection with a single text and return the raw result.

        Args:
            query: The query text.
            limit: Number of results requested from the collection.

        Returns:
            Whatever ``collection.query`` returns, unmodified.
        """
        return self.collection.query(query_texts=[query], n_results=limit)

# Qdrant 구현
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Filter, FieldCondition

class QdrantVectorDB:
    """Wrapper exposing add/search on one collection of an in-memory Qdrant."""

    def __init__(self, collection_name: str, embedding_fn=None, vector_size: int = 384):
        """Create the client and (re)create the collection.

        Args:
            collection_name: Name of the collection to create.
            embedding_fn: Optional callable mapping a text to a sequence of
                floats of length ``vector_size``. Without it,
                ``get_embedding`` must be overridden in a subclass.
            vector_size: Dimensionality of the stored vectors.
        """
        self.client = QdrantClient(":memory:")  # in-memory instance
        self.collection_name = collection_name
        self.embedding_fn = embedding_fn

        # NOTE: recreate_collection replaces any existing collection that has
        # the same name.
        self.client.recreate_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=vector_size, distance="Cosine")
        )

    @staticmethod
    def _to_point_id(raw_id):
        """Map an arbitrary document id to a valid Qdrant point id.

        Non-negative integers and UUID strings are kept; any other value is
        mapped to a deterministic UUID derived from its string form, so the
        same document id always lands on the same point.
        """
        import uuid

        if isinstance(raw_id, int) and not isinstance(raw_id, bool) and raw_id >= 0:
            return raw_id
        text = str(raw_id)
        try:
            return str(uuid.UUID(text))
        except ValueError:
            return str(uuid.uuid5(uuid.NAMESPACE_URL, text))

    def add_documents(self, documents: List[str], ids: List[str]):
        """Embed *documents* and upsert them into the collection.

        The original id is stored in the payload under ``doc_id`` because the
        point id itself may have been converted to a UUID.

        Args:
            documents: Raw document texts.
            ids: One identifier per document.

        Raises:
            ValueError: If *documents* and *ids* differ in length.
        """
        # Imported locally so the class does not depend on how the
        # surrounding file spells its qdrant imports.
        from qdrant_client.models import PointStruct

        if len(documents) != len(ids):
            raise ValueError("documents and ids must have the same length")
        if not documents:
            return

        points = [
            PointStruct(
                id=self._to_point_id(doc_id),
                vector=self.get_embedding(doc),
                payload={"text": doc, "doc_id": doc_id},
            )
            for doc_id, doc in zip(ids, documents)
        ]
        self.client.upsert(collection_name=self.collection_name, points=points)

    def search(self, query: str, limit: int = 5):
        """Return the raw search hits for *query*.

        Args:
            query: The query text; it is embedded with ``get_embedding``.
            limit: Maximum number of hits requested.

        Returns:
            Whatever ``client.search`` returns, unmodified.
        """
        query_vector = self.get_embedding(query)
        results = self.client.search(
            collection_name=self.collection_name,
            query_vector=query_vector,
            limit=limit
        )
        return results

    def get_embedding(self, text: str):
        """Return the embedding of *text* as a list of floats.

        Raises:
            NotImplementedError: If no ``embedding_fn`` was supplied and the
                method has not been overridden.
        """
        embedding_fn = getattr(self, "embedding_fn", None)
        if embedding_fn is None:
            # Fail loudly instead of handing None to Qdrant as a vector.
            raise NotImplementedError(
                "No embedding function configured: pass embedding_fn to "
                "QdrantVectorDB or override get_embedding()."
            )
        return [float(value) for value in embedding_fn(text)]

5. 전체 RAG 파이프라인 구현


python
import asyncio
from typing import List, Dict, Any
import logging

class RAGPipeline:
    def __init__(self, embedding_model: str, vector_db: Any, llm: Any):
        """Wire together the collaborators used by the pipeline.

        Args:
            embedding_model: Model name handed to ``SentenceTransformer``.
            vector_db: Object providing ``add_documents`` and ``search``.
            llm: Object providing ``generate``.
        """
        # Injected collaborators.
        self.llm = llm
        self.vector_db = vector_db

        # Components built by the pipeline itself.
        self.embedding_model = SentenceTransformer(embedding_model)
        self.chunker = SemanticChunker()
        self.logger = logging.getLogger(__name__)

    async def build_index(self, documents: List[Dict[str, str]]):
        """문서 인덱싱"""
        all_chunks = []
        ids = []

        for doc in documents:
            chunks = self.chunker.chunk_text(doc['content'])
            for i, chunk in enumerate(chunks):
                all_chunks.append(chunk)
                ids.append(f"{doc['id']}_{i}")

        # 임베딩 생성
        embeddings = self.embedding_model.encode(all_chunks)

        # 벡터 DB에 저장
        self.vector_db.add_documents(all_chunks, ids)
        self.logger.info(f"인덱싱 완료: {len(all_chunks)}개 청크")

    async def retrieve_and_generate(self, query: str, top_k: int = 5) -> Dict[str, Any]:
        """검색 및 생성"""
        # 검색
        results = self.vector_db.search(query, top_k)

        # 검색 결과 정리
        retrieved_docs = [doc['text'] for doc in results['documents'][0]]

        # 프롬프트 생성
        context = "\n".join(retrieved_docs)
        prompt = f"""
        다음 문맥을 기반으로 질문에 답하세요:
        문맥: {context}
        질문: {query}
        답변:
        """

        # 생성
        response = self.llm.generate(prompt)

        return {
            'query': query,
            'retrieved_docs': retrieved_docs,
            'answer': response,
            'confidence': self.calculate_confidence(results)
        }

    def calculate_confidence(self, search_results: Dict) -> float:
        # 유사도 기반 신뢰도 계산
        if 'distances' in search_results

---

📥 **Get the full guide on Gumroad**: https://gumroad.com/l/auto ($7)

DEV Community

RAG 시스템 실전 구축 (v36)

RAG 시스템 실전 구축 (v36)

1. RAG 기초 개념

2. 청킹 전략

2.1 Semantic Chunking

2.2 Recursive Chunking

2.3 Agentic Chunking

3. 임베딩 모델 선택 및 비교

3.1 모델 비교 테스트

4. 벡터 데이터베이스 비교

4.1 Chroma vs Qdrant vs pgvector

5. 전체 RAG 파이프라인 구현

Top comments (0)