matias yoon

Posted on May 25

RAG 시스템 실전 구축 (v27)

#ai #llm #developers #tutorial

RAG 시스템 실전 구축 (v27)

1. RAG 기초 개념

Retrieval-Augmented Generation(RAG)는 정보 검색과 생성 모델을 결합하여, 외부 지식 베이스에서 찾은 문서를 근거로 더 정확한 답변을 생성하는 시스템입니다. RAG의 핵심 루프는 다음과 같습니다:

검색 (Retrieval): 사용자 쿼리와 유사한 문서 조각 찾기
보강 (Augmentation): 검색된 문서를 프롬프트에 통합
생성 (Generation): LLM이 보강된 프롬프트를 기반으로 답변 생성

# 간단한 RAG 루프 구현
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

class SimpleRAG:
    """Minimal retrieve-then-generate loop.

    Args:
        embeddings: Object exposing ``encode(list_of_texts)``; the first
            element of its result is used as the query embedding.
        documents: Sequence of document strings, indexed by whatever
            ``vector_db.search`` returns.
        vector_db: Object exposing ``search(query_embedding, k)`` that
            returns an iterable of indices into ``documents``.
        llm: Optional object exposing ``generate(prompt)``. It may also be
            assigned later through the ``llm`` attribute.
    """

    def __init__(self, embeddings, documents, vector_db, llm=None):
        self.embeddings = embeddings
        self.documents = documents
        self.vector_db = vector_db
        # ``generate`` reads ``self.llm``; without this assignment the
        # attribute never existed and every call raised AttributeError.
        self.llm = llm

    def retrieve(self, query, k=3):
        """Return the ``k`` documents the vector store ranks for ``query``."""
        query_embedding = self.embeddings.encode([query])[0]
        retrieved_indices = self.vector_db.search(query_embedding, k)
        return [self.documents[i] for i in retrieved_indices]

    def generate(self, query, retrieved_docs):
        """Build a prompt from the retrieved documents and ask the LLM.

        Args:
            query: The user question.
            retrieved_docs: Iterable of document strings used as context.

        Returns:
            Whatever ``self.llm.generate(prompt)`` returns.

        Raises:
            RuntimeError: If no LLM has been configured.
        """
        if self.llm is None:
            raise RuntimeError(
                "SimpleRAG.generate() requires an LLM; pass llm=... to the "
                "constructor or assign the 'llm' attribute"
            )
        # Simple prompt: question, newline-joined context, answer cue.
        context = "\n".join(retrieved_docs)
        prompt = f"질문: {query}\n문맥: {context}\n답변:"
        return self.llm.generate(prompt)

2. Chunking 전략

문서를 적절한 크기로 나누는 것이 중요합니다. 세 가지 주요 전략:

Semantic Chunking

의미 기반으로 문서 분할

from sentence_transformers import SentenceTransformer
import numpy as np

class SemanticChunker:
    """Group consecutive sentences whose embeddings are similar.

    Args:
        model_name: Name passed to ``SentenceTransformer``.
    """

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)

    def chunk_by_semantic(self, text, threshold=0.75):
        """Split ``text`` on '.' and merge neighbouring similar sentences.

        Each sentence is compared with the *first* sentence of the chunk
        currently being built (the anchor), not with the previous sentence.
        A sentence joins the chunk when its cosine similarity to the anchor
        is strictly greater than ``threshold``; otherwise it starts a new
        chunk and becomes the new anchor.

        Args:
            text: Input text; sentences are delimited by '.'.
            threshold: Similarity above which a sentence stays in the chunk.

        Returns:
            List of chunks, each the '.'-joined sentences of one group.
            Empty or whitespace-only sentences are dropped, so empty text
            yields ``[]``.
        """
        # A trailing '.' (or empty text) produces empty fragments; embedding
        # them wastes a model call and can emit empty chunks.
        sentences = [part for part in text.split('.') if part.strip()]
        if not sentences:
            return []

        embeddings = np.asarray(self.model.encode(sentences), dtype=float)

        chunks = []
        current_chunk = [sentences[0]]
        anchor = embeddings[0]

        for sentence, embedding in zip(sentences[1:], embeddings[1:]):
            if self._cosine(anchor, embedding) > threshold:
                current_chunk.append(sentence)
            else:
                chunks.append('.'.join(current_chunk))
                current_chunk = [sentence]
                anchor = embedding

        chunks.append('.'.join(current_chunk))
        return chunks

    @staticmethod
    def _cosine(a, b):
        """Return the cosine similarity of two 1-D vectors (0.0 if either is all zeros)."""
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        if denom == 0:
            return 0.0
        return float(np.dot(a, b) / denom)

Recursive Chunking

고정 크기 윈도우를 일정 길이(overlap)만큼 겹치며 이동시켜 작은 조각으로 분할 (아래 구현은 이름과 달리 재귀 호출이 아닌 반복문 기반 슬라이딩 윈도우 방식입니다)

def recursive_chunking(text, max_chunk_size=512, overlap=50):
    """Split ``text`` into fixed-size windows that overlap their neighbour.

    Despite the name, the implementation is iterative: a window of
    ``max_chunk_size`` characters advances by ``max_chunk_size - overlap``
    characters until it covers the end of the text.

    Args:
        text: The string to split.
        max_chunk_size: Maximum number of characters per chunk.
        overlap: Number of characters shared by consecutive chunks.

    Returns:
        ``[text]`` when the text already fits in one chunk, otherwise the
        list of overlapping chunks in order.

    Raises:
        ValueError: If the text needs splitting and ``max_chunk_size`` is
            not positive or ``overlap`` is not in ``[0, max_chunk_size)``;
            such values cannot make progress through the text.
    """
    if len(text) <= max_chunk_size:
        return [text]

    if max_chunk_size <= 0:
        raise ValueError("max_chunk_size must be positive")
    if not 0 <= overlap < max_chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < max_chunk_size")

    step = max_chunk_size - overlap
    chunks = []
    start = 0

    while True:
        end = start + max_chunk_size
        chunks.append(text[start:end])
        # Stop once the window reaches the end. Stepping back by ``overlap``
        # from the final position would re-emit the tail forever.
        if end >= len(text):
            break
        start += step

    return chunks

Agentic Chunking

LLM 기반 자동 분할

from openai import OpenAI

class AgenticChunker:
    """Ask a chat-completions client to split text into chunks.

    Args:
        client: Object exposing ``chat.completions.create(...)`` whose
            result has ``choices[0].message.content``.
    """

    def __init__(self, client):
        self.client = client

    def chunk_with_llm(self, text, max_tokens=200):
        """Request a split of ``text`` and return one chunk per output line.

        Args:
            text: The text to split.
            max_tokens: Per-chunk token budget stated in the prompt. It is
                only an instruction to the model; the API call itself is
                capped separately at 1000 completion tokens.

        Returns:
            The non-blank lines of the model's reply, stripped of
            surrounding whitespace. An empty or missing reply yields ``[]``.
        """
        prompt = f"""
        다음 텍스트를 {max_tokens} 토큰 이내로 분할해주세요:
        {text}
        분할된 각 조각은 의미가 완전한 문장이어야 하며, 
        최대한 의미를 유지하면서 분할해주세요.
        """

        response = self.client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=1000
        )

        # The reply may be None, and models often separate chunks with
        # blank lines; neither should surface as an empty chunk.
        content = response.choices[0].message.content or ""
        return [line.strip() for line in content.splitlines() if line.strip()]

3. 임베딩 모델 선택 및 비교

from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
import torch

class EmbeddingBenchmark:
    """Load several sentence-embedding models and compare them on a text set.

    All models are instantiated eagerly in the constructor.
    """

    def __init__(self):
        self.models = {
            'all-MiniLM-L6-v2': SentenceTransformer('all-MiniLM-L6-v2'),
            'all-mpnet-base-v2': SentenceTransformer('all-mpnet-base-v2'),
            'BAAI/bge-small-en': SentenceTransformer('BAAI/bge-small-en'),
            # 'sentence-t5-encoder' does not appear to be a published model
            # name; 'sentence-t5-base' is the sentence-transformers T5 encoder.
            'sentence-t5-base': SentenceTransformer('sentence-t5-base'),
        }

    def evaluate_models(self, texts):
        """Encode ``texts`` with every model and collect basic metrics.

        Args:
            texts: List of strings to embed.

        Returns:
            Dict mapping model name to a dict with:
            ``embedding_size`` (second dimension of the embedding array),
            ``memory_usage`` (size of the embedding array in MiB) and
            ``speed`` (texts encoded per second of wall-clock time for this
            single ``encode`` call).
        """
        import time

        results = {}
        for name, model in self.models.items():
            started = time.perf_counter()
            embeddings = model.encode(texts)
            elapsed = time.perf_counter() - started
            results[name] = {
                'embedding_size': embeddings.shape[1],
                'memory_usage': embeddings.nbytes / (1024**2),  # MB
                # Measured throughput instead of a placeholder constant;
                # guard against a zero reading from a coarse clock.
                'speed': len(texts) / elapsed if elapsed > 0 else float('inf'),
            }
        return results

# Usage example: build the benchmark and evaluate it on three sample texts.
# Constructing EmbeddingBenchmark instantiates every model listed in its
# ``models`` dict, so this runs model loading at import time.
benchmark = EmbeddingBenchmark()
texts = ["문서 1 내용", "문서 2 내용", "문서 3 내용"]
# ``results`` maps each model name to its metrics dict.
results = benchmark.evaluate_models(texts)

4. 벡터 데이터베이스 비교

Chroma

import chromadb
from chromadb import Client

class ChromaVectorDB:
    """Small adapter around one Chroma collection.

    Args:
        collection_name: Name of the collection to open or create.
    """

    def __init__(self, collection_name="rag_collection"):
        chroma_client = Client()
        self.client = chroma_client
        self.collection = chroma_client.get_or_create_collection(collection_name)

    def add_documents(self, documents, embeddings, ids):
        """Forward documents, their embeddings and their ids to the collection."""
        record = {
            "documents": documents,
            "embeddings": embeddings,
            "ids": ids,
        }
        self.collection.add(**record)

    def search(self, query_embedding, n_results=5):
        """Query with a single embedding and return the ids of its matches.

        The collection is queried with a one-element batch, so only the
        first (and only) id list of the response is returned.
        """
        response = self.collection.query(
            n_results=n_results,
            query_embeddings=[query_embedding],
        )
        ids_per_query = response['ids']
        return ids_per_query[0]

Qdrant

from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition, MatchValue

class QdrantVectorDB:
    """Small adapter around one Qdrant collection using cosine distance.

    Args:
        host: Qdrant server host.
        port: Qdrant server port.
        collection_name: Collection to (re)create and use.
        vector_size: Dimensionality of the stored vectors. Defaults to 384;
            set it to the output size of the embedding model in use.
    """

    def __init__(self, host="localhost", port=6333, collection_name="rag_collection",
                 vector_size=384):
        self.client = QdrantClient(host=host, port=port)
        self.collection_name = collection_name
        self.vector_size = vector_size

        # Create the collection. NOTE: ``recreate_collection`` replaces an
        # existing collection of the same name, so constructing this class
        # discards previously stored points.
        self.client.recreate_collection(
            collection_name=collection_name,
            vectors_config={"size": vector_size, "distance": "Cosine"}
        )

    def add_documents(self, documents, embeddings, ids=None):
        """Upsert one point per document.

        Args:
            documents: Sequence of document strings, stored as payload
                ``{"text": doc}``.
            embeddings: Sequence of vectors (numpy arrays or plain
                sequences), one per document.
            ids: Point ids, one per document. Qdrant accepts unsigned
                integers or UUIDs. When omitted, positions ``0..n-1`` are
                used, which overwrites points from an earlier call that
                also relied on positional ids.

        Raises:
            ValueError: If ``documents``, ``embeddings`` and ``ids`` do not
                have the same length.
        """
        documents = list(documents)
        if len(embeddings) != len(documents):
            raise ValueError("documents and embeddings must have the same length")
        if ids is None:
            ids = range(len(documents))
        else:
            ids = list(ids)
            if len(ids) != len(documents):
                raise ValueError("ids and documents must have the same length")

        points = [
            {
                "id": point_id,
                "vector": self._as_list(embedding),
                "payload": {"text": doc}
            }
            for point_id, doc, embedding in zip(ids, documents, embeddings)
        ]
        if points:
            self.client.upsert(collection_name=self.collection_name, points=points)

    def search(self, query_embedding, n_results=5):
        """Return the ids of the ``n_results`` nearest points to the query vector."""
        results = self.client.search(
            collection_name=self.collection_name,
            query_vector=self._as_list(query_embedding),
            limit=n_results
        )
        return [point.id for point in results]

    @staticmethod
    def _as_list(vector):
        """Return ``vector`` as a plain list, accepting numpy arrays or sequences."""
        return vector.tolist() if hasattr(vector, "tolist") else list(vector)

5. 완전한 RAG 파이프라인 구현


python
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb import Client

class CompleteRAGPipeline:
    def __init__(self, model_name="all-MiniLM-L6-v2"):
        """Create the embedding model, the Chroma collection and the LLM.

        Args:
            model_name: Name passed to ``SentenceTransformer``.
        """
        # 1. Initialise the embedding model
        self.embedding_model = SentenceTransformer(model_name)

        # 2. Initialise the vector database
        self.client = Client()
        self.collection = self.client.get_or_create_collection("rag_docs")

        # 3. Initialise the LLM (LLaMA is used as an example)
        # NOTE(review): "llama-2-7b" has no organisation prefix; presumably it
        # is a placeholder or a local directory rather than a Hub id — confirm
        # the intended checkpoint before running.
        from transformers import LlamaTokenizer, LlamaForCausalLM
        self.tokenizer = LlamaTokenizer.from_pretrained("llama-2-7b")
        self.model = LlamaForCausalLM.from_pretrained("llama-2-7b")

    def setup_document_store(self, documents):
        """문서 저장소 설정"""
        embeddings = self.embedding_model.encode(documents)

        # 문서 ID 생성
        ids = [f"doc_{i}" for i in range(len(documents))]

        # 벡터 저장
        self.collection.add(
            documents=documents,
            embeddings=embeddings.tolist(),
            ids=ids
        )

        print(f"총 {len(documents)}개 문서가 저장되었습니다.")

    def retrieve(self, query, k=3):
        """문서 검색"""
        query_embedding = self.embedding_model.encode([query])[0]

        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=k
        )

        return results['documents'][0]

    def generate_response(self, query, retrieved_docs):
        """답변 생성"""
        context = "\n".join(retrieved_docs)
        prompt = f"""
        질문: {query}
        문맥: {context}
        답변:
        """

        # 토크나이저로 프롬프트 인

---

📥 **Get the full guide on Gumroad**: https://gumroad.com/l/auto ($7)

DEV Community

RAG 시스템 실전 구축 (v27)

RAG 시스템 실전 구축 (v27)

1. RAG 기초 개념

2. Chunking 전략

Semantic Chunking

Recursive Chunking

Agentic Chunking

3. 임베딩 모델 선택 및 비교

4. 벡터 데이터베이스 비교

Chroma

Qdrant

5. 완전한 RAG 파이프라인 구현

Top comments (0)