matias yoon

Posted on May 25

RAG 시스템 실전 구축 (v22)

#ai #llm #developers #tutorial

RAG 시스템 실전 구축 (v22)

개요

RAG(Retrieval-Augmented Generation)는 대규모 언어 모델(LLM)이 외부 지식을 검색해 활용함으로써 더 정확한 답변을 생성하도록 하는 기법입니다. 이 가이드는 ML 엔지니어와 백엔드 개발자가 실전에서 RAG 시스템을 구축하는 데 필요한 내용을 예제 코드와 함께 설명합니다.

1. RAG 기초 개념

RAG 시스템은 세 단계로 구성됩니다:

검색 (Retrieval): 질문에 관련된 문서/청크를 검색
증강 (Augmentation): 검색된 정보를 프롬프트에 통합
생성 (Generation): LLM이 증강된 프롬프트를 기반으로 답변 생성

기본 구조:

# RAG 루프 구현 예시
class BasicRAG:
    """Minimal retrieve -> augment -> generate loop over injected components.

    The three collaborators are duck-typed:
    ``embedding_model.encode(text)``, ``vector_db.search(vector, k=...)``
    (returning objects with a ``content`` attribute) and
    ``llm.generate(prompt)``.
    """

    def __init__(self, embedding_model, vector_db, llm):
        """Store the embedding model, vector store and LLM for later queries."""
        self.embedding_model, self.vector_db, self.llm = embedding_model, vector_db, llm

    def query(self, question):
        """Answer ``question`` using the top-5 retrieved documents as context.

        Args:
            question: The user question; embedded and also placed in the prompt.

        Returns:
            Whatever ``llm.generate`` returns for the augmented prompt.
        """
        # Step 1: embed the question.
        question_vector = self.embedding_model.encode(question)

        # Step 2: retrieve the five nearest documents.
        hits = self.vector_db.search(question_vector, k=5)

        # Step 3: build the augmented prompt, one retrieved document per line.
        passages = []
        for hit in hits:
            passages.append(hit.content)
        context = "\n".join(passages)

        # Step 4: generate the answer from the augmented prompt.
        return self.llm.generate(f"질문: {question}\n참고문서: {context}")

2. 청킹 전략

2.1 의미적 청킹 (Semantic Chunking)

from sentence_transformers import SentenceTransformer
import numpy as np

class SemanticChunker:
    """Group consecutive sentences into chunks by embedding similarity.

    A chunk is anchored on the embedding of its first sentence; each following
    sentence joins the chunk while its cosine similarity to that anchor stays
    at or above the threshold, otherwise it starts a new chunk.
    """

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        """Load the sentence-embedding model named ``model_name``."""
        self.model = SentenceTransformer(model_name)

    @staticmethod
    def _split_sentences(text):
        """Split ``text`` on '. ' while keeping each sentence's final period."""
        pieces = text.split('. ')
        last_index = len(pieces) - 1
        sentences = []
        for index, piece in enumerate(pieces):
            piece = piece.strip()
            if not piece:
                # Skip empty fragments (empty input, trailing '. ').
                continue
            # split('. ') consumed the period of every piece except the last
            # one, so put it back to keep the chunk text faithful to the input.
            sentences.append(piece if index == last_index else piece + '.')
        return sentences

    @staticmethod
    def _cosine_similarity(a, b):
        """Return the cosine similarity of ``a`` and ``b`` (0.0 for zero vectors)."""
        denominator = np.linalg.norm(a) * np.linalg.norm(b)
        if denominator == 0:
            # A zero vector has no direction; treat it as dissimilar rather
            # than dividing by zero and comparing against NaN.
            return 0.0
        return float(np.dot(a, b) / denominator)

    def chunk_by_semantic(self, text, threshold=0.75):
        """Split ``text`` into semantically coherent chunks.

        Args:
            text: Input text; sentences are delimited by '. '.
            threshold: Minimum cosine similarity to the chunk's first sentence
                for a sentence to stay in the current chunk.

        Returns:
            A list of chunk strings (sentences joined by a single space).
            Empty or whitespace-only input returns an empty list.
        """
        sentences = self._split_sentences(text)
        if not sentences:
            return []

        embeddings = self.model.encode(sentences)

        chunks = []
        current_chunk = [sentences[0]]
        anchor_embedding = embeddings[0]

        for sentence, embedding in zip(sentences[1:], embeddings[1:]):
            if self._cosine_similarity(anchor_embedding, embedding) < threshold:
                # Meaning drifted away from the anchor: close the chunk and
                # start a new one anchored on this sentence.
                chunks.append(' '.join(current_chunk))
                current_chunk = [sentence]
                anchor_embedding = embedding
            else:
                current_chunk.append(sentence)

        chunks.append(' '.join(current_chunk))
        return chunks

# Usage example
# Instantiates the chunker with its default model name.
chunker = SemanticChunker()
# Sample text of two sentences separated by '. '.
text = "RAG 시스템은 검색된 정보를 기반으로 LLM이 정확한 답변을 생성합니다. 이 시스템은 외부 지식을 활용하여 질문에 대한 답변을 제공합니다."
# Uses the default similarity threshold (0.75).
chunks = chunker.chunk_by_semantic(text)
print(chunks)

2.2 재귀적 청킹 (Recursive Chunking)

class RecursiveChunker:
    """Split text into fixed-size character windows that overlap.

    Despite the name, this is a sliding-window splitter: each chunk holds at
    most ``chunk_size`` characters and shares its first ``overlap`` characters
    with the end of the previous chunk.
    """

    def __init__(self, chunk_size=512, overlap=64):
        """Configure the window.

        Args:
            chunk_size: Maximum number of characters per chunk (must be > 0).
            overlap: Characters shared by consecutive chunks
                (must satisfy ``0 <= overlap < chunk_size``).

        Raises:
            ValueError: If the sizes would make the window unable to advance.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if not 0 <= overlap < chunk_size:
            raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")
        self.chunk_size = chunk_size
        self.overlap = overlap

    def chunk_recursive(self, text):
        """Return the overlapping chunks of ``text`` in order.

        Args:
            text: The string to split.

        Returns:
            A list of substrings; empty input gives an empty list.

        Raises:
            ValueError: If the attributes were changed so that
                ``chunk_size - overlap`` is no longer positive.
        """
        step = self.chunk_size - self.overlap
        if step <= 0:
            # Attributes can be reassigned after construction; a non-positive
            # step would never advance through the text.
            raise ValueError("overlap must be smaller than chunk_size")

        chunks = []
        for start in range(0, len(text), step):
            end = start + self.chunk_size
            chunks.append(text[start:end])
            if end >= len(text):
                # This window already reached the end of the text. Stopping
                # here is what guarantees termination and avoids emitting a
                # trailing chunk made only of overlap.
                break
        return chunks

# Usage example
recursive_chunker = RecursiveChunker(chunk_size=256, overlap=32)
# `text` is the sample string defined in the semantic-chunking example earlier
# in this file; this assignment overwrites the earlier `chunks` value.
chunks = recursive_chunker.chunk_recursive(text)

3. 임베딩 모델 선택 및 비교

3.1 모델 비교 코드

import time
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
import torch

class EmbeddingBenchmark:
    """Compare sentence-embedding models on encode time and output size."""

    # Models benchmarked when no explicit list is given.
    DEFAULT_MODEL_NAMES = (
        'all-MiniLM-L6-v2',
        'all-mpnet-base-v2',
        'BAAI/bge-small-en',
    )

    def __init__(self, model_names=None):
        """Load every model to benchmark.

        Args:
            model_names: Optional iterable of model names to load. Defaults to
                ``DEFAULT_MODEL_NAMES``. All models are loaded eagerly.
        """
        names = self.DEFAULT_MODEL_NAMES if model_names is None else model_names
        self.models = {name: SentenceTransformer(name) for name in names}

    def benchmark_model(self, model_name, texts):
        """Encode ``texts`` with one model and report timing and size.

        Args:
            model_name: Key into ``self.models``.
            texts: Sequence of strings to encode.

        Returns:
            A dict with keys ``model``, ``inference_time`` (seconds),
            ``embedding_size`` (vector length, 0 when ``texts`` is empty) and
            ``memory_usage`` (estimated bytes).

        Raises:
            KeyError: If ``model_name`` was not loaded.
        """
        model = self.models[model_name]

        # perf_counter is monotonic and high-resolution, unlike time.time(),
        # which can jump when the system clock is adjusted.
        start_time = time.perf_counter()
        embeddings = model.encode(texts)
        elapsed = time.perf_counter() - start_time

        count = len(embeddings)
        embedding_size = len(embeddings[0]) if count else 0

        return {
            'model': model_name,
            'inference_time': elapsed,
            'embedding_size': embedding_size,
            # Estimate only: assumes 4-byte (float32) components.
            'memory_usage': count * embedding_size * 4,
        }

    def compare_models(self, test_texts):
        """Run ``benchmark_model`` for every loaded model, in load order."""
        return [
            self.benchmark_model(model_name, test_texts)
            for model_name in self.models
        ]

# Compare the models
benchmark = EmbeddingBenchmark()
# Three short Korean sample sentences used as the benchmark input.
test_texts = ["RAG 시스템의 기본 개념", "검색과 증강의 역할", "생성 모델의 작동 방식"]
results = benchmark.compare_models(test_texts)

# One line per model: name, encode time in seconds, vector dimension, and the
# estimated memory converted from bytes to MB.
for result in results:
    print(f"{result['model']}: {result['inference_time']:.3f}초, "
          f"{result['embedding_size']}차원, "
          f"{result['memory_usage']/1024/1024:.2f}MB")

4. 벡터 데이터베이스 비교

4.1 Chroma vs Qdrant vs pgvector

# Chroma 사용 예시
import chromadb
from chromadb.utils import embedding_functions

class ChromaDB:
    """Small wrapper around a single Chroma collection."""

    def __init__(self, collection_name="rag_collection"):
        """Create a client and get or create the collection ``collection_name``.

        The collection is configured with Chroma's ``DefaultEmbeddingFunction``.
        """
        self.client = chromadb.Client()
        embedder = embedding_functions.DefaultEmbeddingFunction()
        self.collection = self.client.get_or_create_collection(
            name=collection_name,
            embedding_function=embedder,
        )

    def add_documents(self, documents, ids):
        """Add raw ``documents`` to the collection under the matching ``ids``."""
        self.collection.add(documents=documents, ids=ids)

    def search(self, query, top_k=5):
        """Return the documents matched for the text ``query``.

        Args:
            query: Query text, sent as a single-element ``query_texts`` list.
            top_k: Number of results requested.

        Returns:
            The first entry of the response's ``documents`` field, i.e. the
            hits for the one query that was sent.
        """
        response = self.collection.query(query_texts=[query], n_results=top_k)
        hits_per_query = response['documents']
        return hits_per_query[0]

# Qdrant 사용 예시
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Filter, FieldCondition

class QdrantDB:
    """Small wrapper around a Qdrant client for creating and filling collections."""

    def __init__(self, host="localhost", port=6333):
        """Connect a ``QdrantClient`` to ``host``:``port``."""
        self.client = QdrantClient(host=host, port=port)

    def create_collection(self, collection_name, vector_size=384):
        """(Re)create ``collection_name`` with cosine-distance vectors.

        Args:
            collection_name: Name of the collection.
            vector_size: Dimension of the stored vectors.
        """
        # NOTE(review): recreate_collection presumably drops an existing
        # collection of the same name - confirm that is intended.
        vectors_config = VectorParams(size=vector_size, distance="Cosine")
        self.client.recreate_collection(
            collection_name=collection_name,
            vectors_config=vectors_config,
        )

    def add_documents(self, collection_name, documents, ids):
        """Upsert ``documents`` into ``collection_name``.

        Args:
            collection_name: Target collection.
            documents: Objects exposing ``embedding`` and ``content`` attributes.
            ids: Point ids, paired positionally with ``documents``.
        """
        points = []
        for point_id, doc in zip(ids, documents):
            points.append(
                {
                    "id": point_id,
                    "vector": doc.embedding,
                    "payload": {"text": doc.content},
                }
            )
        self.client.upsert(collection_name=collection_name, points=points)

# pgvector 사용 예시
import psycopg2
from psycopg2.extras import Json

class PGVectorDB:
    """Store and search document embeddings in PostgreSQL via pgvector.

    Assumes the pgvector extension is already installed in the target
    database; this class does not run ``CREATE EXTENSION``.
    """

    def __init__(self, connection_string):
        """Open a psycopg2 connection using ``connection_string``."""
        self.conn = psycopg2.connect(connection_string)

    def create_table(self):
        """Create the ``rag_documents`` table and its cosine ivfflat index.

        Both statements use ``IF NOT EXISTS`` and are committed together.
        """
        with self.conn.cursor() as cur:
            cur.execute("""
                CREATE TABLE IF NOT EXISTS rag_documents (
                    id UUID PRIMARY KEY,
                    content TEXT,
                    embedding VECTOR(384)
                )
            """)
            cur.execute("""
                CREATE INDEX IF NOT EXISTS idx_embedding ON rag_documents 
                USING ivfflat (embedding vector_cosine_ops)
            """)
        self.conn.commit()

    @staticmethod
    def _to_vector_literal(embedding):
        """Format an iterable of numbers as pgvector text input, e.g. '[1.0,2.5]'."""
        return "[" + ",".join(str(float(value)) for value in embedding) + "]"

    def search(self, query_embedding, top_k=5):
        """Return the ``top_k`` rows closest to ``query_embedding``.

        Args:
            query_embedding: Iterable of numbers (list, tuple, NumPy array...).
            top_k: Maximum number of rows to return.

        Returns:
            ``cursor.fetchall()`` rows of ``(id, content, embedding)``, ordered
            by ascending cosine distance.
        """
        # Send the vector as text and cast it server-side, so the parameter is
        # typed as a vector rather than whatever the driver infers for a list.
        vector_literal = self._to_vector_literal(query_embedding)
        with self.conn.cursor() as cur:
            # `<=>` is pgvector's cosine-distance operator. It matches the
            # vector_cosine_ops index built in create_table, whereas `<->`
            # (L2 distance) would rank by a different metric than the index.
            cur.execute("""
                SELECT id, content, embedding
                FROM rag_documents
                ORDER BY embedding <=> %s::vector
                LIMIT %s
            """, (vector_literal, top_k))
            return cur.fetchall()

5. 전체 RAG 파이프라인 코드


python
import numpy as np
from sentence_transformers import SentenceTransformer
from typing import List, Dict, Any
import json

class RAGPipeline:
    def __init__(self, embedding_model_name="all-MiniLM-L6-v2"):
        self.embedding_model = SentenceTransformer(embedding_model_name)
        self.documents = []
        self.embeddings = []
        self.id_to_doc = {}

    def add_documents(self, documents: List[Dict[str, Any]]

---

📥 **Get the full guide on Gumroad**: https://gumroad.com/l/auto ($7)

DEV Community

RAG 시스템 실전 구축 (v22)

RAG 시스템 실전 구축 (v22)

개요

1. RAG 기초 개념

기본 구조:

2. 청킹 전략

2.1 의미적 청킹 (Semantic Chunking)

2.2 재귀적 청킹 (Recursive Chunking)

3. 임베딩 모델 선택 및 비교

3.1 모델 비교 코드

4. 벡터 데이터베이스 비교

4.1 Chroma vs Qdrant vs pgvector

5. 전체 RAG 파이프라인 코드

Top comments (0)