DEV Community

matias yoon
matias yoon

Posted on

RAG 시스템 실전 구축 (v44)

RAG 시스템 실전 구축 (v44)

1. RAG 시스템의 핵심 구성 요소

RAG(Retrieval-Augmented Generation) 시스템은 검색 기반 생성을 위한 핵심 아키텍처입니다. 이 시스템은 세 가지 주요 단계로 구성됩니다:

  1. 검색 (Retrieval): 질문과 관련된 문서 또는 문단을 검색합니다
  2. 증강 (Augmentation): 검색된 정보를 프롬프트에 통합합니다
  3. 생성 (Generation): 증강된 프롬프트를 기반으로 답변을 생성합니다

이 세 단계를 거치면서 LLM은 외부 지식을 활용해 정확한 최신 정보를 제공할 수 있습니다.

2. 청킹 전략 (Chunking Strategies)

2.1 의미적 청킹 (Semantic Chunking)

from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import numpy as np

class SemanticChunker:
    """Group the sentences of a text into chunks by clustering their embeddings.

    Sentences are embedded with a SentenceTransformer model and clustered
    with KMeans; every cluster becomes one chunk. Sentences in a chunk keep
    their original relative order, but a chunk may combine sentences that
    were not adjacent in the text.
    """

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        """Load the sentence-embedding model named ``model_name``."""
        self.model = SentenceTransformer(model_name)

    def chunk_by_semantic_similarity(self, text, threshold=0.7):
        """Split ``text`` into chunks of semantically similar sentences.

        Args:
            text: Text to split. Sentences are delimited by ``'. '``.
            threshold: Currently unused; kept so existing callers that pass
                it keep working.

        Returns:
            A list of chunk strings, one per non-empty cluster, ordered by
            cluster label. Empty or whitespace-only input yields ``[]``.
        """
        if not text:
            return []

        # Drop empty fragments (e.g. produced by a trailing '. ') so they are
        # neither embedded nor counted when sizing the clustering.
        sentences = [part.strip() for part in text.split('. ')]
        sentences = [sentence for sentence in sentences if sentence]
        if not sentences:
            return []

        embeddings = self.model.encode(sentences)

        # A fixed seed makes the chunking reproducible between runs.
        kmeans = KMeans(n_clusters=max(1, len(sentences) // 2), random_state=0)
        labels = kmeans.fit(embeddings).labels_

        # Iterate over the labels that actually occur: KMeans can leave a
        # cluster empty (for instance with duplicate sentences), in which case
        # the labels are not the contiguous range 0..k-1.
        chunks = []
        for label in np.unique(labels):
            members = [
                sentence
                for sentence, assigned in zip(sentences, labels)
                if assigned == label
            ]
            chunks.append('. '.join(members))
        return chunks
Enter fullscreen mode Exit fullscreen mode

2.2 재귀적 청킹 (Recursive Chunking)

class RecursiveChunker:
    """Split text into overlapping, roughly fixed-size character chunks.

    Each chunk starts ``overlap`` characters before the end of the previous
    one. A chunk that would end in the middle of a sentence is extended to
    the next ``'.'`` when one exists.
    """

    def __init__(self, chunk_size=512, overlap=50):
        """Store the target chunk length and the overlap, both in characters."""
        self.chunk_size = chunk_size
        self.overlap = overlap

    def recursive_split(self, text):
        """Return the list of chunks covering ``text``.

        Args:
            text: The string to split.

        Returns:
            A list of substrings of ``text``; ``[]`` for an empty string.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``overlap`` is
                negative.
        """
        if self.chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if self.overlap < 0:
            raise ValueError("overlap must not be negative")

        chunks = []
        text_len = len(text)
        start = 0

        while start < text_len:
            end = min(start + self.chunk_size, text_len)

            # If the chunk does not already end on a period, extend it to the
            # end of the current sentence (when a later period exists).
            if end < text_len and text[end - 1] != '.':
                sentence_end = text.find('.', end)
                if sentence_end != -1:
                    end = sentence_end + 1

            chunks.append(text[start:end])

            # The final chunk reached the end of the text. Stepping back by
            # the overlap here would re-emit the tail forever.
            if end >= text_len:
                break

            # Step back by the overlap, but always move forward: with
            # overlap >= chunk length the window would otherwise never advance.
            next_start = end - self.overlap
            start = next_start if next_start > start else end

        return chunks
Enter fullscreen mode Exit fullscreen mode

3. 임베딩 모델 선택 및 비교

3.1 모델 성능 비교

from sentence_transformers import SentenceTransformer
import time
import numpy as np

class EmbeddingBenchmark:
    """Measure how long sentence-embedding models take to encode texts."""

    def __init__(self):
        """Load every model to be compared; all of them are loaded up front."""
        self.models = {
            "all-MiniLM-L6-v2": SentenceTransformer("all-MiniLM-L6-v2"),
            "all-mpnet-base-v2": SentenceTransformer("all-mpnet-base-v2"),
            # NOTE(review): confirm this repository id resolves; gte-small may
            # be published under a different organisation.
            "gte-small": SentenceTransformer("sentence-transformers/gte-small")
        }

    def benchmark_model(self, model_name, texts):
        """Encode ``texts`` with one model and report timing information.

        Args:
            model_name: Key of the model in ``self.models``.
            texts: Non-empty sequence of strings to encode.

        Returns:
            A dict with the keys ``model``, ``time`` (total seconds),
            ``avg_time_per_text`` (seconds) and ``embedding_dim``.

        Raises:
            KeyError: If ``model_name`` is not a registered model.
            ValueError: If ``texts`` is empty.
        """
        if len(texts) == 0:
            raise ValueError("texts must contain at least one item")

        model = self.models[model_name]

        # perf_counter is monotonic and high-resolution; time.time() can jump
        # when the system clock is adjusted.
        started = time.perf_counter()
        embeddings = model.encode(texts)
        elapsed = time.perf_counter() - started

        return {
            "model": model_name,
            "time": elapsed,
            "avg_time_per_text": elapsed / len(texts),
            "embedding_dim": embeddings.shape[1]
        }

# Usage example: run every registered model over the same 200 sample texts
# and print the total and per-text encoding time for each.
benchmark = EmbeddingBenchmark()
texts = 100 * ["This is a sample text for embedding", "Another example text"]
results = [
    benchmark.benchmark_model(name, texts)
    for name in benchmark.models
]

for result in results:
    print(f"{result['model']}: {result['time']:.2f}s ({result['avg_time_per_text']:.4f}s/text)")
Enter fullscreen mode Exit fullscreen mode

4. 벡터 데이터베이스 비교

4.1 Chroma vs Qdrant vs pgvector

# Chroma 예시
import chromadb
from chromadb.config import Settings

class ChromaVectorDB:
    """Thin wrapper around a persistent Chroma collection."""

    def __init__(self, collection_name="rag_collection", persist_directory="chroma_db"):
        """Open (or create) a collection stored on disk.

        Args:
            collection_name: Name of the collection to open or create.
            persist_directory: Directory where Chroma keeps its data.
        """
        # The legacy Settings(chroma_db_impl=...) configuration is rejected by
        # current Chroma releases; PersistentClient is the replacement for an
        # on-disk store.
        self.client = chromadb.PersistentClient(path=persist_directory)
        self.collection = self.client.get_or_create_collection(collection_name)

    def add_documents(self, documents, embeddings, ids):
        """Add documents with their embeddings and ids to the collection."""
        self.collection.add(
            documents=documents,
            embeddings=embeddings,
            ids=ids
        )

    def search(self, query_embedding, n_results=5):
        """Query the collection with one embedding.

        Returns:
            The result object of ``collection.query`` for the single query
            embedding, limited to ``n_results`` hits.
        """
        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=n_results
        )
        return results

# Qdrant 예시
from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition, MatchValue

class QdrantVectorDB:
    """Thin wrapper around a Qdrant collection that stores text payloads."""

    def __init__(self, host="localhost", port=6333, collection_name="rag_collection",
                 vector_size=384):
        """Connect to Qdrant and (re)create the collection.

        Args:
            host: Qdrant server host.
            port: Qdrant server port.
            collection_name: Collection to create.
            vector_size: Dimensionality of the stored vectors; must match the
                embedding model in use.

        Note:
            ``recreate_collection`` is called on every construction.
            NOTE(review): this presumably drops any data already stored under
            ``collection_name`` — confirm before pointing it at real data.
        """
        self.client = QdrantClient(host=host, port=port)
        self.collection_name = collection_name
        self.client.recreate_collection(
            collection_name=self.collection_name,
            vectors_config={"size": vector_size, "distance": "Cosine"}
        )

    @staticmethod
    def _to_point_id(raw_id):
        """Convert a caller-supplied id into a valid Qdrant point id.

        Qdrant accepts unsigned integers or UUIDs as point ids. Integers and
        digit-only strings become ints, UUID strings are normalised, and any
        other value is mapped to a deterministic UUIDv5 so that re-inserting
        the same id updates the same point.
        """
        import numbers
        import uuid

        if isinstance(raw_id, numbers.Integral):
            return int(raw_id)
        text = str(raw_id)
        if text.isascii() and text.isdigit():
            return int(text)
        try:
            return str(uuid.UUID(text))
        except ValueError:
            return str(uuid.uuid5(uuid.NAMESPACE_URL, text))

    @staticmethod
    def _as_list(vector):
        """Return ``vector`` as a plain list (accepts numpy arrays or sequences)."""
        return vector.tolist() if hasattr(vector, "tolist") else list(vector)

    def add_documents(self, documents, embeddings, ids):
        """Insert or update documents in the collection.

        Args:
            documents: Document texts.
            embeddings: One vector per document.
            ids: One id per document, or ``None`` to use positional indices.

        Raises:
            ValueError: If the three inputs do not have the same length.
        """
        documents = list(documents)
        # Earlier versions ignored ``ids`` and always used 0..n-1, so a second
        # call silently overwrote the points written by the first.
        ids = list(range(len(documents))) if ids is None else list(ids)
        if len(ids) != len(documents) or len(embeddings) != len(documents):
            raise ValueError("documents, embeddings and ids must have the same length")
        if not documents:
            return

        self.client.upsert(
            collection_name=self.collection_name,
            points=[{
                "id": self._to_point_id(point_id),
                "vector": self._as_list(emb),
                "payload": {"text": doc}
            } for point_id, doc, emb in zip(ids, documents, embeddings)]
        )

    def search(self, query_embedding, limit=5):
        """Return the texts of the ``limit`` points closest to ``query_embedding``."""
        results = self.client.search(
            collection_name=self.collection_name,
            query_vector=self._as_list(query_embedding),
            limit=limit
        )
        return [hit.payload["text"] for hit in results]

# pgvector 예시
import psycopg2
from psycopg2.extras import Json

class PGVectorDB:
    """Store and search text embeddings in PostgreSQL using pgvector.

    NOTE(review): the ``vector`` extension must already be enabled in the
    target database (``CREATE EXTENSION vector``); this class does not
    create it.
    """

    def __init__(self, connection_string):
        """Connect to the database and make sure the table and index exist."""
        self.conn = psycopg2.connect(connection_string)
        self.create_table()

    @staticmethod
    def _as_list(vector):
        """Return ``vector`` as a plain list (accepts numpy arrays or sequences)."""
        return vector.tolist() if hasattr(vector, "tolist") else list(vector)

    def create_table(self):
        """Create the ``embeddings`` table and its cosine index if missing."""
        # ``with self.conn`` commits on success and rolls back on error, so a
        # failed statement does not leave the connection in an aborted
        # transaction.
        with self.conn, self.conn.cursor() as cur:
            cur.execute("""
                CREATE TABLE IF NOT EXISTS embeddings (
                    id SERIAL PRIMARY KEY,
                    text TEXT,
                    embedding VECTOR(384)
                )
            """)
            cur.execute("""
                CREATE INDEX IF NOT EXISTS idx_embedding ON embeddings USING ivfflat (embedding vector_cosine_ops)
            """)

    def add_documents(self, documents, embeddings):
        """Insert each document together with its embedding.

        Args:
            documents: Document texts.
            embeddings: One vector per document.
        """
        with self.conn, self.conn.cursor() as cur:
            for doc, emb in zip(documents, embeddings):
                cur.execute(
                    "INSERT INTO embeddings (text, embedding) VALUES (%s, %s::vector)",
                    (doc, self._as_list(emb))
                )

    def search(self, query_embedding, limit=5):
        """Return the texts of the ``limit`` rows nearest to ``query_embedding``.

        Rows are ordered by cosine distance (``<=>``), matching the
        ``vector_cosine_ops`` index created in ``create_table``.
        """
        with self.conn, self.conn.cursor() as cur:
            # The parameter arrives as a SQL array; the explicit ::vector cast
            # is needed for the distance operator to resolve.
            cur.execute("""
                SELECT text FROM embeddings
                ORDER BY embedding <=> %s::vector
                LIMIT %s
            """, (self._as_list(query_embedding), limit))
            return [row[0] for row in cur.fetchall()]
Enter fullscreen mode Exit fullscreen mode

5. RAG 파이프라인 완전 구현


python
import os
from sentence_transformers import SentenceTransformer
from chromadb import Client
import numpy as np

class SimpleRAGPipeline:
    """Minimal retrieve-then-generate pipeline backed by a Chroma collection."""

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        """Load the embedding model and open the ``documents`` collection."""
        self.embedder = SentenceTransformer(model_name)
        self.chromadb_client = Client()
        self.collection = self.chromadb_client.get_or_create_collection("documents")

    def setup_pipeline(self, documents):
        """Embed ``documents`` and store them in the vector database.

        Ids continue from the current collection size, so calling this method
        more than once adds documents instead of colliding with ids "0".."n-1"
        from an earlier call. An empty input is a no-op.
        """
        documents = list(documents)
        if not documents:
            return

        embeddings = self.embedder.encode(documents)
        offset = self.collection.count()
        self.collection.add(
            documents=documents,
            embeddings=embeddings.tolist(),
            ids=[str(offset + i) for i in range(len(documents))]
        )

    def retrieve(self, query, top_k=3):
        """Return up to ``top_k`` stored documents most relevant to ``query``.

        Returns an empty list when the collection is empty or ``top_k`` is
        not positive.
        """
        available = self.collection.count()
        if available == 0 or top_k <= 0:
            return []

        query_embedding = self.embedder.encode([query])
        results = self.collection.query(
            query_embeddings=query_embedding.tolist(),
            # Never request more neighbours than there are stored documents.
            n_results=min(top_k, available)
        )
        return results['documents'][0] if results['documents'] else []

    def generate_response(self, query, retrieved_docs):
        """Build the answer for ``query`` from the retrieved documents (placeholder).

        No LLM is called here: the method returns the query together with the
        first 100 characters of the joined context.
        """
        context = "\n\n".join(retrieved_docs)
        # The augmented prompt an LLM would receive; a real implementation
        # passes it to a text-generation model and returns the model output.
        prompt = f"다음 문맥을 바탕으로 질문에 답하세요:\n\n{context}\n\n질문: {query}"

        return f"질문: {query}\n검색된 문맥: {context[:100]}..."

# 사용

---

📥 **Get the full guide on Gumroad**: https://gumroad.com/l/auto ($7)
Enter fullscreen mode Exit fullscreen mode

Top comments (0)