DEV Community

matias yoon
matias yoon

Posted on

RAG 시스템 실전 구축 (v24)

RAG 시스템 실전 구축 (v24)

1. RAG Fundamentals: The Three-Step Loop

Retrieval-Augmented Generation (RAG) combines the strengths of retrieval systems and generative models. The core loop consists of:

  1. Retrieval: Find relevant documents from a knowledge base
  2. Augmentation: Combine retrieved documents with the query
  3. Generation: Produce a response using the augmented context
# Basic RAG pipeline structure
class BasicRAG:
    """Minimal retrieve-then-generate pipeline built from three collaborators.

    The collaborators are duck-typed. This class relies only on:

    * ``embedder.embed(texts)`` returning one embedding per input text,
    * ``vector_db.search(embedding, k=...)`` returning an iterable of
      mappings that each expose a ``'content'`` key,
    * ``generator.generate(query, context)`` returning the final response.
    """

    def __init__(self, embedder, vector_db, generator):
        self.embedder, self.vector_db, self.generator = (
            embedder,
            vector_db,
            generator,
        )

    def process_query(self, query):
        """Return the generator's response to ``query``.

        The query is embedded, the five nearest documents are fetched, and
        their ``'content'`` values are joined with single spaces to form the
        context handed to the generator.
        """
        # The embedder works on batches, so wrap the query and take item 0.
        query_vector = self.embedder.embed([query])[0]

        hits = self.vector_db.search(query_vector, k=5)

        passages = (hit['content'] for hit in hits)
        return self.generator.generate(query, " ".join(passages))
Enter fullscreen mode Exit fullscreen mode

2. Chunking Strategies: Finding the Right Balance

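Semantic Chunking: Embeds each sentence with a sentence-transformer model, clusters the embeddings with KMeans, and starts a new chunk wherever two consecutive sentences fall into different clusters.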

from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import numpy as np

class SemanticChunker:
    """Chunk text by clustering sentence embeddings with KMeans."""

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)

    def chunk_text(self, text, min_chunk_size=100):
        """Split ``text`` into chunks of consecutive same-cluster sentences.

        Sentences are produced by splitting on ``'. '``, embedded with the
        model, and clustered into about one cluster per three sentences
        (never fewer than one). A chunk boundary is placed wherever the
        cluster label changes between neighbouring sentences. The sentences
        of a chunk are joined with a single space, and any chunk shorter
        than ``min_chunk_size`` characters is dropped from the result.
        """
        sentences = text.split('. ')
        vectors = self.model.encode(sentences)

        cluster_count = max(1, len(sentences) // 3)
        labels = KMeans(n_clusters=cluster_count, random_state=42).fit_predict(vectors)

        # Positions at which a sentence's label differs from its predecessor's.
        cuts = [
            position
            for position, (previous, current) in enumerate(zip(labels, labels[1:]), start=1)
            if current != previous
        ]
        bounds = [0, *cuts, len(sentences)]

        pieces = [
            ' '.join(sentences[begin:stop])
            for begin, stop in zip(bounds, bounds[1:])
        ]
        return [piece for piece in pieces if len(piece) >= min_chunk_size]

# Example usage
# Constructing the chunker loads its default "all-MiniLM-L6-v2" model.
chunker = SemanticChunker()
text = "Large language models are powerful AI systems. They can understand context and generate human-like text. Training these models requires significant computational resources."
# chunk_text() discards chunks shorter than its default min_chunk_size (100
# characters), so the count printed below can be lower than the number of
# sentence groups that were found.
chunks = chunker.chunk_text(text)
print(f"Created {len(chunks)} chunks")
Enter fullscreen mode Exit fullscreen mode

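Recursive Chunking: Splits text into fixed-size character windows, each overlapping the end of the previous one. (The implementation below is an iterative sliding window rather than a truly recursive splitter.)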

class RecursiveChunker:
    """Split text into fixed-size character windows that overlap.

    Args:
        chunk_size: Maximum number of characters per chunk. Must be positive.
        overlap: Number of trailing characters of one chunk that are repeated
            at the start of the next. Must satisfy ``0 <= overlap < chunk_size``
            so that every window advances.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` violate the limits above.
    """

    def __init__(self, chunk_size=512, overlap=64):
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if not 0 <= overlap < chunk_size:
            raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")
        self.chunk_size = chunk_size
        self.overlap = overlap

    def chunk_text(self, text):
        """Return the overlapping chunks of ``text`` in order.

        Every chunk except possibly the last is exactly ``chunk_size``
        characters long. An empty ``text`` yields an empty list.
        """
        chunks = []
        start = 0
        text_length = len(text)

        while start < text_length:
            end = min(start + self.chunk_size, text_length)
            chunks.append(text[start:end])
            # Stop once a window reaches the end of the text. Stepping back by
            # ``overlap`` from here would land before ``text_length`` again
            # and repeat the final window forever.
            if end == text_length:
                break
            start = end - self.overlap

        return chunks

# Benchmark comparison
def benchmark_chunking_methods():
    """Chunk one sample paragraph with both chunkers and print the chunk counts."""
    sample_text = "This is a test document. It contains multiple sentences. Each sentence should be properly chunked for optimal retrieval performance. The chunking method should balance context preservation with token efficiency."

    # Build both chunkers up front, then run them in the same order.
    by_meaning, by_window = SemanticChunker(), RecursiveChunker()

    semantic_total = len(by_meaning.chunk_text(sample_text))
    recursive_total = len(by_window.chunk_text(sample_text))

    print(f"Semantic chunks: {semantic_total}")
    print(f"Recursive chunks: {recursive_total}")
Enter fullscreen mode Exit fullscreen mode

3. Embedding Model Selection and Comparison

Choosing the right embedding model is crucial for RAG performance.

from sentence_transformers import SentenceTransformer
import time

class EmbeddingBenchmark:
    """Measure encoding speed and output size of sentence-embedding models."""

    def __init__(self):
        # Model id -> informal speed tier. The multi-qa and multilingual
        # entries use the published sentence-transformers model names
        # ("...-cos-v1" and "...-L12-v2") so that they can be loaded.
        self.models = {
            'all-MiniLM-L6-v2': 'fast',
            'all-mpnet-base-v2': 'medium',
            'multi-qa-MiniLM-L6-cos-v1': 'fast',
            'paraphrase-multilingual-MiniLM-L12-v2': 'medium',
        }

    def benchmark_model(self, model_name, test_sentences, batch_size=32):
        """Encode ``test_sentences`` with ``model_name`` and report timings.

        Args:
            model_name: Identifier passed to ``SentenceTransformer``.
            test_sentences: Non-empty sequence of sentences to encode.
            batch_size: Batch size forwarded to ``model.encode``.

        Returns:
            A dict with keys ``'model'``, ``'avg_time_per_sentence'``
            (seconds), ``'embedding_dim'`` and ``'memory_usage'`` (bytes,
            computed as 4 bytes per value, i.e. assuming float32 output).

        Raises:
            ValueError: If ``test_sentences`` is empty.
        """
        # Reject empty input before paying for the model load; it would
        # otherwise fail on the warmup lookup and the per-sentence division.
        if len(test_sentences) == 0:
            raise ValueError("test_sentences must contain at least one sentence")

        model = SentenceTransformer(model_name)

        # Warmup so one-time initialisation is not counted in the timing.
        _ = model.encode([test_sentences[0]])

        # perf_counter is monotonic and high-resolution, unlike time.time().
        start_time = time.perf_counter()
        embeddings = model.encode(test_sentences, batch_size=batch_size)
        elapsed = time.perf_counter() - start_time

        embedding_dim = len(embeddings[0])

        return {
            'model': model_name,
            'avg_time_per_sentence': elapsed / len(test_sentences),
            'embedding_dim': embedding_dim,
            'memory_usage': len(embeddings) * embedding_dim * 4,  # float32
        }

# Quick benchmark
# Runs every model registered in EmbeddingBenchmark.models over three short
# sentences and prints the average encoding time for each one.
benchmark = EmbeddingBenchmark()
test_sentences = [
    "This is the first test sentence.",
    "This is the second test sentence.",
    "This is the third test sentence."
]

# One result dict per model, in the iteration order of benchmark.models.
results = []
# Iterating the dict yields its keys, i.e. the model names.
for model_name in benchmark.models:
    # benchmark_model() constructs a new SentenceTransformer on every call.
    result = benchmark.benchmark_model(model_name, test_sentences)
    results.append(result)

    print(f"{result['model']}: {result['avg_time_per_sentence']:.4f}s per sentence")
Enter fullscreen mode Exit fullscreen mode

4. Vector Database Comparison

# Chroma Implementation
import chromadb
from chromadb.utils import embedding_functions

class ChromaVectorDB:
    """Small wrapper around one Chroma collection."""

    def __init__(self, collection_name="rag_collection"):
        """Create a Chroma client and open (or create) ``collection_name``.

        The collection is configured with Chroma's default embedding function.
        """
        self.client = chromadb.Client()
        default_embedder = embedding_functions.DefaultEmbeddingFunction()
        self.collection = self.client.get_or_create_collection(
            name=collection_name,
            embedding_function=default_embedder,
        )

    def add_documents(self, documents, ids):
        """Add ``documents`` to the collection under the matching ``ids``."""
        self.collection.add(documents=documents, ids=ids)

    def search(self, query_embedding, k=5):
        """Return up to ``k`` matches for ``query_embedding``.

        Each match is a dict with the document text under ``'content'`` and
        the distance reported by Chroma under ``'score'``.
        """
        response = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=k,
        )
        # A single query was sent, so only the first result row is relevant.
        texts = response['documents'][0]
        distances = response['distances'][0]
        return [
            dict(content=text, score=distance)
            for text, distance in zip(texts, distances)
        ]

# Qdrant Implementation
from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition, MatchValue

class QdrantVectorDB:
    """Small wrapper around one Qdrant collection.

    Args:
        host: Hostname of the Qdrant server.
        port: Port of the Qdrant server.
        collection_name: Collection to open, created if it cannot be fetched.
        embed_fn: Optional callable mapping one document string to its
            embedding vector. Required by ``add_documents``; ``search`` takes
            a ready-made query vector and does not need it.
        vector_size: Vector dimensionality used when the collection has to be
            created. Defaults to 384.
    """

    def __init__(self, host="localhost", port=6333, collection_name="rag_collection",
                 embed_fn=None, vector_size=384):
        self.client = QdrantClient(host=host, port=port)
        self.collection_name = collection_name
        self.embed_fn = embed_fn

        # Create collection if it doesn't exist. ``except Exception`` rather
        # than a bare ``except`` so KeyboardInterrupt/SystemExit still
        # propagate instead of triggering a create call.
        try:
            self.client.get_collection(collection_name)
        except Exception:
            self.client.create_collection(
                collection_name=collection_name,
                vectors_config={"size": vector_size, "distance": "Cosine"}
            )

    def _get_embedding(self, text):
        """Embed one document with the configured ``embed_fn``.

        Raises:
            RuntimeError: If no ``embed_fn`` was configured.
        """
        if self.embed_fn is None:
            raise RuntimeError(
                "QdrantVectorDB needs an embed_fn to embed documents; "
                "pass embed_fn=... when constructing it"
            )
        return self.embed_fn(text)

    def add_documents(self, documents, ids):
        """Embed ``documents`` and upsert them under the matching ``ids``.

        Each point stores the document text in its payload under
        ``'content'``.

        Raises:
            RuntimeError: If no ``embed_fn`` was configured.
        """
        points = [
            {
                "id": doc_id,
                "vector": self._get_embedding(doc),
                "payload": {"content": doc}
            }
            for doc_id, doc in zip(ids, documents)
        ]
        self.client.upsert(
            collection_name=self.collection_name,
            points=points
        )

    def search(self, query_embedding, k=5):
        """Return up to ``k`` matches for ``query_embedding``.

        Each match is a dict with the stored text under ``'content'`` and the
        score reported by Qdrant under ``'score'``.
        """
        results = self.client.search(
            collection_name=self.collection_name,
            query_vector=query_embedding,
            limit=k
        )
        return [{'content': point.payload['content'], 'score': point.score}
                for point in results]

# Performance benchmark
def compare_vector_dbs():
    """Load four sample documents into a Chroma collection and report completion."""
    documents = [
        "Machine learning models require large datasets for training.",
        "Deep learning uses neural networks with multiple layers.",
        "Natural language processing helps computers understand text.",
        "Computer vision enables machines to interpret visual information."
    ]
    doc_ids = [f"doc_{i}" for i, _ in enumerate(documents)]

    # Chroma runs in-process, so it can be exercised directly.
    chroma_db = ChromaVectorDB("chroma_test")
    chroma_db.add_documents(documents, doc_ids)

    # The Qdrant path is left disabled because it needs a running server:
    # qdrant_db = QdrantVectorDB()
    # qdrant_db.add_documents(documents, [f"doc_{i}" for i in range(len(documents))])

    print("Vector DB comparison completed")
Enter fullscreen mode Exit fullscreen mode

5. Full RAG Pipeline from Scratch


python
from sentence_transformers import SentenceTransformer
from chromadb import Client
from chromadb.utils import embedding_functions
import json

class CompleteRAGPipeline:
    def __init__(self, model_name="all-MiniLM-L6-v2"):
        """Set up the embedder, the Chroma collection and the context budget.

        Loads ``model_name`` as a ``SentenceTransformer``, opens (or creates)
        the ``"rag_knowledge_base"`` collection with Chroma's default
        embedding function, and sets ``context_window`` to 2048.
        """
        self.embedder = SentenceTransformer(model_name)

        self.client = Client()
        default_embedder = embedding_functions.DefaultEmbeddingFunction()
        self.collection = self.client.get_or_create_collection(
            name="rag_knowledge_base",
            embedding_function=default_embedder,
        )

        # Upper bound, in tokens, for the context assembled for generation.
        self.context_window = 2048

    def add_documents(self, documents, ids, metadata=None):
        """Add documents to the knowledge base"""
        self.collection.add(
            documents=documents,
            ids=ids,
            metadatas=metadata or [{}] * len(documents)
        )

    def retrieve_documents(self, query, k=5):
        """Retrieve relevant documents"""
        query_embedding = self.embed

---

📥 **Get the full guide on Gumroad**: https://gumroad.com/l/auto ($7)
Enter fullscreen mode Exit fullscreen mode

Top comments (0)