DEV Community

matias yoon
matias yoon

Posted on

RAG 시스템 실전 구축 (v38)

RAG 시스템 실전 구축 (v38)

Real-World RAG Implementation Guide for ML Engineers

1. RAG Fundamentals: The Core Loop

Retrieval-Augmented Generation (RAG) is a powerful pattern that combines information retrieval with language generation. The core loop consists of three phases:

  1. Retrieval: Find relevant documents from a knowledge base
  2. Augmentation: Inject retrieved context into prompts
  3. Generation: Generate responses using the augmented prompt
# Simplified RAG Loop
class BasicRAG:
    """Smallest possible retrieve -> augment -> generate pipeline.

    The three collaborators are stored as given and are only used through
    ``embedding_model.encode``, ``vector_db.search`` and ``llm.generate``.
    """

    def __init__(self, vector_db, embedding_model, llm):
        """Keep references to the vector store, the embedder and the LLM."""
        self.llm = llm
        self.embedding_model = embedding_model
        self.vector_db = vector_db

    def query(self, user_query):
        """Answer ``user_query`` using the five closest stored documents.

        Returns whatever ``llm.generate`` returns for the augmented prompt.
        """
        # Retrieval: embed the question and fetch the top five matches.
        hits = self.vector_db.search(
            self.embedding_model.encode(user_query),
            k=5,
        )

        # Augmentation: one retrieved document per line, then the question.
        context = "\n".join(hit.content for hit in hits)
        prompt = f"Context: {context}\n\nQuestion: {user_query}"

        # Generation.
        return self.llm.generate(prompt)
Enter fullscreen mode Exit fullscreen mode

2. Chunking Strategies

Effective document chunking is critical for retrieval quality. Here are the main approaches:

Semantic Chunking

import numpy as np
from sentence_transformers import SentenceTransformer

class SemanticChunker:
    """Group consecutive sentences into chunks by embedding similarity."""

    # A chunk that already holds more than this many sentences is closed
    # before the next sentence is added, regardless of similarity.
    _MAX_SENTENCES_BEFORE_SPLIT = 20

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        """Load the sentence-transformers model used to embed sentences."""
        self.model = SentenceTransformer(model_name)

    def chunk_by_semantic(self, text, max_tokens=512, similarity_threshold=0.7):
        """Split ``text`` into chunks of semantically similar sentences.

        The text is split on ``'. '``. Each sentence is compared (cosine
        similarity) with the mean embedding of the chunk being built; a new
        chunk starts when the similarity drops below
        ``similarity_threshold`` or the current chunk is already long.

        Args:
            text: Input text.
            max_tokens: Accepted for interface compatibility; it is not
                enforced by this implementation.
            similarity_threshold: Minimum cosine similarity for a sentence
                to join the current chunk.

        Returns:
            A list of chunk strings. Sentences inside a chunk are re-joined
            with ``'. '`` so the punctuation removed by the split is kept.
            Empty or whitespace-only input yields ``[]``.
        """
        # Blank fragments (e.g. from a trailing '. ') carry no content.
        sentences = [s for s in text.split('. ') if s.strip()]
        if not sentences:
            return []

        embeddings = np.asarray(self.model.encode(sentences), dtype=float)

        chunks = []
        current_chunk = [sentences[0]]
        centroid = embeddings[0]

        for sentence, embedding in zip(sentences[1:], embeddings[1:]):
            denom = np.linalg.norm(centroid) * np.linalg.norm(embedding)
            # A zero-norm vector has no direction; treat it as dissimilar
            # instead of dividing by zero and comparing against NaN.
            similarity = float(np.dot(centroid, embedding) / denom) if denom else 0.0

            too_long = len(current_chunk) > self._MAX_SENTENCES_BEFORE_SPLIT
            if similarity < similarity_threshold or too_long:
                chunks.append('. '.join(current_chunk))
                current_chunk = [sentence]
                centroid = embedding
            else:
                current_chunk.append(sentence)
                # Incremental mean over every sentence in the chunk, so
                # each sentence carries equal weight.
                centroid = centroid + (embedding - centroid) / len(current_chunk)

        chunks.append('. '.join(current_chunk))
        return chunks
Enter fullscreen mode Exit fullscreen mode

Recursive Chunking

class RecursiveChunker:
    """Split text into bounded, overlapping chunks, preferring sentence ends."""

    def __init__(self, max_chunk_size=512, overlap=50):
        """Store the chunking limits.

        Args:
            max_chunk_size: Maximum number of characters per chunk.
            overlap: Number of characters a chunk shares with the end of
                the previous one.

        Raises:
            ValueError: If ``max_chunk_size`` is below 1 or ``overlap`` is
                negative.
        """
        if max_chunk_size < 1:
            raise ValueError("max_chunk_size must be at least 1")
        if overlap < 0:
            raise ValueError("overlap must not be negative")
        self.max_chunk_size = max_chunk_size
        self.overlap = overlap

    def chunk_recursive(self, text):
        """Return ``text`` split into chunks of at most ``max_chunk_size`` chars.

        Each chunk ends at the last ``'. '`` inside its window when one
        exists (the period stays with the chunk, the following space is
        dropped); otherwise the window is cut at ``max_chunk_size``. The next
        chunk starts ``overlap`` characters before the resume point.

        The name is kept for compatibility; the work is done in a loop so
        the whole input is processed regardless of its length.

        Text no longer than ``max_chunk_size`` (including ``""``) is
        returned as a single chunk.
        """
        chunks = []
        start = 0

        # Compare the *remaining* text with the limit, not the whole input.
        while len(text) - start > self.max_chunk_size:
            window_end = start + self.max_chunk_size
            boundary = text.rfind('. ', start, window_end)

            if boundary > start:
                cut = boundary + 1      # keep the period in this chunk
                resume = boundary + 2   # skip the separating space
            else:
                cut = resume = window_end

            chunks.append(text[start:cut])

            # Step back by the overlap, but always move forward so a large
            # overlap cannot stall the loop.
            next_start = resume - self.overlap
            start = next_start if next_start > start else resume

        chunks.append(text[start:])
        return chunks
Enter fullscreen mode Exit fullscreen mode

3. Embedding Model Selection

The embedding model you choose affects retrieval quality, latency, and cost:

# Model comparison benchmark
import time
from sentence_transformers import SentenceTransformer

def benchmark_embeddings():
    """Time ``encode`` on three sentences for several embedding models.

    For each model the function loads it with ``SentenceTransformer``,
    times a single ``encode`` call (model loading is not included) and
    prints the elapsed time together with the model's listed dimensions
    and size. Nothing is returned.
    """
    # Reference figures per model; sizes are approximate.
    models = {
        "all-MiniLM-L6-v2": {
            "dimensions": 384,
            "size_mb": 80,
            "speed": "fast"
        },
        "all-mpnet-base-v2": {
            "dimensions": 768,
            "size_mb": 400,
            "speed": "medium"
        },
        "BAAI/bge-small-en": {
            # bge-small-en produces 384-dimensional vectors.
            "dimensions": 384,
            "size_mb": 120,
            "speed": "fast"
        }
    }

    test_sentences = [
        "The quick brown fox jumps over the lazy dog",
        "Machine learning models require large datasets",
        "Natural language processing enables human-like interactions"
    ]

    for name, config in models.items():
        model = SentenceTransformer(name)

        # perf_counter is monotonic and high resolution, unlike time.time().
        start = time.perf_counter()
        model.encode(test_sentences)
        elapsed = time.perf_counter() - start

        print(f"{name}: {elapsed:.2f}s for {len(test_sentences)} sentences")
        print(f"  Dimensions: {config['dimensions']}, Size: {config['size_mb']}MB")

# Example output (timings vary with hardware and exclude model loading):
# all-MiniLM-L6-v2: 0.15s for 3 sentences
# all-mpnet-base-v2: 0.35s for 3 sentences  
# BAAI/bge-small-en: 0.20s for 3 sentences
Enter fullscreen mode Exit fullscreen mode

4. Vector Database Comparison

| Database | Pros | Cons | Best For |
| --- | --- | --- | --- |
| Chroma | Easy setup, Python native, good for dev | Limited scalability | Local/development |
| Qdrant | High performance, advanced filtering | Complex setup | Production |
| pgvector | PostgreSQL integration, ACID | Requires PostgreSQL | Existing SQL systems |
| Milvus | Scalable, distributed | Steep learning curve | Large deployments |
# Example implementation with different vector DBs
class VectorDBFactory:
    """Create a handle for one of the supported vector stores."""

    @staticmethod
    def create_vector_db(db_type, **kwargs):
        """Return a client, connection or collection for ``db_type``.

        Each backend package is imported only when it is selected, so only
        the chosen backend has to be installed.

        Args:
            db_type: One of ``"chroma"``, ``"qdrant"``, ``"pgvector"`` or
                ``"milvus"``.
            **kwargs: Forwarded unchanged to the backend call:
                ``get_or_create_collection`` (chroma), ``QdrantClient``
                (qdrant), ``psycopg2.connect`` (pgvector) or ``Collection``
                (milvus).

        Returns:
            A Chroma collection, a ``QdrantClient``, a psycopg2 connection
            or a pymilvus ``Collection``.

        Raises:
            ValueError: If ``db_type`` is not one of the supported names.
        """
        if db_type == "chroma":
            import chromadb
            client = chromadb.Client()
            # Collections are obtained from the client rather than built
            # by calling the Collection class directly.
            return client.get_or_create_collection(**kwargs)

        if db_type == "qdrant":
            from qdrant_client import QdrantClient
            return QdrantClient(**kwargs)

        if db_type == "pgvector":
            import psycopg2
            return psycopg2.connect(**kwargs)

        if db_type == "milvus":
            from pymilvus import Collection
            return Collection(**kwargs)

        # Fail loudly instead of handing callers an implicit None.
        raise ValueError(
            f"Unsupported vector DB type: {db_type!r}; "
            "expected 'chroma', 'qdrant', 'pgvector' or 'milvus'"
        )
Enter fullscreen mode Exit fullscreen mode

5. Full RAG Pipeline Implementation

import os
from sentence_transformers import SentenceTransformer
from chromadb import Client
from chromadb.config import Settings
from typing import List, Dict
import json

class CompleteRAGPipeline:
    """End-to-end RAG example: embed, store, retrieve, prompt, generate."""

    def __init__(self, model_name="all-MiniLM-L6-v2", db_path="./chroma_db"):
        """Load the embedding model and open the ``"documents"`` collection.

        Args:
            model_name: Name passed to ``SentenceTransformer``.
            db_path: Passed to Chroma ``Settings`` as ``persist_directory``.
        """
        self.embedding_model = SentenceTransformer(model_name)
        self.vector_client = Client(Settings(persist_directory=db_path))
        self.collection = self.vector_client.get_or_create_collection("documents")

        # Placeholder generator; replace with a real LLM call.
        self.llm = self._simple_llm_response

    def _simple_llm_response(self, prompt):
        """Stand-in for an LLM: echo the first 50 characters of ``prompt``."""
        return f"Generated response to: {prompt[:50]}..."

    def add_documents(self, documents: List[Dict]):
        """Embed ``documents`` and add them to the collection.

        Each document is a dict with ``'id'`` and ``'content'`` keys and an
        optional ``'metadata'`` dict (``{}`` is used when it is absent).
        An empty ``documents`` list is a no-op.
        """
        if not documents:
            # Nothing to embed or store; skip the model and the database.
            return

        contents = [doc['content'] for doc in documents]
        embeddings = self.embedding_model.encode(contents)

        self.collection.add(
            embeddings=embeddings,
            documents=contents,
            metadatas=[doc.get('metadata', {}) for doc in documents],
            ids=[doc['id'] for doc in documents]
        )

    def search_and_generate(self, query: str, top_k: int = 5):
        """Retrieve context for ``query`` and generate a response.

        Args:
            query: The user's question.
            top_k: Number of documents to request from the collection.

        Returns:
            A dict with the keys ``"query"``, ``"context"`` (retrieved
            documents joined by ``"\\n---\\n"``), ``"response"`` and
            ``"retrieved_docs"``.

        Raises:
            ValueError: If ``top_k`` is less than 1.
        """
        if top_k < 1:
            raise ValueError("top_k must be at least 1")

        # 1. Retrieve
        query_embedding = self.embedding_model.encode([query])
        results = self.collection.query(
            query_embeddings=query_embedding,
            n_results=top_k,
            include=['documents', 'metadatas']
        )

        # 2. Augment: results hold one list per query and a single query
        # was sent, so take the first list.
        retrieved_docs = results['documents'][0]
        context = "\n---\n".join(retrieved_docs)
        augmented_prompt = f"\nContext: {context}\nQuestion: {query}\nAnswer:"

        # 3. Generate
        response = self.llm(augmented_prompt)
        return {
            "query": query,
            "context": context,
            "response": response,
            "retrieved_docs": retrieved_docs
        }

# Usage example: index two small documents, then ask one question.
pipeline = CompleteRAGPipeline()

# Each document carries an id, its text and a metadata dict.
sample_docs = [
    dict(
        id="1",
        content="The capital of France is Paris. Paris is known for the Eiffel Tower.",
        metadata=dict(source="wiki"),
    ),
    dict(
        id="2",
        content="Machine learning is a subset of artificial intelligence that focuses on algorithms.",
        metadata=dict(source="tech_blog"),
    ),
]
pipeline.add_documents(sample_docs)

# Run retrieval + generation and pretty-print the resulting dict.
result = pipeline.search_and_generate("What is the capital of France?")
print(json.dumps(result, indent=2, ensure_ascii=False))
Enter fullscreen mode Exit fullscreen mode

6. Advanced Techniques

Query Transformation


python
class QueryTransformer:
    def __init__(self):
        self.transformations = [
            self.expand_query,
            self.rephrase_query,


---

📥 **Get the full guide on Gumroad**: https://gumroad.com/l/auto ($7)
Enter fullscreen mode Exit fullscreen mode

Top comments (0)