DEV Community

matias yoon
matias yoon

Posted on

RAG 시스템 실전 구축 (v5)

RAG 시스템 실전 구축 (v5)

Practical Guide for ML Engineers & Backend Developers

Product: $5 RAG Pipeline Starter Kit with 500+ Code Examples

1. RAG Fundamentals: The Retrieval-Augmentation-Generation Loop

Retrieval-Augmented Generation (RAG) systems work in three phases:

# Basic RAG workflow
def rag_pipeline(query, vector_db, llm):
    """Answer ``query`` via retrieval, augmentation, then generation.

    Args:
        query: The user question; passed to the retriever and to the
            prompt formatter.
        vector_db: Object exposing ``search(query, top_k=...)``.
        llm: Object exposing ``generate(prompt)``.

    Returns:
        Whatever ``llm.generate`` returns for the augmented prompt.
    """
    # Retrieval: ask the store for its top 5 hits for this query.
    context_docs = vector_db.search(query, top_k=5)

    # Augmentation + generation: fold the hits into a prompt and hand it
    # straight to the model.
    # NOTE(review): format_prompt is not defined in the visible source;
    # it must be provided elsewhere.
    return llm.generate(format_prompt(query, context_docs))
Enter fullscreen mode Exit fullscreen mode

Key components:

  • Retriever: Finds relevant documents (vector similarity search)
  • Augmenter: Formats context into prompt
  • Generator: LLM produces final answer

2. Chunking Strategies: Breaking Down Documents Efficiently

Semantic Chunking (Best for code understanding):

import tiktoken
from sentence_transformers import SentenceTransformer

def semantic_chunking(text, model, max_tokens=512):
    """Group sentences of ``text`` into chunks that fit a token budget.

    The text is split on ``'. '``; sentences are then packed greedily so
    that each chunk stays within ``max_tokens`` where possible. A single
    sentence that is larger than the budget is kept whole as its own
    chunk rather than being dropped or split.

    Args:
        text: The document to chunk.
        model: Embedding model used for token counting. If it exposes a
            ``tokenizer`` attribute with a ``tokenize`` method, that is
            used; otherwise (including ``None``) tokens are approximated
            by whitespace-separated words.
            NOTE(review): confirm your model exposes ``tokenizer`` if an
            exact count matters.
        max_tokens: Maximum token count per chunk.

    Returns:
        A list of non-empty chunk strings; ``[]`` for empty input.
    """
    tokenizer = getattr(model, "tokenizer", None)

    def count_tokens(piece):
        # Prefer the model's own tokenizer so the budget matches what the
        # embedder will actually see.
        if tokenizer is not None and hasattr(tokenizer, "tokenize"):
            return len(tokenizer.tokenize(piece))
        return len(piece.split())

    chunks = []
    current = []

    for raw_sentence in text.split('. '):
        sentence = raw_sentence.strip()
        if not sentence:
            continue
        # Splitting removed the period; restore it unless the sentence
        # already ends with terminal punctuation (avoids "..").
        if not sentence.endswith(('.', '!', '?')):
            sentence += '.'

        candidate = ' '.join(current + [sentence])
        # Only flush when there is something to flush; otherwise an
        # oversized first sentence would emit an empty chunk.
        if current and count_tokens(candidate) > max_tokens:
            chunks.append(' '.join(current))
            current = [sentence]
        else:
            current.append(sentence)

    if current:
        chunks.append(' '.join(current))
    return chunks
Enter fullscreen mode Exit fullscreen mode

Recursive Chunking (For structured data):

def recursive_chunking(text, chunk_size=1000):
    """Split ``text`` by progressively finer delimiters until chunks fit.

    Delimiters are tried in order: blank line, newline, sentence break,
    space. Only chunks still longer than ``chunk_size`` are split again.
    If a chunk has none of those delimiters left, it is cut into
    fixed-width slices of ``chunk_size`` characters.

    Args:
        text: The text to split.
        chunk_size: Maximum chunk length in characters; must be >= 1.

    Returns:
        The resulting chunks, excluding any of 10 characters or fewer
        (short fragments are discarded as noise).

    Raises:
        ValueError: If ``chunk_size`` is less than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    delimiters = ['\n\n', '\n', '. ', ' ', '']
    chunks = [text]

    for delim in delimiters:
        new_chunks = []
        for chunk in chunks:
            if len(chunk) <= chunk_size:
                new_chunks.append(chunk)
            elif delim:
                new_chunks.extend(chunk.split(delim))
            else:
                # str.split('') raises ValueError, so the last resort is
                # a hard cut every chunk_size characters.
                new_chunks.extend(
                    chunk[start:start + chunk_size]
                    for start in range(0, len(chunk), chunk_size)
                )
        chunks = new_chunks
        if all(len(chunk) <= chunk_size for chunk in chunks):
            break

    return [chunk for chunk in chunks if len(chunk) > 10]
Enter fullscreen mode Exit fullscreen mode

3. Embedding Model Selection and Comparison

Top 5 Models for Code RAG:

# Static comparison table keyed by embedding-model name. Each entry records
# the embedding width ("dimensions"), an approximate download size in
# megabytes ("size_mb"), and qualitative "speed" / "code_sensitivity" labels.
# NOTE(review): these figures are hand-entered, not measured by any code in
# this file — verify against the model cards before relying on them.
models = {
    "all-MiniLM-L6-v2": {
        "dimensions": 384,
        "size_mb": 100,
        "speed": "fast",
        "code_sensitivity": "high"
    },
    "Sentence-BERT": {
        "dimensions": 768,
        "size_mb": 400,
        "speed": "medium",
        "code_sensitivity": "very_high"
    },
    "CodeBERT": {
        "dimensions": 768,
        "size_mb": 1000,
        "speed": "medium",
        "code_sensitivity": "excellent"
    },
    "MiniLM": {
        "dimensions": 384,
        "size_mb": 50,
        "speed": "fast",
        "code_sensitivity": "medium"
    },
    "MPNet": {
        "dimensions": 768,
        "size_mb": 400,
        "speed": "slow",
        "code_sensitivity": "high"
    }
}

# Implementation example
class EmbeddingService:
    """Thin wrapper that owns a ``SentenceTransformer`` model."""

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        """Load the sentence-transformers model called ``model_name``."""
        self.model = SentenceTransformer(model_name)

    def encode(self, texts):
        """Delegate to the model's ``encode`` and return its result as-is."""
        vectors = self.model.encode(texts)
        return vectors

    def get_dimensions(self):
        """Return the sentence-embedding dimension reported by the model."""
        dimension = self.model.get_sentence_embedding_dimension()
        return dimension

# Benchmark function
def benchmark_embeddings(num_texts=1000, service=None):
    """Time how long a service takes to embed a batch of identical texts.

    Only the ``encode`` call is timed; constructing the default service
    (which loads the model) happens before the clock starts.

    Args:
        num_texts: Number of sample texts in the batch (default 1000).
        service: Object exposing ``encode(texts)``. When ``None``, a
            default ``EmbeddingService()`` is created.

    Returns:
        Whatever ``service.encode`` returned for the batch.
    """
    import time

    if service is None:
        service = EmbeddingService()
    texts = ["Sample code snippet"] * num_texts

    # perf_counter is monotonic and high-resolution; time.time() can jump
    # if the system clock is adjusted mid-benchmark.
    start = time.perf_counter()
    embeddings = service.encode(texts)
    elapsed = time.perf_counter() - start

    print(f"Time for {num_texts} embeddings: {elapsed:.2f}s")
    return embeddings
Enter fullscreen mode Exit fullscreen mode

4. Vector Database Comparison

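| Database | Speed  | Memory | Cost | Best For         |
|----------|--------|--------|------|------------------|
| Chroma   | Fast   | Low    | Free | Development      |
| Qdrant   | Fast   | Medium | Free | Production       |
| pgvector | Medium | High   | Free | PostgreSQL users |
| Milvus   | Fast   | High   | Free | Large-scale      |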
# Chroma implementation
import chromadb

class ChromaVectorStore:
    """Minimal document store backed by an in-process Chroma collection."""

    def __init__(self, collection_name="code_docs"):
        """Create a Chroma client and open (or create) ``collection_name``."""
        self.client = chromadb.Client()
        self.collection = self.client.get_or_create_collection(collection_name)

    def add_documents(self, documents, embeddings, ids=None):
        """Add documents with their precomputed embeddings.

        Args:
            documents: List of document strings.
            embeddings: One embedding per document, in the same order.
            ids: Optional explicit IDs, one per document. When omitted,
                sequential string IDs are generated starting at the
                collection's current size, so repeated calls do not reuse
                ``"0"``, ``"1"``, ... and collide with earlier batches.

        Returns:
            None. An empty ``documents`` list is a no-op.
        """
        if len(documents) == 0:
            # Nothing to store; avoid sending an empty batch to Chroma.
            return

        if ids is None:
            offset = self.collection.count()
            ids = [str(offset + i) for i in range(len(documents))]

        self.collection.add(
            documents=documents,
            embeddings=embeddings,
            ids=ids
        )

    def search(self, query, top_k=5):
        """Return the documents Chroma ranks highest for a text query.

        NOTE(review): the query is passed as text (``query_texts``), so
        Chroma embeds it itself, whereas ``add_documents`` stores
        caller-supplied embeddings — confirm both use the same model.

        Args:
            query: Query text.
            top_k: Maximum number of documents to return.

        Returns:
            The list of matching document strings for this query.
        """
        result = self.collection.query(
            query_texts=[query],
            n_results=top_k
        )
        # One query was sent, so take the first (only) result list.
        return result['documents'][0]

# Qdrant implementation
from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition

class QdrantVectorStore:
    """Read-only search helper over a single Qdrant collection."""

    def __init__(self, host="localhost", port=6333, collection_name="code_docs"):
        """Connect to Qdrant at ``host``:``port`` and remember the collection name."""
        self.client = QdrantClient(host=host, port=port)
        self.collection_name = collection_name

    def search(self, query_vector, top_k=5):
        """Return the ``'text'`` payload field of each hit for ``query_vector``.

        Args:
            query_vector: Embedding to search with.
            top_k: Maximum number of hits requested from Qdrant.

        Returns:
            A list with one ``payload['text']`` value per hit, in the
            order the client returned them.
        """
        hits = self.client.search(
            collection_name=self.collection_name,
            query_vector=query_vector,
            limit=top_k,
        )
        texts = []
        for hit in hits:
            texts.append(hit.payload['text'])
        return texts
Enter fullscreen mode Exit fullscreen mode

5. Full RAG Pipeline from Scratch

import os
import json
from pathlib import Path
from typing import List, Dict
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb import Client

class CodeRAGPipeline:
    """End-to-end RAG pipeline over source files, backed by Chroma.

    Covers the full loop: split a file into chunks, embed and store them,
    retrieve the nearest chunks for a query, and build an answer from the
    retrieved context.
    """

    def __init__(self, embedding_model="all-MiniLM-L6-v2", db_path="./chroma_db", llm=None):
        """Load the embedding model and open the persistent collection.

        Args:
            embedding_model: sentence-transformers model name.
            db_path: Directory for the persistent Chroma database.
            llm: Optional object exposing ``generate(prompt)``. When
                ``None``, ``generate_response`` returns a placeholder.
        """
        self.embedding_model = SentenceTransformer(embedding_model)
        self.client = chromadb.PersistentClient(path=db_path)
        self.collection = self.client.get_or_create_collection("code_documents")
        self.chunk_size = 1000
        self.llm = llm

    def preprocess_code_files(self, file_path: str) -> List[str]:
        """Split a source file into chunks at ``def``/``class`` lines.

        A new chunk starts at every line whose stripped text begins with
        ``def ``, ``async def `` or ``class ``. Anything before the first
        such line (imports, module docstring) forms its own chunk.

        Args:
            file_path: Path to a UTF-8 encoded source file.

        Returns:
            The chunks in file order; whitespace-only chunks are omitted,
            so an empty file yields ``[]``.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        boundary_prefixes = ('def ', 'async def ', 'class ')
        chunks = []
        current_chunk = []

        for line in content.split('\n'):
            if current_chunk and line.strip().startswith(boundary_prefixes):
                chunks.append('\n'.join(current_chunk))
                current_chunk = [line]
            else:
                current_chunk.append(line)

        if current_chunk:
            chunks.append('\n'.join(current_chunk))

        # Blank chunks carry no information and only pollute the index.
        return [chunk for chunk in chunks if chunk.strip()]

    def embed_and_store(self, documents: List[str], file_path: str):
        """Embed ``documents`` and write them to the vector collection.

        IDs are derived from ``file_path`` and the chunk index, and the
        write is an upsert, so re-ingesting the same file overwrites its
        earlier chunks instead of colliding on duplicate IDs.

        Args:
            documents: Chunk texts to store.
            file_path: Source path used as the ID prefix.

        Returns:
            None. An empty ``documents`` list is a no-op.
        """
        if not documents:
            return

        embeddings = self.embedding_model.encode(documents)

        self.collection.upsert(
            documents=documents,
            embeddings=embeddings.tolist(),
            ids=[f"{file_path}_{i}" for i in range(len(documents))]
        )

    def query(self, query: str, top_k: int = 5) -> List[Dict]:
        """Retrieve the stored chunks closest to ``query``.

        Args:
            query: Natural-language query text.
            top_k: Maximum number of chunks to return.

        Returns:
            A list of ``{'text': ..., 'distance': ...}`` dicts in the
            order Chroma returned them.
        """
        query_embedding = self.embedding_model.encode([query])

        results = self.collection.query(
            query_embeddings=query_embedding.tolist(),
            n_results=top_k
        )

        # A single query was sent, so index 0 holds its results.
        return [
            {
                'text': doc,
                'distance': dist
            }
            for doc, dist in zip(results['documents'][0], results['distances'][0])
        ]

    def generate_response(self, query: str, retrieved_docs: List[Dict],
                          max_context_docs: int = 3) -> str:
        """Build a prompt from retrieved chunks and produce an answer.

        Args:
            query: The user question.
            retrieved_docs: Results from ``query``; each needs a
                ``'text'`` key.
            max_context_docs: How many leading documents go into the
                prompt context (default 3).

        Returns:
            ``self.llm.generate(prompt)`` when an LLM was configured;
            otherwise a placeholder string reporting how many documents
            were retrieved.
        """
        context = "\n\n".join(doc['text'] for doc in retrieved_docs[:max_context_docs])
        prompt = f"Context: {context}\n\nQuestion: {query}\n\nAnswer:"

        if self.llm is not None:
            return self.llm.generate(prompt)

        # No LLM wired in: keep the original placeholder so existing
        # callers see the same output.
        return f"Based on {len(retrieved_docs)} documents, the answer is..."
Enter fullscreen mode Exit fullscreen mode

6. Advanced Techniques: Query Transformation & Hybrid Search

Query Transformation:


python
def transform_query(original_query: str) -> List[str]:
    """Return the query followed by four fixed rephrasings of it.

    The first element is always the unmodified query; the rest prepend
    an instruction-style lead-in to it.
    """
    lead_ins = (
        "Explain ",
        "How to implement ",
        "Code example for ",
        "Best practices for ",
    )
    variants = [original_query]
    variants.extend(f"{lead_in}{original_query}" for lead_in in lead_ins)
    return variants

class HybridSearchRAG:
    """Pair a vector store with a keyword search engine for hybrid retrieval.

    NOTE(review): ``TextSearchEngine`` is not defined in the visible
    source; it must be provided elsewhere for this class to instantiate.
    """

    def __init__(self):
        """Create the vector store and the keyword search engine."""
        self.vector_store = ChromaVectorStore()
        self.text_store = TextSearchEngine()

    def hybrid_search(self, query: str, top_k: int = 5):
        """Run both the vector and the keyword search for ``query``.

        NOTE(review): the visible code stops after collecting both result
        sets — the combination step is not implemented here, so the
        method currently falls through and returns ``None``.

        Args:
            query: Query text passed to both search backends.
            top_k: Result limit passed to both search backends.
        """
        # Vector search
        vector_results = self.vector_store.search(query, top_k)

        # Keyword search (simplified)
        keyword_results = self.text_store.search(query, top_k)

        # Combine results with weighted scoring

---

📥 **Get the full guide on Gumroad**: https://gumroad.com/l/auto ($7)
Enter fullscreen mode Exit fullscreen mode

Top comments (0)