DEV Community

matias yoon
matias yoon

Posted on

RAG 시스템 실전 구축 (v7)

RAG 시스템 실전 구축 (v7)

Practical Guide for ML Engineers & Backend Developers

1. RAG Fundamentals: The Retrieval-Augmented Generation Loop

Retrieval-Augmented Generation (RAG) systems work in a three-step loop:

  1. Retrieval: Find relevant documents from a knowledge base
  2. Augmentation: Combine retrieved documents with the original query
  3. Generation: Use an LLM to produce the final response from the augmented prompt

The core loop looks like this:

def rag_pipeline(query, vector_db, llm):
    """Run one retrieve -> augment -> generate pass for ``query``.

    Args:
        query: The user's question; passed unchanged to the vector store
            and to ``format_prompt``.
        vector_db: Object exposing ``search(query, k=...)``.
        llm: Object exposing ``generate(prompt)``.

    Returns:
        Whatever ``llm.generate`` returns for the augmented prompt.
    """
    # Retrieval: ask the store for five documents matching the query.
    context_docs = vector_db.search(query, k=5)

    # Augmentation: merge the query and the retrieved documents into one
    # prompt. NOTE(review): ``format_prompt`` is not defined in this
    # snippet and must be supplied by the surrounding module.
    prompt = format_prompt(query, context_docs)

    # Generation: hand the prompt to the model and return its answer as-is.
    return llm.generate(prompt)
Enter fullscreen mode Exit fullscreen mode

With sound chunking and embedding strategies, this simple structure covers most real-world applications.

2. Chunking Strategies

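Semantic Chunking (Recommended for most use cases) — note that the implementation below groups whole paragraphs under a token budget; it does not compare embeddings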

import tiktoken
from sentence_transformers import SentenceTransformer
from typing import List, Tuple

def semantic_chunking(text: str, model: SentenceTransformer, max_tokens: int = 512) -> List[str]:
    """Group the paragraphs of ``text`` into chunks that fit a token budget.

    Paragraphs (separated by a blank line, i.e. ``'\\n\\n'``) are appended to
    the current chunk until adding the next one would exceed ``max_tokens``;
    the chunk is then closed and a new one started. Paragraphs inside a chunk
    are joined with a single space.

    Args:
        text: The document to chunk.
        model: Currently unused; kept so existing callers keep working.
        max_tokens: Maximum number of GPT-4 tokens per chunk.

    Returns:
        The chunks in document order. Empty or whitespace-only input yields
        an empty list. A single paragraph larger than ``max_tokens`` is not
        subdivided and is returned as one oversized chunk.
    """
    # Loop-invariant: resolve the tokenizer once instead of once per paragraph.
    encoder = tiktoken.encoding_for_model("gpt-4")

    chunks: List[str] = []
    current_chunk: List[str] = []
    current_length = 0

    for para in text.split('\n\n'):
        if not para.strip():
            # Blank paragraphs carry no content; emitting them produced
            # empty chunks (e.g. [''] for empty input).
            continue

        # disallowed_special=() makes text such as "<|endoftext|>" inside a
        # document count as ordinary text instead of raising.
        para_tokens = len(encoder.encode(para, disallowed_special=()))

        if current_chunk and current_length + para_tokens > max_tokens:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_length = 0

        current_chunk.append(para)
        current_length += para_tokens

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks
Enter fullscreen mode Exit fullscreen mode

Recursive Chunking (for documents with clear structure)

def recursive_chunking(text: str, max_size: int = 1000) -> List[str]:
    """Split ``text`` into paragraph chunks of at most ``max_size`` characters.

    The text is first split on blank lines and every non-empty paragraph
    becomes its own chunk. A paragraph longer than ``max_size`` is subdivided
    on progressively finer boundaries (line breaks, sentence ends,
    whitespace) and, as a last resort, cut at fixed character offsets.
    Fragments produced by such a subdivision are packed back together,
    separated by a single space, as long as the result fits in ``max_size``.

    Previously an oversized paragraph was re-split on blank lines only, which
    returned the same string again and recursed until ``RecursionError``.

    Args:
        text: The document to chunk.
        max_size: Maximum chunk length in characters; must be positive.

    Returns:
        Stripped, non-empty chunks in document order.

    Raises:
        ValueError: If ``max_size`` is not positive.
    """
    import re

    if max_size <= 0:
        raise ValueError("max_size must be a positive integer")

    # Boundaries tried, coarse to fine, on a paragraph that is too long.
    fallback_separators = (r'\n+', r'(?<=[.!?])\s+', r'\s+')

    def split_oversized(piece: str, separators) -> List[str]:
        """Break ``piece`` (longer than ``max_size``) into fitting chunks."""
        for index, pattern in enumerate(separators):
            fragments = [frag.strip() for frag in re.split(pattern, piece)]
            fragments = [frag for frag in fragments if frag]
            if len(fragments) < 2:
                # This boundary does not occur in the piece; try a finer one.
                continue

            packed: List[str] = []
            buffer = ""
            for frag in fragments:
                candidate = f"{buffer} {frag}" if buffer else frag
                if len(candidate) <= max_size:
                    buffer = candidate
                    continue
                if buffer:
                    packed.append(buffer)
                    buffer = ""
                if len(frag) <= max_size:
                    buffer = frag
                else:
                    # Only finer separators can still split this fragment,
                    # so the recursion always terminates.
                    packed.extend(split_oversized(frag, separators[index + 1:]))
            if buffer:
                packed.append(buffer)
            return packed

        # No boundary left (one unbroken token): cut at fixed offsets.
        return [piece[start:start + max_size]
                for start in range(0, len(piece), max_size)]

    chunks: List[str] = []
    for part in re.split(r'\n\s*\n', text):
        part = part.strip()
        if not part:
            continue
        if len(part) <= max_size:
            chunks.append(part)
        else:
            chunks.extend(split_oversized(part, fallback_separators))

    return chunks
Enter fullscreen mode Exit fullscreen mode

3. Embedding Model Selection and Comparison

from sentence_transformers import SentenceTransformer
from InstructorEmbedding import INSTRUCTOR
import numpy as np

class EmbeddingModel:
    """Thin wrapper that loads one of a fixed set of embedding models by name.

    The loaded model is stored on ``self.model``; ``encode`` and
    ``get_dimension`` delegate to it.
    """

    # Names accepted by ``__init__``.
    SUPPORTED_MODELS = ("all-MiniLM-L6-v2", "instructor-large", "gte-small")

    def __init__(self, model_name: str):
        """Load the model registered under ``model_name``.

        Raises:
            ValueError: If ``model_name`` is not in ``SUPPORTED_MODELS``.
                Previously an unknown name left ``self.model`` unset and the
                failure only surfaced later as an ``AttributeError``.
        """
        if model_name == "all-MiniLM-L6-v2":
            self.model = SentenceTransformer("all-MiniLM-L6-v2")
        elif model_name == "instructor-large":
            # NOTE(review): INSTRUCTOR models are normally fed
            # [instruction, text] pairs; confirm plain strings are intended.
            self.model = INSTRUCTOR("hkunlp/instructor-large")
        elif model_name == "gte-small":
            # NOTE(review): verify this Hub id resolves; the GTE models are
            # usually published under "thenlper/gte-small".
            self.model = SentenceTransformer("sentence-transformers/gte-small")
        else:
            supported = ", ".join(self.SUPPORTED_MODELS)
            raise ValueError(
                f"Unsupported embedding model {model_name!r}; expected one of: {supported}"
            )
        self.model_name = model_name

    def encode(self, texts: List[str]) -> np.ndarray:
        """Return the wrapped model's embeddings for ``texts``."""
        return self.model.encode(texts)

    def get_dimension(self) -> int:
        """Return the embedding dimension reported by the wrapped model."""
        return self.model.get_sentence_embedding_dimension()

# Benchmark comparison
def benchmark_embeddings(models: List[str], test_texts: List[str]):
    """Time how long each named model takes to embed ``test_texts``.

    Args:
        models: Model names accepted by ``EmbeddingModel``.
        test_texts: Texts to embed with every model.

    Returns:
        A dict keyed by model name. Each value holds ``"time"`` (seconds
        spent in ``encode``; model loading is excluded), ``"dimension"`` and
        ``"size_mb"`` (estimated embedding size, assuming 4 bytes per value).
    """
    # ``time`` is not imported at module level in this snippet, so the
    # original raised NameError; import it locally.
    import time

    results = {}
    for model_name in models:
        model = EmbeddingModel(model_name)
        dimension = model.get_dimension()

        # perf_counter is monotonic and high-resolution, unlike wall-clock
        # time.time(), so short runs are measured reliably.
        started = time.perf_counter()
        embeddings = model.encode(test_texts)
        elapsed = time.perf_counter() - started

        results[model_name] = {
            "time": elapsed,
            "dimension": dimension,
            # rows * dimension * 4 bytes, presumably float32 — TODO confirm
            "size_mb": len(embeddings) * dimension * 4 / (1024 * 1024),
        }
    return results
Enter fullscreen mode Exit fullscreen mode

Model Comparison (for 1000 texts):

  • all-MiniLM-L6-v2: Fast (0.5s), 384d, ≈1.5MB
  • gte-small: Balanced (0.8s), 384d, ≈1.5MB
  • instructor-large: Slow (1.5s), 768d, ≈2.9MB

Recommendation: Use all-MiniLM-L6-v2 for most applications; it offers a good balance of speed and quality.

4. Vector Database Comparison

# Chroma Client
import chromadb
from chromadb import Client

class ChromaVectorDB:
    """Document store backed by an on-disk Chroma collection named "documents"."""

    def __init__(self, path: str, embedder=None):
        """Open (or create) the collection stored under ``path``.

        Args:
            path: Directory where Chroma persists its data.
            embedder: Object exposing ``encode(list_of_texts)``. Defaults to
                ``SentenceTransformer("all-MiniLM-L6-v2")``. The original
                never assigned ``self.embedder``, so every call to
                ``add_documents``/``search`` raised ``AttributeError``.
        """
        self.embedder = (
            embedder if embedder is not None
            else SentenceTransformer("all-MiniLM-L6-v2")
        )
        # ``chromadb.Client`` takes a settings object, not a path;
        # ``PersistentClient`` is the path-based entry point.
        self.client = chromadb.PersistentClient(path=path)
        self.collection = self.client.get_or_create_collection("documents")

    def add_documents(self, documents: List[str], ids: List[str]):
        """Embed ``documents`` and store them under the matching ``ids``.

        Raises:
            ValueError: If ``documents`` and ``ids`` differ in length.
        """
        if len(documents) != len(ids):
            raise ValueError("documents and ids must have the same length")
        if not documents:
            # Nothing to store; skip the embedding and the round trip.
            return

        embeddings = self.embedder.encode(documents)
        self.collection.add(
            # Plain lists of floats are accepted by every Chroma version.
            embeddings=[vector.tolist() for vector in embeddings],
            documents=documents,
            ids=ids
        )

    def search(self, query: str, k: int = 5) -> List[str]:
        """Return the texts of up to ``k`` documents retrieved for ``query``."""
        query_embedding = self.embedder.encode([query])[0]
        results = self.collection.query(
            query_embeddings=[query_embedding.tolist()],
            n_results=k,
            # Only the texts are returned, so distances are not requested.
            include=['documents']
        )
        # One result list per query embedding; exactly one query was sent.
        return results['documents'][0]

# Qdrant Client
from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition

class QdrantVectorDB:
    """Document store backed by a Qdrant collection named "documents".

    NOTE(review): nothing here creates the collection; it presumably has to
    exist already with a vector size matching the embedder — confirm.
    """

    def __init__(self, host: str, port: int, embedder=None):
        """Connect to Qdrant at ``host``:``port``.

        Args:
            host: Qdrant server host.
            port: Qdrant server port.
            embedder: Object exposing ``encode(list_of_texts)``. Defaults to
                ``SentenceTransformer("all-MiniLM-L6-v2")``. The original
                never assigned ``self.embedder``, so every call to
                ``add_documents``/``search`` raised ``AttributeError``.
        """
        self.embedder = (
            embedder if embedder is not None
            else SentenceTransformer("all-MiniLM-L6-v2")
        )
        self.client = QdrantClient(host=host, port=port)
        self.collection_name = "documents"

    @staticmethod
    def _point_id(doc_id) -> str:
        """Map a caller-supplied id to a deterministic UUID string."""
        import uuid

        # Qdrant point ids must be unsigned integers or UUIDs, so arbitrary
        # string ids are hashed into a stable UUID.
        return str(uuid.uuid5(uuid.NAMESPACE_URL, str(doc_id)))

    def add_documents(self, documents: List[str], ids: List[str]):
        """Embed ``documents`` and upsert them under the matching ``ids``.

        The original ignored ``ids`` and numbered points 0..n-1 on every
        call, so each new batch overwrote the previous one. Points are now
        keyed by a UUID derived from the caller's id, and that id is kept in
        the payload under ``"doc_id"``.

        Raises:
            ValueError: If ``documents`` and ``ids`` differ in length.
        """
        if len(documents) != len(ids):
            raise ValueError("documents and ids must have the same length")
        if not documents:
            return

        # One batched encode call instead of one call per document.
        vectors = self.embedder.encode(documents)
        points = [
            {
                "id": self._point_id(doc_id),
                "vector": vector.tolist(),
                "payload": {"text": doc, "doc_id": doc_id}
            } for doc_id, doc, vector in zip(ids, documents, vectors)
        ]
        self.client.upsert(
            collection_name=self.collection_name,
            points=points
        )

    def search(self, query: str, k: int = 5) -> List[str]:
        """Return the stored texts of up to ``k`` hits for ``query``."""
        query_vector = self.embedder.encode([query])[0]
        results = self.client.search(
            collection_name=self.collection_name,
            query_vector=query_vector.tolist(),
            limit=k
        )
        return [hit.payload['text'] for hit in results]

# pgvector with PostgreSQL
import psycopg2
from psycopg2.extras import Json

class PgVectorDB:
    """Document store backed by a PostgreSQL ``documents`` table using pgvector."""

    def __init__(self, connection_string: str, embedder=None, dimension: int = 384):
        """Connect to PostgreSQL and make sure the table and index exist.

        Args:
            connection_string: DSN passed to ``psycopg2.connect``.
            embedder: Object exposing ``encode(list_of_texts)``. Defaults to
                ``SentenceTransformer("all-MiniLM-L6-v2")``. The original
                never assigned ``self.embedder``, so every call to
                ``add_documents``/``search`` raised ``AttributeError``.
            dimension: Width of the ``embedding`` column; must match the
                embedder's output size (384 for the default model).

        Raises:
            ValueError: If ``dimension`` is not a positive integer.
        """
        if isinstance(dimension, bool) or not isinstance(dimension, int) or dimension <= 0:
            raise ValueError("dimension must be a positive integer")
        self.dimension = dimension
        self.embedder = (
            embedder if embedder is not None
            else SentenceTransformer("all-MiniLM-L6-v2")
        )
        self.conn = psycopg2.connect(connection_string)
        self.create_table()

    def create_table(self):
        """Create the pgvector extension, the table and its index if missing."""
        with self.conn.cursor() as cur:
            # The VECTOR type only exists once the extension is installed.
            cur.execute("CREATE EXTENSION IF NOT EXISTS vector")
            # ``dimension`` is validated as an int in __init__, so
            # interpolating it into the DDL is safe.
            cur.execute(f"""
                CREATE TABLE IF NOT EXISTS documents (
                    id SERIAL PRIMARY KEY,
                    content TEXT,
                    embedding VECTOR({self.dimension}),
                    metadata JSONB
                )
            """)
            cur.execute("""
                CREATE INDEX IF NOT EXISTS idx_embedding 
                ON documents USING ivfflat (embedding vector_l2_ops)
            """)
        self.conn.commit()

    def add_documents(self, documents: List[str], metadata: List[dict]):
        """Embed ``documents`` and insert them with their ``metadata``.

        Raises:
            ValueError: If ``documents`` and ``metadata`` differ in length
                (``zip`` would otherwise silently drop the extras).
        """
        if len(documents) != len(metadata):
            raise ValueError("documents and metadata must have the same length")
        if not documents:
            return

        # One batched encode call instead of one call per document.
        embeddings = self.embedder.encode(documents)
        with self.conn.cursor() as cur:
            for doc, meta, embedding in zip(documents, metadata, embeddings):
                cur.execute(
                    "INSERT INTO documents (content, embedding, metadata) "
                    "VALUES (%s, %s::vector, %s)",
                    (doc, embedding.tolist(), Json(meta))
                )
        self.conn.commit()

    def search(self, query: str, k: int = 5) -> List[str]:
        """Return the contents of the ``k`` rows closest to ``query`` (L2 distance)."""
        query_embedding = self.embedder.encode([query])[0]
        with self.conn.cursor() as cur:
            # psycopg2 sends a Python list as a SQL array; the explicit cast
            # is needed because ``<->`` is defined between vectors only.
            cur.execute("""
                SELECT content FROM documents 
                ORDER BY embedding <-> %s::vector 
                LIMIT %s
            """, (query_embedding.tolist(), k))
            results = cur.fetchall()
        return [r[0] for r in results]

# Benchmark Results:
# Chroma: Fastest (0.1s), Local-only, good for dev
# Qdrant: Best for production, distributed, good performance
# PgVector: Most scalable, best for large datasets
# Milvus: Highest performance, best for enterprise
Enter fullscreen mode Exit fullscreen mode

5. Full RAG Pipeline Code from Scratch


python
import os
import json
from typing import List, Dict, Any
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb import Client

class RAGSystem:
    def __init__(self, embedding_model_name: str = "all-MiniLM-L6-v2"):
        self.embedding_model = SentenceTransformer(embedding_model_name)
        self.client = Client()
        self.collection = self.client.get_or_create_collection("docs")

    def add_documents(self, documents: List[Dict[str, Any]]):
        """Add documents to the system"""
        texts = [doc['content'] for doc in documents]
        ids = [doc['id'] for doc in documents]

        # Chunk documents
        chunked_texts = []
        chunked_ids = []
        for doc, doc_id in zip(documents, ids):
            chunks = semantic_chunking(doc['content'], self.embedding_model)
            for i, chunk in enumerate(chunks):
                chunked_texts.append(chunk)
                chunked_ids.append(f"{doc_id}_chunk_{i}")

        # Create embeddings
        embeddings = self.embedding_model.encode(chunked_texts)

        # Store in vector DB
        self.collection.add(
            embeddings

---

📥 **Get the full guide on Gumroad**: https://gumroad.com/l/auto ($7)
Enter fullscreen mode Exit fullscreen mode

Top comments (0)