Introduction
Enterprise RAG (Retrieval-Augmented Generation) is an architecture that combines enterprise data retrieval systems with large language models to generate context-aware, grounded responses.
This guide shows you how to build production-grade RAG systems with security and compliance built-in.
Architecture Overview
┌──────────────┐     ┌──────────────┐     ┌──────────────┐
│  User Query  │────▶│  Embedding   │────▶│    Vector    │
└──────────────┘     │    Model     │     │   Database   │
                     └──────────────┘     └──────┬───────┘
                                                 │
                                                 ▼
                     ┌──────────────┐     ┌──────────────┐
                     │     LLM      │◀────│  Permission  │
                     │  Generation  │     │    Filter    │
                     └──────┬───────┘     └──────────────┘
                            │
                            ▼
                     ┌──────────────┐
                     │   Response   │
                     │  + Sources   │
                     └──────────────┘
Implementation
Step 1: Document Ingestion
from langchain.text_splitter import RecursiveCharacterTextSplitter
from openai import OpenAI

client = OpenAI()

# Chunk documents semantically
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=50,
    separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
)
chunks = text_splitter.split_text(document.content)

# Generate embeddings (one per chunk, returned in input order)
embeddings = client.embeddings.create(
    model="text-embedding-ada-002",
    input=chunks
)

# Store each chunk with its vector and metadata
for i, (chunk, embedding) in enumerate(zip(chunks, embeddings.data)):
    vector_db.insert(
        id=f"{document.id}_{i}",
        vector=embedding.embedding,
        metadata={
            "text": chunk,  # keep the chunk text so retrieval can return it for generation
            "document_id": document.id,
            "permissions": document.permissions,  # Critical for security
            "classification": document.classification,
            "source": document.url
        }
    )
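The snippet above assumes a document object and a vector_db client that are not defined in this guide. As a rough sketch of what the ingestion code expects from document, a dataclass like the following would satisfy the attributes used above; the field values shown are purely illustrative assumptions.

from dataclasses import dataclass
from typing import List

@dataclass
class Document:
    id: str
    content: str
    url: str
    permissions: List[str]   # roles allowed to read this document
    classification: int      # numeric sensitivity level; higher = more restricted

# Hypothetical example document (names and values are made up)
document = Document(
    id="hr-policy-2024",
    content=open("hr_policy.txt").read(),
    url="https://intranet.example.com/hr-policy",
    permissions=["hr", "admin"],
    classification=2,
)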
Step 2: Retrieval with Permission Filtering
from datetime import datetime

def retrieve_with_permissions(query: str, user: User, top_k: int = 5):
    # Embed the query with the same model used at ingestion time
    query_embedding = client.embeddings.create(
        model="text-embedding-ada-002",
        input=query
    ).data[0].embedding

    # Search the vector DB with a permission filter
    results = vector_db.search(
        query_embedding=query_embedding,
        filter={
            "permissions": {"$in": user.roles},  # Only docs the user can access
            "classification": {"$lte": user.clearance_level}
        },
        top_k=top_k
    )

    # Log the retrieval for auditing
    audit_log.record({
        "user_id": user.id,
        "query": query,
        "documents_accessed": [r.metadata["document_id"] for r in results],
        "timestamp": datetime.utcnow()
    })

    return results
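The filter syntax above uses Mongo-style operators ($in, $lte), which several vector databases support for server-side metadata filtering. If your store does not, the same check can be applied in application code after the similarity search. A minimal post-filter sketch, assuming each result exposes the metadata written at ingestion:

def passes_permission_filter(result, user: User) -> bool:
    # Allow a chunk only if the user holds at least one permitted role
    # and has sufficient clearance for the chunk's classification level.
    meta = result.metadata
    role_ok = any(role in meta["permissions"] for role in user.roles)
    clearance_ok = meta["classification"] <= user.clearance_level
    return role_ok and clearance_ok

# When post-filtering, over-fetch (e.g. top_k * 4) so that dropping
# unauthorized results still leaves enough usable chunks.
filtered = [r for r in results if passes_permission_filter(r, user)]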
Step 3: LLM Generation with Citations
from typing import List

def generate_response(query: str, retrieved_chunks: List[Chunk]):
    # Build the context block, labelling each chunk as a numbered source
    context = "\n\n".join([
        f"Source {i+1} ({chunk.metadata['source']}):\n{chunk.text}"
        for i, chunk in enumerate(retrieved_chunks)
    ])

    prompt = f"""Answer the question using ONLY the provided context.
You MUST cite sources using [Source N] notation.
If the answer is not in the context, say "I don't have enough information."

Context:
{context}

Question: {query}

Answer:"""

    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.1  # Low temperature for factual, reproducible answers
    )
    return response.choices[0].message.content
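Putting the three steps together, a request handler could look like the following sketch; answer_question and current_user are placeholder names, not part of any framework.

def answer_question(query: str, current_user: User) -> str:
    # Retrieve only chunks this user is allowed to see, then generate
    chunks = retrieve_with_permissions(query, current_user, top_k=5)
    if not chunks:
        return "I don't have enough information."
    return generate_response(query, chunks)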
Security Best Practices
- Encrypt vectors at rest in the database (AES-256)
- Filter by permissions at retrieval time, before any content reaches the LLM
- Audit all queries with immutable, append-only logs
- Scan responses for PII leakage before returning them (see the sketch below)
- Rate limit queries per user to slow bulk data exfiltration
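As one way to implement the response-scanning point above, here is a minimal regex-based sketch that checks the generated answer before it is returned. The patterns and the scan_for_pii helper are illustrative only; production systems typically use a dedicated PII/DLP service.

import re

# Illustrative patterns only, not an exhaustive PII detector
PII_PATTERNS = {
    "email": re.compile(r"\b[\w.+-]+@[\w-]+\.[\w.-]+\b"),
    "ssn": re.compile(r"\b\d{3}-\d{2}-\d{4}\b"),
    "credit_card": re.compile(r"\b(?:\d[ -]?){13,16}\b"),
}

def scan_for_pii(text: str) -> dict:
    # Return any PII-like matches found in the model's response
    findings = {}
    for label, pattern in PII_PATTERNS.items():
        matches = pattern.findall(text)
        if matches:
            findings[label] = matches
    return findings

answer = generate_response(query, retrieved_chunks)
if scan_for_pii(answer):
    answer = "The response was withheld because it may contain sensitive data."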
Performance Optimization
# Use re-ranking for better relevance
from typing import List
from sentence_transformers import CrossEncoder

reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

def rerank_results(query: str, results: List[Chunk], top_k: int = 5):
    # Score each (query, chunk) pair with the cross-encoder
    pairs = [[query, chunk.text] for chunk in results]
    scores = reranker.predict(pairs)

    # Keep the top-k chunks after re-ranking
    ranked = sorted(zip(results, scores), key=lambda x: x[1], reverse=True)
    return [chunk for chunk, score in ranked[:top_k]]
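To combine re-ranking with the retrieval step above, a common pattern is to over-fetch candidates from the vector database and let the cross-encoder pick the best few; the candidate count of 20 here is an assumption, not a tuned value.

# Over-fetch candidates, then re-rank down to the final context size
candidates = retrieve_with_permissions(query, user, top_k=20)
top_chunks = rerank_results(query, candidates, top_k=5)
answer = generate_response(query, top_chunks)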