Every LLM has a context window limit — a maximum number of tokens you can pass in a single request. Claude 3.5 Sonnet offers 200K tokens, but that's still finite. Here's how to manage context efficiently for production AI applications.
## Understanding Context Window Limits
| Model | Context Window | Approximate Pages |
|---|---|---|
| Claude 3.5 Sonnet | 200K tokens | ~500 pages |
| GPT-4 Turbo | 128K tokens | ~300 pages |
| Claude 3 Opus | 200K tokens | ~500 pages |
When you exceed the limit, you get an error. When you're close, you're wasting money on tokens that add no value.
## Token Estimation
```python
import re
def estimate_tokens(text: str) -> int:
    """Rough token estimate: roughly 4 characters per token for English."""
    char_count = len(text)
    return char_count // 4
def estimate_tokens_precise(text: str) -> int:
    """
    More precise estimation using word count.

    Average English word is ~1.3 tokens.
    """
    # \w+ counts runs of word characters, so punctuation is ignored.
    words = len(re.findall(r'\w+', text))
    return int(words * 1.3)
```
## Technique 1: Semantic Chunking
Split documents by meaning, not by character count:
```python
import re
def semantic_chunk(text: str, max_tokens: int = 4000, overlap: int = 200) -> list[str]:
    """
    Split text into semantic chunks (paragraphs).

    Paragraphs are packed greedily into chunks of at most ``max_tokens``
    (estimated) tokens; up to ``overlap`` tokens of trailing paragraphs are
    repeated at the start of the next chunk for context continuity.

    NOTE(review): a single paragraph longer than ``max_tokens`` is still
    emitted as one oversized chunk — confirm that is acceptable upstream.
    """
    # Split by double newlines (paragraphs)
    paragraphs = re.split(r'\n\n+', text)
    chunks = []
    current_chunk = []
    current_tokens = 0
    for para in paragraphs:
        para_tokens = estimate_tokens(para)
        if current_tokens + para_tokens > max_tokens:
            # Save current chunk
            if current_chunk:
                chunks.append('\n\n'.join(current_chunk))
            # Start new chunk with overlap: reuse trailing paragraphs of the
            # finished chunk until the overlap token budget is exhausted.
            overlap_paras = []
            overlap_tokens = 0
            for p in reversed(current_chunk):
                t = estimate_tokens(p)
                if overlap_tokens + t <= overlap:
                    overlap_paras.insert(0, p)
                    overlap_tokens += t
                else:
                    break
            current_chunk = overlap_paras + [para]
            current_tokens = overlap_tokens + para_tokens
        else:
            current_chunk.append(para)
            current_tokens += para_tokens
    # Flush the final partial chunk.
    if current_chunk:
        chunks.append('\n\n'.join(current_chunk))
    return chunks
```
## Technique 2: RAG — Retrieval-Augmented Generation
Don't put everything in the prompt. Retrieve only what's relevant:
```python
class SimpleRAG:
    """Minimal retrieval-augmented generation index over text documents."""

    def __init__(self, documents: list[str], chunk_size: int = 1000):
        self.chunks = self._create_chunks(documents, chunk_size)
        self.embeddings = self._create_embeddings(self.chunks)

    def retrieve(self, query: str, top_k: int = 3) -> list[str]:
        """Find most relevant chunks for query."""
        query_embedding = self._embed(query)
        scores = [
            self._cosine_similarity(query_embedding, e)
            for e in self.embeddings
        ]
        top_indices = sorted(range(len(scores)),
                             key=lambda i: scores[i],
                             reverse=True)[:top_k]
        return [self.chunks[i] for i in top_indices]

    def _create_chunks(self, documents: list[str], chunk_size: int) -> list[str]:
        # Flatten every document into paragraph-level chunks.
        chunks = []
        for doc in documents:
            chunks.extend(semantic_chunk(doc, max_tokens=chunk_size))
        return chunks

    def _create_embeddings(self, chunks: list[str]) -> list[list[float]]:
        # One embedding vector per chunk.
        return [self._embed(chunk) for chunk in chunks]

    def _cosine_similarity(self, a: list[float], b: list[float]) -> float:
        """Cosine similarity between two equal-length vectors (0.0 for zero vectors)."""
        dot = sum(x * y for x, y in zip(a, b))
        norm_a = sum(x * x for x in a) ** 0.5
        norm_b = sum(y * y for y in b) ** 0.5
        if norm_a == 0 or norm_b == 0:
            return 0.0
        return dot / (norm_a * norm_b)

    def _embed(self, text: str) -> list[float]:
        # In production, use OpenAI or ofox.ai embeddings
        raise NotImplementedError("plug in a real embedding backend")
```
## Technique 3: Conversation Summary
Summarize older messages to preserve context:
```python
class SummarizingConversation:
    """Chat history that compresses older messages into a running summary."""

    def __init__(self, max_tokens: int = 16000):
        self.max_tokens = max_tokens
        self.messages = []
        self.summary = ""

    def add_message(self, role: str, content: str):
        """Append a message, then compress history if over the token budget."""
        self.messages.append({"role": role, "content": content})
        self._maybe_summarize()

    def _maybe_summarize(self):
        total_tokens = sum(estimate_tokens(m["content"]) for m in self.messages)
        if total_tokens > self.max_tokens:
            # Summarize older messages
            older_messages = self.messages[:-5]  # Keep last 5
            recent = self.messages[-5:]
            transcript = "\n".join(
                f"{m['role']}: {m['content']}" for m in older_messages
            )
            summary_prompt = f"""
Summarize this conversation concisely, preserving key information:
{transcript}
"""
            # Call LLM to summarize (pseudocode)
            self.summary = call_llm_summarize(summary_prompt)
            # Replace the old turns with one system message carrying the
            # summary, keeping the recent turns verbatim.
            self.messages = [{"role": "system", "content": f"Prior context: {self.summary}"}] + recent

    def get_messages(self) -> list[dict]:
        return self.messages
```
## Technique 4: System Prompt Optimization
Keep system prompts lean:
```python
# ❌ Verbose system prompt (wastes tokens)
verbose_system = """
You are a helpful AI assistant. You are designed to be respectful,
professional, and helpful. You should provide accurate information
and be honest when you don't know something. You should ...
[200 more words]
"""

# ✅ Lean system prompt (effective)
lean_system = """
Role: helpful AI assistant
Goal: provide accurate, concise answers
When unsure: say "I don't know"
"""
```
## Technique 5: Streaming with Token Tracking
```python
class StreamingTokenTracker:
    """Streams chat completions while tracking estimated token usage and cost."""

    def __init__(self, model: str = "claude-3-5-sonnet-20241022"):
        self.model = model
        self.total_input_tokens = 0
        self.total_output_tokens = 0

    async def stream_chat(self, messages: list[dict]):
        """Yield response text chunks while tracking token usage.

        NOTE(review): urllib blocks the event loop; swap in an async HTTP
        client for real production use.
        """
        import json
        import urllib.request

        request = urllib.request.Request(
            'https://api.ofox.ai/v1/chat/completions',
            data=json.dumps({
                'model': self.model,
                'messages': messages,
                'stream': True,
            }).encode('utf-8'),
            headers={
                'Authorization': f'Bearer {API_KEY}',
                'Content-Type': 'application/json',
            },
            method='POST',
        )
        with urllib.request.urlopen(request) as response:
            # Server-sent events: one "data: {...}" payload per line.
            for raw_line in response:
                line = raw_line.decode('utf-8').strip()
                if not line.startswith('data: ') or line == 'data: [DONE]':
                    continue
                delta = json.loads(line[6:]).get('choices', [{}])[0].get('delta', {})
                if content := delta.get('content'):
                    self.total_output_tokens += estimate_tokens(content)
                    yield content
        # Track input tokens
        self.total_input_tokens = sum(
            estimate_tokens(m['content']) for m in messages
        )

    def get_cost(self, input_cost_per_1k=0.003, output_cost_per_1k=0.015) -> float:
        """Return the estimated dollar cost of the traffic seen so far."""
        input_cost = (self.total_input_tokens / 1000) * input_cost_per_1k
        output_cost = (self.total_output_tokens / 1000) * output_cost_per_1k
        return input_cost + output_cost
```
## Practical Rule of Thumb
Keep your prompt at < 50% of context window.
This leaves room for:
User input variations
Model reasoning
Unexpected response length
## Getting Started
Build token-efficient AI applications with ofox.ai — their OpenAI-compatible API gives you access to Claude with generous context windows at competitive pricing.
👉 Get started with ofox.ai
This article contains affiliate links.
Tags: llm,artificial-intelligence,programming,developer,performance
Canonical URL: https://dev.to/zny10289
Top comments (0)