Building a Multi-Model AI Router in Python with Novastack 🚀
When you’re building production AI applications, managing multiple API keys, endpoints, and client libraries for different LLM providers becomes a maintenance nightmare. Today I’ll show you how to unify access to three powerhouse models through a single endpoint using Novastack’s token-forwarding platform.
Why a Multi-Model Router Matters
Most AI applications benefit from using different models for different tasks. Maybe DeepSeek-V4-Pro handles your code generation while Claude-Opus-4.7 manages creative writing. But traditional setups require:
- Separate API keys for each provider
- Different client libraries and authentication methods
- Manual failover logic when a provider goes down
- Complex routing logic scattered throughout your codebase
Novastack solves this elegantly by providing a single OpenAI-compatible endpoint that routes each request to the appropriate model based on the `model` parameter. One key, one endpoint, three models.
Setting Up Our Router
Let’s build a production-grade model router that intelligently directs requests while maintaining clean error handling and fallback capabilities.
Prerequisites
```bash
pip install openai httpx
```
Get your API key from https://novapai.ai, then configure your environment:
```python
import os
from typing import Optional, Dict, Any, List

from openai import OpenAI

# Novastack configuration
NOVASTACK_API_KEY = os.getenv("NOVASTACK_API_KEY")
NOVASTACK_BASE_URL = "https://api.novapai.ai/router/v1"

# Available models
AVAILABLE_MODELS = {
    "qwen": "Qwen3-235B-A22B",
    "deepseek": "DeepSeek-V4-Pro",
    "claude": "Claude-Opus-4.7",
}
```
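Assuming the endpoint is OpenAI-compatible as described, a quick smoke test confirms all three models answer through one client before we build anything on top:

```python
# Quick smoke test: one key, one endpoint, three models.
client = OpenAI(api_key=NOVASTACK_API_KEY, base_url=NOVASTACK_BASE_URL)

for alias, model_id in AVAILABLE_MODELS.items():
    resp = client.chat.completions.create(
        model=model_id,
        messages=[{"role": "user", "content": "Reply with one word: ready?"}],
        max_tokens=8,
    )
    print(f"{alias}: {resp.choices[0].message.content}")
```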
Core Router Implementation
Here’s where the magic happens. We’re building a client wrapper that selects models based on task requirements while maintaining a consistent interface:
```python
class NovastackRouter:
    def __init__(self, api_key: Optional[str] = NOVASTACK_API_KEY):
        self.client = OpenAI(
            api_key=api_key,
            base_url=NOVASTACK_BASE_URL,
        )
        self.default_model = AVAILABLE_MODELS["qwen"]

    def route_by_complexity(self, task: str) -> str:
        """Intelligent model selection based on task requirements."""
        complexity_indicators = {
            "code_review": ["debug", "refactor", "code review", "optimize"],
            "creative": ["story", "poem", "creative", "brainstorm"],
            "analysis": ["analyze", "summarize", "research", "explain"],
        }
        task_lower = task.lower()
        if any(indicator in task_lower for indicator in complexity_indicators["code_review"]):
            return AVAILABLE_MODELS["deepseek"]
        elif any(indicator in task_lower for indicator in complexity_indicators["creative"]):
            return AVAILABLE_MODELS["claude"]
        else:
            return AVAILABLE_MODELS["qwen"]

    def chat(
        self,
        messages: List[Dict[str, str]],
        model: Optional[str] = None,
        auto_route: bool = False,
        **kwargs,
    ) -> Dict[str, Any]:
        """Main chat interface with optional auto-routing."""
        if auto_route and messages:
            # Route on the user's most recent message
            user_prompt = next(
                (msg["content"] for msg in reversed(messages) if msg["role"] == "user"),
                "",
            )
            selected_model = self.route_by_complexity(user_prompt)
        else:
            selected_model = model or self.default_model

        try:
            # Whitelist request parameters for stability; unknown kwargs are dropped.
            # Note: chat() assumes a non-streaming response. For streaming, call
            # the client directly (see Scenario 2 below).
            params = {
                "model": selected_model,
                "messages": messages,
                "max_tokens": kwargs.get("max_tokens", 4096),
                "temperature": kwargs.get("temperature", 0.7),
            }
            response = self.client.chat.completions.create(**params)
            return self._format_response(response, selected_model)
        except Exception as e:
            return self._handle_failure(e, messages)

    def _format_response(self, response, model_used: str) -> Dict[str, Any]:
        """Normalize response format across different models."""
        return {
            "content": response.choices[0].message.content,
            "model_used": model_used,
            "usage": {
                "prompt_tokens": response.usage.prompt_tokens,
                "completion_tokens": response.usage.completion_tokens,
                "total_tokens": response.usage.total_tokens,
            },
            "finish_reason": response.choices[0].finish_reason,
        }

    def _handle_failure(self, error: Exception, messages: List[Dict[str, str]]) -> Dict[str, Any]:
        """Fallback mechanism when the primary model fails."""
        # Attempt a fallback to the Qwen model
        try:
            response = self.client.chat.completions.create(
                model=AVAILABLE_MODELS["qwen"],
                messages=messages,
            )
            return {
                **self._format_response(response, AVAILABLE_MODELS["qwen"]),
                "fallback_triggered": True,
            }
        except Exception as fallback_error:
            raise RuntimeError(f"All models failed: {error}") from fallback_error
```
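Before wiring the router into an app, you can sanity-check the keyword heuristic without spending any tokens:

```python
router = NovastackRouter()

# Routing decisions only -- no API calls are made here
for prompt in [
    "Please refactor this function",   # -> DeepSeek (code_review keyword)
    "Write a poem about the sea",      # -> Claude (creative keyword)
    "What's the capital of France?",   # -> Qwen (default)
]:
    print(f"{prompt!r} -> {router.route_by_complexity(prompt)}")
```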
Production-Ready Usage Patterns
Now let’s put this router through its paces with real-world scenarios:
Scenario 1: Smart Task Routing
```python
router = NovastackRouter()

# This will automatically route to DeepSeek for code tasks
code_response = router.chat(
    messages=[{
        "role": "user",
        "content": "Debug this Python function that's causing a memory leak"
    }],
    auto_route=True
)

print(f"Routed to: {code_response['model_used']}")
print(f"Solution: {code_response['content'][:200]}...")
```
Scenario 2: Explicit Model Selection with Streaming
```python
# Manual selection of Claude for creative writing
stream_response = router.client.chat.completions.create(
    model=AVAILABLE_MODELS["claude"],
    messages=[{
        "role": "user",
        "content": "Write a short story about an AI discovering emotions"
    }],
    stream=True,
    max_tokens=500,
    temperature=0.9  # Higher creativity
)

for chunk in stream_response:
    # Some stream chunks carry no content delta, so guard before printing
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
```
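One caveat: streaming goes through the underlying client directly, so it bypasses the router's response normalization and Qwen fallback. If you need failover for streams, wrap the iteration in your own try/except.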
Scenario 3: Parallel Multi-Model Analysis
```python
from concurrent.futures import ThreadPoolExecutor
import time

def analyze_with_model(model_name: str, prompt: str):
    router = NovastackRouter()
    return router.chat(
        messages=[{"role": "user", "content": prompt}],
        model=model_name,
    )
```
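With the per-model worker defined, one way to fan a prompt out to all three models concurrently and compare the answers is a sketch like this, built on the `analyze_with_model` helper above (the `parallel_analysis` name and the sample prompt are illustrative):

```python
def parallel_analysis(prompt: str) -> Dict[str, Dict[str, Any]]:
    """Send the same prompt to every model at once and collect the results."""
    start = time.time()
    with ThreadPoolExecutor(max_workers=len(AVAILABLE_MODELS)) as pool:
        futures = {
            alias: pool.submit(analyze_with_model, model_id, prompt)
            for alias, model_id in AVAILABLE_MODELS.items()
        }
        results = {alias: future.result() for alias, future in futures.items()}
    print(f"Three models answered in {time.time() - start:.1f}s")
    return results

results = parallel_analysis("Summarize the trade-offs of microservices vs monoliths")
for alias, result in results.items():
    print(f"\n--- {alias} ({result['model_used']}) ---")
    print(result["content"][:200])
```

Because the work is I/O-bound, threads are enough here; the total wall-clock time is roughly that of the slowest model rather than the sum of all three.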