This article was originally published on AI Study Room. For the full version with working code examples and related articles, visit the original post.
LLM Version Management: Model Registry, A/B Testing, Rollback
Introduction
LLMs change frequently: new model releases, fine-tuned versions, updated system prompts, and modified retrieval pipelines all constitute "versions" of your AI system. Unlike traditional software where you can pin a dependency version, LLM behavior shifts with each API update. This article covers the tools and practices for managing LLM versions in production.
Model Registry
A model registry tracks metadata, lifecycle status, and deployment history for every registered model version:
from datetime import datetime
from enum import Enum
import json
class ModelStatus(Enum):
    """Lifecycle states stored in a registry entry's ``status`` field.

    The string values are what gets persisted, so they must stay stable.
    """

    STAGING = "staging"  # status given to every newly registered model
    CANARY = "canary"
    PRODUCTION = "production"  # status looked up when resolving the active model
    ROLLED_BACK = "rolled_back"
    DEPRECATED = "deprecated"
class ModelRegistry:
    """Track metadata and lifecycle status for model versions.

    Each entry is a plain dict persisted through ``storage_backend`` under the
    key ``models/<model_id>``. The backend must provide ``save(key, value)``,
    ``load(key)`` and ``load_all(pattern)``; those are the only calls made.
    """

    def __init__(self, storage_backend):
        self.storage = storage_backend

    def register_model(self, model_id: str, metadata: dict) -> dict:
        """Create and persist a registry entry for ``model_id``.

        New entries always start in the STAGING status with empty evaluation
        scores and deployment history. Missing ``metadata`` keys are stored as
        ``None`` (``parameters`` defaults to an empty dict).

        Returns:
            The entry dict that was saved.
        """
        entry = {
            "model_id": model_id,
            "provider": metadata.get("provider"),
            "version": metadata.get("version"),
            "description": metadata.get("description"),
            "parameters": metadata.get("parameters", {}),
            "system_prompt_hash": metadata.get("system_prompt_hash"),
            # NOTE(review): naive local time; consider UTC if entries are
            # written from hosts in different time zones.
            "registered_at": datetime.now().isoformat(),
            "status": ModelStatus.STAGING.value,
            "evaluation_scores": {},
            "deployment_history": [],
        }
        self.storage.save(f"models/{model_id}", entry)
        return entry

    def promote(self, model_id: str, target_status: ModelStatus) -> None:
        """Move ``model_id`` to ``target_status`` and log the transition.

        Raises:
            KeyError: if the backend returns ``None`` for the model's key,
                i.e. the model was never registered.
        """
        key = f"models/{model_id}"
        entry = self.storage.load(key)
        if entry is None:
            # Fail with a clear message instead of a TypeError from
            # subscripting None further down.
            raise KeyError(f"model {model_id!r} is not registered")
        entry["status"] = target_status.value
        entry["deployment_history"].append({
            "action": f"promoted_to_{target_status.value}",
            "timestamp": datetime.now().isoformat(),
        })
        self.storage.save(key, entry)

    def get_active_model(self) -> dict | None:
        """Return the current production model, or ``None`` if there is none.

        When several entries are in PRODUCTION, the one with the greatest
        ``registered_at`` value wins.
        """
        production = [
            model
            for model in self.storage.load_all("models/*")
            if model["status"] == ModelStatus.PRODUCTION.value
        ]
        return max(production, key=lambda m: m["registered_at"], default=None)
# Usage
# NOTE(review): `redis_client` is not defined in this snippet. Whatever is
# passed here must expose save(key, value), load(key) and load_all(pattern),
# since those are the calls ModelRegistry makes -- presumably a thin wrapper
# around a Redis connection; confirm before copying.
registry = ModelRegistry(storage_backend=redis_client)
registry.register_model(
    model_id="claude-sonnet-v4-1",
    metadata={
        "provider": "anthropic",
        "version": "claude-sonnet-4-20260512",
        "parameters": {"temperature": 0.7, "max_tokens": 4096},
    },
)
A/B Testing Framework
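Compare model versions on live traffic. Each user is assigned to a variant deterministically, and a winner is declared only after both variants have served at least 100 calls (a formal statistical-significance test is not shown here):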
import random
import hashlib
class ModelABTest:
def __init__(self, registry: ModelRegistry):
self.registry = registry
self.experiments = {}
def start_experiment(self, name: str, model_a: str, model_b: str, traffic_split: float = 0.5):
self.experiments[name] = {
"model_a": model_a,
"model_b": model_b,
"traffic_split": traffic_split,
"started_at": datetime.now().isoformat(),
"results": {"a": {"calls": 0, "errors": 0, "latency_ms": []},
"b": {"calls": 0, "errors": 0, "latency_ms": []}},
}
def select_model(self, experiment: str, user_id: str) -> str:
exp = self.experiments[experiment]
# Deterministic assignment based on user_id hash
hash_val = int(hashlib.md5(f"{experiment}:{user_id}".encode()).hexdigest(), 16)
if (hash_val % 1000) / 1000 < exp["traffic_split"]:
return exp["model_a"], "a"
return exp["model_b"], "b"
def record_result(self, experiment: str, variant: str, latency_ms: float, error: bool = False):
exp = self.experiments[experiment]
exp["results"][variant]["calls"] += 1
exp["results"][variant]["latency_ms"].append(latency_ms)
if error:
exp["results"][variant]["errors"] += 1
def get_winner(self, experiment: str) -> str | None:
exp = self.experiments[experiment]
results = exp["results"]
if results["a"]["calls"] < 100 or results["b"]["calls"] < 100:
return None # Not enough data
error_rate_a = results["a"]["errors"] / results["a"]["calls"]
error_rate_b = results["b"]["errors"] / results["b"]["calls"]
# Simple decision: lower error rate wins
if error_rate_a < error_rate_b:
return exp["model_a"]
return exp["model_b"]
Gradual Rollout
Deploy new models incrementally with automatic rollback:
class GradualR
Read the full article on AI Study Room for complete code examples, comparison tables, and related resources.
Found this useful? Check out more developer guides and tool comparisons on AI Study Room.
Top comments (0)