I have reviewed a lot of AI systems that are running in production with no version control on their prompts. The prompts live in environment variables, in config files checked into main, in database rows with no history, or hardcoded into application logic. When something goes wrong, there is no way to know what the prompt looked like before the last change, who changed it, or why.
This is the equivalent of running a production database with no schema migration history. It works until something breaks and then you have no recovery path.
Here is the prompt versioning system I implement for production AI systems.
The core data model
A prompt is not a string. It is a versioned artifact with a deployment history.
from dataclasses import dataclass
from typing import Optional
from datetime import datetime
@dataclass
class PromptVersion:
    """One revision of a prompt's text plus the metadata describing it.

    ``tested`` and ``evaluation_score`` default to the "not yet evaluated"
    state so a newly authored version cannot be constructed as tested by
    accident; pass them explicitly when loading an evaluated version.
    """

    prompt_id: str  # stable identifier, e.g. "hr_query_system_prompt"
    version: str  # semver: "1.0.0", "1.1.0", "2.0.0"
    content: str  # the actual prompt text
    description: str  # what changed and why
    author: str  # who created this version
    created_at: datetime
    # NOTE(review): which environment this flag refers to is not visible
    # here -- confirm its meaning against the storage layer.
    is_active: bool
    tested: bool = False  # has this version passed evaluation tests
    evaluation_score: Optional[float] = None  # score from your eval suite
@dataclass
class PromptDeployment:
    """Record of one prompt version being deployed to one environment."""

    prompt_id: str
    version: str
    environment: str  # "production", "staging", "development"
    deployed_at: datetime
    deployed_by: str
    # Version that was active in this environment before this deployment;
    # None when nothing was active beforehand.
    previous_version: Optional[str] = None
Every change to a prompt is a new version. Deployment to an environment is a separate record. You can always see what is running where and roll back to any previous version.
The registry
class PromptRegistry:
    """Register prompt versions and track which one is deployed per environment.

    All persistence is delegated to ``db_connection``. This class relies only
    on the methods it calls: ``get_version``, ``insert_prompt_version``,
    ``get_active_deployment``, ``deactivate_deployment`` and
    ``insert_deployment``.
    """

    def __init__(self, db_connection):
        self.db = db_connection

    def register(self, prompt: PromptVersion) -> str:
        """Store a new prompt version and return its version string.

        Raises:
            ValueError: if this prompt_id/version pair is already stored.
        """
        # Reject if this version already exists
        if self.db.get_version(prompt.prompt_id, prompt.version):
            raise ValueError(f"Version {prompt.version} of {prompt.prompt_id} already exists")
        self.db.insert_prompt_version(prompt)
        return prompt.version

    def deploy(self, prompt_id: str, version: str, environment: str, deployed_by: str) -> None:
        """Make ``version`` the active deployment of ``prompt_id`` in ``environment``.

        Raises:
            ValueError: if the version is not stored, or is stored but has
                not been marked as tested.
        """
        prompt = self.db.get_version(prompt_id, version)
        if not prompt:
            raise ValueError(f"Version {version} of {prompt_id} not found")
        if not prompt.tested:
            raise ValueError(f"Version {version} has not passed evaluation tests")

        # Look at the deployment record itself rather than get_active():
        # get_active() returns None when the deployed version cannot be
        # resolved, which would skip deactivation and leave two active
        # deployments for the same environment.
        current = self.db.get_active_deployment(prompt_id, environment)
        if not current:
            previous_version = None
        elif current.version == version:
            # Re-deploying the active version: keep the existing rollback
            # target instead of recording the version as its own predecessor.
            previous_version = current.previous_version
        else:
            previous_version = current.version

        # NOTE(review): the deactivate and insert below are two separate
        # writes; if the store supports transactions, confirm they run in one.
        if current:
            self.db.deactivate_deployment(prompt_id, environment)

        deployment = PromptDeployment(
            prompt_id=prompt_id,
            version=version,
            environment=environment,
            # NOTE(review): naive local timestamp -- confirm whether the
            # store expects UTC / timezone-aware values.
            deployed_at=datetime.now(),
            deployed_by=deployed_by,
            previous_version=previous_version,
        )
        self.db.insert_deployment(deployment)

    def rollback(self, prompt_id: str, environment: str, rolled_back_by: str):
        """Re-deploy the version recorded as previous on the active deployment.

        The rollback goes through ``deploy``, so it is written as a new
        deployment record and is subject to the same checks.

        Raises:
            ValueError: if there is no active deployment, or the active
                deployment has no previous version recorded.
        """
        current_deployment = self.db.get_active_deployment(prompt_id, environment)
        if not current_deployment or not current_deployment.previous_version:
            raise ValueError("No previous version to roll back to")
        self.deploy(
            prompt_id=prompt_id,
            version=current_deployment.previous_version,
            environment=environment,
            deployed_by=rolled_back_by,
        )

    def get_active(self, prompt_id: str, environment: str) -> Optional[PromptVersion]:
        """Return the version currently deployed in ``environment``.

        Returns None when there is no active deployment; otherwise returns
        whatever ``get_version`` yields for the deployed version.
        """
        deployment = self.db.get_active_deployment(prompt_id, environment)
        if not deployment:
            return None
        return self.db.get_version(prompt_id, deployment.version)
The deploy method enforces that only tested versions can be deployed, whether to production or to any other environment. The rollback method is a first-class operation, not an emergency workaround.
Using the registry in your application
import os

registry = PromptRegistry(db_connection)


def build_prompt(user_query: str, retrieved_context: str) -> str:
    """Fill the active system prompt for this environment with context and query.

    The environment is read from ``APP_ENV`` and falls back to
    ``"production"`` when the variable is unset.

    Raises:
        RuntimeError: if no active prompt is found for the environment.
    """
    system_prompt_template = registry.get_active(
        "hr_query_system_prompt",
        environment=os.getenv("APP_ENV", "production"),
    )
    if not system_prompt_template:
        raise RuntimeError("No active system prompt found for environment")
    # The stored text is used as a str.format template: it is filled through
    # its {context} and {query} fields, and any literal braces in the prompt
    # must be doubled ({{ }}) or format() will raise.
    return system_prompt_template.content.format(
        context=retrieved_context,
        query=user_query,
    )
Your application never hardcodes a prompt. It always fetches the active version for its environment. When you deploy a new prompt version, the application picks it up without a code deployment.
The evaluation gate
The tested flag on PromptVersion is only set after the version has passed your evaluation suite. This prevents untested prompts from being deployed to production regardless of who is doing the deploying.
def run_evaluation_and_mark(registry, prompt_id, version, eval_suite):
    """Evaluate a stored prompt version and mark it tested if it passes.

    Args:
        registry: object whose ``db`` provides ``get_version`` and ``mark_tested``.
        prompt_id: identifier of the prompt to evaluate.
        version: version string of the prompt to evaluate.
        eval_suite: object providing ``run(content)`` and ``passing_threshold``.

    Returns:
        True if the score reached ``eval_suite.passing_threshold`` and the
        version was marked as tested, False otherwise.

    Raises:
        ValueError: if the requested version is not stored.
    """
    prompt = registry.db.get_version(prompt_id, version)
    if not prompt:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(f"Version {version} of {prompt_id} not found")

    score = eval_suite.run(prompt.content)
    passed = score >= eval_suite.passing_threshold
    if passed:
        registry.db.mark_tested(prompt_id, version, score)
        print(f"Version {version} passed evaluation with score {score:.3f}")
    else:
        print(f"Version {version} failed evaluation with score {score:.3f}. Deployment blocked.")
    return passed
This is the gate that stops a well-intentioned prompt edit from degrading production quality. You can tweak prompts as much as you want. They do not reach production until they pass the tests.
The whole system takes a day to build properly. The alternative is debugging production regressions with no history of what changed. I have done both. The day spent building the registry is worth it every time.
Top comments (0)