Testing AI applications is fundamentally different from testing traditional software. There's no deterministic output, prompts change behavior, and edge cases multiply. Here's how to build a robust testing strategy for AI-powered applications.
The AI Testing Challenge
Traditional testing:
Input → Function → Expected Output
AI testing:
Input → Prompt + Context → Probabilistic Output
You can't assert exact outputs. Instead, you test properties.
Property-Based Testing for AI
`typescript
// Instead of testing exact output, test properties
/** A single test case: the prompt input plus the properties its output must satisfy. */
type TestCase = {
  input: string;
  constraints: Constraint[];
};

/** One verifiable property of an AI output (substring, length cap, format, JSON validity). */
type Constraint = {
  type: 'contains' | 'excludes' | 'length' | 'format' | 'json';
  value: string | number | RegExp;
};
/**
 * Checks an AI output against a test case's property constraints.
 * Returns true only when every constraint holds.
 *
 * Fixes: return type was a bare `Promise`; the declared 'format'
 * constraint variant was never handled by the switch.
 */
async function testAIOutput(testCase: TestCase, actualOutput: string): Promise<boolean> {
  for (const constraint of testCase.constraints) {
    switch (constraint.type) {
      case 'contains':
        // Output must include the literal substring.
        if (!actualOutput.includes(constraint.value as string)) return false;
        break;
      case 'excludes':
        // Output must NOT include the literal substring.
        if (actualOutput.includes(constraint.value as string)) return false;
        break;
      case 'length':
        // value is the maximum allowed character length.
        if (actualOutput.length > (constraint.value as number)) return false;
        break;
      case 'format':
        // Output must match the supplied regular expression.
        if (!(constraint.value as RegExp).test(actualOutput)) return false;
        break;
      case 'json':
        // Output must be parseable JSON.
        try {
          JSON.parse(actualOutput);
        } catch {
          return false;
        }
        break;
    }
  }
  return true;
}
// Example test
// Example: an extraction task tested via properties rather than exact-match output.
const testCase: TestCase = {
  input: 'Extract the name and email from: John Doe, john@example.com',
  constraints: [
    { type: 'contains', value: 'John' },             // the name must appear
    { type: 'contains', value: 'john@example.com' }, // the email must appear
    { type: 'excludes', value: 'undefined' },        // no serialization artifacts
    { type: 'length', value: 100 },                  // keep the answer concise
  ],
};
`
Prompt Versioning and Regression Testing
`python
import hashlib
from datetime import datetime
class PromptRegistry:
    """Registry of versioned prompts plus the test cases used to regression-test them.

    Fixes vs. the published snippet (markdown stripped the underscores):
    ``__init__``, ``test_cases``, snake_case locals, a missing ``old_version``
    parameter, ``async def`` for the awaiting method, and the ``#`` on the
    final comment line.
    """

    def __init__(self):
        # Maps "name:version" -> {'prompt', 'test_cases', 'hash', 'registered'}.
        self.prompts = {}

    def register(self, name: str, version: str, prompt: str, test_cases: list):
        """Register a prompt version together with its regression test cases."""
        key = f"{name}:{version}"
        self.prompts[key] = {
            'prompt': prompt,
            'test_cases': test_cases,
            # Content hash makes silent prompt edits detectable later.
            'hash': hashlib.md5(prompt.encode()).hexdigest(),
            'registered': datetime.now()
        }

    def get_prompt(self, name: str, version: str) -> str:
        """Return the registered prompt text for name/version (KeyError if absent)."""
        return self.prompts[f"{name}:{version}"]['prompt']

    async def regression_test(self, name: str, old_version: str, new_version: str,
                              llm_client, threshold: float = 0.8) -> bool:
        """Ensure the new version passes existing test cases.

        Runs the old version's test cases against both prompt versions and
        accepts the new one when its pass rate meets ``threshold``.
        """
        old_prompt = self.prompts.get(f"{name}:{old_version}")
        if not old_prompt:
            # Nothing registered yet -> nothing to regress against.
            return True
        old_passes = 0
        new_passes = 0
        for tc in old_prompt['test_cases']:
            old_result = await llm_client.complete(old_prompt['prompt'] + tc['input'])
            new_result = await llm_client.complete(
                self.get_prompt(name, new_version) + tc['input']
            )
            # testAIOutput: the property-based checker from earlier in the article;
            # assumed available here — confirm how it is shared across languages.
            old_ok = await testAIOutput(tc, old_result)
            new_ok = await testAIOutput(tc, new_result)
            if old_ok: old_passes += 1
            if new_ok: new_passes += 1
        # NOTE(review): old_passes is computed but the acceptance criterion below
        # only compares the NEW pass rate against the threshold.
        return (new_passes / len(old_prompt['test_cases'])) >= threshold
`
Deterministic Output Testing
For structured outputs, test deterministically:
`typescript
import { z } from 'zod';
// Shape of a single reported issue within a review.
const IssueSchema = z.object({
  severity: z.enum(['low', 'medium', 'high']),
  line: z.number(),
  description: z.string()
});

// Contract the LLM's structured code-review response must satisfy.
const CodeReviewSchema = z.object({
  score: z.number().min(0).max(10), // 0 = worst, 10 = best
  issues: z.array(IssueSchema),
  summary: z.string()
});
async function testCodeReview(code: string, expectedScoreRange: [number, number]) {
const response = await llm.complete(
Review this code and return JSON: ${code}
);
// Parse and validate
const parsed = JSON.parse(response);
const validated = CodeReviewSchema.parse(parsed);
// Deterministic assertions
console.assert(
validated.score >= expectedScoreRange[0] &&
validated.score <= expectedScoreRange[1],
Score ${validated.score} outside expected range
);
console.assert(
validated.issues.length < 20,
'Too many issues reported'
);
return validated;
}
`
Mocking External AI Calls
`typescript
// For unit tests, mock the LLM client
// For unit tests, mock the LLM client so business-logic tests run fast and deterministically.
// Fix: the bare generics `Map`, `Promise`, and `AsyncGenerator` (invalid under strict TS)
// now carry their type arguments.
class MockLLMClient {
  /** @param fixtures map of prompt-substring pattern -> canned response */
  constructor(private fixtures: Map<string, string>) {}

  /** Returns the first fixture whose pattern appears in the prompt. */
  async complete(prompt: string): Promise<string> {
    for (const [pattern, response] of this.fixtures) {
      if (prompt.includes(pattern)) {
        return response;
      }
    }
    // Fallback keeps unmatched prompts from failing tests unexpectedly.
    return 'Mock response';
  }

  /** Simulates token streaming by yielding the response one character at a time. */
  async *stream(prompt: string): AsyncGenerator<string> {
    const response = await this.complete(prompt);
    for (const char of response) {
      yield char;
    }
  }
}
// Usage in tests
// Usage in tests: each fixture maps a prompt fragment to a canned response.
const fixtures = new Map([
  ['extract email', '{"email": "test@example.com"}'],
  ['summarize', 'This is a summary of the text.']
]);
const mockClient = new MockLLMClient(fixtures);
// Now your business logic tests run fast and deterministically
`
Chaos Testing for AI Applications
`python
class AIChaosTests:
    """Fault-injection tests: verify the app degrades gracefully under failures.

    Fixes vs. the published snippet: method names regained their stripped
    underscores, and the bare "Inject bad response" line regained its ``#``
    (it was a NameError as code). ``RateLimitError``, ``pytest``,
    ``safe_parse_json``, and the ``client`` fixture are assumed to be provided
    by the surrounding test suite — confirm against your project.
    """

    def test_rate_limits(self, client):
        """Does your app handle rate limits gracefully?"""
        for _ in range(100):
            try:
                client.complete("test")
            except RateLimitError:
                # The client should have retried before surfacing the error.
                assert client.retry_count > 0
                break
        else:
            pytest.fail("Rate limit not encountered after 100 requests")

    def test_invalid_json(self, client):
        """Does your app handle malformed JSON from the LLM?"""
        # Inject a syntactically broken response.
        client.mock_response('{"broken": }')
        result = safe_parse_json(client.complete("test"))
        assert result is not None  # Handled gracefully

    def test_empty_context(self, client):
        """Does your app handle empty context?"""
        result = client.complete("")
        assert result is not None

    def test_max_tokens_respected(self, client):
        """Does max_tokens actually limit output?"""
        result = client.complete("test", max_tokens=10)
        # ~10 tokens is assumed to be roughly <= 50 characters — TODO confirm.
        assert len(result) <= 50
`
Integration Test Framework
`typescript
// Integration tests hit the real API; keep them out of the fast unit-test path.
// Fix: the env var name lost its underscores in publishing (OFOXAPIKEY -> OFOX_API_KEY).
describe('AI Integration Tests', () => {
  const client = new ClaudeClient(process.env.OFOX_API_KEY);

  describe('Code Review Feature', () => {
    it('identifies syntax errors', async () => {
      const code = 'const x = ;'; // deliberately malformed
      const review = await reviewCode(client, code);
      // Broken syntax should surface at least one high-severity issue.
      expect(review.issues.some(i => i.severity === 'high')).toBe(true);
    });

    it('handles valid code gracefully', async () => {
      const code = 'const x = 42;';
      const review = await reviewCode(client, code);
      // Clean code should produce no high-severity issues.
      expect(review.issues.filter(i => i.severity === 'high')).toHaveLength(0);
    });

    it('respects max issues limit', async () => {
      const code = '...'; // Large code
      const review = await reviewCode(client, code, { maxIssues: 10 });
      expect(review.issues.length).toBeLessThanOrEqual(10);
    });
  });
});
`
Building Testable AI Systems
- Separate concerns — Keep prompts in config, not buried in code
- Structured outputs — Use Zod/JSON Schema to constrain responses
- Fallback handling — Plan for API failures at every call site
- Snapshot testing — Store expected responses for regression
Getting Started
Build testable AI applications with ofox.ai — their API is reliable and consistent, making it easier to build deterministic test suites.
👉 Get started with ofox.ai
This article contains affiliate links.
Tags: testing,ai,programming,developer,quality
Canonical URL: https://dev.to/zny10289
Top comments (0)