Testing AI-Powered Applications: Strategies for LLM Integration

Testing AI applications is fundamentally different from testing traditional software. There's no deterministic output, prompts change behavior, and edge cases multiply. Here's how to build a robust testing strategy for AI-powered applications.

The AI Testing Challenge

Traditional testing:

Input → Function → Expected Output

AI testing:

Input → Prompt + Context → Probabilistic Output

You can't assert exact outputs. Instead, you test properties.

Property-Based Testing for AI

// Instead of testing exact output, test properties

interface TestCase {
  input: string;
  constraints: Constraint[];
}

interface Constraint {
  type: 'contains' | 'excludes' | 'length' | 'format' | 'json';
  value: string | number | RegExp;
}

async function testAIOutput(testCase: TestCase, actualOutput: string): Promise<boolean> {
  for (const constraint of testCase.constraints) {
    switch (constraint.type) {
      case 'contains':
        if (!actualOutput.includes(constraint.value as string)) return false;
        break;
      case 'excludes':
        if (actualOutput.includes(constraint.value as string)) return false;
        break;
      case 'length': // interpreted as a maximum length
        if (actualOutput.length > (constraint.value as number)) return false;
        break;
      case 'format':
        if (!(constraint.value as RegExp).test(actualOutput)) return false;
        break;
      case 'json':
        try {
          JSON.parse(actualOutput);
        } catch {
          return false;
        }
        break;
    }
  }
  return true;
}

// Example test
const testCase: TestCase = {
  input: 'Extract the name and email from: John Doe, john@example.com',
  constraints: [
    { type: 'contains', value: 'John' },
    { type: 'contains', value: 'john@example.com' },
    { type: 'excludes', value: 'undefined' },
    { type: 'length', value: 100 }
  ]
};
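
To run a case end to end, generate the output once and hand it to testAIOutput. A minimal sketch, assuming a client object with a complete(prompt) method (the llm parameter here is a placeholder, not a specific SDK):

async function runPropertyTest(
  llm: { complete(prompt: string): Promise<string> },
  testCase: TestCase
): Promise<void> {
  // Generate the output, then check every declared property against it
  const output = await llm.complete(testCase.input);
  const passed = await testAIOutput(testCase, output);
  if (!passed) {
    throw new Error(`Property test failed for input: ${testCase.input}`);
  }
}

Because outputs vary between calls, consider running each case several times and asserting a pass rate rather than a single pass.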

Prompt Versioning and Regression Testing

import hashlib
from datetime import datetime

class PromptRegistry:
    def __init__(self):
        self.prompts = {}

    def register(self, name: str, version: str, prompt: str, test_cases: list):
        key = f"{name}:{version}"
        self.prompts[key] = {
            'prompt': prompt,
            'test_cases': test_cases,
            'hash': hashlib.md5(prompt.encode()).hexdigest(),
            'registered': datetime.now()
        }

    def get_prompt(self, name: str, version: str) -> str:
        return self.prompts[f"{name}:{version}"]['prompt']

    async def regression_test(self, name: str, old_version: str, new_version: str,
                              llm_client, threshold: float = 0.8) -> bool:
        """Ensure the new version passes the old version's test cases."""
        old_entry = self.prompts.get(f"{name}:{old_version}")
        if not old_entry:
            return True

        old_passes = 0
        new_passes = 0

        for tc in old_entry['test_cases']:
            old_result = await llm_client.complete(old_entry['prompt'] + tc['input'])
            new_result = await llm_client.complete(
                self.get_prompt(name, new_version) + tc['input']
            )

            # check_constraints: a Python port of the testAIOutput checker above
            if check_constraints(tc, old_result):
                old_passes += 1
            if check_constraints(tc, new_result):
                new_passes += 1

        # New version must not regress and must clear the pass-rate threshold
        total = len(old_entry['test_cases'])
        return new_passes >= old_passes and (new_passes / total) >= threshold

Deterministic Output Testing

For structured outputs, test deterministically:

import { z } from 'zod';

const CodeReviewSchema = z.object({
  score: z.number().min(0).max(10),
  issues: z.array(z.object({
    severity: z.enum(['low', 'medium', 'high']),
    line: z.number(),
    description: "z.string()"
  })),
  summary: z.string()
});

async function testCodeReview(code: string, expectedScoreRange: [number, number]) {
  const response = await llm.complete(
    `Review this code and return JSON: ${code}`
  );

  // Parse and validate
  const parsed = JSON.parse(response);
  const validated = CodeReviewSchema.parse(parsed);

  // Deterministic assertions
  console.assert(
    validated.score >= expectedScoreRange[0] && 
    validated.score <= expectedScoreRange[1],
    `Score ${validated.score} outside expected range`
  );

  console.assert(
    validated.issues.length < 20,
    'Too many issues reported'
  );

  return validated;
}
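
LLMs occasionally return JSON that parses but fails schema validation. One pragmatic pattern is to re-prompt with the validation error and retry. A sketch using Zod's safeParse (the retry count and re-prompt wording are assumptions, not a library API):

async function completeWithSchema<T>(
  llm: { complete(prompt: string): Promise<string> },
  prompt: string,
  schema: z.ZodType<T>,
  retries = 2
): Promise<T> {
  let lastError = '';
  for (let attempt = 0; attempt <= retries; attempt++) {
    // Feed the validation error back so the model can self-correct
    const response = await llm.complete(
      lastError ? `${prompt}\n\nYour previous output was invalid: ${lastError}` : prompt
    );
    const result = schema.safeParse(tryParseJson(response));
    if (result.success) return result.data;
    lastError = result.error.message;
  }
  throw new Error(`Schema validation failed after ${retries + 1} attempts`);
}

function tryParseJson(text: string): unknown {
  try { return JSON.parse(text); } catch { return null; }
}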

Mocking External AI Calls

// For unit tests, mock the LLM client
class MockLLMClient {
  constructor(private fixtures: Map<string, string>) {}

  async complete(prompt: string): Promise<string> {
    // Return fixture matching prompt pattern
    for (const [pattern, response] of this.fixtures) {
      if (prompt.includes(pattern)) {
        return response;
      }
    }
    return 'Mock response';
  }

  async *stream(prompt: string): AsyncGenerator<string> {
    const response = await this.complete(prompt);
    for (const char of response) {
      yield char;
    }
  }
}

// Usage in tests
const mockClient = new MockLLMClient(new Map([
  ['extract email', '{"email": "test@example.com"}'],
  ['summarize', 'This is a summary of the text.']
]));

// Now your business logic tests run fast and deterministically
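
Wired into a test, the mock lets you assert on your own logic instead of the model's. A sketch, assuming a hypothetical extractEmail function that calls client.complete and parses the result (both names are placeholders):

describe('extractEmail', () => {
  it('parses the email field from the LLM response', async () => {
    // The mock returns the fixture registered for the 'extract email' pattern
    const result = await extractEmail(mockClient, 'extract email from: test@example.com');
    expect(result).toBe('test@example.com');
  });

  it('falls back to null when the response is not valid JSON', async () => {
    const emptyClient = new MockLLMClient(new Map());
    const result = await extractEmail(emptyClient, 'extract email from: no email here');
    expect(result).toBeNull();
  });
});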

Chaos Testing for AI Applications

import pytest

# Assumptions: RateLimitError comes from your LLM client library, and
# safe_parse_json is your app's defensive JSON parser (a TypeScript
# counterpart is sketched after this class).
class AIChaosTests:
    def test_rate_limits(self, client):
        """Does your app handle rate limits gracefully?"""
        for _ in range(100):
            try:
                client.complete("test")
            except RateLimitError:
                assert client.retry_count > 0
                break
        else:
            pytest.fail("Rate limit not encountered after 100 requests")

    def test_invalid_json(self, client):
        """Does your app handle malformed JSON from LLM?"""
        # Inject bad response
        client.mock_response('{"broken": }')
        result = safe_parse_json(client.complete("test"))
        assert result is not None  # Handled gracefully

    def test_empty_context(self, client):
        """Does your app handle empty context?"""
        result = client.complete("")
        assert result is not None

    def test_max_tokens_respected(self, client):
        """Does max_tokens actually limit output?"""
        result = client.complete("test", max_tokens=10)
        assert len(result) <= 50  # 10 tokens is roughly 40-50 characters
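
The safe_parse_json helper the chaos test leans on isn't shown above. In the TypeScript services from the earlier sections, the equivalent guard might look like this (a minimal sketch, not a library function):

function safeParseJson(text: string): unknown | null {
  // LLMs often wrap JSON in markdown fences or prose; strip fences first
  const cleaned = text.replace(/^```(?:json)?\s*/m, '').replace(/```\s*$/m, '');
  try {
    return JSON.parse(cleaned);
  } catch {
    // Last resort: try the outermost brace-delimited span
    const match = cleaned.match(/\{[\s\S]*\}/);
    if (match) {
      try { return JSON.parse(match[0]); } catch { /* fall through */ }
    }
    return null;
  }
}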

Integration Test Framework

describe('AI Integration Tests', () => {
  const client = new ClaudeClient(process.env.OFOX_API_KEY);

  describe('Code Review Feature', () => {
    it('identifies syntax errors', async () => {
      const code = 'const x = ;';
      const review = await reviewCode(client, code);
      expect(review.issues.some(i => i.severity === 'high')).toBe(true);
    });

    it('handles valid code gracefully', async () => {
      const code = 'const x = 42;';
      const review = await reviewCode(client, code);
      expect(review.issues.filter(i => i.severity === 'high')).toHaveLength(0);
    });

    it('respects max issues limit', async () => {
      const code = '...'; // Large code
      const review = await reviewCode(client, code, { maxIssues: 10 });
      expect(review.issues.length).toBeLessThanOrEqual(10);
    });
  });
});

Building Testable AI Systems

  1. Separate concerns — Keep prompts in config, not buried in code
  2. Structured outputs — Use Zod/JSON Schema to constrain responses
  3. Fallback handling — Plan for API failures at every call site
  4. Snapshot testing — Store expected responses for regression (see the sketch below)
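
For point 4, a lightweight approach is to snapshot a normalized form of the response, so incidental wording changes don't break the test. A sketch using Jest snapshots against the reviewCode feature above (the inline normalization is one possible scheme, not a prescribed one):

it('matches the stored review shape', async () => {
  const review = await reviewCode(client, 'const x = 42;');
  // Snapshot structure, not prose: bucket the score and sort severities
  const normalized = {
    scoreBucket: Math.round(review.score / 2), // 0-5 buckets absorb small drift
    severities: review.issues.map(i => i.severity).sort(),
  };
  expect(normalized).toMatchSnapshot();
});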

Getting Started

Build testable AI applications with ofox.ai — their API is reliable and consistent, making it easier to build deterministic test suites.

👉 Get started with ofox.ai


This article contains affiliate links.


Tags: testing,ai,programming,developer,quality
Canonical URL: https://dev.to/zny10289
