You've built an LLM-powered feature. It works in development. Users love the demo.
Then it goes to production.
Suddenly, you're facing questions you didn't consider: How much is this costing? Why did that response take 30 seconds? Did we just hit our rate limit? Why can't I reproduce this bug?
This final article covers the patterns that separate prototypes from production systems.
Observability: You Can't Fix What You Can't See
LLM calls are black boxes. You send tokens in, tokens come out. Without proper observability, debugging is guesswork.
OpenTelemetry Tracing
Instrument every LLM call with distributed tracing:
/// <summary>
/// Chat-client middleware that wraps every completion in an OpenTelemetry
/// <see cref="Activity"/> named "chat.completion" and records request and
/// response attributes: model, message count, token usage, duration,
/// requested function calls, and error details on failure.
/// </summary>
public class TracingChatClientMiddleware : DelegatingChatClient
{
    private static readonly ActivitySource ActivitySource = new("AI.ChatClient");
    private readonly string _serviceName;

    public TracingChatClientMiddleware(IChatClient inner, string serviceName = "ai-service")
        : base(inner)
    {
        _serviceName = serviceName;
    }

    public override async Task<ChatCompletion> CompleteAsync(
        IList<ChatMessage> chatMessages,
        ChatOptions? options = null,
        CancellationToken cancellationToken = default)
    {
        using var activity = ActivitySource.StartActivity(
            "chat.completion",
            ActivityKind.Client);

        // No listener sampled this call: skip all tagging work entirely.
        if (activity == null)
            return await base.CompleteAsync(chatMessages, options, cancellationToken);

        // Request attributes
        activity.SetTag("ai.service", _serviceName);
        activity.SetTag("ai.model", options?.ModelId ?? "default");
        activity.SetTag("ai.message_count", chatMessages.Count);
        activity.SetTag("ai.has_tools", options?.Tools?.Any() ?? false);

        // Approximate input size in characters. Sum the per-message lengths
        // plus one separator per gap instead of allocating the whole
        // conversation into a single throwaway joined string (the original
        // built the full string only to read .Length).
        var inputLength = chatMessages.Sum(m => m.Text?.Length ?? 0);
        if (chatMessages.Count > 1)
            inputLength += chatMessages.Count - 1;
        activity.SetTag("ai.input_length", inputLength);

        var stopwatch = Stopwatch.StartNew();
        try
        {
            var result = await base.CompleteAsync(chatMessages, options, cancellationToken);
            stopwatch.Stop();

            // Response attributes
            activity.SetTag("ai.status", "success");
            activity.SetTag("ai.finish_reason", result.FinishReason?.ToString());
            activity.SetTag("ai.duration_ms", stopwatch.ElapsedMilliseconds);

            if (result.Usage != null)
            {
                activity.SetTag("ai.input_tokens", result.Usage.InputTokenCount);
                activity.SetTag("ai.output_tokens", result.Usage.OutputTokenCount);
                activity.SetTag("ai.total_tokens", result.Usage.TotalTokenCount);
            }

            // Track any function calls the model requested in this turn.
            var functionCalls = result.Message.Contents
                .OfType<FunctionCallContent>()
                .ToList();
            if (functionCalls.Any())
            {
                activity.SetTag("ai.function_calls",
                    string.Join(",", functionCalls.Select(f => f.Name)));
            }

            // Mark the span explicitly OK (the original left status Unset on
            // success, which some backends render ambiguously).
            activity.SetStatus(ActivityStatusCode.Ok);
            return result;
        }
        catch (Exception ex)
        {
            stopwatch.Stop();
            activity.SetTag("ai.status", "error");
            activity.SetTag("ai.error_type", ex.GetType().Name);
            activity.SetTag("ai.error_message", ex.Message);
            activity.SetTag("ai.duration_ms", stopwatch.ElapsedMilliseconds);
            activity.SetStatus(ActivityStatusCode.Error, ex.Message);
            throw; // rethrow preserving the original stack trace
        }
    }
}
Registration with OpenTelemetry
// Configure OpenTelemetry: collect spans from the custom AI ActivitySources
// plus the built-in ASP.NET Core and HttpClient instrumentation, and export
// everything over OTLP to the configured collector.
builder.Services.AddOpenTelemetry()
    .ConfigureResource(resource => resource
        .AddService("my-ai-service"))
    .WithTracing(tracing => tracing
        .AddSource("AI.ChatClient")
        .AddSource("AI.FunctionCalling")
        .AddAspNetCoreInstrumentation()
        .AddHttpClientInstrumentation()
        .AddOtlpExporter());
// Register the traced chat client: wrap the concrete provider client in the
// tracing middleware so every completion call produces a span.
builder.Services.AddChatClient(sp =>
{
    var inner = CreateChatClient(sp);
    return inner
        .AsBuilder()
        .Use((client, _) => new TracingChatClientMiddleware(client))
        .Build(sp);
});
Structured Logging
Complement tracing with structured logs:
/// <summary>
/// Middleware that emits structured log entries for every chat completion:
/// a debug line on start, an info line with duration and token usage on
/// success, and an error line on failure — all inside a logging scope that
/// carries a correlation id, model id, and message count.
/// </summary>
public class LoggingChatClientMiddleware : DelegatingChatClient
{
    private readonly ILogger<LoggingChatClientMiddleware> _logger;

    public LoggingChatClientMiddleware(IChatClient inner, ILogger<LoggingChatClientMiddleware> logger)
        : base(inner)
    {
        _logger = logger;
    }

    public override async Task<ChatCompletion> CompleteAsync(
        IList<ChatMessage> chatMessages,
        ChatOptions? options = null,
        CancellationToken cancellationToken = default)
    {
        // Prefer the current trace id so log lines correlate with spans;
        // fall back to a fresh GUID when no activity is in flight.
        var correlationId = Activity.Current?.Id ?? Guid.NewGuid().ToString();

        var scopeState = new Dictionary<string, object>
        {
            ["CorrelationId"] = correlationId,
            ["Model"] = options?.ModelId ?? "default",
            ["MessageCount"] = chatMessages.Count
        };
        using var scope = _logger.BeginScope(scopeState);

        _logger.LogDebug(
            "Starting chat completion with {MessageCount} messages",
            chatMessages.Count);

        var timer = Stopwatch.StartNew();
        try
        {
            var completion = await base.CompleteAsync(chatMessages, options, cancellationToken);

            _logger.LogInformation(
                "Chat completion succeeded in {DurationMs}ms. " +
                "Tokens: {InputTokens} in, {OutputTokens} out",
                timer.ElapsedMilliseconds,
                completion.Usage?.InputTokenCount ?? 0,
                completion.Usage?.OutputTokenCount ?? 0);

            return completion;
        }
        catch (Exception ex)
        {
            _logger.LogError(ex,
                "Chat completion failed after {DurationMs}ms",
                timer.ElapsedMilliseconds);
            throw;
        }
    }
}
Testing LLM Applications
Testing non-deterministic systems requires different strategies than traditional unit tests.
Mock Providers for Unit Tests
/// <summary>
/// In-memory <see cref="IChatClient"/> for unit tests: queue canned
/// responses (plain text or function calls), run the system under test,
/// then assert on the messages it actually sent.
/// </summary>
public class MockChatClient : IChatClient
{
    private readonly Queue<ChatCompletion> _responses = new();
    private readonly List<IList<ChatMessage>> _receivedMessages = new();

    public ChatClientMetadata Metadata => new("mock", new Uri("http://localhost"), "mock-model");

    /// <summary>Queues a plain-text assistant reply.</summary>
    public void EnqueueResponse(string text)
    {
        var message = new ChatMessage(ChatRole.Assistant, text);
        var completion = new ChatCompletion(message);
        _responses.Enqueue(completion);
    }

    /// <summary>
    /// Queues an assistant reply containing a single function call whose
    /// arguments are taken from the public properties of
    /// <paramref name="arguments"/> (typically an anonymous object).
    /// </summary>
    public void EnqueueFunctionCall(string name, object arguments)
    {
        var content = new FunctionCallContent(
            Guid.NewGuid().ToString(),
            name,
            new Dictionary<string, object?>(
                arguments.GetType()
                    .GetProperties()
                    .ToDictionary(p => p.Name, p => p.GetValue(arguments))));
        var message = new ChatMessage(ChatRole.Assistant, new[] { content });
        _responses.Enqueue(new ChatCompletion(message));
    }

    public Task<ChatCompletion> CompleteAsync(
        IList<ChatMessage> chatMessages,
        ChatOptions? options = null,
        CancellationToken cancellationToken = default)
    {
        // BUG FIX: snapshot the list instead of storing the caller's live
        // reference. Services commonly reuse and append to one conversation
        // list between calls, which would retroactively mutate the history
        // the assertion helpers inspect.
        _receivedMessages.Add(chatMessages.ToList());
        if (_responses.Count == 0)
            throw new InvalidOperationException("No responses queued");
        return Task.FromResult(_responses.Dequeue());
    }

    public IAsyncEnumerable<StreamingChatCompletionUpdate> CompleteStreamingAsync(
        IList<ChatMessage> chatMessages,
        ChatOptions? options = null,
        CancellationToken cancellationToken = default)
    {
        // Streaming is intentionally unsupported by this mock.
        throw new NotImplementedException();
    }

    public void Dispose() { }

    public TService? GetService<TService>(object? key = null) where TService : class => null;

    // Assertions

    /// <summary>Asserts that any recorded message (any role) contains <paramref name="text"/>.</summary>
    public void AssertMessageContains(string text)
    {
        var allMessages = _receivedMessages.SelectMany(m => m).ToList();
        Assert.Contains(allMessages, m => m.Text?.Contains(text) == true);
    }

    /// <summary>Asserts that a recorded system message contains <paramref name="text"/>.</summary>
    public void AssertSystemPromptContains(string text)
    {
        var systemMessages = _receivedMessages
            .SelectMany(m => m)
            .Where(m => m.Role == ChatRole.System);
        Assert.Contains(systemMessages, m => m.Text?.Contains(text) == true);
    }
}
Unit Tests with Mocks
/// <summary>
/// Unit tests that exercise the AI services against MockChatClient —
/// no network calls, fully deterministic.
/// </summary>
public class ContentServiceTests
{
    [Fact]
    public async Task SummarizeAsync_ReturnsSummary()
    {
        // Arrange: queue the canned model reply the service should surface.
        var chatClient = new MockChatClient();
        chatClient.EnqueueResponse("This is a brief summary of the content.");
        var sut = new ContentService(chatClient);

        // Act
        var summary = await sut.SummarizeAsync("Long content here...");

        // Assert: the reply passes through and the system prompt was applied.
        Assert.Equal("This is a brief summary of the content.", summary);
        chatClient.AssertSystemPromptContains("summarizer");
    }

    [Fact]
    public async Task ProcessOrder_CallsCorrectFunction()
    {
        // Arrange: first turn requests the tool, second turn gives the answer.
        var chatClient = new MockChatClient();
        chatClient.EnqueueFunctionCall("get_order_status", new { orderId = "ORD-123" });
        chatClient.EnqueueResponse("Your order ORD-123 is being shipped.");

        var orderFunctions = new Mock<OrderFunctions>();
        orderFunctions
            .Setup(f => f.GetOrderStatusAsync("ORD-123"))
            .ReturnsAsync(new OrderInfo { Status = "Shipped" });

        var sut = new FunctionCallingService(chatClient, orderFunctions.Object);

        // Act
        var reply = await sut.ProcessAsync("Where is my order ORD-123?");

        // Assert: final answer reflects the tool result, tool called exactly once.
        Assert.Contains("shipped", reply.ToLower());
        orderFunctions.Verify(f => f.GetOrderStatusAsync("ORD-123"), Times.Once);
    }
}
Integration Tests with Real Models
For behavior testing, use real models but with controlled inputs:
/// <summary>
/// Integration tests that hit a real model with controlled inputs. They
/// assert on structure and classification rather than exact wording, and
/// are tagged "Integration" so fast CI runs can exclude them.
/// </summary>
public class IntegrationTests : IClassFixture<AITestFixture>
{
    private readonly IChatClient _chatClient;

    public IntegrationTests(AITestFixture fixture) => _chatClient = fixture.ChatClient;

    [Fact]
    [Trait("Category", "Integration")]
    public async Task ExtractProductInfo_ReturnsValidStructure()
    {
        // Arrange
        var review = """
            I bought this laptop last month. The battery life is amazing -
            easily lasts 10 hours. The keyboard is comfortable but the trackpad
            is a bit small. Overall, great value for $899.
            """;
        var options = new ChatOptions
        {
            ResponseFormat = ChatResponseFormat.ForJsonSchema<ProductReview>()
        };

        // Act
        var response = await _chatClient.CompleteAsync(
            new ChatMessage(ChatRole.User, $"Extract product info:\n\n{review}"),
            options);
        var parsed = JsonSerializer.Deserialize<ProductReview>(response.Message.Text!);

        // Assert: validate shape and plausibility, not exact text.
        Assert.NotNull(parsed);
        Assert.InRange(parsed.Rating, 1, 5);
        Assert.NotEmpty(parsed.Pros);
        Assert.NotEmpty(parsed.Cons);
        Assert.True(parsed.Summary.Length > 10);
    }

    [Theory]
    [InlineData("Hello", false)] // Not a support request
    [InlineData("I want to return my order", true)] // Is a support request
    [InlineData("What's the capital of France?", false)] // General question
    [InlineData("My package never arrived", true)] // Support request
    [Trait("Category", "Integration")]
    public async Task ClassifyIntent_CorrectlyIdentifiesSupportRequests(
        string input,
        bool expectedIsSupport)
    {
        // Act: a tightly constrained prompt keeps the output machine-checkable.
        var conversation = new[]
        {
            new ChatMessage(ChatRole.System,
                "Respond with only 'SUPPORT' or 'OTHER' based on whether " +
                "this is a customer support request."),
            new ChatMessage(ChatRole.User, input)
        };
        var response = await _chatClient.CompleteAsync(conversation);
        var isSupport = response.Message.Text?.Trim() == "SUPPORT";

        // Assert
        Assert.Equal(expectedIsSupport, isSupport);
    }
}
/// <summary>
/// Shared xUnit fixture that provides one real chat client per test class,
/// configured with a cheap model to keep integration-test costs down.
/// </summary>
public class AITestFixture : IDisposable
{
    public IChatClient ChatClient { get; }

    public AITestFixture()
    {
        // BUG FIX: the original used a null-forgiving "!" on the env var,
        // producing an opaque null-reference failure deep inside the client
        // when the key was missing. Fail fast with an actionable message.
        var apiKey = Environment.GetEnvironmentVariable("OPENAI_API_KEY")
            ?? throw new InvalidOperationException(
                "OPENAI_API_KEY environment variable must be set to run integration tests.");

        // Use a test-specific model or configuration
        ChatClient = new OpenAIClient(apiKey)
            .AsChatClient("gpt-4o-mini"); // Cheaper model for tests
    }

    public void Dispose()
    {
        (ChatClient as IDisposable)?.Dispose();
    }
}
Evaluation Testing
For subjective quality, use an LLM as judge—have a second model score each response against explicit criteria:
/// <summary>
/// Quality-gate tests that use a second model as an LLM judge: generate a
/// response, have the judge score it against explicit criteria, and fail
/// the build when scores fall below the minimum bar.
/// </summary>
public class EvaluationTests : IClassFixture<AITestFixture>
{
    private readonly IChatClient _chatClient;
    private readonly IChatClient _evaluator;

    public EvaluationTests(AITestFixture fixture)
    {
        // BUG FIX: the readonly clients were never assigned in the original,
        // so every test failed with a NullReferenceException. Both come from
        // the shared fixture; substitute a dedicated judge model here if the
        // evaluator should differ from the system under test.
        _chatClient = fixture.ChatClient;
        _evaluator = fixture.ChatClient;
    }

    [Fact]
    public async Task SupportResponse_MeetsQualityStandards()
    {
        // Generate response
        var response = await _chatClient.CompleteAsync(new[]
        {
            new ChatMessage(ChatRole.System, "You are a customer support agent."),
            new ChatMessage(ChatRole.User, "My order is late and I'm frustrated!")
        });

        // Evaluate with LLM
        var evaluation = await _evaluator.CompleteAsync(new[]
        {
            new ChatMessage(ChatRole.System, """
                Evaluate this customer support response on these criteria:
                1. Empathy (1-5): Does it acknowledge the customer's feelings?
                2. Helpfulness (1-5): Does it offer concrete next steps?
                3. Professionalism (1-5): Is the tone appropriate?
                Respond in JSON: {"empathy": N, "helpfulness": N, "professionalism": N, "notes": "..."}
                """),
            new ChatMessage(ChatRole.User, $"Response to evaluate:\n\n{response.Message.Text}")
        }, new ChatOptions { ResponseFormat = ChatResponseFormat.Json });

        var scores = JsonSerializer.Deserialize<EvaluationScores>(evaluation.Message.Text!);

        // Guard against malformed judge output before reading scores.
        Assert.NotNull(scores);

        // Assert minimum quality
        Assert.True(scores.Empathy >= 3, $"Empathy too low: {scores.Notes}");
        Assert.True(scores.Helpfulness >= 3, $"Helpfulness too low: {scores.Notes}");
        Assert.True(scores.Professionalism >= 4, $"Professionalism too low: {scores.Notes}");
    }
}
public record EvaluationScores(int Empathy, int Helpfulness, int Professionalism, string Notes);
Cost Management
LLM costs can explode without proper controls.
Cost Tracking Middleware
/// <summary>
/// Middleware that converts each call's token usage into a dollar cost and
/// publishes cost and token metrics, tagged by model and originating service.
/// Calls without usage data (some providers omit it) are passed through.
/// </summary>
public class CostTrackingMiddleware : DelegatingChatClient
{
    private readonly ICostCalculator _calculator;
    private readonly IMetrics _metrics;
    private readonly CostTrackingOptions _options;

    public CostTrackingMiddleware(
        IChatClient inner,
        ICostCalculator calculator,
        IMetrics metrics,
        IOptions<CostTrackingOptions> options) : base(inner)
    {
        _calculator = calculator;
        _metrics = metrics;
        _options = options.Value;
    }

    public override async Task<ChatCompletion> CompleteAsync(
        IList<ChatMessage> chatMessages,
        ChatOptions? options = null,
        CancellationToken cancellationToken = default)
    {
        var completion = await base.CompleteAsync(chatMessages, options, cancellationToken);

        var usage = completion.Usage;
        if (usage == null)
            return completion;

        var modelId = options?.ModelId ?? _options.DefaultModel;
        var inputTokens = usage.InputTokenCount ?? 0;
        var outputTokens = usage.OutputTokenCount ?? 0;

        var cost = _calculator.Calculate(modelId, inputTokens, outputTokens);

        // Tag cost with the originating service (read from the current span)
        // so dashboards can break spend down per service.
        var costTags = new Dictionary<string, object>
        {
            ["model"] = modelId,
            ["service"] = Activity.Current?.GetTagItem("service") ?? "unknown"
        };
        _metrics.RecordCost(cost, costTags);
        _metrics.RecordTokens(inputTokens, outputTokens, modelId);

        return completion;
    }
}
/// <summary>Computes the monetary cost of a completion from its token counts.</summary>
public interface ICostCalculator
{
    /// <param name="model">Model identifier used to look up per-token pricing.</param>
    /// <param name="inputTokens">Prompt tokens consumed.</param>
    /// <param name="outputTokens">Completion tokens generated.</param>
    /// <returns>Cost for the call; currency is provider-defined (USD for OpenAI).</returns>
    decimal Calculate(string model, int inputTokens, int outputTokens);
}
/// <summary>
/// <see cref="ICostCalculator"/> for OpenAI models. Resolves pricing by
/// exact model id first, then by longest matching prefix (so dated
/// deployments such as "gpt-4o-mini-2024-07-18" bill at mini rates instead
/// of silently falling back to gpt-4o pricing), then defaults to gpt-4o.
/// </summary>
public class OpenAICostCalculator : ICostCalculator
{
    // Prices per 1M tokens (as of 2024)
    private readonly Dictionary<string, (decimal Input, decimal Output)> _prices = new()
    {
        ["gpt-4o"] = (2.50m, 10.00m),
        ["gpt-4o-mini"] = (0.15m, 0.60m),
        ["gpt-4-turbo"] = (10.00m, 30.00m),
        ["gpt-3.5-turbo"] = (0.50m, 1.50m)
    };

    public decimal Calculate(string model, int inputTokens, int outputTokens)
    {
        var prices = ResolvePrices(model);
        var inputCost = (inputTokens / 1_000_000m) * prices.Input;
        var outputCost = (outputTokens / 1_000_000m) * prices.Output;
        return inputCost + outputCost;
    }

    // Longest-prefix match so "gpt-4o-mini-<date>" resolves to "gpt-4o-mini"
    // rather than "gpt-4o". Unknown models default to gpt-4o (most common).
    private (decimal Input, decimal Output) ResolvePrices(string model)
    {
        if (_prices.TryGetValue(model, out var exact))
            return exact;

        string? bestKey = null;
        foreach (var key in _prices.Keys)
        {
            if (model.StartsWith(key, StringComparison.OrdinalIgnoreCase) &&
                (bestKey == null || key.Length > bestKey.Length))
            {
                bestKey = key;
            }
        }

        return _prices[bestKey ?? "gpt-4o"];
    }
}
Budget Enforcement
/// <summary>
/// Middleware that refuses completions once the spend recorded against a
/// budget key (tenant, user, or global) reaches its limit, logs a warning
/// as spend approaches the limit, and records the cost of each successful
/// call afterwards.
/// </summary>
/// <remarks>
/// NOTE(review): the check-then-call-then-record sequence is not atomic —
/// concurrent requests can all pass the limit check before any of them
/// records spend, overshooting the budget. Fine for soft budgets; confirm
/// before treating this as a hard cap.
/// </remarks>
public class BudgetEnforcementMiddleware : DelegatingChatClient
{
    private readonly IBudgetService _budgetService;
    private readonly ICostCalculator _calculator;
    private readonly ILogger<BudgetEnforcementMiddleware> _logger;

    public BudgetEnforcementMiddleware(
        IChatClient inner,
        IBudgetService budgetService,
        ICostCalculator calculator,
        ILogger<BudgetEnforcementMiddleware> logger) : base(inner)
    {
        _budgetService = budgetService;
        _calculator = calculator;
        _logger = logger;
    }

    /// <summary>
    /// Enforces the budget for the resolved key, then delegates the call.
    /// </summary>
    /// <exception cref="BudgetExceededException">Spend has reached the limit.</exception>
    public override async Task<ChatCompletion> CompleteAsync(
        IList<ChatMessage> chatMessages,
        ChatOptions? options = null,
        CancellationToken cancellationToken = default)
    {
        var budgetKey = GetBudgetKey();

        // Check budget before call
        var budget = await _budgetService.GetBudgetAsync(budgetKey);
        var spent = await _budgetService.GetSpentAsync(budgetKey);

        if (spent >= budget.Limit)
        {
            _logger.LogWarning(
                "Budget exceeded for {BudgetKey}. Limit: {Limit}, Spent: {Spent}",
                budgetKey, budget.Limit, spent);
            throw new BudgetExceededException(budgetKey, spent, budget.Limit);
        }

        // Warn if approaching limit
        // (Reaching this line implies Limit > spent >= 0, so Limit > 0 and
        // the division below cannot divide by zero.)
        var percentUsed = (spent / budget.Limit) * 100;
        if (percentUsed >= budget.WarnThreshold)
        {
            _logger.LogWarning(
                "Budget warning for {BudgetKey}: {Percent}% used",
                budgetKey, percentUsed);
        }

        // Make the call
        var result = await base.CompleteAsync(chatMessages, options, cancellationToken);

        // Record spend
        if (result.Usage != null)
        {
            // NOTE(review): "gpt-4o" fallback here differs from
            // CostTrackingMiddleware's configured DefaultModel — presumably
            // both should agree; verify which default is intended.
            var cost = _calculator.Calculate(
                options?.ModelId ?? "gpt-4o",
                result.Usage.InputTokenCount ?? 0,
                result.Usage.OutputTokenCount ?? 0);
            await _budgetService.RecordSpendAsync(budgetKey, cost);
        }

        return result;
    }

    /// <summary>
    /// Resolves the budget bucket from the current activity's tags:
    /// tenant first, then user, then a shared global bucket.
    /// </summary>
    private string GetBudgetKey()
    {
        // Use tenant, user, or service as budget key
        var userId = Activity.Current?.GetTagItem("user_id")?.ToString();
        var tenantId = Activity.Current?.GetTagItem("tenant_id")?.ToString();

        if (!string.IsNullOrEmpty(tenantId))
            return $"tenant:{tenantId}";
        if (!string.IsNullOrEmpty(userId))
            return $"user:{userId}";

        return "global";
    }
}
/// <summary>
/// Thrown when a completion is refused because the spend recorded against a
/// budget key has reached its configured limit.
/// </summary>
public class BudgetExceededException : Exception
{
    public BudgetExceededException(string budgetKey, decimal spent, decimal limit)
        : base($"Budget exceeded for {budgetKey}: {spent:C} of {limit:C}")
    {
        BudgetKey = budgetKey;
        Spent = spent;
        Limit = limit;
    }

    /// <summary>Budget bucket that was exhausted, e.g. "tenant:42".</summary>
    public string BudgetKey { get; }

    /// <summary>Amount already spent against the bucket.</summary>
    public decimal Spent { get; }

    /// <summary>Configured spending limit for the bucket.</summary>
    public decimal Limit { get; }
}
Resilience Patterns
LLM APIs fail. Rate limits happen. Build for it.
Retry with Exponential Backoff
/// <summary>
/// Middleware that retries transient HTTP failures (429/502/503/504) with
/// exponential backoff plus random jitter. Non-transient errors and
/// cancellations propagate immediately; the final failed attempt rethrows.
/// </summary>
public class RetryingChatClient : DelegatingChatClient
{
    private readonly ILogger<RetryingChatClient> _logger;
    private readonly RetryOptions _options;

    public RetryingChatClient(
        IChatClient inner,
        ILogger<RetryingChatClient> logger,
        IOptions<RetryOptions> options) : base(inner)
    {
        _logger = logger;
        _options = options.Value;
    }

    public override async Task<ChatCompletion> CompleteAsync(
        IList<ChatMessage> chatMessages,
        ChatOptions? options = null,
        CancellationToken cancellationToken = default)
    {
        var attempt = 0;
        var delay = _options.InitialDelay;

        while (true)
        {
            attempt++;
            try
            {
                return await base.CompleteAsync(chatMessages, options, cancellationToken);
            }
            catch (Exception ex) when (ShouldRetry(ex) && attempt < _options.MaxAttempts)
            {
                // FIX: add up to 20% random jitter so many clients tripping a
                // rate limit at the same moment don't retry in lockstep and
                // re-trigger it (thundering herd).
                var jitteredMs = delay.TotalMilliseconds * (1 + Random.Shared.NextDouble() * 0.2);

                _logger.LogWarning(ex,
                    "Attempt {Attempt} failed, retrying in {Delay}ms",
                    attempt, jitteredMs);

                await Task.Delay(TimeSpan.FromMilliseconds(jitteredMs), cancellationToken);

                // Double the base delay, capped at MaxDelay.
                delay = TimeSpan.FromMilliseconds(
                    Math.Min(delay.TotalMilliseconds * 2, _options.MaxDelay.TotalMilliseconds));
            }
        }
    }

    // Only transient HTTP statuses are worth retrying; cancellations never are.
    private static bool ShouldRetry(Exception ex)
    {
        return ex switch
        {
            HttpRequestException { StatusCode: HttpStatusCode.TooManyRequests } => true,
            HttpRequestException { StatusCode: HttpStatusCode.ServiceUnavailable } => true,
            HttpRequestException { StatusCode: HttpStatusCode.BadGateway } => true,
            HttpRequestException { StatusCode: HttpStatusCode.GatewayTimeout } => true,
            TaskCanceledException => false, // Don't retry cancellations
            _ => false
        };
    }
}
/// <summary>Tuning knobs for <see cref="RetryingChatClient"/>.</summary>
public class RetryOptions
{
    /// <summary>Total attempts including the initial call.</summary>
    public int MaxAttempts { get; set; } = 3;
    /// <summary>Delay before the first retry; doubles after each failure.</summary>
    public TimeSpan InitialDelay { get; set; } = TimeSpan.FromSeconds(1);
    /// <summary>Upper bound on the backoff delay.</summary>
    public TimeSpan MaxDelay { get; set; } = TimeSpan.FromSeconds(30);
}
Fallback to Alternative Providers
/// <summary>
/// Routes completions to a primary provider and falls back to a secondary
/// one when the primary fails with a rate limit, outage, or timeout.
/// Deliberate caller cancellations are never treated as fallback-worthy.
/// </summary>
public class FallbackChatClient : IChatClient
{
    private readonly IChatClient _primary;
    private readonly IChatClient _fallback;
    private readonly ILogger<FallbackChatClient> _logger;

    public FallbackChatClient(
        IChatClient primary,
        IChatClient fallback,
        ILogger<FallbackChatClient> logger)
    {
        _primary = primary;
        _fallback = fallback;
        _logger = logger;
    }

    public ChatClientMetadata Metadata => _primary.Metadata;

    public async Task<ChatCompletion> CompleteAsync(
        IList<ChatMessage> chatMessages,
        ChatOptions? options = null,
        CancellationToken cancellationToken = default)
    {
        try
        {
            return await _primary.CompleteAsync(chatMessages, options, cancellationToken);
        }
        catch (Exception ex) when (ShouldFallback(ex, cancellationToken))
        {
            _logger.LogWarning(ex,
                "Primary provider failed, falling back to secondary");
            return await _fallback.CompleteAsync(chatMessages, options, cancellationToken);
        }
    }

    private static bool ShouldFallback(Exception ex, CancellationToken cancellationToken)
    {
        return ex switch
        {
            HttpRequestException { StatusCode: HttpStatusCode.TooManyRequests } => true,
            HttpRequestException { StatusCode: HttpStatusCode.ServiceUnavailable } => true,
            // BUG FIX: the original tested CancellationToken.None, which is
            // never cancelled, so the guard was always true and deliberate
            // caller cancellations also hit the fallback provider. Check the
            // actual call's token: only fall back on timeouts we didn't cause.
            TaskCanceledException when !cancellationToken.IsCancellationRequested => true,
            _ => false
        };
    }

    // ... implement other interface members
}
Circuit Breaker
/// <summary>
/// Middleware that wraps completions in a Polly circuit breaker: after the
/// configured number of consecutive HTTP failures the circuit opens and
/// calls fail fast for the break duration, protecting downstream services
/// from cascading failures.
/// </summary>
public class CircuitBreakerChatClient : DelegatingChatClient
{
    // BUG FIX: CircuitBreakerAsync returns the *async* policy type;
    // declaring the field as the synchronous CircuitBreakerPolicy (as
    // originally written) does not compile.
    private readonly AsyncCircuitBreakerPolicy _policy;

    public CircuitBreakerChatClient(
        IChatClient inner,
        IOptions<CircuitBreakerOptions> options) : base(inner)
    {
        var opt = options.Value;
        _policy = Policy
            .Handle<HttpRequestException>()
            .CircuitBreakerAsync(
                exceptionsAllowedBeforeBreaking: opt.FailureThreshold,
                durationOfBreak: opt.BreakDuration,
                onBreak: (ex, duration) =>
                {
                    // Log circuit opened
                },
                onReset: () =>
                {
                    // Log circuit closed
                });
    }

    public override async Task<ChatCompletion> CompleteAsync(
        IList<ChatMessage> chatMessages,
        ChatOptions? options = null,
        CancellationToken cancellationToken = default)
    {
        // While the circuit is open this throws BrokenCircuitException
        // immediately instead of hitting the failing provider.
        return await _policy.ExecuteAsync(async () =>
            await base.CompleteAsync(chatMessages, options, cancellationToken));
    }
}
Health Checks
Monitor your AI services:
/// <summary>
/// Health check that sends a minimal one-word completion to the AI provider
/// and reports Healthy, Degraded (latency over 5s), or Unhealthy (call
/// failed), attaching latency and model metadata to the result.
/// </summary>
public class ChatClientHealthCheck : IHealthCheck
{
    private readonly IChatClient _chatClient;
    private readonly ILogger<ChatClientHealthCheck> _logger;

    public ChatClientHealthCheck(IChatClient chatClient, ILogger<ChatClientHealthCheck> logger)
    {
        _chatClient = chatClient;
        _logger = logger;
    }

    public async Task<HealthCheckResult> CheckHealthAsync(
        HealthCheckContext context,
        CancellationToken cancellationToken = default)
    {
        var timer = Stopwatch.StartNew();
        try
        {
            // Minimal completion to check connectivity
            var response = await _chatClient.CompleteAsync(
                new[] { new ChatMessage(ChatRole.User, "ping") },
                new ChatOptions { MaxOutputTokens = 5 },
                cancellationToken);
            timer.Stop();

            var data = new Dictionary<string, object>
            {
                ["latency_ms"] = timer.ElapsedMilliseconds,
                ["model"] = _chatClient.Metadata.ModelId ?? "unknown"
            };

            // Responding but slow => degraded rather than unhealthy.
            return timer.ElapsedMilliseconds > 5000
                ? HealthCheckResult.Degraded(
                    $"High latency: {timer.ElapsedMilliseconds}ms",
                    data: data)
                : HealthCheckResult.Healthy("Chat client responding", data);
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Health check failed");
            return HealthCheckResult.Unhealthy(
                "Chat client unavailable",
                ex,
                new Dictionary<string, object> { ["error"] = ex.Message });
        }
    }
}
// Registration
// The "ready" tag lets readiness probes include this (paid) LLM call while
// liveness probes can skip it.
builder.Services.AddHealthChecks()
    .AddCheck<ChatClientHealthCheck>("ai-provider", tags: new[] { "ai", "ready" });
Production Checklist
Before going live, verify:
Observability
- [ ] OpenTelemetry tracing configured
- [ ] Structured logging with correlation IDs
- [ ] Dashboards for latency, errors, token usage
- [ ] Alerts for error rates and latency spikes
Cost Control
- [ ] Cost tracking per request
- [ ] Budget limits per tenant/user
- [ ] Alerts at 80% budget consumption
- [ ] Model selection based on task complexity
Resilience
- [ ] Retry logic with backoff
- [ ] Circuit breaker for cascading failures
- [ ] Fallback providers configured
- [ ] Timeout limits set
Security
- [ ] Input sanitization
- [ ] Output filtering for PII
- [ ] Session isolation verified
- [ ] API keys rotated and secured
Testing
- [ ] Unit tests with mocks
- [ ] Integration tests for critical paths
- [ ] Evaluation tests for quality
- [ ] Load tests for capacity planning
Conclusion
Building production AI applications is more than just API calls. It requires the same engineering rigor as any production system—observability, testing, cost management, and resilience.
The patterns in this series give you a foundation:
- Part 1: Provider abstraction for flexibility
- Part 2: Function calling for reliable actions
- Part 3: Conversation management for stateful interactions
- Part 4: Production patterns for real-world deployment
Start simple, iterate based on production feedback, and remember: the goal isn't perfect AI—it's useful AI that works reliably for your users.
This concludes the "Generative AI Patterns in C#" series. These patterns have been battle-tested in production systems. Apply them thoughtfully, and your AI applications will be ready for the real world.
Top comments (0)