AI Monitoring and Alerting: Latency, Token Usage, Error Rates, Drift Detection

#ai #machinelearning #llm

This article was originally published on AI Study Room. For the full version with working code examples and related articles, visit the original post.

AI Monitoring and Alerting: Latency, Token Usage, Error Rates, Drift Detection

Introduction

AI applications introduce new monitoring dimensions beyond traditional infrastructure metrics. LLM responses can be slow, expensive, or incorrect, and their behavior can change suddenly when providers update models. This article covers the metrics, tools, and alerting strategies for production AI monitoring, including drift detection that catches quality degradation before users complain.

Core Metrics

Every AI application should track these foundational metrics:

from prometheus_client import Counter, Histogram, Gauge

import time

class AIMetrics:
    """Prometheus instruments for LLM traffic.

    Tracks request counts, latency, token usage, cost, and cache hit ratio.
    The ``cache_hit_ratio`` gauge is only created here; no method of this
    class updates it, so callers set it directly.
    """

    def __init__(self, registry=None):
        """Create the metric families.

        Args:
            registry: Optional ``prometheus_client`` collector registry to
                register the metrics with. When omitted, the client
                library's default registry is used (the original behavior).
                prometheus_client refuses to register the same metric name
                twice in one registry, so pass a dedicated registry when
                more than one ``AIMetrics`` must exist in a process
                (for example in tests).
        """
        # Forward ``registry`` only when supplied so the library default
        # still applies for existing callers.
        registry_kwargs = {} if registry is None else {"registry": registry}

        self.request_count = Counter(
            "llm_requests_total", "Total LLM requests",
            ["provider", "model", "status"],
            **registry_kwargs,
        )
        self.latency = Histogram(
            "llm_latency_ms", "LLM response latency",
            ["provider", "model"],
            # Bucket bounds are in milliseconds, matching ``duration_ms``
            # in record_request and the ``_ms`` suffix of the metric name.
            buckets=[100, 250, 500, 1000, 2000, 5000, 10000],
            **registry_kwargs,
        )
        self.token_usage = Counter(
            "llm_tokens_total", "Token usage",
            ["provider", "model", "token_type"],  # token_type: input/output
            **registry_kwargs,
        )
        self.cost_total = Counter(
            "llm_cost_usd", "Total cost in USD",
            ["provider", "model"],
            **registry_kwargs,
        )
        self.cache_hit_ratio = Gauge(
            "llm_cache_hit_ratio", "Cache hit ratio",
            ["cache_level"],  # cache_level: exact/semantic
            **registry_kwargs,
        )

    def record_request(self, provider: str, model: str, duration_ms: float, status: str = "success"):
        """Count one request and observe its latency.

        Args:
            provider: Value for the ``provider`` label.
            model: Value for the ``model`` label.
            duration_ms: Latency to observe, in milliseconds.
            status: Value for the ``status`` label on the request counter.
        """
        self.request_count.labels(provider=provider, model=model, status=status).inc()
        self.latency.labels(provider=provider, model=model).observe(duration_ms)

    def record_tokens(self, provider: str, model: str, input_tokens: int, output_tokens: int, cost: float):
        """Add token counts and cost for one request to the running totals.

        Args:
            provider: Value for the ``provider`` label.
            model: Value for the ``model`` label.
            input_tokens: Added under ``token_type="input"``.
            output_tokens: Added under ``token_type="output"``.
            cost: Amount added to the cost counter.
        """
        self.token_usage.labels(provider=provider, model=model, token_type="input").inc(input_tokens)
        self.token_usage.labels(provider=provider, model=model, token_type="output").inc(output_tokens)
        self.cost_total.labels(provider=provider, model=model).inc(cost)

Token Usage Tracking

Monitor token consumption per user, feature, and time period:

class TokenUsageTracker:
    """Persist per-request token usage and answer usage and budget queries.

    ``db`` must expose awaitable ``execute`` and ``fetchrow`` methods that take
    a SQL string with ``$n`` placeholders followed by positional arguments
    (presumably an asyncpg-style connection or pool; confirm with the caller).
    """

    def __init__(self, db):
        self.db = db

    async def log_usage(self, user_id: str, feature: str, provider: str, model: str,
                        input_tokens: int, output_tokens: int, latency_ms: float):
        """Insert one usage row into ``token_usage``, stamped with ``NOW()``.

        NOTE(review): ``_calculate_cost`` is not defined in this class as
        shown; it has to be provided (e.g. by a subclass) or this call raises
        ``AttributeError``.
        """
        cost = self._calculate_cost(input_tokens, output_tokens, provider, model)
        await self.db.execute("""
            INSERT INTO token_usage
            (user_id, feature, provider, model, input_tokens, output_tokens,
             latency_ms, cost, timestamp)
            VALUES ($1, $2, $3, $4, $5, $6, $7, $8, NOW())
        """, user_id, feature, provider, model, input_tokens, output_tokens,
            latency_ms, cost)

    async def get_daily_usage(self, date: str) -> dict:
        """Return token, cost, and request totals for one calendar day.

        Args:
            date: The day to aggregate, compared against ``DATE(timestamp)``.

        Returns:
            A dict with keys ``input``, ``output``, ``cost`` and ``requests``.
            Totals are ``0`` (never ``None``) when the day has no rows.
        """
        row = await self.db.fetchrow("""
            SELECT SUM(input_tokens) as input, SUM(output_tokens) as output,
                   SUM(cost) as cost, COUNT(*) as requests
            FROM token_usage WHERE DATE(timestamp) = $1
        """, date)
        if not row:
            return {"input": 0, "output": 0, "cost": 0, "requests": 0}
        # An aggregate without GROUP BY still yields one row when nothing
        # matches, with SUM(...) as NULL; normalise those to 0 so callers
        # can do arithmetic on the result.
        return {
            key: (0 if value is None else value)
            for key, value in dict(row).items()
        }

    async def check_budget(self, user_id: str, daily_budget: float) -> bool:
        """Return True while the user's spend today is below ``daily_budget``.

        A user with no usage today (NULL sum or no row) counts as zero spend.
        """
        row = await self.db.fetchrow("""
            SELECT SUM(cost) as today_cost
            FROM token_usage
            WHERE user_id = $1 AND DATE(timestamp) = CURRENT_DATE
        """, user_id)
        spent_today = (row["today_cost"] if row else None) or 0
        return spent_today < daily_budget

Error Rate Monitoring

LLM applications experience distinctive error types:

from enum import Enum

class LLMErrorType(Enum):
    """Categories of LLM call failures.

    Each member's string value becomes the first part of the
    ``"<error_type>:<provider>"`` keys that ErrorMonitor.record_error tallies.
    """

    RATE_LIMIT = "rate_limit"

    CONTEXT_WINDOW = "context_window_exceeded"

    CONTENT_FILTER = "content_filter_blocked"

    TIMEOUT = "timeout"

    INVALID_RESPONSE = "invalid_response"

    PROVIDER_DOWN = "provider_down"

class ErrorMonitor:

    def __init__(self, alert_threshold: float = 0.05):
        """Start with empty tallies and store the alerting threshold.

        Args:
            alert_threshold: Rate above which check_alerts treats a key's
                error share as alert-worthy.
        """
        # Running total that check_alerts uses as the rate denominator.
        self.total_requests: int = 0
        # Occurrences keyed by "<error_type.value>:<provider>".
        self.error_counts: dict[str, int] = {}
        self.alert_threshold = alert_threshold

    def record_error(self, error_type: LLMErrorType, provider: str):
        """Tally one occurrence of ``error_type`` for ``provider``.

        Args:
            error_type: The failure category; its ``value`` forms the first
                part of the tally key.
            provider: The provider the error came from; second part of the key.
        """
        bucket = f"{error_type.value}:{provider}"
        self.error_counts.setdefault(bucket, 0)
        self.error_counts[bucket] += 1
        # NOTE(review): in the visible code this is the only place
        # total_requests grows, so it counts recorded errors, not all
        # requests. Confirm that successful requests are counted elsewhere.
        self.total_requests += 1

    def check_alerts(self) -> list[str]:

        alerts = []

        for key, count in self.error_counts.items():

            rate = count / max(self.total_requests, 1)

            if rate > self.alert_threshold:

(The code listing above is cut off mid-method in this excerpt.) Read the full article on AI Study Room for complete code examples, comparison tables, and related resources.

Found this useful? Check out more developer guides and tool comparisons on AI Study Room.