Monitoring Your CAPTCHA Solver: Prometheus Metrics, Grafana Dashboards, and PagerDuty Alerts

Your scraper runs fine for three days. On day four, CAPTCHA solve rates drop from 95% to 40%. You don't notice until a stakeholder asks why the data pipeline is empty.

Sound familiar? Let's build observability into your CAPTCHA-solving infrastructure so you catch problems before they catch you.

What to Monitor

Before writing any code, decide which metrics actually matter:

| Metric | Why It Matters |
| --- | --- |
| Solve success rate | Primary health indicator |
| Solve latency (p50, p95, p99) | Catches degradation early |
| Token expiry rate | Are you submitting too slowly? |
| Cost per solve | Budget tracking |
| Concurrent solves | Capacity planning |
| Error breakdown by type | Root cause analysis |

Step 1: Instrument Your Solver

Wrap your CAPTCHA solver with Prometheus metrics:

# metrics.py
from prometheus_client import (
    Counter, Histogram, Gauge, Summary
)

# Counters
captcha_solves_total = Counter(
    "captcha_solves_total",
    "Total CAPTCHA solve attempts",
    ["captcha_type", "status"]  # status: success/failed/expired
)

captcha_errors_total = Counter(
    "captcha_errors_total",
    "CAPTCHA errors by type",
    ["captcha_type", "error_type"]
)

# Histograms for latency
captcha_solve_duration = Histogram(
    "captcha_solve_duration_seconds",
    "Time to solve a CAPTCHA",
    ["captcha_type"],
    buckets=[1, 2, 5, 10, 15, 20, 30, 45, 60, 90]
)

# Gauges for current state
captcha_concurrent_solves = Gauge(
    "captcha_concurrent_solves",
    "Currently in-progress CAPTCHA solves",
    ["captcha_type"]
)

captcha_cost_dollars = Counter(
    "captcha_cost_dollars_total",
    "Total cost of CAPTCHA solves in USD",
    ["captcha_type"]
)

# Token freshness
captcha_token_age_seconds = Histogram(
    "captcha_token_age_at_submission_seconds",
    "Age of token when submitted to target site",
    buckets=[1, 5, 10, 30, 60, 90, 120]
)
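As a quick sanity check, you can increment a counter and confirm it shows up in the Prometheus exposition format. This sketch uses a throwaway `CollectorRegistry` so it won't collide with the module-level metrics above:

```python
from prometheus_client import Counter, CollectorRegistry, generate_latest

# Isolated registry so this demo doesn't clash with the global one
registry = CollectorRegistry()
solves = Counter(
    "captcha_solves_total",
    "Total CAPTCHA solve attempts",
    ["captcha_type", "status"],
    registry=registry,
)

solves.labels(captcha_type="recaptcha_v2", status="success").inc()

# generate_latest() renders the registry in Prometheus text format
exposition = generate_latest(registry).decode()
print("captcha_solves_total" in exposition)
```

The exposition output is exactly what Prometheus will scrape from your `/metrics` endpoint, so this is a cheap way to verify label names before wiring anything up.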

Step 2: Instrumented Solver Wrapper

# solver_instrumented.py
import time
import httpx
from metrics import (
    captcha_solves_total,
    captcha_solve_duration,
    captcha_concurrent_solves,
    captcha_errors_total,
    captcha_cost_dollars,
)

# Cost per solve by type
COST_PER_SOLVE = {
    "recaptcha_v2": 0.001,
    "recaptcha_v3": 0.001,
    "hcaptcha": 0.001,
    "turnstile": 0.001,
    "funcaptcha": 0.002,
}

class InstrumentedSolver:
    def __init__(
        self, 
        api_base: str = "https://www.passxapi.com"
    ):
        self.api_base = api_base
        self.client = httpx.Client(timeout=90)

    def solve(
        self, captcha_type: str, 
        sitekey: str, url: str
    ) -> str:
        # Track concurrent solves
        captcha_concurrent_solves.labels(
            captcha_type=captcha_type
        ).inc()

        start = time.monotonic()

        try:
            token = self._do_solve(
                captcha_type, sitekey, url
            )

            # Record success
            duration = time.monotonic() - start
            captcha_solves_total.labels(
                captcha_type=captcha_type,
                status="success"
            ).inc()
            captcha_solve_duration.labels(
                captcha_type=captcha_type
            ).observe(duration)
            captcha_cost_dollars.labels(
                captcha_type=captcha_type
            ).inc(COST_PER_SOLVE.get(captcha_type, 0.001))

            return token

        except TimeoutError:
            captcha_solves_total.labels(
                captcha_type=captcha_type,
                status="failed"
            ).inc()
            captcha_errors_total.labels(
                captcha_type=captcha_type,
                error_type="timeout"
            ).inc()
            raise

        except Exception as e:
            error_type = type(e).__name__.lower()
            captcha_solves_total.labels(
                captcha_type=captcha_type,
                status="failed"
            ).inc()
            captcha_errors_total.labels(
                captcha_type=captcha_type,
                error_type=error_type
            ).inc()
            raise

        finally:
            captcha_concurrent_solves.labels(
                captcha_type=captcha_type
            ).dec()

    def _do_solve(self, captcha_type, sitekey, url):
        resp = self.client.post(
            f"{self.api_base}/api/v1/task",
            json={
                "type": captcha_type,
                "sitekey": sitekey,
                "pageurl": url
            }
        )
        resp.raise_for_status()
        task_id = resp.json()["task_id"]

        for _ in range(60):
            result = self.client.get(
                f"{self.api_base}/api/v1/task/{task_id}"
            ).json()
            if result["status"] == "completed":
                return result["token"]
            if result["status"] == "failed":
                raise Exception(result.get("error"))
            time.sleep(2)

        raise TimeoutError("Solve timed out at 120s")

Step 3: Expose Metrics Endpoint

Add a `/metrics` endpoint to your scraper:

# server.py
from prometheus_client import start_http_server

def start_metrics_server(port: int = 9090):
    """Start the Prometheus metrics server (runs in a daemon thread)."""
    start_http_server(port)
    print(f"Metrics available at http://localhost:{port}/metrics")

# In your main scraper
if __name__ == "__main__":
    start_metrics_server(9090)
    run_scraper()

Or if you're using Flask/FastAPI:

# FastAPI example
from fastapi import FastAPI
from prometheus_client import generate_latest
from starlette.responses import Response

app = FastAPI()

@app.get("/metrics")
async def metrics():
    return Response(
        generate_latest(),
        media_type="text/plain; charset=utf-8"
    )

Step 4: Prometheus Configuration

# prometheus.yml
global:
  scrape_interval: 15s

# Load the alert rules defined in Step 6
rule_files:
  - alert_rules.yml

# Point Prometheus at Alertmanager (address is an example)
alerting:
  alertmanagers:
    - static_configs:
        - targets: ["alertmanager:9093"]

scrape_configs:
  - job_name: "captcha-solver"
    static_configs:
      - targets: ["scraper-host:9090"]
    scrape_interval: 10s

  # If running multiple scrapers
  - job_name: "captcha-solver-fleet"
    dns_sd_configs:
      - names: ["scraper.service.consul"]
        type: "SRV"
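If you're not already running the stack, a minimal docker-compose sketch gets Prometheus and Grafana up locally. Image tags, ports, and file paths here are illustrative:

```yaml
# docker-compose.yml — illustrative sketch, not production-hardened
services:
  prometheus:
    image: prom/prometheus:latest
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - ./alert_rules.yml:/etc/prometheus/alert_rules.yml
    ports:
      - "9090:9090"
  grafana:
    image: grafana/grafana:latest
    ports:
      - "3000:3000"
```

If your scraper already exposes metrics on port 9090 on the same host, map Prometheus to a different host port to avoid the clash.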

Step 5: Grafana Dashboard

Here are the key panels for your CAPTCHA monitoring dashboard:

Panel 1: Success Rate (should be > 90%)

# Success rate over last 5 minutes
sum(rate(
    captcha_solves_total{status="success"}[5m]
)) by (captcha_type) 
/ 
sum(rate(
    captcha_solves_total[5m]
)) by (captcha_type)

Panel 2: Solve Latency (p95)

histogram_quantile(0.95, 
    sum(rate(
        captcha_solve_duration_seconds_bucket[5m]
    )) by (le, captcha_type)
)

Panel 3: Error Breakdown

sum(rate(
    captcha_errors_total[5m]
)) by (captcha_type, error_type)

Panel 4: Cost Tracking

# Cost per hour by type
sum(rate(
    captcha_cost_dollars_total[1h]
)) by (captcha_type) * 3600

Panel 5: Concurrent Solves (capacity gauge)

captcha_concurrent_solves

Step 6: Alert Rules

Set up alerts before things break:

# alert_rules.yml
groups:
  - name: captcha_alerts
    rules:
      # Success rate drops below 80%
      - alert: CaptchaSolveRateLow
        expr: |
          sum(rate(captcha_solves_total{status="success"}[10m]))
          /
          sum(rate(captcha_solves_total[10m]))
          < 0.8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "CAPTCHA solve rate below 80%"
          description: >
            Current rate: {{ $value | humanizePercentage }}

      # P95 latency above 30 seconds
      - alert: CaptchaSolveLatencyHigh
        expr: |
          histogram_quantile(0.95,
            sum(rate(
              captcha_solve_duration_seconds_bucket[5m]
            )) by (le)
          ) > 30
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "CAPTCHA solve p95 latency above 30s"

      # No solves happening at all
      - alert: CaptchaSolvesStopped
        expr: |
          sum(rate(captcha_solves_total[5m])) == 0
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "No CAPTCHA solves in the last 10 minutes"

      # Cost spike detection
      - alert: CaptchaCostSpike
        expr: |
          sum(rate(captcha_cost_dollars_total[1h])) * 3600
          > 5.0
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "CAPTCHA cost exceeding $5/hour"
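To get these alerts into PagerDuty, route them through Alertmanager. A minimal sketch — the `routing_key` values are placeholders for your own PagerDuty Events API v2 integration keys:

```yaml
# alertmanager.yml — sketch; replace routing_key with your integration key
route:
  receiver: pagerduty-warnings
  routes:
    - match:
        severity: critical
      receiver: pagerduty-critical

receivers:
  - name: pagerduty-warnings
    pagerduty_configs:
      - routing_key: "<events-api-v2-integration-key>"
  - name: pagerduty-critical
    pagerduty_configs:
      - routing_key: "<events-api-v2-integration-key>"
```

Splitting `warning` and `critical` into separate PagerDuty services lets you page on `CaptchaSolvesStopped` while keeping cost and latency warnings as low-urgency notifications.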

Step 7: Structured Logging

Metrics tell you what happened. Logs tell you why:

# logging_config.py
import logging
import structlog

def setup_logging():
    # structlog's stdlib integration needs the stdlib logger configured first
    logging.basicConfig(format="%(message)s", level=logging.INFO)
    structlog.configure(
        processors=[
            structlog.stdlib.filter_by_level,
            structlog.stdlib.add_log_level,
            structlog.processors.TimeStamper(fmt="iso"),
            structlog.processors.JSONRenderer()
        ],
        wrapper_class=structlog.stdlib.BoundLogger,
        logger_factory=structlog.stdlib.LoggerFactory(),
    )

logger = structlog.get_logger()

# In your solver
def solve(self, captcha_type, sitekey, url):
    log = logger.bind(
        captcha_type=captcha_type,
        target_url=url
    )

    log.info("captcha.solve.started")
    start = time.monotonic()

    try:
        token = self._do_solve(captcha_type, sitekey, url)
        log.info(
            "captcha.solve.success",
            duration=time.monotonic() - start,
            token_length=len(token)
        )
        return token
    except Exception as e:
        log.error(
            "captcha.solve.failed",
            error=str(e),
            error_type=type(e).__name__
        )
        raise

Output:

{"event": "captcha.solve.started", "captcha_type": "recaptcha_v2", "target_url": "https://example.com", "timestamp": "2025-03-23T10:15:00Z", "level": "info"}
{"event": "captcha.solve.success", "captcha_type": "recaptcha_v2", "duration": 4.2, "token_length": 580, "timestamp": "2025-03-23T10:15:04Z", "level": "info"}
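Because the logs are JSON lines, ad-hoc debugging needs no log platform at all. A small helper like this (a sketch — `failure_breakdown` and the log path are illustrative, not part of any library) tallies failures by `error_type`:

```python
import json
from collections import Counter

def failure_breakdown(log_path: str) -> Counter:
    """Count captcha.solve.failed events by error_type in a JSON-lines log."""
    errors = Counter()
    with open(log_path) as f:
        for line in f:
            event = json.loads(line)
            if event.get("event") == "captcha.solve.failed":
                errors[event.get("error_type", "unknown")] += 1
    return errors
```

Run it against yesterday's log when a dashboard panel spikes and you have your error breakdown without touching Grafana.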

Step 8: Health Check Endpoint

Add a health check that verifies your solver is working:

import time
from fastapi.responses import JSONResponse

@app.get("/health")
async def health():
    # Try a test solve — note that each health check is a real, billable solve
    try:
        test_sitekey = "6LeIxAcTAAAAAJcZVRqyHh71UMIEGNQ_MXjiZKhI"
        start = time.time()
        solver.solve(
            "recaptcha_v2", 
            test_sitekey, 
            "https://example.com"
        )
        latency = time.time() - start

        return {
            "status": "healthy",
            "solve_latency": f"{latency:.1f}s",
            "stats": solver.stats
        }
    except Exception as e:
        return JSONResponse(
            status_code=503,
            content={
                "status": "unhealthy",
                "error": str(e)
            }
        )

Quick Start: Minimal Setup

Don't want the full Prometheus/Grafana stack? Start with just logging:

# Minimal monitoring — just print stats every 60s
import asyncio

async def stats_reporter(solver, interval=60):
    while True:
        await asyncio.sleep(interval)
        stats = solver.stats
        rate = (
            stats["solved"] / 
            (stats["solved"] + stats["failed"]) * 100
            if (stats["solved"] + stats["failed"]) > 0 
            else 0
        )
        print(
            f"[CAPTCHA Stats] "
            f"Solved: {stats['solved']} | "
            f"Failed: {stats['failed']} | "
            f"Rate: {rate:.1f}% | "
            f"Avg latency: {stats.get('avg_latency', 0):.1f}s"
        )
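Wiring the reporter into an asyncio scraper is just a matter of running it as a background task. A self-contained sketch — `DummySolver`, the tiny interval, and the fake scrape loop are all illustrative stand-ins:

```python
import asyncio

class DummySolver:
    """Stand-in for a real solver that exposes a .stats dict."""
    def __init__(self):
        self.stats = {"solved": 0, "failed": 0}

async def stats_reporter(solver, interval=60):
    # Compact variant of the reporter above
    while True:
        await asyncio.sleep(interval)
        s = solver.stats
        total = s["solved"] + s["failed"]
        rate = s["solved"] / total * 100 if total else 0
        print(f"[CAPTCHA Stats] Solved: {s['solved']} | Rate: {rate:.1f}%")

async def main():
    solver = DummySolver()
    # Reporter runs in the background while the scrape loop does its work
    reporter = asyncio.create_task(stats_reporter(solver, interval=0.05))
    for _ in range(3):          # pretend scrape loop
        solver.stats["solved"] += 1
        await asyncio.sleep(0.03)
    reporter.cancel()           # stop reporting once the scraper exits
    return solver.stats

stats = asyncio.run(main())
print(stats)
```

Cancelling the reporter task on exit keeps the event loop from complaining about orphaned tasks when `asyncio.run` shuts down.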

Key Takeaways

  1. Instrument from day one — adding metrics later is painful
  2. Track success rate as your primary metric — everything else is secondary
  3. Alert on rate drops, not individual failures — transient errors are normal
  4. Separate concerns — metrics for dashboards, structured logs for debugging
  5. Cost tracking prevents surprises — alert before the bill does

For the CAPTCHA-solving client used in these examples, check out passxapi-python. It provides clean interfaces that are easy to wrap with your own metrics layer.


What monitoring stack do you use for your scrapers? I'd love to hear what metrics you track.
