Prometheus is the industry standard for metrics collection, and its HTTP API is enabled out of the box at no extra cost. You can query metrics with PromQL, inspect alerts and rules, check scrape targets, and build custom monitoring tools, all via simple HTTP calls.
Why Use the Prometheus API?
- Query any metric with PromQL via HTTP
- Build custom dashboards without Grafana
- Inspect active alerts and rules (silences are managed through the companion Alertmanager API)
- Monitor scrape targets and their health status
Getting Started
The Prometheus HTTP API listens on port 9090 by default, under the /api/v1/ path:
# Instant query
curl -s "http://localhost:9090/api/v1/query?query=up" | jq '.data.result[] | {instance: .metric.instance, job: .metric.job, value: .value[1]}'
# Range query (last hour, 15s steps; date -d requires GNU date)
curl -s "http://localhost:9090/api/v1/query_range?query=rate(http_requests_total[5m])&start=$(date -d '1 hour ago' +%s)&end=$(date +%s)&step=15" | jq '.data.result[0].values | length'
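Before wiring up a client it helps to know the response shape: a successful query returns a JSON object with a status field and a data.result list, where each entry pairs a metric label set with a [timestamp, value-as-string] sample. A minimal parsing sketch against a hard-coded sample payload (illustrative data, no live server assumed):

```python
# Sample payload in the shape /api/v1/query returns (illustrative values).
sample = {
    "status": "success",
    "data": {
        "resultType": "vector",
        "result": [
            {"metric": {"__name__": "up", "job": "node", "instance": "host1:9100"},
             "value": [1700000000.0, "1"]},
            {"metric": {"__name__": "up", "job": "api", "instance": "host2:8080"},
             "value": [1700000000.0, "0"]},
        ],
    },
}

def parse_vector(payload):
    """Return (labels, float value) pairs from an instant-query response."""
    if payload.get("status") != "success":
        raise RuntimeError(f"query failed: {payload.get('error')}")
    # Sample values arrive as strings, so convert explicitly.
    return [(r["metric"], float(r["value"][1]))
            for r in payload["data"]["result"]]

for labels, value in parse_vector(sample):
    print(f"{labels['job']:8s} {labels['instance']:16s} {value:g}")
```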
Python Client
import requests
from datetime import datetime, timedelta

PROM_URL = "http://localhost:9090"

def query_instant(promql):
    resp = requests.get(f"{PROM_URL}/api/v1/query", params={"query": promql})
    return resp.json()["data"]["result"]

def query_range(promql, hours=1, step="15s"):
    end = datetime.now()
    start = end - timedelta(hours=hours)
    resp = requests.get(f"{PROM_URL}/api/v1/query_range", params={
        "query": promql,
        "start": start.timestamp(),
        "end": end.timestamp(),
        "step": step,
    })
    return resp.json()["data"]["result"]

# Get all up/down targets
targets = query_instant("up")
for t in targets:
    status = "UP" if t["value"][1] == "1" else "DOWN"
    print(f"{t['metric']['job']:20s} {t['metric']['instance']:30s} {status}")
Build a Health Dashboard
def service_health_report():
    metrics = {
        "Request Rate": "sum(rate(http_requests_total[5m]))",
        "Error Rate": "sum(rate(http_requests_total{status=~'5..'}[5m])) / sum(rate(http_requests_total[5m])) * 100",
        "P99 Latency (ms)": "histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m])) * 1000",
        "Active Connections": "sum(http_connections_active)",
        "CPU Usage (%)": "rate(process_cpu_seconds_total[5m]) * 100",
        "Memory (MB)": "process_resident_memory_bytes / 1024 / 1024",
    }
    print("=== Service Health Report ===")
    for name, query in metrics.items():
        result = query_instant(query)
        if result:
            value = float(result[0]["value"][1])
            print(f"{name:25s}: {value:.2f}")
        else:
            print(f"{name:25s}: N/A")

service_health_report()
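The "if result" guard above is important: a query over a metric that doesn't exist returns an empty result list rather than an error. One way to factor that pattern out (a sketch; scalar_value is my own helper, not part of any client library):

```python
def scalar_value(result, default=None):
    """Extract the first sample's value from a query result, or return
    default when the result vector is empty (e.g. metric doesn't exist)."""
    if not result:
        return default
    return float(result[0]["value"][1])

# Usage with the result shape the API returns:
print(scalar_value([{"metric": {}, "value": [1700000000.0, "42.5"]}]))  # 42.5
print(scalar_value([], default=0.0))                                    # 0.0
```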
Manage Alerting Rules
# Get all active alerts
curl -s "http://localhost:9090/api/v1/alerts" | jq '.data.alerts[] | {name: .labels.alertname, state: .state, severity: .labels.severity}'
# Get all alerting rules
curl -s "http://localhost:9090/api/v1/rules" | jq '.data.groups[] | {group: .name, rules: [.rules[] | {alert: .name, state: .state, query: .query}]}'
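The same alerts endpoint is easy to consume from the Python client above. A sketch that groups alerts by state, run here against a hard-coded sample payload in the shape /api/v1/alerts returns (a live version would fetch it with requests.get):

```python
from collections import Counter

# Illustrative payload in the shape /api/v1/alerts returns.
sample_alerts = {
    "status": "success",
    "data": {"alerts": [
        {"labels": {"alertname": "HighErrorRate", "severity": "critical"},
         "state": "firing"},
        {"labels": {"alertname": "DiskFilling", "severity": "warning"},
         "state": "pending"},
        {"labels": {"alertname": "SlowQueries", "severity": "warning"},
         "state": "firing"},
    ]},
}

def summarize_alerts(payload):
    """Count alerts per state and list the names of the firing ones."""
    alerts = payload["data"]["alerts"]
    by_state = Counter(a["state"] for a in alerts)
    firing = [a["labels"]["alertname"] for a in alerts if a["state"] == "firing"]
    return by_state, firing

states, firing = summarize_alerts(sample_alerts)
print(dict(states), firing)
```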
Check Scrape Targets
def check_targets():
    resp = requests.get(f"{PROM_URL}/api/v1/targets")
    targets = resp.json()["data"]["activeTargets"]
    healthy = sum(1 for t in targets if t["health"] == "up")
    unhealthy = [t for t in targets if t["health"] != "up"]
    print(f"Targets: {healthy}/{len(targets)} healthy")
    if unhealthy:
        print("\nUnhealthy targets:")
        for t in unhealthy:
            print(f"  {t['labels']['job']:20s} {t['scrapeUrl']:40s} Error: {t.get('lastError', 'unknown')}")

check_targets()
Automated SLO Monitoring
def check_slo(service, target_availability=99.9, window_hours=24):
    # Availability over the window = 1 - (5xx rate / total rate)
    query = f'1 - (sum(rate(http_requests_total{{job="{service}",status=~"5.."}}[{window_hours}h])) / sum(rate(http_requests_total{{job="{service}"}}[{window_hours}h])))'
    result = query_instant(query)
    if result:
        availability = float(result[0]["value"][1]) * 100
        budget_remaining = availability - target_availability
        print(f"Service: {service}")
        print(f"  Availability: {availability:.3f}%")
        print(f"  Target: {target_availability}%")
        print(f"  Error Budget: {budget_remaining:+.3f}%")
        if budget_remaining < 0:
            print("  STATUS: SLO VIOLATED!")
        elif budget_remaining < 0.01:
            print("  STATUS: Warning - budget almost exhausted")
        else:
            print("  STATUS: Healthy")

check_slo("api-server")
check_slo("payment-service", target_availability=99.99)
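The budget arithmetic in check_slo can be made explicit. With target availability T, the total error budget is 100 - T percentage points, and the fraction already consumed is the observed unavailability divided by that budget. A pure-math sketch (the helper name is my own):

```python
def error_budget_consumed(availability, target=99.9):
    """Fraction of the error budget used over the window.
    Both arguments are percentages, e.g. 99.95 and 99.9."""
    budget = 100.0 - target          # allowed unavailability, in points
    errors = 100.0 - availability    # observed unavailability, in points
    return errors / budget

# 99.95% availability against a 99.9% target: half the budget is spent.
print(round(error_budget_consumed(99.95, target=99.9), 3))  # 0.5
```

A value above 1.0 means the SLO is already violated; many teams page when the consumption rate projects to exhaust the budget before the window ends.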
Real-World Use Case
A fintech team built an automated SLO tracker using the Prometheus API. Every hour, it calculated error budgets for 50+ services and posted a summary to Slack. When a service burned through 50% of its error budget, it automatically created a Jira ticket. Result: mean time to detection dropped from 45 minutes to 3 minutes.
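A message-formatting sketch for that kind of hourly summary. The webhook URL is a placeholder and the actual POST is left commented out; only the formatting runs here:

```python
# import requests  # uncomment to actually post the message

SLACK_WEBHOOK = "https://hooks.slack.com/services/XXX/YYY/ZZZ"  # placeholder

def budget_summary(services):
    """Format a Slack-friendly SLO summary.
    services: list of (name, availability %, target %) tuples."""
    lines = ["*Hourly SLO summary*"]
    # Worst margin first, so violations sit at the top of the message.
    for name, avail, target in sorted(services, key=lambda s: s[1] - s[2]):
        flag = ":rotating_light:" if avail < target else ":white_check_mark:"
        lines.append(f"{flag} {name}: {avail:.3f}% (target {target}%)")
    return "\n".join(lines)

text = budget_summary([
    ("api-server", 99.95, 99.9),
    ("payment-service", 99.97, 99.99),
])
print(text)
# requests.post(SLACK_WEBHOOK, json={"text": text})
```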
What You Can Build
- Custom SLO dashboard with error budget tracking
- Automated capacity planning based on growth trends
- Anomaly detector comparing current metrics to baselines
- Cost optimizer correlating resource usage with cloud spend
- Incident reporter generating post-mortem data automatically
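For the anomaly-detector idea, the simplest baseline is a z-score: pull a window of values with query_range, then compare the latest point to the mean and standard deviation of the points before it. A self-contained sketch on a synthetic series (the threshold and helper name are my own choices):

```python
from statistics import mean, stdev

def is_anomalous(values, threshold=3.0):
    """Flag the latest sample if it sits more than `threshold`
    standard deviations from the mean of the preceding window."""
    *baseline, latest = values
    mu, sigma = mean(baseline), stdev(baseline)
    if sigma == 0:
        return latest != mu
    return abs(latest - mu) / sigma > threshold

# Synthetic request-rate series: steady around 100, then a spike.
steady = [100, 102, 98, 101, 99, 100, 103, 97, 100, 101]
print(is_anomalous(steady + [250]))  # True
print(is_anomalous(steady + [102]))  # False
```

In practice you would feed it the values list from a query_range result and tune the threshold per metric; seasonal metrics need a smarter baseline than a flat window.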
Need custom monitoring solutions? I build observability tools and data pipelines.
Email me: spinov001@gmail.com
Check out my developer tools: https://apify.com/spinov001