Monitoring and Alerting for Video Platform Infrastructure with Go

#go #monitoring #devops #infrastructure

Why Roll Your Own Monitoring?

Managed monitoring like Datadog is excellent — and expensive. For a self-hosted video platform like TopVideoHub, a 200-line Go binary can cover 90% of what you need: uptime checks, latency histograms, error rate alerts, and a Grafana dashboard wired to Prometheus.

Health Check Endpoints

package main

import (
    "database/sql"
    "encoding/json"
    "net/http"
    "time"
)

// HealthStatus is the JSON document written by the health endpoint.
type HealthStatus struct {
    // Status is the overall verdict: "ok", or "degraded" when a check failed.
    Status string `json:"status"`

    // Checks holds one entry per probed dependency, keyed by dependency
    // name, with "ok" or an "error: ..." description as the value.
    Checks map[string]string `json:"checks"`

    // Uptime is the time since process start, formatted as a Go duration
    // string rounded to whole seconds.
    Uptime string `json:"uptime"`

    // Ts is the instant at which the report was produced.
    Ts time.Time `json:"ts"`
}

// startTime is captured at package initialisation and is the reference point
// for the uptime reported by healthHandler.
var startTime = time.Now()

// healthHandler returns an HTTP handler that reports service health as JSON.
//
// The handler pings db and writes a HealthStatus document. When the ping
// succeeds the response is 200 with status "ok"; when it fails the response
// is 503 with status "degraded" and the database check carries the error text.
func healthHandler(db *sql.DB) http.HandlerFunc {
    return func(w http.ResponseWriter, r *http.Request) {
        checks := map[string]string{}
        overall := "ok"

        // Tie the ping to the request context so a stalled database does not
        // keep the probe blocked after the caller has disconnected or the
        // request has been cancelled.
        if err := db.PingContext(r.Context()); err != nil {
            checks["database"] = "error: " + err.Error()
            overall = "degraded"
        } else {
            checks["database"] = "ok"
        }

        // Take a single clock reading so uptime and timestamp agree.
        now := time.Now()
        status := HealthStatus{
            Status: overall,
            Checks: checks,
            Uptime: now.Sub(startTime).Round(time.Second).String(),
            Ts:     now,
        }

        w.Header().Set("Content-Type", "application/json")
        if overall != "ok" {
            w.WriteHeader(http.StatusServiceUnavailable)
        }

        // Encoding can only fail here because the client went away; the
        // status line is already committed, so there is nothing left to
        // report to the caller and the error is deliberately dropped.
        _ = json.NewEncoder(w).Encode(status)
    }
}

Prometheus Metrics Exporter

import "github.com/prometheus/client_golang/prometheus"

// videosFetched counts fetched videos, partitioned by the "region" label.
var videosFetched = prometheus.NewCounterVec(
    prometheus.CounterOpts{
        Name: "topvideohub_videos_fetched_total",
        Help: "Total videos fetched.",
    },
    []string{"region"},
)

// fetchDuration is a histogram of fetch durations in seconds, partitioned by
// the "region" label and using the client library's default buckets.
var fetchDuration = prometheus.NewHistogramVec(
    prometheus.HistogramOpts{
        Name:    "topvideohub_fetch_duration_seconds",
        Help:    "Fetch duration per region.",
        Buckets: prometheus.DefBuckets,
    },
    []string{"region"},
)

// regionFreshness is a gauge, partitioned by the "region" label, intended to
// hold the number of seconds since the last successful fetch.
var regionFreshness = prometheus.NewGaugeVec(
    prometheus.GaugeOpts{
        Name: "topvideohub_region_freshness_seconds",
        Help: "Seconds since last successful fetch per region.",
    },
    []string{"region"},
)

// init registers the three collectors with the default Prometheus registry.
// MustRegister panics if a collector cannot be registered.
func init() {
    collectors := []prometheus.Collector{
        videosFetched,
        fetchDuration,
        regionFreshness,
    }
    for _, c := range collectors {
        prometheus.MustRegister(c)
    }
}

Instrument the fetch loop:

// fetchWithMetrics runs fn and records metrics for it under the given region.
//
// The wall-clock duration of fn is always observed on fetchDuration. The
// count returned by fn is added to videosFetched only when fn returns a nil
// error; the error itself is not reported to the caller.
func fetchWithMetrics(region string, fn func() (int, error)) {
    begin := time.Now()
    fetched, fetchErr := fn()
    elapsed := time.Since(begin)

    // Failed fetches still contribute to the latency histogram.
    fetchDuration.WithLabelValues(region).Observe(elapsed.Seconds())

    if fetchErr != nil {
        return
    }
    videosFetched.WithLabelValues(region).Add(float64(fetched))
}

Uptime Monitor

// UptimeMonitor periodically probes HTTP targets and raises Slack alerts.
type UptimeMonitor struct {
    // client issues the probe requests.
    client *http.Client

    // slack is handed to sendSlackWebhook as the alert destination.
    slack string
}

// Watch starts one background goroutine per target and returns immediately.
//
// Each goroutine probes its target once right away and then once per
// interval. The goroutines have no stop mechanism and run until the process
// exits. interval must be positive: time.NewTicker panics otherwise.
func (m *UptimeMonitor) Watch(targets []string, interval time.Duration) {
    for _, target := range targets {
        go func(t string) {
            ticker := time.NewTicker(interval)

            // Probe before the first tick so an outage that exists at
            // startup is reported now rather than one full interval later.
            m.probe(t)
            for range ticker.C {
                m.probe(t)
            }
        }(target)
    }
}

// probe issues a single GET against t and sends a Slack alert when the
// request fails or the response status is 500 or above.
func (m *UptimeMonitor) probe(t string) {
    start := time.Now()
    resp, err := m.client.Get(t)
    latency := time.Since(start)

    status := 0
    if resp != nil {
        status = resp.StatusCode
        resp.Body.Close()
    }

    switch {
    case err != nil:
        // No usable HTTP status exists here; report the transport error so
        // the alert says why the target is unreachable.
        msg := fmt.Sprintf(":red_circle: %s DOWN (%v, %v)", t, err, latency)
        sendSlackWebhook(m.slack, msg)
    case status >= 500:
        msg := fmt.Sprintf(":red_circle: %s DOWN (HTTP %d, %v)", t, status, latency)
        sendSlackWebhook(m.slack, msg)
    }
}

Watch all regional URLs for TopVideoHub:

monitor := &UptimeMonitor{client: &http.Client{Timeout: 10 * time.Second}, slack: os.Getenv("SLACK_WEBHOOK")}
monitor.Watch([]string{
    "https://topvideohub.com/health",
    "https://topvideohub.com/?region=JP",
    "https://topvideohub.com/?region=KR",
    "https://topvideohub.com/?region=TW",
}, 60*time.Second)

Grafana Dashboard

prometheus.yml:

# Scrape the topvideohub job from one static target every 30 seconds.
scrape_configs:
  - job_name: 'topvideohub'
    static_configs:
      # NOTE(review): 9090 is also Prometheus's own default listen port —
      # confirm this is the address where the Go exporter serves its metrics.
      - targets: ['localhost:9090']
    # Per-job interval; takes precedence over the global scrape_interval.
    scrape_interval: 30s

Key Grafana panels:

| Panel | Query |
| --- | --- |
| Videos fetched / region | `rate(topvideohub_videos_fetched_total[5m])` |
| P95 fetch latency | `histogram_quantile(0.95, rate(topvideohub_fetch_duration_seconds_bucket[5m]))` |
| Region freshness | `topvideohub_region_freshness_seconds` (alert if > 28800, i.e. 8 hours) |