Monitoring and Alerting for Video Platform Infrastructure with Go

#go #monitoring #devops #infrastructure

Why Custom Monitoring

Off-the-shelf monitoring tools are powerful but heavy. For a video platform like DailyWatch running on shared LiteSpeed hosting, we needed a lightweight Go binary that checks endpoint health, tracks uptime, and exposes Prometheus metrics -- without installing a full monitoring stack.

Health Check Configuration

Define what to monitor:

package monitor

import "time"

// CheckConfig describes one HTTP endpoint to monitor and the conditions its
// response must satisfy to be considered healthy.
type CheckConfig struct {
    // Name identifies the check; results and history are stored under it.
    Name        string        `json:"name"`
    // URL is the address the check requests.
    URL         string        `json:"url"`
    // Method is the HTTP method used for the request (e.g. "GET").
    Method      string        `json:"method"`
    // Interval is how often the check is meant to run.
    Interval    time.Duration `json:"interval"`
    // Timeout bounds a single request made for this check.
    Timeout     time.Duration `json:"timeout"`
    // ExpectCode is the required HTTP status code; values <= 0 skip the
    // status comparison.
    ExpectCode  int           `json:"expect_code"`
    // ExpectBody is a substring that must appear in the response body; an
    // empty string skips the body comparison.
    ExpectBody  string        `json:"expect_body"`
}

// CheckResult is the outcome of running a single check once.
type CheckResult struct {
    // Name and URL are copied from the CheckConfig that produced the result.
    Name       string        `json:"name"`
    URL        string        `json:"url"`
    Status     string        `json:"status"` // "up" or "down"
    // StatusCode is the HTTP status received; it stays zero when no response
    // was obtained.
    StatusCode int           `json:"status_code"`
    // Latency is the time elapsed until the HTTP client returned.
    Latency    time.Duration `json:"latency"`
    // Error explains why the check is "down"; it is omitted from JSON when empty.
    Error      string        `json:"error,omitempty"`
    // CheckedAt is when the result was produced.
    CheckedAt  time.Time     `json:"checked_at"`
}

// DefaultChecks is the built-in set of endpoints to monitor: the homepage,
// the JSON search endpoint, and the XML sitemap.
var DefaultChecks = []CheckConfig{
    // Homepage: only the status code is verified.
    {
        Name:       "homepage",
        URL:        "https://dailywatch.video",
        Method:     "GET",
        Interval:   5 * time.Minute,
        Timeout:    10 * time.Second,
        ExpectCode: 200,
    },
    // Search: must answer 200 and the body must contain "results".
    {
        Name:       "search_api",
        URL:        "https://dailywatch.video/search?q=test&format=json",
        Method:     "GET",
        Interval:   10 * time.Minute,
        Timeout:    15 * time.Second,
        ExpectCode: 200,
        ExpectBody: "results",
    },
    // Sitemap: must answer 200 and the body must contain an opening "<urlset" tag.
    {
        Name:       "sitemap",
        URL:        "https://dailywatch.video/sitemap.xml",
        Method:     "GET",
        Interval:   30 * time.Minute,
        Timeout:    10 * time.Second,
        ExpectCode: 200,
        ExpectBody: "<urlset",
    },
}

The Health Checker

Run each check and record its result in a mutex-guarded history:

package monitor

import (
    "fmt"
    "io"
    "net/http"
    "strings"
    "sync"
    "time"
)

// HealthChecker runs HTTP checks and keeps their outcomes in memory.
type HealthChecker struct {
    // client is the HTTP client used to issue check requests.
    client  *http.Client
    // results holds the most recent result per check name.
    results map[string]*CheckResult
    // history holds recent results per check name, oldest first.
    history map[string][]CheckResult
    // mu guards results and history.
    mu      sync.RWMutex
}

// NewHealthChecker builds a HealthChecker with a fresh HTTP client and empty
// result and history maps, ready for use.
func NewHealthChecker() *HealthChecker {
    hc := new(HealthChecker)
    hc.client = new(http.Client)
    hc.results = map[string]*CheckResult{}
    hc.history = map[string][]CheckResult{}
    return hc
}

// RunCheck executes the check described by cfg once, stores the outcome via
// recordResult, and returns it.
//
// The result is "down" when the request cannot be built, the request fails
// (including exceeding cfg.Timeout), the status code differs from a positive
// cfg.ExpectCode, or a non-empty cfg.ExpectBody is not found in the first
// 1 MiB of the response body. Otherwise it is "up".
func (hc *HealthChecker) RunCheck(cfg CheckConfig) CheckResult {
    // Use a per-call copy of the client instead of assigning to
    // hc.client.Timeout: RunCheck may be invoked from several goroutines
    // (e.g. a scheduler and an HTTP handler), and writing to the shared
    // client is a data race that can apply one check's timeout to another.
    // The copy still shares the underlying transport.
    client := *hc.client
    client.Timeout = cfg.Timeout
    start := time.Now()

    req, err := http.NewRequest(cfg.Method, cfg.URL, nil)
    if err != nil {
        return hc.recordResult(cfg.Name, cfg.URL, CheckResult{
            Name: cfg.Name, URL: cfg.URL, Status: "down",
            Error: fmt.Sprintf("request creation failed: %v", err),
            CheckedAt: time.Now(),
        })
    }
    req.Header.Set("User-Agent", "DailyWatch-Monitor/1.0")

    resp, err := client.Do(req)
    latency := time.Since(start)

    if err != nil {
        return hc.recordResult(cfg.Name, cfg.URL, CheckResult{
            Name: cfg.Name, URL: cfg.URL, Status: "down",
            Latency: latency, Error: err.Error(), CheckedAt: time.Now(),
        })
    }
    defer resp.Body.Close()

    result := CheckResult{
        Name:       cfg.Name,
        URL:        cfg.URL,
        StatusCode: resp.StatusCode,
        Latency:    latency,
        CheckedAt:  time.Now(),
    }

    // Check status code; a non-positive ExpectCode disables this comparison.
    if cfg.ExpectCode > 0 && resp.StatusCode != cfg.ExpectCode {
        result.Status = "down"
        result.Error = fmt.Sprintf("expected %d, got %d", cfg.ExpectCode, resp.StatusCode)
        return hc.recordResult(cfg.Name, cfg.URL, result)
    }

    // Check body content; only the first 1 MiB is inspected.
    if cfg.ExpectBody != "" {
        body, readErr := io.ReadAll(io.LimitReader(resp.Body, 1<<20))
        if !strings.Contains(string(body), cfg.ExpectBody) {
            result.Status = "down"
            if readErr != nil {
                // The body was cut short, so report the read failure rather
                // than claiming the content is missing.
                result.Error = fmt.Sprintf("reading body: %v", readErr)
            } else {
                result.Error = fmt.Sprintf("body missing expected string: %s", cfg.ExpectBody)
            }
            return hc.recordResult(cfg.Name, cfg.URL, result)
        }
    }

    result.Status = "up"
    return hc.recordResult(cfg.Name, cfg.URL, result)
}

// recordResult stores result as the latest outcome for name, appends it to
// that check's history (trimmed to the newest 100 entries), and returns it
// unchanged. The url argument is accepted but not used.
func (hc *HealthChecker) recordResult(name, url string, result CheckResult) CheckResult {
    const maxHistory = 100

    hc.mu.Lock()
    defer hc.mu.Unlock()

    hc.results[name] = &result

    entries := append(hc.history[name], result)
    // Drop the oldest entries once the cap is exceeded.
    if excess := len(entries) - maxHistory; excess > 0 {
        entries = entries[excess:]
    }
    hc.history[name] = entries

    return result
}

// GetUptime reports the percentage (0-100) of recorded results for name that
// were "up" among those checked within the last window. It returns 0 when no
// result falls inside the window.
func (hc *HealthChecker) GetUptime(name string, window time.Duration) float64 {
    hc.mu.RLock()
    defer hc.mu.RUnlock()

    since := time.Now().Add(-window)
    var seen, healthy int
    entries := hc.history[name]
    for i := range entries {
        // Only results strictly newer than the cutoff count.
        if !entries[i].CheckedAt.After(since) {
            continue
        }
        seen++
        if entries[i].Status == "up" {
            healthy++
        }
    }

    if seen == 0 {
        return 0
    }
    return float64(healthy) / float64(seen) * 100
}

Prometheus Metrics Endpoint

Expose metrics in Prometheus format for scraping:

package main

import (
    "fmt"
    "net/http"
    "time"
    "myapp/monitor"
)

// metricsHandler returns an HTTP handler that runs every check in checks and
// writes the outcomes in the Prometheus text exposition format.
//
// Three gauges are written per check: video_platform_up (1 or 0),
// video_platform_latency_ms, and video_platform_uptime_24h (the percentage
// returned by checker.GetUptime for a 24-hour window).
func metricsHandler(checker *monitor.HealthChecker, checks []monitor.CheckConfig) http.HandlerFunc {
    // sample holds the values exported for one check.
    type sample struct {
        name, url string
        up        int
        latencyMs float64
        uptime24h float64
    }

    return func(w http.ResponseWriter, r *http.Request) {
        w.Header().Set("Content-Type", "text/plain; version=0.0.4")

        // Collect first, write afterwards: the text format allows a single
        // HELP/TYPE line per metric and requires all samples of a metric to
        // be contiguous, so output cannot be interleaved per check.
        samples := make([]sample, 0, len(checks))
        for _, cfg := range checks {
            result := checker.RunCheck(cfg)
            s := sample{
                name:      cfg.Name,
                url:       cfg.URL,
                latencyMs: float64(result.Latency.Milliseconds()),
                uptime24h: checker.GetUptime(cfg.Name, 24*time.Hour),
            }
            if result.Status == "up" {
                s.up = 1
            }
            samples = append(samples, s)
        }

        fmt.Fprint(w, "# HELP video_platform_up Whether the endpoint is up (1) or down (0)\n")
        fmt.Fprint(w, "# TYPE video_platform_up gauge\n")
        for _, s := range samples {
            fmt.Fprintf(w, "video_platform_up{name=%q,url=%q} %d\n", s.name, s.url, s.up)
        }

        fmt.Fprint(w, "# HELP video_platform_latency_ms Latency of the most recent check in milliseconds\n")
        fmt.Fprint(w, "# TYPE video_platform_latency_ms gauge\n")
        for _, s := range samples {
            fmt.Fprintf(w, "video_platform_latency_ms{name=%q} %.1f\n", s.name, s.latencyMs)
        }

        fmt.Fprint(w, "# HELP video_platform_uptime_24h Percentage of checks that were up in the last 24 hours\n")
        fmt.Fprint(w, "# TYPE video_platform_uptime_24h gauge\n")
        for _, s := range samples {
            fmt.Fprintf(w, "video_platform_uptime_24h{name=%q} %.2f\n", s.name, s.uptime24h)
        }
    }
}

// main starts the background check scheduler and serves /metrics and /health
// on :9090.
func main() {
    checker := monitor.NewHealthChecker()
    checks := monitor.DefaultChecks

    // Run checks on schedule
    go runScheduledChecks(checker, checks)

    http.HandleFunc("/metrics", metricsHandler(checker, checks))
    http.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) {
        w.Write([]byte("OK"))
    })

    fmt.Println("Monitor running on :9090")
    // ListenAndServe only returns on failure (for example when the port is
    // already in use). Surface the error and exit non-zero instead of
    // returning silently with a success status.
    if err := http.ListenAndServe(":9090", nil); err != nil {
        panic(fmt.Sprintf("monitor server failed: %v", err))
    }
}

// runScheduledChecks runs each check immediately and then again every
// cfg.Interval, printing an alert whenever a check is down. Checks run one at
// a time on the calling goroutine; the function only returns when checks is
// empty.
func runScheduledChecks(checker *monitor.HealthChecker, checks []monitor.CheckConfig) {
    // Used for checks that do not configure a positive Interval.
    const fallbackInterval = 5 * time.Minute

    if len(checks) == 0 {
        return
    }

    // due[i] is when checks[i] should run next; the zero time means "now".
    due := make([]time.Time, len(checks))
    for {
        var wake time.Time
        for i, cfg := range checks {
            if !time.Now().Before(due[i]) {
                result := checker.RunCheck(cfg)
                if result.Status == "down" {
                    fmt.Printf("ALERT: %s is DOWN - %s\n", result.Name, result.Error)
                }
                interval := cfg.Interval
                if interval <= 0 {
                    interval = fallbackInterval
                }
                due[i] = time.Now().Add(interval)
            }
            // Track the earliest upcoming run so the loop sleeps exactly
            // until the next check is due.
            if wake.IsZero() || due[i].Before(wake) {
                wake = due[i]
            }
        }
        time.Sleep(time.Until(wake))
    }
}

Scrape with Prometheus

Add to your prometheus.yml:

scrape_configs:
  - job_name: 'dailywatch'
    scrape_interval: 5m
    static_configs:
      - targets: ['localhost:9090']

This monitoring setup keeps us informed about the health of DailyWatch without any external dependencies. The single Go binary uses under 10MB of RAM and catches downtime within minutes. When an endpoint goes down, we see it in Grafana dashboards and can set up Alertmanager rules for Slack or email notifications.

This article is part of the Building DailyWatch series. Check out DailyWatch to see these techniques in action.