Self-Hosted Push Notifications Specification
Part 6: Monitoring, Debugging & Troubleshooting
Version: 1.0
Last Updated: October 2025
Prerequisites: Part 5: Advanced Features & Production Deployment
Author: Bunty9
License: MIT (Free to use and adapt)
Table of Contents
- Metrics & Monitoring
- Logging Best Practices
- Performance Profiling
- Health Checks
- Error Tracking
- Alerting Strategies
- Debugging Tools
- Common Issues & Solutions
- Incident Response
- Analytics & Reporting
Metrics & Monitoring
Key Metrics to Track
1. Delivery Metrics
-- Overall delivery rate (last 24 hours)
SELECT
status,
COUNT(*) as count,
ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage
FROM push_notification_logs
WHERE sent_at > NOW() - INTERVAL '24 hours'
GROUP BY status;
-- Expected result:
-- status | count | percentage
-- --------+-------+-----------
-- sent | 8,543 | 95.23
-- failed | 234 | 2.61
-- expired | 193 | 2.16
Target: >90% success rate
2. Active Subscriptions
-- Active subscriptions by user type
SELECT
CASE
WHEN user_id IS NOT NULL THEN 'users'
WHEN admin_id IS NOT NULL THEN 'admins'
END as type,
COUNT(*) as active_subscriptions,
COUNT(DISTINCT COALESCE(user_id, admin_id)) as unique_accounts,
ROUND(AVG(failure_count), 2) as avg_failure_count
FROM push_subscriptions
WHERE is_active = TRUE
GROUP BY type;
Target: average failure_count below 1.0
3. Device Distribution
-- Browser and OS distribution
SELECT
browser,
os,
device_type,
COUNT(*) as count
FROM push_subscriptions
WHERE is_active = TRUE
GROUP BY browser, os, device_type
ORDER BY count DESC;
4. Notification Volume
-- Notifications sent per hour (last 24 hours)
SELECT
DATE_TRUNC('hour', sent_at) as hour,
COUNT(*) as notifications_sent,
COUNT(DISTINCT subscription_id) as unique_devices,
COUNT(*) FILTER (WHERE status = 'sent') as successful,
COUNT(*) FILTER (WHERE status = 'failed') as failed
FROM push_notification_logs
WHERE sent_at > NOW() - INTERVAL '24 hours'
GROUP BY hour
ORDER BY hour DESC;
Prometheus Metrics
Go Backend Metrics
// metrics/push_metrics.go
package metrics

import (
    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promauto"
)

var (
    // Notification counters
    NotificationsSent = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "push_notifications_sent_total",
            Help: "Total number of push notifications sent",
        },
        []string{"status", "type"},
    )

    NotificationDuration = promauto.NewHistogramVec(
        prometheus.HistogramOpts{
            Name:    "push_notification_duration_seconds",
            Help:    "Duration of push notification sending",
            Buckets: prometheus.DefBuckets,
        },
        []string{"status"},
    )

    // Subscription metrics
    ActiveSubscriptions = promauto.NewGaugeVec(
        prometheus.GaugeOpts{
            Name: "push_subscriptions_active",
            Help: "Number of active push subscriptions",
        },
        []string{"user_type"},
    )

    SubscriptionFailures = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "push_subscription_failures_total",
            Help: "Total number of subscription failures",
        },
        []string{"reason"},
    )

    // Worker metrics
    WorkerRuns = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "push_worker_runs_total",
            Help: "Total number of worker executions",
        },
        []string{"worker", "status"},
    )

    WorkerDuration = promauto.NewHistogramVec(
        prometheus.HistogramOpts{
            Name:    "push_worker_duration_seconds",
            Help:    "Duration of worker execution",
            Buckets: prometheus.DefBuckets,
        },
        []string{"worker"},
    )
)

// Usage in PushService
func (ps *PushService) sendToSubscription(subscription models.PushSubscription, payload types.PushPayload) error {
    start := time.Now()

    err := ps.doSend(subscription, payload)

    status := "success"
    if err != nil {
        status = "error"
    }

    // Record metrics
    NotificationsSent.WithLabelValues(status, payload.Type).Inc()
    NotificationDuration.WithLabelValues(status).Observe(time.Since(start).Seconds())

    return err
}
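The worker metrics and the push_subscriptions_active gauge defined above still need to be recorded somewhere. A hedged sketch of how that wiring might look, kept in the same metrics package (the helper names, the periodic refresh approach, and the GORM handle are my assumptions, not code from earlier parts; "time" and "gorm.io/gorm" would be added to the import block):

// Still in package metrics — a sketch, not code from earlier parts.

// InstrumentWorkerRun wraps one worker execution and records the WorkerRuns
// counter and WorkerDuration histogram defined above.
func InstrumentWorkerRun(name string, run func() error) error {
    start := time.Now()
    err := run()

    status := "success"
    if err != nil {
        status = "error"
    }
    WorkerRuns.WithLabelValues(name, status).Inc()
    WorkerDuration.WithLabelValues(name).Observe(time.Since(start).Seconds())
    return err
}

// RefreshSubscriptionGauge keeps push_subscriptions_active in sync with the
// database so the Grafana panel and the drop alert later in this part have
// data. Call it once from main, e.g. go metrics.RefreshSubscriptionGauge(db, time.Minute)
func RefreshSubscriptionGauge(db *gorm.DB, interval time.Duration) {
    ticker := time.NewTicker(interval)
    defer ticker.Stop()

    for range ticker.C {
        var users, admins int64
        db.Table("push_subscriptions").Where("is_active = ? AND user_id IS NOT NULL", true).Count(&users)
        db.Table("push_subscriptions").Where("is_active = ? AND admin_id IS NOT NULL", true).Count(&admins)

        ActiveSubscriptions.WithLabelValues("users").Set(float64(users))
        ActiveSubscriptions.WithLabelValues("admins").Set(float64(admins))
    }
}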
Prometheus Configuration
# prometheus.yml
global:
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  - job_name: 'user-service'
    static_configs:
      - targets: ['user-service:8081']
    metrics_path: /metrics

  - job_name: 'admin-service'
    static_configs:
      - targets: ['admin-service:8082']
    metrics_path: /metrics
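These scrape targets assume each service actually exposes /metrics. If that route is not already registered, a minimal sketch using the gin router from earlier parts (the function name is mine):

import (
    "github.com/gin-gonic/gin"
    "github.com/prometheus/client_golang/prometheus/promhttp"
)

// registerMetricsRoute exposes every metric registered via promauto above.
func registerMetricsRoute(router *gin.Engine) {
    router.GET("/metrics", gin.WrapH(promhttp.Handler()))
}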
Grafana Dashboards
Push Notifications Dashboard
{
  "dashboard": {
    "title": "Push Notifications Monitoring",
    "panels": [
      {
        "title": "Delivery Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(push_notifications_sent_total{status=\"success\"}[5m])",
            "legendFormat": "Successful"
          },
          {
            "expr": "rate(push_notifications_sent_total{status=\"error\"}[5m])",
            "legendFormat": "Failed"
          }
        ]
      },
      {
        "title": "Active Subscriptions",
        "type": "stat",
        "targets": [
          {
            "expr": "sum(push_subscriptions_active)"
          }
        ]
      },
      {
        "title": "Notification Latency (p50, p95, p99)",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.50, rate(push_notification_duration_seconds_bucket[5m]))",
            "legendFormat": "p50"
          },
          {
            "expr": "histogram_quantile(0.95, rate(push_notification_duration_seconds_bucket[5m]))",
            "legendFormat": "p95"
          },
          {
            "expr": "histogram_quantile(0.99, rate(push_notification_duration_seconds_bucket[5m]))",
            "legendFormat": "p99"
          }
        ]
      },
      {
        "title": "Worker Execution Duration",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(push_worker_duration_seconds_sum[5m]) / rate(push_worker_duration_seconds_count[5m])",
            "legendFormat": "{{worker}}"
          }
        ]
      }
    ]
  }
}
Logging Best Practices
Structured Logging
// Use structured logging with logrus
import (
    "os"
    "time"

    log "github.com/sirupsen/logrus"
)

func init() {
    // Set JSON formatter for production
    log.SetFormatter(&log.JSONFormatter{})

    // Set log level from environment
    level, err := log.ParseLevel(os.Getenv("LOG_LEVEL"))
    if err != nil {
        level = log.InfoLevel
    }
    log.SetLevel(level)
}

// Structured log entries
func (ps *PushService) sendToSubscription(subscription models.PushSubscription, payload types.PushPayload) error {
    start := time.Now()

    log.WithFields(log.Fields{
        "subscription_id":   subscription.ID,
        "user_id":           subscription.UserID,
        "device_name":       subscription.DeviceName,
        "notification_type": payload.Type,
    }).Info("Sending push notification")

    err := ps.doSend(subscription, payload)
    if err != nil {
        log.WithFields(log.Fields{
            "subscription_id":   subscription.ID,
            "error":             err.Error(),
            "notification_type": payload.Type,
        }).Error("Failed to send push notification")
    } else {
        log.WithFields(log.Fields{
            "subscription_id":   subscription.ID,
            "notification_type": payload.Type,
            "duration_ms":       time.Since(start).Milliseconds(),
        }).Info("Push notification sent successfully")
    }

    return err
}
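The same fields-based style works well for HTTP access logs, so push API calls show up alongside the notification entries. A hedged gin middleware sketch (the field names are my own convention, not from earlier parts):

// middleware/request_logger.go — a sketch; adjust field names to your schema.
package middleware

import (
    "time"

    "github.com/gin-gonic/gin"
    log "github.com/sirupsen/logrus"
)

// RequestLogger emits one structured entry per HTTP request.
func RequestLogger() gin.HandlerFunc {
    return func(c *gin.Context) {
        start := time.Now()
        c.Next()

        log.WithFields(log.Fields{
            "method":      c.Request.Method,
            "path":        c.FullPath(),
            "status":      c.Writer.Status(),
            "duration_ms": time.Since(start).Milliseconds(),
            "client_ip":   c.ClientIP(),
        }).Info("HTTP request")
    }
}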
Log Aggregation
Fluentd Configuration:
# fluentd.conf
<source>
  @type tail
  path /var/log/user-service/*.log
  pos_file /var/log/td-agent/user-service.log.pos
  tag user-service
  <parse>
    @type json
    time_key time
    time_format %Y-%m-%dT%H:%M:%S.%NZ
  </parse>
</source>

<filter user-service>
  @type record_transformer
  <record>
    service ${tag}
    environment production
  </record>
</filter>

<match user-service>
  @type elasticsearch
  host elasticsearch
  port 9200
  logstash_format true
  logstash_prefix user-service
  include_tag_key true
  type_name _doc
</match>
Performance Profiling
Go pprof Integration
// Enable pprof endpoints
import (
    "log"
    "net/http"
    _ "net/http/pprof" // registers /debug/pprof handlers on the default mux
)

func main() {
    // Start pprof server on a separate port
    go func() {
        log.Println("pprof server listening on :6060")
        log.Println(http.ListenAndServe(":6060", nil))
    }()

    // Start main application...
}
Profiling Commands
# CPU profiling
go tool pprof http://localhost:6060/debug/pprof/profile?seconds=30
# Memory profiling
go tool pprof http://localhost:6060/debug/pprof/heap
# Goroutine profiling
go tool pprof http://localhost:6060/debug/pprof/goroutine
# View profile in web browser
go tool pprof -http=:8080 profile.pb.gz
Profiling Analysis
// Add custom profiling markers
import (
    "context"
    "runtime/trace"
)

func (ps *PushService) SendBatch(subscriptions []models.PushSubscription, payload types.PushPayload) error {
    ctx, task := trace.NewTask(context.Background(), "SendBatch")
    defer task.End()

    trace.Logf(ctx, "info", "Processing %d subscriptions", len(subscriptions))

    // Process batch...
    return nil
}
Health Checks
Comprehensive Health Endpoint
// controllers/health_controller.go
type HealthResponse struct {
    Status    string                 `json:"status"`
    Timestamp time.Time              `json:"timestamp"`
    Checks    map[string]HealthCheck `json:"checks"`
}

type HealthCheck struct {
    Status  string `json:"status"`
    Message string `json:"message,omitempty"`
}

func (hc *HealthController) CheckHealth(c *gin.Context) {
    response := HealthResponse{
        Timestamp: time.Now(),
        Checks:    make(map[string]HealthCheck),
    }

    // Check database connection
    sqlDB, err := hc.db.DB()
    if err != nil {
        response.Checks["database"] = HealthCheck{
            Status:  "unhealthy",
            Message: err.Error(),
        }
    } else {
        ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
        defer cancel()

        if err := sqlDB.PingContext(ctx); err != nil {
            response.Checks["database"] = HealthCheck{
                Status:  "unhealthy",
                Message: err.Error(),
            }
        } else {
            response.Checks["database"] = HealthCheck{Status: "healthy"}
        }
    }

    // Check VAPID configuration
    if os.Getenv("VAPID_PRIVATE_KEY") == "" || os.Getenv("VAPID_PUBLIC_KEY") == "" {
        response.Checks["vapid"] = HealthCheck{
            Status:  "unhealthy",
            Message: "VAPID keys not configured",
        }
    } else {
        response.Checks["vapid"] = HealthCheck{Status: "healthy"}
    }

    // Check active subscriptions
    var activeCount int64
    hc.db.Model(&models.PushSubscription{}).Where("is_active = ?", true).Count(&activeCount)
    response.Checks["subscriptions"] = HealthCheck{
        Status:  "healthy",
        Message: fmt.Sprintf("%d active subscriptions", activeCount),
    }

    // Check worker status (optional)
    if hc.workerStatus != nil && !hc.workerStatus.IsRunning() {
        response.Checks["worker"] = HealthCheck{
            Status:  "degraded",
            Message: "BookingReminderWorker not running",
        }
    } else {
        response.Checks["worker"] = HealthCheck{Status: "healthy"}
    }

    // Determine overall status. Only "unhealthy" checks return 503;
    // "degraded" is reported but keeps the endpoint at 200 so Kubernetes
    // does not restart pods that are intentionally not running the worker.
    overall := "healthy"
    for _, check := range response.Checks {
        if check.Status == "unhealthy" {
            overall = "unhealthy"
            break
        }
        if check.Status == "degraded" {
            overall = "degraded"
        }
    }

    response.Status = overall
    if overall == "unhealthy" {
        c.JSON(http.StatusServiceUnavailable, response)
        return
    }
    c.JSON(http.StatusOK, response)
}
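Wiring the handler into the router is a one-liner; a sketch assuming the gin setup from earlier parts (NewHealthController is a hypothetical constructor name):

// Route registration — constructor and arguments are assumptions.
healthController := NewHealthController(db, workerStatus)
router.GET("/health", healthController.CheckHealth)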
Kubernetes Liveness & Readiness
apiVersion: v1
kind: Pod
spec:
  containers:
    - name: user-service
      livenessProbe:
        httpGet:
          path: /health
          port: 8081
        initialDelaySeconds: 30
        periodSeconds: 10
        timeoutSeconds: 5
        failureThreshold: 3
      readinessProbe:
        httpGet:
          path: /health
          port: 8081
        initialDelaySeconds: 5
        periodSeconds: 5
        timeoutSeconds: 3
        failureThreshold: 3
Error Tracking
Sentry Integration
// Initialize Sentry
import (
    "log"
    "os"

    "github.com/getsentry/sentry-go"
)

func initSentry() {
    err := sentry.Init(sentry.ClientOptions{
        Dsn:         os.Getenv("SENTRY_DSN"),
        Environment: os.Getenv("ENV"),
        Release:     "user-service@1.0.0",
    })
    if err != nil {
        log.Fatalf("Sentry init failed: %v", err)
    }
}

// Capture errors
func (ps *PushService) sendToSubscription(subscription models.PushSubscription, payload types.PushPayload) error {
    err := ps.doSend(subscription, payload)
    if err != nil {
        // Report to Sentry
        sentry.WithScope(func(scope *sentry.Scope) {
            scope.SetContext("subscription", map[string]interface{}{
                "id":          subscription.ID,
                "device_name": subscription.DeviceName,
                "browser":     subscription.Browser,
            })
            scope.SetContext("notification", map[string]interface{}{
                "type":  payload.Type,
                "title": payload.Title,
            })
            sentry.CaptureException(err)
        })
    }

    return err
}
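One detail worth adding: sentry-go buffers events and sends them asynchronously, so flush on shutdown or errors reported just before a pod exits can be lost. A small sketch:

func main() {
    initSentry()

    // Give buffered events up to 2 seconds to reach Sentry before exit.
    defer sentry.Flush(2 * time.Second)

    // Capture panics from the main goroutine as well (runs before Flush).
    defer sentry.Recover()

    // Start the application...
}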
Alerting Strategies
Alert Rules
# prometheus-alerts.yml
groups:
  - name: push_notifications
    interval: 30s
    rules:
      # High failure rate
      - alert: PushNotificationFailureRateHigh
        expr: |
          (
            rate(push_notifications_sent_total{status="error"}[5m])
            /
            rate(push_notifications_sent_total[5m])
          ) > 0.10
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Push notification failure rate > 10%"
          description: "{{ $value | humanizePercentage }} of push notifications are failing"

      # No notifications sent
      - alert: PushNotificationsNotSending
        expr: |
          rate(push_notifications_sent_total[10m]) == 0
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "No push notifications sent in last 10 minutes"

      # Subscription count dropping
      - alert: ActiveSubscriptionsDropping
        expr: |
          (
            push_subscriptions_active - push_subscriptions_active offset 1h
          ) < -100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Active subscriptions dropped by {{ $value }}"

      # Worker not running
      - alert: BookingReminderWorkerDown
        expr: |
          push_worker_runs_total{worker="booking_reminder"} offset 10m
          ==
          push_worker_runs_total{worker="booking_reminder"}
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "BookingReminderWorker hasn't run in 10 minutes"

      # High latency
      - alert: PushNotificationLatencyHigh
        expr: |
          histogram_quantile(0.95, rate(push_notification_duration_seconds_bucket[5m])) > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Push notification p95 latency > 5s"
Alertmanager Configuration
# alertmanager.yml
global:
  resolve_timeout: 5m

route:
  group_by: ['alertname', 'severity']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'default'
  routes:
    - match:
        severity: critical
      receiver: 'pagerduty'
      continue: true
    - match:
        severity: warning
      receiver: 'slack'

receivers:
  - name: 'default'
    email_configs:
      - to: 'alerts@yourdomain.com'
  - name: 'slack'
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/YOUR/WEBHOOK/URL'
        channel: '#alerts'
        title: '{{ .GroupLabels.alertname }}'
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
  - name: 'pagerduty'
    pagerduty_configs:
      - service_key: 'YOUR_PAGERDUTY_KEY'
Debugging Tools
Database Query Debugging
-- Find subscriptions with high failure counts
SELECT
id,
COALESCE(user_id::TEXT, admin_id::TEXT) as owner_id,
device_name,
browser,
failure_count,
last_used_at,
created_at
FROM push_subscriptions
WHERE failure_count >= 2
ORDER BY failure_count DESC, last_used_at DESC
LIMIT 20;
-- Check recent failed notifications
SELECT
pnl.id,
pnl.title,
pnl.status,
pnl.error_message,
pnl.http_status_code,
pnl.sent_at,
ps.device_name,
ps.browser
FROM push_notification_logs pnl
LEFT JOIN push_subscriptions ps ON pnl.subscription_id = ps.id
WHERE pnl.status = 'failed'
AND pnl.sent_at > NOW() - INTERVAL '1 hour'
ORDER BY pnl.sent_at DESC
LIMIT 50;
-- Analyze notification volume by hour
SELECT
DATE_TRUNC('hour', sent_at) as hour,
COUNT(*) as total,
COUNT(*) FILTER (WHERE status = 'sent') as sent,
COUNT(*) FILTER (WHERE status = 'failed') as failed,
ROUND(COUNT(*) FILTER (WHERE status = 'sent')::NUMERIC * 100 / COUNT(*), 2) as success_rate
FROM push_notification_logs
WHERE sent_at > NOW() - INTERVAL '24 hours'
GROUP BY hour
ORDER BY hour DESC;
Frontend Debugging
// Enable verbose logging
localStorage.setItem('debug_push_notifications', 'true');

// Check subscription status
navigator.serviceWorker.ready.then(async reg => {
  const subscription = await reg.pushManager.getSubscription();
  console.log('Current subscription:', subscription);

  if (subscription) {
    console.log('Endpoint:', subscription.endpoint);
    console.log('Keys:', {
      p256dh: subscription.toJSON().keys.p256dh,
      auth: subscription.toJSON().keys.auth
    });
  }
});

// Monitor service worker messages
navigator.serviceWorker.addEventListener('message', event => {
  console.log('[SW Message]', event.data);
});

// Test push notification
async function testPushNotification() {
  const registration = await navigator.serviceWorker.ready;
  await registration.showNotification('Test Notification', {
    body: 'This is a test',
    icon: '/icon-192x192.png',
    data: { url: '/' }
  });
}
Common Issues & Solutions
Issue 1: Notifications Not Received
Symptoms:
- Backend logs show "sent" status
- User doesn't receive notification
Debugging:
// Browser console: check notification permission (should be "granted")
Notification.permission

// Browser console: check the service worker is active
navigator.serviceWorker.ready.then(reg => console.log(reg.active));

# Terminal: check the subscription exists in the backend
curl -X GET http://localhost:8081/api/push/devices \
  -H "Cookie: auth-token=YOUR_TOKEN"
Solutions:
- Permission denied: the user must grant permission in browser settings
- Service worker not registered: ensure sw.js exists at /public/sw.js
- Subscription expired: the user needs to resubscribe
Issue 2: High Failure Rate
Symptoms:
- >10% of notifications failing
- failure_count increasing for subscriptions
Debugging:
-- Check error messages
SELECT
error_message,
http_status_code,
COUNT(*) as occurrences
FROM push_notification_logs
WHERE status = 'failed'
AND sent_at > NOW() - INTERVAL '24 hours'
GROUP BY error_message, http_status_code
ORDER BY occurrences DESC;
Common Causes & Solutions:
| HTTP Status | Cause | Solution |
|---|---|---|
| 410 Gone | Subscription expired | Deactivate subscription (auto-handled) |
| 403 Forbidden | Invalid VAPID keys | Verify VAPID configuration |
| 400 Bad Request | Malformed payload | Check payload format |
| 429 Too Many Requests | Rate limit hit | Implement backoff |
| 500 Server Error | Push service issue | Retry with exponential backoff |
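For the retryable rows above (429 and 5xx), a bounded exponential backoff is usually enough. A minimal sketch, with sendOnce standing in for the actual webpush call from earlier parts (the helper and its status-code return are assumptions, not existing code):

import (
    "fmt"
    "net/http"
    "time"
)

// sendWithBackoff retries transient failures (429, 5xx) with exponential
// delays; other 4xx client errors are returned immediately.
func sendWithBackoff(sendOnce func() (int, error), maxAttempts int) error {
    delay := 500 * time.Millisecond

    var lastErr error
    for attempt := 1; attempt <= maxAttempts; attempt++ {
        status, err := sendOnce()
        if err == nil && status < 400 {
            return nil // delivered
        }
        if err == nil {
            err = fmt.Errorf("push service returned status %d", status)
        }
        lastErr = err

        // Only rate limits (429) and push-service errors (5xx) are retryable.
        if status != http.StatusTooManyRequests && status < 500 {
            return err
        }

        time.Sleep(delay)
        delay *= 2 // 0.5s, 1s, 2s, ...
    }
    return fmt.Errorf("push failed after %d attempts: %w", maxAttempts, lastErr)
}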
Issue 3: Worker Not Running
Symptoms:
- Reminder notifications not sent
- Worker metrics flat
Debugging:
# Check service logs
kubectl logs -f deployment/user-service | grep WorkerGoroutine
# Expected:
# [BookingReminderWorker] Started successfully
# [BookingReminderWorker] Processing 3 bookings
# Check if multiple instances are running
kubectl get pods -l app=user-service
Solutions:
- Restart the service: kubectl rollout restart deployment/user-service
- Check leader election: ensure only one instance runs the worker (see the sketch below)
- Verify database connection: the worker requires database access
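If the service runs with more than one replica, the simplest way to keep a single active worker is a Postgres advisory lock held by the leader. A hedged sketch (the lock key and function are my own; the *sql.DB can be obtained from GORM via db.DB(), as in the health check above):

import (
    "context"
    "database/sql"
)

// Arbitrary application-wide constant; every replica must use the same value.
const workerLockKey int64 = 421001

// tryAcquireWorkerLease returns a dedicated connection holding the advisory
// lock, or nil if another instance is already the leader. Keep the connection
// open for as long as this instance should run the BookingReminderWorker;
// advisory locks are per-session, so losing the connection releases leadership.
func tryAcquireWorkerLease(ctx context.Context, sqlDB *sql.DB) (*sql.Conn, error) {
    conn, err := sqlDB.Conn(ctx) // pin one connection for the lock's lifetime
    if err != nil {
        return nil, err
    }

    var acquired bool
    if err := conn.QueryRowContext(ctx, "SELECT pg_try_advisory_lock($1)", workerLockKey).Scan(&acquired); err != nil {
        conn.Close()
        return nil, err
    }
    if !acquired {
        conn.Close()
        return nil, nil // another replica holds the lock
    }
    return conn, nil
}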
Issue 4: Memory Leak
Symptoms:
- Memory usage increasing over time
- OOM kills in Kubernetes
Debugging:
# Memory profiling
go tool pprof http://localhost:6060/debug/pprof/heap
# Check goroutine leaks
go tool pprof http://localhost:6060/debug/pprof/goroutine
# Kubernetes memory usage
kubectl top pod -l app=user-service
Common Causes:
- Goroutine leaks: goroutines blocked on channels, HTTP calls, or connections that are never closed
- Unbounded caches: No cache eviction policy
- Large payloads: Storing too much in memory
Solutions:
// Use context with timeout
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
// Close database connections
defer db.Close()
// Use bounded channels
queue := make(chan *PushNotification, 1000)
Incident Response
Incident Response Playbook
1. Detection
- Alert fires in Prometheus/Alertmanager
- User reports issue
- Monitoring dashboard shows anomaly
2. Assessment
# Check service health
curl http://api.yourdomain.com/health
# Check recent errors
kubectl logs -f deployment/user-service --tail=100 | grep ERROR
# Check metrics
# Visit Grafana dashboard: http://grafana.yourdomain.com/d/push-notifications
3. Mitigation
High Failure Rate:
# Check VAPID keys
kubectl get secret vapid-keys -o jsonpath='{.data.public-key}' | base64 -d
# Restart services
kubectl rollout restart deployment/user-service
kubectl rollout restart deployment/admin-service
Database Connection Issues:
# Check database health
kubectl exec -it deployment/postgres -- psql -U push_service -c "SELECT 1;"
# Scale down workers temporarily
kubectl scale deployment/user-service --replicas=1
High Latency:
# Check active goroutines
curl http://localhost:6060/debug/pprof/goroutine?debug=1
# Scale up if needed
kubectl scale deployment/user-service --replicas=5
4. Communication
- Update status page
- Notify stakeholders
- Document in incident log
5. Resolution
- Apply fix
- Verify metrics returned to normal
- Write postmortem
Analytics & Reporting
Weekly Report Query
-- Weekly push notification report
WITH weekly_stats AS (
SELECT
DATE_TRUNC('week', sent_at) as week,
COUNT(*) as total_sent,
COUNT(*) FILTER (WHERE status = 'sent') as successful,
COUNT(*) FILTER (WHERE status = 'failed') as failed,
COUNT(DISTINCT subscription_id) as unique_devices,
COUNT(DISTINCT COALESCE(user_id, admin_id)) as unique_users
FROM push_notification_logs
WHERE sent_at > NOW() - INTERVAL '4 weeks'
GROUP BY week
)
SELECT
week,
total_sent,
successful,
failed,
ROUND(successful::NUMERIC * 100 / total_sent, 2) as success_rate,
unique_devices,
unique_users
FROM weekly_stats
ORDER BY week DESC;
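If the same numbers should be visible in an admin dashboard rather than psql, a read-only endpoint can reuse the query. A hedged sketch (AnalyticsController, its db field, and the route are my own; the query mirrors the report above):

type WeeklyStats struct {
    Week          time.Time `json:"week"`
    TotalSent     int64     `json:"total_sent"`
    Successful    int64     `json:"successful"`
    Failed        int64     `json:"failed"`
    UniqueDevices int64     `json:"unique_devices"`
}

func (ac *AnalyticsController) WeeklyReport(c *gin.Context) {
    var stats []WeeklyStats
    err := ac.db.Raw(`
        SELECT
            DATE_TRUNC('week', sent_at)               AS week,
            COUNT(*)                                  AS total_sent,
            COUNT(*) FILTER (WHERE status = 'sent')   AS successful,
            COUNT(*) FILTER (WHERE status = 'failed') AS failed,
            COUNT(DISTINCT subscription_id)           AS unique_devices
        FROM push_notification_logs
        WHERE sent_at > NOW() - INTERVAL '4 weeks'
        GROUP BY week
        ORDER BY week DESC`).Scan(&stats).Error
    if err != nil {
        c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to load report"})
        return
    }
    c.JSON(http.StatusOK, stats)
}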
Export to CSV
# Export last 7 days of notification data
psql -h localhost -U postgres -d ohhspaces_prod -c "
COPY (
SELECT
pnl.sent_at,
pnl.title,
pnl.status,
ps.device_name,
ps.browser,
ps.os
FROM push_notification_logs pnl
LEFT JOIN push_subscriptions ps ON pnl.subscription_id = ps.id
WHERE pnl.sent_at > NOW() - INTERVAL '7 days'
ORDER BY pnl.sent_at DESC
) TO STDOUT WITH CSV HEADER" > push_notifications_export.csv
Summary
You now have:
✅ Comprehensive metrics and monitoring
✅ Structured logging with aggregation
✅ Performance profiling tools
✅ Health check endpoints
✅ Error tracking integration
✅ Alerting rules and strategies
✅ Debugging tools and techniques
✅ Common issues with solutions
✅ Incident response playbook
✅ Analytics and reporting queries
Next Steps
➡️ Part 7: Best Practices, Security & Optimization
Part 7 will cover:
- Security best practices
- VAPID key management
- Input validation
- SQL injection prevention
- Performance optimization
- Code organization
- Testing strategies
- Documentation standards