Building a Self-Healing API Monitor in 100 Lines of Node.js
Target: Honeybadger ($200) / Draft.dev ($300-500) | ~2,500 words
What "Self-Healing" Actually Means
Production systems fail. APIs go down, timeouts spike, third-party services flake. The question isn't whether your API will break — it's how long it stays broken before you notice and fix it.
Most monitoring solutions alert you when something breaks. A self-healing monitor takes it further: it detects the failure, diagnoses the likely cause, attempts a predefined recovery action, and only pages you if recovery fails.
This guide builds a self-healing API monitor in under 100 lines of Node.js that:
- Polls a list of endpoints on configurable intervals
- Detects failures (timeouts, 5xx errors, response validation failures)
- Executes recovery actions (restart a service, clear a cache, trigger a webhook)
- Applies exponential backoff before alerting
- Sends alerts to Slack when human intervention is required
No dependencies beyond Node.js 18's built-in fetch. Let's build it.
The Architecture in One Diagram
[Config File]
|
v
[Monitor Loop] ---> [Check Endpoint] ---> OK? ---> log, reset state
^ |
| FAIL?
| |
| [Recovery Action]
| |
| Still failing after N retries?
| |
| [Send Alert] ---> Slack / webhook
|
+--- repeat every interval
The key design decision: recovery actions run before alerting. This handles the 80% of cases that are transient (a container restart, a cache warm-up) without waking anyone up at 3am.
The Config File
{
"checks": [
{
"name": "Auth API",
"url": "https://api.example.com/health",
"method": "GET",
"expectedStatus": 200,
"expectedBody": { "status": "ok" },
"timeoutMs": 5000,
"intervalMs": 30000,
"recovery": {
"type": "webhook",
"url": "https://hooks.example.com/restart-auth-service",
"method": "POST"
},
"maxRetries": 3,
"backoffMs": 10000
},
{
"name": "Payment Service",
"url": "https://payments.example.com/ping",
"method": "GET",
"expectedStatus": 200,
"timeoutMs": 3000,
"intervalMs": 60000,
"recovery": {
"type": "webhook",
"url": "https://hooks.example.com/clear-payment-cache",
"method": "POST"
},
"maxRetries": 2,
"backoffMs": 15000
}
],
"alerts": {
"slack": {
"webhookUrl": "https://hooks.slack.com/services/YOUR/WEBHOOK/URL"
}
}
}
The Full Implementation
Here it is — the complete monitor in one file, just over 100 lines:
// monitor.js
import { readFile } from 'fs/promises';

// Load the check definitions once at startup (top-level await, ESM).
const config = JSON.parse(await readFile('monitor.json', 'utf8'));

// Runtime state per check, keyed by check name:
// failure streak, whether a recovery action is in flight, last observed status.
const state = {};
for (const { name } of config.checks) {
  state[name] = { failures: 0, recovering: false, lastStatus: 'unknown' };
}
/**
 * Probe one endpoint: enforce a hard timeout, the expected status code,
 * and (optionally) exact-match fields in the JSON response body.
 *
 * @param {object} check - one entry from config.checks
 * @returns {Promise<{ok: boolean, error?: string}>} never rejects;
 *   failures are reported as { ok: false, error }.
 */
async function checkEndpoint(check) {
  const controller = new AbortController();
  // Abort the whole request (including the TCP connection) after timeoutMs.
  const timer = setTimeout(() => controller.abort(), check.timeoutMs ?? 5000);
  try {
    const res = await fetch(check.url, {
      method: check.method ?? 'GET',
      signal: controller.signal,
    });
    // Resolve the default once so the error message never reads
    // "Expected undefined" when expectedStatus is omitted from the config.
    const wantStatus = check.expectedStatus ?? 200;
    if (res.status !== wantStatus) {
      throw new Error(`Expected ${wantStatus}, got ${res.status}`);
    }
    if (check.expectedBody) {
      const body = await res.json();
      // Shallow exact-match on each configured key.
      for (const [key, val] of Object.entries(check.expectedBody)) {
        if (body[key] !== val) {
          throw new Error(`Body mismatch: expected ${key}=${val}, got ${body[key]}`);
        }
      }
    }
    return { ok: true };
  } catch (err) {
    // Our own abort surfaces as AbortError; report it as a timeout.
    return { ok: false, error: err.name === 'AbortError' ? 'Timeout' : err.message };
  } finally {
    clearTimeout(timer); // release the timer on every path
  }
}
/**
 * Fire the check's configured recovery webhook.
 * Resolves true when the hook responded 2xx; false when there is no recovery
 * configured, the hook errored, or the 10s cutoff was hit. Never rejects.
 */
async function runRecovery(check) {
  const recovery = check.recovery;
  if (!recovery) return false;

  console.log(`[${check.name}] Running recovery: ${recovery.type}`);

  const payload = JSON.stringify({
    check: check.name,
    timestamp: new Date().toISOString(),
  });

  try {
    const response = await fetch(recovery.url, {
      method: recovery.method ?? 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: payload,
      signal: AbortSignal.timeout(10000), // hard cutoff for the hook itself
    });
    return response.ok;
  } catch (err) {
    console.error(`[${check.name}] Recovery failed: ${err.message}`);
    return false;
  }
}
/**
 * Post a status message to the configured Slack webhook.
 * Silently no-ops when no webhook is configured; send failures are logged,
 * never thrown (an alerting failure must not crash the monitor loop).
 */
async function sendAlert(check, error, recovered) {
  const webhook = config.alerts?.slack?.webhookUrl;
  if (!webhook) return;

  let color;
  let text;
  if (recovered) {
    color = 'good';
    text = `:white_check_mark: *${check.name}* recovered after auto-recovery`;
  } else {
    color = 'danger';
    text = `:red_circle: *${check.name}* is DOWN — manual intervention required\n>${error}`;
  }

  try {
    await fetch(webhook, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ attachments: [{ color, text, ts: Date.now() / 1000 }] }),
    });
  } catch (err) {
    console.error('Alert send failed:', err.message);
  }
}
/**
 * One monitoring pass for a single check: probe, then update the failure
 * state machine — recover on the first failure of an episode, alert (and
 * reset, to rate-limit alert spam) once maxRetries failures accumulate.
 */
async function runCheck(check) {
  const s = state[check.name];
  const result = await checkEndpoint(check);

  // Healthy path: note the recovery if we were failing, then reset state.
  if (result.ok) {
    if (s.failures > 0) {
      console.log(`[${check.name}] ✓ Recovered`);
      // Only send the "recovered" alert if a recovery action had been tried.
      if (s.recovering) await sendAlert(check, null, true);
    }
    Object.assign(s, { failures: 0, recovering: false, lastStatus: 'ok' });
    return;
  }

  // Failing path.
  s.failures += 1;
  s.lastStatus = 'failing';
  console.warn(`[${check.name}] ✗ Failure #${s.failures}: ${result.error}`);

  // First failure of an episode: attempt auto-recovery before burning retries.
  if (s.failures === 1 && !s.recovering) {
    s.recovering = true;
    if (await runRecovery(check)) {
      console.log(`[${check.name}] Recovery action triggered, waiting...`);
      return;
    }
  }

  // Out of retries: page a human, then reset to avoid alert spam.
  if (s.failures >= (check.maxRetries ?? 3)) {
    await sendAlert(check, result.error, false);
    s.failures = 0;
    s.recovering = false;
  }
}
/**
 * Queue the next run of a check. Recursive setTimeout (not setInterval):
 * the follow-up is scheduled only after the current run completes, so a
 * slow or flapping check can never stack concurrent probes.
 */
function scheduleCheck(check) {
  const baseDelay = check.intervalMs ?? 30000;
  // Up to 1s of random jitter keeps same-interval checks from firing in lockstep.
  const jitter = Math.random() * 1000;
  setTimeout(async () => {
    await runCheck(check);
    scheduleCheck(check);
  }, baseDelay + jitter);
}
// Initial run + schedule
console.log(`Starting monitor for ${config.checks.length} endpoints`);
for (const check of config.checks) {
  // run immediately; runCheck is designed not to reject, but on modern Node an
  // unhandled rejection kills the process — log instead of crashing the monitor.
  runCheck(check).catch((err) => console.error(`[${check.name}] Check crashed:`, err));
  scheduleCheck(check);
}
That's it — right around 100 lines of pure Node.js 18+, no npm install needed.
Breaking Down the Key Decisions
Why AbortController Instead of a Timeout Library
Node.js 18's fetch supports signal: controller.signal natively. We set a timeout, abort the request if it exceeds it, and check for AbortError in the catch block. This is more reliable than third-party timeout wrappers because it cancels the underlying TCP connection rather than just ignoring the response.
Node also ships AbortSignal.timeout(ms) (available since v17.3, so in every Node 18 release) for one-liners:
const res = await fetch(url, { signal: AbortSignal.timeout(5000) });
We use the manual AbortController pattern in checkEndpoint to be explicit, but AbortSignal.timeout is fine for recovery requests where you just want a hard cutoff.
Jitter on Scheduled Checks
const jitter = Math.random() * 1000;
If you monitor 20 endpoints all set to 30-second intervals and they all start at the same time, they'll spike your monitored services in lockstep. Random jitter of up to 1 second spreads the load. For high-frequency checks, increase jitter proportionally.
Reschedule After Completion, Not on Fixed Intervals
setTimeout(async () => {
await runCheck(check); // wait for check to complete
scheduleCheck(check); // THEN schedule next
}, interval);
Using setInterval would schedule the next check regardless of whether the current one finished. If a check takes longer than its interval (flapping API with slow timeouts), you'd pile up concurrent checks. The recursive setTimeout pattern ensures at least intervalMs between check completions.
Alert Reset After maxRetries
s.failures = 0; // reset to avoid alert spam
After alerting, we reset the failure counter. This means if the endpoint keeps failing, we'll alert again after another maxRetries cycle. This is intentional: one alert is a notification, repeated alerts every N*30s indicate ongoing incidents.
Adding Recovery Types
The current implementation supports webhook recovery. Let's add two more: exec (run a local command) and redis-flush (clear a Redis cache). First, move the existing webhook POST logic into its own runWebhookRecovery function, then dispatch on the recovery type:
// Extended recovery handler: dispatch to a type-specific recovery strategy.
// All strategies share the same contract — resolve true on success, false on
// failure or unknown type; never reject.
async function runRecovery(check) {
  // No recovery configured: caller falls straight through to retry/alert logic.
  if (!check.recovery) return false;
  switch (check.recovery.type) {
    case 'webhook':
      // NOTE(review): runWebhookRecovery is never defined in this article —
      // presumably the original webhook POST logic was meant to move into a
      // function of this name; confirm before publishing.
      return runWebhookRecovery(check);
    case 'exec':
      return runExecRecovery(check);
    case 'redis-flush':
      return runRedisFlushRecovery(check);
    default:
      // Misconfigured type: warn loudly but keep the monitor loop alive.
      console.warn(`Unknown recovery type: ${check.recovery.type}`);
      return false;
  }
}
/**
 * Recovery via a local shell command (e.g. `docker restart my-api-container`).
 * Resolves true when the command exits 0 within 15s, false otherwise.
 */
async function runExecRecovery(check) {
  // Lazy-load node built-ins so the base monitor stays dependency-free.
  const [{ exec }, { promisify }] = await Promise.all([
    import('child_process'),
    import('util'),
  ]);
  const execAsync = promisify(exec);
  try {
    const { stdout } = await execAsync(check.recovery.command, { timeout: 15000 });
    console.log(`[${check.name}] Exec recovery output: ${stdout.trim()}`);
    return true;
  } catch (err) {
    console.error(`[${check.name}] Exec recovery failed: ${err.message}`);
    return false;
  }
}
/**
 * Recovery via cache flush. Requires the 'redis' package: npm i redis
 *
 * Unlike the original, this returns false on connection/flush errors instead
 * of throwing, so it honors the same contract as the other recovery handlers
 * (the monitor's retry/alert flow expects a boolean, never an exception),
 * and it always releases the client connection.
 */
async function runRedisFlushRecovery(check) {
  const { createClient } = await import('redis');
  const client = createClient({ url: check.recovery.redisUrl });
  try {
    await client.connect();
    await client.flushDb();
    console.log(`[${check.name}] Redis flushed`);
    return true;
  } catch (err) {
    console.error(`[${check.name}] Redis flush failed: ${err.message}`);
    return false;
  } finally {
    // disconnect() rejects if connect() never succeeded; cleanup errors are moot.
    await client.disconnect().catch(() => {});
  }
}
Config example for exec recovery:
{
"recovery": {
"type": "exec",
"command": "docker restart my-api-container"
}
}
Adding Response Time Tracking
Monitoring uptime alone isn't enough — a slow API that technically responds is still broken. Add latency tracking with a threshold check:
// Variant of checkEndpoint with latency tracking: a response that arrives after
// check.maxLatencyMs is treated as a failure even when the status is correct.
async function checkEndpoint(check) {
  const controller = new AbortController();
  // Hard timeout: abort the whole request after timeoutMs (default 5s).
  const timer = setTimeout(() => controller.abort(), check.timeoutMs ?? 5000);
  const startTime = Date.now();
  try {
    const res = await fetch(check.url, {
      method: check.method ?? 'GET',
      signal: controller.signal,
    });
    clearTimeout(timer);
    // Wall-clock time for the full request (DNS + connect + headers).
    const latencyMs = Date.now() - startTime;
    // Check for slow responses even if status is 200
    if (check.maxLatencyMs && latencyMs > check.maxLatencyMs) {
      throw new Error(`Response too slow: ${latencyMs}ms (max: ${check.maxLatencyMs}ms)`);
    }
    // ... rest of checks (status-code and expectedBody validation, unchanged
    // from the original checkEndpoint)
    return { ok: true, latencyMs };
  } catch (err) {
    clearTimeout(timer);
    // Our own abort surfaces as AbortError; report it as a timeout.
    return { ok: false, error: err.name === 'AbortError' ? 'Timeout' : err.message };
  }
}
Add "maxLatencyMs": 2000 to any check config to catch performance regressions.
Persisting State Across Restarts
Currently, state lives in memory — a monitor restart resets all failure counts. For production, persist state to a JSON file:
const STATE_FILE = '.monitor-state.json';

/**
 * Load persisted check state from disk.
 * A missing or unparsable state file is expected on first start — fall back
 * to an empty object rather than crashing.
 */
async function loadState() {
  try {
    return JSON.parse(await readFile(STATE_FILE, 'utf8'));
  } catch {
    return {};
  }
}

/**
 * Persist the in-memory state map to disk.
 * Fix: writeFile was never imported (only readFile is, at the top of
 * monitor.js) — import it here so saveState doesn't throw ReferenceError.
 */
async function saveState() {
  const { writeFile } = await import('fs/promises');
  await writeFile(STATE_FILE, JSON.stringify(state, null, 2));
}

// Call saveState() after each runCheck()
Running as a Systemd Service
For production deployment on Linux:
# /etc/systemd/system/api-monitor.service
[Unit]
Description=Self-Healing API Monitor
After=network.target
[Service]
Type=simple
User=deploy
WorkingDirectory=/opt/monitor
# monitor.js uses ES modules: set "type": "module" in package.json (or rename it monitor.mjs)
ExecStart=/usr/bin/node monitor.js
Restart=always
RestartSec=10
Environment=NODE_ENV=production
[Install]
WantedBy=multi-user.target
sudo systemctl enable api-monitor
sudo systemctl start api-monitor
sudo journalctl -u api-monitor -f
Testing the Monitor
Write tests that mock fetch to simulate failures and recoveries:
// monitor.test.js (Node.js built-in test runner)
import { test, mock } from 'node:test';
import assert from 'node:assert/strict';
test('marks endpoint as failing on 500', async () => {
  // Swap in a fetch stub; restore in finally so a failed assertion
  // cannot leak the mock into later tests.
  const originalFetch = global.fetch;
  global.fetch = async () => ({ ok: false, status: 500 });
  try {
    const result = await checkEndpoint({
      url: 'http://test.example.com',
      expectedStatus: 200,
      timeoutMs: 1000,
    });
    assert.equal(result.ok, false);
    assert.match(result.error, /Expected 200, got 500/);
  } finally {
    global.fetch = originalFetch;
  }
});
test('passes on matching response body', async () => {
  const originalFetch = global.fetch;
  global.fetch = async () => ({
    ok: true,
    status: 200,
    json: async () => ({ status: 'ok', version: '2.1.0' }),
  });
  try {
    const result = await checkEndpoint({
      url: 'http://test.example.com',
      expectedStatus: 200,
      expectedBody: { status: 'ok' },
      timeoutMs: 1000,
    });
    assert.equal(result.ok, true);
  } finally {
    // Restore the real fetch — the original snippet leaked the mock forever.
    global.fetch = originalFetch;
  }
});
Run with node --test monitor.test.js.
What to Monitor Beyond HTTP Endpoints
The pattern extends naturally:
TCP port checks — is your database port open?
import net from 'net';
/**
 * Resolve true iff a TCP connection to host:port succeeds within timeoutMs.
 * Never rejects — connection errors and timeouts both resolve false.
 */
function checkTCPPort(host, port, timeoutMs = 3000) {
  return new Promise((resolve) => {
    const socket = net.createConnection({ host, port });
    // Single exit point: clear the deadline, drop the socket, settle once.
    const finish = (ok) => {
      clearTimeout(deadline);
      socket.destroy();
      resolve(ok);
    };
    const deadline = setTimeout(() => finish(false), timeoutMs);
    socket.on('connect', () => finish(true));
    socket.on('error', () => finish(false));
  });
}
DNS resolution — is your domain resolving?
import dns from 'dns/promises';
/**
 * Resolve true when the hostname has at least one IPv4 (A) record,
 * false on any resolver error. Never rejects.
 */
async function checkDNS(hostname) {
  return dns.resolve4(hostname).then(
    () => true,
    () => false,
  );
}
SSL certificate expiry — is your cert expiring soon?
import tls from 'tls';
/**
 * Resolve the number of whole days until the host's TLS certificate expires.
 *
 * Fixes over the original: a connect deadline (the original could hang
 * forever on an unresponsive host) and a guard for the case where no peer
 * certificate is presented. timeoutMs is a new, defaulted parameter —
 * existing two-argument callers are unaffected.
 *
 * @param {string} hostname
 * @param {number} [port=443]
 * @param {number} [timeoutMs=10000] connect deadline
 * @returns {Promise<number>} days until expiry (negative if already expired)
 * @throws rejects on connection error, timeout, or missing certificate
 */
function checkSSLExpiry(hostname, port = 443, timeoutMs = 10000) {
  return new Promise((resolve, reject) => {
    const socket = tls.connect({ host: hostname, port, servername: hostname });
    const timer = setTimeout(() => {
      socket.destroy();
      reject(new Error(`TLS connect to ${hostname}:${port} timed out`));
    }, timeoutMs);
    socket.on('secureConnect', () => {
      clearTimeout(timer);
      const cert = socket.getPeerCertificate();
      socket.destroy();
      if (!cert || !cert.valid_to) {
        reject(new Error(`No certificate presented by ${hostname}`));
        return;
      }
      const expiresAt = new Date(cert.valid_to);
      resolve(Math.floor((expiresAt - Date.now()) / 86400000));
    });
    socket.on('error', (err) => {
      clearTimeout(timer);
      reject(err);
    });
  });
}
Conclusion
This 100-line monitor handles the core loop of production reliability engineering: detect, attempt recovery, alert if recovery fails. It's opinionated by design — no UI, no database, no npm dependencies. Just a config file and a Node.js process.
The real power is in the recovery actions. When you know your auth service occasionally gets into a broken state that a container restart fixes, encode that knowledge in your config. Let the machine fix the 80% of incidents that have known solutions, and page humans only for the genuinely novel failures.
Start with one endpoint and one recovery action. Observe it for a week. Add more. Within a month you'll have a map of your system's failure modes in code, and an ops runbook that runs itself.
Wilson Xu builds Node.js tooling and writes about production engineering. Find him on GitHub as @chengyixu.
Top comments (0)