Building a Self-Healing API Monitor in 100 Lines of Node.js
Target: Honeybadger ($200) / Draft.dev ($300-500) | ~2,500 words
What "Self-Healing" Actually Means
Production systems fail. APIs go down, timeouts spike, third-party services flake. The question isn't whether your API will break — it's how long it stays broken before you notice and fix it.
Most monitoring solutions alert you when something breaks. A self-healing monitor takes it further: it detects the failure, diagnoses the likely cause, attempts a predefined recovery action, and only pages you if recovery fails.
This guide builds a self-healing API monitor in under 100 lines of Node.js that:
- Polls a list of endpoints on configurable intervals
- Detects failures (timeouts, 5xx errors, response validation failures)
- Executes recovery actions (restart a service, clear a cache, trigger a webhook)
- Applies exponential backoff before alerting
- Sends alerts to Slack when human intervention is required
No dependencies beyond Node.js 18's built-in fetch. Let's build it.
The Architecture in One Diagram
[Config File]
|
v
[Monitor Loop] ---> [Check Endpoint] ---> OK? ---> log, reset state
^ |
| FAIL?
| |
| [Recovery Action]
| |
| Still failing after N retries?
| |
| [Send Alert] ---> Slack / webhook
|
+--- repeat every interval
The key design decision: recovery actions run before alerting. This handles the 80% of cases that are transient (a container restart, a cache warm-up) without waking anyone up at 3am.
The Config File
{
"checks": [
{
"name": "Auth API",
"url": "https://api.example.com/health",
"method": "GET",
"expectedStatus": 200,
"expectedBody": { "status": "ok" },
"timeoutMs": 5000,
"intervalMs": 30000,
"recovery": {
"type": "webhook",
"url": "https://hooks.example.com/restart-auth-service",
"method": "POST"
},
"maxRetries": 3,
"backoffMs": 10000
},
{
"name": "Payment Service",
"url": "https://payments.example.com/ping",
"method": "GET",
"expectedStatus": 200,
"timeoutMs": 3000,
"intervalMs": 60000,
"recovery": {
"type": "webhook",
"url": "https://hooks.example.com/clear-payment-cache",
"method": "POST"
},
"maxRetries": 2,
"backoffMs": 15000
}
],
"alerts": {
"slack": {
"webhookUrl": "https://hooks.slack.com/services/YOUR/WEBHOOK/URL"
}
}
}
The Full Implementation
Here it is — the complete monitor in one file, just over 100 lines:
// monitor.js
import { readFile } from 'fs/promises';

// Load the check definitions once at startup (top-level await, ESM).
const config = JSON.parse(await readFile('monitor.json', 'utf8'));

// Runtime state per check, keyed by check name:
// failure streak, whether a recovery action is in flight, last observed status.
const state = {};
for (const { name } of config.checks) {
  state[name] = { failures: 0, recovering: false, lastStatus: 'unknown' };
}
/**
 * Probe one endpoint: enforce a hard timeout, the expected status code,
 * and (optionally) exact-match fields in the JSON response body.
 *
 * @param {object} check - one entry from config.checks
 * @returns {Promise<{ok: boolean, error?: string}>} never rejects;
 *   failures are reported as { ok: false, error }.
 */
async function checkEndpoint(check) {
  const controller = new AbortController();
  // Abort the whole request (including the TCP connection) after timeoutMs.
  const timer = setTimeout(() => controller.abort(), check.timeoutMs ?? 5000);
  try {
    const res = await fetch(check.url, {
      method: check.method ?? 'GET',
      signal: controller.signal,
    });
    // Resolve the default once so the error message never reads
    // "Expected undefined" when expectedStatus is omitted from the config.
    const wantStatus = check.expectedStatus ?? 200;
    if (res.status !== wantStatus) {
      throw new Error(`Expected ${wantStatus}, got ${res.status}`);
    }
    if (check.expectedBody) {
      const body = await res.json();
      // Shallow exact-match on each configured key.
      for (const [key, val] of Object.entries(check.expectedBody)) {
        if (body[key] !== val) {
          throw new Error(`Body mismatch: expected ${key}=${val}, got ${body[key]}`);
        }
      }
    }
    return { ok: true };
  } catch (err) {
    // Our own abort surfaces as AbortError; report it as a timeout.
    return { ok: false, error: err.name === 'AbortError' ? 'Timeout' : err.message };
  } finally {
    clearTimeout(timer); // release the timer on every path
  }
}
/**
 * Fire the check's configured recovery webhook.
 * Resolves true when the hook responded 2xx; false when there is no recovery
 * configured, the hook errored, or the 10s cutoff was hit. Never rejects.
 */
async function runRecovery(check) {
  const recovery = check.recovery;
  if (!recovery) return false;

  console.log(`[${check.name}] Running recovery: ${recovery.type}`);

  const payload = JSON.stringify({
    check: check.name,
    timestamp: new Date().toISOString(),
  });

  try {
    const response = await fetch(recovery.url, {
      method: recovery.method ?? 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: payload,
      signal: AbortSignal.timeout(10000), // hard cutoff for the hook itself
    });
    return response.ok;
  } catch (err) {
    console.error(`[${check.name}] Recovery failed: ${err.message}`);
    return false;
  }
}
/**
 * Post a status message to the configured Slack webhook.
 * Silently no-ops when no webhook is configured; send failures are logged,
 * never thrown (an alerting failure must not crash the monitor loop).
 */
async function sendAlert(check, error, recovered) {
  const webhook = config.alerts?.slack?.webhookUrl;
  if (!webhook) return;

  let color;
  let text;
  if (recovered) {
    color = 'good';
    text = `:white_check_mark: *${check.name}* recovered after auto-recovery`;
  } else {
    color = 'danger';
    text = `:red_circle: *${check.name}* is DOWN — manual intervention required\n>${error}`;
  }

  try {
    await fetch(webhook, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ attachments: [{ color, text, ts: Date.now() / 1000 }] }),
    });
  } catch (err) {
    console.error('Alert send failed:', err.message);
  }
}
/**
 * One monitoring pass for a single check: probe, then update the failure
 * state machine — recover on the first failure of an episode, alert (and
 * reset, to rate-limit alert spam) once maxRetries failures accumulate.
 */
async function runCheck(check) {
  const s = state[check.name];
  const result = await checkEndpoint(check);

  // Healthy path: note the recovery if we were failing, then reset state.
  if (result.ok) {
    if (s.failures > 0) {
      console.log(`[${check.name}] ✓ Recovered`);
      // Only send the "recovered" alert if a recovery action had been tried.
      if (s.recovering) await sendAlert(check, null, true);
    }
    Object.assign(s, { failures: 0, recovering: false, lastStatus: 'ok' });
    return;
  }

  // Failing path.
  s.failures += 1;
  s.lastStatus = 'failing';
  console.warn(`[${check.name}] ✗ Failure #${s.failures}: ${result.error}`);

  // First failure of an episode: attempt auto-recovery before burning retries.
  if (s.failures === 1 && !s.recovering) {
    s.recovering = true;
    if (await runRecovery(check)) {
      console.log(`[${check.name}] Recovery action triggered, waiting...`);
      return;
    }
  }

  // Out of retries: page a human, then reset to avoid alert spam.
  if (s.failures >= (check.maxRetries ?? 3)) {
    await sendAlert(check, result.error, false);
    s.failures = 0;
    s.recovering = false;
  }
}
/**
 * Queue the next run of a check. Recursive setTimeout (not setInterval):
 * the follow-up is scheduled only after the current run completes, so a
 * slow or flapping check can never stack concurrent probes.
 */
function scheduleCheck(check) {
  const baseDelay = check.intervalMs ?? 30000;
  // Up to 1s of random jitter keeps same-interval checks from firing in lockstep.
  const jitter = Math.random() * 1000;
  setTimeout(async () => {
    await runCheck(check);
    scheduleCheck(check);
  }, baseDelay + jitter);
}
// Initial run + schedule
console.log(`Starting monitor for ${config.checks.length} endpoints`);
for (const check of config.checks) {
  // run immediately; runCheck is designed not to reject, but on modern Node an
  // unhandled rejection kills the process — log instead of crashing the monitor.
  runCheck(check).catch((err) => console.error(`[${check.name}] Check crashed:`, err));
  scheduleCheck(check);
}
That's it — right around 100 lines of pure Node.js 18+, no npm install needed.
Breaking Down the Key Decisions
Why AbortController Instead of a Timeout Library
Node.js 18's fetch supports signal: controller.signal natively. We set a timeout, abort the request if it exceeds it, and check for AbortError in the catch block. This is more reliable than third-party timeout wrappers because it cancels the underlying TCP connection rather than just ignoring the response.
Node also ships AbortSignal.timeout(ms) (available since v17.3, so in every Node 18 release) for one-liners:
const res = await fetch(url, { signal: AbortSignal.timeout(5000) });
We use the manual AbortController pattern in checkEndpoint to be explicit, but AbortSignal.timeout is fine for recovery requests where you just want a hard cutoff.
Jitter on Scheduled Checks
const jitter = Math.random() * 1000;
If you monitor 20 endpoints all set to 30-second intervals and they all start at the same time, they'll spike your monitored services in lockstep. Random jitter of up to 1 second spreads the load. For high-frequency checks, increase jitter proportionally.
Reschedule After Completion, Not on Fixed Intervals
setTimeout(async () => {
await runCheck(check); // wait for check to complete
scheduleCheck(check); // THEN schedule next
}, interval);
Using setInterval would schedule the next check regardless of whether the current one finished. If a check takes longer than its interval (flapping API with slow timeouts), you'd pile up concurrent checks. The recursive setTimeout pattern ensures at least intervalMs between check completions.
Alert Reset After maxRetries
s.failures = 0; // reset to avoid alert spam
After alerting, we reset the failure counter. This means if the endpoint keeps failing, we'll alert again after another maxRetries cycle. This is intentional: one alert is a notification, repeated alerts every N*30s indicate ongoing incidents.
Adding Recovery Types
The current implementation supports webhook recovery. Let's add two more: exec (run a local command) and redis-flush (clear a Redis cache). First, move the existing webhook POST logic into its own runWebhookRecovery function, then dispatch on the recovery type:
// Extended recovery handler: dispatch to a type-specific recovery strategy.
// All strategies share the same contract — resolve true on success, false on
// failure or unknown type; never reject.
async function runRecovery(check) {
  // No recovery configured: caller falls straight through to retry/alert logic.
  if (!check.recovery) return false;
  switch (check.recovery.type) {
    case 'webhook':
      // NOTE(review): runWebhookRecovery is never defined in this article —
      // presumably the original webhook POST logic was meant to move into a
      // function of this name; confirm before publishing.
      return runWebhookRecovery(check);
    case 'exec':
      return runExecRecovery(check);
    case 'redis-flush':
      return runRedisFlushRecovery(check);
    default:
      // Misconfigured type: warn loudly but keep the monitor loop alive.
      console.warn(`Unknown recovery type: ${check.recovery.type}`);
      return false;
  }
}
/**
 * Recovery via a local shell command (e.g. `docker restart my-api-container`).
 * Resolves true when the command exits 0 within 15s, false otherwise.
 */
async function runExecRecovery(check) {
  // Lazy-load node built-ins so the base monitor stays dependency-free.
  const [{ exec }, { promisify }] = await Promise.all([
    import('child_process'),
    import('util'),
  ]);
  const execAsync = promisify(exec);
  try {
    const { stdout } = await execAsync(check.recovery.command, { timeout: 15000 });
    console.log(`[${check.name}] Exec recovery output: ${stdout.trim()}`);
    return true;
  } catch (err) {
    console.error(`[${check.name}] Exec recovery failed: ${err.message}`);
    return false;
  }
}
/**
 * Recovery via cache flush. Requires the 'redis' package: npm i redis
 *
 * Unlike the original, this returns false on connection/flush errors instead
 * of throwing, so it honors the same contract as the other recovery handlers
 * (the monitor's retry/alert flow expects a boolean, never an exception),
 * and it always releases the client connection.
 */
async function runRedisFlushRecovery(check) {
  const { createClient } = await import('redis');
  const client = createClient({ url: check.recovery.redisUrl });
  try {
    await client.connect();
    await client.flushDb();
    console.log(`[${check.name}] Redis flushed`);
    return true;
  } catch (err) {
    console.error(`[${check.name}] Redis flush failed: ${err.message}`);
    return false;
  } finally {
    // disconnect() rejects if connect() never succeeded; cleanup errors are moot.
    await client.disconnect().catch(() => {});
  }
}
Config example for exec recovery:
{
"recovery": {
"type": "exec",
"command": "docker restart my-api-container"
}
}
Adding Response Time Tracking
Monitoring uptime alone isn't enough — a slow API that technically responds is still broken. Add latency tracking with a threshold check:
// Variant of checkEndpoint with latency tracking: a response that arrives after
// check.maxLatencyMs is treated as a failure even when the status is correct.
async function checkEndpoint(check) {
  const controller = new AbortController();
  // Hard timeout: abort the whole request after timeoutMs (default 5s).
  const timer = setTimeout(() => controller.abort(), check.timeoutMs ?? 5000);
  const startTime = Date.now();
  try {
    const res = await fetch(check.url, {
      method: check.method ?? 'GET',
      signal: controller.signal,
    });
    clearTimeout(timer);
    // Wall-clock time for the full request (DNS + connect + headers).
    const latencyMs = Date.now() - startTime;
    // Check for slow responses even if status is 200
    if (check.maxLatencyMs && latencyMs > check.maxLatencyMs) {
      throw new Error(`Response too slow: ${latencyMs}ms (max: ${check.maxLatencyMs}ms)`);
    }
    // ... rest of checks (status-code and expectedBody validation, unchanged
    // from the original checkEndpoint)
    return { ok: true, latencyMs };
  } catch (err) {
    clearTimeout(timer);
    // Our own abort surfaces as AbortError; report it as a timeout.
    return { ok: false, error: err.name === 'AbortError' ? 'Timeout' : err.message };
  }
}
Add "maxLatencyMs": 2000 to any check config to catch performance regressions.
Persisting State Across Restarts
Currently, state lives in memory — a monitor restart resets all failure counts. For production, persist state to a JSON file:
const STATE_FILE = '.monitor-state.json';

/**
 * Load persisted check state from disk.
 * A missing or unparsable state file is expected on first start — fall back
 * to an empty object rather than crashing.
 */
async function loadState() {
  try {
    return JSON.parse(await readFile(STATE_FILE, 'utf8'));
  } catch {
    return {};
  }
}

/**
 * Persist the in-memory state map to disk.
 * Fix: writeFile was never imported (only readFile is, at the top of
 * monitor.js) — import it here so saveState doesn't throw ReferenceError.
 */
async function saveState() {
  const { writeFile } = await import('fs/promises');
  await writeFile(STATE_FILE, JSON.stringify(state, null, 2));
}

// Call saveState() after each runCheck()
Running as a Systemd Service
For production deployment on Linux:
# /etc/systemd/system/api-monitor.service
[Unit]
Description=Self-Healing API Monitor
After=network.target
[Service]
Type=simple
User=deploy
WorkingDirectory=/opt/monitor
# monitor.js uses ES modules: set "type": "module" in package.json (or rename it monitor.mjs)
ExecStart=/usr/bin/node monitor.js
Restart=always
RestartSec=10
Environment=NODE_ENV=production
[Install]
WantedBy=multi-user.target
sudo systemctl enable api-monitor
sudo systemctl start api-monitor
sudo journalctl -u api-monitor -f
Testing the Monitor
Write tests that mock fetch to simulate failures and recoveries:
// monitor.test.js (Node.js built-in test runner)
import { test, mock } from 'node:test';
import assert from 'node:assert/strict';
test('marks endpoint as failing on 500', async () => {
  // Swap in a fetch stub; restore in finally so a failed assertion
  // cannot leak the mock into later tests.
  const originalFetch = global.fetch;
  global.fetch = async () => ({ ok: false, status: 500 });
  try {
    const result = await checkEndpoint({
      url: 'http://test.example.com',
      expectedStatus: 200,
      timeoutMs: 1000,
    });
    assert.equal(result.ok, false);
    assert.match(result.error, /Expected 200, got 500/);
  } finally {
    global.fetch = originalFetch;
  }
});
test('passes on matching response body', async () => {
  const originalFetch = global.fetch;
  global.fetch = async () => ({
    ok: true,
    status: 200,
    json: async () => ({ status: 'ok', version: '2.1.0' }),
  });
  try {
    const result = await checkEndpoint({
      url: 'http://test.example.com',
      expectedStatus: 200,
      expectedBody: { status: 'ok' },
      timeoutMs: 1000,
    });
    assert.equal(result.ok, true);
  } finally {
    // Restore the real fetch — the original snippet leaked the mock forever.
    global.fetch = originalFetch;
  }
});
Run with node --test monitor.test.js.
What to Monitor Beyond HTTP Endpoints
The pattern extends naturally:
TCP port checks — is your database port open?
import net from 'net';
/**
 * Resolve true iff a TCP connection to host:port succeeds within timeoutMs.
 * Never rejects — connection errors and timeouts both resolve false.
 */
function checkTCPPort(host, port, timeoutMs = 3000) {
  return new Promise((resolve) => {
    const socket = net.createConnection({ host, port });
    // Single exit point: clear the deadline, drop the socket, settle once.
    const finish = (ok) => {
      clearTimeout(deadline);
      socket.destroy();
      resolve(ok);
    };
    const deadline = setTimeout(() => finish(false), timeoutMs);
    socket.on('connect', () => finish(true));
    socket.on('error', () => finish(false));
  });
}
DNS resolution — is your domain resolving?
import dns from 'dns/promises';
/**
 * Resolve true when the hostname has at least one IPv4 (A) record,
 * false on any resolver error. Never rejects.
 */
async function checkDNS(hostname) {
  return dns.resolve4(hostname).then(
    () => true,
    () => false,
  );
}
SSL certificate expiry — is your cert expiring soon?
import tls from 'tls';
/**
 * Resolve the number of whole days until the host's TLS certificate expires.
 *
 * Fixes over the original: a connect deadline (the original could hang
 * forever on an unresponsive host) and a guard for the case where no peer
 * certificate is presented. timeoutMs is a new, defaulted parameter —
 * existing two-argument callers are unaffected.
 *
 * @param {string} hostname
 * @param {number} [port=443]
 * @param {number} [timeoutMs=10000] connect deadline
 * @returns {Promise<number>} days until expiry (negative if already expired)
 * @throws rejects on connection error, timeout, or missing certificate
 */
function checkSSLExpiry(hostname, port = 443, timeoutMs = 10000) {
  return new Promise((resolve, reject) => {
    const socket = tls.connect({ host: hostname, port, servername: hostname });
    const timer = setTimeout(() => {
      socket.destroy();
      reject(new Error(`TLS connect to ${hostname}:${port} timed out`));
    }, timeoutMs);
    socket.on('secureConnect', () => {
      clearTimeout(timer);
      const cert = socket.getPeerCertificate();
      socket.destroy();
      if (!cert || !cert.valid_to) {
        reject(new Error(`No certificate presented by ${hostname}`));
        return;
      }
      const expiresAt = new Date(cert.valid_to);
      resolve(Math.floor((expiresAt - Date.now()) / 86400000));
    });
    socket.on('error', (err) => {
      clearTimeout(timer);
      reject(err);
    });
  });
}
Conclusion
This 100-line monitor handles the core loop of production reliability engineering: detect, attempt recovery, alert if recovery fails. It's opinionated by design — no UI, no database, no npm dependencies. Just a config file and a Node.js process.
The real power is in the recovery actions. When you know your auth service occasionally gets into a broken state that a container restart fixes, encode that knowledge in your config. Let the machine fix the 80% of incidents that have known solutions, and page humans only for the genuinely novel failures.
Start with one endpoint and one recovery action. Observe it for a week. Add more. Within a month you'll have a map of your system's failure modes in code, and an ops runbook that runs itself.
Wilson Xu builds Node.js tooling and writes about production engineering. Find him on GitHub as @chengyixu.
Top comments (0)