How to Detect Bot Traffic by IP Address (JavaScript)

#security #webdev #javascript #tutorial

If you run a website or API, bot traffic is a fact of life. Scrapers, credential stuffers, vulnerability scanners — they all hit your endpoints and skew your analytics. The good news: most bots are easy to detect with a few IP-based heuristics.

In this tutorial, we'll build a bot detection middleware in Node.js that analyzes incoming requests using IP geolocation, ASN data, and behavioral patterns — all powered by free APIs.

What Makes Bot Traffic Different?

Bots leave fingerprints that humans don't:

Datacenter IPs — Real users browse from ISPs like Comcast or Vodafone, not AWS or DigitalOcean
High request rates — 100 requests per minute from one IP is not human behavior
Missing headers — Bots often skip Accept-Language, Referer, or send suspicious user agents
Geographic anomalies — Traffic from countries where you have no users

The Detection Script

Here's a complete bot detection system in roughly 120 lines:

const http = require('http');

// Request history used by the rate limiter; lives only in this process's memory
const requestCounts = new Map();

// Lowercase substrings that mark an ISP/organization name as a
// datacenter or hosting provider
const DATACENTER_KEYWORDS = [
  'amazon',
  'aws',
  'google cloud',
  'microsoft azure',
  'digitalocean',
  'linode',
  'vultr',
  'hetzner',
  'ovh',
  'cloudflare',
  'oracle cloud',
  'alibaba',
  'tencent',
  'scaleway',
  'contabo'
];

/**
 * Fetch geolocation/network metadata for an IP address from the Frostbyte API.
 *
 * @param {string} ip - Address to look up. The value is URL-encoded before it
 *   is placed in the request path, because callers may pass text taken from a
 *   client-supplied header.
 * @returns {Promise<object>} The parsed JSON body of the response.
 * @throws {Error} When the API answers with a non-2xx status. Network and
 *   JSON-parsing failures from `fetch` propagate unchanged.
 */
async function getIPInfo(ip) {
  const res = await fetch(
    `https://api.frostbyte.cc/api/geo/${encodeURIComponent(ip)}`,
    { headers: { 'X-API-Key': 'YOUR_API_KEY' } }
  );

  // An error response (bad key, exhausted credits, ...) is not geo data;
  // surface it instead of letting callers treat the error body as a result.
  if (!res.ok) {
    throw new Error(`Geo lookup failed with HTTP ${res.status}`);
  }

  return res.json();
}

/**
 * Record a request from `ip` and report whether that address has gone over
 * its allowance for the current sliding window.
 *
 * @param {string} ip - Address used as the tracking key.
 * @param {number} [windowMs=60000] - Length of the sliding window in milliseconds.
 * @param {number} [maxRequests=60] - Requests allowed inside one window.
 * @returns {boolean} `true` once the count inside the window (including this
 *   request) is greater than `maxRequests`.
 */
function checkRateLimit(ip, windowMs = 60000, maxRequests = 60) {
  const currentTime = Date.now();
  const earlierHits = requestCounts.get(ip) ?? [];

  // Carry over only the hits that are still inside the window
  const recentHits = [];
  for (const hitTime of earlierHits) {
    if (currentTime - hitTime < windowMs) {
      recentHits.push(hitTime);
    }
  }

  // The current request always counts toward the total
  recentHits.push(currentTime);
  requestCounts.set(ip, recentHits);

  return recentHits.length > maxRequests;
}

/**
 * Score how bot-like an incoming request looks by combining the rate limiter,
 * header checks, and an IP geolocation lookup.
 *
 * @param {object} req - Incoming request; reads `req.headers` and, when no
 *   X-Forwarded-For entry is usable, `req.socket.remoteAddress`.
 * @returns {Promise<{ip: string, bot_score: number, is_bot: boolean, reasons: string[]}>}
 *   `bot_score` is capped at 100; `is_bot` is true when the summed signals
 *   reach 50; `reasons` lists every signal that fired.
 */
async function analyzeBotScore(req) {
  // Prefer the first X-Forwarded-For entry, trimmed so stray whitespace does
  // not create distinct rate-limit keys for the same address.
  // NOTE(security): this header is client-supplied; only rely on it when a
  // trusted proxy in front of this server sets or overwrites it.
  const forwardedFor = req.headers['x-forwarded-for']?.split(',')[0].trim();
  const ip = forwardedFor || req.socket.remoteAddress;

  let score = 0; // 0 = definitely human, 100 = definitely bot
  const reasons = [];

  // 1. Rate limit check
  if (checkRateLimit(ip)) {
    score += 40;
    reasons.push('high_request_rate');
  }

  // 2. User-Agent analysis
  const ua = req.headers['user-agent'] || '';
  if (!ua) {
    score += 30;
    reasons.push('missing_user_agent');
  } else if (/bot|crawl|spider|scrape|curl|wget|python|go-http/i.test(ua)) {
    score += 25;
    reasons.push('bot_user_agent');
  }

  // 3. Missing browser headers
  if (!req.headers['accept-language']) {
    score += 15;
    reasons.push('missing_accept_language');
  }

  // 4. IP geolocation analysis
  try {
    const geo = await getIPInfo(ip);

    // Check if IP is from a datacenter
    const isp = (geo.isp || '').toLowerCase();
    const org = (geo.org || '').toLowerCase();

    const isDatacenter = DATACENTER_KEYWORDS.some(kw =>
      isp.includes(kw) || org.includes(kw)
    );

    if (isDatacenter) {
      score += 30;
      // The match may have come from `org` alone, so fall back to it rather
      // than reporting "datacenter_ip: undefined".
      reasons.push(`datacenter_ip: ${geo.isp || geo.org}`);
    }

    // Check for proxy/VPN/Tor
    if (geo.proxy || geo.vpn || geo.tor) {
      score += 20;
      reasons.push('proxy_or_vpn');
    }
  } catch (e) {
    // Deliberately best-effort: a failed or malformed lookup must not fail the
    // request, so the score falls back to the rate and header signals only.
  }

  return {
    ip,
    bot_score: Math.min(score, 100),
    is_bot: score >= 50,
    reasons
  };
}

/**
 * Example: Express-style middleware that scores each request and attaches the
 * result to `req.botAnalysis` for downstream handlers and logging.
 *
 * Fails open: if the analysis itself throws, the request continues without
 * `req.botAnalysis`, so a detector fault cannot leave the request hanging.
 *
 * @param {object} req - Incoming request, passed to `analyzeBotScore`.
 * @param {object} res - Response object (unused unless blocking is enabled).
 * @param {Function} next - Continuation callback; called exactly once.
 */
async function botDetectionMiddleware(req, res, next) {
  let analysis;
  try {
    analysis = await analyzeBotScore(req);
  } catch (err) {
    // Without this, a rejection would skip next() and stall the request
    console.error('Bot analysis failed; letting request through:', err);
    return next();
  }

  // Attach analysis to request for logging
  req.botAnalysis = analysis;

  if (analysis.is_bot) {
    console.log(`🤖 Bot detected: ${analysis.ip} (score: ${analysis.bot_score}) — ${analysis.reasons.join(', ')}`);

    // Option 1: Block entirely
    // return res.status(403).json({ error: 'Blocked' });

    // Option 2: Rate limit more aggressively
    // Option 3: Serve cached content
    // Option 4: Just log and continue
  }

  // Kept outside the try block so downstream errors are not mistaken for
  // analysis failures
  next();
}

How the Scoring Works

Each request gets a bot score from 0 to 100 based on multiple signals:

Signal	Score	Why
High request rate (>60/min)	+40	Humans don't click that fast
Missing User-Agent	+30	Every browser sends one
Bot-like User-Agent	+25	`curl`, `python-requests`, `Go-http-client`
Datacenter IP	+30	Real users use residential ISPs
Proxy/VPN/Tor	+20	Often used to mask automated traffic
Missing Accept-Language	+15	Browsers always send this

A score of 50+ flags the request as likely bot traffic. You can tune these thresholds based on your false positive tolerance.

Real-World Example: Analyzing Your Access Logs

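Here's a script that processes nginx access logs and flags suspicious IPs. It reuses the DATACENTER_KEYWORDS list from the detection script above, so keep both in the same file (or copy the list over):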

const fs = require('fs');
const readline = require('readline');

/**
 * Count requests per client IP in an access log, then print a table of the
 * 20 busiest addresses with their ISP/organization and a bot verdict.
 *
 * Verdicts: 🤖 when the ISP or organization name matches a datacenter
 * keyword, ⚠️ when the address made more than 1000 requests, ✅ otherwise,
 * and ❓ when the geolocation lookup failed.
 *
 * @param {string} logFile - Path to a log whose lines begin with the client
 *   address (as in the nginx combined format).
 * @returns {Promise<void>} Resolves once the table has been printed.
 */
async function analyzeAccessLog(logFile) {
  const ipStats = new Map();

  const rl = readline.createInterface({
    input: fs.createReadStream(logFile)
  });

  // Parse nginx combined log format: the first whitespace-delimited token
  // of each line is the client address
  for await (const line of rl) {
    const match = line.match(/^(\S+)/);
    if (!match) continue;

    const ip = match[1];
    const count = (ipStats.get(ip) || 0) + 1;
    ipStats.set(ip, count);
  }

  // Sort by request count (most suspicious first)
  const sorted = [...ipStats.entries()]
    .sort((a, b) => b[1] - a[1])
    .slice(0, 20);

  console.log('Top 20 IPs by request volume:\n');
  console.log('IP Address       | Requests | ISP/Org              | Bot?');
  console.log('-'.repeat(70));

  // Lookups run one at a time to avoid bursting 20 parallel calls at the API
  for (const [ip, count] of sorted) {
    try {
      const res = await fetch(
        `https://api.frostbyte.cc/api/geo/${encodeURIComponent(ip)}`,
        { headers: { 'X-API-Key': 'YOUR_API_KEY' } }
      );
      // Treat an error response as a failed lookup rather than as geo data
      if (!res.ok) {
        throw new Error(`Geo lookup failed with HTTP ${res.status}`);
      }
      const geo = await res.json();

      // The column is "ISP/Org": show the org when no ISP name is returned
      const isp = (geo.isp || geo.org || 'Unknown').substring(0, 20);

      // Match on both fields, the same way analyzeBotScore does
      const ispName = (geo.isp || '').toLowerCase();
      const orgName = (geo.org || '').toLowerCase();
      const isDatacenter = DATACENTER_KEYWORDS.some(kw =>
        ispName.includes(kw) || orgName.includes(kw)
      );

      const flag = isDatacenter ? '🤖' : (count > 1000 ? '⚠️' : '✅');
      console.log(
        `${ip.padEnd(16)} | ${String(count).padEnd(8)} | ${isp.padEnd(20)} | ${flag}`
      );
    } catch {
      // Keep the row so the address is still visible, marked as unresolved
      console.log(`${ip.padEnd(16)} | ${String(count).padEnd(8)} | Error               | ❓`);
    }
  }
}

// Run against the default nginx log path. The returned promise gets a
// rejection handler so a failure is reported with a non-zero exit code
// instead of surfacing as an unhandled rejection.
analyzeAccessLog('/var/log/nginx/access.log').catch((err) => {
  console.error(`Failed to analyze access log: ${err.message}`);
  process.exitCode = 1;
});

Sample output:

Top 20 IPs by request volume:

IP Address       | Requests | ISP/Org              | Bot?
----------------------------------------------------------------------
158.94.211.254   | 3113     | DataCamp Limited     | ⚠️
45.33.32.156     | 891      | Linode LLC           | 🤖
203.0.113.42     | 234      | Comcast Cable        | ✅
198.51.100.17    | 189      | AT&T Services        | ✅
192.0.2.88       | 156      | Amazon Web Services  | 🤖

Going Further: Building a Bot Dashboard

Combine the detection with a simple dashboard to monitor bot traffic over time:

const express = require('express');
const app = express();

// Rolling in-memory record of the most recent request analyses
const botLog = [];

/**
 * Summarize a list of logged analyses into the dashboard payload.
 *
 * @param {Array<{ip: string, is_bot: boolean}>} entries - Logged analyses.
 * @returns {{total_requests: number, bot_requests: number,
 *   human_requests: number, bot_percentage: string,
 *   top_bot_ips: Array<[string, number]>}} Totals plus the ten bot IPs with
 *   the most logged requests, highest first.
 */
function summarizeBotLog(entries) {
  const total = entries.length;

  // A Map rather than a plain object: keys are client-influenced strings and
  // must not collide with Object.prototype names such as "__proto__".
  const hitsByBotIp = new Map();
  let bots = 0;
  for (const entry of entries) {
    if (!entry.is_bot) continue;
    bots += 1;
    hitsByBotIp.set(entry.ip, (hitsByBotIp.get(entry.ip) || 0) + 1);
  }

  // Top bot IPs
  const topBots = [...hitsByBotIp.entries()]
    .sort((a, b) => b[1] - a[1])
    .slice(0, 10);

  // Guard the empty log so the payload never contains "NaN%"
  const percentage = total === 0 ? 0 : (bots / total) * 100;

  return {
    total_requests: total,
    bot_requests: bots,
    human_requests: total - bots,
    bot_percentage: percentage.toFixed(1) + '%',
    top_bot_ips: topBots
  };
}

app.use(async (req, res, next) => {
  try {
    const analysis = await analyzeBotScore(req);

    botLog.push({
      timestamp: new Date().toISOString(),
      ...analysis
    });

    // Keep last 10K entries
    if (botLog.length > 10000) botLog.splice(0, botLog.length - 10000);
  } catch (err) {
    // Fail open: a detector fault must not stall the request
    console.error('Bot analysis failed; request not recorded:', err);
  }

  // Outside the try block so downstream errors are not swallowed here
  next();
});

// NOTE(security): this route exposes visitor IPs and has no authentication
// in this example; put it behind your admin auth before deploying.
app.get('/admin/bot-stats', (req, res) => {
  res.json(summarizeBotLog(botLog));
});

app.listen(3000);

Get a Free API Key

The IP geolocation lookups use Frostbyte API — you get 200 free credits with no signup required:

# Request a new API key (no signup needed)
curl https://api.frostbyte.cc/api/keys/create
# Returns: { "key": "fb_xxx...", "credits": 200 }

Each geolocation lookup costs 1 credit. That's 200 bot checks for free — enough to analyze your traffic patterns and decide if you need more.

Key Takeaways

Layer your signals — No single check catches all bots. Combine rate limits, headers, and IP intelligence.
Datacenter detection is powerful — Most automated traffic comes from cloud providers, not residential ISPs.
Don't just block — Consider serving cached content, adding CAPTCHAs, or rate limiting instead of hard blocks.
Monitor trends — Bot traffic patterns change. What works today might need tuning next month.

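The full code from this tutorial is a working starting point. Before relying on it in production, cache the geolocation lookups (every request currently spends one credit), only trust X-Forwarded-For when a proxy you control sets it, and move the in-memory rate tracker to a shared store if you run more than one process. Then drop the middleware into any Express app and start seeing what's really hitting your endpoints.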