Python dominates web scraping tutorials, but Node.js has a strong ecosystem too. If you're already building in JavaScript, you don't need to switch languages. Here's a practical guide to web scraping in Node.js in 2026.
The Node.js scraping stack
| Task | Library | Notes |
|---|---|---|
| HTTP requests | axios or got | Simple static pages |
| HTML parsing | cheerio | jQuery-style selectors |
| Browser automation | playwright or puppeteer | Dynamic/JS-rendered pages |
| Scraping framework | crawlee | Full-featured, async, anti-bot |
| Scheduling | node-cron | Recurring scrapes |
Static HTML scraping with Axios + Cheerio
For pages that don't require JavaScript, this combination is fast and simple:
const axios = require('axios');
const cheerio = require('cheerio');
/**
 * Scrape the Hacker News front page and return its stories.
 *
 * @returns {Promise<Array<{rank: number, title: string, url: string|undefined, score: number}>>}
 *   One entry per story row, in page order. `score` is 0 when the row has
 *   no score element (e.g. job postings).
 */
async function scrapeHackerNews() {
  const { data } = await axios.get('https://news.ycombinator.com', {
    headers: {
      // A realistic User-Agent avoids the most basic bot filtering.
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/122.0.0.0'
    }
  });
  const $ = cheerio.load(data);
  const stories = [];
  // Cheerio uses jQuery-style selectors; each `.athing` row is one story.
  $('.athing').each((index, element) => {
    const titleEl = $(element).find('.titleline a').first();
    // The score lives in a sibling row, linked by the story's numeric id.
    const scoreEl = $(`#score_${$(element).attr('id')}`);
    stories.push({
      rank: index + 1,
      title: titleEl.text(),
      url: titleEl.attr('href'),
      // Always pass a radix to parseInt; "123 points" parses as 123,
      // and a missing score element falls back to 0.
      score: Number.parseInt(scoreEl.text(), 10) || 0,
    });
  });
  return stories;
}
// Top-level kickoff. Attach a .catch() so a network failure surfaces as a
// logged error instead of an unhandled promise rejection.
scrapeHackerNews()
  .then(stories => {
    console.log(`Found ${stories.length} stories`);
    stories.slice(0, 5).forEach(s =>
      console.log(`${s.rank}. ${s.title} (${s.score}p)`)
    );
  })
  .catch(err => console.error(`Scrape failed: ${err.message}`));
Handling pagination
const axios = require('axios');
const cheerio = require('cheerio');
/**
 * Scrape `.item` entries from a paginated listing.
 *
 * Fetches `${baseUrl}?page=N` for N = 1..maxPages, stopping early when the
 * page no longer advertises a "next" link or a request fails.
 *
 * @param {string} baseUrl - Listing URL without the `?page=` query string.
 * @param {number} [maxPages=10] - Hard cap on the number of pages fetched.
 * @returns {Promise<Array<{title: string, link: string|undefined}>>}
 */
async function scrapeAllPages(baseUrl, maxPages = 10) {
  const collected = [];
  for (let pageNo = 1; pageNo <= maxPages; pageNo++) {
    const pageUrl = `${baseUrl}?page=${pageNo}`;
    try {
      const response = await axios.get(pageUrl, {
        headers: { 'User-Agent': 'Mozilla/5.0 Chrome/122.0.0.0' },
        timeout: 10000
      });
      const $ = cheerio.load(response.data);
      // Pull every item from the current page.
      $('.item').each((_, node) => {
        const item = $(node);
        collected.push({
          title: item.find('.title').text().trim(),
          link: item.find('a').attr('href'),
        });
      });
      // Polite, jittered delay between requests (1-2 s).
      await new Promise(resolve => setTimeout(resolve, 1000 + Math.random() * 1000));
      // Stop once the pagination widget no longer offers a "next" link.
      if ($('.pagination .next').length === 0) break;
    } catch (error) {
      console.error(`Error on page ${pageNo}: ${error.message}`);
      break;
    }
  }
  return collected;
}
JavaScript-rendered pages with Playwright
For React, Vue, and other SPAs, you need a browser:
const { chromium } = require('playwright');
/**
 * Scrape product cards from a JavaScript-rendered (SPA) page.
 *
 * @param {string} url - Page to load in headless Chromium.
 * @returns {Promise<Array<{name: string|undefined, price: string|undefined, rating: string|undefined}>>}
 */
async function scrapeReactApp(url) {
  const browser = await chromium.launch({ headless: true });
  try {
    const context = await browser.newContext({
      userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/122.0.0.0'
    });
    const page = await context.newPage();
    await page.goto(url, { waitUntil: 'networkidle' });
    // Wait for dynamic content to render before reading the DOM.
    await page.waitForSelector('.product-card');
    // Extract data after JS renders; this callback runs in the page context.
    const products = await page.evaluate(() => {
      return Array.from(document.querySelectorAll('.product-card')).map(card => ({
        name: card.querySelector('.product-name')?.textContent,
        price: card.querySelector('.price')?.textContent,
        rating: card.querySelector('.rating')?.textContent,
      }));
    });
    return products;
  } finally {
    // Close the browser even when navigation or the selector wait throws;
    // otherwise every failed scrape leaks a Chromium process.
    await browser.close();
  }
}
scrapeReactApp('https://example.com/products')
  .then(products => console.log(JSON.stringify(products, null, 2)))
  // Without a .catch(), a navigation or selector-timeout failure becomes
  // an unhandled promise rejection.
  .catch(err => console.error(`Scrape failed: ${err.message}`));
Intercepting API calls (faster than DOM parsing)
Many React apps fetch data from APIs. Intercept those instead of scraping the DOM:
const { chromium } = require('playwright');
/**
 * Load `url` in headless Chromium and capture JSON bodies from responses
 * whose URL contains `apiEndpoint`.
 *
 * @param {string} url - Page to load.
 * @param {string} apiEndpoint - Substring matched against response URLs.
 * @returns {Promise<Array<object>>} Parsed JSON payloads, in arrival order.
 */
async function interceptApiCalls(url, apiEndpoint) {
  const browser = await chromium.launch({ headless: true });
  try {
    const page = await browser.newPage();
    const apiData = [];
    // Register the listener before navigating so early responses aren't missed.
    page.on('response', async (response) => {
      if (response.url().includes(apiEndpoint) && response.status() === 200) {
        try {
          const json = await response.json();
          apiData.push(json);
        } catch (e) {
          // Response wasn't JSON - skip it (best-effort capture).
        }
      }
    });
    // 'networkidle' waits for in-flight requests to settle, so matching
    // responses have generally been collected by the time this resolves.
    await page.goto(url, { waitUntil: 'networkidle' });
    return apiData;
  } finally {
    // Always release the Chromium process, even if navigation fails.
    await browser.close();
  }
}
// Much faster than parsing the DOM - gets raw data directly.
// Note: top-level `await` is not available in CommonJS scripts (this file
// uses require()), so start the call with a promise chain instead.
interceptApiCalls('https://shop.example.com', '/api/products')
  .then(data => console.log(JSON.stringify(data, null, 2)))
  .catch(err => console.error(`Intercept failed: ${err.message}`));
Concurrent scraping with worker threads
For high-volume extraction:
const { Worker, isMainThread, parentPort, workerData } = require('worker_threads');
const axios = require('axios');
const cheerio = require('cheerio');
// Worker logic: each worker fetches one URL, extracts its <title>, and
// reports the outcome back to the main thread via postMessage.
if (!isMainThread) {
  const { url } = workerData;
  const headers = { 'User-Agent': 'Mozilla/5.0 Chrome/122.0.0.0' };
  axios
    .get(url, { headers })
    .then(({ data }) => {
      const pageTitle = cheerio.load(data)('title').text();
      parentPort.postMessage({ url, title: pageTitle, success: true });
    })
    .catch(error => {
      parentPort.postMessage({ url, error: error.message, success: false });
    });
}
// Main thread
/**
 * Scrape many URLs in parallel using worker threads.
 *
 * Spawns this same file as a worker (the `!isMainThread` branch does the
 * fetching) in batches of `concurrency`.
 *
 * @param {string[]} urls - Pages to scrape.
 * @param {number} [concurrency=5] - Number of workers running at once.
 * @returns {Promise<Array<object>>} One result object per URL.
 */
async function scrapeParallel(urls, concurrency = 5) {
  const results = [];
  // Process URLs in batches of `concurrency`.
  for (let i = 0; i < urls.length; i += concurrency) {
    const batch = urls.slice(i, i + concurrency);
    const batchResults = await Promise.all(
      batch.map(url => new Promise((resolve) => {
        const worker = new Worker(__filename, { workerData: { url } });
        let settled = false;
        const finish = (result) => {
          if (!settled) {
            settled = true;
            resolve(result);
          }
        };
        worker.on('message', finish);
        worker.on('error', (err) => finish({ url, error: err.message, success: false }));
        // Guard against a worker exiting without ever posting a message
        // (e.g. killed, or crashed before the axios call), which would
        // otherwise leave this promise pending forever.
        worker.on('exit', (code) => {
          if (code !== 0) {
            finish({ url, error: `worker exited with code ${code}`, success: false });
          }
        });
      }))
    );
    results.push(...batchResults);
    console.log(`Processed ${Math.min(i + concurrency, urls.length)}/${urls.length} URLs`);
  }
  return results;
}
Full scraping framework: Crawlee
For production scraping projects, Crawlee (built by Apify) handles routing, error recovery, proxy rotation, and persistence:
const { CheerioCrawler, Dataset } = require('crawlee');
// Crawlee crawler. Note: the retry option is `maxRequestRetries` -
// an unknown key like `maxRetries` would be rejected/ignored by Crawlee.
const crawler = new CheerioCrawler({
  maxConcurrency: 10,
  maxRequestRetries: 3,
  async requestHandler({ $, request, enqueueLinks }) {
    // Extract data from the parsed page.
    const title = $('title').text();
    const links = [];
    $('a[href]').each((_, el) => links.push($(el).attr('href')));
    // Save to the default dataset (persisted under ./storage).
    await Dataset.pushData({ url: request.url, title, linkCount: links.length });
    // Follow links (Crawlee deduplicates enqueued URLs automatically).
    await enqueueLinks({ selector: 'a', baseUrl: request.loadedUrl });
  },
  // In Crawlee v3 the error is passed as a second argument, not as a
  // property of the context object.
  failedRequestHandler({ request }, error) {
    console.error(`Request ${request.url} failed: ${error.message}`);
  }
});

// CommonJS has no top-level await, so wrap the run in an async IIFE.
(async () => {
  await crawler.run(['https://example.com']);
  const data = await Dataset.getData();
  console.log(`Scraped ${data.items.length} pages`);
})().catch(err => console.error(err));
Node.js vs Python for scraping
| Factor | Node.js | Python |
|---|---|---|
| Async performance | Excellent (event loop native) | Good (asyncio) |
| Library ecosystem | Crawlee, Puppeteer, Playwright | Scrapy, Playwright, requests |
| Data processing | Less mature | Excellent (pandas, numpy) |
| ML/AI integration | Limited | Excellent |
| Learning curve | Moderate | Low |
Choose Node.js if your team is JavaScript-first or you're building a scraper as part of a larger Node.js service.
Choose Python if you need heavy data processing, ML integration, or are starting from scratch.
When to use neither
For common targets (Google, LinkedIn, Amazon, Instagram), use managed actors instead. The Apify Scrapers Bundle ($29) includes 35+ production actors that run in the cloud with proxy rotation built in — no Node.js or Python setup required.
n8n AI Automation Pack ($39) — 5 production-ready workflows
Production-Ready Scrapers
For scraping at scale without managing infrastructure:
Top comments (0)