DEV Community

Vhub Systems
Vhub Systems

Posted on

Web Scraping With Node.js in 2026: Axios + Cheerio, Playwright, Crawlee

Python dominates web scraping tutorials, but Node.js has a strong ecosystem too. If you're already building in JavaScript, you don't need to switch languages. Here's a practical guide to web scraping in Node.js in 2026.

The Node.js scraping stack

Task Library Notes
HTTP requests axios or got Simple static pages
HTML parsing cheerio jQuery-style selectors
Browser automation playwright or puppeteer Dynamic/JS-rendered pages
Scraping framework crawlee Full-featured, async, anti-bot
Scheduling node-cron Recurring scrapes

Static HTML scraping with Axios + Cheerio

For pages that don't require JavaScript, this combination is fast and simple:

const axios = require('axios');
const cheerio = require('cheerio');

/**
 * Scrape the Hacker News front page and return the ranked stories.
 *
 * @returns {Promise<Array<{rank: number, title: string, url: string|undefined, score: number}>>}
 */
async function scrapeHackerNews() {
    const { data } = await axios.get('https://news.ycombinator.com', {
        headers: {
            // A realistic User-Agent avoids the most basic bot filters.
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/122.0.0.0'
        }
    });

    const $ = cheerio.load(data);
    const stories = [];

    // Cheerio uses jQuery-style selectors
    $('.athing').each((index, element) => {
        const titleEl = $(element).find('.titleline a').first();
        // The score lives in a sibling row, keyed by the story row's id.
        const scoreEl = $(`#score_${$(element).attr('id')}`);

        stories.push({
            rank: index + 1,
            title: titleEl.text(),
            url: titleEl.attr('href'),
            // Explicit radix; stories without a score (e.g. job posts) default to 0.
            score: Number.parseInt(scoreEl.text(), 10) || 0,
        });
    });

    return stories;
}

scrapeHackerNews()
    .then(stories => {
        console.log(`Found ${stories.length} stories`);
        stories.slice(0, 5).forEach(s =>
            console.log(`${s.rank}. ${s.title} (${s.score}p)`)
        );
    })
    // Without a .catch, any network/parse failure is an unhandled rejection.
    .catch(err => console.error(`Scrape failed: ${err.message}`));

Handling pagination

const axios = require('axios');
const cheerio = require('cheerio');

/**
 * Scrape a paginated listing page by page until the "next" link
 * disappears or `maxPages` is reached.
 *
 * @param {string} baseUrl  - Listing URL (may already contain query params).
 * @param {number} maxPages - Safety cap on the number of pages fetched.
 * @returns {Promise<Array<{title: string, link: string|undefined}>>}
 */
async function scrapeAllPages(baseUrl, maxPages = 10) {
    const results = [];
    let currentPage = 1;
    let hasNextPage = true;

    while (hasNextPage && currentPage <= maxPages) {
        // new URL() preserves any query params already present in baseUrl,
        // unlike naive `?page=` string concatenation.
        const url = new URL(baseUrl);
        url.searchParams.set('page', String(currentPage));

        try {
            const { data } = await axios.get(url.toString(), {
                headers: { 'User-Agent': 'Mozilla/5.0 Chrome/122.0.0.0' },
                timeout: 10000
            });

            const $ = cheerio.load(data);

            // Extract items from current page
            $('.item').each((_, el) => {
                results.push({
                    title: $(el).find('.title').text().trim(),
                    link: $(el).find('a').attr('href'),
                });
            });

            // Check if there's a next page
            hasNextPage = $('.pagination .next').length > 0;
            currentPage++;

            // Polite randomized delay — only when another request will follow.
            if (hasNextPage && currentPage <= maxPages) {
                await new Promise(r => setTimeout(r, 1000 + Math.random() * 1000));
            }

        } catch (error) {
            // Log and stop rather than hammering a failing endpoint.
            console.error(`Error on page ${currentPage}: ${error.message}`);
            break;
        }
    }

    return results;
}

JavaScript-rendered pages with Playwright

For React, Vue, and other SPAs, you need a browser:

const { chromium } = require('playwright');

/**
 * Scrape a JS-rendered product listing with a headless Chromium browser.
 *
 * @param {string} url - Page to load; must render `.product-card` elements.
 * @returns {Promise<Array<{name: string|undefined, price: string|undefined, rating: string|undefined}>>}
 */
async function scrapeReactApp(url) {
    const browser = await chromium.launch({ headless: true });
    try {
        const context = await browser.newContext({
            userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/122.0.0.0'
        });
        const page = await context.newPage();

        await page.goto(url, { waitUntil: 'networkidle' });

        // Wait for dynamic content
        await page.waitForSelector('.product-card');

        // Extract data after JS renders
        return await page.evaluate(() => {
            return Array.from(document.querySelectorAll('.product-card')).map(card => ({
                name: card.querySelector('.product-name')?.textContent,
                price: card.querySelector('.price')?.textContent,
                rating: card.querySelector('.rating')?.textContent,
            }));
        });
    } finally {
        // Always release the browser — without this, a failed navigation or
        // missing selector leaks a headless Chromium process.
        await browser.close();
    }
}

scrapeReactApp('https://example.com/products')
    .then(products => console.log(JSON.stringify(products, null, 2)))
    .catch(err => console.error(`Scrape failed: ${err.message}`));

Intercepting API calls (faster than DOM parsing)

Many React apps fetch data from APIs. Intercept those instead of scraping the DOM:

const { chromium } = require('playwright');

/**
 * Load a page and capture the JSON bodies of matching API responses,
 * instead of parsing the rendered DOM.
 *
 * @param {string} url         - Page to load.
 * @param {string} apiEndpoint - Substring to match against response URLs.
 * @returns {Promise<Array<object>>} Parsed JSON payloads, in arrival order.
 */
async function interceptApiCalls(url, apiEndpoint) {
    const browser = await chromium.launch({ headless: true });
    const apiData = [];
    try {
        const page = await browser.newPage();

        // Intercept network responses as they arrive.
        page.on('response', async (response) => {
            if (response.url().includes(apiEndpoint) && response.status() === 200) {
                try {
                    apiData.push(await response.json());
                } catch (e) {
                    // Response wasn't JSON — skip it.
                }
            }
        });

        await page.goto(url, { waitUntil: 'networkidle' });
    } finally {
        // Release the browser even if navigation fails.
        await browser.close();
    }

    return apiData;
}

// Much faster than parsing the DOM - gets raw data directly.
// Note: top-level `await` only works in ES modules; in a require()-based
// (CommonJS) file, consume the promise with .then instead.
interceptApiCalls('https://shop.example.com', '/api/products')
    .then(data => console.log(JSON.stringify(data, null, 2)))
    .catch(err => console.error(`Intercept failed: ${err.message}`));

Concurrent scraping with worker threads

For high-volume extraction:

const { Worker, isMainThread, parentPort, workerData } = require('worker_threads');
const axios = require('axios');
const cheerio = require('cheerio');

// Worker logic: fetch one URL and report its <title> back to the main thread.
if (!isMainThread) {
    const { url } = workerData;

    axios.get(url, { headers: { 'User-Agent': 'Mozilla/5.0 Chrome/122.0.0.0' } })
        .then(({ data }) => {
            const $ = cheerio.load(data);
            const title = $('title').text();
            parentPort.postMessage({ url, title, success: true });
        })
        .catch(error => {
            parentPort.postMessage({ url, error: error.message, success: false });
        });
}

/**
 * Scrape many URLs using batches of worker threads.
 *
 * @param {string[]} urls        - URLs to fetch.
 * @param {number}   concurrency - Workers per batch (default 5).
 * @returns {Promise<Array<{url: string, title?: string, error?: string, success: boolean}>>}
 */
async function scrapeParallel(urls, concurrency = 5) {
    const results = [];

    // Process URLs in batches of `concurrency`
    for (let i = 0; i < urls.length; i += concurrency) {
        const batch = urls.slice(i, i + concurrency);

        const batchResults = await Promise.all(
            batch.map(url => new Promise((resolve) => {
                const worker = new Worker(__filename, { workerData: { url } });
                // Settle exactly once, then tear the worker down so the
                // process doesn't accumulate idle threads.
                const settle = (result) => {
                    resolve(result);
                    worker.terminate();
                };
                worker.on('message', settle);
                // Include success: false so failures match the worker's own shape.
                worker.on('error', (err) => settle({ url, error: err.message, success: false }));
            }))
        );

        results.push(...batchResults);
        console.log(`Processed ${Math.min(i + concurrency, urls.length)}/${urls.length} URLs`);
    }

    return results;
}

Full scraping framework: Crawlee

For production scraping projects, Crawlee (built by Apify) handles routing, error recovery, proxy rotation, and persistence:

const { CheerioCrawler, Dataset } = require('crawlee');

const crawler = new CheerioCrawler({
    maxConcurrency: 10,
    // Crawlee's retry option is `maxRequestRetries` — `maxRetries` is
    // not a recognized CheerioCrawler option and would be ignored.
    maxRequestRetries: 3,

    async requestHandler({ $, request, enqueueLinks }) {
        // Extract data
        const title = $('title').text();
        const links = [];
        $('a[href]').each((_, el) => links.push($(el).attr('href')));

        // Save to dataset
        await Dataset.pushData({ url: request.url, title, linkCount: links.length });

        // Follow links (automatic deduplication)
        await enqueueLinks({ selector: 'a', baseUrl: request.loadedUrl });
    },

    // Crawlee passes the error as a second argument, not on the context.
    failedRequestHandler({ request }, error) {
        console.error(`Request ${request.url} failed: ${error.message}`);
    }
});

// Top-level `await` requires an ES module; with require() (CommonJS),
// drive the crawl from an async wrapper instead.
(async () => {
    await crawler.run(['https://example.com']);
    const data = await Dataset.getData();
    console.log(`Scraped ${data.items.length} pages`);
})();

Node.js vs Python for scraping

Factor Node.js Python
Async performance Excellent (event loop native) Good (asyncio)
Library ecosystem Crawlee, Puppeteer, Playwright Scrapy, Playwright, requests
Data processing Less mature Excellent (pandas, numpy)
ML/AI integration Limited Excellent
Learning curve Moderate Low

Choose Node.js if your team is JavaScript-first or you're building a scraper as part of a larger Node.js service.

Choose Python if you need heavy data processing, ML integration, or are starting from scratch.

When to use neither

For common targets (Google, LinkedIn, Amazon, Instagram), use managed actors instead. The Apify Scrapers Bundle ($29) includes 35+ production actors that run in the cloud with proxy rotation built in — no Node.js or Python setup required.

n8n AI Automation Pack ($39) — 5 production-ready workflows

Production-Ready Scrapers

For scraping at scale without managing infrastructure:

Top comments (0)