Python dominates web scraping tutorials, but Node.js has a strong ecosystem too. If you're already building in JavaScript, you don't need to switch languages. Here's a practical guide to web scraping in Node.js in 2026.
The Node.js scraping stack
| Task | Library | Notes |
|---|---|---|
| HTTP requests | axios or got | Simple static pages |
| HTML parsing | cheerio | jQuery-style selectors |
| Browser automation | playwright or puppeteer | Dynamic/JS-rendered pages |
| Scraping framework | crawlee | Full-featured, async, anti-bot |
| Scheduling | node-cron | Recurring scrapes |
Static HTML scraping with Axios + Cheerio
For pages that don't require JavaScript, this combination is fast and simple:
const axios = require('axios');
const cheerio = require('cheerio');
/**
 * Scrape the Hacker News front page and return its stories.
 *
 * @returns {Promise<Array<{rank: number, title: string, url: string|undefined, score: number}>>}
 *   One entry per story row, in page order. `score` is 0 when the row has
 *   no score element (e.g. job postings).
 */
async function scrapeHackerNews() {
  const { data } = await axios.get('https://news.ycombinator.com', {
    headers: {
      // A realistic User-Agent avoids the most basic bot filtering.
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/122.0.0.0'
    }
  });
  const $ = cheerio.load(data);
  const stories = [];
  // Cheerio uses jQuery-style selectors; each `.athing` row is one story.
  $('.athing').each((index, element) => {
    const titleEl = $(element).find('.titleline a').first();
    // The score lives in a sibling row, linked by the story's numeric id.
    const scoreEl = $(`#score_${$(element).attr('id')}`);
    stories.push({
      rank: index + 1,
      title: titleEl.text(),
      url: titleEl.attr('href'),
      // Always pass a radix to parseInt; "123 points" parses as 123,
      // and a missing score element falls back to 0.
      score: Number.parseInt(scoreEl.text(), 10) || 0,
    });
  });
  return stories;
}
// Top-level kickoff. Attach a .catch() so a network failure surfaces as a
// logged error instead of an unhandled promise rejection.
scrapeHackerNews()
  .then(stories => {
    console.log(`Found ${stories.length} stories`);
    stories.slice(0, 5).forEach(s =>
      console.log(`${s.rank}. ${s.title} (${s.score}p)`)
    );
  })
  .catch(err => console.error(`Scrape failed: ${err.message}`));
Handling pagination
const axios = require('axios');
const cheerio = require('cheerio');
/**
 * Scrape `.item` entries from a paginated listing.
 *
 * Fetches `${baseUrl}?page=N` for N = 1..maxPages, stopping early when the
 * page no longer advertises a "next" link or a request fails.
 *
 * @param {string} baseUrl - Listing URL without the `?page=` query string.
 * @param {number} [maxPages=10] - Hard cap on the number of pages fetched.
 * @returns {Promise<Array<{title: string, link: string|undefined}>>}
 */
async function scrapeAllPages(baseUrl, maxPages = 10) {
  const collected = [];
  for (let pageNo = 1; pageNo <= maxPages; pageNo++) {
    const pageUrl = `${baseUrl}?page=${pageNo}`;
    try {
      const response = await axios.get(pageUrl, {
        headers: { 'User-Agent': 'Mozilla/5.0 Chrome/122.0.0.0' },
        timeout: 10000
      });
      const $ = cheerio.load(response.data);
      // Pull every item from the current page.
      $('.item').each((_, node) => {
        const item = $(node);
        collected.push({
          title: item.find('.title').text().trim(),
          link: item.find('a').attr('href'),
        });
      });
      // Polite, jittered delay between requests (1-2 s).
      await new Promise(resolve => setTimeout(resolve, 1000 + Math.random() * 1000));
      // Stop once the pagination widget no longer offers a "next" link.
      if ($('.pagination .next').length === 0) break;
    } catch (error) {
      console.error(`Error on page ${pageNo}: ${error.message}`);
      break;
    }
  }
  return collected;
}
JavaScript-rendered pages with Playwright
For React, Vue, and other SPAs, you need a browser:
const { chromium } = require('playwright');
/**
 * Scrape product cards from a JavaScript-rendered (SPA) page.
 *
 * @param {string} url - Page to load in headless Chromium.
 * @returns {Promise<Array<{name: string|undefined, price: string|undefined, rating: string|undefined}>>}
 */
async function scrapeReactApp(url) {
  const browser = await chromium.launch({ headless: true });
  try {
    const context = await browser.newContext({
      userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/122.0.0.0'
    });
    const page = await context.newPage();
    await page.goto(url, { waitUntil: 'networkidle' });
    // Wait for dynamic content to render before reading the DOM.
    await page.waitForSelector('.product-card');
    // Extract data after JS renders; this callback runs in the page context.
    const products = await page.evaluate(() => {
      return Array.from(document.querySelectorAll('.product-card')).map(card => ({
        name: card.querySelector('.product-name')?.textContent,
        price: card.querySelector('.price')?.textContent,
        rating: card.querySelector('.rating')?.textContent,
      }));
    });
    return products;
  } finally {
    // Close the browser even when navigation or the selector wait throws;
    // otherwise every failed scrape leaks a Chromium process.
    await browser.close();
  }
}
scrapeReactApp('https://example.com/products')
  .then(products => console.log(JSON.stringify(products, null, 2)))
  // Without a .catch(), a navigation or selector-timeout failure becomes
  // an unhandled promise rejection.
  .catch(err => console.error(`Scrape failed: ${err.message}`));
Intercepting API calls (faster than DOM parsing)
Many React apps fetch data from APIs. Intercept those instead of scraping the DOM:
const { chromium } = require('playwright');
/**
 * Load `url` in headless Chromium and capture JSON bodies from responses
 * whose URL contains `apiEndpoint`.
 *
 * @param {string} url - Page to load.
 * @param {string} apiEndpoint - Substring matched against response URLs.
 * @returns {Promise<Array<object>>} Parsed JSON payloads, in arrival order.
 */
async function interceptApiCalls(url, apiEndpoint) {
  const browser = await chromium.launch({ headless: true });
  try {
    const page = await browser.newPage();
    const apiData = [];
    // Register the listener before navigating so early responses aren't missed.
    page.on('response', async (response) => {
      if (response.url().includes(apiEndpoint) && response.status() === 200) {
        try {
          const json = await response.json();
          apiData.push(json);
        } catch (e) {
          // Response wasn't JSON - skip it (best-effort capture).
        }
      }
    });
    // 'networkidle' waits for in-flight requests to settle, so matching
    // responses have generally been collected by the time this resolves.
    await page.goto(url, { waitUntil: 'networkidle' });
    return apiData;
  } finally {
    // Always release the Chromium process, even if navigation fails.
    await browser.close();
  }
}
// Much faster than parsing the DOM - gets raw data directly.
// Note: top-level `await` is not available in CommonJS scripts (this file
// uses require()), so start the call with a promise chain instead.
interceptApiCalls('https://shop.example.com', '/api/products')
  .then(data => console.log(JSON.stringify(data, null, 2)))
  .catch(err => console.error(`Intercept failed: ${err.message}`));
Concurrent scraping with worker threads
For high-volume extraction:
const { Worker, isMainThread, parentPort, workerData } = require('worker_threads');
const axios = require('axios');
const cheerio = require('cheerio');
// Worker logic: each worker fetches one URL, extracts its <title>, and
// reports the outcome back to the main thread via postMessage.
if (!isMainThread) {
  const { url } = workerData;
  const headers = { 'User-Agent': 'Mozilla/5.0 Chrome/122.0.0.0' };
  axios
    .get(url, { headers })
    .then(({ data }) => {
      const pageTitle = cheerio.load(data)('title').text();
      parentPort.postMessage({ url, title: pageTitle, success: true });
    })
    .catch(error => {
      parentPort.postMessage({ url, error: error.message, success: false });
    });
}
// Main thread
/**
 * Scrape many URLs in parallel using worker threads.
 *
 * Spawns this same file as a worker (the `!isMainThread` branch does the
 * fetching) in batches of `concurrency`.
 *
 * @param {string[]} urls - Pages to scrape.
 * @param {number} [concurrency=5] - Number of workers running at once.
 * @returns {Promise<Array<object>>} One result object per URL.
 */
async function scrapeParallel(urls, concurrency = 5) {
  const results = [];
  // Process URLs in batches of `concurrency`.
  for (let i = 0; i < urls.length; i += concurrency) {
    const batch = urls.slice(i, i + concurrency);
    const batchResults = await Promise.all(
      batch.map(url => new Promise((resolve) => {
        const worker = new Worker(__filename, { workerData: { url } });
        let settled = false;
        const finish = (result) => {
          if (!settled) {
            settled = true;
            resolve(result);
          }
        };
        worker.on('message', finish);
        worker.on('error', (err) => finish({ url, error: err.message, success: false }));
        // Guard against a worker exiting without ever posting a message
        // (e.g. killed, or crashed before the axios call), which would
        // otherwise leave this promise pending forever.
        worker.on('exit', (code) => {
          if (code !== 0) {
            finish({ url, error: `worker exited with code ${code}`, success: false });
          }
        });
      }))
    );
    results.push(...batchResults);
    console.log(`Processed ${Math.min(i + concurrency, urls.length)}/${urls.length} URLs`);
  }
  return results;
}
Full scraping framework: Crawlee
For production scraping projects, Crawlee (built by Apify) handles routing, error recovery, proxy rotation, and persistence:
const { CheerioCrawler, Dataset } = require('crawlee');
// Crawlee crawler. Note: the retry option is `maxRequestRetries` -
// an unknown key like `maxRetries` would be rejected/ignored by Crawlee.
const crawler = new CheerioCrawler({
  maxConcurrency: 10,
  maxRequestRetries: 3,
  async requestHandler({ $, request, enqueueLinks }) {
    // Extract data from the parsed page.
    const title = $('title').text();
    const links = [];
    $('a[href]').each((_, el) => links.push($(el).attr('href')));
    // Save to the default dataset (persisted under ./storage).
    await Dataset.pushData({ url: request.url, title, linkCount: links.length });
    // Follow links (Crawlee deduplicates enqueued URLs automatically).
    await enqueueLinks({ selector: 'a', baseUrl: request.loadedUrl });
  },
  // In Crawlee v3 the error is passed as a second argument, not as a
  // property of the context object.
  failedRequestHandler({ request }, error) {
    console.error(`Request ${request.url} failed: ${error.message}`);
  }
});

// CommonJS has no top-level await, so wrap the run in an async IIFE.
(async () => {
  await crawler.run(['https://example.com']);
  const data = await Dataset.getData();
  console.log(`Scraped ${data.items.length} pages`);
})().catch(err => console.error(err));
Node.js vs Python for scraping
| Factor | Node.js | Python |
|---|---|---|
| Async performance | Excellent (event loop native) | Good (asyncio) |
| Library ecosystem | Crawlee, Puppeteer, Playwright | Scrapy, Playwright, requests |
| Data processing | Less mature | Excellent (pandas, numpy) |
| ML/AI integration | Limited | Excellent |
| Learning curve | Moderate | Low |
Choose Node.js if your team is JavaScript-first or you're building a scraper as part of a larger Node.js service.
Choose Python if you need heavy data processing, ML integration, or are starting from scratch.
When to use neither
For common targets (Google, LinkedIn, Amazon, Instagram), use managed actors instead. The Apify Scrapers Bundle ($29) includes 35+ production actors that run in the cloud with proxy rotation built in — no Node.js or Python setup required.
n8n AI Automation Pack ($39) — 5 production-ready workflows
Production-Ready Scrapers
For scraping at scale without managing infrastructure:
Top comments (0)