Web Scraping with Node.js: Cheerio, Puppeteer, and Playwright
Node.js has become a powerhouse for web scraping. This guide compares the three major tools — Cheerio, Puppeteer, and Playwright — with practical examples for each.
When to Use What
| Tool | Best For | Speed | JS Rendering |
|---|---|---|---|
| Cheerio | Static HTML parsing | Fastest | No |
| Puppeteer | Chrome automation | Medium | Yes |
| Playwright | Multi-browser testing | Medium | Yes |
Setup
npm init -y
npm install cheerio axios puppeteer playwright
Cheerio: Fast HTML Parsing
Cheerio is jQuery for the server. It parses static HTML without a browser.
const axios = require("axios");
const cheerio = require("cheerio");

/**
 * Fetch a page and extract blog posts from its static HTML.
 *
 * @param {string} url - Page to download and parse.
 * @returns {Promise<Array<{title: string, link: (string|undefined), summary: string, date: (string|undefined)}>>}
 *   One entry per `article.post` element found in the markup.
 */
async function scrapeWithCheerio(url) {
  // A browser-like User-Agent avoids the most naive bot blocking.
  const response = await axios.get(url, {
    headers: {
      "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
    }
  });

  const $ = cheerio.load(response.data);
  const posts = [];

  $("article.post").each((_, element) => {
    const $post = $(element);
    posts.push({
      title: $post.find("h2").text().trim(),
      link: $post.find("a").attr("href"),
      summary: $post.find(".summary").text().trim(),
      date: $post.find("time").attr("datetime")
    });
  });

  return posts;
}
// Usage — wrapped in an async IIFE: top-level `await` is only legal in ES
// modules, and these snippets are CommonJS (`require`). The `.catch` keeps
// a failed request from becoming an unhandled promise rejection.
(async () => {
  const posts = await scrapeWithCheerio("https://example-blog.com");
  console.log(`Found ${posts.length} posts`);
})().catch(console.error);
When Cheerio Falls Short
Cheerio cannot execute JavaScript. If the page loads content dynamically (SPAs, infinite scroll, lazy loading), you need a browser-based tool.
Puppeteer: Chrome Automation
Puppeteer controls a headless Chrome browser — perfect for JS-heavy sites.
const puppeteer = require("puppeteer");

/**
 * Scrape product cards from a JS-rendered page with headless Chrome.
 *
 * @param {string} url - Page to load and scrape.
 * @returns {Promise<Array<{name?: string, price?: string, rating?: string, image?: string}>>}
 * @throws {Error} If navigation fails or no `.product-card` appears within 10 s.
 */
async function scrapeWithPuppeteer(url) {
  const browser = await puppeteer.launch({
    headless: "new",
    args: ["--no-sandbox", "--disable-setuid-sandbox"]
  });

  try {
    const page = await browser.newPage();
    await page.setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)");
    await page.goto(url, { waitUntil: "networkidle0" });

    // Wait for dynamic content to load before reading the DOM.
    await page.waitForSelector(".product-card", { timeout: 10000 });

    // Extract data from the rendered page.
    return await page.evaluate(() => {
      const cards = document.querySelectorAll(".product-card");
      return Array.from(cards).map((card) => ({
        name: card.querySelector(".title")?.textContent?.trim(),
        price: card.querySelector(".price")?.textContent?.trim(),
        rating: card.querySelector(".rating")?.textContent?.trim(),
        image: card.querySelector("img")?.src
      }));
    });
  } finally {
    // Close in a finally block: the original leaked a Chrome process
    // whenever goto/waitForSelector/evaluate threw.
    await browser.close();
  }
}
// Handle infinite scroll
/**
 * Handle infinite scroll: scroll to the bottom repeatedly, then collect items.
 *
 * @param {string} url - Page to load.
 * @param {number} [scrollCount=5] - How many times to scroll to the bottom.
 * @param {number} [delayMs=2000] - Wait after each scroll so lazy content can load
 *   (previously a hard-coded 2000 ms; now tunable, same default).
 * @returns {Promise<Array<{text: string}>>} Text of every `.item` on the page.
 */
async function scrapeInfiniteScroll(url, scrollCount = 5, delayMs = 2000) {
  const browser = await puppeteer.launch({ headless: "new" });

  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: "networkidle0" });

    for (let i = 0; i < scrollCount; i++) {
      await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
      // Give lazily-loaded content time to arrive before the next scroll.
      await new Promise((resolve) => setTimeout(resolve, delayMs));
    }

    return await page.evaluate(() => {
      return Array.from(document.querySelectorAll(".item")).map((el) => ({
        text: el.textContent.trim()
      }));
    });
  } finally {
    // Always release the browser, even if navigation or evaluation throws.
    await browser.close();
  }
}
Playwright: Multi-Browser Power
Playwright supports Chromium, Firefox, and WebKit (the engine behind Safari) with a cleaner API.
const { chromium } = require("playwright");

/**
 * Scrape search results from a page using Playwright's Chromium.
 *
 * @param {string} url - Page to load.
 * @returns {Promise<Array<{title?: string, price?: string, url?: string}>>}
 * @throws {Error} If navigation fails.
 */
async function scrapeWithPlaywright(url) {
  const browser = await chromium.launch({ headless: true });

  try {
    const context = await browser.newContext({
      userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
    });
    const page = await context.newPage();

    // Block heavy static assets (images, stylesheets, fonts) for faster scraping.
    await page.route("**/*.{png,jpg,jpeg,gif,css,woff2}", (route) => route.abort());

    await page.goto(url, { waitUntil: "domcontentloaded" });

    // Use locators (Playwright best practice).
    return await page.locator(".search-result").evaluateAll((nodes) =>
      nodes.map((node) => ({
        title: node.querySelector("h3")?.textContent?.trim(),
        price: node.querySelector(".price")?.textContent?.trim(),
        url: node.querySelector("a")?.href
      }))
    );
  } finally {
    // Closing the browser also closes its contexts and pages; doing it in
    // finally prevents the process leak the original had on any error.
    await browser.close();
  }
}
// Screenshot + scrape combo
/**
 * Screenshot + scrape combo: capture a full-page PNG and return the title.
 *
 * @param {string} url - Page to visit.
 * @returns {Promise<{title: string, screenshot: string}>} Title and the
 *   path of the written screenshot file.
 */
async function scrapeAndScreenshot(url) {
  const browser = await chromium.launch();

  try {
    const page = await browser.newPage();
    await page.goto(url);
    await page.screenshot({ path: "page.png", fullPage: true });
    const title = await page.title();
    return { title, screenshot: "page.png" };
  } finally {
    // Guarantee the browser shuts down even if goto/screenshot throws.
    await browser.close();
  }
}
Building a Production Scraper
const axios = require("axios");
const cheerio = require("cheerio");
const { chromium } = require("playwright");
/**
 * Scraper that picks the cheapest strategy per target: plain HTTP + Cheerio
 * for static pages, a headless Chromium via Playwright when JS rendering
 * is required. Optionally routes static requests through ScraperAPI.
 */
class SmartScraper {
  /**
   * @param {?string} [proxyKey=null] - ScraperAPI key; when set, static
   *   requests are proxied through api.scraperapi.com.
   */
  constructor(proxyKey = null) {
    this.proxyKey = proxyKey;
  }

  /**
   * Scrape `url` and return the inner HTML of `selector`.
   *
   * @param {string} url
   * @param {{jsRendering?: boolean, selector?: string}} [options]
   * @returns {Promise<?string>} Inner HTML, or null if the selector matches nothing (static path).
   */
  async scrape(url, options = {}) {
    const { jsRendering = false, selector = "body" } = options;
    if (jsRendering) {
      return this.browserScrape(url, selector);
    }
    return this.staticScrape(url, selector);
  }

  /** Fetch raw HTML (optionally via proxy) and parse it with Cheerio. */
  async staticScrape(url, selector) {
    const fetchUrl = this.proxyKey ? this.buildProxyUrl(url) : url;
    const { data } = await axios.get(fetchUrl);
    const $ = cheerio.load(data);
    return $(selector).html();
  }

  /**
   * Build the ScraperAPI request URL with the WHATWG URL API instead of
   * string concatenation — the original never encoded `api_key`, and
   * searchParams encodes both parameters correctly.
   *
   * @param {string} url - Target URL to pass through the proxy.
   * @returns {string} Fully-encoded proxy URL.
   */
  buildProxyUrl(url) {
    const proxyUrl = new URL("http://api.scraperapi.com");
    proxyUrl.searchParams.set("api_key", this.proxyKey);
    proxyUrl.searchParams.set("url", url);
    return proxyUrl.toString();
  }

  /** Render the page in headless Chromium and extract the selector's HTML. */
  async browserScrape(url, selector) {
    const browser = await chromium.launch({ headless: true });
    try {
      const page = await browser.newPage();
      await page.goto(url, { waitUntil: "networkidle" });
      return await page.locator(selector).innerHTML();
    } finally {
      // Close in finally: the original leaked the browser on any error.
      await browser.close();
    }
  }
}
Scaling Node.js Scrapers
For production scraping:
- ScraperAPI — proxy rotation and CAPTCHA solving, works with all three tools
- ThorData — residential proxies for sites that block datacenter IPs
- ScrapeOps — monitoring dashboard for your scraping pipeline
Performance Tips
- Use Cheerio when possible — 10x faster than browser-based scraping
- Block unnecessary resources in Puppeteer/Playwright
- Reuse browser instances instead of launching new ones
- Use connection pooling for concurrent requests
- Implement retry logic with exponential backoff
Conclusion
Pick the right tool for the job: Cheerio for static pages, Puppeteer for Chrome-specific needs, Playwright for multi-browser support. Combine them in a smart scraper that adapts to each target site.
Follow for more Node.js scraping tutorials!
Top comments (0)