How I Finally Beat Amazon's Bot Detection (and Built a Powerful Web Scraper That Works!)
Once your scraper function (scrapeAmazonProductPage) is ready, the next step is to wrap it inside a simple Express.js API.
This allows you (or any client app) to send a request with a product URL and get structured data in return.
Step 1 – Install Dependencies
If you haven't already:
npm install express cors puppeteer cheerio crawler
You should now have these main dependencies:
{
  "cheerio": "^1.0.0-rc.12",
  "cors": "^2.8.5",
  "crawler": "^1.5.0",
  "puppeteer": "^16.2.0",
  "express": "^4.19.2"
}
Make sure your Node.js version is 20 or above for optimal Puppeteer compatibility.
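If you want npm to warn during install when the Node.js version is too old, you can optionally declare this in package.json (a small, optional addition):
{
  "engines": {
    "node": ">=20"
  }
}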
Step 2 – Create Project Structure
Here's a suggested folder layout:
amazon-scraper/
├── package.json
├── server.js
└── src/
    ├── scraper/
    │   └── amazon.js
    └── services/
        └── scrapping.js
server.js – entry point for Express
src/scraper/amazon.js – your scraper logic (the code you already have)
src/services/scrapping.js – optional, for error logging (you can mock this for now)
Step 3 – Example Mock for Error Saver
Create a dummy service in src/services/scrapping.js:
// src/services/scrapping.js
async function saveScrappingErrors(errorObj) {
  console.error("Scraping error:", errorObj);
}

module.exports = { saveScrappingErrors };
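Later, if you want something slightly more durable than console output, one option is appending errors to a local log file. A minimal sketch (errors.log is just an example path):
// src/services/scrapping.js – file-backed variant (optional)
const fs = require('fs/promises');

async function saveScrappingErrors(errorObj) {
  const line = JSON.stringify({ ...errorObj, at: new Date().toISOString() });
  // One JSON object per line keeps the log easy to grep and parse later.
  await fs.appendFile('errors.log', line + '\n');
  console.error("Scraping error:", errorObj);
}

module.exports = { saveScrappingErrors };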
Step 4 – The Scraper (amazon.js)
Use your scraper function exactly as before.
Let's clean it up slightly for API use and export it properly:
// src/scraper/amazon.js
const puppeteer = require('puppeteer');
const cheerio = require('cheerio');
const Crawler = require('crawler');
const { saveScrappingErrors } = require('../services/scrapping');
const crawlPage = (url, browser) => {
  return new Promise((resolve, reject) => {
    const c = new Crawler({
      maxConnections: 100000,
      skipDuplicates: true,
      callback: async (error, res, done) => {
        if (error) return reject(error);
        try {
          const $ = cheerio.load(res.body);
          // Retry when Amazon returns a page without the review histogram
          // (usually a bot-check or partially rendered page).
          if (!$('#histogramTable').length) return resolve(await crawlPage(url, browser));

          const reviews = [];
          const reviewElements = $('.a-section.review[data-hook="review"]');
          const review_rating = $('[data-hook="average-star-rating"]').text().trim();
          const review_count = $('[data-hook="total-review-count"]').text().trim().split(' ')[0];
          const name = $('#productTitle').text().trim();
          const description = $('#feature-bullets .a-list-item').text().trim();
          const product_author = $('#bylineInfo').text().trim();
          const regex = /\b\d+(\.\d+)?\b/;

          reviewElements.each((_, el) => {
            const author = $(el).find('.a-profile-name').text().trim();
            const content = $(el).find('.review-text').text().trim();
            const title = $(el).find('[data-hook="review-title"]').text().trim();
            const date = $(el).find('[data-hook="review-date"]').text().trim();
            let stars = $(el).find('.review-rating span').text().trim();
            const match = stars.match(regex);
            stars = match ? parseFloat(match[0]) : '';
            reviews.push({ author, content, title, date, rating: stars });
          });

          const extractStars = () => {
            const starsPercentageArray = [];
            $('#histogramTable .a-histogram-row').each((_, el) => {
              const percentageText = $(el).find('.a-text-right a').text();
              const percentage = parseInt(percentageText.replace('%', ''), 10);
              const starsText = $(el).find('a.a-size-base').text();
              const number_of_stars = parseInt(starsText, 10);
              starsPercentageArray.push({ percentage: percentage || 0, number_of_stars });
            });
            return starsPercentageArray;
          };

          const extractMainImage = () => $('#imgTagWrapperId img').attr('src') || '';

          const core_price = $('#corePriceDisplay_desktop_feature_div .a-section .aok-offscreen').text().trim();
          const currencyPattern = /\$\d{1,3}(?:,\d{3})*(?:\.\d{1,2})?/;
          const match = core_price.match(currencyPattern);
          const extractedCurrency = match ? match[0] : "";

          // Load the fetched HTML into Puppeteer and hover each thumbnail so the
          // full-size gallery images are injected into the DOM before we read them.
          const extractImages = async () => {
            const htmlContent = res.body;
            const page = await browser.newPage();
            await page.setContent(htmlContent, { waitUntil: 'load', timeout: 0 });
            const thumbnails = await page.$$('#altImages ul .imageThumbnail');
            for (const thumbnail of thumbnails) {
              await page.evaluate(el => el instanceof HTMLElement && el.scrollIntoView(), thumbnail);
              await thumbnail.hover();
            }
            await page.waitForTimeout(1000);
            const productData = await page.evaluate(() => {
              const images = [];
              document.querySelectorAll('.a-unordered-list .image .imgTagWrapper img').forEach(img => {
                if (img && img.src && !img.src.endsWith('.svg')) images.push(img.src);
              });
              return images;
            });
            return productData;
          };

          const images_data = await extractImages();

          resolve({
            websiteName: 'Amazon',
            reviews,
            product_images_links: images_data,
            review_rating,
            review_count,
            price: extractedCurrency,
            name,
            description,
            product_author,
            stars: extractStars(),
            image_url: extractMainImage(),
          });
        } catch (err) {
          reject(err);
        } finally {
          done();
        }
      },
    });

    c.queue(url);
  });
};

async function scrapeAmazonProductPage(homeUrl) {
  const browser = await puppeteer.launch({
    headless: true,
    ignoreHTTPSErrors: true,
    args: [
      "--disable-gpu",
      "--disable-dev-shm-usage",
      "--disable-setuid-sandbox",
      "--no-sandbox",
    ],
  });
  try {
    const data = await crawlPage(homeUrl, browser);
    return data;
  } catch (e) {
    await saveScrappingErrors({ error: e.message || e, url: homeUrl });
    return null;
  } finally {
    await browser.close();
  }
}

module.exports = { scrapeAmazonProductPage };
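Before wiring it into Express, you can sanity-check the module on its own with a throwaway script (test-scraper.js is just an example name; use any Amazon product URL):
// test-scraper.js – quick standalone check of the scraper module
const { scrapeAmazonProductPage } = require('./src/scraper/amazon');

(async () => {
  const data = await scrapeAmazonProductPage('https://www.amazon.com/dp/B0BP9Z7K5V');
  console.log(JSON.stringify(data, null, 2));
})();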
Step 5 – Create Express API
Now create server.js in the root:
// server.js
const express = require('express');
const cors = require('cors');
const { scrapeAmazonProductPage } = require('./src/scraper/amazon');

const app = express();
app.use(express.json());
app.use(cors());

// Health check
app.get('/', (req, res) => {
  res.send('Amazon Scraper API is running...');
});

// Main API endpoint
app.post('/api/scrape', async (req, res) => {
  const { url } = req.body;
  if (!url || !url.includes('amazon')) {
    return res.status(400).json({ error: 'Invalid or missing Amazon URL' });
  }
  try {
    const data = await scrapeAmazonProductPage(url);
    if (!data) {
      return res.status(500).json({ error: 'Failed to scrape product data' });
    }
    res.json(data);
  } catch (error) {
    console.error('Scrape failed:', error);
    res.status(500).json({ error: error.message || 'Unexpected error' });
  }
});

const PORT = process.env.PORT || 4000;
app.listen(PORT, () => console.log(`Server running on port ${PORT}`));
Step 6 – Test the API
Run the server:
node server.js
Then use Postman, curl, or any HTTP client to test:
Request:
POST http://localhost:4000/api/scrape
Content-Type: application/json
{
  "url": "https://www.amazon.com/dp/B0BP9Z7K5V"
}
Response:
{
  "websiteName": "Amazon",
  "name": "Apple AirPods (3rd Generation)",
  "price": "$169.99",
  "review_rating": "4.7 out of 5 stars",
  "review_count": "145,201",
  "description": "Spatial Audio with dynamic head tracking...",
  "product_author": "Apple",
  "stars": [
    { "number_of_stars": 5, "percentage": 85 },
    { "number_of_stars": 4, "percentage": 10 }
  ],
  "product_images_links": [
    "https://m.media-amazon.com/images/I/61ZRU9gnbxL._AC_SL1500_.jpg",
    "https://m.media-amazon.com/images/I/61dw1VHfwbL._AC_SL1500_.jpg"
  ]
}
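The same request from the command line with curl (assuming the server is running locally on port 4000):
curl -X POST http://localhost:4000/api/scrape \
  -H "Content-Type: application/json" \
  -d '{"url": "https://www.amazon.com/dp/B0BP9Z7K5V"}'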
Step 7 – Tips for Production
- Use rate limiting to avoid getting blocked by Amazon.
- Deploy behind a proxy or rotating-IP setup if you scrape frequently.
- Consider puppeteer-extra-plugin-stealth for better bot-detection evasion (see the sketch below).
- Cache results in a database if you'll reuse them often.
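As a rough sketch of the first and third tips (not drop-in code: it assumes you also run npm install express-rate-limit puppeteer-extra puppeteer-extra-plugin-stealth, and the window and limit values are arbitrary examples):
// server.js – throttle the scrape endpoint
const rateLimit = require('express-rate-limit');

const scrapeLimiter = rateLimit({
  windowMs: 60 * 1000, // 1-minute window
  max: 10,             // at most 10 scrape requests per IP per window
});
app.use('/api/scrape', scrapeLimiter);

// src/scraper/amazon.js – swap plain puppeteer for puppeteer-extra + stealth
const puppeteer = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
puppeteer.use(StealthPlugin());
// puppeteer.launch(...) is then called exactly as before.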
Top comments (1)
If you're thinking about taking this to production, there's definitely some stuff worth leveling up on: throw in some retry logic with exponential backoff to handle those flaky network hiccups, definitely pool your browser instances instead of spinning up a new one every time (that's gonna drain your wallet), and swap out console.log() for an actual logger that's built for the job. Oh, and heads up: don't forget the cors package in your npm install, since the server imports it. Also, that URL validation checking for 'amazon' in the string is pretty loose; you'd want something more legit. Real talk though: scraping Amazon might step on their Terms of Service, so it's worth checking out their official APIs or at least rotating your User-Agents to stay in their good graces.
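To make the retry suggestion above concrete, a wrapper with exponential backoff could look roughly like this (a sketch; scrapeWithRetry, the retry count, and the base delay are all hypothetical choices):
// Hypothetical helper: retry the scrape with exponential backoff
async function scrapeWithRetry(url, retries = 3, baseDelayMs = 1000) {
  for (let attempt = 0; attempt < retries; attempt++) {
    const data = await scrapeAmazonProductPage(url);
    if (data) return data;
    // Wait 1s, 2s, 4s, ... between attempts to ride out transient failures.
    await new Promise(resolve => setTimeout(resolve, baseDelayMs * 2 ** attempt));
  }
  return null;
}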