Why Scrape Amazon?
Amazon product data powers price comparison tools, market research, competitor analysis, and review aggregation. With millions of products and constantly changing prices, automated collection is the only practical approach.
Let's build a Python scraper that extracts product listings, reviews, and pricing data from Amazon.
The Amazon Scraping Challenge
Amazon has some of the most aggressive anti-bot protection on the web:
- CAPTCHAs after a few requests
- Browser fingerprinting
- IP blocking within minutes
- Dynamic HTML that changes per region
A proxy service is practically mandatory. ScraperAPI has a dedicated Amazon endpoint that handles all of this.
Setting Up
pip install requests beautifulsoup4 pandas
Product Listing Scraper
import json
import time
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup
class AmazonScraper:
    """Scrapes Amazon search results through the ScraperAPI proxy endpoint.

    All requests are routed via ScraperAPI, which handles CAPTCHAs, IP
    rotation, and fingerprinting on Amazon's side.
    """

    def __init__(self, api_key):
        """Store the ScraperAPI key and the proxy endpoint base URL."""
        self.api_key = api_key
        self.base = "https://api.scraperapi.com"

    def fetch(self, url):
        """Fetch *url* through ScraperAPI and return a parsed BeautifulSoup tree.

        Raises requests.HTTPError on a non-2xx response.
        """
        resp = requests.get(
            self.base,
            params={
                "api_key": self.api_key,
                "url": url,
                "render": "true",      # render JS; parts of Amazon's markup are dynamic
                "country_code": "us",  # pin region so the HTML layout stays consistent
            },
            # ScraperAPI holds the connection open while it retries upstream,
            # so allow a generous timeout rather than hanging forever.
            timeout=70,
        )
        resp.raise_for_status()
        return BeautifulSoup(resp.text, "html.parser")

    def search_products(self, query, pages=1):
        """Return parsed product dicts from *pages* search-result pages for *query*."""
        products = []
        for page in range(1, pages + 1):
            # quote_plus: spaces/special characters in the query must be URL-encoded
            url = f"https://www.amazon.com/s?k={quote_plus(query)}&page={page}"
            soup = self.fetch(url)
            products.extend(self._parse_search_results(soup))
            if page < pages:
                time.sleep(3)  # respectful delay between pages (none after the last)
        return products

    def _parse_search_results(self, soup):
        """Extract product dicts from one search-results page."""
        results = []
        for item in soup.select('[data-component-type="s-search-result"]'):
            asin = item.get("data-asin", "")
            if not asin:
                continue  # ad/placeholder tiles carry no ASIN
            title_el = item.select_one("h2 a span")
            price_whole = item.select_one(".a-price-whole")
            price_frac = item.select_one(".a-price-fraction")
            rating_el = item.select_one(".a-icon-star-small .a-icon-alt")
            reviews_el = item.select_one('[data-csa-c-slot-id="alf-reviews"] span')
            price = None
            if price_whole:
                # Strip the trailing dot and thousands separators: "1,299." -> "1299"
                whole = price_whole.get_text(strip=True).rstrip(".").replace(",", "")
                frac = price_frac.get_text(strip=True) if price_frac else "00"
                try:
                    price = float(f"{whole}.{frac}")
                except ValueError:
                    price = None  # unexpected price markup; leave the item unpriced
            results.append({
                "asin": asin,
                "title": title_el.get_text(strip=True) if title_el else None,
                "price": price,
                "rating": rating_el.get_text(strip=True) if rating_el else None,
                "review_count": reviews_el.get_text(strip=True) if reviews_el else None,
                "url": f"https://www.amazon.com/dp/{asin}",
            })
        return results
Product Detail Scraper
def get_product_details(self, asin):
    """Fetch the product page for *asin* and return a dict of its details.

    The returned dict always has the same keys; fields missing from the
    page are None (features falls back to an empty list), so callers can
    rely on a stable schema.
    """
    url = f"https://www.amazon.com/dp/{asin}"
    soup = self.fetch(url)
    details = {"asin": asin}
    # Title
    title = soup.select_one("#productTitle")
    details["title"] = title.get_text(strip=True) if title else None
    # Price — the .a-offscreen span holds the full display string, e.g. "$12.34"
    price = soup.select_one(".a-price .a-offscreen")
    details["price"] = price.get_text(strip=True) if price else None
    # Rating, e.g. "4.5 out of 5 stars"
    rating = soup.select_one("#acrPopover .a-icon-alt")
    details["rating"] = rating.get_text(strip=True) if rating else None
    # Review count, e.g. "1,234 ratings"
    reviews = soup.select_one("#acrCustomerReviewText")
    details["review_count"] = reviews.get_text(strip=True) if reviews else None
    # Feature bullets
    bullets = soup.select("#feature-bullets li span")
    details["features"] = [b.get_text(strip=True) for b in bullets]
    # Availability
    avail = soup.select_one("#availability span")
    details["availability"] = avail.get_text(strip=True) if avail else None
    # Best Seller Rank — markup varies by category, so keep a truncated text blob
    bsr = soup.select_one("#SalesRank, #detailBulletsWrapper_feature_div")
    details["bsr"] = bsr.get_text(strip=True)[:200] if bsr else None
    return details
Review Scraper
def get_reviews(self, asin, pages=3):
    """Collect reviews for *asin* across up to *pages* review pages.

    Pages are fetched newest-first (sortBy=recent); returns a flat list of
    review dicts as produced by _parse_reviews.
    """
    reviews = []
    for page in range(1, pages + 1):
        url = f"https://www.amazon.com/product-reviews/{asin}?pageNumber={page}&sortBy=recent"
        soup = self.fetch(url)
        reviews.extend(self._parse_reviews(soup))
        if page < pages:
            time.sleep(3)  # respectful delay between pages, skipped after the last
    return reviews
def _parse_reviews(self, soup):
    """Turn one review page into a list of review dicts (missing fields -> None)."""

    def text_of(node, selector):
        # Stripped text of the first element matching *selector*, or None.
        el = node.select_one(selector)
        return el.get_text(strip=True) if el else None

    parsed = []
    for block in soup.select('[data-hook="review"]'):
        parsed.append({
            "title": text_of(block, '[data-hook="review-title"] span'),
            "body": text_of(block, '[data-hook="review-body"] span'),
            "rating": text_of(block, '[data-hook="review-star-rating"] .a-icon-alt'),
            "date": text_of(block, '[data-hook="review-date"]'),
            # Presence of the badge element alone marks a verified purchase.
            "verified": block.select_one('[data-hook="avp-badge"]') is not None,
        })
    return parsed
Price Tracking
import sqlite3
from datetime import datetime
class PriceTracker:
    """Records product prices over time in a local SQLite database."""

    def __init__(self, scraper, db_path="prices.db"):
        """Open (or create) the price database and ensure the table exists.

        scraper: object exposing get_product_details(asin) -> dict.
        db_path: SQLite file path; defaults to "prices.db" as before.
                 Pass ":memory:" for an ephemeral database (e.g. in tests).
        """
        self.scraper = scraper
        self.db = sqlite3.connect(db_path)
        self.db.execute("""
            CREATE TABLE IF NOT EXISTS prices (
                asin TEXT,
                price REAL,
                timestamp TEXT
            )
        """)

    def track(self, asins):
        """Scrape the current price for each ASIN and append one row per hit.

        ASINs whose detail page has no parsable price are skipped silently.
        """
        for asin in asins:
            details = self.scraper.get_product_details(asin)
            price_str = details.get("price", "")
            if price_str:
                # Price arrives as a display string like "$1,299.00"
                price = float(price_str.replace("$", "").replace(",", ""))
                self.db.execute(
                    "INSERT INTO prices VALUES (?, ?, ?)",
                    (asin, price, datetime.now().isoformat())
                )
                print(f"{asin}: ${price}")
            time.sleep(2)  # respectful delay between product-page fetches
        self.db.commit()

    def get_history(self, asin):
        """Return [(price, iso_timestamp), ...] for *asin*, oldest first.

        ISO-8601 timestamps sort lexicographically, so TEXT ordering is
        chronological.
        """
        cursor = self.db.execute(
            "SELECT price, timestamp FROM prices WHERE asin = ? ORDER BY timestamp",
            (asin,)
        )
        return cursor.fetchall()

    def close(self):
        """Close the underlying database connection."""
        self.db.close()
Usage Example
# Demo: run a search, then drill into the first hit's details and reviews.
scraper = AmazonScraper(api_key="YOUR_SCRAPERAPI_KEY")

# Search for products
products = scraper.search_products("wireless earbuds", pages=2)
print(f"Found {len(products)} products")

if products:
    top = products[0]
    # Get details for top result
    details = scraper.get_product_details(top["asin"])
    print(json.dumps(details, indent=2))
    # Get reviews for the same product
    reviews = scraper.get_reviews(top["asin"], pages=2)
    print(f"Got {len(reviews)} reviews")
Proxy Strategy
For large-scale Amazon scraping:
- ScraperAPI has a dedicated Amazon parser that returns structured JSON
- ThorData residential proxies work well for raw requests
- Monitor your success rates with ScrapeOps
Legal Notes
- Use scraped data for personal research and analysis
- Do not republish Amazon's copyrighted content
- Respect rate limits to avoid impacting their service
- Consider the Amazon Product Advertising API for affiliate use cases
- Be aware that automated scraping may violate Amazon's Terms of Service; review them and seek legal advice before any commercial use
Conclusion
Amazon scraping requires robust proxy handling and careful rate limiting, but the data is incredibly valuable. Whether you are tracking competitor prices, analyzing reviews for product research, or building a price comparison tool, these patterns will get you started.
Top comments (0)