Why Scrape Amazon?
Amazon product data powers price comparison tools, market research, competitor analysis, and review aggregation. With millions of products and constantly changing prices, automated collection is the only practical approach.
Let's build a Python scraper that extracts product listings, reviews, and pricing data from Amazon.
The Amazon Scraping Challenge
Amazon has some of the most aggressive anti-bot protection on the web:
- CAPTCHAs after a few requests
- Browser fingerprinting
- IP blocking within minutes
- Dynamic HTML that changes per region
A proxy service is practically mandatory. ScraperAPI has a dedicated Amazon endpoint that handles all of this.
Setting Up
pip install requests beautifulsoup4 pandas
Product Listing Scraper
import json
import time
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup
class AmazonScraper:
    """Scrapes Amazon search results through the ScraperAPI proxy endpoint.

    All requests are routed via ScraperAPI, which handles CAPTCHAs, IP
    rotation, and fingerprinting on Amazon's side.
    """

    def __init__(self, api_key):
        """Store the ScraperAPI key and the proxy endpoint base URL."""
        self.api_key = api_key
        self.base = "https://api.scraperapi.com"

    def fetch(self, url):
        """Fetch *url* through ScraperAPI and return a parsed BeautifulSoup tree.

        Raises requests.HTTPError on a non-2xx response.
        """
        resp = requests.get(
            self.base,
            params={
                "api_key": self.api_key,
                "url": url,
                "render": "true",      # render JS; parts of Amazon's markup are dynamic
                "country_code": "us",  # pin region so the HTML layout stays consistent
            },
            # ScraperAPI holds the connection open while it retries upstream,
            # so allow a generous timeout rather than hanging forever.
            timeout=70,
        )
        resp.raise_for_status()
        return BeautifulSoup(resp.text, "html.parser")

    def search_products(self, query, pages=1):
        """Return parsed product dicts from *pages* search-result pages for *query*."""
        products = []
        for page in range(1, pages + 1):
            # quote_plus: spaces/special characters in the query must be URL-encoded
            url = f"https://www.amazon.com/s?k={quote_plus(query)}&page={page}"
            soup = self.fetch(url)
            products.extend(self._parse_search_results(soup))
            if page < pages:
                time.sleep(3)  # respectful delay between pages (none after the last)
        return products

    def _parse_search_results(self, soup):
        """Extract product dicts from one search-results page."""
        results = []
        for item in soup.select('[data-component-type="s-search-result"]'):
            asin = item.get("data-asin", "")
            if not asin:
                continue  # ad/placeholder tiles carry no ASIN
            title_el = item.select_one("h2 a span")
            price_whole = item.select_one(".a-price-whole")
            price_frac = item.select_one(".a-price-fraction")
            rating_el = item.select_one(".a-icon-star-small .a-icon-alt")
            reviews_el = item.select_one('[data-csa-c-slot-id="alf-reviews"] span')
            price = None
            if price_whole:
                # Strip the trailing dot and thousands separators: "1,299." -> "1299"
                whole = price_whole.get_text(strip=True).rstrip(".").replace(",", "")
                frac = price_frac.get_text(strip=True) if price_frac else "00"
                try:
                    price = float(f"{whole}.{frac}")
                except ValueError:
                    price = None  # unexpected price markup; leave the item unpriced
            results.append({
                "asin": asin,
                "title": title_el.get_text(strip=True) if title_el else None,
                "price": price,
                "rating": rating_el.get_text(strip=True) if rating_el else None,
                "review_count": reviews_el.get_text(strip=True) if reviews_el else None,
                "url": f"https://www.amazon.com/dp/{asin}",
            })
        return results
Product Detail Scraper
def get_product_details(self, asin):
    """Fetch the product page for *asin* and return a dict of its details.

    The returned dict always has the same keys; fields missing from the
    page are None (features falls back to an empty list), so callers can
    rely on a stable schema.
    """
    url = f"https://www.amazon.com/dp/{asin}"
    soup = self.fetch(url)
    details = {"asin": asin}
    # Title
    title = soup.select_one("#productTitle")
    details["title"] = title.get_text(strip=True) if title else None
    # Price — the .a-offscreen span holds the full display string, e.g. "$12.34"
    price = soup.select_one(".a-price .a-offscreen")
    details["price"] = price.get_text(strip=True) if price else None
    # Rating, e.g. "4.5 out of 5 stars"
    rating = soup.select_one("#acrPopover .a-icon-alt")
    details["rating"] = rating.get_text(strip=True) if rating else None
    # Review count, e.g. "1,234 ratings"
    reviews = soup.select_one("#acrCustomerReviewText")
    details["review_count"] = reviews.get_text(strip=True) if reviews else None
    # Feature bullets
    bullets = soup.select("#feature-bullets li span")
    details["features"] = [b.get_text(strip=True) for b in bullets]
    # Availability
    avail = soup.select_one("#availability span")
    details["availability"] = avail.get_text(strip=True) if avail else None
    # Best Seller Rank — markup varies by category, so keep a truncated text blob
    bsr = soup.select_one("#SalesRank, #detailBulletsWrapper_feature_div")
    details["bsr"] = bsr.get_text(strip=True)[:200] if bsr else None
    return details
Review Scraper
def get_reviews(self, asin, pages=3):
    """Collect reviews for *asin* across up to *pages* review pages.

    Pages are fetched newest-first (sortBy=recent); returns a flat list of
    review dicts as produced by _parse_reviews.
    """
    reviews = []
    for page in range(1, pages + 1):
        url = f"https://www.amazon.com/product-reviews/{asin}?pageNumber={page}&sortBy=recent"
        soup = self.fetch(url)
        reviews.extend(self._parse_reviews(soup))
        if page < pages:
            time.sleep(3)  # respectful delay between pages, skipped after the last
    return reviews
def _parse_reviews(self, soup):
    """Turn one review page into a list of review dicts (missing fields -> None)."""

    def text_of(node, selector):
        # Stripped text of the first element matching *selector*, or None.
        el = node.select_one(selector)
        return el.get_text(strip=True) if el else None

    parsed = []
    for block in soup.select('[data-hook="review"]'):
        parsed.append({
            "title": text_of(block, '[data-hook="review-title"] span'),
            "body": text_of(block, '[data-hook="review-body"] span'),
            "rating": text_of(block, '[data-hook="review-star-rating"] .a-icon-alt'),
            "date": text_of(block, '[data-hook="review-date"]'),
            # Presence of the badge element alone marks a verified purchase.
            "verified": block.select_one('[data-hook="avp-badge"]') is not None,
        })
    return parsed
Price Tracking
import sqlite3
from datetime import datetime
class PriceTracker:
    """Records product prices over time in a local SQLite database."""

    def __init__(self, scraper, db_path="prices.db"):
        """Open (or create) the price database and ensure the table exists.

        scraper: object exposing get_product_details(asin) -> dict.
        db_path: SQLite file path; defaults to "prices.db" as before.
                 Pass ":memory:" for an ephemeral database (e.g. in tests).
        """
        self.scraper = scraper
        self.db = sqlite3.connect(db_path)
        self.db.execute("""
            CREATE TABLE IF NOT EXISTS prices (
                asin TEXT,
                price REAL,
                timestamp TEXT
            )
        """)

    def track(self, asins):
        """Scrape the current price for each ASIN and append one row per hit.

        ASINs whose detail page has no parsable price are skipped silently.
        """
        for asin in asins:
            details = self.scraper.get_product_details(asin)
            price_str = details.get("price", "")
            if price_str:
                # Price arrives as a display string like "$1,299.00"
                price = float(price_str.replace("$", "").replace(",", ""))
                self.db.execute(
                    "INSERT INTO prices VALUES (?, ?, ?)",
                    (asin, price, datetime.now().isoformat())
                )
                print(f"{asin}: ${price}")
            time.sleep(2)  # respectful delay between product-page fetches
        self.db.commit()

    def get_history(self, asin):
        """Return [(price, iso_timestamp), ...] for *asin*, oldest first.

        ISO-8601 timestamps sort lexicographically, so TEXT ordering is
        chronological.
        """
        cursor = self.db.execute(
            "SELECT price, timestamp FROM prices WHERE asin = ? ORDER BY timestamp",
            (asin,)
        )
        return cursor.fetchall()

    def close(self):
        """Close the underlying database connection."""
        self.db.close()
Usage Example
# Demo: run a search, then drill into the first hit's details and reviews.
scraper = AmazonScraper(api_key="YOUR_SCRAPERAPI_KEY")

# Search for products
products = scraper.search_products("wireless earbuds", pages=2)
print(f"Found {len(products)} products")

if products:
    top = products[0]
    # Get details for top result
    details = scraper.get_product_details(top["asin"])
    print(json.dumps(details, indent=2))
    # Get reviews for the same product
    reviews = scraper.get_reviews(top["asin"], pages=2)
    print(f"Got {len(reviews)} reviews")
Proxy Strategy
For large-scale Amazon scraping:
- ScraperAPI has a dedicated Amazon parser that returns structured JSON
- ThorData residential proxies work well for raw requests
- Monitor your success rates with ScrapeOps
Legal Notes
- Use scraped data for personal research and analysis
- Do not republish Amazon's copyrighted content
- Respect rate limits to avoid impacting their service
- Consider the Amazon Product Advertising API for affiliate use cases
- Be aware that automated scraping may violate Amazon's Terms of Service; review them and seek legal advice before any commercial use
Conclusion
Amazon scraping requires robust proxy handling and careful rate limiting, but the data is incredibly valuable. Whether you are tracking competitor prices, analyzing reviews for product research, or building a price comparison tool, these patterns will get you started.
Top comments (0)