Can You Still Scrape Amazon Reviews in 2026? Yes — Here's How
Amazon changed how review data is accessed. Users are hitting 429s, CAPTCHA walls, and "request gated" error messages where they used to get clean JSON. Here's what changed and what actually works now.
What Changed in Amazon Reviews
Amazon's recent update added:
- Session token requirement — you need an active session cookie before review API calls work
- Rate limiting — stricter per-IP limits, especially on review endpoints
- Progressive fingerprinting — the review pages now run more JS checks before returning data
- "Request gated" error — some reviews return {"error": "RequestThrottled"} without a real CAPTCHA
The good news: the underlying review data is still accessible — Amazon still shows it to real users. You just need a different approach.
Method 1: The Structured Product Data Approach (Easiest, Still Works)
Amazon embeds product data including review summaries in the page's JSON-LD schema. No authentication needed:
import requests
from bs4 import BeautifulSoup
import json, re
from curl_cffi import requests as cf_requests
def get_amazon_product_reviews_summary(asin: str, retries: int = 3):
    """
    Extract a review summary (name, rating, count) from an Amazon product page.

    Tries the page's JSON-LD structured data first, then falls back to the
    visible rating/count HTML elements. Uses curl_cffi Chrome impersonation
    to bypass TLS fingerprint detection.

    Args:
        asin: Product ASIN, e.g. "B08N5WRWNW".
        retries: Attempts before giving up; 503 responses trigger an
            exponential backoff between attempts.

    Returns:
        A dict with 'asin' plus whatever fields were found, or None if
        every attempt failed.
    """
    import time  # used only for the 503 backoff path

    url = f"https://www.amazon.com/dp/{asin}"
    session = cf_requests.Session()
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
    }
    for attempt in range(retries):
        response = session.get(url, impersonate="chrome124", headers=headers)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            # Extract from JSON-LD structured data. A page may contain several
            # ld+json scripts, and each script may hold either one entity or a
            # list of entities — handle both shapes.
            for script in soup.find_all('script', type='application/ld+json'):
                try:
                    payload = json.loads(script.string)
                except (json.JSONDecodeError, TypeError):
                    # TypeError: script.string is None for empty script tags
                    continue
                entities = payload if isinstance(payload, list) else [payload]
                for data in entities:
                    if isinstance(data, dict) and data.get('@type') == 'Product':
                        agg_rating = data.get('aggregateRating', {})
                        return {
                            'asin': asin,
                            'name': data.get('name', ''),
                            'average_rating': agg_rating.get('ratingValue'),
                            'review_count': agg_rating.get('reviewCount'),
                            # 'description' may be present but null — `or ''`
                            # guards the slice against None
                            'description': (data.get('description') or '')[:500]
                        }
            # Fallback: extract from the visible HTML widgets
            rating_elem = soup.find('span', {'data-hook': 'rating-out-of-text'})
            count_elem = soup.find('span', {'data-hook': 'total-review-count'})
            return {
                'asin': asin,
                'average_rating': rating_elem.text.strip() if rating_elem else None,
                'review_count': count_elem.text.strip() if count_elem else None,
            }
        elif response.status_code == 503:
            # 503 is Amazon's throttle signal — back off exponentially
            time.sleep(2 ** attempt * 3)
    return None
Method 2: Review Pages Directly (Full Review Text)
Amazon's review pages (/product-reviews/ASIN) are still scrapeable with the right setup:
import requests, time, random
from bs4 import BeautifulSoup
from curl_cffi import requests as cf_requests
# Residential proxy credentials — replace with your provider's values.
# Datacenter IPs are blocked far more aggressively; the article's success
# table assumes residential exits.
PROXY_USER = "your_residential_proxy_user"
PROXY_PASS = "your_residential_proxy_pass"
PROXY_HOST = "your.proxy.host:8080"  # host:port, no scheme prefix
def scrape_amazon_reviews(asin: str, pages: int = 5) -> list:
    """
    Scrape Amazon product reviews across multiple pages.

    Warms the session up on the main product page first (the review
    endpoint is gated behind an established session cookie), then walks
    /product-reviews/ASIN page by page with human-like delays and a
    believable Referer chain. Requires a residential proxy for
    consistent success.

    Args:
        asin: Product ASIN to scrape.
        pages: Maximum number of review pages to fetch.

    Returns:
        A list of review dicts; empty if the warm-up request failed or
        no reviews were found.
    """
    all_reviews = []
    session = cf_requests.Session()
    session.proxies = {
        "http": f"http://{PROXY_USER}:{PROXY_PASS}@{PROXY_HOST}",
        "https": f"http://{PROXY_USER}:{PROXY_PASS}@{PROXY_HOST}",
    }
    # Step 1: Warm up session on main product page
    product_url = f"https://www.amazon.com/dp/{asin}"
    warm_resp = session.get(product_url, impersonate="chrome124")
    # Bail out before pacing: no point sleeping 2-4s for a warm-up
    # request that already failed.
    if warm_resp.status_code != 200:
        return []
    time.sleep(random.uniform(2, 4))

    for page_num in range(1, pages + 1):
        review_url = f"https://www.amazon.com/product-reviews/{asin}?pageNumber={page_num}"
        # Referer chain: product page -> review page 1 -> page 2 -> ...
        headers = {
            "Referer": product_url if page_num == 1 else f"https://www.amazon.com/product-reviews/{asin}?pageNumber={page_num-1}",
        }
        response = session.get(review_url, impersonate="chrome124", headers=headers)
        if response.status_code != 200:
            print(f"Page {page_num}: status {response.status_code}")
            break
        soup = BeautifulSoup(response.text, 'html.parser')
        reviews = parse_reviews_from_page(soup)
        if not reviews:
            break  # No more reviews
        all_reviews.extend(reviews)
        # Human-like delay between pages
        time.sleep(random.uniform(3, 6))
    return all_reviews
def parse_reviews_from_page(soup: BeautifulSoup) -> list:
    """
    Pull every review card out of a parsed review-page DOM.

    Each review dict may carry 'rating', 'title', 'body', 'date', and
    always carries 'verified_purchase'. Reviews with no body text are
    dropped; cards that fail to parse are skipped silently.
    """
    extracted = []
    for card in soup.find_all('div', {'data-hook': 'review'}):
        try:
            # Locate all the sub-elements up front; any of them may be absent.
            stars = card.find('span', class_='review-rating')
            heading = card.find('span', {'data-hook': 'review-title'})
            text = card.find('span', {'data-hook': 'review-body'})
            when = card.find('span', {'data-hook': 'review-date'})
            badge = card.find('span', {'data-hook': 'avp-badge'})

            item = {}
            if stars:
                # Rating text looks like "4.0 out of 5 stars" — take the number
                item['rating'] = float(stars.text.split()[0])
            if heading:
                item['title'] = heading.text.strip()
            if text:
                item['body'] = text.text.strip()
            if when:
                item['date'] = when.text.strip()
            # The "Verified Purchase" badge's presence is the signal itself
            item['verified_purchase'] = badge is not None

            if item.get('body'):
                extracted.append(item)
        except AttributeError:
            continue
    return extracted
Method 3: Amazon API (Legitimate Route)
For price-sensitive projects, consider Amazon's official options:
Product Advertising API (PA API 5.0)
- Includes CustomerReviews in some response types
- Requires an active Associate account with sales
- Free but requires monthly traffic minimums
import boto3
from paapi5_python_sdk import DefaultApi, SearchItemsRequest, PartnerType
def get_reviews_via_api(asin: str, partner_tag: str, access_key: str, secret_key: str):
    """
    Fetch review count and star rating through the official PA API 5.0.

    Requires: paapi5-python-sdk, active Amazon Associates account
        pip install paapi5-python-sdk
    """
    api = DefaultApi(
        access_key=access_key,
        secret_key=secret_key,
        host="webservices.amazon.com",
        region="us-east-1",
    )
    # Request only the review-related resources we actually need.
    search = SearchItemsRequest(
        partner_tag=partner_tag,
        partner_type=PartnerType.ASSOCIATES,
        keywords=asin,
        resources=["CustomerReviews.Count", "CustomerReviews.StarRating"],
    )
    return api.search_items(search)
Handling the "RequestThrottled" Error
If you're getting this error, it means you hit the rate limit before getting data:
import time, random
def amazon_request_with_backoff(session, url, max_retries=5):
    """
    GET *url* through *session* with backoff tuned to Amazon's throttling.

    Handles three block signals:
      * soft block — HTTP 200 whose body contains "RequestThrottled" or
        "Robot Check" — with a long, jittered exponential backoff;
      * HTTP 503 — shorter exponential backoff;
      * HTTP 429 — honors the Retry-After header when it is numeric.

    Args:
        session: A curl_cffi-style session exposing .get(url, impersonate=...).
        url: Absolute URL to fetch.
        max_retries: Attempts before giving up.

    Returns:
        The first response that is HTTP 200 and not soft-blocked.

    Raises:
        Exception: if every attempt was blocked or failed.
    """
    for attempt in range(max_retries):
        response = session.get(url, impersonate="chrome124")
        if response.status_code == 200:
            # Check for soft-block in response body: Amazon often returns
            # 200 with an error payload instead of a real CAPTCHA page.
            if "RequestThrottled" in response.text or "Robot Check" in response.text:
                wait = (2 ** attempt) * 10 + random.uniform(0, 5)
                print(f"Soft blocked. Waiting {wait:.0f}s...")
                time.sleep(wait)
                continue
            return response
        elif response.status_code == 503:
            wait = (2 ** attempt) * 5
            time.sleep(wait)
        elif response.status_code == 429:
            # Retry-After may be delay-seconds OR an HTTP-date (RFC 9110);
            # float() on a date string would crash, so fall back to 30s.
            try:
                retry_after = float(response.headers.get('Retry-After', 30))
            except (TypeError, ValueError):
                retry_after = 30.0
            time.sleep(retry_after + random.uniform(1, 5))
    raise Exception(f"Failed after {max_retries} attempts")
Rate Limits in Practice
From testing in April 2026:
| Approach | Success Rate | Required |
|---|---|---|
| Direct requests (no proxy) | ~15% | Nothing |
| curl_cffi Chrome impersonation | ~45% | curl_cffi |
| curl_cffi + residential proxy | ~80% | Proxy service |
| curl_cffi + residential + session warm-up | ~92% | Proxy + patience |
| Rotating mobile proxies | ~95% | Mobile proxy (expensive) |
Legal Note
Amazon's Terms of Service restrict automated access. This guide is for educational purposes. For production use cases, use the official PA API or consider commercial data providers (Rainforest API, DataForSEO, BlueCart) which provide legal access to Amazon data at scale.
Related Articles
- Web Scraping Without Getting Banned in 2026 — Full anti-detection playbook
- curl_cffi Stopped Working? Here's What to Try Next — When TLS impersonation breaks
Take the next step
Skip the setup. Production-ready tools for Amazon review scraping:
Apify Scrapers Bundle — $29 one-time
Instant download. Documented. Ready to deploy.
Top comments (0)