
Vhub Systems

Can You Still Scrape Amazon Reviews in 2026? Yes — Here's How

Amazon changed how review data is accessed. Users are hitting 429s, CAPTCHA walls, and "request gated" error messages where they used to get clean JSON. Here's what changed and what actually works now.

What Changed in Amazon Reviews

Amazon's recent update added:

  1. Session token requirement — you need an active session cookie before review API calls work
  2. Rate limiting — stricter per-IP limits, especially on review endpoints
  3. Progressive fingerprinting — the review pages now run more JS checks before returning data
  4. "Request gated" error — some reviews return {"error": "RequestThrottled"} without a real CAPTCHA

The good news: the underlying review data is still accessible — Amazon still shows it to real users. You just need a different approach.
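
Before picking a method, it helps to know which wall you actually hit, since the right response differs for each. A small helper (the function name and labels are mine, not anything from Amazon's side) can classify a blocked response:

```python
def classify_block(status_code: int, body: str) -> str:
    """Label the kind of block a response represents (illustrative labels)."""
    if status_code == 429:
        return "rate_limited"      # hard per-IP rate limit
    if status_code == 503:
        return "service_block"     # Amazon's classic bot-wall status
    if "RequestThrottled" in body:
        return "request_gated"     # soft block: 200 status, no real data
    if "Robot Check" in body or "captcha" in body.lower():
        return "captcha"
    return "ok"
```

A 429 calls for backoff, a CAPTCHA usually means the session is burned, and a "request_gated" 200 is the sneakiest case because naive code treats it as success.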

Method 1: The Structured Product Data Approach (Easiest, Still Works)

Amazon embeds product data including review summaries in the page's JSON-LD schema. No authentication needed:

import json
from bs4 import BeautifulSoup
from curl_cffi import requests as cf_requests

def get_amazon_product_reviews_summary(asin: str, retries: int = 3):
    """
    Extract review summary from Amazon product page.
    Uses curl_cffi to bypass TLS fingerprint detection.
    """
    url = f"https://www.amazon.com/dp/{asin}"

    session = cf_requests.Session()

    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
    }

    for attempt in range(retries):
        response = session.get(url, impersonate="chrome124", headers=headers)

        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract from JSON-LD structured data
            for script in soup.find_all('script', type='application/ld+json'):
                try:
                    data = json.loads(script.string)
                    if data.get('@type') == 'Product':
                        agg_rating = data.get('aggregateRating', {})
                        return {
                            'asin': asin,
                            'name': data.get('name', ''),
                            'average_rating': agg_rating.get('ratingValue'),
                            'review_count': agg_rating.get('reviewCount'),
                            'description': data.get('description', '')[:500]
                        }
                except (json.JSONDecodeError, AttributeError):
                    continue

            # Fallback: extract from HTML
            rating_elem = soup.find('span', {'data-hook': 'rating-out-of-text'})
            count_elem = soup.find('span', {'data-hook': 'total-review-count'})

            return {
                'asin': asin,
                'average_rating': rating_elem.text.strip() if rating_elem else None,
                'review_count': count_elem.text.strip() if count_elem else None,
            }

        elif response.status_code == 503:
            import time
            time.sleep(2 ** attempt * 3)

    return None
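
You can unit-test the JSON-LD extraction without hitting Amazon at all. Here's a sketch with a fixture page and a stdlib-only parser — a simplified stand-in for the BeautifulSoup lookup above, and the fixture is illustrative data, not a real Amazon payload:

```python
import json, re

# Illustrative fixture standing in for a saved product page.
SAMPLE_HTML = """
<html><head>
<script type="application/ld+json">
{"@type": "Product", "name": "Example Widget",
 "aggregateRating": {"ratingValue": "4.6", "reviewCount": "1234"}}
</script>
</head><body></body></html>
"""

def extract_jsonld_summary(html: str):
    # Regex is fine for fixtures; use a real HTML parser on live pages.
    pattern = r'<script type="application/ld\+json">(.*?)</script>'
    for raw in re.findall(pattern, html, re.S):
        try:
            data = json.loads(raw)
        except json.JSONDecodeError:
            continue
        if data.get("@type") == "Product":
            agg = data.get("aggregateRating", {})
            return {
                "name": data.get("name", ""),
                "average_rating": agg.get("ratingValue"),
                "review_count": agg.get("reviewCount"),
            }
    return None
```

Running this against saved copies of real pages is the fastest way to catch schema drift before it silently breaks your pipeline.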

Method 2: Review Pages Directly (Full Review Text)

Amazon's review pages (/product-reviews/ASIN) are still scrapeable with the right setup:

import time, random
from bs4 import BeautifulSoup
from curl_cffi import requests as cf_requests

PROXY_USER = "your_residential_proxy_user"
PROXY_PASS = "your_residential_proxy_pass"
PROXY_HOST = "your.proxy.host:8080"

def scrape_amazon_reviews(asin: str, pages: int = 5) -> list:
    """
    Scrape Amazon product reviews across multiple pages.
    Requires residential proxy for consistent success.
    """
    all_reviews = []

    session = cf_requests.Session()
    session.proxies = {
        "http": f"http://{PROXY_USER}:{PROXY_PASS}@{PROXY_HOST}",
        "https": f"http://{PROXY_USER}:{PROXY_PASS}@{PROXY_HOST}",
    }

    # Step 1: Warm up session on main product page
    product_url = f"https://www.amazon.com/dp/{asin}"
    warm_resp = session.get(product_url, impersonate="chrome124")
    time.sleep(random.uniform(2, 4))

    if warm_resp.status_code != 200:
        return []

    for page_num in range(1, pages + 1):
        review_url = f"https://www.amazon.com/product-reviews/{asin}?pageNumber={page_num}"

        headers = {
            "Referer": product_url if page_num == 1 else f"https://www.amazon.com/product-reviews/{asin}?pageNumber={page_num-1}",
        }

        response = session.get(review_url, impersonate="chrome124", headers=headers)

        if response.status_code != 200:
            print(f"Page {page_num}: status {response.status_code}")
            break

        soup = BeautifulSoup(response.text, 'html.parser')
        reviews = parse_reviews_from_page(soup)

        if not reviews:
            break  # No more reviews

        all_reviews.extend(reviews)

        # Human-like delay between pages
        time.sleep(random.uniform(3, 6))

    return all_reviews

def parse_reviews_from_page(soup: BeautifulSoup) -> list:
    reviews = []

    for review_div in soup.find_all('div', {'data-hook': 'review'}):
        try:
            review = {}

            # Rating
            rating_elem = review_div.find('span', class_='review-rating')
            if rating_elem:
                review['rating'] = float(rating_elem.text.split()[0])

            # Title
            title_elem = review_div.find('span', {'data-hook': 'review-title'})
            if title_elem:
                review['title'] = title_elem.text.strip()

            # Body
            body_elem = review_div.find('span', {'data-hook': 'review-body'})
            if body_elem:
                review['body'] = body_elem.text.strip()

            # Date
            date_elem = review_div.find('span', {'data-hook': 'review-date'})
            if date_elem:
                review['date'] = date_elem.text.strip()

            # Verified purchase
            verified_elem = review_div.find('span', {'data-hook': 'avp-badge'})
            review['verified_purchase'] = verified_elem is not None

            if review.get('body'):
                reviews.append(review)
        except AttributeError:
            continue

    return reviews
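
The Referer chain in `scrape_amazon_reviews` is worth getting right: page 1 should look like a click from the product page, and each later page like a click from the previous one. That pairing logic can be pulled out and tested in isolation (the helper name is mine, not part of the snippet above):

```python
def referer_chain(asin: str, pages: int) -> list:
    """Build (url, referer) pairs matching the navigation pattern above."""
    product = f"https://www.amazon.com/dp/{asin}"
    base = f"https://www.amazon.com/product-reviews/{asin}"
    pairs = []
    for n in range(1, pages + 1):
        url = f"{base}?pageNumber={n}"
        # Page 1 is "clicked" from the product page, later pages from
        # the previous review page.
        referer = product if n == 1 else f"{base}?pageNumber={n - 1}"
        pairs.append((url, referer))
    return pairs
```

A broken Referer (or none at all) is one of the cheaper signals anti-bot systems check, so keeping this chain consistent matters more than it looks.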

Method 3: Amazon API (Legitimate Route)

For projects where compliance matters more than coverage, consider Amazon's official options:

Product Advertising API (PA API 5.0)

  • Includes CustomerReviews in some response types
  • Requires active Associate account with sales
  • Free, but request quotas scale with your affiliate sales

from paapi5_python_sdk.api.default_api import DefaultApi
from paapi5_python_sdk.search_items_request import SearchItemsRequest
from paapi5_python_sdk.partner_type import PartnerType

def get_reviews_via_api(asin: str, partner_tag: str, access_key: str, secret_key: str):
    """
    Requires: paapi5-python-sdk, active Amazon Associates account
    pip install paapi5-python-sdk
    """
    client = DefaultApi(
        access_key=access_key,
        secret_key=secret_key,
        host="webservices.amazon.com",
        region="us-east-1"
    )

    request = SearchItemsRequest(
        partner_tag=partner_tag,
        partner_type=PartnerType.ASSOCIATES,
        keywords=asin,
        resources=["CustomerReviews.Count", "CustomerReviews.StarRating"]
    )

    response = client.search_items(request)
    return response

Handling the "RequestThrottled" Error

If you're getting this error, Amazon's rate limiter is tripping before you get data. Back off exponentially, and check response bodies too, since soft blocks arrive with a 200 status:

import time, random

def amazon_request_with_backoff(session, url, max_retries=5):
    for attempt in range(max_retries):
        response = session.get(url, impersonate="chrome124")

        if response.status_code == 200:
            # Check for soft-block in response body
            if "RequestThrottled" in response.text or "Robot Check" in response.text:
                wait = (2 ** attempt) * 10 + random.uniform(0, 5)
                print(f"Soft blocked. Waiting {wait:.0f}s...")
                time.sleep(wait)
                continue
            return response

        elif response.status_code == 503:
            wait = (2 ** attempt) * 5
            time.sleep(wait)

        elif response.status_code == 429:
            retry_after = response.headers.get('Retry-After', 30)
            time.sleep(float(retry_after) + random.uniform(1, 5))

    raise Exception(f"Failed after {max_retries} attempts")
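
You can verify this retry logic without touching Amazon by making the sleep function injectable and feeding it a stub session. Everything below is illustrative test scaffolding: the stub mimics two soft blocks followed by a clean page, and `request_with_backoff` mirrors the shape of the helper above:

```python
import random, time

class _Resp:
    def __init__(self, status_code, text):
        self.status_code = status_code
        self.text = text
        self.headers = {}

class StubSession:
    """Stand-in for a curl_cffi session: soft-blocked twice, then clean."""
    def __init__(self):
        self.calls = 0

    def get(self, url, impersonate=None, headers=None):
        self.calls += 1
        body = "RequestThrottled" if self.calls <= 2 else "<html>reviews</html>"
        return _Resp(200, body)

def request_with_backoff(session, url, max_retries=5, sleep=time.sleep):
    # Same shape as amazon_request_with_backoff above, with sleep
    # injectable so tests can skip the real waits.
    for attempt in range(max_retries):
        response = session.get(url, impersonate="chrome124")
        if response.status_code == 200:
            if "RequestThrottled" in response.text or "Robot Check" in response.text:
                sleep((2 ** attempt) * 10 + random.uniform(0, 5))
                continue
            return response
        elif response.status_code in (429, 503):
            sleep((2 ** attempt) * 5)
    raise RuntimeError(f"Failed after {max_retries} attempts")
```

With `sleep=lambda s: None`, the stub returns the clean page on the third call, so the whole retry path runs in milliseconds.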

Rate Limits in Practice

From testing in April 2026:

| Approach | Success Rate | Required |
| --- | --- | --- |
| Direct requests (no proxy) | ~15% | Nothing |
| curl_cffi Chrome impersonation | ~45% | curl_cffi |
| curl_cffi + residential proxy | ~80% | Proxy service |
| curl_cffi + residential + session warm-up | ~92% | Proxy + patience |
| Rotating mobile proxies | ~95% | Mobile proxy (expensive) |
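
Those success rates translate directly into request budgets. If attempts are independent with success probability p, the expected number of attempts per successful page is 1/p — a back-of-envelope model that ignores escalating blocks after repeated failures:

```python
def expected_attempts(success_rate: float) -> float:
    # Geometric distribution: expected tries per success is 1 / p.
    return 1.0 / success_rate

# Figures from the table above:
for label, p in [("no proxy", 0.15), ("curl_cffi alone", 0.45),
                 ("+ residential proxy", 0.80), ("+ session warm-up", 0.92)]:
    print(f"{label}: ~{expected_attempts(p):.1f} attempts per page")
```

At ~15% success you should budget nearly 7 requests per page fetched, which is usually what makes the proxy spend worth it.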

Legal Note

Amazon's Terms of Service restrict automated access. This guide is for educational purposes. For production use cases, use the official PA API or consider commercial data providers (Rainforest API, DataForSEO, BlueCart) which provide legal access to Amazon data at scale.




Take the next step

Skip the setup. Production-ready tools for Amazon review scraping:

Apify Scrapers Bundle — $29 one-time

Instant download. Documented. Ready to deploy.

