DEV Community

agenthustler
agenthustler

Posted on

Trustpilot Review Scraping: Extract Business Reviews at Scale with Python

Trustpilot hosts over 200 million reviews for businesses worldwide. This data is invaluable for reputation monitoring, competitive analysis, and customer sentiment research. Here's how to extract Trustpilot reviews programmatically with Python.

Why Scrape Trustpilot?

  • Reputation monitoring: Track your brand's reviews automatically
  • Competitive analysis: Compare review scores across competitors
  • Sentiment analysis: Understand what customers love and hate
  • Market research: Evaluate business quality in any industry
  • Lead generation: Find businesses with poor reviews (opportunity!)

Extracting Business Reviews

import requests
from bs4 import BeautifulSoup
import json
import time

# Desktop-browser User-Agent; Trustpilot rejects requests with the default
# python-requests UA, so every fetch below sends these headers.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
}

def scrape_business_reviews(business_slug, pages=5):
    """Scrape reviews for a business on Trustpilot.

    Args:
        business_slug: Trustpilot review-page slug, e.g. "example.com".
        pages: Maximum number of result pages to fetch.

    Returns:
        List of dicts with keys: author, rating, date, title, body.
        "rating" is an int when parsed from HTML, and whatever JSON-LD
        supplies (int, str, or None) otherwise.
    """
    all_reviews = []

    for page in range(1, pages + 1):
        url = f"https://www.trustpilot.com/review/{business_slug}?page={page}"
        # Timeout prevents one hung connection from stalling the whole run.
        response = requests.get(url, headers=HEADERS, timeout=15)

        if response.status_code != 200:
            print(f"Error on page {page}: {response.status_code}")
            break

        soup = BeautifulSoup(response.text, "html.parser")
        page_reviews = []

        # Preferred source: Trustpilot embeds JSON-LD structured data.
        ld_scripts = soup.find_all("script", {"type": "application/ld+json"})
        for script in ld_scripts:
            try:
                # script.string is None for empty tags -> TypeError.
                data = json.loads(script.string)
            except (json.JSONDecodeError, TypeError):
                continue
            if isinstance(data, dict) and data.get("@type") == "LocalBusiness":
                for review in data.get("review", []):
                    page_reviews.append({
                        "author": review.get("author", {}).get("name", ""),
                        "rating": review.get("reviewRating", {}).get("ratingValue"),
                        "date": review.get("datePublished", ""),
                        "title": review.get("headline", ""),
                        "body": review.get("reviewBody", ""),
                    })

        # Fallback: parse the HTML directly. Checked per page — the original
        # tested the cumulative list, so the fallback could only ever fire on
        # the first page and later JSON-LD-less pages silently yielded nothing.
        if not page_reviews:
            for card in soup.select("[data-review-id]"):
                rating_el = card.select_one("[data-service-review-rating]")
                title_el = card.select_one("[data-service-review-title-typography]")
                body_el = card.select_one("[data-service-review-text-typography]")
                author_el = card.select_one("[data-consumer-name-typography]")
                date_el = card.select_one("time")

                page_reviews.append({
                    "author": author_el.text.strip() if author_el else "",
                    "rating": int(rating_el["data-service-review-rating"]) if rating_el else None,
                    "date": date_el["datetime"] if date_el else "",
                    "title": title_el.text.strip() if title_el else "",
                    "body": body_el.text.strip() if body_el else "",
                })

        all_reviews.extend(page_reviews)
        print(f"Page {page}: {len(all_reviews)} total reviews collected")
        time.sleep(2)  # polite rate limiting between pages

    return all_reviews
Enter fullscreen mode Exit fullscreen mode

Getting Business Overview

def get_business_info(business_slug):
    """Extract a business summary (name, rating, review count) from Trustpilot.

    Args:
        business_slug: Trustpilot review-page slug, e.g. "example.com".

    Returns:
        Dict with name, url, rating, review_count, best_rating and
        trustpilot_url, or None when the page is unavailable or carries
        no aggregateRating JSON-LD block.
    """
    url = f"https://www.trustpilot.com/review/{business_slug}"
    response = requests.get(url, headers=HEADERS, timeout=15)
    if response.status_code != 200:
        # Missing/blocked page: callers already treat None as "no data".
        return None
    soup = BeautifulSoup(response.text, "html.parser")

    # Extract from JSON-LD structured data.
    for script in soup.find_all("script", {"type": "application/ld+json"}):
        try:
            data = json.loads(script.string)
            if isinstance(data, dict) and "aggregateRating" in data:
                agg = data["aggregateRating"]
                return {
                    "name": data.get("name", ""),
                    "url": data.get("url", ""),
                    "rating": float(agg.get("ratingValue", 0)),
                    "review_count": int(agg.get("reviewCount", 0)),
                    "best_rating": agg.get("bestRating", 5),
                    "trustpilot_url": url,
                }
        except (json.JSONDecodeError, ValueError, TypeError):
            # TypeError: script.string may be None; ValueError: non-numeric
            # ratingValue/reviewCount. Either way, try the next script tag.
            continue

    return None
Enter fullscreen mode Exit fullscreen mode

Comparing Competitors

def compare_businesses(slugs):
    """Compare Trustpilot ratings across multiple businesses.

    Args:
        slugs: iterable of Trustpilot business slugs.

    Returns:
        List of business-info dicts (see get_business_info) augmented with
        "rating_distribution" and "recent_reviews", sorted best-rated first.
    """
    results = []

    for slug in slugs:
        info = get_business_info(slug)
        if info:
            reviews = scrape_business_reviews(slug, pages=3)
            # Star-count histogram over the sampled reviews.
            distribution = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0}
            for r in reviews:
                # JSON-LD often delivers ratingValue as a string ("5");
                # the original membership test silently dropped those.
                try:
                    rating = int(float(r["rating"]))
                except (TypeError, ValueError):
                    continue
                if rating in distribution:
                    distribution[rating] += 1

            info["rating_distribution"] = distribution
            info["recent_reviews"] = len(reviews)
            results.append(info)

        time.sleep(3)  # polite delay between businesses

    # Best-rated business first.
    return sorted(results, key=lambda x: x["rating"], reverse=True)


# Example: Compare cloud hosting providers
# NOTE(review): runs at import time and performs live network requests;
# slugs assume these domains are listed on trustpilot.com — verify first.
providers = ["aws.amazon.com", "cloud.google.com", "azure.microsoft.com"]
comparison = compare_businesses(providers)
for biz in comparison:
    print(f"{biz['name']}: {biz['rating']}/5 ({biz['review_count']} reviews)")
Enter fullscreen mode Exit fullscreen mode

Sentiment Analysis

from collections import Counter

def analyze_review_sentiment(reviews):
    """Analyze sentiment patterns in a list of review dicts.

    Args:
        reviews: dicts with "rating" (int, numeric str, or None) and "body".

    Returns:
        Dict with total/positive/negative/neutral counts, the positive
        percentage, and the 15 most common words (longer than 3 chars)
        in positive and negative review bodies.
    """
    def _rating(review):
        # Ratings scraped from JSON-LD may be strings ("5") or None; the
        # original crashed on `None >= 4`. Unparseable values count as 0,
        # which buckets them with negative reviews.
        try:
            return int(float(review.get("rating", 0)))
        except (TypeError, ValueError):
            return 0

    positive = [r for r in reviews if _rating(r) >= 4]
    negative = [r for r in reviews if _rating(r) <= 2]
    neutral = [r for r in reviews if _rating(r) == 3]

    def _common_words(review_list):
        # Crude keyword extraction: whitespace tokens longer than 3 chars.
        # `or ""` guards against a None body.
        all_text = " ".join((r.get("body") or "").lower() for r in review_list)
        words = [w for w in all_text.split() if len(w) > 3]
        return Counter(words).most_common(15)

    return {
        "total_reviews": len(reviews),
        "positive_count": len(positive),
        "negative_count": len(negative),
        "neutral_count": len(neutral),
        "positive_pct": round(len(positive) / len(reviews) * 100, 1) if reviews else 0,
        "positive_keywords": _common_words(positive),
        "negative_keywords": _common_words(negative),
    }
Enter fullscreen mode Exit fullscreen mode

Production-Ready Trustpilot Scraping

For monitoring multiple businesses or building a comprehensive review database, the Trustpilot Scraper on Apify handles all the complexity — dynamic rendering, pagination, rate limiting, and structured data extraction.

For bypassing Trustpilot's anti-scraping protections at scale, ScraperAPI provides managed proxy rotation and browser rendering.

Data Export

import csv

def export_reviews(reviews, filename="trustpilot_reviews.csv"):
    """Write review dicts to a CSV file; no-op when the list is empty.

    Column headers come from the keys of the first review dict, so all
    reviews are expected to share the same keys.
    """
    if not reviews:
        return

    keys = reviews[0].keys()
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=keys)
        writer.writeheader()
        writer.writerows(reviews)

    # Bug fix: the message previously printed a literal "(unknown)"
    # instead of interpolating the output path.
    print(f"Exported {len(reviews)} reviews to {filename}")
Enter fullscreen mode Exit fullscreen mode

Best Practices

  1. Use JSON-LD first: Trustpilot embeds structured data — much more reliable than HTML parsing
  2. Rate limit carefully: 2-3 seconds between pages
  3. Use ScraperAPI for proxy rotation and rendering
  4. Handle pagination: Trustpilot pages contain roughly 20 reviews each, so fetch multiple pages for full coverage
  5. Monitor for changes: Trustpilot frequently updates their frontend

Conclusion

Trustpilot review data is powerful for reputation monitoring, competitive analysis, and market research. Whether you build custom scraping logic or use the Trustpilot Scraper on Apify, the insights from review data can drive real business decisions.

Happy scraping!

Top comments (0)