DEV Community

agenthustler
agenthustler

Posted on

Building a Product Review Aggregator with Web Scraping

Why Aggregate Product Reviews?

Product reviews are scattered across dozens of platforms — Trustpilot, G2, Amazon, Google, Yelp, Capterra. Building a review aggregator that pulls data from multiple sources gives you a unified view of customer sentiment. This is invaluable for:

  • Brand monitoring — Track your reputation across platforms
  • Competitive intelligence — Compare your reviews against competitors
  • Product development — Identify recurring complaints and feature requests
  • Market research — Understand category trends from review patterns

In this tutorial, we'll build a Python review aggregator that scrapes Trustpilot, G2, and product review pages.

Architecture

┌──────────────┐
│  Trustpilot   │──┐
├──────────────┤  │    ┌────────────┐    ┌──────────┐    ┌──────────┐
│     G2        │──┼──▶│ Aggregator │──▶│ Analyzer │──▶│ Reporter │
├──────────────┤  │    └────────────┘    └──────────┘    └──────────┘
│   Amazon      │──┘
└──────────────┘
Enter fullscreen mode Exit fullscreen mode

The Review Data Model

from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional

@dataclass
class Review:
    """A single product review, normalized across source platforms."""
    source: str              # platform identifier, e.g. 'trustpilot' or 'g2'
    rating: float            # raw rating as reported by the platform
    max_rating: float        # platform's rating ceiling (e.g. 5.0 stars)
    title: str
    body: str
    author: str
    date: Optional[datetime] = None  # publication date, when the scraper could parse one
    verified: bool = False           # platform "verified purchase/reviewer" flag
    helpful_count: int = 0           # "found this helpful" vote count

    @property
    def normalized_rating(self) -> float:
        """Rating mapped onto a 0-1 scale.

        Returns 0.0 when max_rating is zero or negative — scraped data can
        be malformed, and the original code raised ZeroDivisionError here.
        """
        if self.max_rating <= 0:
            return 0.0
        return self.rating / self.max_rating

@dataclass
class ProductProfile:
    """A product together with every review collected for it."""
    name: str
    reviews: list = field(default_factory=list)

    @property
    def avg_rating(self):
        """Mean normalized (0-1) rating across all reviews; 0 when empty."""
        if not self.reviews:
            return 0
        total = sum(review.normalized_rating for review in self.reviews)
        return total / len(self.reviews)

    @property
    def review_count(self):
        """Total number of collected reviews."""
        return len(self.reviews)

    def by_source(self, source):
        """Return only the reviews that came from the given source."""
        matches = []
        for review in self.reviews:
            if review.source == source:
                matches.append(review)
        return matches
Enter fullscreen mode Exit fullscreen mode

Trustpilot Scraper

Trustpilot has relatively accessible public review pages:

import requests
from bs4 import BeautifulSoup
import json
import time
import random

def scrape_trustpilot(company_slug, max_pages=5):
    """Scrape public Trustpilot reviews for *company_slug*.

    Tries the JSON-LD structured data embedded in each page first, then
    falls back to parsing the HTML review cards for pages where JSON-LD
    yields nothing. Stops early on any non-200 response.

    Returns a list of Review objects (rating on Trustpilot's 5-star scale).
    """
    reviews = []
    base_url = f'https://www.trustpilot.com/review/{company_slug}'
    # Headers are loop-invariant; build them once.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    for page in range(1, max_pages + 1):
        url = f'{base_url}?page={page}'
        # timeout prevents one hung connection from stalling the whole run
        response = requests.get(url, headers=headers, timeout=15)
        if response.status_code != 200:
            break

        soup = BeautifulSoup(response.text, 'html.parser')
        # Collected per page: the original checked the *cumulative* list,
        # so the HTML fallback never ran after the first successful page.
        page_reviews = []

        # Primary path: JSON-LD structured data.
        for script in soup.find_all('script', type='application/ld+json'):
            try:
                # script.string can be None for empty tags -> TypeError
                data = json.loads(script.string)
            except (json.JSONDecodeError, TypeError):
                continue
            if not (isinstance(data, dict) and data.get('@type') == 'LocalBusiness'):
                continue
            for review_data in data.get('review', []):
                date = None
                raw_date = review_data.get('datePublished')
                if raw_date:
                    try:
                        # fromisoformat rejects a trailing 'Z' before 3.11
                        date = datetime.fromisoformat(raw_date.replace('Z', '+00:00'))
                    except ValueError:
                        date = None
                page_reviews.append(Review(
                    source='trustpilot',
                    rating=float(review_data.get('reviewRating', {}).get('ratingValue', 0)),
                    max_rating=5.0,
                    title=review_data.get('headline', ''),
                    body=review_data.get('reviewBody', ''),
                    author=review_data.get('author', {}).get('name', 'Anonymous'),
                    date=date,
                ))

        # Fallback: parse the HTML review cards when this page had no JSON-LD.
        if not page_reviews:
            for card in soup.select('[data-service-review-card-paper]'):
                rating_el = card.select_one('[data-service-review-rating]')
                if not rating_el:
                    continue
                title_el = card.select_one('[data-service-review-title-typography]')
                body_el = card.select_one('[data-service-review-text-typography]')
                author_el = card.select_one('[data-consumer-name-typography]')
                page_reviews.append(Review(
                    source='trustpilot',
                    rating=float(int(rating_el.get('data-service-review-rating', 0))),
                    max_rating=5.0,
                    title=title_el.get_text(strip=True) if title_el else '',
                    body=body_el.get_text(strip=True) if body_el else '',
                    author=author_el.get_text(strip=True) if author_el else 'Anonymous',
                ))

        reviews.extend(page_reviews)
        # Polite, jittered delay between pages to reduce the risk of blocks.
        time.sleep(random.uniform(2, 4))

    return reviews
Enter fullscreen mode Exit fullscreen mode

G2 Reviews Scraper

def scrape_g2_reviews(product_slug, max_pages=3):
    """Scrape G2 reviews for *product_slug* from its microdata markup.

    G2 encodes the star rating in a CSS class like 'stars-8' on a
    10-point scale; dividing by 2 maps it onto the 5-point scale the
    Review model expects. Stops early on any non-200 response.
    """
    reviews = []
    # Headers are loop-invariant; build them once.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    }

    for page in range(1, max_pages + 1):
        url = f'https://www.g2.com/products/{product_slug}/reviews?page={page}'
        # timeout so a stalled connection can't hang the scrape
        response = requests.get(url, headers=headers, timeout=15)
        if response.status_code != 200:
            break

        soup = BeautifulSoup(response.text, 'html.parser')

        for card in soup.select('[itemprop="review"]'):
            title = card.select_one('[itemprop="name"]')
            body = card.select_one('[itemprop="reviewBody"]')
            author = card.select_one('[itemprop="author"]')
            date_el = card.select_one('time')

            # Extract the star rating from a class such as 'stars-8'.
            rating = 0
            rating_el = card.select_one('[class*="star-rating"]')
            if rating_el:
                for cls in rating_el.get('class', []):
                    if 'stars-' in cls:
                        try:
                            rating = float(cls.split('stars-')[1]) / 2
                        except (ValueError, IndexError):
                            pass
                        else:
                            break  # first parseable stars-N class wins

            # Guard against a <time> tag with no datetime attribute:
            # fromisoformat(None) raised TypeError in the original.
            date = None
            if date_el and date_el.get('datetime'):
                try:
                    date = datetime.fromisoformat(date_el.get('datetime'))
                except ValueError:
                    date = None

            reviews.append(Review(
                source='g2',
                rating=rating,
                max_rating=5.0,
                title=title.get_text(strip=True) if title else '',
                body=body.get_text(strip=True) if body else '',
                author=author.get_text(strip=True) if author else 'Anonymous',
                date=date,
            ))

        # Jittered delay between pages to reduce anti-bot blocks.
        time.sleep(random.uniform(3, 6))

    return reviews
Enter fullscreen mode Exit fullscreen mode

For production-grade G2 scraping, consider using a managed scraper that handles anti-bot challenges automatically.

The Aggregator

class ReviewAggregator:
    """Runs every configured per-source scraper and merges the results."""

    def __init__(self, scrapers=None):
        """
        scrapers: optional mapping of source name -> callable(slug) -> list[Review].
                  Defaults to the built-in Trustpilot and G2 scrapers; injecting
                  a mapping makes the aggregator testable without network access.
        """
        self.scrapers = scrapers if scrapers is not None else {
            'trustpilot': scrape_trustpilot,
            'g2': scrape_g2_reviews,
        }

    def aggregate(self, identifiers: dict) -> 'ProductProfile':
        """
        identifiers: {'trustpilot': 'company-slug', 'g2': 'product-slug'}

        Unknown sources are skipped; a failing scraper is reported and the
        remaining sources still run.
        """
        all_reviews = []

        for source, slug in identifiers.items():
            if source not in self.scrapers:
                continue
            try:
                # BUG FIX: this line was corrupted in the original
                # (`self.scrapers<a href="slug">source</a>`); the intended
                # call looks up the scraper by source and applies the slug.
                reviews = self.scrapers[source](slug)
            except Exception as e:
                print(f'{source} failed: {e}')
            else:
                all_reviews.extend(reviews)
                print(f'{source}: {len(reviews)} reviews collected')

        # Use the first identifier value as the display name; fall back to a
        # placeholder so an empty mapping doesn't raise IndexError.
        name = next(iter(identifiers.values()), 'unknown')
        return ProductProfile(name=name, reviews=all_reviews)
Enter fullscreen mode Exit fullscreen mode

Sentiment Analysis

from collections import Counter
import re

def analyze_sentiment(product: 'ProductProfile'):
    """Summarize a product's reviews.

    Returns a dict with: total_reviews, avg_rating (0-5), rating_distribution
    (integer bucket 0-5 -> count), positive/negative keyword mention counts,
    their ratio, and a per-source count/average breakdown.
    """
    # Rating distribution, bucketed onto a 0-5 integer scale.
    rating_dist = Counter()
    for review in product.reviews:
        rating_dist[round(review.normalized_rating * 5)] += 1

    # Very simple lexicon-based sentiment: count keyword occurrences.
    # NOTE: plain substring matching, so e.g. 'bad' also matches 'badge'.
    positive_keywords = ['great', 'excellent', 'love', 'perfect', 'amazing', 'easy', 'fast']
    negative_keywords = ['terrible', 'awful', 'slow', 'broken', 'worst', 'poor', 'bad']

    pos_count = 0
    neg_count = 0
    for review in product.reviews:
        text = (review.title + ' ' + review.body).lower()
        pos_count += sum(1 for kw in positive_keywords if kw in text)
        neg_count += sum(1 for kw in negative_keywords if kw in text)

    # Per-source breakdown; call by_source once per source instead of
    # three times per source as the original did.
    by_source = {}
    for source in set(r.source for r in product.reviews):
        source_reviews = product.by_source(source)
        count = len(source_reviews)
        avg = sum(r.normalized_rating for r in source_reviews) / max(count, 1) * 5
        by_source[source] = {'count': count, 'avg': round(avg, 2)}

    return {
        'total_reviews': product.review_count,
        'avg_rating': round(product.avg_rating * 5, 2),
        'rating_distribution': dict(rating_dist),
        'positive_mentions': pos_count,
        'negative_mentions': neg_count,
        'sentiment_ratio': pos_count / max(neg_count, 1),
        'by_source': by_source,
    }
Enter fullscreen mode Exit fullscreen mode

Generating Reports

import pandas as pd

def generate_report(product: 'ProductProfile', output='report.html'):
    """Write an HTML report (summary stats plus a review table) to *output*."""
    import html as html_lib  # stdlib escaping; scraped text is untrusted input

    df = pd.DataFrame([
        {
            'source': r.source,
            'rating': r.rating,
            'normalized': round(r.normalized_rating, 2),
            'title': r.title[:80],
            'author': r.author,
            'date': r.date,
        }
        for r in product.reviews
    ])

    analysis = analyze_sentiment(product)

    # Escape the product name: it originates from scraped/user-supplied data,
    # so interpolating it raw into the page would allow HTML injection.
    # (df.to_html escapes cell contents by default.)
    safe_name = html_lib.escape(str(product.name))

    page = f'''
    <html><head><title>Review Report: {safe_name}</title></head>
    <body>
    <h1>{safe_name} - Review Aggregation Report</h1>
    <p>Total reviews: {analysis['total_reviews']}</p>
    <p>Average rating: {analysis['avg_rating']}/5</p>
    <p>Sentiment ratio: {analysis['sentiment_ratio']:.1f}</p>
    {df.to_html(index=False)}
    </body></html>
    '''

    # Explicit encoding so non-ASCII review text can't crash on platforms
    # whose default encoding isn't UTF-8.
    with open(output, 'w', encoding='utf-8') as f:
        f.write(page)
    print(f'Report saved to {output}')

# Usage
# End-to-end example: scrape both sources for one product, then write the
# HTML report to the default 'report.html'. Performs live network requests.
aggregator = ReviewAggregator()
product = aggregator.aggregate({
    'trustpilot': 'example.com',
    'g2': 'example-product',
})
generate_report(product)
Enter fullscreen mode Exit fullscreen mode

Scaling Up

For production review aggregation across many products, you'll need proxy rotation to avoid blocks. ThorData provides rotating residential proxies that work reliably with review platforms that employ anti-bot measures.

Conclusion

A review aggregator combines multiple scraping techniques — JSON-LD structured-data extraction and HTML parsing, as shown here, and optionally official API integration where platforms offer one. The key challenges are normalizing ratings across platforms (G2's 10-point vs Trustpilot's 5-star) and handling anti-bot measures with proper proxy rotation via ThorData. Start with two sources, validate your data quality, then expand to more platforms.

Top comments (0)