Why Aggregate Product Reviews?
Product reviews are scattered across dozens of platforms — Trustpilot, G2, Amazon, Google, Yelp, Capterra. Building a review aggregator that pulls data from multiple sources gives you a unified view of customer sentiment. This is invaluable for:
- Brand monitoring — Track your reputation across platforms
- Competitive intelligence — Compare your reviews against competitors
- Product development — Identify recurring complaints and feature requests
- Market research — Understand category trends from review patterns
In this tutorial, we'll build a Python review aggregator that scrapes Trustpilot, G2, and product review pages.
Architecture
┌──────────────┐
│ Trustpilot │──┐
├──────────────┤ │ ┌────────────┐ ┌──────────┐ ┌──────────┐
│ G2 │──┼──▶│ Aggregator │──▶│ Analyzer │──▶│ Reporter │
├──────────────┤ │ └────────────┘ └──────────┘ └──────────┘
│ Amazon │──┘
└──────────────┘
The Review Data Model
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional
@dataclass
class Review:
    """A single product review, normalized from any source platform."""

    source: str                      # platform identifier, e.g. 'trustpilot', 'g2'
    rating: float                    # raw rating as reported by the platform
    max_rating: float                # platform's rating ceiling (5 for Trustpilot, 10 for G2)
    title: str
    body: str
    author: str
    date: Optional[datetime] = None  # None when the platform omits the review date
    verified: bool = False           # platform-flagged verified purchase/user
    helpful_count: int = 0           # "helpful" votes, 0 when the platform has none

    @property
    def normalized_rating(self) -> float:
        """Rating rescaled to a 0-1 scale.

        Returns 0.0 when max_rating is 0 so a malformed record cannot
        raise ZeroDivisionError during aggregation.
        """
        if not self.max_rating:
            return 0.0
        return self.rating / self.max_rating
@dataclass
class ProductProfile:
    """Aggregated view of one product's reviews across all sources."""

    name: str
    reviews: list = field(default_factory=list)

    @property
    def avg_rating(self):
        """Mean normalized (0-1) rating across all reviews; 0 when empty."""
        if not self.reviews:
            return 0
        total = sum(item.normalized_rating for item in self.reviews)
        return total / len(self.reviews)

    @property
    def review_count(self):
        """Total number of collected reviews."""
        return len(self.reviews)

    def by_source(self, source):
        """Return only the reviews that came from *source*."""
        matches = []
        for item in self.reviews:
            if item.source == source:
                matches.append(item)
        return matches
Trustpilot Scraper
Trustpilot has relatively accessible public review pages:
# The Trustpilot scraper implementation is omitted here (it is proprietary).
# If you don't want to build it yourself, a ready-made hosted version is
# available as an Apify actor — see the link in the call-to-action below.
G2 Reviews Scraper
# The G2 scraper implementation is omitted here (it is proprietary).
# If you don't want to build it yourself, a ready-made hosted version is
# available as an Apify actor — see the link in the call-to-action below.
For production-grade G2 scraping, consider using a managed scraper that handles anti-bot challenges automatically.
The Aggregator
class ReviewAggregator:
    """Fans out to per-platform scrapers and merges results into one profile."""

    def __init__(self):
        # Maps a source key to the callable that scrapes it. Each scraper
        # takes a platform-specific slug and returns a list of Review objects.
        self.scrapers = {
            'trustpilot': scrape_trustpilot,
            'g2': scrape_g2_reviews,
        }

    def aggregate(self, identifiers: dict) -> ProductProfile:
        """
        Collect reviews from every configured source into a ProductProfile.

        identifiers: {'trustpilot': 'company-slug', 'g2': 'product-slug'}

        Unknown source keys are ignored. A scraper failure is reported and
        skipped so one flaky platform cannot sink the whole run.
        """
        all_reviews = []
        for source, slug in identifiers.items():
            if source not in self.scrapers:
                continue
            try:
                # BUGFIX: the published snippet was garbled by markdown
                # conversion; the intended call is scrapers[source](slug).
                reviews = self.scrapers[source](slug)
            except Exception as e:
                # Best-effort: report and keep going with the other sources.
                print(f'{source} failed: {e}')
                continue
            all_reviews.extend(reviews)
            print(f'{source}: {len(reviews)} reviews collected')
        product = ProductProfile(
            # NOTE(review): the "name" is really the first slug passed in;
            # next(..., 'unknown') also keeps an empty dict from crashing.
            name=next(iter(identifiers.values()), 'unknown'),
            reviews=all_reviews,
        )
        return product
Sentiment Analysis
from collections import Counter
import re
def analyze_sentiment(product: "ProductProfile"):
    """Summarize a product's reviews into a rating/sentiment report dict.

    Returns a dict with the review total, average rating rescaled to a
    5-point scale, a 0-5 star histogram, crude keyword sentiment counts,
    and a per-source count/average breakdown.
    """
    # Rating distribution, bucketed onto a 0-5 star scale.
    rating_dist = Counter()
    for review in product.reviews:
        bucket = round(review.normalized_rating * 5)
        rating_dist[bucket] += 1

    # Common themes via naive substring matching. NOTE: these are
    # substrings, not word boundaries — 'bad' also matches 'badge'.
    # Acceptable for a rough signal; use \b regexes for precision.
    positive_keywords = ['great', 'excellent', 'love', 'perfect', 'amazing', 'easy', 'fast']
    negative_keywords = ['terrible', 'awful', 'slow', 'broken', 'worst', 'poor', 'bad']
    pos_count = 0
    neg_count = 0
    for review in product.reviews:
        text = (review.title + ' ' + review.body).lower()
        pos_count += sum(1 for kw in positive_keywords if kw in text)
        neg_count += sum(1 for kw in negative_keywords if kw in text)

    # Filter each source's reviews once instead of three times
    # (count, average numerator, average denominator).
    by_source = {}
    for source in set(r.source for r in product.reviews):
        src_reviews = product.by_source(source)
        count = len(src_reviews)
        avg = sum(r.normalized_rating for r in src_reviews) / max(count, 1)
        by_source[source] = {'count': count, 'avg': round(avg * 5, 2)}

    return {
        'total_reviews': product.review_count,
        'avg_rating': round(product.avg_rating * 5, 2),
        'rating_distribution': dict(rating_dist),
        'positive_mentions': pos_count,
        'negative_mentions': neg_count,
        # max() guards against division by zero when nothing negative matched.
        'sentiment_ratio': pos_count / max(neg_count, 1),
        'by_source': by_source,
    }
Generating Reports
import pandas as pd
def generate_report(product: "ProductProfile", output='report.html'):
    """Render an HTML report (summary stats + review table) to *output*.

    product: aggregated ProductProfile to report on.
    output: path of the HTML file to write (default 'report.html').
    """
    df = pd.DataFrame([
        {
            'source': r.source,
            'rating': r.rating,
            'normalized': round(r.normalized_rating, 2),
            'title': r.title[:80],  # truncate long titles for the table
            'author': r.author,
            'date': r.date,
        }
        for r in product.reviews
    ])
    analysis = analyze_sentiment(product)
    html = f'''
<html><head><title>Review Report: {product.name}</title></head>
<body>
<h1>{product.name} - Review Aggregation Report</h1>
<p>Total reviews: {analysis['total_reviews']}</p>
<p>Average rating: {analysis['avg_rating']}/5</p>
<p>Sentiment ratio: {analysis['sentiment_ratio']:.1f}</p>
{df.to_html(index=False)}
</body></html>
'''
    # Explicit UTF-8: review text is frequently non-ASCII, and the platform
    # default encoding (e.g. on Windows) would raise UnicodeEncodeError.
    with open(output, 'w', encoding='utf-8') as f:
        f.write(html)
    print(f'Report saved to {output}')
# Usage
aggregator = ReviewAggregator()
product = aggregator.aggregate({
'trustpilot': 'example.com',
'g2': 'example-product',
})
generate_report(product)
Scaling Up
For production review aggregation across many products, you'll need proxy rotation to avoid blocks. ThorData provides rotating residential proxies that work reliably with review platforms that employ anti-bot measures.
Conclusion
A review aggregator combines multiple scraping techniques — JSON-LD extraction, HTML parsing, and API integration. The key challenges are normalizing ratings across platforms (G2's 10-point vs Trustpilot's 5-star) and handling anti-bot measures with proper proxy rotation via ThorData. Start with two sources, validate your data quality, then expand to more platforms.
Top comments (0)