Why Aggregate Product Reviews?
Product reviews are scattered across dozens of platforms — Trustpilot, G2, Amazon, Google, Yelp, Capterra. Building a review aggregator that pulls data from multiple sources gives you a unified view of customer sentiment. This is invaluable for:
- Brand monitoring — Track your reputation across platforms
- Competitive intelligence — Compare your reviews against competitors
- Product development — Identify recurring complaints and feature requests
- Market research — Understand category trends from review patterns
In this tutorial, we'll build a Python review aggregator that scrapes Trustpilot and G2, with an architecture that extends naturally to other sources such as Amazon product pages.
Architecture
┌──────────────┐
│ Trustpilot │──┐
├──────────────┤ │ ┌────────────┐ ┌──────────┐ ┌──────────┐
│ G2 │──┼──▶│ Aggregator │──▶│ Analyzer │──▶│ Reporter │
├──────────────┤ │ └────────────┘ └──────────┘ └──────────┘
│ Amazon │──┘
└──────────────┘
The Review Data Model
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional
@dataclass
class Review:
    """A single product review, normalized across source platforms.

    Populated by the per-platform scrapers. ``max_rating`` records the
    platform's rating scale (e.g. 5.0 for Trustpilot) so ratings from
    different platforms can be compared via ``normalized_rating``.
    """
    source: str                      # platform identifier, e.g. 'trustpilot' or 'g2'
    rating: float                    # raw rating as reported by the platform
    max_rating: float                # top of the platform's rating scale
    title: str
    body: str
    author: str
    date: Optional[datetime] = None  # publication date, when the scraper could parse one
    verified: bool = False           # platform "verified" flag, when exposed
    helpful_count: int = 0           # "helpful" votes, when exposed

    @property
    def normalized_rating(self) -> float:
        """Return the rating on a 0-1 scale (0.0 if max_rating is unset/zero)."""
        # Guard against a zero/unset scale so one malformed review cannot
        # raise ZeroDivisionError inside aggregate statistics.
        if not self.max_rating:
            return 0.0
        return self.rating / self.max_rating
@dataclass
class ProductProfile:
    """Aggregated collection of reviews for one product across all sources."""
    name: str
    reviews: list = field(default_factory=list)

    @property
    def avg_rating(self):
        """Mean normalized (0-1) rating over all reviews; 0 when there are none."""
        if not self.reviews:
            return 0
        total = sum(review.normalized_rating for review in self.reviews)
        return total / len(self.reviews)

    @property
    def review_count(self):
        """Total number of reviews collected so far."""
        return len(self.reviews)

    def by_source(self, source):
        """Return only the reviews that came from *source*."""
        matching = []
        for review in self.reviews:
            if review.source == source:
                matching.append(review)
        return matching
Trustpilot Scraper
Trustpilot has relatively accessible public review pages:
import requests
from bs4 import BeautifulSoup
import json
import time
import random
def scrape_trustpilot(company_slug, max_pages=5):
    """Scrape public Trustpilot reviews for *company_slug*.

    For each page, tries the embedded JSON-LD structured data first and
    falls back to parsing the HTML review cards when that page yielded
    nothing. Stops early on any non-200 response. Returns a (possibly
    empty) list of Review objects.
    """
    reviews = []
    base_url = f'https://www.trustpilot.com/review/{company_slug}'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    for page in range(1, max_pages + 1):
        url = f'{base_url}?page={page}'
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            break
        soup = BeautifulSoup(response.text, 'html.parser')

        # Collect per page so the HTML fallback fires for ANY page whose
        # JSON-LD is missing — the old `if not reviews:` check only worked
        # while the whole run was still empty.
        page_reviews = []

        # Primary path: JSON-LD structured data.
        for script in soup.find_all('script', type='application/ld+json'):
            # Tag.string is None for empty or multi-child tags; the old code
            # passed that None into json.loads -> uncaught TypeError.
            if not script.string:
                continue
            try:
                data = json.loads(script.string)
            except json.JSONDecodeError:
                continue
            if not (isinstance(data, dict) and data.get('@type') == 'LocalBusiness'):
                continue
            for review_data in data.get('review', []):
                try:
                    page_reviews.append(Review(
                        source='trustpilot',
                        rating=float(review_data.get('reviewRating', {}).get('ratingValue', 0)),
                        max_rating=5.0,
                        title=review_data.get('headline', ''),
                        body=review_data.get('reviewBody', ''),
                        author=review_data.get('author', {}).get('name', 'Anonymous'),
                        date=datetime.fromisoformat(review_data['datePublished'])
                        if 'datePublished' in review_data else None,
                    ))
                except (KeyError, ValueError, TypeError):
                    # Malformed entry (bad rating value or date string): skip it.
                    continue

        # Fallback: parse HTML review cards for this page.
        if not page_reviews:
            for card in soup.select('[data-service-review-card-paper]'):
                rating_el = card.select_one('[data-service-review-rating]')
                title_el = card.select_one('[data-service-review-title-typography]')
                body_el = card.select_one('[data-service-review-text-typography]')
                author_el = card.select_one('[data-consumer-name-typography]')
                if rating_el:
                    rating_val = int(rating_el.get('data-service-review-rating', 0))
                    page_reviews.append(Review(
                        source='trustpilot',
                        rating=float(rating_val),
                        max_rating=5.0,
                        title=title_el.get_text(strip=True) if title_el else '',
                        body=body_el.get_text(strip=True) if body_el else '',
                        author=author_el.get_text(strip=True) if author_el else 'Anonymous',
                    ))

        reviews.extend(page_reviews)
        # Polite jittered delay between pages (pointless after the last one).
        if page < max_pages:
            time.sleep(random.uniform(2, 4))
    return reviews
G2 Reviews Scraper
def scrape_g2_reviews(product_slug, max_pages=3):
    """Scrape public G2 review pages for *product_slug*.

    Parses the itemprop-annotated review cards. G2 encodes star ratings in
    CSS classes like 'stars-9' on a 10-point scale, which is halved to the
    5-point scale used elsewhere. Stops early on any non-200 response.
    Returns a (possibly empty) list of Review objects.
    """
    reviews = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    }
    for page in range(1, max_pages + 1):
        url = f'https://www.g2.com/products/{product_slug}/reviews?page={page}'
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            break
        soup = BeautifulSoup(response.text, 'html.parser')
        for card in soup.select('[itemprop="review"]'):
            title = card.select_one('[itemprop="name"]')
            body = card.select_one('[itemprop="reviewBody"]')
            author = card.select_one('[itemprop="author"]')
            date_el = card.select_one('time')

            # A <time> tag may lack a datetime attribute, and the value may
            # not be ISO format — the old code passed None/garbage straight
            # into fromisoformat (TypeError/ValueError, uncaught).
            date = None
            if date_el and date_el.get('datetime'):
                try:
                    date = datetime.fromisoformat(date_el.get('datetime'))
                except ValueError:
                    date = None

            # Extract star rating from class names, e.g. 'stars-9' -> 4.5.
            rating = 0
            rating_el = card.select_one('[class*="star-rating"]')
            if rating_el:
                for cls in rating_el.get('class', []):
                    if 'stars-' in cls:
                        try:
                            rating = float(cls.split('stars-')[1]) / 2
                        except (ValueError, IndexError):
                            pass

            reviews.append(Review(
                source='g2',
                rating=rating,
                max_rating=5.0,
                title=title.get_text(strip=True) if title else '',
                body=body.get_text(strip=True) if body else '',
                author=author.get_text(strip=True) if author else 'Anonymous',
                date=date,
            ))
        # Jittered delay between pages only — no sleep after the final page.
        if page < max_pages:
            time.sleep(random.uniform(3, 6))
    return reviews
For production-grade G2 scraping, consider using a managed scraper that handles anti-bot challenges automatically.
The Aggregator
class ReviewAggregator:
    """Fans out to the per-platform scrapers and merges their results."""

    def __init__(self):
        # Maps a source name to the scraper callable for that platform.
        self.scrapers = {
            'trustpilot': scrape_trustpilot,
            'g2': scrape_g2_reviews,
        }

    def aggregate(self, identifiers: dict) -> ProductProfile:
        """
        Collect reviews from every recognized source.

        identifiers: {'trustpilot': 'company-slug', 'g2': 'product-slug'}
        Unknown source keys are ignored; a failing scraper is reported and
        skipped so one platform outage doesn't lose the others' data.
        """
        all_reviews = []
        for source, slug in identifiers.items():
            if source not in self.scrapers:
                continue
            try:
                # BUG FIX: this line had been mangled into markup
                # ('self.scrapers<a href="slug">source</a>') — it is a
                # dict lookup followed by a call.
                reviews = self.scrapers[source](slug)
                all_reviews.extend(reviews)
                print(f'{source}: {len(reviews)} reviews collected')
            except Exception as e:
                print(f'{source} failed: {e}')
        # next(iter(...), '') avoids the IndexError the old
        # list(identifiers.values())[0] raised on an empty dict.
        return ProductProfile(
            name=next(iter(identifiers.values()), ''),
            reviews=all_reviews,
        )
Sentiment Analysis
from collections import Counter
import re
def analyze_sentiment(product: ProductProfile):
    """Summarize rating distribution and naive keyword sentiment for *product*."""
    positive_keywords = ['great', 'excellent', 'love', 'perfect', 'amazing', 'easy', 'fast']
    negative_keywords = ['terrible', 'awful', 'slow', 'broken', 'worst', 'poor', 'bad']

    # Bucket every review into 0-5 star bins on the normalized scale.
    rating_dist = Counter(round(r.normalized_rating * 5) for r in product.reviews)

    # Naive substring counting over title + body (not real NLP).
    pos_count = 0
    neg_count = 0
    for r in product.reviews:
        text = f'{r.title} {r.body}'.lower()
        pos_count += sum(kw in text for kw in positive_keywords)
        neg_count += sum(kw in text for kw in negative_keywords)

    # Per-source counts and averages, rescaled back to the 5-star display scale.
    per_source = {}
    for src in {r.source for r in product.reviews}:
        subset = product.by_source(src)
        per_source[src] = {
            'count': len(subset),
            'avg': round(sum(r.normalized_rating for r in subset) / max(len(subset), 1) * 5, 2),
        }

    return {
        'total_reviews': product.review_count,
        'avg_rating': round(product.avg_rating * 5, 2),
        'rating_distribution': dict(rating_dist),
        'positive_mentions': pos_count,
        'negative_mentions': neg_count,
        'sentiment_ratio': pos_count / max(neg_count, 1),
        'by_source': per_source,
    }
Generating Reports
import pandas as pd
def generate_report(product: ProductProfile, output='report.html'):
df = pd.DataFrame([
{
'source': r.source,
'rating': r.rating,
'normalized': round(r.normalized_rating, 2),
'title': r.title[:80],
'author': r.author,
'date': r.date,
}
for r in product.reviews
])
analysis = analyze_sentiment(product)
html = f'''
<html><head><title>Review Report: {product.name}</title></head>
<body>
<h1>{product.name} - Review Aggregation Report</h1>
<p>Total reviews: {analysis['total_reviews']}</p>
<p>Average rating: {analysis['avg_rating']}/5</p>
<p>Sentiment ratio: {analysis['sentiment_ratio']:.1f}</p>
{df.to_html(index=False)}
</body></html>
'''
with open(output, 'w') as f:
f.write(html)
print(f'Report saved to {output}')
# Usage
aggregator = ReviewAggregator()
product = aggregator.aggregate({
'trustpilot': 'example.com',
'g2': 'example-product',
})
generate_report(product)
Scaling Up
For production review aggregation across many products, you'll need proxy rotation to avoid blocks. ThorData provides rotating residential proxies that work reliably with review platforms that employ anti-bot measures.
Conclusion
A review aggregator combines multiple scraping techniques — JSON-LD extraction and HTML parsing here, with official APIs as a further option where platforms offer them. The key challenges are normalizing ratings across platforms (G2's 10-point star classes versus Trustpilot's 5-star scale) and handling anti-bot measures with proper proxy rotation via a provider like ThorData. Start with two sources, validate your data quality, then expand to more platforms.
Top comments (0)