Product reviews are scattered across Amazon, Best Buy, Walmart, and dozens of niche sites. Building an aggregator that pulls reviews from multiple sources gives you a comprehensive view of any product. Here's how to build one with Python.
Why Aggregate Reviews?
- Complete picture — no single site has all reviews
- Spot fake reviews — cross-reference sentiment across platforms
- Competitive analysis — compare products using aggregated ratings
- Market research — understand what customers love and hate
Architecture
Our aggregator will:
- Search for a product across multiple retail sites
- Extract reviews, ratings, and metadata
- Normalize data into a common format
- Analyze sentiment and generate insights
Setting Up
pip install requests beautifulsoup4 pandas textblob
Base Review Scraper
from dataclasses import dataclass
from datetime import datetime
from typing import Optional
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup
@dataclass
class Review:
    """A single product review normalized to a common cross-site schema."""

    product_name: str    # often left "" by scrapers; can be filled in later
    reviewer: str        # display name, or "Anonymous" when not found
    rating: float        # star rating; 0.0 when it could not be parsed
    title: str
    text: str
    date: Optional[str]  # raw date string as shown on the site, if any
    verified: bool       # True when the site marks a verified purchase
    source: str          # e.g. "amazon", "bestbuy"
    helpful_votes: int = 0
    # Populated by sentiment analysis (TextBlob polarity in [-1.0, 1.0]).
    # Declared here so the field is part of the schema (and CSV exports)
    # even before analysis runs, instead of being attached dynamically.
    sentiment: Optional[float] = None
class ReviewScraper:
    """Base class for site-specific review scrapers.

    Owns the HTTP session and optional routing through the ScraperAPI
    rendering proxy; subclasses implement scrape_reviews().
    """

    def __init__(self, api_key=None):
        # api_key: ScraperAPI key. When set, fetch() routes every request
        # through the rendering proxy instead of hitting the site directly.
        self.api_key = api_key
        self.session = requests.Session()
        # A browser-like User-Agent avoids the most trivial bot blocking.
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        })

    def fetch(self, url):
        """GET *url*, via ScraperAPI (with JS rendering) when an API key is set."""
        if self.api_key:
            # BUG FIX: the target URL must be percent-encoded. If it carries
            # its own query string, an unencoded copy is split on "&" by the
            # proxy and everything after the first "&" is silently dropped.
            proxy_url = (
                "http://api.scraperapi.com"
                f"?api_key={self.api_key}&url={quote(url, safe='')}&render=true"
            )
            # JS rendering is slow; allow a longer timeout for proxied calls.
            return self.session.get(proxy_url, timeout=60)
        return self.session.get(url, timeout=30)

    def scrape_reviews(self, product_url):
        """Return a list of Review objects for *product_url* (subclass hook)."""
        raise NotImplementedError
Amazon Review Scraper
class AmazonReviewScraper(ReviewScraper):
    """Parses review cards from an Amazon product-reviews page."""

    @staticmethod
    def _parse_rating(card):
        """Return the star rating as a float, or 0.0 if absent or unparseable."""
        star = card.find("i", {"data-hook": "review-star-rating"})
        if not star:
            return 0.0
        try:
            # Rating text looks like "4.0 out of 5 stars" — take the number.
            return float(star.get_text().split()[0])
        except (ValueError, IndexError):
            return 0.0

    @staticmethod
    def _parse_helpful(card):
        """Return the helpful-vote count; Amazon spells a single vote as 'one'."""
        stmt = card.find("span", {"data-hook": "helpful-vote-statement"})
        if not stmt:
            return 0
        label = stmt.get_text()
        try:
            return int(label.split()[0].replace(",", ""))
        except (ValueError, IndexError):
            return 1 if "one" in label.lower() else 0

    def scrape_reviews(self, product_url):
        """Fetch *product_url* and return its review cards as Review objects."""
        page = BeautifulSoup(self.fetch(product_url).text, "html.parser")
        results = []
        for card in page.find_all("div", {"data-hook": "review"}):
            title = card.find("a", {"data-hook": "review-title"})
            body = card.find("span", {"data-hook": "review-body"})
            author = card.find("span", class_="a-profile-name")
            when = card.find("span", {"data-hook": "review-date"})
            results.append(Review(
                product_name="",
                reviewer=author.get_text(strip=True) if author else "Anonymous",
                rating=self._parse_rating(card),
                title=title.get_text(strip=True) if title else "",
                text=body.get_text(strip=True) if body else "",
                date=when.get_text(strip=True) if when else None,
                verified=card.find("span", {"data-hook": "avp-badge"}) is not None,
                source="amazon",
                helpful_votes=self._parse_helpful(card),
            ))
        return results
Best Buy Review Scraper
class BestBuyReviewScraper(ReviewScraper):
    """Parses review list items from a Best Buy reviews page."""

    def scrape_reviews(self, product_url):
        """Fetch *product_url* and return its reviews as Review objects."""
        page = BeautifulSoup(self.fetch(product_url).text, "html.parser")
        collected = []
        for entry in page.find_all("li", class_="review-item"):
            score_node = entry.find("div", class_="c-review-average")
            heading = entry.find("h4", class_="review-title")
            body = entry.find("p", class_="pre-white-space")
            author = entry.find("span", class_="reviewer-name")

            score = 0.0
            if score_node:
                try:
                    score = float(score_node.get_text(strip=True))
                except ValueError:
                    score = 0.0  # unparseable rating stays at 0.0

            collected.append(Review(
                product_name="",
                reviewer=author.get_text(strip=True) if author else "Anonymous",
                rating=score,
                title=heading.get_text(strip=True) if heading else "",
                text=body.get_text(strip=True) if body else "",
                date=None,       # this markup exposes no review date
                verified=False,  # no verified-purchase badge is parsed here
                source="bestbuy",
            ))
        return collected
The Aggregator
import time
import pandas as pd
from textblob import TextBlob
class ReviewAggregator:
    """Collects reviews from several site scrapers, then summarizes them."""

    def __init__(self, api_key):
        # One scraper per supported source, all sharing the same API key.
        self.scrapers = {
            "amazon": AmazonReviewScraper(api_key),
            "bestbuy": BestBuyReviewScraper(api_key),
        }
        self.all_reviews = []

    def aggregate(self, urls):
        """Scrape each known source in *urls* ({source: url}) and pool results."""
        for source, url in urls.items():
            if source not in self.scrapers:
                continue
            try:
                batch = self.scrapers[source].scrape_reviews(url)
                self.all_reviews.extend(batch)
                print(f"Fetched {len(batch)} reviews from {source}")
                time.sleep(3)  # be polite between sites
            except Exception as e:
                print(f"Error with {source}: {e}")
        return self.all_reviews

    def analyze_sentiment(self):
        """Attach a TextBlob polarity to each review and return summary counts."""
        for rev in self.all_reviews:
            rev.sentiment = TextBlob(rev.text).sentiment.polarity
        total = len(self.all_reviews)
        # Polarity thresholds: > 0.1 positive, < -0.1 negative, else neutral.
        pos = sum(1 for r in self.all_reviews if getattr(r, "sentiment", 0) > 0.1)
        neg = sum(1 for r in self.all_reviews if getattr(r, "sentiment", 0) < -0.1)
        return {
            "total": total,
            "positive": pos,
            "negative": neg,
            "neutral": total - pos - neg,
            "avg_rating": sum(r.rating for r in self.all_reviews) / total if self.all_reviews else 0,
        }

    def export(self, filename="reviews.csv"):
        """Write every collected review to *filename* as CSV; return the DataFrame."""
        frame = pd.DataFrame([vars(rev) for rev in self.all_reviews])
        frame.to_csv(filename, index=False)
        return frame
# Example run: aggregate one product's reviews from two sources.
# NOTE(review): requires a real ScraperAPI key and live network access.
agg = ReviewAggregator(api_key="YOUR_KEY")
# Map each source name (must match ReviewAggregator.scrapers keys) to the
# product's review-page URL on that site.
urls = {
"amazon": "https://www.amazon.com/dp/B09V3KXJPB/",
"bestbuy": "https://www.bestbuy.com/site/reviews/6505727"
}
agg.aggregate(urls)
sentiment = agg.analyze_sentiment()
print(f"Average rating: {sentiment['avg_rating']:.1f}/5")
print(f"Positive: {sentiment['positive']}, Negative: {sentiment['negative']}")
Finding Common Themes
from collections import Counter
import re
def extract_themes(reviews, min_count=3, top_n=20):
    """Return the most frequently mentioned non-stopword words in review texts.

    Args:
        reviews: iterable of objects with a ``text`` attribute.
        min_count: drop words mentioned fewer than this many times.
        top_n: how many of the most common words to consider before the
            ``min_count`` filter (previously hard-coded to 20; kept as the
            default so existing callers see identical results).

    Returns:
        List of ``(word, count)`` pairs, most frequent first.
    """
    stop_words = {"the", "a", "an", "is", "it", "and", "or", "but", "in", "on",
                  "at", "to", "for", "of", "with", "this", "that", "i", "my"}
    counts = Counter()
    for review in reviews:
        tokens = re.findall(r"\b[a-z]+\b", review.text.lower())
        # Words of length <= 3 are rarely meaningful product themes.
        counts.update(w for w in tokens if w not in stop_words and len(w) > 3)
    return [(word, n) for word, n in counts.most_common(top_n) if n >= min_count]
# Print recurring words across all aggregated reviews as rough "themes".
themes = extract_themes(agg.all_reviews)
for word, count in themes:
print(f"{word}: {count} mentions")
Proxy Strategy
Retail sites have strong anti-bot measures. Use ScraperAPI with JavaScript rendering for Amazon and Best Buy pages. For high-volume scraping across multiple stores, ThorData residential proxies ensure consistent access. Track your success rates with ScrapeOps to quickly identify when a scraper needs attention.
Conclusion
A review aggregator gives you a 360-degree view of product sentiment that no single platform provides. The key patterns — base scraper classes, common data models, and sentiment analysis — apply to any multi-source aggregation project. Start with two sources, validate your parsing, then expand to more.
Happy scraping!
Top comments (0)