DEV Community

agenthustler
agenthustler

Posted on

Building a Product Review Aggregator Across Multiple Sites

Product reviews are scattered across Amazon, Best Buy, Walmart, and dozens of niche sites. Building an aggregator that pulls reviews from multiple sources gives you a comprehensive view of any product. Here's how to build one with Python.

Why Aggregate Reviews?

  • Complete picture — no single site has all reviews
  • Spot fake reviews — cross-reference sentiment across platforms
  • Competitive analysis — compare products using aggregated ratings
  • Market research — understand what customers love and hate

Architecture

Our aggregator will:

  1. Search for a product across multiple retail sites
  2. Extract reviews, ratings, and metadata
  3. Normalize data into a common format
  4. Analyze sentiment and generate insights

Setting Up

pip install requests beautifulsoup4 pandas textblob

Base Review Scraper

from dataclasses import dataclass
from datetime import datetime
from typing import Optional
from urllib.parse import urlencode

import requests
from bs4 import BeautifulSoup

@dataclass
class Review:
    """Site-agnostic review record that every scraper normalizes into."""
    product_name: str        # left "" by the scrapers in this file
    reviewer: str            # display name; "Anonymous" when not found on the page
    rating: float            # star rating; 0.0 when it could not be parsed
    title: str               # review headline; "" when missing
    text: str                # review body; "" when missing
    date: Optional[str]      # raw date string as shown on the page, or None
    verified: bool           # True only when the site shows a verified-purchase badge
    source: str              # originating site key, e.g. "amazon" or "bestbuy"
    helpful_votes: int = 0   # "found this helpful" count (only Amazon parses this)

class ReviewScraper:
    """Base class for site-specific review scrapers.

    Owns the HTTP session (browser-like User-Agent) and optionally routes
    requests through the ScraperAPI rendering proxy when an API key is set.
    Subclasses implement ``scrape_reviews``.
    """

    def __init__(self, api_key=None):
        self.api_key = api_key
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        })

    def _proxy_url(self, url):
        """Build the ScraperAPI request URL for *url*.

        The target URL must be percent-encoded: interpolating it raw (as the
        previous version did) breaks for any URL containing '&' or '?', since
        those characters get parsed as extra ScraperAPI query parameters.
        """
        query = urlencode({"api_key": self.api_key, "url": url, "render": "true"})
        return f"http://api.scraperapi.com?{query}"

    def fetch(self, url):
        """GET *url* — via the rendering proxy when an API key is configured.

        The proxy path gets a longer timeout because JS rendering is slow.
        """
        if self.api_key:
            return self.session.get(self._proxy_url(url), timeout=60)
        return self.session.get(url, timeout=30)

    def scrape_reviews(self, product_url):
        """Subclasses return a list of Review objects for *product_url*."""
        raise NotImplementedError

Amazon Review Scraper

class AmazonReviewScraper(ReviewScraper):
    """Extracts customer reviews from an Amazon product page."""

    @staticmethod
    def _text(el, default=""):
        # Stripped text of a node, or *default* when the node is absent.
        return el.get_text(strip=True) if el else default

    @staticmethod
    def _star_rating(card):
        # "4.0 out of 5 stars" -> 4.0; missing or unparsable -> 0.0.
        star_el = card.find("i", {"data-hook": "review-star-rating"})
        if star_el:
            try:
                return float(star_el.get_text().split()[0])
            except (ValueError, IndexError):
                pass
        return 0.0

    @staticmethod
    def _helpful_votes(card):
        # "1,234 people found this helpful" -> 1234; "One person ..." -> 1.
        vote_el = card.find("span", {"data-hook": "helpful-vote-statement"})
        if vote_el is None:
            return 0
        label = vote_el.get_text()
        try:
            return int(label.split()[0].replace(",", ""))
        except (ValueError, IndexError):
            return 1 if "one" in label.lower() else 0

    def scrape_reviews(self, product_url):
        """Return a list of Review records parsed from *product_url*."""
        soup = BeautifulSoup(self.fetch(product_url).text, "html.parser")

        collected = []
        for card in soup.find_all("div", {"data-hook": "review"}):
            author_el = card.find("span", class_="a-profile-name")
            date_el = card.find("span", {"data-hook": "review-date"})
            collected.append(Review(
                product_name="",
                reviewer=self._text(author_el, "Anonymous"),
                rating=self._star_rating(card),
                title=self._text(card.find("a", {"data-hook": "review-title"})),
                text=self._text(card.find("span", {"data-hook": "review-body"})),
                date=self._text(date_el) if date_el else None,
                verified=card.find("span", {"data-hook": "avp-badge"}) is not None,
                source="amazon",
                helpful_votes=self._helpful_votes(card),
            ))
        return collected

Best Buy Review Scraper

class BestBuyReviewScraper(ReviewScraper):
    """Extracts customer reviews from a Best Buy reviews page."""

    def scrape_reviews(self, product_url):
        """Return a list of Review records parsed from *product_url*."""
        soup = BeautifulSoup(self.fetch(product_url).text, "html.parser")

        def stripped(el):
            # Stripped text of a node, "" when the node is absent.
            return el.get_text(strip=True) if el else ""

        parsed = []
        for item in soup.find_all("li", class_="review-item"):
            score = 0.0
            score_el = item.find("div", class_="c-review-average")
            if score_el:
                try:
                    score = float(score_el.get_text(strip=True))
                except ValueError:
                    pass  # keep 0.0 for non-numeric ratings

            name_el = item.find("span", class_="reviewer-name")
            parsed.append(Review(
                product_name="",
                reviewer=name_el.get_text(strip=True) if name_el else "Anonymous",
                rating=score,
                title=stripped(item.find("h4", class_="review-title")),
                text=stripped(item.find("p", class_="pre-white-space")),
                date=None,        # this markup exposes no parseable date
                verified=False,   # no verified-purchase badge is parsed here
                source="bestbuy",
            ))
        return parsed

The Aggregator

import time
import pandas as pd
from textblob import TextBlob

class ReviewAggregator:
    """Runs every configured site scraper and pools the results."""

    def __init__(self, api_key):
        # One scraper per supported source, all sharing the same proxy key.
        self.scrapers = {
            "amazon": AmazonReviewScraper(api_key),
            "bestbuy": BestBuyReviewScraper(api_key),
        }
        self.all_reviews = []

    def aggregate(self, urls):
        """Scrape each known source in *urls* ({source: product_url}).

        Reviews accumulate into ``self.all_reviews``; the combined list is
        returned. Sources without a registered scraper are skipped quietly;
        per-source failures are reported and do not stop the run.
        """
        for source, url in urls.items():
            scraper = self.scrapers.get(source)
            if scraper is None:
                continue
            try:
                fetched = scraper.scrape_reviews(url)
                self.all_reviews.extend(fetched)
                print(f"Fetched {len(fetched)} reviews from {source}")
                time.sleep(3)  # polite pause between sites
            except Exception as e:
                print(f"Error with {source}: {e}")
        return self.all_reviews

    def analyze_sentiment(self):
        """Attach a TextBlob polarity score to every review and return
        summary counts plus the average star rating."""
        for review in self.all_reviews:
            review.sentiment = TextBlob(review.text).sentiment.polarity

        # Polarity thresholds: > 0.1 positive, < -0.1 negative, else neutral.
        tally = {"positive": 0, "negative": 0, "neutral": 0}
        for review in self.all_reviews:
            polarity = getattr(review, "sentiment", 0)
            if polarity > 0.1:
                tally["positive"] += 1
            elif polarity < -0.1:
                tally["negative"] += 1
            else:
                tally["neutral"] += 1

        total = len(self.all_reviews)
        mean_rating = (
            sum(review.rating for review in self.all_reviews) / total if total else 0
        )
        return {
            "total": total,
            "positive": tally["positive"],
            "negative": tally["negative"],
            "neutral": tally["neutral"],
            "avg_rating": mean_rating,
        }

    def export(self, filename="reviews.csv"):
        """Write all collected reviews to *filename* as CSV; return the DataFrame."""
        frame = pd.DataFrame([vars(review) for review in self.all_reviews])
        frame.to_csv(filename, index=False)
        return frame

# Example run: aggregate reviews for one product listed on both sites.
agg = ReviewAggregator(api_key="YOUR_KEY")  # ScraperAPI key shared by both scrapers
urls = {
    "amazon": "https://www.amazon.com/dp/B09V3KXJPB/",
    "bestbuy": "https://www.bestbuy.com/site/reviews/6505727"
}
agg.aggregate(urls)
# analyze_sentiment() also sets review.sentiment on each collected review.
sentiment = agg.analyze_sentiment()
print(f"Average rating: {sentiment['avg_rating']:.1f}/5")
print(f"Positive: {sentiment['positive']}, Negative: {sentiment['negative']}")

Finding Common Themes

from collections import Counter
import re

def extract_themes(reviews, min_count=3):
    """Return (word, count) pairs for frequently mentioned words.

    Words are lowercase alphabetic tokens longer than 3 characters and not
    in the stop-word list; only the 20 most common are considered, and of
    those only the ones seen at least *min_count* times are returned.
    """
    stop_words = {
        "the", "a", "an", "is", "it", "and", "or", "but", "in", "on", "at",
        "to", "for", "of", "with", "this", "that", "i", "my",
    }
    frequencies = Counter(
        token
        for review in reviews
        for token in re.findall(r"\b[a-z]+\b", review.text.lower())
        if token not in stop_words and len(token) > 3
    )
    return [pair for pair in frequencies.most_common(20) if pair[1] >= min_count]

# Surface the most-mentioned words across all aggregated reviews.
themes = extract_themes(agg.all_reviews)
for word, count in themes:
    print(f"{word}: {count} mentions")

Proxy Strategy

Retail sites have strong anti-bot measures. Use ScraperAPI with JavaScript rendering for Amazon and Best Buy pages. For high-volume scraping across multiple stores, ThorData residential proxies ensure consistent access. Track your success rates with ScrapeOps to quickly identify when a scraper needs attention.

Conclusion

A review aggregator gives you a 360-degree view of product sentiment that no single platform provides. The key patterns — base scraper classes, common data models, and sentiment analysis — apply to any multi-source aggregation project. Start with two sources, validate your parsing, then expand to more.

Happy scraping!

Top comments (0)