DEV Community

agenthustler
agenthustler

Posted on

Scraping App Reviews at Scale: Google Play + App Store Combined

App reviews are a goldmine for product intelligence. Sentiment analysis, feature requests, bug reports, and competitive insights all live in review data. This tutorial shows you how to scrape reviews from both Google Play and the Apple App Store using Python.

Architecture Overview

We'll build a unified pipeline that:

  • Scrapes Google Play reviews via a dedicated library
  • Fetches App Store reviews via the RSS feed
  • Normalizes data into a common schema
  • Stores everything in a SQLite database

Setup

pip install requests beautifulsoup4 pandas google-play-scraper
Enter fullscreen mode Exit fullscreen mode

The google-play-scraper library handles Google Play natively, and the App Store side needs only plain requests. If you later scrape pages that require proxy rotation, a service such as ScraperAPI can slot in at that point.

Google Play Scraper

from google_play_scraper import Sort, reviews
import time

def scrape_google_play_reviews(app_id, count=500):
    """Scrape up to *count* Google Play reviews for an app.

    Args:
        app_id: Package name, e.g. "com.spotify.music".
        count: Maximum number of reviews to return.

    Returns:
        List of dicts in the pipeline's common review schema.
    """
    all_reviews = []
    # The API caps a single page at 200 reviews; fetch the first page,
    # then follow the continuation token for the rest.
    result, token = reviews(
        app_id,
        lang="en",
        country="us",
        sort=Sort.NEWEST,
        count=min(count, 200)
    )
    all_reviews.extend(result)

    while len(all_reviews) < count and token:
        time.sleep(2)  # be polite between pages
        # Request only what we still need; the continuation token already
        # carries lang/country/sort from the first call.
        result, token = reviews(
            app_id,
            continuation_token=token,
            count=min(count - len(all_reviews), 200)
        )
        if not result:
            break  # empty page: stop instead of looping forever
        all_reviews.extend(result)

    # Normalize into the common schema shared with the App Store scraper.
    normalized = []
    for r in all_reviews[:count]:
        at = r.get("at")  # a datetime or None; look it up once
        normalized.append({
            "source": "google_play",
            "app_id": app_id,
            "author": r.get("userName", ""),
            "rating": r.get("score", 0),
            "text": r.get("content", ""),
            "date": at.isoformat() if at else "",
            "version": r.get("reviewCreatedVersion", ""),
            "thumbs_up": r.get("thumbsUpCount", 0)
        })
    return normalized
Enter fullscreen mode Exit fullscreen mode

App Store Scraper

import requests

def scrape_app_store_reviews(app_id, country="us", pages=5):
    """Scrape App Store reviews via the public iTunes RSS feed.

    Args:
        app_id: Numeric App Store id (int or str), e.g. "324684580".
        country: Two-letter storefront code.
        pages: Number of RSS pages to fetch (roughly 50 reviews each).

    Returns:
        List of dicts in the pipeline's common review schema.
    """
    all_reviews = []

    for page in range(1, pages + 1):
        rss_url = (
            f"https://itunes.apple.com/{country}/rss/"
            f"customerreviews/page={page}/id={app_id}"
            f"/sortby=mostrecent/json"
        )
        try:
            resp = requests.get(rss_url, timeout=15)
            resp.raise_for_status()  # surface 4xx/5xx instead of parsing an error page
            data = resp.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers malformed JSON; keep going on per-page failures.
            print(f"RSS page {page} failed: {e}")
            time.sleep(2)
            continue

        entries = data.get("feed", {}).get("entry", [])
        # The feed returns a bare dict (not a list) when the page holds a
        # single entry; normalize so the loop below always sees a list.
        if isinstance(entries, dict):
            entries = [entries]

        for entry in entries:
            # Entries without "im:rating" (e.g. the app-metadata entry on
            # page 1) are not reviews; skip them.
            if "im:rating" not in entry:
                continue
            all_reviews.append({
                "source": "app_store",
                "app_id": str(app_id),
                "author": entry.get("author", {}).get("name", {}).get("label", ""),
                "rating": int(entry["im:rating"]["label"]),
                "text": entry.get("content", {}).get("label", ""),
                "date": entry.get("updated", {}).get("label", ""),
                "version": entry.get("im:version", {}).get("label", ""),
                "thumbs_up": 0  # the RSS feed exposes no vote counts
            })
        time.sleep(2)

    return all_reviews
Enter fullscreen mode Exit fullscreen mode

Unified Storage

import sqlite3

def init_review_db(path="app_reviews.db"):
    """Open the review database, creating the ``reviews`` table if absent.

    Args:
        path: SQLite file path (created on first use).

    Returns:
        An open sqlite3 connection ready for store_reviews().
    """
    db = sqlite3.connect(path)
    # UNIQUE(source, app_id, author, date) is what lets store_reviews()
    # deduplicate re-scraped reviews.
    schema = """
        CREATE TABLE IF NOT EXISTS reviews (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            source TEXT,
            app_id TEXT,
            author TEXT,
            rating INTEGER,
            text TEXT,
            date TEXT,
            version TEXT,
            thumbs_up INTEGER DEFAULT 0,
            UNIQUE(source, app_id, author, date)
        )
    """
    db.execute(schema)
    db.commit()
    return db

def store_reviews(conn, review_list):
    """Insert reviews into the db, skipping duplicates.

    Args:
        conn: Open sqlite3 connection with the ``reviews`` table.
        review_list: Dicts in the common review schema.

    Returns:
        Number of rows actually inserted (duplicates excluded).
    """
    new_count = 0
    for r in review_list:
        # INSERT OR IGNORE lets SQLite enforce the UNIQUE constraint
        # directly instead of catching IntegrityError per row.
        cur = conn.execute(
            """INSERT OR IGNORE INTO reviews
            (source, app_id, author, rating, text, date, version, thumbs_up)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)""",
            (r["source"], r["app_id"], r["author"], r["rating"],
             r["text"], r["date"], r["version"], r["thumbs_up"])
        )
        new_count += cur.rowcount  # 1 if inserted, 0 if ignored as duplicate
    conn.commit()
    return new_count
Enter fullscreen mode Exit fullscreen mode

Running the Full Pipeline

# Apps to scrape, keyed by store. Google Play uses package names;
# the App Store uses numeric ids (same three apps in both stores).
APPS = {
    "google_play": [
        "com.spotify.music",
        "com.instagram.android",
        "com.whatsapp",
    ],
    "app_store": [
        "324684580",   # Spotify
        "389801252",   # Instagram
        "310633997",   # WhatsApp
    ]
}

def run_review_pipeline():
    """Scrape both stores for every configured app and persist the results."""
    conn = init_review_db()

    def _process(label, app_id, fetch):
        # Shared scrape → store → report step for both stores.
        print(f"Scraping {label}: {app_id}")
        batch = fetch(app_id)
        stored = store_reviews(conn, batch)
        print(f"  Got {len(batch)} reviews, {stored} new")
        time.sleep(5)

    for app_id in APPS["google_play"]:
        _process("Google Play", app_id,
                 lambda aid: scrape_google_play_reviews(aid, count=300))

    for app_id in APPS["app_store"]:
        _process("App Store", app_id,
                 lambda aid: scrape_app_store_reviews(aid, pages=5))

    conn.close()
    print("Pipeline complete!")

# Script entry point: run the full scrape-and-store pipeline.
if __name__ == "__main__":
    run_review_pipeline()
Enter fullscreen mode Exit fullscreen mode

Quick Analysis

import pandas as pd

def analyze_reviews(db_path="app_reviews.db"):
    """Print summary statistics for the stored reviews.

    Args:
        db_path: Path to the SQLite database created by init_review_db().
    """
    conn = sqlite3.connect(db_path)
    try:
        df = pd.read_sql("SELECT * FROM reviews", conn)
    finally:
        conn.close()  # bug fix: the original never closed the connection

    print("Reviews by source:")
    print(df.groupby("source")["rating"].agg(["count", "mean"]))

    print("\nRating distribution:")
    print(df["rating"].value_counts().sort_index())

    low_reviews = df[df["rating"] <= 2]["text"]
    print(f"\nLow-rated reviews: {len(low_reviews)}")
Enter fullscreen mode Exit fullscreen mode

Scaling Tips

For scraping at production scale, ScraperAPI handles proxy rotation and rate limiting automatically. If you need residential IPs for region-specific stores, ThorData is a solid choice. Track your scraper health with ScrapeOps dashboards.

Conclusion

Combining Google Play and App Store reviews into a single pipeline gives you unmatched product intelligence. Run this weekly and you'll catch sentiment shifts, feature demands, and competitor weaknesses before anyone else.

Top comments (0)