DEV Community

agenthustler
agenthustler

Posted on

Scraping App Reviews at Scale: Google Play + App Store Combined

App reviews are a goldmine for product intelligence. Sentiment analysis, feature requests, bug reports, and competitive insights all live in review data. This tutorial shows you how to scrape reviews from both Google Play and the Apple App Store using Python.

Architecture Overview

We'll build a unified pipeline that:

  • Scrapes Google Play reviews via a dedicated library
  • Fetches App Store reviews via the RSS feed
  • Normalizes data into a common schema
  • Stores everything in a SQLite database

Setup

pip install requests beautifulsoup4 pandas google-play-scraper
Enter fullscreen mode Exit fullscreen mode

The google-play-scraper library handles Google Play natively, and the App Store side needs only plain requests. If you later scrape pages that require proxy rotation, a service such as ScraperAPI can slot in at that point.

Google Play Scraper

from google_play_scraper import Sort, reviews
import time

def scrape_google_play_reviews(app_id, count=500):
    """Scrape up to *count* Google Play reviews for an app.

    Args:
        app_id: Package name, e.g. "com.spotify.music".
        count: Maximum number of reviews to return.

    Returns:
        List of dicts in the pipeline's common review schema.
    """
    all_reviews = []
    # The API caps a single page at 200 reviews; fetch the first page,
    # then follow the continuation token for the rest.
    result, token = reviews(
        app_id,
        lang="en",
        country="us",
        sort=Sort.NEWEST,
        count=min(count, 200)
    )
    all_reviews.extend(result)

    while len(all_reviews) < count and token:
        time.sleep(2)  # be polite between pages
        # Request only what we still need; the continuation token already
        # carries lang/country/sort from the first call.
        result, token = reviews(
            app_id,
            continuation_token=token,
            count=min(count - len(all_reviews), 200)
        )
        if not result:
            break  # empty page: stop instead of looping forever
        all_reviews.extend(result)

    # Normalize into the common schema shared with the App Store scraper.
    normalized = []
    for r in all_reviews[:count]:
        at = r.get("at")  # a datetime or None; look it up once
        normalized.append({
            "source": "google_play",
            "app_id": app_id,
            "author": r.get("userName", ""),
            "rating": r.get("score", 0),
            "text": r.get("content", ""),
            "date": at.isoformat() if at else "",
            "version": r.get("reviewCreatedVersion", ""),
            "thumbs_up": r.get("thumbsUpCount", 0)
        })
    return normalized
Enter fullscreen mode Exit fullscreen mode

App Store Scraper

import requests

def scrape_app_store_reviews(app_id, country="us", pages=5):
    """Scrape App Store reviews via the public iTunes RSS feed.

    Args:
        app_id: Numeric App Store id (int or str), e.g. "324684580".
        country: Two-letter storefront code.
        pages: Number of RSS pages to fetch (roughly 50 reviews each).

    Returns:
        List of dicts in the pipeline's common review schema.
    """
    all_reviews = []

    for page in range(1, pages + 1):
        rss_url = (
            f"https://itunes.apple.com/{country}/rss/"
            f"customerreviews/page={page}/id={app_id}"
            f"/sortby=mostrecent/json"
        )
        try:
            resp = requests.get(rss_url, timeout=15)
            resp.raise_for_status()  # surface 4xx/5xx instead of parsing an error page
            data = resp.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers malformed JSON; keep going on per-page failures.
            print(f"RSS page {page} failed: {e}")
            time.sleep(2)
            continue

        entries = data.get("feed", {}).get("entry", [])
        # The feed returns a bare dict (not a list) when the page holds a
        # single entry; normalize so the loop below always sees a list.
        if isinstance(entries, dict):
            entries = [entries]

        for entry in entries:
            # Entries without "im:rating" (e.g. the app-metadata entry on
            # page 1) are not reviews; skip them.
            if "im:rating" not in entry:
                continue
            all_reviews.append({
                "source": "app_store",
                "app_id": str(app_id),
                "author": entry.get("author", {}).get("name", {}).get("label", ""),
                "rating": int(entry["im:rating"]["label"]),
                "text": entry.get("content", {}).get("label", ""),
                "date": entry.get("updated", {}).get("label", ""),
                "version": entry.get("im:version", {}).get("label", ""),
                "thumbs_up": 0  # the RSS feed exposes no vote counts
            })
        time.sleep(2)

    return all_reviews
Enter fullscreen mode Exit fullscreen mode

Unified Storage

import sqlite3

def init_review_db(path="app_reviews.db"):
    """Open the review database, creating the ``reviews`` table if absent.

    Args:
        path: SQLite file path (created on first use).

    Returns:
        An open sqlite3 connection ready for store_reviews().
    """
    db = sqlite3.connect(path)
    # UNIQUE(source, app_id, author, date) is what lets store_reviews()
    # deduplicate re-scraped reviews.
    schema = """
        CREATE TABLE IF NOT EXISTS reviews (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            source TEXT,
            app_id TEXT,
            author TEXT,
            rating INTEGER,
            text TEXT,
            date TEXT,
            version TEXT,
            thumbs_up INTEGER DEFAULT 0,
            UNIQUE(source, app_id, author, date)
        )
    """
    db.execute(schema)
    db.commit()
    return db

def store_reviews(conn, review_list):
    """Insert reviews into the db, skipping duplicates.

    Args:
        conn: Open sqlite3 connection with the ``reviews`` table.
        review_list: Dicts in the common review schema.

    Returns:
        Number of rows actually inserted (duplicates excluded).
    """
    new_count = 0
    for r in review_list:
        # INSERT OR IGNORE lets SQLite enforce the UNIQUE constraint
        # directly instead of catching IntegrityError per row.
        cur = conn.execute(
            """INSERT OR IGNORE INTO reviews
            (source, app_id, author, rating, text, date, version, thumbs_up)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)""",
            (r["source"], r["app_id"], r["author"], r["rating"],
             r["text"], r["date"], r["version"], r["thumbs_up"])
        )
        new_count += cur.rowcount  # 1 if inserted, 0 if ignored as duplicate
    conn.commit()
    return new_count
Enter fullscreen mode Exit fullscreen mode

Running the Full Pipeline

# Apps to scrape, keyed by store. Google Play uses package names;
# the App Store uses numeric ids (same three apps in both stores).
APPS = {
    "google_play": [
        "com.spotify.music",
        "com.instagram.android",
        "com.whatsapp",
    ],
    "app_store": [
        "324684580",   # Spotify
        "389801252",   # Instagram
        "310633997",   # WhatsApp
    ]
}

def run_review_pipeline():
    """Scrape both stores for every configured app and persist the results."""
    conn = init_review_db()

    def _process(label, app_id, fetch):
        # Shared scrape → store → report step for both stores.
        print(f"Scraping {label}: {app_id}")
        batch = fetch(app_id)
        stored = store_reviews(conn, batch)
        print(f"  Got {len(batch)} reviews, {stored} new")
        time.sleep(5)

    for app_id in APPS["google_play"]:
        _process("Google Play", app_id,
                 lambda aid: scrape_google_play_reviews(aid, count=300))

    for app_id in APPS["app_store"]:
        _process("App Store", app_id,
                 lambda aid: scrape_app_store_reviews(aid, pages=5))

    conn.close()
    print("Pipeline complete!")

# Script entry point: run the full scrape-and-store pipeline.
if __name__ == "__main__":
    run_review_pipeline()
Enter fullscreen mode Exit fullscreen mode

Quick Analysis

import pandas as pd

def analyze_reviews(db_path="app_reviews.db"):
    """Print summary statistics for the stored reviews.

    Args:
        db_path: Path to the SQLite database created by init_review_db().
    """
    conn = sqlite3.connect(db_path)
    try:
        df = pd.read_sql("SELECT * FROM reviews", conn)
    finally:
        conn.close()  # bug fix: the original never closed the connection

    print("Reviews by source:")
    print(df.groupby("source")["rating"].agg(["count", "mean"]))

    print("\nRating distribution:")
    print(df["rating"].value_counts().sort_index())

    low_reviews = df[df["rating"] <= 2]["text"]
    print(f"\nLow-rated reviews: {len(low_reviews)}")
Enter fullscreen mode Exit fullscreen mode

Scaling Tips

For scraping at production scale, ScraperAPI handles proxy rotation and rate limiting automatically. If you need residential IPs for region-specific stores, ThorData is a solid choice. Track your scraper health with ScrapeOps dashboards.

Conclusion

Combining Google Play and App Store reviews into a single pipeline gives you unmatched product intelligence. Run this weekly and you'll catch sentiment shifts, feature demands, and competitor weaknesses before anyone else.

Top comments (0)