App reviews are a goldmine for product intelligence. Sentiment analysis, feature requests, bug reports, and competitive insights all live in review data. This tutorial shows you how to scrape reviews from both Google Play and the Apple App Store using Python.
Architecture Overview
We'll build a unified pipeline that:
- Scrapes Google Play reviews via a dedicated library
- Fetches App Store reviews via the RSS feed
- Normalizes data into a common schema
- Stores everything in a SQLite database
Setup
pip install requests beautifulsoup4 pandas google-play-scraper
The google-play-scraper library handles Google Play natively; for pages that need proxy rotation you could layer in a service such as ScraperAPI, though the code in this tutorial does not require one.
Google Play Scraper
from google_play_scraper import Sort, reviews
import time
def scrape_google_play_reviews(app_id, count=500):
    """Scrape up to *count* Google Play reviews for an app.

    Args:
        app_id: Android package name, e.g. "com.spotify.music".
        count: Maximum number of reviews to return.

    Returns:
        A list of dicts in the pipeline's common review schema
        (source, app_id, author, rating, text, date, version, thumbs_up).
    """
    all_reviews = []
    # A single request is capped around 200 reviews; further pages are
    # fetched with the continuation token (which remembers lang/country/sort).
    result, token = reviews(
        app_id,
        lang="en",
        country="us",
        sort=Sort.NEWEST,
        count=min(count, 200),
    )
    all_reviews.extend(result)
    while len(all_reviews) < count and token:
        result, token = reviews(
            app_id,
            continuation_token=token,
        )
        if not result:
            # Defensive: a live token paired with an empty page would
            # otherwise loop here forever without making progress.
            break
        all_reviews.extend(result)
        time.sleep(2)  # be polite between paginated requests
    return [{
        "source": "google_play",
        "app_id": app_id,
        "author": r.get("userName", ""),
        "rating": r.get("score", 0),
        "text": r.get("content", ""),
        # "at" is a datetime; normalize to ISO-8601 text for SQLite storage.
        "date": r.get("at").isoformat() if r.get("at") else "",
        "version": r.get("reviewCreatedVersion", ""),
        "thumbs_up": r.get("thumbsUpCount", 0),
    } for r in all_reviews[:count]]
App Store Scraper
import requests
def scrape_app_store_reviews(app_id, country="us", pages=5):
    """Scrape App Store reviews via the public iTunes RSS feed.

    Args:
        app_id: Numeric App Store id (int or str), e.g. "324684580".
        country: Two-letter storefront code, e.g. "us".
        pages: Number of RSS pages to fetch (roughly 50 reviews per page).

    Returns:
        A list of dicts in the pipeline's common review schema.
    """
    all_reviews = []
    for page in range(1, pages + 1):
        rss_url = (
            f"https://itunes.apple.com/{country}/rss/"
            f"customerreviews/page={page}/id={app_id}"
            f"/sortby=mostrecent/json"
        )
        try:
            resp = requests.get(rss_url, timeout=15)
            # Surface HTTP errors directly instead of a confusing JSON
            # decode failure on an error page.
            resp.raise_for_status()
            data = resp.json()
            entries = data.get("feed", {}).get("entry", [])
            # The feed returns a bare object (not a list) when a page has
            # exactly one entry; normalize so the loop always sees a list.
            if isinstance(entries, dict):
                entries = [entries]
            for entry in entries:
                # The feed's first entry is app metadata without a rating;
                # the "im:rating" check filters it out.
                if "im:rating" in entry:
                    all_reviews.append({
                        "source": "app_store",
                        "app_id": str(app_id),
                        "author": entry.get("author", {}).get("name", {}).get("label", ""),
                        "rating": int(entry["im:rating"]["label"]),
                        "text": entry.get("content", {}).get("label", ""),
                        "date": entry.get("updated", {}).get("label", ""),
                        "version": entry.get("im:version", {}).get("label", ""),
                        "thumbs_up": 0,  # RSS feed exposes no helpfulness votes
                    })
        except Exception as e:
            # Best-effort: one failed page should not abort the whole scrape.
            print(f"RSS page {page} failed: {e}")
        time.sleep(2)  # be polite between page fetches
    return all_reviews
Unified Storage
import sqlite3
def init_review_db(path="app_reviews.db"):
    """Open the SQLite review database, creating the schema if needed.

    The UNIQUE constraint on (source, app_id, author, date) lets repeated
    pipeline runs skip reviews that were already stored.

    Args:
        path: Filesystem path of the database file.

    Returns:
        An open sqlite3.Connection; the caller is responsible for closing it.
    """
    connection = sqlite3.connect(path)
    schema = """
        CREATE TABLE IF NOT EXISTS reviews (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            source TEXT,
            app_id TEXT,
            author TEXT,
            rating INTEGER,
            text TEXT,
            date TEXT,
            version TEXT,
            thumbs_up INTEGER DEFAULT 0,
            UNIQUE(source, app_id, author, date)
        )
    """
    connection.execute(schema)
    connection.commit()
    return connection
def store_reviews(conn, review_list):
    """Insert reviews into the database, skipping duplicates.

    Duplicates (same source, app_id, author, date — the table's UNIQUE key)
    are ignored via INSERT OR IGNORE rather than a per-row try/except,
    which also lets the whole batch go through one executemany call.

    Args:
        conn: Open sqlite3.Connection with the reviews table present.
        review_list: Iterable of dicts in the common review schema.

    Returns:
        The number of rows actually inserted (new reviews).
    """
    before = conn.total_changes
    conn.executemany(
        """INSERT OR IGNORE INTO reviews
           (source, app_id, author, rating, text, date, version, thumbs_up)
           VALUES (?, ?, ?, ?, ?, ?, ?, ?)""",
        [(r["source"], r["app_id"], r["author"], r["rating"],
          r["text"], r["date"], r["version"], r["thumbs_up"])
         for r in review_list],
    )
    conn.commit()
    # total_changes only counts rows that were really written, so ignored
    # duplicates do not inflate the count.
    return conn.total_changes - before
Running the Full Pipeline
# Apps to scrape, keyed by store. Google Play identifies apps by package
# name; the App Store uses numeric ids (same three apps in both lists).
APPS = {
"google_play": [
"com.spotify.music",
"com.instagram.android",
"com.whatsapp",
],
"app_store": [
"324684580", # Spotify
"389801252", # Instagram
"310633997", # WhatsApp
]
}
def run_review_pipeline():
    """Scrape every configured app from both stores into the shared database.

    Opens the SQLite database, runs the Google Play scraper and then the
    App Store scraper over the ids in APPS, storing normalized reviews as
    it goes. The connection is closed even if a scraper raises.
    """
    conn = init_review_db()
    try:
        for app_id in APPS["google_play"]:
            print(f"Scraping Google Play: {app_id}")
            result = scrape_google_play_reviews(app_id, count=300)
            new = store_reviews(conn, result)
            print(f" Got {len(result)} reviews, {new} new")
            time.sleep(5)  # space apps out to stay under rate limits
        for app_id in APPS["app_store"]:
            print(f"Scraping App Store: {app_id}")
            result = scrape_app_store_reviews(app_id, pages=5)
            new = store_reviews(conn, result)
            print(f" Got {len(result)} reviews, {new} new")
            time.sleep(5)
    finally:
        # Bug fix: previously the connection leaked if any scraper raised.
        conn.close()
    print("Pipeline complete!")


if __name__ == "__main__":
    run_review_pipeline()
Quick Analysis
import pandas as pd
def analyze_reviews(db_path="app_reviews.db"):
    """Print summary statistics for the scraped reviews.

    Shows per-source counts and mean ratings, the overall rating
    distribution, and the number of low-rated (<= 2 star) reviews.

    Args:
        db_path: Path to the SQLite database created by init_review_db().
    """
    conn = sqlite3.connect(db_path)
    try:
        df = pd.read_sql("SELECT * FROM reviews", conn)
    finally:
        # Bug fix: the connection was previously never closed.
        conn.close()
    print("Reviews by source:")
    print(df.groupby("source")["rating"].agg(["count", "mean"]))
    print("\nRating distribution:")
    print(df["rating"].value_counts().sort_index())
    low_reviews = df[df["rating"] <= 2]["text"]
    print(f"\nLow-rated reviews: {len(low_reviews)}")
Scaling Tips
For scraping at production scale, ScraperAPI handles proxy rotation and rate limiting automatically. If you need residential IPs for region-specific stores, ThorData is a solid choice. Track your scraper health with ScrapeOps dashboards.
Conclusion
Combining Google Play and App Store reviews into a single pipeline gives you unmatched product intelligence. Run this weekly and you'll catch sentiment shifts, feature demands, and competitor weaknesses before anyone else.
Top comments (0)