DEV Community

agenthustler
agenthustler

Posted on

Scraping App Permission Data: Privacy Analysis at Scale

Scraping App Permission Data: Privacy Analysis at Scale

Mobile apps request permissions that reveal their true data collection practices. Let's build a Python pipeline that scrapes app permission data and analyzes privacy patterns at scale.

Data Sources

  • Google Play Store — Detailed permission lists and data safety sections
  • Exodus Privacy — Android tracker and permission database (API available)
  • Apple App Store — Privacy nutrition labels

Setting Up

pip install requests beautifulsoup4 pandas matplotlib google-play-scraper
Enter fullscreen mode Exit fullscreen mode

Google Play Store Permissions

from google_play_scraper import app, permissions
import pandas as pd

def get_app_permissions(app_id):
    """Look up Play Store metadata and the permission listing for one app.

    Args:
        app_id: Package identifier handed to ``app()`` and ``permissions()``.

    Returns:
        A dict with the keys ``app_id``, ``title``, ``developer``,
        ``installs`` (read from ``realInstalls``), ``score`` and
        ``permissions``; or ``None`` if anything raised during the lookup,
        in which case the error is printed.
    """
    record = None
    try:
        listing = app(app_id)
        granted = permissions(app_id)
        # Built inside the try block so a malformed listing is reported the
        # same way as a failed fetch.
        record = {
            "app_id": app_id,
            "title": listing.get("title", ""),
            "developer": listing.get("developer", ""),
            "installs": listing.get("realInstalls", 0),
            "score": listing.get("score", 0),
            "permissions": granted,
        }
    except Exception as err:
        print(f"Error fetching {app_id}: {err}")
    return record

# Package identifiers of the apps to audit.
apps_to_check = [
    "com.whatsapp",
    "org.telegram.messenger",
    "com.discord",
    "com.Slack",
    "com.facebook.orca",
]

# Collect one record per app whose lookup succeeded; a falsy result means
# the lookup failed and the app is skipped.
results = []
for app_id in apps_to_check:
    data = get_app_permissions(app_id)
    if not data:
        continue
    results.append(data)
    print(f"{data['title']}: {len(data['permissions'])} permission groups")
Enter fullscreen mode Exit fullscreen mode

Exodus Privacy API

import requests

# Base URL of the Exodus Privacy reports API; endpoint paths are appended to it.
EXODUS_API = "https://reports.exodus-privacy.eu.org/api"

def get_exodus_report(app_id, timeout=10):
    """Fetch a tracker/permission summary for an app from Exodus Privacy.

    Args:
        app_id: Package identifier inserted into the
            ``/search/<app_id>/details`` endpoint path.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        A dict with ``app_id``, ``trackers``, ``tracker_count``,
        ``permissions`` and ``permission_count``; or ``None`` when the
        response status is not 200, the body is empty, is not valid JSON,
        or does not contain a report object.

    Raises:
        requests.RequestException: On connection errors or timeouts.
    """
    # NOTE(review): the Exodus API may require an "Authorization: Token ..."
    # header for this endpoint - confirm against the API documentation.
    resp = requests.get(
        f"{EXODUS_API}/search/{app_id}/details", timeout=timeout
    )
    if resp.status_code != 200:
        return None

    try:
        data = resp.json()
    except ValueError:
        # Body was not JSON (e.g. an HTML error page served with status 200).
        return None
    if not data:
        return None

    # NOTE(review): the first list element is treated as the latest report;
    # confirm the ordering the API actually returns.
    latest = data[0] if isinstance(data, list) else data
    if not isinstance(latest, dict):
        return None

    # "or []" also covers keys that are present but explicitly null.
    trackers = latest.get("trackers") or []
    perms = latest.get("permissions") or []
    return {
        "app_id": app_id,
        "trackers": trackers,
        "tracker_count": len(trackers),
        "permissions": perms,
        "permission_count": len(perms),
    }

# Print tracker and permission totals for every app that yields a report;
# apps without one are skipped silently.
for app_id in apps_to_check:
    report = get_exodus_report(app_id)
    if not report:
        continue
    print(
        f"{app_id}: {report['tracker_count']} trackers, {report['permission_count']} permissions"
    )
Enter fullscreen mode Exit fullscreen mode

Permission Risk Scoring

# Risk weight for each permission considered privacy-sensitive; a higher
# number contributes more to an app's risk score.
HIGH_RISK_PERMISSIONS = {
    "android.permission.READ_CONTACTS": 8,
    "android.permission.ACCESS_FINE_LOCATION": 9,
    "android.permission.CAMERA": 7,
    "android.permission.RECORD_AUDIO": 8,
    "android.permission.READ_PHONE_STATE": 6,
    "android.permission.READ_SMS": 9,
    "android.permission.READ_CALL_LOG": 8,
    "android.permission.BODY_SENSORS": 7,
}


def calculate_privacy_score(permissions):
    """Score a permission list against ``HIGH_RISK_PERMISSIONS``.

    Args:
        permissions: Sized collection of permission identifiers.

    Returns:
        A dict with ``risk_score`` (sum of the weights of every listed
        high-risk permission, repeats included), ``flagged_permissions``
        (the text after the last dot of each matched permission, in input
        order) and ``total_permissions`` (``len(permissions)``).
    """
    risky = [name for name in permissions if name in HIGH_RISK_PERMISSIONS]
    total_weight = sum(HIGH_RISK_PERMISSIONS[name] for name in risky)
    short_names = [name.rsplit(".", 1)[-1] for name in risky]
    return {
        "risk_score": total_weight,
        "flagged_permissions": short_names,
        "total_permissions": len(permissions),
    }
Enter fullscreen mode Exit fullscreen mode

Comparative Analysis

import matplotlib.pyplot as plt

def _flatten_permission_groups(groups):
    """Collapse one app's permission data into a flat list of permissions."""
    if isinstance(groups, dict):
        # Mapping shape: {group name: [permission, ...]}.
        return [perm for members in groups.values() for perm in (members or [])]
    if not isinstance(groups, (list, tuple)):
        # Missing or unrecognised value: nothing to score.
        return []
    flat = []
    for group in groups:
        if isinstance(group, dict):
            # List-of-groups shape: [{"permissions": [...]}, ...].
            flat.extend(group.get("permissions", []))
        else:
            # Already a plain permission entry.
            flat.append(group)
    return flat


def compare_app_privacy(results):
    """Plot permission counts and risk scores side by side for several apps.

    Saves a two-panel horizontal bar chart to ``privacy_comparison.png``:
    the left panel shows ``len()`` of each app's ``permissions`` entry, the
    right panel shows the ``risk_score`` from ``calculate_privacy_score``,
    coloured green below 15, orange below 30 and red otherwise.

    Args:
        results: Records with at least ``title`` and ``permissions`` keys.
            ``permissions`` may be a mapping of group name to permission
            list, a list of ``{"permissions": [...]}`` dicts, or a flat list
            of permissions.

    Returns:
        None. When ``results`` produces an empty table a message is printed
        and no file is written.
    """
    df = pd.DataFrame(results)
    if df.empty:
        # Without rows there is no "title" column and plotting would fail.
        print("No app data to compare; skipping chart.")
        return

    fig, axes = plt.subplots(1, 2, figsize=(14, 6))

    axes[0].barh(df["title"], df["permissions"].apply(len))
    axes[0].set_xlabel("Number of Permission Groups")
    axes[0].set_title("Permission Requests by App")

    # NOTE(review): scores only become non-zero when the permission strings
    # match the keys of HIGH_RISK_PERMISSIONS ("android.permission.*");
    # confirm the data source actually provides identifiers in that form.
    risk_scores = [
        calculate_privacy_score(_flatten_permission_groups(groups))["risk_score"]
        for groups in df["permissions"]
    ]

    colors = ["#2ecc71" if s < 15 else "#f39c12" if s < 30 else "#e74c3c"
              for s in risk_scores]
    axes[1].barh(df["title"], risk_scores, color=colors)
    axes[1].set_xlabel("Privacy Risk Score")
    axes[1].set_title("Privacy Risk Assessment")

    plt.tight_layout()
    plt.savefig("privacy_comparison.png", dpi=150)

# Draw and save the comparison chart for the records collected in `results`.
compare_app_privacy(results)
Enter fullscreen mode Exit fullscreen mode

Scraping iOS Privacy Labels

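For Apple App Store privacy labels, fetch the page through ScraperAPI with JavaScript rendering enabled. This snippet relies on `requests` and on `BeautifulSoup` from the `bs4` package: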

def scrape_ios_privacy(app_url, api_key="YOUR_SCRAPERAPI_KEY", timeout=70):
    """Scrape the privacy cards from an App Store page via ScraperAPI.

    Args:
        app_url: URL of the App Store page to fetch.
        api_key: ScraperAPI key sent as the ``api_key`` query parameter.
            Defaults to the placeholder value; pass a real key.
        timeout: Seconds to wait for the rendered page before giving up.
            NOTE(review): 70 is chosen to leave room for JS rendering -
            confirm against ScraperAPI's recommended timeout.

    Returns:
        A list of ``{"category": str, "data_types": [str, ...]}`` dicts, one
        per privacy card that has a header element. The list is empty when
        the response status is not 200 or no cards are found.

    Raises:
        requests.RequestException: On connection errors or timeouts.
    """
    # Imported locally because the surrounding snippets never import it.
    from bs4 import BeautifulSoup

    params = {
        "api_key": api_key,
        "url": app_url,
        "render": "true",
    }
    resp = requests.get(
        "https://api.scraperapi.com", params=params, timeout=timeout
    )
    if resp.status_code != 200:
        # An error page carries no privacy cards; skip parsing it.
        return []

    soup = BeautifulSoup(resp.text, "html.parser")
    labels = []
    for card in soup.select(".app-privacy__card"):
        category = card.select_one(".app-privacy__card-header")
        if category is None:
            # Cards without a header cannot be labelled.
            continue
        items = card.select(".app-privacy__list-item")
        labels.append({
            "category": category.get_text(strip=True),
            "data_types": [item.get_text(strip=True) for item in items],
        })
    return labels
Enter fullscreen mode Exit fullscreen mode

To scale beyond a handful of apps, route requests through proxies (for example, ThorData) and monitor your scraping jobs with ScrapeOps.

Key Takeaways

  • Google Play Scraper and Exodus API provide structured permission data
  • Risk scoring quantifies privacy invasiveness across apps
  • Comparative analysis reveals which apps over-collect data
  • Regular monitoring catches permission creep in updates

App store data is publicly visible. Use this analysis for privacy research and informed app choices.

Top comments (0)