Scraping Podcast Charts: Spotify, Apple, and Chartable Data

#python #tutorial #webdev #programming

Podcast analytics are notoriously opaque. Platforms guard their data closely. But by scraping public charts from Spotify, Apple Podcasts, and Chartable, you can build a comprehensive picture of what's trending in audio content.

What We'll Track

Top podcast rankings by category
Chart movement and velocity
Cross-platform performance comparisons
Genre and category trends

Setup

pip install requests beautifulsoup4 pandas matplotlib

Apple Podcasts Chart Scraper

Apple Podcasts exposes chart data via a public RSS-like endpoint:

import requests
import time
from datetime import datetime

def scrape_apple_podcast_charts(genre_id=26, country="us", limit=100):
    """Fetch one Apple Podcasts top chart and return it as a list of rows.

    Genre IDs: 26=All, 1301=Arts, 1303=Comedy, 1304=Education,
    1307=Health, 1309=TV, 1311=Music, 1314=Religion,
    1318=Technology, 1321=Business, 1324=Society, 1325=News

    Args:
        genre_id: Genre ID placed in the feed URL.
        country: Country code placed in the feed URL and copied onto
            every returned row.
        limit: Chart size placed in the feed URL.

    Returns:
        A list of dicts with the keys ``rank``, ``name``, ``artist``,
        ``apple_id``, ``url``, ``genre``, ``platform``, ``country`` and
        ``scraped_at``. ``rank`` is the 1-based position in the feed's
        ``results`` list; ``genre`` is the name of the first entry of the
        result's ``genres`` list, or ``""`` when there is none.

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
    """
    url = (
        f"https://rss.applemarketingtools.com/api/v2/{country}"
        f"/podcasts/top/{limit}/genre={genre_id}/json"
    )

    response = requests.get(url, timeout=15)
    # Fail loudly on an error status instead of trying to parse an error page.
    response.raise_for_status()
    results = response.json().get("feed", {}).get("results", [])

    # One timestamp per scrape so every row of a snapshot shares the same value.
    scraped_at = datetime.now().isoformat()

    podcasts = []
    for rank, result in enumerate(results, 1):
        genres = result.get("genres") or []
        podcasts.append({
            "rank": rank,
            "name": result.get("name", ""),
            "artist": result.get("artistName", ""),
            "apple_id": result.get("id", ""),
            "url": result.get("url", ""),
            "genre": genres[0].get("name", "") if genres else "",
            "platform": "apple_podcasts",
            "country": country,
            "scraped_at": scraped_at,
        })

    return podcasts

Spotify Podcast Charts

from bs4 import BeautifulSoup

SCRAPER_API_KEY = "YOUR_KEY"

def scrape_spotify_podcast_charts(country="us"):
    """Scrape the Spotify podcast charts page through ScraperAPI.

    Args:
        country: Copied onto every returned row. It is not part of the
            request: the page fetched is always the same fixed URL.

    Returns:
        A list of dicts with the keys ``rank``, ``name``, ``artist``,
        ``platform``, ``country`` and ``scraped_at``. ``rank`` is the
        1-based position of the matched chart element in the page; matched
        elements without a title are skipped but still consume a rank.

    Raises:
        requests.HTTPError: If ScraperAPI answers with a 4xx/5xx status.
    """
    # Passing the target as a query parameter lets requests percent-encode it;
    # HTTPS keeps the API key out of plaintext traffic.
    response = requests.get(
        "https://api.scraperapi.com/",
        params={
            "api_key": SCRAPER_API_KEY,
            "url": "https://podcastcharts.byspotify.com/",
            "render": "true",
        },
        timeout=60,
    )
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # One timestamp per scrape so every row of a snapshot shares the same value.
    scraped_at = datetime.now().isoformat()

    podcasts = []
    rows = soup.select(".chart-item, [data-testid='chart-row']")
    for rank, item in enumerate(rows, 1):
        title = item.select_one(".show-name, .chart-title")
        if not title:
            continue
        publisher = item.select_one(".publisher, .chart-subtitle")
        podcasts.append({
            "rank": rank,
            "name": title.text.strip(),
            "artist": publisher.text.strip() if publisher else "",
            "platform": "spotify",
            "country": country,
            "scraped_at": scraped_at,
        })

    return podcasts

Chartable Data

def _parse_rank(cell_text):
    """Return the integer in the first whitespace-separated token, or 0."""
    tokens = cell_text.split()
    if not tokens:
        return 0
    # Keep only ASCII digits so decorations such as "#1" or "1." still parse.
    digits = "".join(ch for ch in tokens[0] if ch in "0123456789")
    return int(digits) if digits else 0


def scrape_chartable(category="overall", country="us"):
    """Scrape a Chartable podcast ranking table through ScraperAPI.

    Args:
        category: Chart category placed in the Chartable URL and copied
            onto every returned row.
        country: Country code placed in the Chartable URL and copied onto
            every returned row.

    Returns:
        A list of dicts with the keys ``rank``, ``name``, ``movement``,
        ``platform``, ``category``, ``country`` and ``scraped_at``. Table
        rows with fewer than three cells, or without a link in the second
        cell, are skipped. ``rank`` is 0 when the first cell holds no
        number; ``movement`` is ``"--"`` when the first cell has no
        ``.movement`` element.

    Raises:
        requests.HTTPError: If ScraperAPI answers with a 4xx/5xx status.
    """
    url = f"https://chartable.com/charts/itunes/{country}-{category}-podcasts"
    # Passing the target as a query parameter lets requests percent-encode it;
    # HTTPS keeps the API key out of plaintext traffic.
    response = requests.get(
        "https://api.scraperapi.com/",
        params={"api_key": SCRAPER_API_KEY, "url": url, "render": "true"},
        timeout=60,
    )
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # One timestamp per scrape so every row of a snapshot shares the same value.
    scraped_at = datetime.now().isoformat()

    podcasts = []
    for row in soup.select("table tbody tr"):
        cols = row.select("td")
        if len(cols) < 3:
            continue
        name_el = cols[1].select_one("a")
        if not name_el:
            continue
        # The first cell carries both the rank and the optional movement marker.
        rank_el = cols[0]
        movement = rank_el.select_one(".movement")
        podcasts.append({
            "rank": _parse_rank(rank_el.text),
            "name": name_el.text.strip(),
            "movement": movement.text.strip() if movement else "--",
            "platform": "chartable",
            "category": category,
            "country": country,
            "scraped_at": scraped_at,
        })

    return podcasts

Cross-Platform Analysis

import pandas as pd

def collect_all_charts():
    """Run every scraper in sequence and return all rows in one list.

    Order: four Apple Podcasts genres, then Spotify, then three Chartable
    categories. A progress line is printed before each request and the
    function sleeps after each one (2s for Apple, 3s for the others).

    Returns:
        A single list holding the rows from every scraper, in call order.
    """
    collected = []

    apple_genres = (
        (26, "All"),
        (1318, "Technology"),
        (1321, "Business"),
        (1325, "News"),
    )
    for apple_genre_id, label in apple_genres:
        print(f"Apple Podcasts: {label}")
        collected += scrape_apple_podcast_charts(genre_id=apple_genre_id)
        time.sleep(2)

    print("Spotify charts")
    collected += scrape_spotify_podcast_charts()
    time.sleep(3)

    for chart_category in ("overall", "technology", "business"):
        print(f"Chartable: {chart_category}")
        collected += scrape_chartable(chart_category)
        time.sleep(3)

    return collected

def analyze_cross_platform(data):
    """Print a cross-platform summary of scraped chart rows.

    Args:
        data: List of row dicts. Each row must carry ``name``, ``platform``
            and ``rank``; ``genre`` is optional. An empty list is accepted.

    Prints the number of podcast names seen on more than one platform, up
    to 15 of them with their per-platform ranks, and (when a ``genre``
    column exists) the ten most frequent non-empty genres. Returns None.
    """
    df = pd.DataFrame(data)

    print("=== Cross-Platform Podcast Rankings ===\n")

    # An empty frame has no columns at all, so the groupby below would raise
    # KeyError; report the situation instead of crashing.
    if df.empty:
        print("No chart data collected; nothing to analyze.")
        return

    # Number of distinct platforms each podcast name appears on.
    name_counts = df.groupby("name")["platform"].nunique()
    multi_platform = name_counts[name_counts > 1].index

    print(f"Podcasts on multiple platforms: {len(multi_platform)}\n")

    for name in list(multi_platform)[:15]:
        entries = df[df["name"] == name][["platform", "rank"]].to_dict("records")
        platforms = ", ".join(f"{e['platform']} #{e['rank']}" for e in entries)
        print(f"  {name}: {platforms}")

    print("\n=== By Genre/Category ===\n")
    if "genre" in df.columns:
        genre_counts = df[df["genre"] != ""].groupby("genre").size()
        print(genre_counts.sort_values(ascending=False).head(10))

# Example run: scrape every configured chart, then print the cross-platform
# summary. NOTE(review): these are top-level statements, so they execute
# (including the network requests made by the scrapers) as soon as the code
# is run or imported.
data = collect_all_charts()
analyze_cross_platform(data)

Tracking Changes Over Time

import sqlite3

def track_chart_movement(db_path="podcast_charts.db"):
    """Scrape all charts and append the rows to a SQLite history table.

    Creates the ``chart_history`` table on first use, inserts one record
    per scraped row, then prints the total number of rows stored.

    Args:
        db_path: Path of the SQLite database file.

    Rows that collide with the table's
    ``UNIQUE(name, platform, category, scraped_at)`` constraint are
    skipped. Rows without a ``category`` key are stored with ``""`` and
    rows without a ``country`` key with ``"us"``.
    """
    conn = sqlite3.connect(db_path)
    # try/finally so the connection is released even if scraping or an
    # insert raises part-way through.
    try:
        conn.execute("""
            CREATE TABLE IF NOT EXISTS chart_history (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                name TEXT, rank INTEGER, platform TEXT,
                category TEXT, country TEXT, scraped_at TEXT,
                UNIQUE(name, platform, category, scraped_at)
            )
        """)

        rows = [
            (item["name"], item["rank"], item["platform"],
             item.get("category", ""), item.get("country", "us"),
             item["scraped_at"])
            for item in collect_all_charts()
        ]
        # OR IGNORE drops rows that violate the UNIQUE constraint, replacing
        # the per-row try/except around each INSERT.
        conn.executemany(
            """INSERT OR IGNORE INTO chart_history
            (name, rank, platform, category, country, scraped_at)
            VALUES (?, ?, ?, ?, ?, ?)""",
            rows,
        )
        conn.commit()

        # Count in SQL rather than loading the whole table just for its length.
        total = conn.execute("SELECT COUNT(*) FROM chart_history").fetchone()[0]
        print(f"\nTotal chart entries tracked: {total}")
    finally:
        conn.close()

Proxy Strategy

Apple Podcasts RSS endpoints are open and don't need proxies. Spotify and Chartable are different: the scrapers above fetch both through ScraperAPI with JavaScript rendering enabled (`render=true`), which handles both platforms well. For geo-specific charts, ThorData residential proxies let you scrape from different countries. Monitor pipeline health with ScrapeOps.

Conclusion

Podcast chart data reveals cultural trends, advertising opportunities, and content strategy insights. With cross-platform tracking, you see the full picture that no single platform gives you on its own.

DEV Community