DEV Community

agenthustler
agenthustler

Posted on

Scraping Music Streaming Data: Spotify Charts and Artist Stats

Introduction

Music streaming platforms generate massive amounts of data — from chart rankings to artist statistics. Whether you're building a music analytics dashboard, tracking emerging artists, or analyzing genre trends, scraping streaming data opens up powerful insights.

In this tutorial, we'll build a Python scraper that collects Spotify chart data and artist statistics from publicly available sources.

Setting Up the Environment

import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import time

# ScraperAPI is used as a fetching proxy: it can render JavaScript-heavy
# pages server-side before returning HTML to us.
# NOTE(review): replace the placeholder with a real API key before running.
SCRAPER_API_KEY = "your_key_here"
BASE_URL = "http://api.scraperapi.com"
Enter fullscreen mode Exit fullscreen mode

Scraping Spotify Charts Data

Spotify's public chart pages display top tracks by country and globally. Let's build a scraper for these:

def scrape_spotify_charts(country="global"):
    """Scrape the latest weekly top-tracks chart for one region.

    Args:
        country: Spotify region code (e.g. "us", "gb") or "global".

    Returns:
        List of dicts with keys "rank" (int), "title", "artist", and
        "streams" (raw display string, e.g. "1,234,567").

    Raises:
        requests.HTTPError: if the proxy request fails.
    """
    url = f"https://charts.spotify.com/charts/view/regional-{country}-weekly/latest"

    params = {
        "api_key": SCRAPER_API_KEY,
        "url": url,
        "render": "true"  # chart table is injected client-side
    }

    # A timeout keeps one hung connection from stalling the whole run,
    # and raise_for_status avoids silently parsing an error page.
    response = requests.get(BASE_URL, params=params, timeout=90)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    tracks = []
    for row in soup.select("tr[data-testid]"):
        rank = row.select_one(".rank")
        title = row.select_one(".track-name")
        artist = row.select_one(".artist-name")
        streams = row.select_one(".streams")

        # Chart markup changes often; skip incomplete rows instead of
        # crashing on a None element mid-scrape.
        if rank is None or title is None or artist is None or streams is None:
            continue

        tracks.append({
            "rank": int(rank.text.strip()),
            "title": title.text.strip(),
            "artist": artist.text.strip(),
            "streams": streams.text.strip()
        })

    return tracks
Enter fullscreen mode Exit fullscreen mode

Collecting Artist Statistics

Beyond charts, artist profile pages contain monthly listeners, follower counts, and popular tracks:

def scrape_artist_stats(artist_id):
    """Collect public statistics from one artist profile page.

    Args:
        artist_id: Spotify artist ID (the path segment after /artist/).

    Returns:
        Dict with "artist_id" plus the parsed "monthly_listeners",
        "top_tracks", and "related_artists" fields.

    Raises:
        requests.HTTPError: if the proxy request fails.
    """
    url = f"https://open.spotify.com/artist/{artist_id}"

    params = {
        "api_key": SCRAPER_API_KEY,
        "url": url,
        "render": "true"  # profile stats are rendered client-side
    }

    # Fail fast on network stalls and on HTTP errors rather than
    # handing an error page to the extractors below.
    response = requests.get(BASE_URL, params=params, timeout=90)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    stats = {
        "artist_id": artist_id,
        "monthly_listeners": extract_listeners(soup),
        "top_tracks": extract_top_tracks(soup),
        "related_artists": extract_related(soup)
    }

    return stats

def extract_listeners(soup):
    """Parse the monthly-listener count from an artist page.

    Args:
        soup: Parsed artist-page document (BeautifulSoup).

    Returns:
        The count as an int, or None when the element is absent or
        contains no digits (e.g. after a layout change).
    """
    listener_elem = soup.select_one("[data-testid=monthly-listeners]")
    if listener_elem is None:
        return None
    # Keep only digit characters so thousands separators, non-breaking
    # spaces, and the trailing " monthly listeners" label can never
    # break int() parsing.
    digits = "".join(ch for ch in listener_elem.text if ch.isdigit())
    return int(digits) if digits else None

def extract_top_tracks(soup):
    """Extract an artist's top tracks together with their play counts.

    Rows missing either the track-name or the play-count cell are
    skipped. Play counts are returned as raw display strings.
    """
    results = []
    for row in soup.select("[data-testid=top-tracks] tr"):
        name_cell = row.select_one(".track-name")
        count_cell = row.select_one(".play-count")
        # Guard clause mirrors bs4 truthiness: skip missing/empty cells.
        if not (name_cell and count_cell):
            continue
        results.append({
            "name": name_cell.text.strip(),
            "plays": count_cell.text.strip()
        })
    return results
Enter fullscreen mode Exit fullscreen mode

Building a Trend Tracker

The real value comes from tracking changes over time:

def track_chart_movements(countries=None, interval_days=7):
    """Scrape charts for several regions and snapshot them to CSV.

    Args:
        countries: Region codes to scrape; defaults to a fixed set.
        interval_days: Intended re-run cadence in days (informational;
            scheduling happens outside this function).

    Returns:
        DataFrame of all scraped rows; also written to
        charts_YYYYMMDD.csv in the working directory.
    """
    if countries is None:
        countries = ["global", "us", "gb", "de", "jp", "br"]

    # Stamp the run once up front so every row's "date" and the output
    # filename agree even if the run crosses midnight.
    run_ts = pd.Timestamp.now()
    snapshot_date = run_ts.strftime("%Y-%m-%d")

    all_data = []

    for country in countries:
        print(f"Scraping charts for {country}...")
        tracks = scrape_spotify_charts(country)

        for track in tracks:
            track["country"] = country
            track["date"] = snapshot_date
            all_data.append(track)

        time.sleep(2)  # Respect rate limits

    df = pd.DataFrame(all_data)
    df.to_csv(f"charts_{run_ts.strftime('%Y%m%d')}.csv", index=False)

    return df

def analyze_trends(historical_dir="./data"):
    """Summarize chart movement per track across historical snapshots.

    Reads every charts_*.csv in *historical_dir* (as written by
    track_chart_movements) and returns the 20 tracks with the largest
    rank climb (worst rank minus best rank).

    Args:
        historical_dir: Directory containing charts_*.csv snapshots.

    Returns:
        DataFrame with columns title, artist, best_rank, worst_rank,
        appearances, climb — empty when no snapshot files exist.
    """
    import glob

    files = glob.glob(f"{historical_dir}/charts_*.csv")
    if not files:
        # pd.concat([]) raises ValueError; an empty result is friendlier.
        return pd.DataFrame(
            columns=["title", "artist", "best_rank", "worst_rank",
                     "appearances", "climb"]
        )

    all_charts = pd.concat([pd.read_csv(f) for f in files])

    # Aggregate each (title, artist) pair across all snapshots.
    risers = all_charts.groupby(["title", "artist"]).agg(
        best_rank=("rank", "min"),
        worst_rank=("rank", "max"),
        appearances=("rank", "count")
    ).reset_index()

    risers["climb"] = risers["worst_rank"] - risers["best_rank"]
    return risers.sort_values("climb", ascending=False).head(20)
Enter fullscreen mode Exit fullscreen mode

Monitoring Multiple Platforms

For comprehensive analysis, scrape across platforms and compare:

def cross_platform_compare(artist_name):
    """Compare an artist's search presence on several streaming services.

    Fetches each platform's search page through the scraping proxy and
    delegates parsing to parse_platform_data (defined elsewhere).

    Returns:
        Dict mapping platform name -> parsed result for that platform.
    """
    platform_urls = {
        "spotify": f"https://open.spotify.com/search/{artist_name}",
        "apple_music": f"https://music.apple.com/search?term={artist_name}",
        "deezer": f"https://www.deezer.com/search/{artist_name}"
    }

    results = {}
    for platform_name, search_url in platform_urls.items():
        request_params = {
            "api_key": SCRAPER_API_KEY,
            "url": search_url,
            "render": "true"
        }
        page = requests.get(BASE_URL, params=request_params)
        results[platform_name] = parse_platform_data(platform_name, page.text)
        time.sleep(3)  # polite delay between platforms

    return results
Enter fullscreen mode Exit fullscreen mode

Data Storage and Analysis

import sqlite3

def store_chart_data(tracks, db_path="music_data.db"):
    """Append scraped track dicts to the "charts" table in SQLite.

    Args:
        tracks: List of dicts with uniform keys (one per chart row).
        db_path: SQLite database file path (created on first use).
    """
    conn = sqlite3.connect(db_path)
    try:
        df = pd.DataFrame(tracks)
        df.to_sql("charts", conn, if_exists="append", index=False)
    finally:
        # Close even if to_sql raises, so the file handle never leaks.
        conn.close()
Enter fullscreen mode Exit fullscreen mode

Conclusion

By combining chart scraping, artist statistics, and cross-platform comparison, you can build powerful music analytics tools. Remember to respect robots.txt, use rate limiting, and consider using ScraperAPI for handling JavaScript rendering and proxy rotation at scale.

Adapt the code above — adding scheduling and visualization on top as needed — to track the genres and markets that matter to your analysis.

Top comments (0)