Introduction
Music streaming platforms generate massive amounts of data — from chart rankings to artist statistics. Whether you're building a music analytics dashboard, tracking emerging artists, or analyzing genre trends, scraping streaming data opens up powerful insights.
In this tutorial, we'll build a Python scraper that collects Spotify chart data and artist statistics from publicly available sources.
Setting Up the Environment
# HTTP client, HTML parser, and tabular analysis are the core scraping stack.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import time
# Configuration for a scraping API used to fetch JavaScript-rendered pages
# through rotating proxies; obtain a key from your chosen provider.
SCRAPER_API_KEY = "your_key_here"  # TODO: replace with a real API key before running
BASE_URL = "http://api.scraperapi.com"  # every request below is proxied through this endpoint
Scraping Spotify Charts Data
Spotify's public chart pages display top tracks by country and globally. Let's build a scraper for these:
def scrape_spotify_charts(country="global"):
    """Scrape the latest weekly top-tracks chart for one region.

    Args:
        country: Spotify region code (e.g. "global", "us", "gb").

    Returns:
        List of dicts with keys "rank" (int), "title", "artist", "streams".

    Raises:
        requests.HTTPError: if the proxied request returns an error status.
    """
    url = f"https://charts.spotify.com/charts/view/regional-{country}-weekly/latest"
    params = {
        "api_key": SCRAPER_API_KEY,
        "url": url,
        "render": "true",  # chart rows are injected client-side, so JS rendering is required
    }
    # Bound the network call: a hung proxy request would otherwise block forever.
    response = requests.get(BASE_URL, params=params, timeout=60)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    tracks = []
    for row in soup.select("tr[data-testid]"):
        rank = row.select_one(".rank")
        title = row.select_one(".track-name")
        artist = row.select_one(".artist-name")
        streams = row.select_one(".streams")
        # Skip malformed rows rather than crash on a None .text access.
        if not (rank and title and artist and streams):
            continue
        tracks.append({
            "rank": int(rank.text.strip()),
            "title": title.text.strip(),
            "artist": artist.text.strip(),
            "streams": streams.text.strip(),
        })
    return tracks
Collecting Artist Statistics
Beyond charts, artist profile pages contain monthly listeners, follower counts, and popular tracks:
def scrape_artist_stats(artist_id):
    """Collect public statistics from one artist profile page.

    Args:
        artist_id: Spotify artist ID (the token after /artist/ in the URL).

    Returns:
        Dict with the artist id plus monthly listeners, top tracks and
        related artists as parsed by the extract_* helpers.

    Raises:
        requests.HTTPError: if the proxied request returns an error status.
    """
    url = f"https://open.spotify.com/artist/{artist_id}"
    # Rotating/residential proxies (via the scraping API) reduce rate limiting
    # on profile pages.
    params = {
        "api_key": SCRAPER_API_KEY,
        "url": url,
        "render": "true",
    }
    response = requests.get(BASE_URL, params=params, timeout=60)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    return {
        "artist_id": artist_id,
        "monthly_listeners": extract_listeners(soup),
        "top_tracks": extract_top_tracks(soup),
        "related_artists": extract_related(soup),
    }
def extract_listeners(soup):
    """Parse the monthly-listener count from an artist page.

    Args:
        soup: parsed artist-page document (anything exposing .select_one).

    Returns:
        The listener count as an int, or None when the element is missing
        or its text is not a plain number (e.g. after a markup change or
        an abbreviated count like "1.2M").
    """
    listener_elem = soup.select_one("[data-testid=monthly-listeners]")
    if listener_elem is None:
        return None
    text = listener_elem.text.replace(",", "").replace(" monthly listeners", "")
    try:
        return int(text)
    except ValueError:
        # Fail soft instead of crashing the whole artist scrape.
        return None
def extract_top_tracks(soup):
    """Extract the artist's top tracks together with their play counts.

    Rows missing either the track name or the play count are skipped.
    Returns a list of {"name": ..., "plays": ...} dicts.
    """
    results = []
    for row in soup.select("[data-testid=top-tracks] tr"):
        name_cell = row.select_one(".track-name")
        plays_cell = row.select_one(".play-count")
        if name_cell is None or plays_cell is None:
            continue
        results.append({
            "name": name_cell.text.strip(),
            "plays": plays_cell.text.strip(),
        })
    return results
Building a Trend Tracker
The real value comes from tracking changes over time:
def track_chart_movements(countries=None, interval_days=7):
    """Scrape the current charts for several countries and snapshot them to CSV.

    Args:
        countries: region codes to scrape; defaults to a global/US/EU/APAC mix.
        interval_days: intended re-scrape cadence in days.
            NOTE(review): currently unused inside the function — kept for
            interface compatibility; drive the cadence from an external
            scheduler (cron, Airflow, ...).

    Returns:
        DataFrame of all scraped rows (also written to charts_YYYYMMDD.csv).
    """
    if countries is None:
        countries = ["global", "us", "gb", "de", "jp", "br"]
    # One snapshot date for the whole run, hoisted out of the per-track loop.
    snapshot_date = pd.Timestamp.now().strftime("%Y-%m-%d")
    all_data = []
    for country in countries:
        print(f"Scraping charts for {country}...")
        tracks = scrape_spotify_charts(country)
        for track in tracks:
            track["country"] = country
            track["date"] = snapshot_date
            all_data.append(track)
        time.sleep(2)  # Respect rate limits between countries
    df = pd.DataFrame(all_data)
    df.to_csv(f"charts_{pd.Timestamp.now().strftime('%Y%m%d')}.csv", index=False)
    return df
def analyze_trends(historical_dir="./data"):
    """Summarise chart movement from saved charts_*.csv snapshots.

    Args:
        historical_dir: directory containing charts_YYYYMMDD.csv files as
            produced by track_chart_movements().

    Returns:
        DataFrame of the 20 biggest climbers: one row per (title, artist)
        with best_rank, worst_rank, appearances and climb (rank delta).
        An empty DataFrame when no snapshot files exist.
    """
    import glob
    files = glob.glob(f"{historical_dir}/charts_*.csv")
    if not files:
        # pd.concat raises ValueError on an empty list; return an empty,
        # correctly-shaped frame instead so callers can iterate safely.
        return pd.DataFrame(
            columns=["title", "artist", "best_rank", "worst_rank", "appearances", "climb"]
        )
    all_charts = pd.concat([pd.read_csv(f) for f in files])
    risers = all_charts.groupby(["title", "artist"]).agg(
        best_rank=("rank", "min"),
        worst_rank=("rank", "max"),
        appearances=("rank", "count"),
    ).reset_index()
    # A large spread between worst and best rank means a fast climb.
    risers["climb"] = risers["worst_rank"] - risers["best_rank"]
    return risers.sort_values("climb", ascending=False).head(20)
Monitoring Multiple Platforms
For comprehensive analysis, scrape across platforms and compare:
def cross_platform_compare(artist_name):
    """Search for an artist on several streaming platforms and parse each result page.

    Args:
        artist_name: free-text artist name; URL-encoded before being
            interpolated into the platform search URLs.

    Returns:
        Dict mapping platform name -> parsed data from parse_platform_data().

    Raises:
        requests.HTTPError: if a proxied request returns an error status.
    """
    from urllib.parse import quote
    # Encode the name so spaces and special characters don't break the URLs.
    query = quote(artist_name)
    platforms = {
        "spotify": f"https://open.spotify.com/search/{query}",
        "apple_music": f"https://music.apple.com/search?term={query}",
        "deezer": f"https://www.deezer.com/search/{query}",
    }
    results = {}
    for platform, url in platforms.items():
        params = {"api_key": SCRAPER_API_KEY, "url": url, "render": "true"}
        response = requests.get(BASE_URL, params=params, timeout=60)
        response.raise_for_status()
        results[platform] = parse_platform_data(platform, response.text)
        time.sleep(3)  # stay polite between platform requests
    return results
Data Storage and Analysis
import sqlite3
def store_chart_data(tracks, db_path="music_data.db"):
    """Append scraped chart rows to a SQLite "charts" table.

    Args:
        tracks: list of per-track dicts (as produced by scrape_spotify_charts).
            An empty list is a no-op.
        db_path: SQLite file to create or extend.
    """
    if not tracks:
        # A DataFrame built from [] has no columns, which breaks to_sql.
        return
    conn = sqlite3.connect(db_path)
    try:
        pd.DataFrame(tracks).to_sql("charts", conn, if_exists="append", index=False)
    finally:
        # Close even when to_sql raises so the db file isn't left locked.
        conn.close()
Conclusion
By combining chart scraping, artist statistics, and cross-platform comparison, you can build powerful music analytics tools. Remember to respect robots.txt and each platform's terms of service, use rate limiting, and consider a scraping API or proxy service for handling JavaScript rendering and proxy rotation at scale.
The complete code with scheduling and visualization is available in the examples above — adapt it to track the genres and markets that matter to your analysis.
Top comments (0)