Scraping Music Royalty and Licensing Data with Python

#python #tutorial #webdev #programming

Music royalty data is one of the most fragmented datasets in any industry. Rights holders, publishers, PROs (Performance Rights Organizations), and streaming platforms all track different slices of the same pie. Scraping and unifying this data unlocks real value for artists, labels, and music tech companies.

Why Music Royalty Data?

- Artists want to verify they're getting paid correctly
- Labels need to audit sub-publisher payments across territories
- Music tech startups build royalty tracking dashboards
- Investors use streaming data for catalog valuation

The data exists publicly on sites like ASCAP, BMI, and streaming analytics platforms, but nobody aggregates it well.

Setting Up

pip install requests beautifulsoup4 pandas

Scraping Public Performance Data

ASCAP and BMI both offer public repertory searches:

import os
import time
from datetime import datetime, timezone
from urllib.parse import quote, urlencode

import pandas as pd
import requests
from bs4 import BeautifulSoup

# ScraperAPI key sent with every request below. It is read from the
# SCRAPER_API_KEY environment variable so a real key never has to be written
# into this file; the "YOUR_KEY" placeholder is kept as the fallback.
SCRAPER_API_KEY = os.environ.get("SCRAPER_API_KEY", "YOUR_KEY")

def search_ascap_repertory(title: str = "", writer: str = "") -> list[dict]:
    """Search ASCAP's public repertory and return the works found on the page.

    The ASCAP search URL is fetched through ScraperAPI with ``render=true``
    and the returned HTML is parsed with BeautifulSoup.

    Args:
        title: Work title to search for.
        writer: Value for the ``writersPerformers`` search field.

    Returns:
        One dict per ``.search-results-row`` element, with the keys
        ``title``, ``writers``, ``publishers``, ``iswc``, ``source``
        (always ``"ASCAP"``) and ``scraped_at`` (timezone-aware UTC
        timestamp in ISO 8601 format). ``title`` and ``iswc`` are ``""``
        when the row has no matching element.

    Raises:
        requests.HTTPError: If ScraperAPI answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    # Percent-encode the user-supplied terms so characters such as "&", "#"
    # or spaces cannot break the structure of the target URL.
    target_url = (
        "https://www.ascap.com/repertory#/ace/search/workID/"
        f"&title={quote(title, safe='')}"
        "&performer="
        f"&writersPerformers={quote(writer, safe='')}"
    )
    params = {
        "api_key": SCRAPER_API_KEY,
        "url": target_url,
        "render": "true",
    }

    resp = requests.get("https://api.scraperapi.com", params=params, timeout=60)
    # Fail loudly instead of parsing an error page into an empty result list.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # All rows come from the same response, so they share one timestamp.
    scraped_at = datetime.now(timezone.utc).isoformat()

    works = []
    for row in soup.select(".search-results-row"):
        # Look each optional element up once and reuse the result.
        title_node = row.select_one(".work-title")
        iswc_node = row.select_one(".iswc")
        works.append({
            "title": title_node.get_text(strip=True) if title_node else "",
            "writers": [w.get_text(strip=True) for w in row.select(".writer-name")],
            "publishers": [p.get_text(strip=True) for p in row.select(".publisher-name")],
            "iswc": iswc_node.get_text(strip=True) if iswc_node else "",
            "source": "ASCAP",
            "scraped_at": scraped_at,
        })
    return works

def search_bmi_repertory(title: str = "", writer: str = "") -> list[dict]:
    """Search BMI's public repertoire and return the works found on the page.

    The BMI search URL is fetched through ScraperAPI with ``render=true``
    and the result table is parsed with BeautifulSoup.

    Args:
        title: Value for the ``Main_Search_Text`` query parameter.
        writer: Value for the ``Sub_Search_Text`` query parameter.

    Returns:
        One dict per table row that has at least four cells, with the keys
        ``title``, ``writers``, ``publishers``, ``work_id``, ``source``
        (always ``"BMI"``) and ``scraped_at`` (timezone-aware UTC timestamp
        in ISO 8601 format). ``writers`` and ``publishers`` are lists of
        the non-empty, whitespace-trimmed names found between ``/``
        separators.

    Raises:
        requests.HTTPError: If ScraperAPI answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """

    def split_names(text: str) -> list[str]:
        # "A / B" -> ["A", "B"]; an empty cell yields [] rather than [""].
        return [name.strip() for name in text.split("/") if name.strip()]

    # urlencode escapes the user-supplied terms so "&", "=" or spaces in a
    # title cannot corrupt the query string of the target URL.
    query = urlencode({
        "Main_Search_Text": title,
        "Sub_Search_Text": writer,
        "Search_Type": "all",
    })
    params = {
        "api_key": SCRAPER_API_KEY,
        "url": f"https://repertoire.bmi.com/Search/Search?{query}",
        "render": "true",
    }
    resp = requests.get("https://api.scraperapi.com", params=params, timeout=60)
    # Fail loudly instead of parsing an error page into an empty result list.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # All rows come from the same response, so they share one timestamp.
    scraped_at = datetime.now(timezone.utc).isoformat()

    works = []
    for row in soup.select(".search-results tbody tr"):
        cells = row.select("td")
        if len(cells) < 4:
            # Skip rows that do not carry the four expected columns.
            continue
        works.append({
            "title": cells[0].get_text(strip=True),
            "writers": split_names(cells[1].get_text(strip=True)),
            "publishers": split_names(cells[2].get_text(strip=True)),
            "work_id": cells[3].get_text(strip=True),
            "source": "BMI",
            "scraped_at": scraped_at,
        })
    return works

Streaming Royalty Rate Calculator

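Combine scraped data with approximate per-stream royalty rates (the figures below are rough estimates, not official rates, so treat the output as a ballpark):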

# Estimated payout per stream, keyed by lowercase platform identifier.
STREAMING_RATES = {
    "spotify": 0.004,
    "apple_music": 0.008,
    "amazon_music": 0.004,
    "youtube_music": 0.002,
    "tidal": 0.013,
    "deezer": 0.004,
}

# Rate applied to any platform that has no entry in STREAMING_RATES.
DEFAULT_STREAM_RATE = 0.004


def estimate_royalties(streams_by_platform: dict,
                       writer_share: float = 0.5) -> dict:
    """Estimate gross revenue and the writer's cut from per-platform streams.

    Args:
        streams_by_platform: Mapping of platform name to stream count.
            Names are matched against ``STREAMING_RATES`` ignoring case and
            surrounding whitespace; unknown platforms use
            ``DEFAULT_STREAM_RATE``.
        writer_share: Fraction of the gross revenue attributed to the
            writer.

    Returns:
        A dict with:
          * ``total_gross``: sum of all platforms' gross revenue, rounded
            to 2 decimals.
          * ``total_writer_share``: sum of all platforms' writer amounts,
            rounded to 2 decimals.
          * ``breakdown``: per-platform dict (keyed exactly as passed in)
            with ``streams``, ``rate_per_stream``, ``gross_revenue`` and
            ``writer_share``.
    """
    total_gross = 0
    total_writer = 0
    breakdown = {}

    for platform, stream_count in streams_by_platform.items():
        # Normalise only the lookup key so "Tidal" or " spotify " still get
        # their own rate instead of silently falling back to the default.
        rate_key = str(platform).strip().lower()
        rate = STREAMING_RATES.get(rate_key, DEFAULT_STREAM_RATE)
        gross = stream_count * rate
        writer_amount = gross * writer_share

        breakdown[platform] = {
            "streams": stream_count,
            "rate_per_stream": rate,
            "gross_revenue": round(gross, 2),
            "writer_share": round(writer_amount, 2),
        }
        # Accumulate unrounded values; round once at the end so per-platform
        # rounding errors do not add up in the totals.
        total_gross += gross
        total_writer += writer_amount

    return {
        "total_gross": round(total_gross, 2),
        "total_writer_share": round(total_writer, 2),
        "breakdown": breakdown,
    }

# Example run: estimate the gross for one track's stream counts on three
# platforms, using the default writer share.
result = estimate_royalties(
    dict(spotify=1_000_000, apple_music=500_000, youtube_music=2_000_000)
)
print(f"Estimated gross: ${result['total_gross']:,.2f}")

Cross-Referencing Rights Data

def cross_reference_works(title: str, writer: str = "",
                          delay_seconds: float = 3.0) -> pd.DataFrame:
    """Look a work up at ASCAP and BMI and combine the results in one table.

    Args:
        title: Work title to search for at both PROs.
        writer: Optional writer name passed through to both searches.
        delay_seconds: Pause between the ASCAP and the BMI request. Values
            of zero or less skip the pause.

    Returns:
        A DataFrame with one row per registration returned by
        ``search_ascap_repertory`` and ``search_bmi_repertory`` (ASCAP rows
        first). It is empty when neither search returns anything.
    """
    print(f"Searching ASCAP for '{title}'...")
    ascap = search_ascap_repertory(title=title, writer=writer)

    # Space the two lookups out; presumably this keeps the request rate
    # polite. Negative values would make time.sleep raise, so guard them.
    if delay_seconds > 0:
        time.sleep(delay_seconds)

    print(f"Searching BMI for '{title}'...")
    bmi = search_bmi_repertory(title=title, writer=writer)

    df = pd.DataFrame(ascap + bmi)

    if df.empty:
        # Make the "nothing found" case visible instead of returning silently.
        print(f"No registrations found for '{title}'")
    else:
        print(f"Found {len(df)} registrations across {df['source'].nunique()} PROs")

    return df

# Example run: look the same title up at both PROs.
cross_reference_works(title="Shape of You")

Proxy Requirements

Music industry sites use aggressive anti-bot measures. ScraperAPI handles the JavaScript rendering that the ASCAP and BMI search pages require. For geo-restricted streaming analytics, ThorData residential proxies let you access country-specific charts. You can track pipeline health with ScrapeOps.

Business Applications

- Royalty audit service for independent artists
- Catalog valuation tool for music investors
- Rights conflict detection across PROs
- Streaming trend analytics dashboard for labels

The music data space is ripe for disruption. Major labels have internal tools for this, but independent artists and small labels are flying blind. Build the tool they need.