DEV Community

agenthustler
agenthustler

Posted on

Bandcamp Music Data Scraping: Extract Artists, Albums, and Prices with Python

Bandcamp is the leading platform for independent musicians to sell their music directly to fans. With millions of artists and albums, Bandcamp data is valuable for music market research, pricing analysis, and discovering emerging artists.

Here's how to scrape Bandcamp data with Python.

Use Cases

  • Music market research: Analyze pricing trends across genres
  • Artist discovery: Find emerging artists by sales data and reviews
  • Pricing strategy: Compare how artists price their work
  • Genre analysis: Map the indie music landscape
  • Label intelligence: Track independent label catalogs

Scraping Album Pages

import requests
from bs4 import BeautifulSoup
import json
import re
import time

# Request headers passed to every requests.get() call made by the scraper
# functions in this file. Only a browser-style User-Agent is set.
# NOTE(review): presumably this avoids the default python-requests
# User-Agent being rejected — confirm against Bandcamp's behaviour.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
}

def scrape_album(album_url, timeout=30):
    """Extract album details from a Bandcamp album page.

    Args:
        album_url: URL of the album page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with the keys ``title``, ``artist``, ``url``, ``price``,
        ``release_date``, ``tracks`` (a list of ``{"title", "duration"}``
        dicts), ``num_tracks``, ``tags``, ``image_url`` and ``description``.
        Values that cannot be found on the page are empty strings or empty
        lists, except ``price``, which falls back to ``"Name Your Price"``.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Without a timeout a stalled connection would block the scraper forever.
    response = requests.get(album_url, headers=HEADERS, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")

    # Bandcamp embeds structured data in JSON-LD. The tag may be empty,
    # malformed, or hold a non-object payload; in all of those cases fall
    # back to an empty dict so the HTML-based fields are still returned.
    data = {}
    ld_json = soup.find("script", {"type": "application/ld+json"})
    if ld_json is not None:
        try:
            parsed = json.loads(ld_json.string or "")
        except (TypeError, ValueError):
            parsed = None
        if isinstance(parsed, dict):
            data = parsed

    def _text(element, default=""):
        """Return the stripped text of *element*, or *default* if missing."""
        return element.text.strip() if element else default

    # Extract from page elements
    title = soup.select_one("h2.trackTitle")
    artist = soup.select_one("span[itemprop='byArtist'] a")
    price_el = soup.select_one(".buyItem .base-text-color")

    # Get track listing
    tracks = [
        {
            "title": _text(row.select_one(".track-title")),
            "duration": _text(row.select_one(".time")),
        }
        for row in soup.select("table.track_list tr.track_row_view")
    ]

    # Get tags
    tags = [tag.text.strip() for tag in soup.select(".tralbumData .tag")]

    return {
        # Prefer the visible heading; fall back to the JSON-LD name.
        "title": _text(title, data.get("name", "")),
        "artist": _text(artist),
        "url": album_url,
        "price": _text(price_el, "Name Your Price"),
        "release_date": data.get("datePublished", ""),
        "tracks": tracks,
        "num_tracks": len(tracks),
        "tags": tags,
        "image_url": data.get("image", ""),
        "description": data.get("description", ""),
    }
Enter fullscreen mode Exit fullscreen mode

Scraping Artist Pages

def scrape_artist(artist_url, timeout=30):
    """Extract artist info and discography from a Bandcamp artist page.

    Args:
        artist_url: URL of the artist page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with the keys ``name``, ``location``, ``bio``,
        ``discography`` (a list of ``{"title", "url"}`` dicts) and
        ``album_count``. Text fields missing from the page are empty
        strings. Discography URLs are resolved against ``artist_url`` so
        relative links come back absolute; entries without a link keep
        ``""``.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    from urllib.parse import urljoin

    # Without a timeout a stalled connection would block the scraper forever.
    response = requests.get(artist_url, headers=HEADERS, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")

    # Artist name
    name = soup.select_one("#band-name-location .title")
    location = soup.select_one("#band-name-location .location")

    # Bio
    bio = soup.select_one(".signed-out-artists-bio-text")

    # Discography
    albums = []
    for item in soup.select(".music-grid-item"):
        link = item.select_one("a")
        title = item.select_one(".title")
        href = link.get("href") if link else None
        albums.append({
            "title": title.text.strip() if title else "",
            # urljoin leaves absolute hrefs untouched and completes
            # relative ones, so the result can be fetched directly.
            "url": urljoin(artist_url, href) if href else "",
        })

    return {
        "name": name.text.strip() if name else "",
        "location": location.text.strip() if location else "",
        "bio": bio.text.strip() if bio else "",
        "discography": albums,
        "album_count": len(albums),
    }
Enter fullscreen mode Exit fullscreen mode

Exploring Genre Tags

def scrape_tag_page(tag, page=1, timeout=30):
    """Get albums from one page of a Bandcamp tag/genre listing.

    Args:
        tag: Tag slug inserted into ``https://bandcamp.com/tag/<tag>``.
        page: Page number sent as the ``page`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of dicts with the keys ``title``, ``artist``, ``url`` and
        ``tag``. Text or links missing from an item are empty strings. The
        list is empty when the page contains no matching items.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    url = f"https://bandcamp.com/tag/{tag}?page={page}"
    # Without a timeout a stalled connection would block the scraper forever.
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")

    albums = []
    for item in soup.select(".item_list .item"):
        title = item.select_one(".itemtext")
        artist = item.select_one(".itemsubtext")
        link = item.select_one("a")

        albums.append({
            "title": title.text.strip() if title else "",
            "artist": artist.text.strip() if artist else "",
            # An anchor without an href yields "" instead of a KeyError.
            "url": link["href"] if link and link.get("href") else "",
            "tag": tag,
        })

    return albums


def scrape_genre(tag, max_pages=5):
    """Collect albums for a genre tag across consecutive listing pages.

    Pages 1 through ``max_pages`` are requested in order via
    ``scrape_tag_page``. Collection stops early at the first page that
    returns no albums. After each non-empty page a progress line is printed
    and the function pauses for two seconds before continuing.

    Args:
        tag: Tag passed through to ``scrape_tag_page``.
        max_pages: Highest page number to request.

    Returns:
        A single list containing the albums from every page fetched.
    """
    collected = []
    for offset in range(max_pages):
        page_number = offset + 1
        batch = scrape_tag_page(tag, page_number)
        if not batch:
            # An empty page marks the end of the listing.
            break
        collected += batch
        print(f"Tag '{tag}' page {page_number}: {len(batch)} albums")
        time.sleep(2)
    return collected
Enter fullscreen mode Exit fullscreen mode

Price Analysis

def analyze_pricing(albums_with_details):
    """Analyze pricing patterns across albums.

    Each album's ``price`` string is classified as "name your price",
    "free", or a paid price. Paid prices are the number following a ``$``,
    ``€`` or ``£`` sign. Amounts in different currencies are aggregated
    together without conversion.

    Args:
        albums_with_details: Iterable-sized collection of album dicts; a
            missing or ``None`` ``price`` is treated as an empty string.

    Returns:
        A dict with ``total_albums``, ``paid_albums``, ``name_your_price``,
        ``free_albums``, ``avg_price`` (rounded to 2 decimals),
        ``min_price``, ``max_price`` and ``median_price`` (rounded to 2
        decimals). The four price statistics are 0 when no paid price was
        found.
    """
    import statistics

    # Compiled once instead of on every loop iteration.
    price_pattern = re.compile(r'[\$€£](\d+\.?\d*)')

    prices = []
    name_your_price = 0
    free = 0

    for album in albums_with_details:
        # `or ""` also covers an explicit None value.
        price_str = (album.get("price") or "").lower()
        if "name your price" in price_str:
            name_your_price += 1
        elif "free" in price_str:
            free += 1
        else:
            # Extract numeric price
            match = price_pattern.search(price_str)
            if match:
                prices.append(float(match.group(1)))

    return {
        "total_albums": len(albums_with_details),
        "paid_albums": len(prices),
        "name_your_price": name_your_price,
        "free_albums": free,
        "avg_price": round(sum(prices) / len(prices), 2) if prices else 0,
        "min_price": min(prices) if prices else 0,
        "max_price": max(prices) if prices else 0,
        # statistics.median averages the two middle values for an even
        # count instead of picking the upper one.
        "median_price": round(statistics.median(prices), 2) if prices else 0,
    }
Enter fullscreen mode Exit fullscreen mode

Production Bandcamp Scraping

For large-scale Bandcamp data extraction, the Bandcamp Scraper on Apify handles complex pagination, dynamic content loading, and data normalization automatically. Perfect for building comprehensive music databases.

When scraping at scale, use ThorData proxies to distribute requests across residential IPs and avoid rate limits.

Saving Data

import csv

def save_albums_csv(albums, filename="bandcamp_albums.csv"):
    """Write album dicts to a UTF-8 CSV file, one row per album.

    The ``tracks`` key is dropped and ``tags`` is joined into a single
    comma-separated string. Columns are the union of all album keys in
    first-seen order, so albums with differing keys can be written together;
    a key missing from an album is written as an empty cell.

    Args:
        albums: List of album dicts. Nothing is written when it is empty.
        filename: Path of the CSV file to create or overwrite.
    """
    if not albums:
        return

    # Flatten tracks list for CSV
    flat_albums = []
    for album in albums:
        flat = {k: v for k, v in album.items() if k != "tracks"}
        # `or []` also covers an explicit None value.
        flat["tags"] = ", ".join(album.get("tags") or [])
        flat_albums.append(flat)

    # Taking the header from the first album only would make DictWriter
    # raise ValueError on any later album that carries an extra key.
    fieldnames = list(
        dict.fromkeys(key for flat in flat_albums for key in flat)
    )
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, restval="")
        writer.writeheader()
        writer.writerows(flat_albums)

    print(f"Saved {len(flat_albums)} albums to {filename}")
Enter fullscreen mode Exit fullscreen mode

Best Practices

  1. Use JSON-LD data: Bandcamp embeds structured data — parse it before falling back to scraping the HTML
  2. Rate limit: wait 2-3 seconds between requests
  3. Use ThorData for residential proxies when scraping at volume
  4. Respect artist content: Scrape metadata, not actual audio files
  5. Cache results: Album data rarely changes after release

Conclusion

Bandcamp is a treasure trove of indie music data. From pricing trends to genre analysis, the data powers valuable market insights. Use the techniques above for small projects, or the Bandcamp Scraper on Apify for production workloads.

Happy data mining!

Top comments (0)