Podcasts publish new episodes through RSS feeds, but not all podcast data is easily accessible. In this tutorial, we'll build a podcast episode tracker that combines RSS parsing with web scraping to aggregate episodes, transcripts, and metadata from multiple sources.
Why Build a Podcast Tracker?
- Never miss episodes from your favorite shows
- Search across podcasts — find episodes on specific topics
- Track publishing patterns — when do shows release new content?
- Build a personal archive of episode metadata and show notes
Setting Up
pip install feedparser requests beautifulsoup4 pandas
Parsing RSS Feeds
Most podcasts provide RSS feeds. The feedparser library makes parsing them trivial.
import feedparser
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional
@dataclass
class Episode:
    """A single podcast episode parsed from one RSS feed entry."""

    title: str                 # episode title from the feed entry
    published: str             # raw publication date string (RFC 2822 in most feeds)
    duration: Optional[str]    # iTunes duration tag, if the feed provides one
    description: str           # summary text (the parser truncates it to 500 chars)
    audio_url: Optional[str]   # href of the first audio-typed link, if found
    show_name: str             # display name of the podcast this episode belongs to
    link: str                  # web page URL for the episode
def parse_podcast_feed(feed_url, show_name=None):
    """Fetch a podcast RSS feed and return its entries as Episode objects.

    When ``show_name`` is given it overrides the feed's own title;
    otherwise the feed title (or "Unknown") is used.
    """
    parsed = feedparser.parse(feed_url)
    resolved_name = show_name or parsed.feed.get("title", "Unknown")

    def first_audio_link(entry):
        # Return the href of the first link whose MIME type mentions audio.
        for candidate in entry.get("links", []):
            if "audio" in candidate.get("type", ""):
                return candidate["href"]
        return None

    return [
        Episode(
            title=entry.get("title", ""),
            published=entry.get("published", ""),
            duration=entry.get("itunes_duration", None),
            description=entry.get("summary", "")[:500],
            audio_url=first_audio_link(entry),
            show_name=resolved_name,
            link=entry.get("link", ""),
        )
        for entry in parsed.entries
    ]
# Example: Parse a tech podcast feed
# (requires network access; prints the first three entries in feed order)
episodes = parse_podcast_feed("https://feeds.simplecast.com/54nAGcIl", "The Changelog")
for ep in episodes[:3]:
    print(f"{ep.show_name}: {ep.title} ({ep.published})")
Multi-Feed Aggregation
import time
class PodcastTracker:
    """Aggregates episodes from multiple podcast RSS feeds.

    Feeds are registered with add_feed() and fetched with refresh_all();
    search() and latest() operate on the episodes cached in memory.
    """

    def __init__(self):
        self.feeds = {}          # feed display name -> feed URL
        self.all_episodes = []   # Episode objects from the most recent refresh

    def add_feed(self, name, url):
        """Register a feed URL under a display name."""
        self.feeds[name] = url

    def refresh_all(self):
        """Re-fetch every registered feed, replacing the cached episode list.

        A failure on one feed is reported and skipped so it cannot abort
        the whole refresh. Returns the combined episode list.
        """
        self.all_episodes = []
        for name, url in self.feeds.items():
            try:
                episodes = parse_podcast_feed(url, name)
                self.all_episodes.extend(episodes)
                print(f"Fetched {len(episodes)} episodes from {name}")
                time.sleep(1)  # be polite to feed servers between requests
            except Exception as e:
                print(f"Error fetching {name}: {e}")
        return self.all_episodes

    def search(self, keyword):
        """Return cached episodes whose title or description contains
        ``keyword`` (case-insensitive substring match)."""
        keyword = keyword.lower()
        return [
            ep for ep in self.all_episodes
            if keyword in ep.title.lower() or keyword in ep.description.lower()
        ]

    @staticmethod
    def _published_timestamp(episode):
        """Parse the episode's RFC 2822 ``published`` string to a sortable
        POSIX timestamp; unparseable or missing dates sort last (-inf)."""
        from email.utils import parsedate_to_datetime
        try:
            return parsedate_to_datetime(episode.published).timestamp()
        except (TypeError, ValueError):
            return float("-inf")

    def latest(self, n=10):
        """Return the ``n`` most recently published cached episodes.

        Fix: previously sorted on the raw ``published`` string, which
        orders lexicographically ("Fri..." before "Mon...") rather than
        chronologically; now sorts on the parsed date.
        """
        return sorted(
            self.all_episodes,
            key=self._published_timestamp,
            reverse=True,
        )[:n]
# Example: register three feeds, fetch them all, then search the cache.
tracker = PodcastTracker()
tracker.add_feed("Changelog", "https://feeds.simplecast.com/54nAGcIl")
tracker.add_feed("Talk Python", "https://talkpython.fm/episodes/rss")
tracker.add_feed("Python Bytes", "https://pythonbytes.fm/episodes/rss")
all_eps = tracker.refresh_all()
print(f"\nTotal episodes: {len(all_eps)}")
# Case-insensitive substring search over titles and descriptions.
ai_episodes = tracker.search("machine learning")
print(f"ML episodes: {len(ai_episodes)}")
Scraping Additional Metadata
Some podcast directories have data not available in RSS feeds:
import requests
from bs4 import BeautifulSoup
def scrape_podcast_page(url, api_key=None, timeout=15):
    """Scrape basic metadata (title, rating) from a podcast directory page.

    Args:
        url: Page to scrape.
        api_key: Optional ScraperAPI key; when given, the request is
            proxied through their endpoint.
        timeout: Seconds to wait for the HTTP response (new parameter with
            a default, so a hung server cannot block the caller forever).

    Returns:
        dict with 'title', 'rating', 'review_count', 'categories' keys;
        values stay empty for anything not found on the page.
    """
    from urllib.parse import urlencode

    if api_key:
        # Fix: URL-encode the target URL. The old f-string interpolation broke
        # whenever `url` itself contained a query string ('?' or '&').
        fetch_url = "http://api.scraperapi.com/?" + urlencode(
            {"api_key": api_key, "url": url}
        )
    else:
        fetch_url = url

    response = requests.get(fetch_url, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")

    data = {
        "title": "",
        "rating": "",
        "review_count": "",
        "categories": [],
    }

    title_el = soup.find("h1")
    if title_el:
        data["title"] = title_el.get_text(strip=True)

    # Match any <span> whose class mentions "rating"; bool(c) guards against
    # elements with no class attribute (c is None).
    rating_el = soup.find("span", class_=lambda c: bool(c) and "rating" in c.lower())
    if rating_el:
        data["rating"] = rating_el.get_text(strip=True)

    return data
Using the iTunes Search API
Apple's iTunes API is a free way to discover podcasts:
def search_itunes(query, limit=10, timeout=10):
    """Search Apple's iTunes Search API for podcasts matching ``query``.

    Args:
        query: Free-text search term.
        limit: Maximum number of results to request.
        timeout: Seconds to wait for the HTTP response (new parameter
            with a default; previously the request could hang forever).

    Returns:
        List of dicts with name, artist, feed_url, genre, episode_count.
    """
    url = "https://itunes.apple.com/search"
    params = {
        "term": query,
        "media": "podcast",
        "limit": limit,
    }
    response = requests.get(url, params=params, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to JSON-decode an error body.
    response.raise_for_status()
    data = response.json()
    podcasts = []
    for result in data.get("results", []):
        podcasts.append({
            # .get() throughout: the API occasionally omits fields, and a
            # bare result["..."] KeyError previously aborted the whole list.
            "name": result.get("collectionName", ""),
            "artist": result.get("artistName", ""),
            "feed_url": result.get("feedUrl"),
            "genre": result.get("primaryGenreName"),
            "episode_count": result.get("trackCount", 0),
        })
    return podcasts
# Example: discover Python podcasts via the iTunes Search API (network required).
python_podcasts = search_itunes("python programming")
for pod in python_podcasts:
    print(f"{pod['name']} by {pod['artist']} ({pod['episode_count']} episodes)")
Storing and Exporting Data
import pandas as pd
import json
def export_episodes(episodes, format="csv"):
    """Export episode objects to podcast_episodes.csv / .json in the CWD.

    Args:
        episodes: Objects with a ``__dict__`` (e.g. Episode instances).
        format: "csv" or "json".

    Returns:
        The pandas DataFrame built from the episodes.

    Raises:
        ValueError: for an unsupported format. (Fix: previously an unknown
        format silently wrote nothing while still printing "Exported ...".)
    """
    data = [vars(ep) for ep in episodes]
    df = pd.DataFrame(data)
    if format == "csv":
        df.to_csv("podcast_episodes.csv", index=False)
    elif format == "json":
        with open("podcast_episodes.json", "w") as f:
            # default=str stringifies anything json can't serialize natively
            json.dump(data, f, indent=2, default=str)
    else:
        raise ValueError(f"Unsupported format: {format!r} (expected 'csv' or 'json')")
    print(f"Exported {len(episodes)} episodes as {format}")
    return df
# Export everything fetched so far and show per-show episode counts.
df = export_episodes(tracker.all_episodes)
print(df.groupby("show_name").size())
Automated Monitoring
import schedule
import hashlib
# md5 fingerprints of episodes that have already been announced
seen_episodes = set()


def check_new_episodes():
    """Refresh every feed and print episodes not announced before."""
    tracker.refresh_all()
    for episode in tracker.all_episodes:
        fingerprint = hashlib.md5(
            f"{episode.show_name}{episode.title}".encode()
        ).hexdigest()
        if fingerprint in seen_episodes:
            continue
        seen_episodes.add(fingerprint)
        print(f"NEW: {episode.show_name} - {episode.title}")


# Re-check all feeds every 30 minutes.
schedule.every(30).minutes.do(check_new_episodes)
Scaling with Proxies
When scraping podcast directories at scale, use a proxy service like ScraperAPI to handle rate limiting and anti-bot protection. For monitoring your scraper health, ScrapeOps provides dashboards to track success rates. And for geo-specific podcast data, ThorData offers residential proxies targeting specific regions.
Conclusion
Combining RSS parsing with web scraping gives you comprehensive podcast tracking capabilities. RSS handles the core episode data, while scraping fills in ratings, reviews, and metadata from directories. This pattern works for any content aggregation project where structured feeds exist alongside unstructured web data.
Happy scraping!
Top comments (0)