Podcasts publish new episodes through RSS feeds, but not all podcast data is easily accessible. In this tutorial, we'll build a podcast episode tracker that combines RSS parsing with web scraping to aggregate episodes, transcripts, and metadata from multiple sources.
Why Build a Podcast Tracker?
- Never miss episodes from your favorite shows
- Search across podcasts — find episodes on specific topics
- Track publishing patterns — when do shows release new content?
- Build a personal archive of episode metadata and show notes
Setting Up
pip install feedparser requests beautifulsoup4 pandas
Parsing RSS Feeds
Most podcasts provide RSS feeds. The feedparser library makes parsing them trivial.
import feedparser
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional
@dataclass
class Episode:
    """A single podcast episode parsed from one RSS feed entry."""

    title: str                 # episode title from the feed entry
    published: str             # raw publication date string (RFC 2822 in most feeds)
    duration: Optional[str]    # iTunes duration tag, if the feed provides one
    description: str           # summary text (the parser truncates it to 500 chars)
    audio_url: Optional[str]   # href of the first audio-typed link, if found
    show_name: str             # display name of the podcast this episode belongs to
    link: str                  # web page URL for the episode
def parse_podcast_feed(feed_url, show_name=None):
    """Fetch a podcast RSS feed and return its entries as Episode objects.

    When ``show_name`` is given it overrides the feed's own title;
    otherwise the feed title (or "Unknown") is used.
    """
    parsed = feedparser.parse(feed_url)
    resolved_name = show_name or parsed.feed.get("title", "Unknown")

    def first_audio_link(entry):
        # Return the href of the first link whose MIME type mentions audio.
        for candidate in entry.get("links", []):
            if "audio" in candidate.get("type", ""):
                return candidate["href"]
        return None

    return [
        Episode(
            title=entry.get("title", ""),
            published=entry.get("published", ""),
            duration=entry.get("itunes_duration", None),
            description=entry.get("summary", "")[:500],
            audio_url=first_audio_link(entry),
            show_name=resolved_name,
            link=entry.get("link", ""),
        )
        for entry in parsed.entries
    ]
# Example: Parse a tech podcast feed
# (requires network access; prints the first three entries in feed order)
episodes = parse_podcast_feed("https://feeds.simplecast.com/54nAGcIl", "The Changelog")
for ep in episodes[:3]:
    print(f"{ep.show_name}: {ep.title} ({ep.published})")
Multi-Feed Aggregation
import time
class PodcastTracker:
    """Aggregates episodes from multiple podcast RSS feeds.

    Feeds are registered with add_feed() and fetched with refresh_all();
    search() and latest() operate on the episodes cached in memory.
    """

    def __init__(self):
        self.feeds = {}          # feed display name -> feed URL
        self.all_episodes = []   # Episode objects from the most recent refresh

    def add_feed(self, name, url):
        """Register a feed URL under a display name."""
        self.feeds[name] = url

    def refresh_all(self):
        """Re-fetch every registered feed, replacing the cached episode list.

        A failure on one feed is reported and skipped so it cannot abort
        the whole refresh. Returns the combined episode list.
        """
        self.all_episodes = []
        for name, url in self.feeds.items():
            try:
                episodes = parse_podcast_feed(url, name)
                self.all_episodes.extend(episodes)
                print(f"Fetched {len(episodes)} episodes from {name}")
                time.sleep(1)  # be polite to feed servers between requests
            except Exception as e:
                print(f"Error fetching {name}: {e}")
        return self.all_episodes

    def search(self, keyword):
        """Return cached episodes whose title or description contains
        ``keyword`` (case-insensitive substring match)."""
        keyword = keyword.lower()
        return [
            ep for ep in self.all_episodes
            if keyword in ep.title.lower() or keyword in ep.description.lower()
        ]

    @staticmethod
    def _published_timestamp(episode):
        """Parse the episode's RFC 2822 ``published`` string to a sortable
        POSIX timestamp; unparseable or missing dates sort last (-inf)."""
        from email.utils import parsedate_to_datetime
        try:
            return parsedate_to_datetime(episode.published).timestamp()
        except (TypeError, ValueError):
            return float("-inf")

    def latest(self, n=10):
        """Return the ``n`` most recently published cached episodes.

        Fix: previously sorted on the raw ``published`` string, which
        orders lexicographically ("Fri..." before "Mon...") rather than
        chronologically; now sorts on the parsed date.
        """
        return sorted(
            self.all_episodes,
            key=self._published_timestamp,
            reverse=True,
        )[:n]
# Example: register three feeds, fetch them all, then search the cache.
tracker = PodcastTracker()
tracker.add_feed("Changelog", "https://feeds.simplecast.com/54nAGcIl")
tracker.add_feed("Talk Python", "https://talkpython.fm/episodes/rss")
tracker.add_feed("Python Bytes", "https://pythonbytes.fm/episodes/rss")
all_eps = tracker.refresh_all()
print(f"\nTotal episodes: {len(all_eps)}")
# Case-insensitive substring search over titles and descriptions.
ai_episodes = tracker.search("machine learning")
print(f"ML episodes: {len(ai_episodes)}")
Scraping Additional Metadata
Some podcast directories have data not available in RSS feeds:
import requests
from bs4 import BeautifulSoup
def scrape_podcast_page(url, api_key=None, timeout=15):
    """Scrape basic metadata (title, rating) from a podcast directory page.

    Args:
        url: Page to scrape.
        api_key: Optional ScraperAPI key; when given, the request is
            proxied through their endpoint.
        timeout: Seconds to wait for the HTTP response (new parameter with
            a default, so a hung server cannot block the caller forever).

    Returns:
        dict with 'title', 'rating', 'review_count', 'categories' keys;
        values stay empty for anything not found on the page.
    """
    from urllib.parse import urlencode

    if api_key:
        # Fix: URL-encode the target URL. The old f-string interpolation broke
        # whenever `url` itself contained a query string ('?' or '&').
        fetch_url = "http://api.scraperapi.com/?" + urlencode(
            {"api_key": api_key, "url": url}
        )
    else:
        fetch_url = url

    response = requests.get(fetch_url, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")

    data = {
        "title": "",
        "rating": "",
        "review_count": "",
        "categories": [],
    }

    title_el = soup.find("h1")
    if title_el:
        data["title"] = title_el.get_text(strip=True)

    # Match any <span> whose class mentions "rating"; bool(c) guards against
    # elements with no class attribute (c is None).
    rating_el = soup.find("span", class_=lambda c: bool(c) and "rating" in c.lower())
    if rating_el:
        data["rating"] = rating_el.get_text(strip=True)

    return data
Using the iTunes Search API
Apple's iTunes API is a free way to discover podcasts:
def search_itunes(query, limit=10, timeout=10):
    """Search Apple's iTunes Search API for podcasts matching ``query``.

    Args:
        query: Free-text search term.
        limit: Maximum number of results to request.
        timeout: Seconds to wait for the HTTP response (new parameter
            with a default; previously the request could hang forever).

    Returns:
        List of dicts with name, artist, feed_url, genre, episode_count.
    """
    url = "https://itunes.apple.com/search"
    params = {
        "term": query,
        "media": "podcast",
        "limit": limit,
    }
    response = requests.get(url, params=params, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to JSON-decode an error body.
    response.raise_for_status()
    data = response.json()
    podcasts = []
    for result in data.get("results", []):
        podcasts.append({
            # .get() throughout: the API occasionally omits fields, and a
            # bare result["..."] KeyError previously aborted the whole list.
            "name": result.get("collectionName", ""),
            "artist": result.get("artistName", ""),
            "feed_url": result.get("feedUrl"),
            "genre": result.get("primaryGenreName"),
            "episode_count": result.get("trackCount", 0),
        })
    return podcasts
# Example: discover Python podcasts via the iTunes Search API (network required).
python_podcasts = search_itunes("python programming")
for pod in python_podcasts:
    print(f"{pod['name']} by {pod['artist']} ({pod['episode_count']} episodes)")
Storing and Exporting Data
import pandas as pd
import json
def export_episodes(episodes, format="csv"):
    """Export episode objects to podcast_episodes.csv / .json in the CWD.

    Args:
        episodes: Objects with a ``__dict__`` (e.g. Episode instances).
        format: "csv" or "json".

    Returns:
        The pandas DataFrame built from the episodes.

    Raises:
        ValueError: for an unsupported format. (Fix: previously an unknown
        format silently wrote nothing while still printing "Exported ...".)
    """
    data = [vars(ep) for ep in episodes]
    df = pd.DataFrame(data)
    if format == "csv":
        df.to_csv("podcast_episodes.csv", index=False)
    elif format == "json":
        with open("podcast_episodes.json", "w") as f:
            # default=str stringifies anything json can't serialize natively
            json.dump(data, f, indent=2, default=str)
    else:
        raise ValueError(f"Unsupported format: {format!r} (expected 'csv' or 'json')")
    print(f"Exported {len(episodes)} episodes as {format}")
    return df
# Export everything fetched so far and show per-show episode counts.
df = export_episodes(tracker.all_episodes)
print(df.groupby("show_name").size())
Automated Monitoring
import schedule
import hashlib
# md5 fingerprints of episodes that have already been announced
seen_episodes = set()


def check_new_episodes():
    """Refresh every feed and print episodes not announced before."""
    tracker.refresh_all()
    for episode in tracker.all_episodes:
        fingerprint = hashlib.md5(
            f"{episode.show_name}{episode.title}".encode()
        ).hexdigest()
        if fingerprint in seen_episodes:
            continue
        seen_episodes.add(fingerprint)
        print(f"NEW: {episode.show_name} - {episode.title}")


# Re-check all feeds every 30 minutes.
schedule.every(30).minutes.do(check_new_episodes)
Scaling with Proxies
When scraping podcast directories at scale, use a proxy service like ScraperAPI to handle rate limiting and anti-bot protection. For monitoring your scraper health, ScrapeOps provides dashboards to track success rates. And for geo-specific podcast data, ThorData offers residential proxies targeting specific regions.
Conclusion
Combining RSS parsing with web scraping gives you comprehensive podcast tracking capabilities. RSS handles the core episode data, while scraping fills in ratings, reviews, and metadata from directories. This pattern works for any content aggregation project where structured feeds exist alongside unstructured web data.
Happy scraping!
Top comments (0)