DEV Community

agenthustler
agenthustler

Posted on

Scraping Podcast Transcript Databases for Market Research

Podcasts are a goldmine of market intelligence. Founders share candid insights, experts discuss trends, and industry insiders reveal information you won't find in formal reports. Scraping transcript databases makes this searchable.

Where to Find Transcripts

Several platforms host podcast transcripts: Podscribe, Apple Podcasts (podcasts.apple.com, which now surfaces transcripts for many shows), and individual podcast websites. Many podcasts also auto-generate transcripts through their hosting platforms.

Transcript Scraper

pip install requests beautifulsoup4 pandas
Enter fullscreen mode Exit fullscreen mode
import re
from datetime import datetime
from urllib.parse import quote, quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup

class PodcastTranscriptScraper:
    """Scrape and analyze podcast transcripts via the ScraperAPI proxy."""

    def __init__(self, api_key):
        # ScraperAPI key used by fetch() for every outbound request.
        self.api_key = api_key

    def fetch(self, url):
        """GET *url* through the ScraperAPI proxy; returns a requests.Response.

        The target URL is percent-encoded so any '&' or '?' it contains is
        not interpreted as extra parameters of the proxy request (the
        original interpolated it raw, which silently truncated such URLs).
        """
        proxy = (
            "http://api.scraperapi.com"
            f"?api_key={self.api_key}&url={quote(url, safe='')}"
        )
        # NOTE(review): callers parse the body unconditionally; consider
        # resp.raise_for_status() if error pages should not be parsed.
        return requests.get(proxy, timeout=30)

    def scrape_transcript_page(self, url):
        """Return the transcript text found at *url*, or "" if none found."""
        resp = self.fetch(url)
        soup = BeautifulSoup(resp.text, "html.parser")

        # Try the containers most transcript pages use.
        transcript_el = soup.select_one(
            ".transcript, .episode-transcript, "
            "[data-testid='transcript'], .post-body"
        )

        if not transcript_el:
            # Fallback: the parent of the longest <p> is usually the body.
            paragraphs = soup.find_all("p")
            if paragraphs:
                longest = max(paragraphs, key=lambda p: len(p.text))
                # Require a substantial block so nav/boilerplate text is
                # not mistaken for a transcript.
                if len(longest.text) > 200:
                    transcript_el = longest.parent

        if transcript_el:
            return transcript_el.get_text(separator="\n", strip=True)
        return ""

    def search_podcast_directory(self, query, platform_url):
        """Search *platform_url* for episodes matching *query*.

        Returns a list of dicts with title/url/date/show keys. The query
        is URL-encoded (the original interpolated it raw, so spaces and
        '&' produced malformed search URLs).
        """
        search_url = f"{platform_url}/search?q={quote_plus(query)}"
        resp = self.fetch(search_url)
        soup = BeautifulSoup(resp.text, "html.parser")

        episodes = []
        for item in soup.select(".episode, .search-result, .podcast-episode"):
            title = item.select_one("h2, h3, .episode-title")
            link = item.select_one("a")
            date = item.select_one("time, .date, .published")
            show = item.select_one(".show-name, .podcast-name")

            # Title and link are mandatory; date and show are best-effort.
            if title and link:
                episodes.append({
                    "title": title.text.strip(),
                    "url": link.get("href", ""),
                    "date": date.text.strip() if date else "",
                    "show": show.text.strip() if show else "",
                })
        return episodes

    def extract_market_mentions(self, transcript, keywords):
        """Find case-insensitive keyword hits with surrounding context.

        Returns a list of dicts: keyword, context (previous + matching +
        next sentence, capped at 500 chars), and position (fraction 0..1
        of the way through the transcript).
        """
        # Naive sentence split on '.'; good enough for keyword spotting.
        sentences = transcript.split(".")
        mentions = []

        for i, sentence in enumerate(sentences):
            for keyword in keywords:
                if keyword.lower() in sentence.lower():
                    # Include the previous and next sentence as context.
                    start = max(0, i - 1)
                    end = min(len(sentences), i + 2)
                    context = ". ".join(sentences[start:end]).strip()

                    mentions.append({
                        "keyword": keyword,
                        "context": context[:500],
                        "position": i / len(sentences),
                    })
        return mentions

    def analyze_topic_frequency(self, transcripts, topics):
        """Count case-insensitive occurrences of each topic across transcripts.

        Returns {topic: total count} sorted by descending count. This is
        substring matching, so short topics (e.g. "AI") can over-count by
        matching inside longer words.
        """
        frequency = {topic: 0 for topic in topics}
        for transcript in transcripts:
            text_lower = transcript.lower()
            for topic in topics:
                frequency[topic] += text_lower.count(topic.lower())
        return dict(sorted(frequency.items(), key=lambda x: x[1], reverse=True))

# Usage
scraper = PodcastTranscriptScraper("YOUR_SCRAPERAPI_KEY")

# Search for relevant episodes
episodes = scraper.search_podcast_directory(
    "AI startups funding",
    "https://podcastindex.org"
)
print(f"Found {len(episodes)} relevant episodes")

# Bug fix: bind `keywords` unconditionally. It was previously defined only
# inside the `if episodes:` branch, so later code referencing it raised
# NameError whenever the search returned no results.
keywords = ["Series A", "revenue", "ARR", "growth rate", "churn"]

# Scrape and analyze the first matching transcript, if any
if episodes:
    transcript = scraper.scrape_transcript_page(episodes[0]["url"])
    mentions = scraper.extract_market_mentions(transcript, keywords)
    for m in mentions[:5]:
        print(f"\n[{m['keyword']}]: {m['context'][:200]}...")
Enter fullscreen mode Exit fullscreen mode

Batch Analysis for Market Research

def market_research_scan(scraper, query, keywords, max_episodes=20):
    """Search a podcast directory and report term frequency across transcripts.

    Args:
        scraper: object exposing search_podcast_directory,
            scrape_transcript_page and analyze_topic_frequency
            (e.g. PodcastTranscriptScraper).
        query: directory search query string.
        keywords: caller-supplied terms counted alongside the built-in
            topic list. Bug fix: the original accepted this argument but
            never used it.
        max_episodes: cap on how many episode transcripts are fetched.

    Returns:
        dict mapping term -> occurrence count, sorted descending.
    """
    episodes = scraper.search_podcast_directory(query, "https://podcastindex.org")

    # Collect non-empty transcripts up to the episode cap.
    transcripts = []
    for ep in episodes[:max_episodes]:
        transcript = scraper.scrape_transcript_page(ep["url"])
        if transcript:
            transcripts.append(transcript)

    base_topics = ["AI", "machine learning", "automation", "API", "SaaS",
                   "enterprise", "startup", "funding", "acquisition"]
    # Merge caller keywords into the scan (deduplicated, base order kept).
    topics = base_topics + [k for k in keywords if k not in base_topics]
    freq = scraper.analyze_topic_frequency(transcripts, topics)

    print(f"\nTopic frequency across {len(transcripts)} transcripts:")
    for topic, count in freq.items():
        print(f"  {topic}: {count} mentions")

    return freq

# Pass the keyword list explicitly: the module-level `keywords` defined
# earlier is only bound when the first directory search returned episodes,
# so relying on it here could raise NameError.
research = market_research_scan(
    scraper,
    "tech startups 2026",
    ["Series A", "revenue", "ARR", "growth rate", "churn"],
)
Enter fullscreen mode Exit fullscreen mode

Proxy Infrastructure

Podcast platforms vary widely in their anti-scraping measures. ScraperAPI handles JS rendering for modern podcast sites. ThorData proxies help with geo-restricted content. Track scraping jobs with ScrapeOps.

Conclusion

Podcast transcripts contain unfiltered market intelligence that's harder to find in traditional research. Building a transcript scraper and analysis pipeline turns hours of listening into searchable, quantifiable market data. Start with topics in your industry and expand as you find signal.

Top comments (0)