Podcasts are a goldmine of market intelligence. Founders share candid insights, experts discuss trends, and industry insiders reveal information you won't find in formal reports. Scraping transcript databases makes this searchable.
Where to Find Transcripts
Several platforms host podcast transcripts: Podscribe, Podcasts.apple.com (with transcripts), and individual podcast websites. Many podcasts also auto-generate transcripts through hosting platforms.
Transcript Scraper
pip install requests beautifulsoup4 pandas
import re
from datetime import datetime
from urllib.parse import quote_plus, urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
class PodcastTranscriptScraper:
    """Scrape podcast transcript pages through the ScraperAPI proxy and run
    simple keyword/topic analysis over the extracted text."""

    def __init__(self, api_key):
        # ScraperAPI key sent with every proxied request.
        self.api_key = api_key

    def fetch(self, url):
        """GET *url* through the ScraperAPI proxy; returns the Response.

        The target URL is passed via ``params`` so requests URL-encodes it.
        Interpolating it into the proxy URL by hand (the previous approach)
        silently truncates any target URL that contains its own query string,
        because its ``&``-separated parameters get parsed as proxy parameters.
        """
        return requests.get(
            "http://api.scraperapi.com",
            params={"api_key": self.api_key, "url": url},
            timeout=30,
        )

    def scrape_transcript_page(self, url):
        """Return the transcript text found at *url*, or "" if none found."""
        resp = self.fetch(url)
        soup = BeautifulSoup(resp.text, "html.parser")
        # Common transcript containers
        transcript_el = soup.select_one(
            ".transcript, .episode-transcript, "
            "[data-testid='transcript'], .post-body"
        )
        if not transcript_el:
            # Fallback: find largest text block and take its parent so the
            # sibling paragraphs of the transcript come along with it.
            paragraphs = soup.find_all("p")
            if paragraphs:
                longest = max(paragraphs, key=lambda p: len(p.text))
                if len(longest.text) > 200:
                    transcript_el = longest.parent
        if transcript_el:
            return transcript_el.get_text(separator="\n", strip=True)
        return ""

    def search_podcast_directory(self, query, platform_url):
        """Search *platform_url* for episodes matching *query*.

        Returns a list of dicts with ``title``/``url``/``date``/``show``.
        Fixes vs. the previous version: the query is URL-encoded (raw spaces
        and ``&`` previously corrupted the search URL), and relative result
        links are resolved against *platform_url* so they are fetchable.
        """
        search_url = f"{platform_url}/search?q={quote_plus(query)}"
        resp = self.fetch(search_url)
        soup = BeautifulSoup(resp.text, "html.parser")
        episodes = []
        for item in soup.select(".episode, .search-result, .podcast-episode"):
            title = item.select_one("h2, h3, .episode-title")
            link = item.select_one("a")
            date = item.select_one("time, .date, .published")
            show = item.select_one(".show-name, .podcast-name")
            if title and link:
                episodes.append({
                    "title": title.text.strip(),
                    # Resolve relative hrefs to absolute URLs.
                    "url": urljoin(platform_url, link.get("href", "")),
                    "date": date.text.strip() if date else "",
                    "show": show.text.strip() if show else "",
                })
        return episodes

    def extract_market_mentions(self, transcript, keywords):
        """Find sentences mentioning any keyword, with one sentence of
        context on either side.

        Each hit yields ``{"keyword", "context" (capped at 500 chars),
        "position"}`` where position is the fractional offset into the
        transcript.  Matching is case-insensitive; sentence splitting is
        naive (on "."), which is adequate for coarse market scanning.
        """
        sentences = transcript.split(".")
        mentions = []
        for i, sentence in enumerate(sentences):
            # Lowercase once per sentence, not once per keyword.
            lowered = sentence.lower()
            for keyword in keywords:
                if keyword.lower() in lowered:
                    # Get surrounding context: previous + current + next.
                    start = max(0, i - 1)
                    end = min(len(sentences), i + 2)
                    context = ". ".join(sentences[start:end]).strip()
                    mentions.append({
                        "keyword": keyword,
                        "context": context[:500],
                        "position": i / len(sentences),
                    })
        return mentions

    def analyze_topic_frequency(self, transcripts, topics):
        """Count case-insensitive substring occurrences of each topic across
        *transcripts*; return ``{topic: count}`` sorted by count descending.

        NOTE: substring matching means short topics (e.g. "AI") also match
        inside longer words — acceptable for rough trend-spotting.
        """
        frequency = {topic: 0 for topic in topics}
        for transcript in transcripts:
            text_lower = transcript.lower()
            for topic in topics:
                frequency[topic] += text_lower.count(topic.lower())
        return dict(sorted(frequency.items(), key=lambda x: x[1], reverse=True))
# Usage: search the directory, then mine the first hit's transcript
# for funding/metrics vocabulary.
scraper = PodcastTranscriptScraper("YOUR_SCRAPERAPI_KEY")

episodes = scraper.search_podcast_directory(
    "AI startups funding", "https://podcastindex.org"
)
print(f"Found {len(episodes)} relevant episodes")

if episodes:
    # Metrics investors and founders tend to mention verbatim.
    keywords = ["Series A", "revenue", "ARR", "growth rate", "churn"]
    transcript = scraper.scrape_transcript_page(episodes[0]["url"])
    mentions = scraper.extract_market_mentions(transcript, keywords)
    for m in mentions[:5]:
        print(f"\n[{m['keyword']}]: {m['context'][:200]}...")
Batch Analysis for Market Research
def market_research_scan(scraper, query, keywords, max_episodes=20):
    """Search podcastindex.org for episodes matching *query*, scrape up to
    *max_episodes* transcripts, and report how often each topic is mentioned.

    Args:
        scraper: object providing ``search_podcast_directory``,
            ``scrape_transcript_page`` and ``analyze_topic_frequency``
            (e.g. a PodcastTranscriptScraper).
        query: free-text search query.
        keywords: caller-supplied terms to count.  Previously this parameter
            was accepted but never used; it is now merged (order-preserving,
            deduplicated) into the default topic list so the caller's own
            terms are counted too.
        max_episodes: cap on how many search results are scraped.

    Returns:
        ``{topic: count}`` dict sorted by count descending.
    """
    episodes = scraper.search_podcast_directory(query, "https://podcastindex.org")
    transcripts = []
    for ep in episodes[:max_episodes]:
        transcript = scraper.scrape_transcript_page(ep["url"])
        if transcript:  # skip pages with no extractable transcript
            transcripts.append(transcript)
    default_topics = ["AI", "machine learning", "automation", "API", "SaaS",
                      "enterprise", "startup", "funding", "acquisition"]
    # Merge caller keywords with the defaults, dropping duplicates while
    # preserving order (dict.fromkeys keeps first occurrence).
    topics = list(dict.fromkeys(default_topics + list(keywords)))
    freq = scraper.analyze_topic_frequency(transcripts, topics)
    print(f"\nTopic frequency across {len(transcripts)} transcripts:")
    for topic, count in freq.items():
        print(f"  {topic}: {count} mentions")
    return freq
# NOTE(review): this previously passed the module-level `keywords` list,
# which is only bound when the earlier search returned episodes — a
# NameError otherwise.  Pass the keyword list explicitly so the scan
# runs unconditionally.
research = market_research_scan(
    scraper,
    "tech startups 2026",
    ["Series A", "revenue", "ARR", "growth rate", "churn"],
)
Proxy Infrastructure
Podcast platforms vary widely in their anti-scraping measures. ScraperAPI handles JS rendering for modern podcast sites. ThorData proxies help with geo-restricted content. Track scraping jobs with ScrapeOps.
Conclusion
Podcast transcripts contain unfiltered market intelligence that's harder to find in traditional research. Building a transcript scraper and analysis pipeline turns hours of listening into searchable, quantifiable market data. Start with topics in your industry and expand as you find signal.
Top comments (0)