How to Scrape Conference Speaker Lineups for Trend Detection
Conference speaker lineups are a leading indicator of industry trends. When multiple conferences simultaneously feature talks on a topic, it signals emerging demand months before mainstream adoption. Let's build a scraper that tracks speaker lineups across tech conferences and identifies trending topics.
Why Conference Data Matters
By the time a topic appears in a Gartner report, it's already mainstream. Conference organizers curate lineups 3-6 months ahead based on what's gaining traction. Tracking these patterns gives you an early warning system for industry trends.
Building the Conference Scraper
import requests
from bs4 import BeautifulSoup
import re  # NOTE(review): `re` appears unused in this snippet — confirm before removing
from urllib.parse import urljoin
# ScraperAPI key used by ConferenceScraper; replace the placeholder with a real key before running.
SCRAPER_API_KEY = "YOUR_KEY"
class ConferenceScraper:
    """Scrape speaker/session data from a conference website via ScraperAPI.

    Fetches the conference landing page, discovers links that look like
    speaker/agenda/schedule pages, and extracts a speaker dict from each
    matching card element on those pages.
    """

    # Link keywords (in href or anchor text) that usually point at speaker pages.
    _PAGE_KEYWORDS = ("speaker", "schedule", "agenda", "session")
    # Cap on how many candidate pages to crawl per conference.
    _MAX_PAGES = 5

    def scrape_speakers(self, conference_url):
        """Return a list of speaker dicts discovered from conference_url.

        Each dict has keys: name, title, talk, source_url.
        """
        soup = self._fetch(conference_url)
        candidate_urls = []
        for link in soup.find_all("a", href=True):
            href = link["href"].lower()
            text = link.get_text().lower()
            if any(kw in href or kw in text for kw in self._PAGE_KEYWORDS):
                candidate_urls.append(urljoin(conference_url, link["href"]))
        # Dedupe while preserving document order, THEN take the first N.
        # (The previous `set(speaker_links[:5])` let duplicates consume crawl
        # slots and iterated the pages in nondeterministic order.)
        unique_urls = list(dict.fromkeys(candidate_urls))[:self._MAX_PAGES]
        speakers = []
        for page_url in unique_urls:
            speakers.extend(self._scrape_speaker_page(page_url))
        return speakers

    def _fetch(self, url):
        """GET url through ScraperAPI with JS rendering; return parsed soup.

        Raises requests.HTTPError on a non-2xx response instead of silently
        parsing an error page.
        """
        response = requests.get(
            "http://api.scraperapi.com",
            params={"api_key": SCRAPER_API_KEY, "url": url, "render": "true"},
            timeout=60,
        )
        response.raise_for_status()
        return BeautifulSoup(response.text, "html.parser")

    def _scrape_speaker_page(self, url):
        """Extract speaker dicts from one speaker/agenda/schedule page."""
        soup = self._fetch(url)
        speakers = []
        # Common class names used by conference site templates for speaker cards.
        cards = soup.select(
            ".speaker-card, .speaker-item, .speaker, "
            "[class*='speaker'], [class*='presenter']"
        )
        for card in cards:
            name_el = card.select_one("h2, h3, h4, .speaker-name, .name")
            if not name_el:
                # A card without a recognizable name element is noise; skip it.
                continue
            title_el = card.select_one(".title, .role, .position, .company")
            talk_el = card.select_one(".talk-title, .session-title, .topic")
            speakers.append({
                "name": name_el.get_text(strip=True),
                "title": title_el.get_text(strip=True) if title_el else "",
                "talk": talk_el.get_text(strip=True) if talk_el else "",
                "source_url": url,
            })
        return speakers
Topic Extraction and Clustering
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import numpy as np
class TopicAnalyzer:
    """Extract trending topic terms from talk titles and compare snapshots."""

    # Generic conference words that carry no topical signal.
    STOP_WORDS = {
        "talk", "session", "keynote", "workshop", "panel",
        "introduction", "building", "using", "deep", "dive"
    }
    # Minimum TF-IDF score for a term absent last period to count as NEW.
    NEW_TERM_THRESHOLD = 0.01
    # Multiplier a previously-seen term's score must exceed to count as GROWING.
    GROWTH_FACTOR = 1.5

    def extract_topics(self, speakers):
        """Return the top TF-IDF terms across all talk titles.

        Each topic is {"term", "score", "frequency"} where score is the mean
        TF-IDF weight over all talks and frequency is the number of talks
        containing the term. Returns [] when there are no talks, or when the
        vocabulary is empty (e.g. too few talks for min_df=2).
        """
        talks = [s["talk"] for s in speakers if s.get("talk")]
        if not talks:
            return []
        vectorizer = TfidfVectorizer(
            max_features=1000, stop_words="english",
            ngram_range=(1, 3), min_df=2
        )
        try:
            tfidf = vectorizer.fit_transform(talks)
        except ValueError:
            # min_df=2 can leave an empty vocabulary on tiny inputs; no topics.
            return []
        feature_names = vectorizer.get_feature_names_out()
        mean_scores = np.asarray(tfidf.mean(axis=0)).flatten()
        # Indices of the 30 highest-scoring terms, best first.
        top_indices = mean_scores.argsort()[-30:][::-1]
        topics = []
        for idx in top_indices:
            term = feature_names[idx]
            if term.lower() in self.STOP_WORDS:
                continue
            topics.append({
                "term": term,
                "score": float(mean_scores[idx]),
                "frequency": int((tfidf[:, idx] > 0).sum()),
            })
        return topics

    def detect_trends(self, current_topics, previous_topics):
        """Compare two topic snapshots; return NEW and GROWING terms.

        NEW entries: {"term", "type", "score"}; GROWING entries:
        {"term", "type", "growth"} where growth is the fractional increase.

        Bug fix: previously a term absent from the prior period with a score
        at or below the NEW threshold fell through to the growth branch
        (score > 0 * 1.5 is true for any positive score) and was reported as
        GROWING with a wildly inflated growth ratio. Growth is now computed
        only for terms with a nonzero prior score.
        """
        current_dict = {t["term"]: t["score"] for t in current_topics}
        previous_dict = {t["term"]: t["score"] for t in previous_topics}
        trends = []
        for term, score in current_dict.items():
            prev_score = previous_dict.get(term, 0)
            if prev_score == 0:
                # Unseen last period: only report if it clears the NEW bar.
                if score > self.NEW_TERM_THRESHOLD:
                    trends.append({"term": term, "type": "NEW", "score": score})
            elif score > prev_score * self.GROWTH_FACTOR:
                growth = (score - prev_score) / prev_score
                trends.append({"term": term, "type": "GROWING", "growth": round(growth, 2)})
        return sorted(trends, key=lambda x: x.get("score", x.get("growth", 0)), reverse=True)
Cross-Conference Analysis
def cross_conference_analysis(conference_data):
    """Find topics that trend across multiple conferences simultaneously.

    conference_data maps conference name -> list of speaker dicts (as produced
    by ConferenceScraper.scrape_speakers). Returns {term: [{"conference",
    "score"}, ...]} for every term appearing in at least 3 conferences,
    ordered by how many conferences mention it (most first).
    """
    # One analyzer serves every conference; it holds no per-conference state,
    # so there is no need to re-instantiate it inside the loop.
    analyzer = TopicAnalyzer()
    topic_conference_map = {}
    for conf_name, speakers in conference_data.items():
        for topic in analyzer.extract_topics(speakers):
            topic_conference_map.setdefault(topic["term"], []).append({
                "conference": conf_name,
                "score": topic["score"],
            })
    # A term must show up at 3+ conferences to count as a cross-industry trend.
    trending = {
        term: confs for term, confs in topic_conference_map.items()
        if len(confs) >= 3
    }
    return dict(sorted(trending.items(), key=lambda x: len(x[1]), reverse=True))
Scaling with Proxy Infrastructure
Scraping dozens of conference sites reliably requires proxy infrastructure: ScraperAPI handles JavaScript-rendered conference pages, ThorData residential proxies work well for geo-specific conferences, and ScrapeOps monitors overall scraping health.
Running the Full Pipeline
# Conference schedule pages to track for this run.
conferences = {
    "PyCon 2026": "https://us.pycon.org/2026/schedule/",
    "KubeCon EU": "https://events.linuxfoundation.org/kubecon/",
    "AWS re:Invent": "https://reinvent.awsevents.com/sessions/"
}

scraper = ConferenceScraper()

# Scrape every conference, reporting speaker counts as we go.
all_data = {}
for name, url in conferences.items():
    all_data[name] = scraper.scrape_speakers(url)
    print(f"{name}: {len(all_data[name])} speakers")

# Cross-conference trend detection: show the ten most widespread topics.
trends = cross_conference_analysis(all_data)
top_ten = list(trends.items())[:10]
for topic, confs in top_ten:
    print(f"TRENDING: {topic} ({len(confs)} conferences)")
Conference lineups are one of the most underutilized data sources in tech. Start tracking them systematically and you'll spot trends months before they hit mainstream awareness.
Top comments (0)