DEV Community

agenthustler
agenthustler

Posted on

How to Scrape Conference Speaker Lineups for Trend Detection

How to Scrape Conference Speaker Lineups for Trend Detection

Conference speaker lineups are a leading indicator of industry trends. When multiple conferences simultaneously feature talks on a topic, it signals emerging demand months before mainstream adoption. Let's build a scraper that tracks speaker lineups across tech conferences and identifies trending topics.

Why Conference Data Matters

By the time a topic appears in a Gartner report, it's already mainstream. Conference organizers curate lineups 3-6 months ahead based on what's gaining traction. Tracking these patterns gives you an early warning system for industry trends.

Building the Conference Scraper

import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin

# ScraperAPI credential — replace the placeholder with a real key before running.
SCRAPER_API_KEY = "YOUR_KEY"

class ConferenceScraper:
    """Scrapes conference websites for speaker lineups via ScraperAPI.

    All fetches are proxied through ScraperAPI with JS rendering enabled,
    since many conference schedule pages are client-rendered SPAs.
    """

    def scrape_speakers(self, conference_url):
        """Discover speaker/agenda pages on a conference site and scrape them.

        Args:
            conference_url: landing/schedule URL of the conference site.

        Returns:
            List of speaker dicts ({"name", "title", "talk", "source_url"})
            gathered from up to 5 distinct candidate pages.
        """
        response = requests.get(
            "http://api.scraperapi.com",
            params={
                "api_key": SCRAPER_API_KEY,
                "url": conference_url,
                "render": "true"
            },
            timeout=60
        )
        soup = BeautifulSoup(response.text, "html.parser")
        speaker_links = []
        for link in soup.find_all("a", href=True):
            href = link["href"].lower()
            text = link.get_text().lower()
            if any(kw in href or kw in text for kw in ("speaker", "schedule", "agenda", "session")):
                speaker_links.append(urljoin(conference_url, link["href"]))

        # Fix: dedupe BEFORE truncating. The original `set(speaker_links[:5])`
        # sliced first, so duplicate links wasted slots in the 5-page budget,
        # and iterating a set made the crawl order nondeterministic.
        # dict.fromkeys dedupes while preserving document order.
        candidate_pages = list(dict.fromkeys(speaker_links))[:5]

        speakers = []
        for page_url in candidate_pages:
            speakers.extend(self._scrape_speaker_page(page_url))
        return speakers

    def _scrape_speaker_page(self, url):
        """Scrape one page for speaker cards; returns a deduped list of dicts."""
        response = requests.get(
            "http://api.scraperapi.com",
            params={"api_key": SCRAPER_API_KEY, "url": url, "render": "true"},
            timeout=60
        )
        soup = BeautifulSoup(response.text, "html.parser")
        cards = soup.select(
            ".speaker-card, .speaker-item, .speaker, "
            "[class*='speaker'], [class*='presenter']"
        )
        speakers = []
        seen = set()  # fix: [class*='speaker'] also matches ancestor containers,
                      # whose select_one picks up the first nested speaker again
        for card in cards:
            name_el = card.select_one("h2, h3, h4, .speaker-name, .name")
            if not name_el:
                continue
            title_el = card.select_one(".title, .role, .position, .company")
            talk_el = card.select_one(".talk-title, .session-title, .topic")
            record = {
                "name": name_el.get_text(strip=True),
                "title": title_el.get_text(strip=True) if title_el else "",
                "talk": talk_el.get_text(strip=True) if talk_el else "",
                "source_url": url
            }
            key = (record["name"], record["title"], record["talk"])
            if key not in seen:
                seen.add(key)
                speakers.append(record)
        return speakers
Enter fullscreen mode Exit fullscreen mode

Topic Extraction and Clustering

from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import numpy as np

class TopicAnalyzer:
    """Extracts salient topics from talk titles and detects snapshot-over-snapshot trends."""

    # Generic conference vocabulary excluded from topic output.
    STOP_WORDS = {
        "talk", "session", "keynote", "workshop", "panel",
        "introduction", "building", "using", "deep", "dive"
    }

    def extract_topics(self, speakers):
        """Return up to 30 top topics from speaker talk titles, ranked by mean TF-IDF.

        Args:
            speakers: list of speaker dicts; only the "talk" field is used.

        Returns:
            List of {"term", "score", "frequency"} dicts, best first.
            Empty list when no speaker has a talk title.
        """
        talks = [s["talk"] for s in speakers if s.get("talk")]
        if not talks:
            return []
        vectorizer = TfidfVectorizer(
            max_features=1000, stop_words="english",
            ngram_range=(1, 3), min_df=2
        )
        tfidf = vectorizer.fit_transform(talks)
        feature_names = vectorizer.get_feature_names_out()
        mean_scores = np.asarray(tfidf.mean(axis=0)).flatten()
        # Fix: rank every term, then keep the 30 best that survive the
        # stop-word filter. The original truncated to 30 *before* filtering,
        # so each stop word silently shrank the result below 30.
        topics = []
        for idx in mean_scores.argsort()[::-1]:
            term = feature_names[idx]
            if term.lower() in self.STOP_WORDS:
                continue
            topics.append({
                "term": term,
                "score": float(mean_scores[idx]),
                "frequency": int((tfidf[:, idx] > 0).sum())
            })
            if len(topics) == 30:
                break
        return topics

    def detect_trends(self, current_topics, previous_topics):
        """Compare two topic snapshots and flag NEW and GROWING terms.

        Args:
            current_topics, previous_topics: outputs of extract_topics.

        Returns:
            Trend dicts sorted most-significant first. "NEW" = absent from the
            previous snapshot and now above a 0.01 noise floor; "GROWING" =
            score grew by at least 50%.
        """
        current = {t["term"]: t["score"] for t in current_topics}
        previous = {t["term"]: t["score"] for t in previous_topics}
        trends = []
        for term, score in current.items():
            prev = previous.get(term, 0)
            if prev == 0:
                # Fix: previously-unseen terms below the noise floor used to
                # fall through to the GROWING branch with an inflated growth
                # figure (division by the 0.001 fallback); skip them instead.
                if score > 0.01:
                    trends.append({"term": term, "type": "NEW", "score": score})
            elif score > prev * 1.5:
                growth = (score - prev) / max(prev, 0.001)
                trends.append({"term": term, "type": "GROWING", "growth": round(growth, 2)})
        return sorted(trends, key=lambda t: t.get("score", t.get("growth", 0)), reverse=True)
Enter fullscreen mode Exit fullscreen mode

Cross-Conference Analysis

def cross_conference_analysis(conference_data, min_conferences=3):
    """Find topics featured across several conferences simultaneously.

    Args:
        conference_data: mapping of conference name -> list of speaker dicts.
        min_conferences: minimum number of conferences a topic must appear at
            to count as trending (default 3, the original hard-coded threshold).

    Returns:
        Dict mapping term -> list of {"conference", "score"} entries,
        ordered by how many conferences feature the term (descending).
    """
    # Hoisted out of the loop: the analyzer is stateless, so constructing
    # one per conference was pure waste.
    analyzer = TopicAnalyzer()
    topic_conference_map = {}
    for conf_name, speakers in conference_data.items():
        for topic in analyzer.extract_topics(speakers):
            topic_conference_map.setdefault(topic["term"], []).append({
                "conference": conf_name,
                "score": topic["score"]
            })
    trending = {
        term: confs
        for term, confs in topic_conference_map.items()
        if len(confs) >= min_conferences
    }
    return dict(sorted(trending.items(), key=lambda item: len(item[1]), reverse=True))
Enter fullscreen mode Exit fullscreen mode

Scaling with Proxy Infrastructure

Scraping dozens of conference sites requires reliable proxies. ScraperAPI handles JavaScript-rendered conference pages. For geo-specific conferences, ThorData residential proxies work well. ScrapeOps monitors scraping health.

Running the Full Pipeline

# Conference schedule pages to monitor (display name -> URL).
conferences = {
    "PyCon 2026": "https://us.pycon.org/2026/schedule/",
    "KubeCon EU": "https://events.linuxfoundation.org/kubecon/",
    "AWS re:Invent": "https://reinvent.awsevents.com/sessions/"
}

scraper = ConferenceScraper()

# Scrape each conference and report how many speakers were found.
all_data = {}
for conf_name, schedule_url in conferences.items():
    roster = scraper.scrape_speakers(schedule_url)
    all_data[conf_name] = roster
    print(f"{conf_name}: {len(roster)} speakers")

# Surface the ten topics shared by the most conferences.
trends = cross_conference_analysis(all_data)
top_ten = list(trends.items())[:10]
for topic, confs in top_ten:
    print(f"TRENDING: {topic} ({len(confs)} conferences)")
Enter fullscreen mode Exit fullscreen mode

Conference lineups are one of the most underutilized data sources in tech. Start tracking them systematically and you'll spot trends months before they hit mainstream awareness.

Top comments (0)