DEV Community

agenthustler
agenthustler

Posted on

Scraping Venture-Backed Startup Job Boards for Talent Signals

When a startup suddenly posts 15 engineering roles, something is happening — new funding, product pivot, or scaling push. Scraping VC portfolio job boards reveals these signals before press releases.

The Signal in Job Postings

Major VCs like a16z, Sequoia, and Y Combinator maintain job boards aggregating positions across their portfolio companies. Changes in hiring patterns are leading indicators of strategic shifts.

Scraping YC's Work at a Startup

# Standard library
import re
from datetime import datetime

# Third-party
import pandas as pd
import requests
from bs4 import BeautifulSoup

API_KEY = "YOUR_SCRAPERAPI_KEY"  # Get one at https://www.scraperapi.com?fp_ref=the52

def scrape_yc_jobs():
    """Fetch job listings from YC's Work at a Startup board via ScraperAPI.

    Returns:
        list[dict]: one dict per job card with keys 'source', 'company',
        'title', 'location', and 'scraped_at' (local-time ISO timestamp).
        Cards without a detectable title are skipped.

    Raises:
        requests.HTTPError: if the proxy responds with an error status.
    """
    target = "https://www.workatastartup.com/jobs"
    # Let requests build and percent-encode the query string. Interpolating
    # the target URL directly into the proxy URL would leave its '://', '/',
    # and any '&'/'?' characters unescaped and corrupt the proxy request.
    params = {"api_key": API_KEY, "url": target, "render": "true"}
    response = requests.get("http://api.scraperapi.com/", params=params, timeout=60)
    response.raise_for_status()  # fail loudly on proxy/quota/auth errors
    soup = BeautifulSoup(response.text, 'html.parser')

    jobs = []
    # Selectors are deliberately loose ([class*="job"]) because the board's
    # class names change frequently.
    for card in soup.select('.job-card, .job-listing, [class*="job"]'):
        company = card.select_one('.company-name, [class*="company"]')
        title = card.select_one('.job-title, [class*="title"]')
        location = card.select_one('.location, [class*="location"]')

        if title:
            jobs.append({
                'source': 'YC',
                'company': company.text.strip() if company else 'Unknown',
                'title': title.text.strip(),
                'location': location.text.strip() if location else 'Remote',
                'scraped_at': datetime.now().isoformat()
            })
    return jobs

def scrape_a16z_jobs():
    """Fetch job listings from the a16z portfolio job board via ScraperAPI.

    Returns:
        list[dict]: one dict per job card with keys 'source', 'company',
        'title', 'location' (always 'N/A' — the board does not expose it
        on the card), and 'scraped_at' (local-time ISO timestamp).

    Raises:
        requests.HTTPError: if the proxy responds with an error status.
    """
    target = "https://jobs.a16z.com/"
    # Percent-encode the target URL via `params` instead of pasting it raw
    # into the proxy query string, where its special characters would break
    # the request.
    params = {"api_key": API_KEY, "url": target, "render": "true"}
    response = requests.get("http://api.scraperapi.com/", params=params, timeout=60)
    response.raise_for_status()  # fail loudly on proxy/quota/auth errors
    soup = BeautifulSoup(response.text, 'html.parser')

    jobs = []
    # Loose selectors on purpose: board markup changes often.
    for card in soup.select('.job-card, .position, [class*="job"]'):
        company = card.select_one('.company, [class*="company"]')
        title = card.select_one('.title, [class*="title"]')

        if title:
            jobs.append({
                'source': 'a16z',
                'company': company.text.strip() if company else 'Unknown',
                'title': title.text.strip(),
                'location': 'N/A',
                'scraped_at': datetime.now().isoformat()
            })
    return jobs

# Pull both boards, then pool everything into one DataFrame for analysis.
yc_jobs = scrape_yc_jobs()
a16z_jobs = scrape_a16z_jobs()
all_jobs = []
all_jobs.extend(yc_jobs)
all_jobs.extend(a16z_jobs)
df = pd.DataFrame(all_jobs)
print(f"Total jobs scraped: {len(df)}")
Enter fullscreen mode Exit fullscreen mode

Analyzing Hiring Signals

def analyze_hiring_signals(df):
    """Summarize hiring activity in a scraped-jobs DataFrame.

    Prints the top-20 hiring companies and the share of openings per role
    category, then returns the per-company counts.

    Args:
        df: DataFrame with at least 'source', 'company', and 'title' columns.

    Returns:
        DataFrame with columns (source, company, open_roles), sorted by
        open_roles descending.
    """
    company_counts = df.groupby(['source', 'company']).size().reset_index(name='open_roles')
    company_counts = company_counts.sort_values('open_roles', ascending=False)

    print("\nTop hiring companies:")
    print(company_counts.head(20).to_string(index=False))

    role_categories = {
        'engineering': ['engineer', 'developer', 'sre', 'devops', 'backend', 'frontend', 'fullstack'],
        'ai_ml': ['machine learning', 'ai', 'data scientist', 'ml engineer', 'llm'],
        'sales': ['sales', 'account executive', 'sdr', 'business development'],
        'product': ['product manager', 'product designer', 'ux'],
    }

    total = len(df)
    # Guard: an empty snapshot would otherwise divide by zero below.
    if total:
        titles = df['title'].str.lower()
        for category, keywords in role_categories.items():
            # Match whole words only. A bare substring test over-counts:
            # e.g. 'ai' is contained in 'maintainer' and 'sdr' can appear
            # inside unrelated tokens.
            pattern = r'\b(?:' + '|'.join(re.escape(kw) for kw in keywords) + r')\b'
            count = int(titles.str.contains(pattern, regex=True).sum())
            print(f"{category}: {count} openings ({count/total*100:.1f}%)")

    return company_counts

# Summarize the current snapshot: per-company open-role counts + category mix.
signals = analyze_hiring_signals(df)
Enter fullscreen mode Exit fullscreen mode

Detecting Hiring Surges

Compare the current snapshot against recently stored ones to detect companies that are ramping up. Snapshots are persisted in a local SQLite database; for repeated large-scale scraping, a residential proxy service such as ThorData can help avoid blocks:

import sqlite3

# Local SQLite store that accumulates job snapshots across runs.
conn = sqlite3.connect('startup_jobs.db')

def detect_surges(current_df, threshold=5):
    """Flag companies whose open-role count jumped versus the stored baseline.

    Compares per-company counts in `current_df` against counts stored in the
    `jobs` table over the last 7 days, prints any company whose count grew by
    at least `threshold`, then appends `current_df` to the table so the next
    run has a baseline.

    Args:
        current_df: DataFrame of freshly scraped jobs with a 'company' column.
        threshold: minimum increase in open roles to count as a surge.

    Returns:
        DataFrame of surging rows (company, current, previous, change),
        sorted by change descending.
    """
    current_counts = current_df.groupby('company').size().reset_index(name='current')

    # NOTE(review): scraped_at is stored as a *local-time* ISO string while
    # SQLite's datetime('now') is UTC, so the 7-day window can be off by the
    # local UTC offset — confirm and switch to UTC timestamps if it matters.
    try:
        previous = pd.read_sql('''
            SELECT company, COUNT(*) as previous FROM jobs 
            WHERE scraped_at > datetime('now', '-7 days')
            GROUP BY company
        ''', conn)
    except (pd.errors.DatabaseError, sqlite3.OperationalError):
        # First run: the jobs table does not exist yet, so there is no
        # baseline — treat every company as previously having 0 open roles.
        previous = pd.DataFrame({'company': pd.Series(dtype=object),
                                 'previous': pd.Series(dtype='int64')})

    merged = current_counts.merge(previous, on='company', how='left').fillna(0)
    merged['change'] = merged['current'] - merged['previous']
    surges = merged[merged['change'] >= threshold].sort_values('change', ascending=False)

    if len(surges) > 0:
        print("\nHIRING SURGES DETECTED:")
        print(surges.to_string(index=False))

    # Persist this snapshot only after computing the diff against the past.
    current_df.to_sql('jobs', conn, if_exists='append', index=False)
    return surges

surges = detect_surges(df)
Enter fullscreen mode Exit fullscreen mode

Tracking Technology Trends

tech_keywords = ['rust', 'golang', 'kubernetes', 'terraform', 'react', 'typescript', 
                 'python', 'pytorch', 'llm', 'rag', 'vector database', 'graphql']

# Count whole-word mentions of each technology across job titles. Word
# boundaries avoid substring false positives — a plain .contains(tech)
# would count 'rag' inside 'storage', for example.
tech_demand = {}
for tech in tech_keywords:
    pattern = r'\b' + re.escape(tech) + r'\b'
    count = int(df['title'].str.lower().str.contains(pattern, regex=True).sum())
    if count > 0:
        tech_demand[tech] = count

tech_df = pd.DataFrame(list(tech_demand.items()), columns=['technology', 'mentions'])
tech_df = tech_df.sort_values('mentions', ascending=False)
print("\nTechnology demand in VC-backed startups:")
print(tech_df.to_string(index=False))
Enter fullscreen mode Exit fullscreen mode

Use Cases

  • Investors: Spot portfolio companies scaling before announcements
  • Job seekers: Find fast-growing startups hiring in your stack
  • Recruiters: Identify companies competing for the same talent
  • Analysts: Track sector-level hiring trends across VC portfolios

Monitor with ScrapeOps and use ScraperAPI for JavaScript-heavy job board rendering.

Top comments (0)