Why Track VC Portfolios
Venture capital firms signal their investment thesis through their portfolios. When a16z makes three AI infrastructure investments in a month, it reveals where smart money is flowing.
Let's build a scraper that tracks VC portfolio changes over time and detects thesis shifts.
What We Monitor
- New portfolio additions and removals
- Investment stage patterns (seed, Series A, growth)
- Sector concentration shifts
- Co-investment patterns between firms
Setup
pip install requests beautifulsoup4 pandas
VC firm websites use modern JavaScript frameworks. ScraperAPI renders these pages fully before extraction.
Scraping VC Portfolio Pages
import json
import os
import time
from datetime import datetime
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup
import pandas as pd
# ScraperAPI credential; replace with a real key (and keep it out of source control).
SCRAPER_API_KEY = "YOUR_KEY"
# Firms to monitor, mapped to their public portfolio pages.
VC_FIRMS = {
"a16z": "https://a16z.com/portfolio/",
"sequoia": "https://www.sequoiacap.com/our-companies/",
"greylock": "https://greylock.com/portfolio/",
"benchmark": "https://www.benchmark.com/portfolio/"
}
def scrape_portfolio(firm_name, url):
    """Fetch a firm's portfolio page via ScraperAPI and extract company cards.

    Parameters
    ----------
    firm_name : str
        Label stored on each result row.
    url : str
        Portfolio page to render and scrape.

    Returns
    -------
    list[dict]
        One dict per company card with firm/company/sector/description/scraped_at.

    Raises
    ------
    requests.HTTPError
        If ScraperAPI returns a non-2xx status.
    """
    # render=true asks ScraperAPI to execute the page's JavaScript first;
    # quote() percent-encodes the target URL so its own query string or
    # fragments cannot corrupt the ScraperAPI request.
    api_url = (
        "http://api.scraperapi.com"
        f"?api_key={SCRAPER_API_KEY}&url={quote(url, safe='')}&render=true"
    )
    # Rendered requests are slow; a generous timeout still beats hanging forever.
    response = requests.get(api_url, timeout=60)
    response.raise_for_status()  # fail loudly instead of parsing an error page
    soup = BeautifulSoup(response.text, "html.parser")
    companies = []
    # Deliberately broad selector list: each firm uses different markup.
    for card in soup.select(".portfolio-card, .company-card, [class*=portfolio]"):
        name = card.select_one("h3, h2, .company-name")
        sector = card.select_one(".sector, .category")
        desc = card.select_one("p, .description")
        if name:  # skip cards with no recognizable company name
            companies.append({
                "firm": firm_name,
                "company": name.text.strip(),
                "sector": sector.text.strip() if sector else "Unknown",
                "description": desc.text.strip()[:200] if desc else "",
                "scraped_at": datetime.now().isoformat()
            })
    return companies
Tracking Portfolio Changes
# Snapshot of the last-seen portfolio per firm, persisted between runs.
HISTORY_FILE = "vc_portfolio_history.json"

def _load_history():
    """Return the saved snapshot dict, or {} when missing, corrupt, or unreadable."""
    if os.path.exists(HISTORY_FILE):
        try:
            with open(HISTORY_FILE, encoding="utf-8") as f:
                return json.load(f)
        except (json.JSONDecodeError, OSError):
            # A corrupt history file should not kill the whole run;
            # treat it as "no prior snapshot" and rebuild from scratch.
            return {}
    return {}

def detect_portfolio_changes(firm, current_companies):
    """Diff the current scrape of *firm* against the last saved snapshot.

    Updates HISTORY_FILE in place with the new snapshot.

    Parameters
    ----------
    firm : str
        Firm key used in the history file.
    current_companies : list[dict]
        Rows from scrape_portfolio(); only the "company" key is read.

    Returns
    -------
    dict with keys "firm", "new", "removed", "total".
    """
    history = _load_history()
    prev_names = set(history.get(firm, {}).get("companies", []))
    curr_names = {c["company"] for c in current_companies}
    # Persist the new snapshot before returning so the next run diffs against it.
    history[firm] = {
        "companies": sorted(curr_names),
        "last_updated": datetime.now().isoformat()
    }
    with open(HISTORY_FILE, "w", encoding="utf-8") as f:
        json.dump(history, f, indent=2)
    return {
        "firm": firm,
        "new": list(curr_names - prev_names),
        "removed": list(prev_names - curr_names),
        "total": len(curr_names)
    }
Sector Concentration Analysis
def analyze_thesis_shifts(all_portfolios):
    """Summarize each firm's sector mix and print a small text histogram.

    Parameters
    ----------
    all_portfolios : list[dict]
        Rows with at least "firm" and "sector" keys.

    Returns
    -------
    pd.DataFrame
        Columns firm/sector/count/percentage; empty when there is no input.
    """
    # Guard: groupby on an empty frame would raise KeyError (no columns exist).
    if not all_portfolios:
        print("Sector Concentration by Firm")
        return pd.DataFrame(columns=["firm", "sector", "count", "percentage"])
    df = pd.DataFrame(all_portfolios)
    sector_counts = df.groupby(["firm", "sector"]).size().reset_index(name="count")
    # Per-firm totals broadcast back onto each row to turn counts into shares.
    totals = sector_counts.groupby("firm")["count"].transform("sum")
    sector_counts["percentage"] = (sector_counts["count"] / totals * 100).round(1)
    print("Sector Concentration by Firm")
    for firm in sector_counts["firm"].unique():
        data = sector_counts[sector_counts["firm"] == firm].sort_values(
            "percentage", ascending=False
        )
        print(f"\n{firm}:")
        for _, row in data.head(5).iterrows():
            # One '#' per two percentage points caps the bar at 50 chars.
            bar = "#" * int(row["percentage"] / 2)
            print(f" {row['sector']}: {row['percentage']}% {bar}")
    return sector_counts
Co-Investment Detection
def find_coinvestments(all_portfolios):
    """Report companies that appear in more than one firm's portfolio.

    Parameters
    ----------
    all_portfolios : list[dict]
        Rows with at least "company" and "firm" keys.

    Returns
    -------
    pd.DataFrame
        Columns "company" and "firm" (firm is a sorted list of distinct firms);
        empty when there is no input.
    """
    # Guard: groupby on an empty frame would raise KeyError (no columns exist).
    if not all_portfolios:
        print("\nCo-invested companies: 0")
        return pd.DataFrame(columns=["company", "firm"])
    df = pd.DataFrame(all_portfolios)
    # Dedupe firms per company so a firm listing the same company twice
    # does not masquerade as a co-investment.
    company_firms = (
        df.groupby("company")["firm"]
        .apply(lambda firms: sorted(set(firms)))
        .reset_index()
    )
    coinvested = company_firms[company_firms["firm"].apply(len) > 1]
    print(f"\nCo-invested companies: {len(coinvested)}")
    for _, row in coinvested.iterrows():
        print(f" {row['company']}: {', '.join(row['firm'])}")
    return coinvested
Full Pipeline
def run_vc_tracker():
    """Scrape every configured firm, report portfolio diffs, and persist results.

    Returns the full list of scraped company rows and writes them to
    vc_portfolios.csv. Failures for one firm are logged and skipped so the
    remaining firms still run.
    """
    portfolio_rows = []
    for firm, url in VC_FIRMS.items():
        try:
            print(f"Scraping {firm}...")
            scraped = scrape_portfolio(firm, url)
            portfolio_rows.extend(scraped)
            delta = detect_portfolio_changes(firm, scraped)
            if delta["new"]:
                print(f" NEW investments: {', '.join(delta['new'])}")
            if delta["removed"]:
                print(f" REMOVED: {', '.join(delta['removed'])}")
            # Brief pause between firms to stay polite to the API.
            time.sleep(2)
        except Exception as e:
            print(f"Error with {firm}: {e}")
    analyze_thesis_shifts(portfolio_rows)
    find_coinvestments(portfolio_rows)
    frame = pd.DataFrame(portfolio_rows)
    frame.to_csv("vc_portfolios.csv", index=False)
    print(f"\nSaved {len(frame)} companies to vc_portfolios.csv")
    return portfolio_rows
run_vc_tracker()
Proxy Strategy
VC sites block scrapers aggressively. Use ThorData residential proxies for reliable access. Monitor your success rates with ScrapeOps to catch when sites change their structure.
Conclusion
Tracking VC portfolios reveals investment thesis shifts before they become obvious. With ScraperAPI and Python, you can build institutional-grade investment intelligence at a fraction of the cost of commercial data providers.
Top comments (0)