DEV Community

agenthustler
agenthustler

Posted on

Scraping VC Portfolios to Track Investment Thesis Shifts

Why Track VC Portfolios

Venture capital firms signal their investment thesis through their portfolios. When a16z makes three AI infrastructure investments in a month, it reveals where smart money is flowing.

Let's build a scraper that tracks VC portfolio changes over time and detects thesis shifts.

What We Monitor

  • New portfolio additions and removals
  • Investment stage patterns (seed, Series A, growth)
  • Sector concentration shifts
  • Co-investment patterns between firms

Setup

pip install requests beautifulsoup4 pandas
Enter fullscreen mode Exit fullscreen mode

VC firm websites use modern JavaScript frameworks. ScraperAPI renders these pages fully before extraction.

Scraping VC Portfolio Pages

import json
import os
import time
from datetime import datetime
from urllib.parse import urlencode

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Prefer the environment variable so the key is not committed to source
# control; the placeholder keeps the script runnable for demonstration.
SCRAPER_API_KEY = os.environ.get("SCRAPER_API_KEY", "YOUR_KEY")

# Portfolio pages to monitor, keyed by a short firm identifier.
VC_FIRMS = {
    "a16z": "https://a16z.com/portfolio/",
    "sequoia": "https://www.sequoiacap.com/our-companies/",
    "greylock": "https://greylock.com/portfolio/",
    "benchmark": "https://www.benchmark.com/portfolio/"
}

def scrape_portfolio(firm_name, url):
    """Fetch a VC firm's portfolio page via ScraperAPI and extract company cards.

    Args:
        firm_name: Short identifier for the firm (e.g. "a16z").
        url: Portfolio page URL to render and scrape.

    Returns:
        List of dicts with keys: firm, company, sector, description, scraped_at.

    Raises:
        requests.HTTPError: if ScraperAPI returns a non-2xx status.
        requests.Timeout: if the rendered fetch exceeds the timeout.
    """
    # URL-encode the parameters so query strings inside `url` cannot corrupt
    # the ScraperAPI request; render=true asks ScraperAPI to execute the
    # page's JavaScript before returning HTML. Use HTTPS so the API key is
    # not sent in plaintext.
    params = {"api_key": SCRAPER_API_KEY, "url": url, "render": "true"}
    api_url = "https://api.scraperapi.com/?" + urlencode(params)
    # Rendered fetches are slow; timeout prevents a hung request from
    # stalling the whole run. raise_for_status stops us from "parsing"
    # an error page as if it were a portfolio.
    response = requests.get(api_url, timeout=90)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    companies = []
    # Selectors are intentionally broad: each firm's site uses different
    # markup, so we match several common card/class patterns.
    for card in soup.select(".portfolio-card, .company-card, [class*=portfolio]"):
        name = card.select_one("h3, h2, .company-name")
        sector = card.select_one(".sector, .category")
        desc = card.select_one("p, .description")
        if name:
            companies.append({
                "firm": firm_name,
                "company": name.text.strip(),
                "sector": sector.text.strip() if sector else "Unknown",
                "description": desc.text.strip()[:200] if desc else "",
                "scraped_at": datetime.now().isoformat()
            })
    return companies
Enter fullscreen mode Exit fullscreen mode

Tracking Portfolio Changes

HISTORY_FILE = "vc_portfolio_history.json"

def detect_portfolio_changes(firm, current_companies):
    """Diff the current portfolio against the last stored snapshot.

    Args:
        firm: Firm identifier used as the key in the history file.
        current_companies: List of company dicts (each needs a "company" key).

    Returns:
        Dict with the firm, sorted lists of newly added and removed company
        names, and the total company count. On the first run for a firm,
        every company is reported as new.
    """
    # Load prior snapshots; a missing or corrupted history file just means
    # we start fresh instead of crashing the whole tracking run.
    history = {}
    if os.path.exists(HISTORY_FILE):
        try:
            with open(HISTORY_FILE) as f:
                history = json.load(f)
        except (json.JSONDecodeError, OSError):
            history = {}

    prev_names = set(history.get(firm, {}).get("companies", []))
    curr_names = {c["company"] for c in current_companies}

    # Persist the new snapshot so the next run diffs against today's state.
    # Sorting keeps the on-disk file deterministic and diffs readable.
    history[firm] = {
        "companies": sorted(curr_names),
        "last_updated": datetime.now().isoformat()
    }
    with open(HISTORY_FILE, "w") as f:
        json.dump(history, f, indent=2)

    return {
        "firm": firm,
        "new": sorted(curr_names - prev_names),
        "removed": sorted(prev_names - curr_names),
        "total": len(curr_names)
    }
Enter fullscreen mode Exit fullscreen mode

Sector Concentration Analysis

def analyze_thesis_shifts(all_portfolios):
    """Compute and print each firm's sector concentration.

    Args:
        all_portfolios: List of company dicts with "firm" and "sector" keys.

    Returns:
        DataFrame with columns firm, sector, count, percentage (percentage
        is within-firm, so each firm's rows sum to ~100).
    """
    # Guard the empty case: groupby on a column-less DataFrame raises
    # KeyError, which would otherwise kill a run where all scrapes failed.
    if not all_portfolios:
        print("Sector Concentration by Firm")
        print("  (no portfolio data)")
        return pd.DataFrame(columns=["firm", "sector", "count", "percentage"])

    df = pd.DataFrame(all_portfolios)
    sector_counts = df.groupby(["firm", "sector"]).size().reset_index(name="count")
    # Normalize within each firm so concentrations are comparable across
    # firms with different portfolio sizes.
    totals = sector_counts.groupby("firm")["count"].transform("sum")
    sector_counts["percentage"] = (sector_counts["count"] / totals * 100).round(1)

    print("Sector Concentration by Firm")
    for firm in sector_counts["firm"].unique():
        data = sector_counts[sector_counts["firm"] == firm].sort_values(
            "percentage", ascending=False
        )
        print(f"\n{firm}:")
        for _, row in data.head(5).iterrows():
            bar = "#" * int(row["percentage"] / 2)  # crude ASCII bar chart
            print(f"  {row['sector']}: {row['percentage']}% {bar}")
    return sector_counts
Enter fullscreen mode Exit fullscreen mode

Co-Investment Detection

def find_coinvestments(all_portfolios):
    """Find companies backed by more than one tracked firm.

    Args:
        all_portfolios: List of company dicts with "company" and "firm" keys.

    Returns:
        DataFrame of co-invested companies, each with its sorted list of
        distinct backing firms.
    """
    # Empty-input guard: groupby on a column-less DataFrame raises KeyError.
    if not all_portfolios:
        print("\nCo-invested companies: 0")
        return pd.DataFrame(columns=["company", "firm"])

    df = pd.DataFrame(all_portfolios)
    # Deduplicate firms per company: the same firm listing a company twice
    # (e.g. duplicate cards on its page) must not count as a co-investment.
    company_firms = (
        df.groupby("company")["firm"]
        .apply(lambda firms: sorted(set(firms)))
        .reset_index()
    )
    coinvested = company_firms[company_firms["firm"].apply(len) > 1]

    print(f"\nCo-invested companies: {len(coinvested)}")
    for _, row in coinvested.iterrows():
        print(f"  {row['company']}: {', '.join(row['firm'])}")
    return coinvested
Enter fullscreen mode Exit fullscreen mode

Full Pipeline

def run_vc_tracker():
    """Scrape every configured firm, diff its portfolio, and write a CSV report.

    Returns the combined list of company dicts scraped across all firms.
    """
    all_companies = []
    changes_summary = []

    for firm, url in VC_FIRMS.items():
        try:
            print(f"Scraping {firm}...")
            scraped = scrape_portfolio(firm, url)
            all_companies.extend(scraped)

            diff = detect_portfolio_changes(firm, scraped)
            changes_summary.append(diff)

            added, dropped = diff["new"], diff["removed"]
            if added:
                print(f"  NEW investments: {', '.join(added)}")
            if dropped:
                print(f"  REMOVED: {', '.join(dropped)}")
            time.sleep(2)  # pause between firms
        except Exception as e:
            # One firm failing (blocked, layout change) must not abort the rest.
            print(f"Error with {firm}: {e}")

    analyze_thesis_shifts(all_companies)
    find_coinvestments(all_companies)

    df = pd.DataFrame(all_companies)
    df.to_csv("vc_portfolios.csv", index=False)
    print(f"\nSaved {len(df)} companies to vc_portfolios.csv")
    return all_companies

# Only run when executed as a script, so the module can be imported
# (e.g. for testing or scheduling) without immediately scraping.
if __name__ == "__main__":
    run_vc_tracker()
Enter fullscreen mode Exit fullscreen mode

Proxy Strategy

VC sites block scrapers aggressively. Use ThorData residential proxies for reliable access. Monitor your success rates with ScrapeOps to catch when sites change their structure.

Conclusion

Tracking VC portfolios reveals investment thesis shifts before they become obvious. With ScraperAPI and Python, you can build institutional-grade investment intelligence for free.

Top comments (0)