Why Track VC Portfolios
Venture capital firms signal their investment thesis through their portfolios. When a16z makes three AI infrastructure investments in a month, it reveals where smart money is flowing.
Let's build a scraper that tracks VC portfolio changes over time and detects thesis shifts.
What We Monitor
- New portfolio additions and removals
- Investment stage patterns (seed, Series A, growth)
- Sector concentration shifts
- Co-investment patterns between firms
Setup
pip install requests beautifulsoup4 pandas
VC firm websites use modern JavaScript frameworks. ScraperAPI renders these pages fully before extraction.
Scraping VC Portfolio Pages
import json
import os
import time
from datetime import datetime
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup
import pandas as pd
# ScraperAPI credential; replace with a real key (and keep it out of source control).
SCRAPER_API_KEY = "YOUR_KEY"
# Firms to monitor, mapped to their public portfolio pages.
VC_FIRMS = {
"a16z": "https://a16z.com/portfolio/",
"sequoia": "https://www.sequoiacap.com/our-companies/",
"greylock": "https://greylock.com/portfolio/",
"benchmark": "https://www.benchmark.com/portfolio/"
}
def scrape_portfolio(firm_name, url):
    """Fetch a firm's portfolio page via ScraperAPI and extract company cards.

    Parameters
    ----------
    firm_name : str
        Label stored on each result row.
    url : str
        Portfolio page to render and scrape.

    Returns
    -------
    list[dict]
        One dict per company card with firm/company/sector/description/scraped_at.

    Raises
    ------
    requests.HTTPError
        If ScraperAPI returns a non-2xx status.
    """
    # render=true asks ScraperAPI to execute the page's JavaScript first;
    # quote() percent-encodes the target URL so its own query string or
    # fragments cannot corrupt the ScraperAPI request.
    api_url = (
        "http://api.scraperapi.com"
        f"?api_key={SCRAPER_API_KEY}&url={quote(url, safe='')}&render=true"
    )
    # Rendered requests are slow; a generous timeout still beats hanging forever.
    response = requests.get(api_url, timeout=60)
    response.raise_for_status()  # fail loudly instead of parsing an error page
    soup = BeautifulSoup(response.text, "html.parser")
    companies = []
    # Deliberately broad selector list: each firm uses different markup.
    for card in soup.select(".portfolio-card, .company-card, [class*=portfolio]"):
        name = card.select_one("h3, h2, .company-name")
        sector = card.select_one(".sector, .category")
        desc = card.select_one("p, .description")
        if name:  # skip cards with no recognizable company name
            companies.append({
                "firm": firm_name,
                "company": name.text.strip(),
                "sector": sector.text.strip() if sector else "Unknown",
                "description": desc.text.strip()[:200] if desc else "",
                "scraped_at": datetime.now().isoformat()
            })
    return companies
Tracking Portfolio Changes
# Snapshot of the last-seen portfolio per firm, persisted between runs.
HISTORY_FILE = "vc_portfolio_history.json"

def _load_history():
    """Return the saved snapshot dict, or {} when missing, corrupt, or unreadable."""
    if os.path.exists(HISTORY_FILE):
        try:
            with open(HISTORY_FILE, encoding="utf-8") as f:
                return json.load(f)
        except (json.JSONDecodeError, OSError):
            # A corrupt history file should not kill the whole run;
            # treat it as "no prior snapshot" and rebuild from scratch.
            return {}
    return {}

def detect_portfolio_changes(firm, current_companies):
    """Diff the current scrape of *firm* against the last saved snapshot.

    Updates HISTORY_FILE in place with the new snapshot.

    Parameters
    ----------
    firm : str
        Firm key used in the history file.
    current_companies : list[dict]
        Rows from scrape_portfolio(); only the "company" key is read.

    Returns
    -------
    dict with keys "firm", "new", "removed", "total".
    """
    history = _load_history()
    prev_names = set(history.get(firm, {}).get("companies", []))
    curr_names = {c["company"] for c in current_companies}
    # Persist the new snapshot before returning so the next run diffs against it.
    history[firm] = {
        "companies": sorted(curr_names),
        "last_updated": datetime.now().isoformat()
    }
    with open(HISTORY_FILE, "w", encoding="utf-8") as f:
        json.dump(history, f, indent=2)
    return {
        "firm": firm,
        "new": list(curr_names - prev_names),
        "removed": list(prev_names - curr_names),
        "total": len(curr_names)
    }
Sector Concentration Analysis
def analyze_thesis_shifts(all_portfolios):
    """Summarize each firm's sector mix and print a small text histogram.

    Parameters
    ----------
    all_portfolios : list[dict]
        Rows with at least "firm" and "sector" keys.

    Returns
    -------
    pd.DataFrame
        Columns firm/sector/count/percentage; empty when there is no input.
    """
    # Guard: groupby on an empty frame would raise KeyError (no columns exist).
    if not all_portfolios:
        print("Sector Concentration by Firm")
        return pd.DataFrame(columns=["firm", "sector", "count", "percentage"])
    df = pd.DataFrame(all_portfolios)
    sector_counts = df.groupby(["firm", "sector"]).size().reset_index(name="count")
    # Per-firm totals broadcast back onto each row to turn counts into shares.
    totals = sector_counts.groupby("firm")["count"].transform("sum")
    sector_counts["percentage"] = (sector_counts["count"] / totals * 100).round(1)
    print("Sector Concentration by Firm")
    for firm in sector_counts["firm"].unique():
        data = sector_counts[sector_counts["firm"] == firm].sort_values(
            "percentage", ascending=False
        )
        print(f"\n{firm}:")
        for _, row in data.head(5).iterrows():
            # One '#' per two percentage points caps the bar at 50 chars.
            bar = "#" * int(row["percentage"] / 2)
            print(f" {row['sector']}: {row['percentage']}% {bar}")
    return sector_counts
Co-Investment Detection
def find_coinvestments(all_portfolios):
    """Report companies that appear in more than one firm's portfolio.

    Parameters
    ----------
    all_portfolios : list[dict]
        Rows with at least "company" and "firm" keys.

    Returns
    -------
    pd.DataFrame
        Columns "company" and "firm" (firm is a sorted list of distinct firms);
        empty when there is no input.
    """
    # Guard: groupby on an empty frame would raise KeyError (no columns exist).
    if not all_portfolios:
        print("\nCo-invested companies: 0")
        return pd.DataFrame(columns=["company", "firm"])
    df = pd.DataFrame(all_portfolios)
    # Dedupe firms per company so a firm listing the same company twice
    # does not masquerade as a co-investment.
    company_firms = (
        df.groupby("company")["firm"]
        .apply(lambda firms: sorted(set(firms)))
        .reset_index()
    )
    coinvested = company_firms[company_firms["firm"].apply(len) > 1]
    print(f"\nCo-invested companies: {len(coinvested)}")
    for _, row in coinvested.iterrows():
        print(f" {row['company']}: {', '.join(row['firm'])}")
    return coinvested
Full Pipeline
def run_vc_tracker():
    """Scrape every configured firm, report portfolio diffs, and persist results.

    Returns the full list of scraped company rows and writes them to
    vc_portfolios.csv. Failures for one firm are logged and skipped so the
    remaining firms still run.
    """
    portfolio_rows = []
    for firm, url in VC_FIRMS.items():
        try:
            print(f"Scraping {firm}...")
            scraped = scrape_portfolio(firm, url)
            portfolio_rows.extend(scraped)
            delta = detect_portfolio_changes(firm, scraped)
            if delta["new"]:
                print(f" NEW investments: {', '.join(delta['new'])}")
            if delta["removed"]:
                print(f" REMOVED: {', '.join(delta['removed'])}")
            # Brief pause between firms to stay polite to the API.
            time.sleep(2)
        except Exception as e:
            print(f"Error with {firm}: {e}")
    analyze_thesis_shifts(portfolio_rows)
    find_coinvestments(portfolio_rows)
    frame = pd.DataFrame(portfolio_rows)
    frame.to_csv("vc_portfolios.csv", index=False)
    print(f"\nSaved {len(frame)} companies to vc_portfolios.csv")
    return portfolio_rows
run_vc_tracker()
Proxy Strategy
VC sites block scrapers aggressively. Use ThorData residential proxies for reliable access. Monitor your success rates with ScrapeOps to catch when sites change their structure.
Conclusion
Tracking VC portfolios reveals investment thesis shifts before they become obvious. With ScraperAPI and Python, you can build institutional-grade investment intelligence at a fraction of the cost of commercial data providers.
Top comments (0)