Tracking venture capital investments, funding rounds, and startup valuations gives you a powerful lens into market trends. In this tutorial, we'll build a VC portfolio tracker by scraping Crunchbase data with Python.
What We're Building
A tool that:
- Tracks funding rounds for specific companies or sectors
- Monitors investor portfolios and new deals
- Alerts on significant funding events
- Stores historical data for trend analysis
Project Setup
pip install requests beautifulsoup4 pandas
(No need to install sqlite3 — it ships with Python's standard library.)
For scraping Crunchbase reliably, you'll need good proxy rotation. ScraperAPI handles JavaScript rendering and CAPTCHAs, which Crunchbase uses heavily.
The Scraper Core
import re
import sqlite3
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup
SCRAPER_API_KEY = "YOUR_KEY"
def fetch_page(url):
    """Fetch *url* through ScraperAPI and return the parsed document.

    JavaScript rendering is enabled because Crunchbase builds its pages
    client-side.

    Args:
        url: Target page URL to fetch.

    Returns:
        A BeautifulSoup tree of the rendered HTML.

    Raises:
        requests.HTTPError: If ScraperAPI responds with an error status.
    """
    query = {
        "api_key": SCRAPER_API_KEY,
        "url": url,
        "render": "true",
    }
    resp = requests.get("http://api.scraperapi.com", params=query)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "html.parser")
def scrape_funding_rounds(company_slug):
    """Collect funding-round rows for one company.

    Args:
        company_slug: Crunchbase organization slug, e.g. "openai".

    Returns:
        A list of dicts, one per funding round, each tagged with the
        time it was scraped.
    """
    page = fetch_page(
        f"https://www.crunchbase.com/organization/{company_slug}/funding_rounds"
    )
    collected = []
    # Skip the first <tr>: it is the table header.
    for table_row in page.select("table.funding-rounds tr")[1:]:
        cells = table_row.select("td")
        if len(cells) < 4:
            continue  # Spacer or malformed row — nothing usable.
        date_cell, type_cell, amount_cell, lead_cell = cells[:4]
        collected.append({
            "company": company_slug,
            "date": date_cell.text.strip(),
            "round_type": type_cell.text.strip(),
            "amount": amount_cell.text.strip(),
            "lead_investor": lead_cell.text.strip(),
            "scraped_at": datetime.now().isoformat(),
        })
    return collected
Database Storage
def init_db(db_path="vc_tracker.db"):
    """Open the tracker database, creating the schema if needed.

    Args:
        db_path: SQLite file path; defaults to "vc_tracker.db".

    Returns:
        An open sqlite3.Connection with the funding_rounds table present.
        The UNIQUE constraint is what lets store_rounds() detect
        already-seen rounds.
    """
    schema = """
        CREATE TABLE IF NOT EXISTS funding_rounds (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            company TEXT,
            date TEXT,
            round_type TEXT,
            amount TEXT,
            lead_investor TEXT,
            scraped_at TEXT,
            UNIQUE(company, date, round_type)
        )
    """
    connection = sqlite3.connect(db_path)
    connection.execute(schema)
    connection.commit()
    return connection
def store_rounds(conn, rounds):
    """Insert *rounds*, skipping duplicates; return only the new ones.

    A row whose (company, date, round_type) already exists violates the
    table's UNIQUE constraint and is silently skipped.

    Args:
        conn: Open sqlite3 connection with the funding_rounds table.
        rounds: Iterable of round dicts as produced by the scraper.

    Returns:
        The subset of *rounds* that was actually inserted.
    """
    fields = ("company", "date", "round_type", "amount",
              "lead_investor", "scraped_at")
    insert_sql = """INSERT INTO funding_rounds
    (company, date, round_type, amount, lead_investor, scraped_at)
    VALUES (?, ?, ?, ?, ?, ?)"""
    inserted = []
    for record in rounds:
        try:
            conn.execute(insert_sql, tuple(record[f] for f in fields))
        except sqlite3.IntegrityError:
            continue  # Duplicate round — already in the database.
        inserted.append(record)
    conn.commit()
    return inserted
Tracking Investor Portfolios
def scrape_investor_portfolio(investor_slug):
    """Scrape an investor's recent investments.

    Args:
        investor_slug: Crunchbase organization slug of the investor.

    Returns:
        A list of dicts, one per investment card that names a company;
        missing amount/date fall back to "Undisclosed"/"Unknown".
    """
    page = fetch_page(
        f"https://www.crunchbase.com/organization/{investor_slug}/recent_investments"
    )
    results = []
    for card in page.select(".investment-card"):
        name_el = card.select_one(".company-name")
        if name_el is None:
            continue  # A card without a company name carries no usable data.
        amount_el = card.select_one(".investment-amount")
        date_el = card.select_one(".investment-date")
        results.append({
            "investor": investor_slug,
            "company": name_el.text.strip(),
            "amount": amount_el.text.strip() if amount_el else "Undisclosed",
            "date": date_el.text.strip() if date_el else "Unknown",
        })
    return results
Building the Monitoring Loop
import time

# Slugs polled on every tracker pass. "companies" feeds
# scrape_funding_rounds(); "investors" feeds scrape_investor_portfolio().
WATCHLIST = {
    "companies": ["openai", "anthropic", "mistral-ai", "cohere"],
    "investors": ["sequoia-capital", "a16z", "greylock-partners"]
}
def run_tracker():
    """Run one full pass over the watchlist, printing new funding events.

    For each watched company, scrapes its funding rounds, stores them,
    and prints any rows not previously seen; then prints a summary count
    of each watched investor's recent deals. A failure on one slug is
    reported and does not abort the rest of the pass.
    """
    conn = init_db()
    # Fix: the connection was previously never closed (leaked on every run).
    try:
        for company in WATCHLIST["companies"]:
            try:
                rounds = scrape_funding_rounds(company)
                for r in store_rounds(conn, rounds):
                    print(f"NEW: {r['company']} - {r['round_type']} - {r['amount']}")
            except Exception as e:
                # Keep going: one bad page shouldn't kill the whole pass.
                print(f"Error scraping {company}: {e}")
            time.sleep(5)  # Be respectful
        for investor in WATCHLIST["investors"]:
            try:
                portfolio = scrape_investor_portfolio(investor)
                print(f"{investor}: {len(portfolio)} recent investments")
            except Exception as e:
                print(f"Error scraping {investor}: {e}")
            time.sleep(5)
    finally:
        conn.close()
# Entry point: one scrape pass per invocation (schedule via cron for daily runs).
if __name__ == "__main__":
    run_tracker()
Analysis and Trends
def analyze_sector_trends(conn, sector_keywords):
    """Summarize stored funding rounds, optionally filtered by sector.

    Fixes two defects in the original: `sector_keywords` was accepted but
    never used, and a malformed amount like "$M" raised an uncaught
    ValueError inside the suffix loop.

    Args:
        conn: Open sqlite3 connection holding the funding_rounds table.
        sector_keywords: Iterable of case-insensitive substrings matched
            against company slugs (e.g. ["ai", "fintech"]). Empty or
            falsy means no filtering — analyze everything.

    Returns:
        The (possibly filtered) DataFrame with an added numeric
        "amount_num" column, so callers can analyze further.
    """
    df = pd.read_sql("SELECT * FROM funding_rounds", conn)

    def _parse_amount(raw):
        # Convert display strings like "$1.5M" into a float dollar value;
        # anything non-numeric ("Undisclosed", "") becomes 0.
        amt = raw.replace("$", "").replace(",", "").strip()
        multipliers = {"K": 1e3, "M": 1e6, "B": 1e9}
        for suffix, mult in multipliers.items():
            if amt.endswith(suffix):
                try:
                    return float(amt[:-1]) * mult
                except ValueError:
                    return 0
        try:
            return float(amt)
        except ValueError:
            return 0

    # Restrict to companies whose slug mentions any of the keywords.
    if sector_keywords:
        pattern = "|".join(re.escape(k) for k in sector_keywords)
        df = df[df["company"].str.contains(pattern, case=False, na=False)].copy()

    df["amount_num"] = df["amount"].apply(_parse_amount)
    print(f"Total rounds tracked: {len(df)}")
    print(f"Total funding: ${df['amount_num'].sum():,.0f}")
    print(f"Average round: ${df['amount_num'].mean():,.0f}")
    print(f"\nBy round type:")
    print(df.groupby("round_type")["amount_num"].agg(["count", "mean", "sum"]))
    return df
Proxy Tips for Crunchbase
Crunchbase has strong anti-bot measures. ScraperAPI with render=true handles most cases. For higher volume, consider ThorData residential proxies. Monitor your success rates with ScrapeOps.
Conclusion
With this tracker running daily, you'll spot funding trends before they make headlines. The combination of systematic scraping and SQL storage lets you build a proprietary dataset that most analysts don't have access to.
Always check Crunchbase's terms of service and use their official API where possible. Scraping should supplement, not replace, legitimate API access.
Top comments (0)