Tracking venture capital investments, funding rounds, and startup valuations gives you a powerful lens into market trends. In this tutorial, we'll build a VC portfolio tracker by scraping Crunchbase data with Python.
What We're Building
A tool that:
- Tracks funding rounds for specific companies or sectors
- Monitors investor portfolios and new deals
- Alerts on significant funding events
- Stores historical data for trend analysis
Project Setup
pip install requests beautifulsoup4 pandas
(No need to install sqlite3 — it ships with Python's standard library.)
For scraping Crunchbase reliably, you'll need good proxy rotation. ScraperAPI handles JavaScript rendering and CAPTCHAs, which Crunchbase uses heavily.
The Scraper Core
import re
import sqlite3
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup
SCRAPER_API_KEY = "YOUR_KEY"
def fetch_page(url):
    """Fetch *url* through ScraperAPI and return the parsed document.

    JavaScript rendering is enabled because Crunchbase builds its pages
    client-side.

    Args:
        url: Target page URL to fetch.

    Returns:
        A BeautifulSoup tree of the rendered HTML.

    Raises:
        requests.HTTPError: If ScraperAPI responds with an error status.
    """
    query = {
        "api_key": SCRAPER_API_KEY,
        "url": url,
        "render": "true",
    }
    resp = requests.get("http://api.scraperapi.com", params=query)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "html.parser")
def scrape_funding_rounds(company_slug):
    """Collect funding-round rows for one company.

    Args:
        company_slug: Crunchbase organization slug, e.g. "openai".

    Returns:
        A list of dicts, one per funding round, each tagged with the
        time it was scraped.
    """
    page = fetch_page(
        f"https://www.crunchbase.com/organization/{company_slug}/funding_rounds"
    )
    collected = []
    # Skip the first <tr>: it is the table header.
    for table_row in page.select("table.funding-rounds tr")[1:]:
        cells = table_row.select("td")
        if len(cells) < 4:
            continue  # Spacer or malformed row — nothing usable.
        date_cell, type_cell, amount_cell, lead_cell = cells[:4]
        collected.append({
            "company": company_slug,
            "date": date_cell.text.strip(),
            "round_type": type_cell.text.strip(),
            "amount": amount_cell.text.strip(),
            "lead_investor": lead_cell.text.strip(),
            "scraped_at": datetime.now().isoformat(),
        })
    return collected
Database Storage
def init_db(db_path="vc_tracker.db"):
    """Open the tracker database, creating the schema if needed.

    Args:
        db_path: SQLite file path; defaults to "vc_tracker.db".

    Returns:
        An open sqlite3.Connection with the funding_rounds table present.
        The UNIQUE constraint is what lets store_rounds() detect
        already-seen rounds.
    """
    schema = """
        CREATE TABLE IF NOT EXISTS funding_rounds (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            company TEXT,
            date TEXT,
            round_type TEXT,
            amount TEXT,
            lead_investor TEXT,
            scraped_at TEXT,
            UNIQUE(company, date, round_type)
        )
    """
    connection = sqlite3.connect(db_path)
    connection.execute(schema)
    connection.commit()
    return connection
def store_rounds(conn, rounds):
    """Insert *rounds*, skipping duplicates; return only the new ones.

    A row whose (company, date, round_type) already exists violates the
    table's UNIQUE constraint and is silently skipped.

    Args:
        conn: Open sqlite3 connection with the funding_rounds table.
        rounds: Iterable of round dicts as produced by the scraper.

    Returns:
        The subset of *rounds* that was actually inserted.
    """
    fields = ("company", "date", "round_type", "amount",
              "lead_investor", "scraped_at")
    insert_sql = """INSERT INTO funding_rounds
    (company, date, round_type, amount, lead_investor, scraped_at)
    VALUES (?, ?, ?, ?, ?, ?)"""
    inserted = []
    for record in rounds:
        try:
            conn.execute(insert_sql, tuple(record[f] for f in fields))
        except sqlite3.IntegrityError:
            continue  # Duplicate round — already in the database.
        inserted.append(record)
    conn.commit()
    return inserted
Tracking Investor Portfolios
def scrape_investor_portfolio(investor_slug):
    """Scrape an investor's recent investments.

    Args:
        investor_slug: Crunchbase organization slug of the investor.

    Returns:
        A list of dicts, one per investment card that names a company;
        missing amount/date fall back to "Undisclosed"/"Unknown".
    """
    page = fetch_page(
        f"https://www.crunchbase.com/organization/{investor_slug}/recent_investments"
    )
    results = []
    for card in page.select(".investment-card"):
        name_el = card.select_one(".company-name")
        if name_el is None:
            continue  # A card without a company name carries no usable data.
        amount_el = card.select_one(".investment-amount")
        date_el = card.select_one(".investment-date")
        results.append({
            "investor": investor_slug,
            "company": name_el.text.strip(),
            "amount": amount_el.text.strip() if amount_el else "Undisclosed",
            "date": date_el.text.strip() if date_el else "Unknown",
        })
    return results
Building the Monitoring Loop
import time

# Slugs polled on every tracker pass. "companies" feeds
# scrape_funding_rounds(); "investors" feeds scrape_investor_portfolio().
WATCHLIST = {
    "companies": ["openai", "anthropic", "mistral-ai", "cohere"],
    "investors": ["sequoia-capital", "a16z", "greylock-partners"]
}
def run_tracker():
    """Run one full pass over the watchlist, printing new funding events.

    For each watched company, scrapes its funding rounds, stores them,
    and prints any rows not previously seen; then prints a summary count
    of each watched investor's recent deals. A failure on one slug is
    reported and does not abort the rest of the pass.
    """
    conn = init_db()
    # Fix: the connection was previously never closed (leaked on every run).
    try:
        for company in WATCHLIST["companies"]:
            try:
                rounds = scrape_funding_rounds(company)
                for r in store_rounds(conn, rounds):
                    print(f"NEW: {r['company']} - {r['round_type']} - {r['amount']}")
            except Exception as e:
                # Keep going: one bad page shouldn't kill the whole pass.
                print(f"Error scraping {company}: {e}")
            time.sleep(5)  # Be respectful
        for investor in WATCHLIST["investors"]:
            try:
                portfolio = scrape_investor_portfolio(investor)
                print(f"{investor}: {len(portfolio)} recent investments")
            except Exception as e:
                print(f"Error scraping {investor}: {e}")
            time.sleep(5)
    finally:
        conn.close()
# Entry point: one scrape pass per invocation (schedule via cron for daily runs).
if __name__ == "__main__":
    run_tracker()
Analysis and Trends
def analyze_sector_trends(conn, sector_keywords):
    """Summarize stored funding rounds, optionally filtered by sector.

    Fixes two defects in the original: `sector_keywords` was accepted but
    never used, and a malformed amount like "$M" raised an uncaught
    ValueError inside the suffix loop.

    Args:
        conn: Open sqlite3 connection holding the funding_rounds table.
        sector_keywords: Iterable of case-insensitive substrings matched
            against company slugs (e.g. ["ai", "fintech"]). Empty or
            falsy means no filtering — analyze everything.

    Returns:
        The (possibly filtered) DataFrame with an added numeric
        "amount_num" column, so callers can analyze further.
    """
    df = pd.read_sql("SELECT * FROM funding_rounds", conn)

    def _parse_amount(raw):
        # Convert display strings like "$1.5M" into a float dollar value;
        # anything non-numeric ("Undisclosed", "") becomes 0.
        amt = raw.replace("$", "").replace(",", "").strip()
        multipliers = {"K": 1e3, "M": 1e6, "B": 1e9}
        for suffix, mult in multipliers.items():
            if amt.endswith(suffix):
                try:
                    return float(amt[:-1]) * mult
                except ValueError:
                    return 0
        try:
            return float(amt)
        except ValueError:
            return 0

    # Restrict to companies whose slug mentions any of the keywords.
    if sector_keywords:
        pattern = "|".join(re.escape(k) for k in sector_keywords)
        df = df[df["company"].str.contains(pattern, case=False, na=False)].copy()

    df["amount_num"] = df["amount"].apply(_parse_amount)
    print(f"Total rounds tracked: {len(df)}")
    print(f"Total funding: ${df['amount_num'].sum():,.0f}")
    print(f"Average round: ${df['amount_num'].mean():,.0f}")
    print(f"\nBy round type:")
    print(df.groupby("round_type")["amount_num"].agg(["count", "mean", "sum"]))
    return df
Proxy Tips for Crunchbase
Crunchbase has strong anti-bot measures. ScraperAPI with render=true handles most cases. For higher volume, consider ThorData residential proxies. Monitor your success rates with ScrapeOps.
Conclusion
With this tracker running daily, you'll spot funding trends before they make headlines. The combination of systematic scraping and SQL storage lets you build a proprietary dataset that most analysts don't have access to.
Always check Crunchbase's terms of service and use their official API where possible. Scraping should supplement, not replace, legitimate API access.
Top comments (0)