DEV Community

agenthustler
agenthustler

Posted on

Crunchbase Funding Data: Track Startup Investments with Python

Crunchbase is the definitive source for startup funding data — investment rounds, investors, acquisitions, and company profiles. Tracking this data helps investors, analysts, and founders understand market dynamics and spot emerging trends.

What Funding Data Can You Extract?

  • Funding rounds (seed, Series A-F, IPO)
  • Investor names and portfolios
  • Company valuations and revenue estimates
  • Acquisition data and exit events
  • Founder and executive profiles
  • Industry and geographic trends

Scraping Crunchbase Company Data

import requests
from bs4 import BeautifulSoup
import json
import time
import re

class CrunchbaseScraper:
    """Minimal HTML scraper for public Crunchbase pages.

    NOTE(review): Crunchbase uses aggressive anti-bot protection, so plain
    requests like these may be blocked in practice — confirm access strategy.
    """

    BASE_URL = "https://www.crunchbase.com"
    TIMEOUT = 15  # seconds; prevents hanging forever on a stalled connection

    def __init__(self):
        # One Session reuses connections and carries the shared headers.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
            'Accept': 'text/html,application/xhtml+xml',
        })

    def get_company_profile(self, company_slug):
        """Get company overview from Crunchbase.

        Args:
            company_slug: URL slug, e.g. "openai" for /organization/openai.

        Returns:
            dict with 'name', 'description', 'url', 'founded' (empty strings
            when no JSON-LD block parses) plus 'funding_text' when found.

        Raises:
            requests.HTTPError: on non-2xx responses (e.g. 403/404), instead
                of silently parsing an error page.
        """
        url = f"{self.BASE_URL}/organization/{company_slug}"
        resp = self.session.get(url, timeout=self.TIMEOUT)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, 'html.parser')

        # Prefer the machine-readable JSON-LD block; keep the first one
        # that parses cleanly.
        ld_data = {}
        for script in soup.find_all('script', type='application/ld+json'):
            try:
                ld_data = json.loads(script.string)
                break
            except (json.JSONDecodeError, TypeError):
                continue

        profile = {
            'name': ld_data.get('name', ''),
            'description': ld_data.get('description', ''),
            'url': ld_data.get('url', ''),
            'founded': ld_data.get('foundingDate', ''),
        }

        # Fallback: grab any funding-related text rendered in the HTML.
        funding_section = soup.select_one('[class*="funding"]')
        if funding_section:
            profile['funding_text'] = funding_section.get_text(strip=True)

        return profile

    def search_companies(self, query):
        """Search for companies by name or keyword.

        Returns:
            list of {'name', 'description', 'url'} dicts; 'url' is '' when
            the result has no usable link.
        """
        url = f"{self.BASE_URL}/textsearch"
        resp = self.session.get(url, params={'q': query}, timeout=self.TIMEOUT)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, 'html.parser')

        results = []
        for item in soup.select('[class*="search-result"]'):
            name = item.select_one('[class*="name"]')
            desc = item.select_one('[class*="description"]')
            link = item.select_one('a')

            if not name:
                continue
            # link.get avoids a KeyError on anchors without href; only
            # prefix BASE_URL when the href is relative.
            href = link.get('href', '') if link else ''
            if href and not href.startswith('http'):
                href = self.BASE_URL + href
            results.append({
                'name': name.get_text(strip=True),
                'description': desc.get_text(strip=True) if desc else '',
                'url': href,
            })

        return results
(End of code listing.)

Building a Funding Tracker

import pandas as pd
from datetime import datetime

class FundingTracker:
    """Collects company profiles via a scraper and appends them to a CSV."""

    def __init__(self, scraper, db_file='funding_data.csv', delay=3.0):
        """
        Args:
            scraper: object exposing ``get_company_profile(slug) -> dict``.
            db_file: CSV file collected rows are appended to.
            delay: seconds to sleep between requests (rate limiting);
                defaults to the original hard-coded 3 seconds.
        """
        self.scraper = scraper
        self.db_file = db_file
        self.delay = delay

    def track_companies(self, company_slugs):
        """Collect and store funding data for multiple companies.

        Returns:
            DataFrame of the rows collected in this call; failed slugs are
            reported and skipped, not recorded.
        """
        records = []
        for slug in company_slugs:
            try:
                profile = self.scraper.get_company_profile(slug)
                profile['slug'] = slug
                profile['collected_at'] = datetime.now().isoformat()
                records.append(profile)
                print(f"  Collected: {profile.get('name', slug)}")
            except Exception as e:
                # Best-effort batch: log and continue with the next slug.
                print(f"  Error with {slug}: {e}")
            time.sleep(self.delay)  # Respect rate limits

        df = pd.DataFrame(records)
        # Bug fix: the original always passed header=False, so a brand-new
        # CSV had no header row and analyze_sector's read_csv could not
        # access columns by name. Write the header only on first creation.
        import os
        write_header = not os.path.exists(self.db_file)
        df.to_csv(self.db_file, mode='a', header=write_header, index=False)
        return df

    def analyze_sector(self, sector_companies):
        """Analyze funding patterns in a sector.

        Reads the whole CSV back and filters to the given slugs.
        """
        df = pd.read_csv(self.db_file)
        sector_df = df[df['slug'].isin(sector_companies)]

        print(f"Sector analysis ({len(sector_df)} companies):")
        print(f"  Companies tracked: {sector_df['name'].nunique()}")

        return sector_df
(End of code listing.)

Investment Pattern Analysis

def analyze_investment_patterns(funding_data):
    """Find patterns in startup funding data.

    Args:
        funding_data: iterable of per-deal records (dicts). Recognized keys
            are 'funding_amount', 'stage', and 'investors' (comma-separated
            names string); each is optional.

    Returns:
        The data as an unmodified DataFrame, for further analysis.
    """
    df = pd.DataFrame(funding_data)

    # Bug fix: the original grouped by 'stage' unconditionally whenever
    # 'funding_amount' existed and raised KeyError when 'stage' was absent.
    if 'funding_amount' in df.columns and 'stage' in df.columns:
        # Average round size by stage
        stage_avg = df.groupby('stage')['funding_amount'].agg(['mean', 'median', 'count'])
        print("Funding by stage:")
        print(stage_avg.to_string())

    # Top investors by deal count
    if 'funding_amount' in df.columns and 'investors' in df.columns:
        # Investor cells hold comma-separated names; flatten then count.
        all_investors = []
        for inv_list in df['investors'].dropna():
            if isinstance(inv_list, str):
                all_investors.extend(inv_list.split(', '))

        from collections import Counter
        top_investors = Counter(all_investors).most_common(15)
        print("\nMost active investors:")
        for investor, count in top_investors:
            print(f"  {investor}: {count} deals")

    return df

def detect_hot_sectors(funding_data, window_days=90):
    """Identify sectors with increasing funding activity.

    Args:
        funding_data: iterable of deal records using 'date', 'category',
            'name', and optionally 'funding_amount'.
        window_days: size of the trailing window to inspect.

    Returns:
        DataFrame indexed by category with 'deals' and 'total_funding'
        columns (total_funding is 0 when amounts are absent), sorted by
        deal count; an empty DataFrame when there is no 'category' column.
    """
    df = pd.DataFrame(funding_data)
    # If 'date' is missing, DataFrame.get returns the scalar default and
    # "now" is broadcast onto every row.
    df['date'] = pd.to_datetime(df.get('date', datetime.now()))

    recent = df[df['date'] > datetime.now() - pd.Timedelta(days=window_days)]

    if 'category' not in recent.columns:
        return pd.DataFrame()

    # Bug fix: the original summed 'funding_amount' unconditionally and
    # raised KeyError when that column was absent.
    if 'funding_amount' not in recent.columns:
        recent = recent.assign(funding_amount=0)

    sector_activity = recent.groupby('category').agg(
        deals=('name', 'count'),
        total_funding=('funding_amount', 'sum')
    ).sort_values('deals', ascending=False)

    print(f"Hot sectors (last {window_days} days):")
    print(sector_activity.head(10).to_string())

    return sector_activity
(End of code listing.)

Building Investor Intelligence

def build_investor_profile(funding_data, investor_name):
    """Build a profile of an investor's activity.

    Args:
        funding_data: iterable of deal records with 'name' and an
            'investors' field (comma-separated names), plus optional
            'stage' and 'category'.
        investor_name: name to match (case-insensitive substring).

    Returns:
        dict with 'name', 'total_deals', 'companies', and (when the
        columns exist) 'preferred_stages' / 'preferred_sectors'; an empty
        dict when nothing matches.
    """
    df = pd.DataFrame(funding_data)

    # Bug fix: df.get('investors', '') returns the plain string '' when the
    # column is missing, and ''.str.contains(...) raises AttributeError.
    # Guard for the column explicitly instead.
    if 'investors' not in df.columns:
        print(f"No deals found for {investor_name}")
        return {}

    # Filter deals involving this investor
    investor_deals = df[
        df['investors'].str.contains(investor_name, na=False, case=False)
    ]

    if investor_deals.empty:
        print(f"No deals found for {investor_name}")
        return {}

    profile = {
        'name': investor_name,
        'total_deals': len(investor_deals),
        'companies': investor_deals['name'].tolist(),
    }

    if 'stage' in investor_deals.columns:
        profile['preferred_stages'] = investor_deals['stage'].value_counts().to_dict()

    if 'category' in investor_deals.columns:
        profile['preferred_sectors'] = investor_deals['category'].value_counts().to_dict()

    print(f"Investor: {investor_name}")
    print(f"  Total deals: {profile['total_deals']}")
    print(f"  Recent companies: {', '.join(profile['companies'][:5])}")

    return profile
(End of code listing.)

Scaling with Cloud Solutions

For comprehensive Crunchbase data covering thousands of companies, the Crunchbase Scraper on Apify handles anti-bot measures and data structuring at scale.

Proxy rotation is essential for Crunchbase scraping — ScrapeOps provides intelligent proxy management and request monitoring.

Conclusion

Crunchbase funding data reveals the hidden dynamics of startup ecosystems — where money flows, which sectors heat up, and which investors lead trends. Build your pipeline to track companies, analyze patterns, and detect emerging sectors. Start with targeted company tracking, then scale to sector-wide monitoring for comprehensive investment intelligence.

Top comments (0)