How to Scrape Crunchbase: Startup Funding and Investor Data

#python #tutorial #webdev #programming

Crunchbase is the go-to database for startup funding data. Whether you're a VC analyst or a founder doing competitive research, Crunchbase data is invaluable.

Using Crunchbase's API

import requests

class CrunchbaseScraper:
    """Small client for Crunchbase API v4 organization search and lookup.

    Args:
        api_key: Crunchbase user key, sent in the ``X-cb-user-key`` header.
        session: Optional object exposing a ``requests``-style ``get``
            method, e.g. a pre-configured ``requests.Session`` (retries,
            proxies). A fresh ``requests.Session`` is created when omitted.
        timeout: Seconds to wait on each HTTP request before giving up.
    """

    # Properties requested by ``get_org``.
    ORG_FIELDS = ('short_description', 'founded_on', 'funding_total', 'website_url')

    def __init__(self, api_key, session=None, timeout=10):
        self.key = api_key
        self.base = 'https://api.crunchbase.com/api/v4'
        self.headers = {'X-cb-user-key': self.key}
        self.timeout = timeout
        self.session = session if session is not None else requests.Session()

    def _get_json(self, path, params):
        """GET ``path`` below the API base and return the decoded JSON body.

        Raises whatever ``raise_for_status`` raises on a 4xx/5xx response.
        """
        resp = self.session.get(f'{self.base}/{path}', headers=self.headers,
                                params=params, timeout=self.timeout)
        # Without this an auth or rate-limit error would look like "no results".
        resp.raise_for_status()
        return resp.json()

    def search(self, query, limit=25):
        """Autocomplete organizations matching ``query``.

        Args:
            query: Free-text search string.
            limit: Maximum number of suggestions to request.

        Returns:
            A list of ``{'name', 'permalink'}`` dicts; a value is ``None``
            when the API omits it from a record.
        """
        data = self._get_json('autocompletes', {
            'query': query,
            'collection_ids': 'organizations',
            'limit': limit,
        })
        results = []
        for entity in data.get('entities') or []:
            ident = entity.get('identifier') or {}
            results.append({
                'name': ident.get('value'),
                'permalink': ident.get('permalink'),
            })
        return results

    def get_org(self, permalink):
        """Fetch the core profile fields of one organization.

        Args:
            permalink: Organization permalink as returned by ``search``.

        Returns:
            A dict with ``description``, ``founded``, ``funding`` and
            ``website`` keys; ``funding`` is ``{}`` when not reported.
        """
        # A list here would be sent as repeated ``field_ids=`` keys; the API
        # takes a single comma-separated value.
        data = self._get_json(f'entities/organizations/{permalink}',
                              {'field_ids': ','.join(self.ORG_FIELDS)})
        p = data.get('properties') or {}
        return {
            'description': p.get('short_description'),
            'founded': p.get('founded_on'),
            'funding': p.get('funding_total') or {},
            'website': p.get('website_url'),
        }

Scraping Funding Rounds

class FundingScraper:
    """Client for funding-round data from Crunchbase API v4.

    Args:
        api_key: Crunchbase user key, sent in the ``X-cb-user-key`` header.
        session: Optional object exposing ``requests``-style ``get`` and
            ``post`` methods, e.g. a pre-configured ``requests.Session``.
            A fresh ``requests.Session`` is created when omitted.
        timeout: Seconds to wait on each HTTP request before giving up.
    """

    # Properties requested per round by ``get_rounds``.
    ROUND_FIELDS = ('announced_on', 'money_raised', 'investment_type',
                    'investor_identifiers')

    def __init__(self, api_key, session=None, timeout=10):
        self.key = api_key
        self.base = 'https://api.crunchbase.com/api/v4'
        self.headers = {'X-cb-user-key': self.key}
        self.timeout = timeout
        self.session = session if session is not None else requests.Session()

    @staticmethod
    def _decode(resp):
        """Return the JSON body of ``resp``, raising first on a 4xx/5xx status."""
        # Without this an auth or rate-limit error would look like "no rounds".
        resp.raise_for_status()
        return resp.json()

    def get_rounds(self, permalink):
        """List an organization's funding rounds, newest first.

        NOTE(review): the path and the ``entities`` response key are kept
        from the original snippet; confirm both against the current v4
        reference, which may expose related records as "cards" instead.

        Args:
            permalink: Organization permalink.

        Returns:
            A list of ``{'date', 'type', 'amount', 'investors'}`` dicts sorted
            by ``date`` descending; rounds without a date sort last.
        """
        data = self._decode(self.session.get(
            f'{self.base}/entities/organizations/{permalink}/funding_rounds',
            headers=self.headers,
            # A list would be sent as repeated ``field_ids=`` keys; the API
            # takes a single comma-separated value.
            params={'field_ids': ','.join(self.ROUND_FIELDS)},
            timeout=self.timeout))
        rounds = []
        for item in data.get('entities') or []:
            p = item.get('properties') or {}
            # ``or`` rather than a .get default: the key may be present
            # with an explicit null for undisclosed rounds.
            money = p.get('money_raised') or {}
            investors = p.get('investor_identifiers') or []
            rounds.append({
                'date': p.get('announced_on'),
                'type': p.get('investment_type'),
                'amount': money.get('value'),
                'investors': [i.get('value') for i in investors],
            })
        return sorted(rounds, key=lambda r: r['date'] or '', reverse=True)

    def recent_rounds(self, min_amount=1000000, limit=50):
        """Search for the most recently announced rounds above a size threshold.

        Args:
            min_amount: Minimum amount raised, in USD.
            limit: Maximum number of rounds to request.

        Returns:
            A list of ``{'company', 'date', 'amount'}`` dicts in the order
            returned by the API (requested as ``announced_on`` descending).
        """
        data = self._decode(self.session.post(
            f'{self.base}/searches/funding_rounds',
            headers=self.headers,
            json={
                'field_ids': ['announced_on', 'money_raised',
                              'funded_organization_identifier'],
                'order': [{'field_id': 'announced_on', 'sort': 'desc'}],
                'query': [{
                    'type': 'predicate',
                    'field_id': 'money_raised',
                    'operator_id': 'gte',
                    'values': [{'value': min_amount, 'currency': 'USD'}],
                }],
                'limit': limit,
            },
            timeout=self.timeout))
        results = []
        for entity in data.get('entities') or []:
            p = entity.get('properties') or {}
            results.append({
                'company': (p.get('funded_organization_identifier') or {}).get('value'),
                'date': p.get('announced_on'),
                'amount': (p.get('money_raised') or {}).get('value'),
            })
        return results

Investor Portfolio Tracker

class InvestorTracker:
    """Client for investor portfolios and co-investment overlap (Crunchbase API v4).

    Args:
        api_key: Crunchbase user key, sent in the ``X-cb-user-key`` header.
        session: Optional object exposing a ``requests``-style ``get``
            method, e.g. a pre-configured ``requests.Session``. A fresh
            ``requests.Session`` is created when omitted.
        timeout: Seconds to wait on each HTTP request before giving up.
    """

    def __init__(self, api_key, session=None, timeout=10):
        self.key = api_key
        self.base = 'https://api.crunchbase.com/api/v4'
        self.headers = {'X-cb-user-key': self.key}
        self.timeout = timeout
        self.session = session if session is not None else requests.Session()

    def portfolio(self, permalink):
        """List the investments one investor has participated in.

        NOTE(review): the path and the ``entities`` response key are kept
        from the original snippet; confirm both against the current v4
        reference, which may expose related records as "cards" instead.

        Args:
            permalink: Investor (organization) permalink.

        Returns:
            A list of ``{'company', 'date'}`` dicts; ``company`` is ``None``
            when the record carries no organization identifier.
        """
        resp = self.session.get(
            f'{self.base}/entities/organizations/{permalink}/participated_investments',
            headers=self.headers,
            # A list would be sent as repeated ``field_ids=`` keys; the API
            # takes a single comma-separated value.
            params={'field_ids': 'organization_identifier,announced_on'},
            timeout=self.timeout)
        # Without this an auth or rate-limit error would look like an
        # empty portfolio.
        resp.raise_for_status()
        investments = []
        for item in resp.json().get('entities') or []:
            p = item.get('properties') or {}
            investments.append({
                'company': (p.get('organization_identifier') or {}).get('value'),
                'date': p.get('announced_on'),
            })
        return investments

    def find_coinvestments(self, permalinks):
        """Report the companies each pair of investors has both backed.

        Prints one line per overlapping pair and also returns the overlaps
        so callers can use them programmatically.

        Args:
            permalinks: Iterable of investor permalinks. Duplicates are
                ignored so each investor is fetched and compared once.

        Returns:
            A dict mapping ``(permalink_a, permalink_b)`` to the set of shared
            company names; pairs with no overlap are omitted.
        """
        # Materialise once: accepts generators and keeps first-seen order.
        investors = list(dict.fromkeys(permalinks))
        portfolios = {
            p: {inv['company'] for inv in self.portfolio(p) if inv['company']}
            for p in investors
        }
        overlaps = {}
        for i, p1 in enumerate(investors):
            for p2 in investors[i + 1:]:
                shared = portfolios[p1] & portfolios[p2]
                if shared:
                    overlaps[(p1, p2)] = shared
                    print(f"{p1} & {p2} co-invested in: {shared}")
        return overlaps

Data Export

import pandas as pd

def export_report(rounds, filename='funding.csv'):
    """Write funding rounds to a CSV file and return the exported table.

    Args:
        rounds: List of round dicts (one per row), e.g. the output of
            ``FundingScraper.get_rounds``.
        filename: Destination path of the CSV file.

    Returns:
        The ``DataFrame`` that was written. When an ``amount`` column is
        present it gains an ``amount_m`` column holding the amount in
        millions, rounded to two decimals (NaN where the amount is missing
        or not numeric).
    """
    df = pd.DataFrame(rounds)
    if 'amount' in df.columns:
        # Coerce first so missing or non-numeric amounts become NaN instead
        # of raising, and a genuine 0 stays 0.0 rather than counting as missing.
        amounts = pd.to_numeric(df['amount'], errors='coerce')
        df['amount_m'] = (amounts / 1e6).round(2)
    df.to_csv(filename, index=False)
    print(f"Exported {len(df)} rounds")
    return df

Scaling

For larger workloads, ScraperAPI manages proxy rotation, ThorData provides residential proxies, and ScrapeOps monitors your scraping pipelines.

Conclusion

Start with the official API, respect its rate limits, and build incrementally toward a comprehensive view of the startup ecosystem.

DEV Community