DEV Community

agenthustler
agenthustler

Posted on

How to Scrape Crunchbase: Startup Funding and Investor Data

Crunchbase is the go-to database for startup funding data. Whether you're a VC analyst or a founder doing competitive research, Crunchbase data is invaluable.

Using Crunchbase's API

import requests

class CrunchbaseScraper:
    """Thin client for organization search and lookup on the Crunchbase v4 API.

    Args:
        api_key: Crunchbase user key; sent as the ``X-cb-user-key`` header.
        timeout: Seconds to wait on each HTTP request. requests never times
            out on its own, so without this a stalled connection would hang.
    """

    # Properties requested by get_org().
    _ORG_FIELDS = ('short_description', 'founded_on', 'funding_total', 'website_url')

    def __init__(self, api_key, timeout=30):
        self.key = api_key
        self.base = 'https://api.crunchbase.com/api/v4'
        self.headers = {'X-cb-user-key': self.key}
        self.timeout = timeout

    def search(self, query, limit=25):
        """Autocomplete organizations matching *query*.

        Args:
            query: Free-text search string.
            limit: Maximum number of suggestions to request.

        Returns:
            A list of ``{'name', 'permalink'}`` dicts, one per item in the
            response's ``entities`` array (empty when that key is absent).

        Raises:
            requests.HTTPError: If the API answers with a 4xx/5xx status.
        """
        resp = requests.get(
            f'{self.base}/autocompletes',
            headers=self.headers,
            params={'query': query, 'collection_ids': 'organizations', 'limit': limit},
            timeout=self.timeout,
        )
        # Surface auth and rate-limit failures instead of returning [].
        resp.raise_for_status()
        return self._parse_matches(resp.json())

    def get_org(self, permalink):
        """Fetch summary properties for one organization.

        Args:
            permalink: Organization permalink, as returned by ``search``.

        Returns:
            A dict with ``description``, ``founded``, ``funding`` and
            ``website`` keys; values are ``None`` (``{}`` for ``funding``)
            when the API omits them.

        Raises:
            requests.HTTPError: If the API answers with a 4xx/5xx status.
        """
        resp = requests.get(
            f'{self.base}/entities/organizations/{permalink}',
            headers=self.headers,
            # A list here would be encoded as repeated field_ids=... pairs;
            # send a single comma-separated value instead.
            params={'field_ids': ','.join(self._ORG_FIELDS)},
            timeout=self.timeout,
        )
        resp.raise_for_status()
        return self._parse_org(resp.json())

    @staticmethod
    def _parse_matches(payload):
        """Reduce an autocomplete payload to name/permalink pairs."""
        return [{
            'name': e['identifier']['value'],
            'permalink': e['identifier']['permalink'],
        } for e in payload.get('entities', [])]

    @staticmethod
    def _parse_org(payload):
        """Pick the summary fields out of an entity-lookup payload."""
        # `or {}` also covers an explicit null, not just a missing key.
        p = payload.get('properties') or {}
        return {
            'description': p.get('short_description'),
            'founded': p.get('founded_on'),
            'funding': p.get('funding_total', {}),
            'website': p.get('website_url'),
        }
Enter fullscreen mode Exit fullscreen mode

Scraping Funding Rounds

class FundingScraper:
    """Client for funding-round data on the Crunchbase v4 API.

    Args:
        api_key: Crunchbase user key; sent as the ``X-cb-user-key`` header.
        timeout: Seconds to wait on each HTTP request. requests never times
            out on its own, so without this a stalled connection would hang.
    """

    # Properties requested by get_rounds().
    _ROUND_FIELDS = ('announced_on', 'money_raised', 'investment_type', 'investor_identifiers')

    def __init__(self, api_key, timeout=30):
        self.key = api_key
        self.base = 'https://api.crunchbase.com/api/v4'
        self.headers = {'X-cb-user-key': self.key}
        self.timeout = timeout

    def get_rounds(self, permalink):
        """Return one organization's funding rounds, newest first.

        Args:
            permalink: Organization permalink.

        Returns:
            A list of dicts with ``date``, ``type``, ``amount`` and
            ``investors`` keys, sorted by ``date`` descending; rounds with
            no date sort last.

        Raises:
            requests.HTTPError: If the API answers with a 4xx/5xx status.
        """
        # NOTE(review): the v4 reference appears to expose related records
        # under .../cards/{card_id}; confirm this path and the 'entities'
        # response key against the API docs.
        resp = requests.get(
            f'{self.base}/entities/organizations/{permalink}/funding_rounds',
            headers=self.headers,
            # A list here would be encoded as repeated field_ids=... pairs;
            # send a single comma-separated value instead.
            params={'field_ids': ','.join(self._ROUND_FIELDS)},
            timeout=self.timeout,
        )
        # Surface auth and rate-limit failures instead of returning [].
        resp.raise_for_status()
        return self._rounds_from(resp.json())

    def recent_rounds(self, min_amount=1000000, limit=50):
        """Search for the latest funding rounds at or above *min_amount* USD.

        Args:
            min_amount: Lower bound on ``money_raised``, in USD.
            limit: Maximum number of rounds to request.

        Returns:
            A list of dicts with ``company``, ``date`` and ``amount`` keys,
            in the order the API returns them (requested newest first).

        Raises:
            requests.HTTPError: If the API answers with a 4xx/5xx status.
        """
        resp = requests.post(
            f'{self.base}/searches/funding_rounds',
            headers=self.headers,
            json={
                'field_ids': ['announced_on', 'money_raised', 'funded_organization_identifier'],
                'order': [{'field_id': 'announced_on', 'sort': 'desc'}],
                'query': [{'type': 'predicate', 'field_id': 'money_raised',
                    'operator_id': 'gte', 'values': [{'value': min_amount, 'currency': 'USD'}]}],
                'limit': limit,
            },
            timeout=self.timeout,
        )
        resp.raise_for_status()
        return [self._parse_recent(e) for e in resp.json().get('entities', [])]

    @classmethod
    def _rounds_from(cls, payload):
        """Parse every entity in *payload* and sort the rounds newest first."""
        rounds = [cls._parse_round(item) for item in payload.get('entities', [])]
        # Undated rounds get '' as their key so they compare against date
        # strings and land at the end of the descending sort.
        return sorted(rounds, key=lambda r: r['date'] or '', reverse=True)

    @staticmethod
    def _parse_round(item):
        """Flatten one funding-round entity into a plain dict."""
        # `or` fallbacks cover explicit nulls as well as missing keys, e.g.
        # a round whose amount was not disclosed.
        p = item.get('properties') or {}
        money = p.get('money_raised') or {}
        investors = p.get('investor_identifiers') or []
        return {
            'date': p.get('announced_on'),
            'type': p.get('investment_type'),
            'amount': money.get('value'),
            'investors': [i.get('value') for i in investors],
        }

    @staticmethod
    def _parse_recent(entity):
        """Flatten one search hit into company/date/amount."""
        p = entity.get('properties') or {}
        return {
            'company': (p.get('funded_organization_identifier') or {}).get('value'),
            'date': p.get('announced_on'),
            'amount': (p.get('money_raised') or {}).get('value'),
        }
Enter fullscreen mode Exit fullscreen mode

Investor Portfolio Tracker

class InvestorTracker:
    """Client for investor portfolios and co-investment overlap.

    Args:
        api_key: Crunchbase user key; sent as the ``X-cb-user-key`` header.
        timeout: Seconds to wait on each HTTP request. requests never times
            out on its own, so without this a stalled connection would hang.
    """

    # Properties requested by portfolio().
    _INVESTMENT_FIELDS = ('organization_identifier', 'announced_on')

    def __init__(self, api_key, timeout=30):
        self.key = api_key
        self.base = 'https://api.crunchbase.com/api/v4'
        self.headers = {'X-cb-user-key': self.key}
        self.timeout = timeout

    def portfolio(self, permalink):
        """List the investments one investor took part in.

        Args:
            permalink: Investor organization permalink.

        Returns:
            A list of ``{'company', 'date'}`` dicts, one per item in the
            response's ``entities`` array.

        Raises:
            requests.HTTPError: If the API answers with a 4xx/5xx status.
        """
        # NOTE(review): the v4 reference appears to expose related records
        # under .../cards/{card_id}; confirm this path and the 'entities'
        # response key against the API docs.
        resp = requests.get(
            f'{self.base}/entities/organizations/{permalink}/participated_investments',
            headers=self.headers,
            # A list here would be encoded as repeated field_ids=... pairs;
            # send a single comma-separated value instead.
            params={'field_ids': ','.join(self._INVESTMENT_FIELDS)},
            timeout=self.timeout,
        )
        # Surface auth and rate-limit failures instead of returning [].
        resp.raise_for_status()
        return [self._parse_investment(i) for i in resp.json().get('entities', [])]

    def find_coinvestments(self, permalinks):
        """Report companies that pairs of investors have both backed.

        Prints one line per investor pair with a non-empty overlap and also
        returns the overlaps so callers can use them programmatically.

        Args:
            permalinks: Iterable of investor permalinks. Duplicates are
                ignored so an investor is never paired with itself or
                fetched twice.

        Returns:
            A dict mapping ``(permalink_1, permalink_2)`` to the set of
            company names both portfolios contain; pairs with no overlap
            are omitted.
        """
        # dict.fromkeys dedupes while keeping first-seen order, and the list
        # lets the slice below work for any input iterable.
        unique = list(dict.fromkeys(permalinks))
        portfolios = {
            p: {inv['company'] for inv in self.portfolio(p) if inv['company']}
            for p in unique
        }
        shared_by_pair = {}
        for idx, p1 in enumerate(unique):
            for p2 in unique[idx + 1:]:
                shared = portfolios[p1] & portfolios[p2]
                if shared:
                    print(f"{p1} & {p2} co-invested in: {shared}")
                    shared_by_pair[(p1, p2)] = shared
        return shared_by_pair

    @staticmethod
    def _parse_investment(entity):
        """Flatten one investment entity into company/date."""
        # `or {}` covers explicit nulls as well as missing keys.
        p = entity.get('properties') or {}
        return {
            'company': (p.get('organization_identifier') or {}).get('value'),
            'date': p.get('announced_on'),
        }
Enter fullscreen mode Exit fullscreen mode

Data Export

import pandas as pd

def export_report(rounds, filename='funding.csv'):
    """Write funding rounds to a CSV file and print how many were exported.

    When the data has an ``amount`` column, an ``amount_m`` column is added
    holding the amount in millions rounded to two decimals; falsy amounts
    (missing or zero) are left empty.

    Args:
        rounds: Records accepted by ``pandas.DataFrame`` (e.g. a list of
            round dicts).
        filename: Destination CSV path.
    """
    def in_millions(raw):
        # Falsy amounts have nothing to scale.
        if not raw:
            return None
        return round(raw / 1e6, 2)

    frame = pd.DataFrame(rounds)
    has_amount = 'amount' in frame.columns
    if has_amount:
        frame['amount_m'] = frame['amount'].apply(in_millions)
    frame.to_csv(filename, index=False)
    print(f"Exported {len(frame)} rounds")
Enter fullscreen mode Exit fullscreen mode

Scaling

ScraperAPI manages proxy rotation. ThorData provides residential proxies, and ScrapeOps monitors pipelines.

Conclusion

Start with the official API, respect its rate limits, and build incrementally toward comprehensive startup-ecosystem intelligence.

Top comments (0)