Crunchbase is the go-to database for startup funding data. Whether you're a VC analyst or founder doing competitive research, Crunchbase data is invaluable.
Using Crunchbase's API
import requests
class CrunchbaseScraper:
    """Thin wrapper around the Crunchbase v4 REST API for organization lookups."""

    def __init__(self, api_key, timeout=10):
        """Store credentials and connection settings.

        Args:
            api_key: Crunchbase user key, sent via the X-cb-user-key header.
            timeout: per-request timeout in seconds (requests has NO default
                timeout, so without this a stalled connection hangs forever).
        """
        self.key = api_key
        self.base = 'https://api.crunchbase.com/api/v4'
        self.headers = {'X-cb-user-key': self.key}
        self.timeout = timeout

    def search(self, query, limit=25):
        """Autocomplete-search organizations by name.

        Args:
            query: free-text search string.
            limit: maximum number of matches to request (default 25).

        Returns:
            List of {'name', 'permalink'} dicts, one per matched organization.

        Raises:
            requests.HTTPError: on a non-2xx API response.
        """
        resp = requests.get(
            f'{self.base}/autocompletes',
            headers=self.headers,
            params={'query': query, 'collection_ids': 'organizations', 'limit': limit},
            timeout=self.timeout)
        resp.raise_for_status()  # surface auth/rate-limit errors instead of a KeyError later
        return [{
            'name': e['identifier']['value'],
            'permalink': e['identifier']['permalink'],
        } for e in resp.json().get('entities', [])]

    def get_org(self, permalink):
        """Fetch selected profile fields for one organization.

        Args:
            permalink: the organization's Crunchbase permalink slug.

        Returns:
            Dict with 'description', 'founded', 'funding' (raw money dict),
            and 'website'; missing fields come back as None/{}.

        Raises:
            requests.HTTPError: on a non-2xx API response.
        """
        # The API expects field_ids as ONE comma-separated value; passing a
        # Python list here would serialize as repeated field_ids= params.
        fields = ['short_description', 'founded_on', 'funding_total', 'website_url']
        resp = requests.get(
            f'{self.base}/entities/organizations/{permalink}',
            headers=self.headers,
            params={'field_ids': ','.join(fields)},
            timeout=self.timeout)
        resp.raise_for_status()
        p = resp.json().get('properties', {})
        return {
            'description': p.get('short_description'),
            'founded': p.get('founded_on'),
            'funding': p.get('funding_total', {}),
            'website': p.get('website_url'),
        }
Scraping Funding Rounds
class FundingScraper:
    """Pulls funding-round data from the Crunchbase v4 API."""

    def __init__(self, api_key, timeout=10):
        """Store credentials and connection settings.

        Args:
            api_key: Crunchbase user key, sent via the X-cb-user-key header.
            timeout: per-request timeout in seconds; requests defaults to no
                timeout at all, which can hang a pipeline indefinitely.
        """
        self.key = api_key
        self.base = 'https://api.crunchbase.com/api/v4'
        self.headers = {'X-cb-user-key': self.key}
        self.timeout = timeout

    def get_rounds(self, permalink):
        """Return one organization's funding rounds, newest first.

        Args:
            permalink: the organization's Crunchbase permalink slug.

        Returns:
            List of {'date', 'type', 'amount', 'investors'} dicts sorted by
            announcement date descending; rounds missing a date sort last.

        Raises:
            requests.HTTPError: on a non-2xx API response.
        """
        # Comma-join field_ids: the API wants one comma-separated query value,
        # not the repeated field_ids= params a list would produce.
        fields = ['announced_on', 'money_raised', 'investment_type', 'investor_identifiers']
        resp = requests.get(
            f'{self.base}/entities/organizations/{permalink}/funding_rounds',
            headers=self.headers,
            params={'field_ids': ','.join(fields)},
            timeout=self.timeout)
        resp.raise_for_status()
        rounds = []
        for item in resp.json().get('entities', []):
            p = item.get('properties', {})
            money = p.get('money_raised', {})
            rounds.append({
                'date': p.get('announced_on'),
                'type': p.get('investment_type'),
                'amount': money.get('value'),
                'investors': [i.get('value') for i in p.get('investor_identifiers', [])],
            })
        # "or ''" keeps None dates comparable so the sort never raises.
        return sorted(rounds, key=lambda r: r['date'] or '', reverse=True)

    def recent_rounds(self, min_amount=1000000):
        """Search the 50 most recent rounds of at least min_amount USD.

        Args:
            min_amount: minimum round size in USD (default $1M).

        Returns:
            List of {'company', 'date', 'amount'} dicts, newest first
            (ordering comes from the API's 'order' clause).

        Raises:
            requests.HTTPError: on a non-2xx API response.
        """
        # POST search endpoint takes a JSON body, so field_ids stays a list here.
        resp = requests.post(
            f'{self.base}/searches/funding_rounds',
            headers=self.headers,
            timeout=self.timeout,
            json={
                'field_ids': ['announced_on', 'money_raised', 'funded_organization_identifier'],
                'order': [{'field_id': 'announced_on', 'sort': 'desc'}],
                'query': [{'type': 'predicate', 'field_id': 'money_raised',
                           'operator_id': 'gte', 'values': [{'value': min_amount, 'currency': 'USD'}]}],
                'limit': 50,
            })
        resp.raise_for_status()
        return [{
            'company': e['properties'].get('funded_organization_identifier', {}).get('value'),
            'date': e['properties'].get('announced_on'),
            'amount': e['properties'].get('money_raised', {}).get('value'),
        } for e in resp.json().get('entities', [])]
Investor Portfolio Tracker
class InvestorTracker:
    """Tracks investor portfolios and co-investments via the Crunchbase v4 API."""

    def __init__(self, api_key, timeout=10):
        """Store credentials and connection settings.

        Args:
            api_key: Crunchbase user key, sent via the X-cb-user-key header.
            timeout: per-request timeout in seconds (requests has no default
                timeout, so unset it can block forever).
        """
        self.key = api_key
        self.base = 'https://api.crunchbase.com/api/v4'
        self.headers = {'X-cb-user-key': self.key}
        self.timeout = timeout

    def portfolio(self, permalink):
        """List every investment an organization participated in.

        Args:
            permalink: the investor organization's Crunchbase permalink slug.

        Returns:
            List of {'company', 'date'} dicts; either value may be None when
            the API omits the field.

        Raises:
            requests.HTTPError: on a non-2xx API response.
        """
        # Comma-join: the API expects a single comma-separated field_ids value,
        # not the repeated query params a Python list would produce.
        fields = ['organization_identifier', 'announced_on']
        resp = requests.get(
            f'{self.base}/entities/organizations/{permalink}/participated_investments',
            headers=self.headers,
            params={'field_ids': ','.join(fields)},
            timeout=self.timeout)
        resp.raise_for_status()
        return [{
            'company': i['properties'].get('organization_identifier', {}).get('value'),
            'date': i['properties'].get('announced_on'),
        } for i in resp.json().get('entities', [])]

    def find_coinvestments(self, permalinks):
        """Print and return companies shared by each pair of investors.

        Args:
            permalinks: list of investor permalink slugs to compare pairwise.

        Returns:
            Dict mapping (permalink_a, permalink_b) -> set of shared company
            names; pairs with no overlap are omitted. (Previously returned
            None, so returning a value is backward-compatible for callers.)
        """
        # Fetch each portfolio once; drop None company names before set ops.
        portfolios = {p: {i['company'] for i in self.portfolio(p) if i['company']}
                      for p in permalinks}
        results = {}
        for idx, p1 in enumerate(permalinks):
            for p2 in permalinks[idx + 1:]:
                shared = portfolios[p1] & portfolios[p2]
                if shared:
                    print(f"{p1} & {p2} co-invested in: {shared}")
                    results[(p1, p2)] = shared
        return results
Data Export
import pandas as pd
def export_report(rounds, filename='funding.csv'):
    """Write funding rounds to CSV, adding an 'amount_m' column in millions.

    Args:
        rounds: list of round dicts; if any has an 'amount' key (raw dollars),
            a derived 'amount_m' column (rounded to 2 decimals) is added.
        filename: destination CSV path; no index column is written.
    """
    df = pd.DataFrame(rounds)
    if 'amount' in df.columns:
        # Check for missing values explicitly with pd.notna: the old
        # truthiness test ("if x") wrongly nulled out legitimate $0 rounds
        # and passed NaN (which is truthy) straight through to round().
        df['amount_m'] = df['amount'].apply(
            lambda x: round(x / 1e6, 2) if pd.notna(x) else None)
    df.to_csv(filename, index=False)
    print(f"Exported {len(df)} rounds")
Scaling
For large-scale collection, third-party services can take over the infrastructure work: ScraperAPI handles proxy rotation, ThorData supplies residential proxy pools, and ScrapeOps monitors scraping pipelines.
Conclusion
Start with the official API, respect rate limits, and build incrementally for comprehensive startup ecosystem intelligence.
Top comments (0)