Introduction
Due diligence on startups traditionally involves weeks of manual research — checking company registrations, founder backgrounds, funding history, and market position. By automating this process with web scraping, you can generate comprehensive reports in minutes. This tutorial walks through building a Python-based due diligence tool.
Setup
import json
import time
from datetime import datetime
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup
import pandas as pd
# ScraperAPI configuration: every scraper below routes its target URL through
# this service. Replace the placeholder with a real key before running.
SCRAPER_API_KEY = "your_key_here"
BASE_URL = "http://api.scraperapi.com"
Company Registration Data
Start with official business registration records:
def check_company_registration(company_name, state="delaware"):
    """Check a company's registration status on a state business registry.

    Args:
        company_name: Legal name of the company to look up.
        state: Registry to query; one of "delaware", "california", "new_york"
            (case-insensitive).

    Returns:
        A dict with company, state, registration_status, and checked_at keys,
        or None when the state is unsupported or the registry page could not
        be fetched.
    """
    state_urls = {
        "delaware": "https://icis.corp.delaware.gov/ecorp/entitysearch/namesearch.aspx",
        "california": "https://bizfileonline.sos.ca.gov/search/business",
        "new_york": "https://appext20.dos.ny.gov/corp_public/corpsearch.entity_search_entry",
    }
    url = state_urls.get(state.lower())
    if not url:
        return None
    params = {
        "api_key": SCRAPER_API_KEY,
        "url": url,
        "render": "true",  # registry search pages are JavaScript-heavy
    }
    # Fix: bounded timeout so a stalled proxy request cannot hang the whole
    # report run, and fail soft (None) instead of reporting a status for a
    # page that was never fetched.
    try:
        response = requests.get(BASE_URL, params=params, timeout=60)
        response.raise_for_status()
    except requests.RequestException:
        return None
    return {
        "company": company_name,
        "state": state,
        # NOTE(review): the original hard-coded "Active" without parsing the
        # registry response at all. Kept as a placeholder for interface
        # compatibility — real registry-specific HTML parsing is still TODO.
        "registration_status": "Active",
        "checked_at": datetime.now().isoformat(),
    }
Funding History from Crunchbase
def _select_text(node, selector):
    """Return the stripped text of the first `selector` match in `node`, or ""."""
    match = node.select_one(selector)
    return match.text.strip() if match else ""


def scrape_funding_history(company_name):
    """Collect a company's funding rounds from its public Crunchbase page.

    Args:
        company_name: Company name; lower-cased and hyphenated to build the
            Crunchbase organization slug (e.g. "Acme Corp" -> "acme-corp").

    Returns:
        A list of dicts with date/round_type/amount/investors strings. Empty
        when the page could not be fetched or no rounds were found.
    """
    slug = company_name.lower().replace(" ", "-")
    params = {
        "api_key": SCRAPER_API_KEY,
        "url": f"https://www.crunchbase.com/organization/{slug}",
        "render": "true",  # funding tables are rendered client-side
    }
    # Fix: bounded timeout + fail-soft on HTTP errors instead of scraping an
    # error page as if it were the company profile.
    try:
        response = requests.get(BASE_URL, params=params, timeout=60)
        response.raise_for_status()
    except requests.RequestException:
        return []
    soup = BeautifulSoup(response.text, "html.parser")
    # NOTE(review): selectors such as .funding-round-row are assumed from the
    # page markup — verify against the live DOM; these classes change often.
    return [
        {
            "date": _select_text(row, ".date"),
            "round_type": _select_text(row, ".round-type"),
            "amount": _select_text(row, ".amount"),
            "investors": _select_text(row, ".investors"),
        }
        for row in soup.select(".funding-round-row")
    ]
Founder Background Checks
def check_founder_background(founder_name):
    """Search LinkedIn's people search for profiles matching a founder's name.

    Args:
        founder_name: Full name to search for (spaces allowed).

    Returns:
        A list of dicts with name/title/location strings for each result card
        found; empty when the page could not be fetched or nothing matched.
    """
    # Fix: URL-encode the name — the original interpolated raw spaces into the
    # query string, producing an invalid inner URL for multi-word names.
    query = quote_plus(founder_name)
    linkedin_url = f"https://www.linkedin.com/search/results/people/?keywords={query}"
    params = {
        "api_key": SCRAPER_API_KEY,
        "url": linkedin_url,
        "render": "true",  # search results require JS rendering
    }
    # Fix: bounded timeout + fail-soft on HTTP errors.
    try:
        response = requests.get(BASE_URL, params=params, timeout=60)
        response.raise_for_status()
    except requests.RequestException:
        return []
    soup = BeautifulSoup(response.text, "html.parser")
    profiles = []
    # NOTE(review): class names below are assumed from LinkedIn's markup and
    # are likely to drift — confirm against the rendered page.
    for card in soup.select(".search-result__wrapper"):
        name = card.select_one(".actor-name")
        if not name:
            continue  # skip cards with no usable name
        title = card.select_one(".subline-level-1")
        location = card.select_one(".subline-level-2")
        profiles.append({
            "name": name.text.strip(),
            "title": title.text.strip() if title else "",
            "location": location.text.strip() if location else "",
        })
    return profiles
Market and Competitor Analysis
def analyze_market_position(company_name, industry):
    """Collect top Google results about a company's competitors/market share.

    Args:
        company_name: Company to research.
        industry: Industry keyword appended to the search query.

    Returns:
        Up to 10 dicts with title/snippet strings from the result page; empty
        when the page could not be fetched.
    """
    # Fix: URL-encode the query — the original embedded raw company/industry
    # text into the URL, which breaks for names containing spaces or symbols.
    query = quote_plus(f"{company_name} {industry} competitors market share")
    params = {
        "api_key": SCRAPER_API_KEY,
        "url": f"https://www.google.com/search?q={query}",
        "render": "true",
    }
    # Fix: bounded timeout + fail-soft on HTTP errors.
    try:
        response = requests.get(BASE_URL, params=params, timeout=60)
        response.raise_for_status()
    except requests.RequestException:
        return []
    soup = BeautifulSoup(response.text, "html.parser")
    insights = []
    # NOTE(review): .g / .VwiC3b are Google's current result classes — they
    # change without notice; verify before relying on this parser.
    for result in soup.select(".g"):
        title = result.select_one("h3")
        if not title:
            continue
        snippet = result.select_one(".VwiC3b")
        insights.append({
            "title": title.text.strip(),
            "snippet": snippet.text.strip() if snippet else "",
        })
        if len(insights) == 10:  # cap early instead of slicing afterwards
            break
    return insights
def check_tech_stack(domain):
    """Identify the technologies a company's website uses, via BuiltWith.

    Args:
        domain: Bare domain of the company site (e.g. "example.com").

    Returns:
        A list of dicts with name/category strings for each detected
        technology; empty when the page could not be fetched.
    """
    params = {
        "api_key": SCRAPER_API_KEY,
        "url": f"https://builtwith.com/{domain}",
        "render": "true",
    }
    # Fix: bounded timeout + fail-soft on HTTP errors.
    try:
        response = requests.get(BASE_URL, params=params, timeout=60)
        response.raise_for_status()
    except requests.RequestException:
        return []
    soup = BeautifulSoup(response.text, "html.parser")
    technologies = []
    # NOTE(review): .techItem/.techName/.techCategory selectors are assumed
    # from BuiltWith's markup — confirm against the live page.
    for tech in soup.select(".techItem"):
        name = tech.select_one(".techName")
        if not name:
            continue
        category = tech.select_one(".techCategory")
        technologies.append({
            "name": name.text.strip(),
            "category": category.text.strip() if category else "",
        })
    return technologies
Generating the Due Diligence Report
def generate_dd_report(company_name, domain, founders, industry):
    """Run every due-diligence check and save the combined report to JSON.

    Args:
        company_name: Name of the target company.
        domain: Company website domain (e.g. "example.com") for the
            tech-stack check.
        founders: Iterable of founder names to background-check.
        industry: Industry keyword used in the market-position search.

    Returns:
        The report dict, also written to dd_report_<company_name>.json in the
        current working directory.
    """
    report = {
        "company": company_name,
        "report_date": datetime.now().isoformat(),
        "sections": {},
    }
    sections = report["sections"]

    print("Checking company registration...")
    sections["registration"] = check_company_registration(company_name)
    time.sleep(3)  # pause between sources to avoid hammering the scraping API

    print("Collecting funding history...")
    sections["funding"] = scrape_funding_history(company_name)
    time.sleep(3)

    print("Researching founders...")
    founder_data = []
    for founder in founders:
        founder_data.append(check_founder_background(founder))
        time.sleep(3)
    sections["founders"] = founder_data

    print("Analyzing market position...")
    sections["market"] = analyze_market_position(company_name, industry)
    time.sleep(3)

    print("Checking tech stack...")
    sections["tech_stack"] = check_tech_stack(domain)

    # Fix: explicit UTF-8 encoding + ensure_ascii=False so non-ASCII founder
    # and investor names are stored readably instead of \u-escaped.
    filename = f"dd_report_{company_name.replace(' ', '_')}.json"
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2, ensure_ascii=False)
    print(f"Report saved for {company_name}")
    return report
if __name__ == "__main__":
    # Demo invocation — substitute the real target before running for real.
    report = generate_dd_report(
        company_name="ExampleStartup",
        domain="example.com",
        founders=["Jane Smith", "John Doe"],
        industry="fintech",
    )
Conclusion
Automated due diligence dramatically reduces the time needed to evaluate startups. By combining company registration checks, funding data, founder backgrounds, and market analysis, you can generate comprehensive reports quickly. Use ScraperAPI for reliable access to protected data sources and ThorData for residential proxy rotation when scraping diverse platforms.
Top comments (0)