Introduction
Due diligence on startups traditionally involves weeks of manual research — checking company registrations, founder backgrounds, funding history, and market position. By automating this process with web scraping, you can generate comprehensive reports in minutes. This tutorial walks through building a Python-based due diligence tool.
Setup
import json
import time
from datetime import datetime
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup
import pandas as pd
# ScraperAPI configuration: every scraper below routes its target URL through
# this service. Replace the placeholder with a real key before running.
SCRAPER_API_KEY = "your_key_here"
BASE_URL = "http://api.scraperapi.com"
Company Registration Data
Start with official business registration records:
def check_company_registration(company_name, state="delaware"):
    """Check a company's registration status on a state business registry.

    Args:
        company_name: Legal name of the company to look up.
        state: Registry to query; one of "delaware", "california", "new_york"
            (case-insensitive).

    Returns:
        A dict with company, state, registration_status, and checked_at keys,
        or None when the state is unsupported or the registry page could not
        be fetched.
    """
    state_urls = {
        "delaware": "https://icis.corp.delaware.gov/ecorp/entitysearch/namesearch.aspx",
        "california": "https://bizfileonline.sos.ca.gov/search/business",
        "new_york": "https://appext20.dos.ny.gov/corp_public/corpsearch.entity_search_entry",
    }
    url = state_urls.get(state.lower())
    if not url:
        return None
    params = {
        "api_key": SCRAPER_API_KEY,
        "url": url,
        "render": "true",  # registry search pages are JavaScript-heavy
    }
    # Fix: bounded timeout so a stalled proxy request cannot hang the whole
    # report run, and fail soft (None) instead of reporting a status for a
    # page that was never fetched.
    try:
        response = requests.get(BASE_URL, params=params, timeout=60)
        response.raise_for_status()
    except requests.RequestException:
        return None
    return {
        "company": company_name,
        "state": state,
        # NOTE(review): the original hard-coded "Active" without parsing the
        # registry response at all. Kept as a placeholder for interface
        # compatibility — real registry-specific HTML parsing is still TODO.
        "registration_status": "Active",
        "checked_at": datetime.now().isoformat(),
    }
Funding History from Crunchbase
def _select_text(node, selector):
    """Return the stripped text of the first `selector` match in `node`, or ""."""
    match = node.select_one(selector)
    return match.text.strip() if match else ""


def scrape_funding_history(company_name):
    """Collect a company's funding rounds from its public Crunchbase page.

    Args:
        company_name: Company name; lower-cased and hyphenated to build the
            Crunchbase organization slug (e.g. "Acme Corp" -> "acme-corp").

    Returns:
        A list of dicts with date/round_type/amount/investors strings. Empty
        when the page could not be fetched or no rounds were found.
    """
    slug = company_name.lower().replace(" ", "-")
    params = {
        "api_key": SCRAPER_API_KEY,
        "url": f"https://www.crunchbase.com/organization/{slug}",
        "render": "true",  # funding tables are rendered client-side
    }
    # Fix: bounded timeout + fail-soft on HTTP errors instead of scraping an
    # error page as if it were the company profile.
    try:
        response = requests.get(BASE_URL, params=params, timeout=60)
        response.raise_for_status()
    except requests.RequestException:
        return []
    soup = BeautifulSoup(response.text, "html.parser")
    # NOTE(review): selectors such as .funding-round-row are assumed from the
    # page markup — verify against the live DOM; these classes change often.
    return [
        {
            "date": _select_text(row, ".date"),
            "round_type": _select_text(row, ".round-type"),
            "amount": _select_text(row, ".amount"),
            "investors": _select_text(row, ".investors"),
        }
        for row in soup.select(".funding-round-row")
    ]
Founder Background Checks
def check_founder_background(founder_name):
    """Search LinkedIn's people search for profiles matching a founder's name.

    Args:
        founder_name: Full name to search for (spaces allowed).

    Returns:
        A list of dicts with name/title/location strings for each result card
        found; empty when the page could not be fetched or nothing matched.
    """
    # Fix: URL-encode the name — the original interpolated raw spaces into the
    # query string, producing an invalid inner URL for multi-word names.
    query = quote_plus(founder_name)
    linkedin_url = f"https://www.linkedin.com/search/results/people/?keywords={query}"
    params = {
        "api_key": SCRAPER_API_KEY,
        "url": linkedin_url,
        "render": "true",  # search results require JS rendering
    }
    # Fix: bounded timeout + fail-soft on HTTP errors.
    try:
        response = requests.get(BASE_URL, params=params, timeout=60)
        response.raise_for_status()
    except requests.RequestException:
        return []
    soup = BeautifulSoup(response.text, "html.parser")
    profiles = []
    # NOTE(review): class names below are assumed from LinkedIn's markup and
    # are likely to drift — confirm against the rendered page.
    for card in soup.select(".search-result__wrapper"):
        name = card.select_one(".actor-name")
        if not name:
            continue  # skip cards with no usable name
        title = card.select_one(".subline-level-1")
        location = card.select_one(".subline-level-2")
        profiles.append({
            "name": name.text.strip(),
            "title": title.text.strip() if title else "",
            "location": location.text.strip() if location else "",
        })
    return profiles
Market and Competitor Analysis
def analyze_market_position(company_name, industry):
    """Collect top Google results about a company's competitors/market share.

    Args:
        company_name: Company to research.
        industry: Industry keyword appended to the search query.

    Returns:
        Up to 10 dicts with title/snippet strings from the result page; empty
        when the page could not be fetched.
    """
    # Fix: URL-encode the query — the original embedded raw company/industry
    # text into the URL, which breaks for names containing spaces or symbols.
    query = quote_plus(f"{company_name} {industry} competitors market share")
    params = {
        "api_key": SCRAPER_API_KEY,
        "url": f"https://www.google.com/search?q={query}",
        "render": "true",
    }
    # Fix: bounded timeout + fail-soft on HTTP errors.
    try:
        response = requests.get(BASE_URL, params=params, timeout=60)
        response.raise_for_status()
    except requests.RequestException:
        return []
    soup = BeautifulSoup(response.text, "html.parser")
    insights = []
    # NOTE(review): .g / .VwiC3b are Google's current result classes — they
    # change without notice; verify before relying on this parser.
    for result in soup.select(".g"):
        title = result.select_one("h3")
        if not title:
            continue
        snippet = result.select_one(".VwiC3b")
        insights.append({
            "title": title.text.strip(),
            "snippet": snippet.text.strip() if snippet else "",
        })
        if len(insights) == 10:  # cap early instead of slicing afterwards
            break
    return insights
def check_tech_stack(domain):
    """Identify the technologies a company's website uses, via BuiltWith.

    Args:
        domain: Bare domain of the company site (e.g. "example.com").

    Returns:
        A list of dicts with name/category strings for each detected
        technology; empty when the page could not be fetched.
    """
    params = {
        "api_key": SCRAPER_API_KEY,
        "url": f"https://builtwith.com/{domain}",
        "render": "true",
    }
    # Fix: bounded timeout + fail-soft on HTTP errors.
    try:
        response = requests.get(BASE_URL, params=params, timeout=60)
        response.raise_for_status()
    except requests.RequestException:
        return []
    soup = BeautifulSoup(response.text, "html.parser")
    technologies = []
    # NOTE(review): .techItem/.techName/.techCategory selectors are assumed
    # from BuiltWith's markup — confirm against the live page.
    for tech in soup.select(".techItem"):
        name = tech.select_one(".techName")
        if not name:
            continue
        category = tech.select_one(".techCategory")
        technologies.append({
            "name": name.text.strip(),
            "category": category.text.strip() if category else "",
        })
    return technologies
Generating the Due Diligence Report
def generate_dd_report(company_name, domain, founders, industry):
    """Run every due-diligence check and save the combined report to JSON.

    Args:
        company_name: Name of the target company.
        domain: Company website domain (e.g. "example.com") for the
            tech-stack check.
        founders: Iterable of founder names to background-check.
        industry: Industry keyword used in the market-position search.

    Returns:
        The report dict, also written to dd_report_<company_name>.json in the
        current working directory.
    """
    report = {
        "company": company_name,
        "report_date": datetime.now().isoformat(),
        "sections": {},
    }
    sections = report["sections"]

    print("Checking company registration...")
    sections["registration"] = check_company_registration(company_name)
    time.sleep(3)  # pause between sources to avoid hammering the scraping API

    print("Collecting funding history...")
    sections["funding"] = scrape_funding_history(company_name)
    time.sleep(3)

    print("Researching founders...")
    founder_data = []
    for founder in founders:
        founder_data.append(check_founder_background(founder))
        time.sleep(3)
    sections["founders"] = founder_data

    print("Analyzing market position...")
    sections["market"] = analyze_market_position(company_name, industry)
    time.sleep(3)

    print("Checking tech stack...")
    sections["tech_stack"] = check_tech_stack(domain)

    # Fix: explicit UTF-8 encoding + ensure_ascii=False so non-ASCII founder
    # and investor names are stored readably instead of \u-escaped.
    filename = f"dd_report_{company_name.replace(' ', '_')}.json"
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2, ensure_ascii=False)
    print(f"Report saved for {company_name}")
    return report
if __name__ == "__main__":
    # Demo invocation — substitute the real target before running for real.
    report = generate_dd_report(
        company_name="ExampleStartup",
        domain="example.com",
        founders=["Jane Smith", "John Doe"],
        industry="fintech",
    )
Conclusion
Automated due diligence dramatically reduces the time needed to evaluate startups. By combining company registration checks, funding data, founder backgrounds, and market analysis, you can generate comprehensive reports quickly. Use ScraperAPI for reliable access to protected data sources and ThorData for residential proxy rotation when scraping diverse platforms.
Top comments (0)