DEV Community

agenthustler
agenthustler

Posted on

Scraping B2B Lead Data: Combining LinkedIn, Clearbit, and Hunter.io

B2B lead generation runs on data. The best sales teams enrich prospects with company data, verified emails, and professional profiles. Here's how to build an automated lead enrichment pipeline combining multiple data sources.

The Lead Enrichment Stack

  • LinkedIn: Professional profiles, job titles, company associations
  • Clearbit: Company data, technographics, funding info
  • Hunter.io: Email discovery and verification
  • Public sources: Company websites, press releases, SEC filings

Hunter.io Email Discovery

Hunter.io offers a generous free tier (25 searches/month) and affordable paid plans:

import requests
import time

class HunterClient:
    """Thin wrapper around the Hunter.io v2 REST API.

    Methods return plain dicts/lists extracted from the JSON ``data``
    payload. Network failures and non-2xx responses raise ``requests``
    exceptions instead of silently producing empty results.
    """

    BASE_URL = "https://api.hunter.io/v2"
    TIMEOUT = 30  # seconds; prevents a stalled connection from hanging the pipeline

    def __init__(self, api_key):
        self.api_key = api_key

    def _get(self, endpoint, params):
        """GET ``/{endpoint}``, raise on HTTP errors, return the 'data' dict."""
        resp = requests.get(
            f"{self.BASE_URL}/{endpoint}",
            params={**params, "api_key": self.api_key},
            timeout=self.TIMEOUT,
        )
        # Surface quota/auth problems here rather than as mysterious empty data.
        resp.raise_for_status()
        return resp.json().get("data", {})

    def domain_search(self, domain, limit=10):
        """Return up to *limit* email records discovered for *domain*."""
        data = self._get("domain-search", {"domain": domain, "limit": limit})
        return [{
            "email": e["value"],
            "type": e.get("type"),
            "confidence": e.get("confidence"),
            "first_name": e.get("first_name"),
            "last_name": e.get("last_name"),
            "position": e.get("position"),
        } for e in data.get("emails", [])]

    def find_email(self, domain, first_name, last_name):
        """Find the most likely email for a named person at *domain*."""
        data = self._get("email-finder", {
            "domain": domain,
            "first_name": first_name,
            "last_name": last_name,
        })
        return {
            "email": data.get("email"),
            "confidence": data.get("confidence"),
            "sources": data.get("sources"),
        }

    def verify_email(self, email):
        """Check deliverability of *email* via Hunter's verifier."""
        data = self._get("email-verifier", {"email": email})
        return {
            "status": data.get("status"),  # valid, invalid, accept_all
            "disposable": data.get("disposable"),
            "webmail": data.get("webmail"),
        }
Enter fullscreen mode Exit fullscreen mode

Clearbit Company Enrichment

class ClearbitClient:
    """Minimal client for the Clearbit Company Enrichment API."""

    TIMEOUT = 30  # seconds; avoids hanging forever on a stalled request

    def __init__(self, api_key):
        self.api_key = api_key
        self.session = requests.Session()
        # Clearbit authenticates with a Bearer token on every request.
        self.session.headers["Authorization"] = f"Bearer {api_key}"

    def enrich_company(self, domain):
        """Return a flattened dict of company facts for *domain*, or None.

        None is returned on any non-200 response (unknown domain, rate
        limit, auth failure) so callers can treat enrichment as optional.
        """
        resp = self.session.get(
            "https://company.clearbit.com/v2/companies/find",
            params={"domain": domain},
            timeout=self.TIMEOUT,
        )
        if resp.status_code != 200:
            return None
        data = resp.json()
        return {
            "name": data.get("name"),
            "domain": data.get("domain"),
            "industry": data.get("category", {}).get("industry"),
            "employee_count": data.get("metrics", {}).get("employees"),
            "revenue_range": data.get("metrics", {}).get("estimatedAnnualRevenue"),
            "tech_stack": data.get("tech", []),
            "funding": data.get("metrics", {}).get("raised"),
            "location": data.get("geo", {}).get("city"),
            "description": data.get("description"),
        }
Enter fullscreen mode Exit fullscreen mode

Scraping Company Websites for Context

API_KEY = "YOUR_SCRAPERAPI_KEY"

def scrape_company_page(url):
    """Fetch *url* through ScraperAPI (with JS rendering) and extract hints.

    Returns a dict with the original ``url``, an optional ``team_page``
    link (first anchor whose href or text mentions team/about/leadership),
    and ``tech_indicators`` inferred from external script URLs.

    Raises ``requests.HTTPError`` on a non-2xx proxy response instead of
    silently parsing an error page.
    """
    params = {
        "api_key": API_KEY,
        "url": url,
        "render": "true",  # execute JavaScript so SPA content is present
    }
    resp = requests.get(
        "https://api.scraperapi.com", params=params, timeout=60
    )
    # Don't feed an error/ban page into the parser as if it were the site.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    info = {"url": url}

    # Team/About page links: keep the first plausible match only.
    for link in soup.find_all("a"):
        href = link.get("href", "").lower()
        text = link.get_text(strip=True).lower()
        if any(kw in href or kw in text for kw in ["team", "about", "leadership"]):
            info["team_page"] = link.get("href")
            break

    # Tech indicators from externally-sourced <script> tags.
    scripts = [s.get("src", "") for s in soup.find_all("script") if s.get("src")]
    info["tech_indicators"] = detect_tech(scripts)
    return info

def detect_tech(script_urls):
    """Infer frontend/SaaS technologies from a page's <script src> URLs.

    Each known technology maps to a case-insensitive regex; a technology
    is reported once no matter how many URLs match it. The result is
    sorted so output order is deterministic (the previous ``list(set(...))``
    returned an arbitrary order, which made results flaky to diff/test).
    """
    patterns = {
        "react": "react", "vue": "vue", "angular": "angular",
        "stripe": "stripe", "intercom": "intercom",
        "hubspot": "hubspot", "segment": "segment",
        "google_analytics": "google-analytics|gtag",
    }
    found = set()
    for url in script_urls:
        for tech_name, pattern in patterns.items():
            # Skip the regex once a technology is already confirmed.
            if tech_name not in found and re.search(pattern, url, re.I):
                found.add(tech_name)
    return sorted(found)
Enter fullscreen mode Exit fullscreen mode

ScraperAPI routes requests through rotating proxies and can render JavaScript, which makes access to most company websites considerably more reliable — though no proxy service can guarantee access against every anti-bot setup.

The Full Enrichment Pipeline

from bs4 import BeautifulSoup
import re
import csv

class LeadEnrichmentPipeline:
    """Orchestrates company, email, and website enrichment for B2B leads.

    Each enrichment step is best-effort: a network/API failure on one
    step (or one lead) no longer aborts the whole batch — previously a
    single unreachable domain raised out of ``enrich_batch`` and lost
    all work done so far.
    """

    def __init__(self, hunter_key, clearbit_key, scraper_key):
        self.hunter = HunterClient(hunter_key)
        self.clearbit = ClearbitClient(clearbit_key)
        self.scraper_key = scraper_key

    def enrich_lead(self, domain, first_name=None, last_name=None):
        """Return an enrichment dict for *domain*.

        Keys present depend on what succeeded: ``company`` (Clearbit),
        ``email`` or ``emails`` (Hunter, depending on whether a person's
        name was supplied), and ``web_intel`` (site scrape, None on failure).
        """
        lead = {"domain": domain}

        # Step 1: Company enrichment (optional — skip on API failure).
        try:
            company = self.clearbit.enrich_company(domain)
        except requests.RequestException:
            company = None
        if company:
            lead["company"] = company

        # Step 2: Email discovery — person lookup when we have a name,
        # otherwise a broad domain search.
        try:
            if first_name and last_name:
                lead["email"] = self.hunter.find_email(domain, first_name, last_name)
            else:
                lead["emails"] = self.hunter.domain_search(domain, limit=5)
        except requests.RequestException:
            pass  # the lead is still useful without email data

        # Step 3: Website intelligence.
        try:
            lead["web_intel"] = scrape_company_page(f"https://{domain}")
        except requests.RequestException:
            lead["web_intel"] = None

        time.sleep(1)  # Rate limiting between a lead's API calls
        return lead

    def enrich_batch(self, leads):
        """Enrich every lead dict in *leads* (needs at least a 'domain' key)."""
        enriched = []
        for lead in leads:
            result = self.enrich_lead(
                lead["domain"],
                lead.get("first_name"),
                lead.get("last_name")
            )
            enriched.append(result)
            time.sleep(2)  # extra pause between leads to stay under rate limits
        return enriched

# Example usage — replace the placeholder keys with real credentials.
pipeline = LeadEnrichmentPipeline(
    hunter_key="YOUR_HUNTER_KEY",
    clearbit_key="YOUR_CLEARBIT_KEY",
    scraper_key=API_KEY,  # ScraperAPI key defined above
)

# A lead needs at least "domain"; first/last name enables person-level
# email lookup instead of a broad domain search.
leads = [
    {"domain": "stripe.com", "first_name": "John", "last_name": "Doe"},
    {"domain": "notion.so"},
    {"domain": "linear.app"},
]
enriched = pipeline.enrich_batch(leads)
Enter fullscreen mode Exit fullscreen mode

Exporting to CRM Format

def export_to_csv(enriched_leads, filename="leads_enriched.csv"):
    """Write enriched leads to a CRM-friendly CSV file.

    *enriched_leads* is a list of dicts as produced by
    ``LeadEnrichmentPipeline.enrich_lead``; missing fields become empty
    cells. The file is written as UTF-8 explicitly — relying on the
    platform default encoding crashes on non-ASCII company names on
    systems with a legacy locale (e.g. cp1252 on Windows).
    """
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow([
            "Domain", "Company", "Industry", "Employees",
            "Revenue", "Email", "Confidence", "Tech Stack"
        ])
        for lead in enriched_leads:
            company = lead.get("company", {})
            email = lead.get("email", {})
            writer.writerow([
                lead.get("domain", ""),  # tolerate partially-built lead dicts
                company.get("name", ""),
                company.get("industry", ""),
                company.get("employee_count", ""),
                company.get("revenue_range", ""),
                email.get("email", ""),
                email.get("confidence", ""),
                ", ".join(company.get("tech_stack", [])),
            ])
Enter fullscreen mode Exit fullscreen mode

For scaling lead enrichment across thousands of companies, use ThorData for residential proxies and ScrapeOps for monitoring.


B2B lead enrichment combines multiple data sources into actionable intelligence. API-first services like Hunter.io and Clearbit provide structured data, while web scraping fills the gaps. The key is building a pipeline that enriches reliably and respects rate limits.

Happy scraping!

Top comments (0)