DEV Community

agenthustler
agenthustler

Posted on

Scraping LinkedIn Sales Navigator: Lead Generation at Scale

Scraping LinkedIn Sales Navigator: Lead Generation at Scale

LinkedIn Sales Navigator is one of the most valuable B2B data sources. Understanding how to work with this data programmatically is essential for sales engineers building lead generation pipelines.

Legal Disclaimer

LinkedIn actively fights scraping. This guide is for educational purposes and working with data you have legitimate access to. Consider LinkedIn's official APIs for production use.

A Responsible Approach

Instead of direct scraping, we build a framework that works with exported CSV data, enriches leads from public sources, and structures everything CRM-ready.

import requests
from bs4 import BeautifulSoup
import pandas as pd
import re, time
from urllib.parse import quote_plus

class LeadEnrichmentPipeline:
    """Enriches exported lead data (CSV) with public company and email info.

    Works from a Sales Navigator CSV export rather than scraping LinkedIn
    directly; outbound lookups go through ScraperAPI when a key is supplied.
    """

    # Seconds before an outbound HTTP request is abandoned; without this,
    # a stalled server would hang the whole enrichment run.
    REQUEST_TIMEOUT = 15

    def __init__(self, api_key=None):
        """api_key: optional ScraperAPI key used to proxy requests."""
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': 'LeadResearch/1.0'})
        self.api_key = api_key

    def _fetch(self, url):
        """GET *url*, routed through ScraperAPI when an api_key is set.

        Uses HTTPS for the proxy endpoint (the key travels in the query
        string, so plain HTTP would leak it) and always passes a timeout.
        """
        if self.api_key:
            return self.session.get(
                f"https://api.scraperapi.com?api_key={self.api_key}&url={quote_plus(url)}",
                timeout=self.REQUEST_TIMEOUT)
        return self.session.get(url, timeout=self.REQUEST_TIMEOUT)

    def load_export(self, csv_path):
        """Load a Sales Navigator CSV export, normalizing column names to
        snake_case (e.g. 'First Name' -> 'first_name')."""
        df = pd.read_csv(csv_path)
        df.columns = [c.strip().lower().replace(' ', '_') for c in df.columns]
        return df

    def enrich_company(self, company):
        """Look up *company* on Crunchbase via a Google site: search.

        Returns a dict with at least 'name', plus 'crunchbase_url' when a
        matching organization link is found in the results.
        """
        url = f"https://www.google.com/search?q={quote_plus(company)}+site:crunchbase.com"
        resp = self._fetch(url)
        soup = BeautifulSoup(resp.text, 'html.parser')
        data = {'name': company}
        for r in soup.select('.g a'):
            if 'crunchbase.com/organization' in r.get('href', ''):
                data['crunchbase_url'] = r['href']
                break
        return data

    def find_email_pattern(self, domain):
        """Infer the dominant email-address pattern for *domain* by scanning
        its homepage for visible addresses and mailto: links.

        Returns a pattern label such as 'first.last', 'first_last', or
        'flat', or 'unknown' when no addresses are found.
        """
        resp = self._fetch(f"https://{domain}")
        soup = BeautifulSoup(resp.text, 'html.parser')
        emails = set()
        pat = re.compile(r'[a-zA-Z0-9._%+-]+@' + re.escape(domain))
        for m in pat.finditer(soup.get_text()):
            emails.add(m.group().lower())
        for link in soup.select('a[href^="mailto:"]'):
            e = link['href'].replace('mailto:', '').split('?')[0]
            if domain in e:
                emails.add(e.lower())
        return self._classify_patterns(emails)

    @staticmethod
    def _classify_patterns(emails):
        """Tally a pattern label per email local-part and return the most
        common one, or 'unknown' when *emails* is empty.

        Fixes the original tally, which only ever counted 'first.last' and
        silently dropped addresses without a dot, so the max() was trivial.
        """
        patterns = {}
        for e in emails:
            local = e.split('@')[0]
            if '.' in local:
                label = 'first.last'
            elif '_' in local:
                label = 'first_last'
            elif '-' in local:
                label = 'first-last'
            else:
                # e.g. 'jsmith' or 'info' -- ambiguous without a known name.
                label = 'flat'
            patterns[label] = patterns.get(label, 0) + 1
        return max(patterns, key=patterns.get) if patterns else 'unknown'
(End of code listing.)

Lead Scoring

class LeadScorer:
    """Scores leads against an ideal-customer-profile (ICP) dict."""

    def __init__(self):
        # Relative importance of each scoring dimension (sums to 100).
        # Only title/size/industry are scored below; 'signals' and
        # 'recency' are reserved weights not yet applied.
        self.weights = {'title': 30, 'size': 20, 'industry': 25, 'signals': 15, 'recency': 10}

    def score(self, lead, ideal):
        """Return a score for one lead dict against *ideal*.

        Tolerates CSV-sourced rows: missing cells arrive as NaN floats
        (so .get defaults never trigger), hence the explicit type guards.
        """
        s = 0
        # Title match -- a missing title is NaN (float), not '', so coerce
        # non-strings before calling .lower().
        title = lead.get('title')
        if not isinstance(title, str):
            title = ''
        title = title.lower()
        if any(t.lower() in title for t in ideal.get('titles', [])):
            s += self.weights['title']
        # Company-size match -- employee counts may be missing, None, or
        # non-numeric strings; treat anything unparsable as 0.
        try:
            emp = float(lead.get('company_employees') or 0)
        except (TypeError, ValueError):
            emp = 0.0
        lo, hi = ideal.get('company_size', (50, 500))
        if lo <= emp <= hi:
            s += self.weights['size']
        if lead.get('industry') in ideal.get('industries', []):
            s += self.weights['industry']
        return s

    def qualify(self, df, ideal, threshold=50):
        """Score every row of *df* (adds a 'score' column in place) and
        return rows at/above *threshold*, sorted by score descending."""
        df['score'] = df.apply(lambda r: self.score(r.to_dict(), ideal), axis=1)
        return df[df['score'] >= threshold].sort_values('score', ascending=False)
(End of code listing.)

Full Pipeline

def run_pipeline(csv_path, ideal, api_key=None):
    """Load a Sales Navigator export, enrich each lead's company, score
    against the ideal-customer profile, and persist qualified leads.

    Parameters:
        csv_path: path to the exported CSV file.
        ideal: ICP criteria dict (titles, company_size, industries).
        api_key: optional ScraperAPI key forwarded to the enrichment pipeline.

    Returns the qualified-leads DataFrame (sorted by score descending);
    also writes it to 'qualified_leads.csv' as a side effect.
    """
    pipe = LeadEnrichmentPipeline(api_key=api_key)
    scorer = LeadScorer()
    leads = pipe.load_export(csv_path)
    print(f"Loaded {len(leads)} leads")
    for idx, row in leads.iterrows():
        co = row.get('company', '')
        # Missing CSV cells surface as NaN, which is a *truthy* float --
        # a bare `if co:` would pass it straight into quote_plus and crash.
        # Only enrich real, non-empty company names.
        if isinstance(co, str) and co.strip():
            try:
                info = pipe.enrich_company(co)
            except requests.RequestException:
                # One failed lookup shouldn't abort the whole batch.
                info = {}
            for k, v in info.items():
                leads.at[idx, k] = v
            # Rate-limit only rows that actually hit the network.
            time.sleep(1)
    qualified = scorer.qualify(leads, ideal)
    print(f"Qualified: {len(qualified)}/{len(leads)}")
    qualified.to_csv('qualified_leads.csv', index=False)
    return qualified
(End of code listing.)

For enriching hundreds of leads, ScraperAPI handles IP rotation and CAPTCHAs. ThorData provides residential proxies. Track pipeline health with ScrapeOps.

Best Practices

  1. Start with your own CSV export data
  2. Enrich, don't scrape directly
  3. Space requests 1-2 seconds apart
  4. Verify before outreach
  5. Re-enrich leads monthly

Follow for more Python automation tutorials.

Top comments (0)