DEV Community

agenthustler
agenthustler

Posted on

Scraping LinkedIn Sales Navigator: Lead Generation at Scale

Scraping LinkedIn Sales Navigator: Lead Generation at Scale

LinkedIn Sales Navigator is one of the most valuable B2B data sources. Understanding how to work with this data programmatically is essential for sales engineers building lead generation pipelines.

Legal Disclaimer

LinkedIn actively fights scraping. This guide is for educational purposes and working with data you have legitimate access to. Consider LinkedIn's official APIs for production use.

A Responsible Approach

Instead of direct scraping, we build a framework that works with exported CSV data, enriches leads from public sources, and structures everything CRM-ready.

import requests
from bs4 import BeautifulSoup
import pandas as pd
import re, time
from urllib.parse import quote_plus

class LeadEnrichmentPipeline:
    """Enriches exported lead data (CSV) with public company and email info.

    Works from a Sales Navigator CSV export rather than scraping LinkedIn
    directly; outbound lookups go through ScraperAPI when a key is supplied.
    """

    # Seconds before an outbound HTTP request is abandoned; without this,
    # a stalled server would hang the whole enrichment run.
    REQUEST_TIMEOUT = 15

    def __init__(self, api_key=None):
        """api_key: optional ScraperAPI key used to proxy requests."""
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': 'LeadResearch/1.0'})
        self.api_key = api_key

    def _fetch(self, url):
        """GET *url*, routed through ScraperAPI when an api_key is set.

        Uses HTTPS for the proxy endpoint (the key travels in the query
        string, so plain HTTP would leak it) and always passes a timeout.
        """
        if self.api_key:
            return self.session.get(
                f"https://api.scraperapi.com?api_key={self.api_key}&url={quote_plus(url)}",
                timeout=self.REQUEST_TIMEOUT)
        return self.session.get(url, timeout=self.REQUEST_TIMEOUT)

    def load_export(self, csv_path):
        """Load a Sales Navigator CSV export, normalizing column names to
        snake_case (e.g. 'First Name' -> 'first_name')."""
        df = pd.read_csv(csv_path)
        df.columns = [c.strip().lower().replace(' ', '_') for c in df.columns]
        return df

    def enrich_company(self, company):
        """Look up *company* on Crunchbase via a Google site: search.

        Returns a dict with at least 'name', plus 'crunchbase_url' when a
        matching organization link is found in the results.
        """
        url = f"https://www.google.com/search?q={quote_plus(company)}+site:crunchbase.com"
        resp = self._fetch(url)
        soup = BeautifulSoup(resp.text, 'html.parser')
        data = {'name': company}
        for r in soup.select('.g a'):
            if 'crunchbase.com/organization' in r.get('href', ''):
                data['crunchbase_url'] = r['href']
                break
        return data

    def find_email_pattern(self, domain):
        """Infer the dominant email-address pattern for *domain* by scanning
        its homepage for visible addresses and mailto: links.

        Returns a pattern label such as 'first.last', 'first_last', or
        'flat', or 'unknown' when no addresses are found.
        """
        resp = self._fetch(f"https://{domain}")
        soup = BeautifulSoup(resp.text, 'html.parser')
        emails = set()
        pat = re.compile(r'[a-zA-Z0-9._%+-]+@' + re.escape(domain))
        for m in pat.finditer(soup.get_text()):
            emails.add(m.group().lower())
        for link in soup.select('a[href^="mailto:"]'):
            e = link['href'].replace('mailto:', '').split('?')[0]
            if domain in e:
                emails.add(e.lower())
        return self._classify_patterns(emails)

    @staticmethod
    def _classify_patterns(emails):
        """Tally a pattern label per email local-part and return the most
        common one, or 'unknown' when *emails* is empty.

        Fixes the original tally, which only ever counted 'first.last' and
        silently dropped addresses without a dot, so the max() was trivial.
        """
        patterns = {}
        for e in emails:
            local = e.split('@')[0]
            if '.' in local:
                label = 'first.last'
            elif '_' in local:
                label = 'first_last'
            elif '-' in local:
                label = 'first-last'
            else:
                # e.g. 'jsmith' or 'info' -- ambiguous without a known name.
                label = 'flat'
            patterns[label] = patterns.get(label, 0) + 1
        return max(patterns, key=patterns.get) if patterns else 'unknown'
(End of code listing.)

Lead Scoring

class LeadScorer:
    """Scores leads against an ideal-customer-profile (ICP) dict."""

    def __init__(self):
        # Relative importance of each scoring dimension (sums to 100).
        # Only title/size/industry are scored below; 'signals' and
        # 'recency' are reserved weights not yet applied.
        self.weights = {'title': 30, 'size': 20, 'industry': 25, 'signals': 15, 'recency': 10}

    def score(self, lead, ideal):
        """Return a score for one lead dict against *ideal*.

        Tolerates CSV-sourced rows: missing cells arrive as NaN floats
        (so .get defaults never trigger), hence the explicit type guards.
        """
        s = 0
        # Title match -- a missing title is NaN (float), not '', so coerce
        # non-strings before calling .lower().
        title = lead.get('title')
        if not isinstance(title, str):
            title = ''
        title = title.lower()
        if any(t.lower() in title for t in ideal.get('titles', [])):
            s += self.weights['title']
        # Company-size match -- employee counts may be missing, None, or
        # non-numeric strings; treat anything unparsable as 0.
        try:
            emp = float(lead.get('company_employees') or 0)
        except (TypeError, ValueError):
            emp = 0.0
        lo, hi = ideal.get('company_size', (50, 500))
        if lo <= emp <= hi:
            s += self.weights['size']
        if lead.get('industry') in ideal.get('industries', []):
            s += self.weights['industry']
        return s

    def qualify(self, df, ideal, threshold=50):
        """Score every row of *df* (adds a 'score' column in place) and
        return rows at/above *threshold*, sorted by score descending."""
        df['score'] = df.apply(lambda r: self.score(r.to_dict(), ideal), axis=1)
        return df[df['score'] >= threshold].sort_values('score', ascending=False)
(End of code listing.)

Full Pipeline

def run_pipeline(csv_path, ideal, api_key=None):
    """Load a Sales Navigator export, enrich each lead's company, score
    against the ideal-customer profile, and persist qualified leads.

    Parameters:
        csv_path: path to the exported CSV file.
        ideal: ICP criteria dict (titles, company_size, industries).
        api_key: optional ScraperAPI key forwarded to the enrichment pipeline.

    Returns the qualified-leads DataFrame (sorted by score descending);
    also writes it to 'qualified_leads.csv' as a side effect.
    """
    pipe = LeadEnrichmentPipeline(api_key=api_key)
    scorer = LeadScorer()
    leads = pipe.load_export(csv_path)
    print(f"Loaded {len(leads)} leads")
    for idx, row in leads.iterrows():
        co = row.get('company', '')
        # Missing CSV cells surface as NaN, which is a *truthy* float --
        # a bare `if co:` would pass it straight into quote_plus and crash.
        # Only enrich real, non-empty company names.
        if isinstance(co, str) and co.strip():
            try:
                info = pipe.enrich_company(co)
            except requests.RequestException:
                # One failed lookup shouldn't abort the whole batch.
                info = {}
            for k, v in info.items():
                leads.at[idx, k] = v
            # Rate-limit only rows that actually hit the network.
            time.sleep(1)
    qualified = scorer.qualify(leads, ideal)
    print(f"Qualified: {len(qualified)}/{len(leads)}")
    qualified.to_csv('qualified_leads.csv', index=False)
    return qualified
(End of code listing.)

For enriching hundreds of leads, ScraperAPI handles IP rotation and CAPTCHAs. ThorData provides residential proxies. Track pipeline health with ScrapeOps.

Best Practices

  1. Start with your own CSV export data
  2. Enrich, don't scrape directly
  3. Space requests 1-2 seconds apart
  4. Verify before outreach
  5. Re-enrich leads monthly

Follow for more Python automation tutorials.

Top comments (0)