Scraping LinkedIn Sales Navigator: Lead Generation at Scale
LinkedIn Sales Navigator is one of the most valuable B2B data sources. Understanding how to work with this data programmatically is essential for sales engineers building lead generation pipelines.
Legal Disclaimer
LinkedIn actively fights scraping. This guide is for educational purposes and working with data you have legitimate access to. Consider LinkedIn's official APIs for production use.
A Responsible Approach
Instead of scraping LinkedIn directly, we build a framework that works with exported CSV data, enriches leads from public sources, and structures everything in a CRM-ready format.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re, time
from urllib.parse import quote_plus
class LeadEnrichmentPipeline:
    """Enrich exported lead data (e.g. a Sales Navigator CSV) from public sources.

    All HTTP traffic goes through one ``requests.Session``; when an API key is
    supplied, requests are proxied through ScraperAPI.
    """

    # Seconds before an HTTP request is abandoned — prevents a hung pipeline
    # when a target site never responds.
    REQUEST_TIMEOUT = 15

    def __init__(self, api_key=None):
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': 'LeadResearch/1.0'})
        self.api_key = api_key

    def _fetch(self, url):
        """GET *url*, routing through ScraperAPI when an API key is configured."""
        if self.api_key:
            # HTTPS here is essential: the original used http://, which sent
            # the API key in cleartext.
            return self.session.get(
                f"https://api.scraperapi.com?api_key={self.api_key}&url={quote_plus(url)}",
                timeout=self.REQUEST_TIMEOUT)
        return self.session.get(url, timeout=self.REQUEST_TIMEOUT)

    def load_export(self, csv_path):
        """Load a CSV export and normalize column names to snake_case."""
        df = pd.read_csv(csv_path)
        df.columns = [c.strip().lower().replace(' ', '_') for c in df.columns]
        return df

    def enrich_company(self, company):
        """Return a dict with the company name plus its Crunchbase URL, if found.

        Searches Google restricted to crunchbase.com and takes the first
        organization link in the results.
        """
        url = f"https://www.google.com/search?q={quote_plus(company)}+site:crunchbase.com"
        resp = self._fetch(url)
        soup = BeautifulSoup(resp.text, 'html.parser')
        data = {'name': company}
        for r in soup.select('.g a'):
            if 'crunchbase.com/organization' in r.get('href', ''):
                data['crunchbase_url'] = r['href']
                break
        return data

    @staticmethod
    def _classify_pattern(local):
        """Classify an email local-part into a naming-convention label.

        Returns 'first.last', 'first_last', 'first-last', or 'other' for
        locals with no separator (e.g. 'info', 'jdoe').
        """
        if '.' in local:
            return 'first.last'
        if '_' in local:
            return 'first_last'
        if '-' in local:
            return 'first-last'
        return 'other'

    def find_email_pattern(self, domain):
        """Scrape *domain*'s homepage and guess its dominant email naming pattern.

        Collects addresses from both visible text and mailto: links, tallies
        the separator convention of each local-part, and returns the most
        common pattern, or 'unknown' if no classifiable address was found.
        """
        resp = self._fetch(f"https://{domain}")
        soup = BeautifulSoup(resp.text, 'html.parser')
        emails = set()
        pat = re.compile(r'[a-zA-Z0-9._%+-]+@' + re.escape(domain))
        for m in pat.finditer(soup.get_text()):
            emails.add(m.group().lower())
        for link in soup.select('a[href^="mailto:"]'):
            e = link['href'].replace('mailto:', '').split('?')[0]
            if domain in e:
                emails.add(e.lower())
        patterns = {}
        for e in emails:
            key = self._classify_pattern(e.split('@')[0])
            # 'other' (info@, jdoe@, ...) carries no convention signal — skip it
            # so such addresses still yield 'unknown', as before.
            if key != 'other':
                patterns[key] = patterns.get(key, 0) + 1
        return max(patterns, key=patterns.get) if patterns else 'unknown'
Lead Scoring
class LeadScorer:
    """Score leads against an ideal customer profile (ICP) dict."""

    def __init__(self):
        # NOTE: 'signals' and 'recency' are reserved for future criteria and
        # are not yet applied in score(); max achievable score is 75.
        self.weights = {'title': 30, 'size': 20, 'industry': 25, 'signals': 15, 'recency': 10}

    @staticmethod
    def _to_int(value):
        """Best-effort numeric coercion — CSV data may carry strings or NaN."""
        try:
            return int(float(value))
        except (TypeError, ValueError):
            return 0

    def score(self, lead, ideal):
        """Return a fit score (0-75) for *lead* against the *ideal* profile.

        *ideal* may supply 'titles' (list of substrings), 'company_size'
        ((lo, hi) tuple, default (50, 500)), and 'industries' (list).
        """
        s = 0
        # Title: missing CSV cells surface as float NaN, so guard the .lower()
        # call instead of assuming a string.
        title = lead.get('title', '')
        title = title.lower() if isinstance(title, str) else ''
        if any(t.lower() in title for t in ideal.get('titles', [])):
            s += self.weights['title']
        # Employee counts often arrive as strings (e.g. "250") — coerce safely.
        emp = self._to_int(lead.get('company_employees', 0))
        lo, hi = ideal.get('company_size', (50, 500))
        if lo <= emp <= hi:
            s += self.weights['size']
        if lead.get('industry') in ideal.get('industries', []):
            s += self.weights['industry']
        return s

    def qualify(self, df, ideal, threshold=50):
        """Score every row of *df*, keep rows at/above *threshold*, best first.

        Adds a 'score' column to *df* in place and returns the filtered,
        descending-sorted view.
        """
        df['score'] = df.apply(lambda r: self.score(r.to_dict(), ideal), axis=1)
        return df[df['score'] >= threshold].sort_values('score', ascending=False)
Full Pipeline
def run_pipeline(csv_path, ideal, api_key=None):
    """End-to-end: load a lead export, enrich each company, score, and save.

    Writes qualified rows to 'qualified_leads.csv' and returns them as a
    DataFrame sorted by score (best first).

    Parameters:
        csv_path: path to the exported leads CSV.
        ideal: ICP dict passed through to LeadScorer.qualify.
        api_key: optional ScraperAPI key for the enrichment fetches.
    """
    pipe = LeadEnrichmentPipeline(api_key=api_key)
    scorer = LeadScorer()
    leads = pipe.load_export(csv_path)
    print(f"Loaded {len(leads)} leads")
    for idx, row in leads.iterrows():
        co = row.get('company', '')
        # Guard against missing cells: pandas yields float NaN, which is
        # truthy and would crash URL-quoting inside enrich_company.
        if isinstance(co, str) and co.strip():
            info = pipe.enrich_company(co)
            for k, v in info.items():
                leads.at[idx, k] = v
            time.sleep(1)  # be polite: space requests ~1s apart
    qualified = scorer.qualify(leads, ideal)
    print(f"Qualified: {len(qualified)}/{len(leads)}")
    qualified.to_csv('qualified_leads.csv', index=False)
    return qualified
For enriching hundreds of leads, ScraperAPI handles IP rotation and CAPTCHAs. ThorData provides residential proxies. Track pipeline health with ScrapeOps.
Best Practices
- Start with your own CSV export data
- Enrich, don't scrape directly
- Space requests 1-2 seconds apart
- Verify before outreach
- Re-enrich leads monthly
Follow for more Python automation tutorials.
Top comments (0)