LinkedIn is the richest professional data source on the internet — 1 billion members, 65 million job listings, detailed company profiles, salary insights. If you're building recruitment tools, job market analysis, salary benchmarking software, or competitive intelligence, LinkedIn is almost certainly in your data plan.
But LinkedIn is also one of the most aggressively defended platforms. Their legal team sent cease-and-desist letters to major scrapers, their bot detection is sophisticated, and their rate limits are tightly enforced. This guide covers what works in 2026, with real code.
The Legal Landscape (Read This First)
The landmark case here is hiQ Labs v. LinkedIn, which went through multiple rounds of appeals before settling in 2022. The short version: the Ninth Circuit held that scraping publicly accessible LinkedIn data (job listings, public company pages, public profiles) does not violate the CFAA in the US — though note that hiQ ultimately lost LinkedIn's breach-of-contract claims in the final judgment, so Terms-of-Service exposure remains even where the CFAA doesn't apply. In CFAA terms, it is similar to scraping Google search results.
What's not legal:
- Logging in with credentials to scrape gated content (violates CFAA and LinkedIn ToS)
- Mass scraping member profiles in bulk
- Scraping to build a competing network or database of user PII
Job listings are the cleanest target: they're explicitly public, LinkedIn wants them indexed by Google, and they contain little personal data — mostly company names. Note that any recruiter contact emails a listing includes may still qualify as personal data under GDPR/CCPA, so handle those fields accordingly.
Disclaimer: This article is for educational purposes. Always review the ToS of any platform you scrape, consult legal counsel for commercial applications, and comply with GDPR/CCPA when handling any personal data.
What Data Is Available and How
LinkedIn has several distinct data layers:
| Data Type | Public? | Best Method |
|---|---|---|
| Job listings | Yes | Direct scraping or API |
| Company pages | Yes | Direct scraping |
| Public profiles | Yes (limited) | Very carefully |
| Connection graphs | No | Off-limits |
| Salary data | Partial (aggregated) | LinkedIn API only |
For job market intelligence, we focus on job listings — the safest and most valuable target.
Approach 1: LinkedIn's Official Job Search API
LinkedIn offers an official Jobs API, but it's restricted. To get access, you need a LinkedIn Partner Program application, which is gated behind a business review. For most developers, this isn't viable.
There is, however, a limited guest API that LinkedIn's own frontend uses. This isn't documented, isn't stable, and LinkedIn could break it anytime — but it's been functional for several years:
import httpx
import json
from typing import Optional
import time
class LinkedInJobScraper:
    """Scrape public LinkedIn job listings through the guest (logged-out) API.

    These are the same endpoints LinkedIn's own logged-out frontend calls.
    They are undocumented and unstable: LinkedIn may change or remove them
    at any time, so treat failures as expected.

    Can be used as a context manager so the underlying HTTP client is
    always closed::

        with LinkedInJobScraper() as scraper:
            jobs = scraper.paginate_jobs("python", "Berlin")
    """

    BASE_URL = "https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search"
    JOB_DETAIL_URL = "https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{job_id}"

    # Browser-like headers; a consistent UA/Accept/Referer set avoids the
    # most trivial bot flags.
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Referer": "https://www.linkedin.com/jobs/",
    }

    def __init__(self, delay: float = 2.0):
        """delay: seconds to sleep between paginated requests (politeness)."""
        self.delay = delay
        self.client = httpx.Client(headers=self.HEADERS, timeout=30)

    def close(self) -> None:
        """Release the underlying HTTP connection pool."""
        self.client.close()

    def __enter__(self) -> "LinkedInJobScraper":
        return self

    def __exit__(self, *exc_info) -> None:
        # Context-manager support: guarantees the httpx.Client is closed.
        self.close()

    def search_jobs(
        self,
        keywords: str,
        location: str = "",
        geo_id: Optional[str] = None,
        experience_levels: Optional[list[str]] = None,
        job_types: Optional[list[str]] = None,
        remote: bool = False,
        start: int = 0,
    ) -> list[dict]:
        """
        Search LinkedIn job listings and return one page of parsed results.

        experience_levels: "1" (internship), "2" (entry), "3" (associate),
            "4" (mid-senior), "5" (director), "6" (executive)
        job_types: "F" (full-time), "P" (part-time), "C" (contract),
            "T" (temporary), "I" (internship), "V" (volunteer)

        Raises RateLimitError on HTTP 429 and httpx.HTTPStatusError on
        any other 4xx/5xx response.
        """
        params = {
            "keywords": keywords,
            "location": location,
            "start": start,
            "count": 25,
        }
        if geo_id:
            params["geoId"] = geo_id
        if remote:
            params["f_WT"] = "2"  # Remote work type
        if experience_levels:
            params["f_E"] = ",".join(experience_levels)
        if job_types:
            params["f_JT"] = ",".join(job_types)
        response = self.client.get(self.BASE_URL, params=params)
        if response.status_code == 429:
            raise RateLimitError(f"Rate limited. Retry-After: {response.headers.get('Retry-After', 'unknown')}")
        response.raise_for_status()
        return self._parse_job_list(response.text)

    def _parse_job_list(self, html: str) -> list[dict]:
        """Parse job listing HTML from LinkedIn's guest API."""
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(html, "html.parser")
        jobs = []
        for card in soup.find_all("div", class_="base-card"):
            try:
                job_id = card.get("data-entity-urn", "").split(":")[-1]
                title_el = card.find("h3", class_="base-search-card__title")
                company_el = card.find("h4", class_="base-search-card__subtitle")
                location_el = card.find("span", class_="job-search-card__location")
                date_el = card.find("time")
                jobs.append({
                    "job_id": job_id,
                    "title": title_el.get_text(strip=True) if title_el else None,
                    "company": company_el.get_text(strip=True) if company_el else None,
                    "location": location_el.get_text(strip=True) if location_el else None,
                    "posted_at": date_el.get("datetime") if date_el else None,
                    "job_url": f"https://www.linkedin.com/jobs/view/{job_id}",
                })
            except Exception:
                # Skip malformed cards rather than failing the whole page.
                continue
        return jobs

    def get_job_details(self, job_id: str) -> dict:
        """Fetch full job description and criteria for one job listing."""
        from bs4 import BeautifulSoup
        response = self.client.get(self.JOB_DETAIL_URL.format(job_id=job_id))
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        description = soup.find("div", class_="description__text")
        criteria = soup.find_all("li", class_="description__job-criteria-item")
        criteria_dict = {}
        for item in criteria:
            header = item.find("h3")
            value = item.find("span")
            if header and value:
                criteria_dict[header.get_text(strip=True)] = value.get_text(strip=True)
        return {
            "job_id": job_id,
            "description": description.get_text(separator="\n", strip=True) if description else None,
            "criteria": criteria_dict,
        }

    def paginate_jobs(
        self,
        keywords: str,
        location: str = "",
        max_results: int = 500,
        **kwargs,
    ) -> list[dict]:
        """Paginate through search results up to max_results listings."""
        all_jobs = []
        start = 0
        while len(all_jobs) < max_results:
            batch = self.search_jobs(keywords, location, start=start, **kwargs)
            if not batch:
                break
            all_jobs.extend(batch)
            # Advance by what we actually received: the guest API sometimes
            # returns fewer than the requested 25, and a fixed +25 step
            # would silently skip listings in that case.
            start += len(batch)
            print(f"Collected {len(all_jobs)} jobs...")
            time.sleep(self.delay)  # Be polite
        return all_jobs[:max_results]
class RateLimitError(Exception):
    """Raised when LinkedIn answers a request with HTTP 429 (rate limited)."""
    pass
# Usage
job_scraper = LinkedInJobScraper(delay=2.5)
# Up to 200 remote, full-time Python roles in the Bay Area.
listings = job_scraper.paginate_jobs(
    keywords="Python developer",
    location="San Francisco, CA",
    max_results=200,
    remote=True,
    job_types=["F"],  # Full-time only
)
print(f"Found {len(listings)} jobs")
Approach 2: Full Browser Automation with Playwright
For more complex searches or when the guest API returns incomplete data, Playwright gives you a full browser with JavaScript execution:
import asyncio
from playwright.async_api import async_playwright, Page
import json
import re
async def scrape_linkedin_jobs_browser(
    search_term: str,
    location: str,
    max_jobs: int = 100,
) -> list[dict]:
    """Scrape LinkedIn jobs using a real browser.

    Collects up to max_jobs public job cards for the given search term and
    location by scrolling the logged-out jobs search page. The browser is
    always closed, even if navigation or a selector wait fails.
    """
    # Encode the query properly: raw f-string interpolation breaks on
    # '&', '#', '=' or non-ASCII characters in the search inputs.
    from urllib.parse import urlencode
    query = urlencode({"keywords": search_term, "location": location})

    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=True,
            args=[
                "--disable-blink-features=AutomationControlled",
                "--disable-dev-shm-usage",
            ],
        )
        try:
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
                viewport={"width": 1440, "height": 900},
                # Mimic real browser locale settings
                locale="en-US",
                timezone_id="America/New_York",
            )
            # Override navigator.webdriver to avoid basic automation detection
            await context.add_init_script("""
                Object.defineProperty(navigator, 'webdriver', {
                    get: () => undefined
                });
            """)
            page = await context.new_page()
            # Navigate to LinkedIn jobs (no login required for public listings)
            url = f"https://www.linkedin.com/jobs/search/?{query}"
            await page.goto(url, wait_until="domcontentloaded")
            # Wait for job cards to load
            await page.wait_for_selector(".jobs-search__results-list", timeout=10000)
            jobs = []
            seen_ids = set()
            # Scroll and collect jobs until we have enough or run out
            while len(jobs) < max_jobs:
                job_cards = await page.query_selector_all(".job-search-card")
                for card in job_cards:
                    job_id = await card.get_attribute("data-id")
                    if not job_id or job_id in seen_ids:
                        continue  # de-duplicate across scroll batches
                    seen_ids.add(job_id)
                    title = await card.query_selector(".job-search-card__title")
                    company = await card.query_selector(".job-search-card__company-name")
                    location_el = await card.query_selector(".job-search-card__location")
                    jobs.append({
                        "job_id": job_id,
                        "title": await title.inner_text() if title else None,
                        "company": await company.inner_text() if company else None,
                        "location": await location_el.inner_text() if location_el else None,
                        "url": f"https://www.linkedin.com/jobs/view/{job_id}",
                    })
                    if len(jobs) >= max_jobs:
                        break
                # Scroll to trigger lazy loading of the next batch
                await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                await asyncio.sleep(2)
                # Click "See more jobs" if present
                see_more = await page.query_selector("button.infinite-scroller__show-more-button")
                if see_more:
                    await see_more.click()
                    await asyncio.sleep(2)
                else:
                    break  # No more jobs to load
        finally:
            # Guarantee cleanup: a crashed navigation or timed-out selector
            # no longer leaks a headless Chromium process.
            await browser.close()
        return jobs[:max_jobs]
# Run it
browser_jobs = asyncio.run(
    scrape_linkedin_jobs_browser(
        search_term="machine learning engineer",
        location="Remote",
        max_jobs=50,
    )
)
Approach 3: Dedicated LinkedIn Scraping Services
Maintaining your own LinkedIn scraper is high-maintenance work. LinkedIn actively deploys counter-measures: IP-based blocks, CAPTCHA challenges, JavaScript fingerprinting, and request pattern analysis.
For teams building production job market intelligence tools, a managed scraping service removes this operational burden. Services like the LinkedIn Job Scraper on Apify handle:
- Proxy rotation across residential IPs
- CAPTCHA solving
- Session management
- Structured output (JSON, CSV, or direct to your database)
- Legal compliance considerations
import httpx
import time
def scrape_linkedin_via_apify(
    search_term: str,
    location: str,
    max_results: int = 1000,
    apify_token: Optional[str] = None,
    poll_interval: float = 10.0,
) -> list[dict]:
    """Fetch LinkedIn job data via Apify's managed scraper.

    Starts an actor run, polls until it finishes, then downloads the
    dataset items.

    poll_interval: seconds between status checks (in production, prefer
        webhooks over polling).

    Raises httpx.HTTPStatusError on API-level errors and RuntimeError if
    the run ends in a non-success state.
    """
    actor_base = "https://api.apify.com/v2/acts/cryptosignals~linkedin-jobs-scraper"
    auth_headers = {"Authorization": f"Bearer {apify_token}"}

    # Start the actor run
    run_response = httpx.post(
        f"{actor_base}/runs",
        headers=auth_headers,
        json={
            "searchTerms": [search_term],
            "location": location,
            "maxResults": max_results,
            "datePosted": "past-week",  # Options: past-24h, past-week, past-month, any-time
            "jobType": "full-time",
        },
        timeout=30,
    )
    # Fail fast with a clear HTTP error instead of a KeyError on ["data"].
    run_response.raise_for_status()
    run_data = run_response.json()["data"]
    run_id = run_data["id"]
    print(f"Started run {run_id}, waiting for completion...")

    # Poll for completion (in production, use webhooks)
    while True:
        status_response = httpx.get(
            f"{actor_base}/runs/{run_id}",
            headers=auth_headers,
            timeout=30,
        )
        status_response.raise_for_status()
        run_info = status_response.json()["data"]
        status = run_info["status"]
        if status == "SUCCEEDED":
            break
        if status in ("FAILED", "ABORTED", "TIMED-OUT"):
            # RuntimeError is still caught by callers handling Exception.
            raise RuntimeError(f"Run failed with status: {status}")
        print(f"Status: {status}...")
        time.sleep(poll_interval)

    # Fetch results from the run's default dataset
    dataset_id = run_info["defaultDatasetId"]
    results_response = httpx.get(
        f"https://api.apify.com/v2/datasets/{dataset_id}/items",
        headers=auth_headers,
        params={"format": "json", "limit": max_results},
        timeout=60,
    )
    results_response.raise_for_status()
    return results_response.json()
Building the Intelligence Layer
Raw job data is interesting. Processed job data is valuable. Here's how to turn a raw job dump into market intelligence:
import json
import re
from collections import Counter, defaultdict
from dataclasses import dataclass, field
import sqlite3
@dataclass
class JobMarketAnalysis:
    """Aggregated market-intelligence metrics derived from raw job listings."""
    total_jobs: int  # number of listings analyzed
    top_companies: list[tuple[str, int]]  # (company, openings), most frequent first
    skill_frequency: dict[str, int]  # skill -> number of listings mentioning it
    location_distribution: dict[str, int]  # normalized location -> listings
    remote_percentage: float  # share of listings marked remote, 0-100
    seniority_breakdown: dict[str, int]  # seniority level -> listings


# Common tech skills to track
TRACKED_SKILLS = [
    "python", "javascript", "typescript", "rust", "go", "java", "c++",
    "react", "vue", "angular", "fastapi", "django", "flask", "node.js",
    "postgresql", "mysql", "mongodb", "redis", "elasticsearch",
    "docker", "kubernetes", "aws", "gcp", "azure", "terraform",
    "machine learning", "pytorch", "tensorflow", "scikit-learn",
    "llm", "rag", "langchain", "openai",
]

# Order matters: the first pattern that matches a title wins.
SENIORITY_PATTERNS = {
    "intern": r"\b(intern|internship)\b",
    "junior": r"\b(junior|jr\.?|entry.?level|new grad)\b",
    "mid": r"\b(mid.?level|intermediate|ii|2)\b",
    "senior": r"\b(senior|sr\.?|iii|3)\b",
    "lead": r"\b(lead|principal|staff)\b",
    "manager": r"\b(manager|director|vp|head of)\b",
}


def _compile_skill(skill: str) -> "re.Pattern":
    r"""Compile a whole-token pattern for one skill name.

    A plain \b only matches next to a word character, so for a skill that
    ends in a symbol (e.g. "c++") the pattern r"\bc\+\+\b" requires a word
    character AFTER the final "+" — it can never match "c++ " or "c++" at
    the end of the text, and the skill is silently never counted. We anchor
    with lookarounds instead, dropping the trailing anchor when the skill
    ends in a non-word character (so "c++17" still counts as c++).
    """
    tail = r"(?!\w)" if (skill[-1].isalnum() or skill[-1] == "_") else ""
    return re.compile(r"(?<!\w)" + re.escape(skill) + tail)


# Precompiled patterns, hoisted out of the per-job loop.
_SKILL_RES = {skill: _compile_skill(skill) for skill in TRACKED_SKILLS}
_SENIORITY_RES = {
    level: re.compile(pattern, re.IGNORECASE)
    for level, pattern in SENIORITY_PATTERNS.items()
}
_REMOTE_RE = re.compile(r"\bremote\b")


def analyze_jobs(jobs: list[dict]) -> JobMarketAnalysis:
    """Extract market intelligence from raw job listings.

    Each job dict may provide "title", "description", "location" and
    "company"; missing fields are tolerated. A skill is counted at most
    once per listing, no matter how often it appears.
    """
    companies = Counter()
    skills = Counter()
    locations = Counter()
    remote_count = 0
    seniority = Counter()

    for job in jobs:
        title = (job.get("title") or "").lower()
        description = (job.get("description") or "").lower()
        location = (job.get("location") or "").strip()
        company = (job.get("company") or "").strip()
        full_text = f"{title} {description}"

        # Count companies
        if company:
            companies[company] += 1

        # Track skills mentioned in title or description
        for skill, skill_re in _SKILL_RES.items():
            if skill_re.search(full_text):
                skills[skill] += 1

        # Location analysis
        if location:
            # Normalize "Remote" variants
            if _REMOTE_RE.search(location.lower()):
                locations["Remote"] += 1
                remote_count += 1
            else:
                # Extract city from "City, State, Country"
                city = location.split(",")[0].strip()
                locations[city] += 1

        # Seniority detection: first matching level wins
        for level, level_re in _SENIORITY_RES.items():
            if level_re.search(title):
                seniority[level] += 1
                break
        else:
            seniority["unclassified"] += 1

    return JobMarketAnalysis(
        total_jobs=len(jobs),
        top_companies=companies.most_common(20),
        skill_frequency=dict(skills.most_common(30)),
        location_distribution=dict(locations.most_common(20)),
        remote_percentage=remote_count / len(jobs) * 100 if jobs else 0,
        seniority_breakdown=dict(seniority),
    )
def generate_market_report(analysis: JobMarketAnalysis, search_term: str) -> str:
    """Render a human-readable Markdown market report from an analysis."""
    total = analysis.total_jobs
    ranked_skills = sorted(
        analysis.skill_frequency.items(), key=lambda kv: kv[1], reverse=True
    )[:10]
    ranked_seniority = sorted(
        analysis.seniority_breakdown.items(), key=lambda kv: kv[1], reverse=True
    )

    # Pre-join each section so the template below stays readable.
    company_lines = "\n".join(
        f"- {company}: {count} openings"
        for company, count in analysis.top_companies[:10]
    )
    skill_lines = "\n".join(
        f"- {skill}: {count} mentions ({count/total*100:.1f}%)"
        for skill, count in ranked_skills
    )
    seniority_lines = "\n".join(
        f"- {level.title()}: {count} ({count/total*100:.1f}%)"
        for level, count in ranked_seniority
    )
    location_lines = "\n".join(
        f"- {loc}: {count}"
        for loc, count in list(analysis.location_distribution.items())[:10]
    )

    return f"""
# Job Market Intelligence: {search_term}
**Total listings analyzed**: {total:,}
**Remote percentage**: {analysis.remote_percentage:.1f}%
## Top Hiring Companies
{company_lines}
## Most In-Demand Skills
{skill_lines}
## Seniority Distribution
{seniority_lines}
## Top Locations
{location_lines}
"""
# Example usage
de_scraper = LinkedInJobScraper(delay=2.0)
de_jobs = de_scraper.paginate_jobs("data engineer", "United States", max_results=500)

# Enrich the first 100 listings with full descriptions, throttled so we
# stay well under LinkedIn's rate limits.
for idx, listing in enumerate(de_jobs[:100]):
    try:
        listing.update(de_scraper.get_job_details(listing["job_id"]))
        if idx % 10 == 0:
            print(f"Enriched {idx}/100 jobs...")
        time.sleep(1.5)
    except Exception as err:
        print(f"Failed to enrich job {listing['job_id']}: {err}")

market_stats = analyze_jobs(de_jobs)
print(generate_market_report(market_stats, "data engineer"))
Rate Limiting and Getting Past Detection
LinkedIn's bot detection is ML-based and looks at:
- Request velocity — more than 3-5 requests/second is an immediate flag
- Session patterns — a session that only hits job search endpoints looks robotic
- Header consistency — mismatched Accept-Language/User-Agent is a signal
- JavaScript fingerprinting — in browser mode, missing browser APIs give you away
- IP reputation — datacenter IPs get flagged faster than residential
What works:
import asyncio
import random
async def human_like_delay(min_seconds: float = 1.5, max_seconds: float = 4.0):
    """Sleep for a randomized, human-looking interval.

    Draws from a normal distribution centered between the two bounds and
    clamps the sample into [min_seconds, max_seconds] — real users never
    wait exactly the same time between actions.
    """
    center = (min_seconds + max_seconds) / 2
    spread = (max_seconds - min_seconds) / 4
    sampled = random.gauss(mu=center, sigma=spread)
    clamped = min(max_seconds, max(min_seconds, sampled))
    await asyncio.sleep(clamped)
# A small pool of realistic User-Agent strings to rotate between sessions.
USER_AGENTS = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15",
]


def get_session_headers() -> dict:
    """Build one consistent header set for a scraping session.

    Picks a User-Agent at random once per call; the remaining headers match
    what a real browser would send alongside it (mismatched headers are a
    known bot-detection signal).
    """
    chosen_agent = random.choice(USER_AGENTS)
    return {
        "User-Agent": chosen_agent,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "DNT": "1",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }
Signs you're getting close to a block:
- Responses come back but contain no job cards
- You see CAPTCHA challenges
- Requests start timing out
- You get redirected to the login page
When these happen: stop immediately, wait 24 hours, and restart with a fresh IP.
Common Pitfalls
Pitfall 1: Parsing closed job listings
LinkedIn keeps job listings live even after positions close. Always check for "No longer accepting applications" in the listing before including it in live analyses.
Pitfall 2: Duplicate company names
LinkedIn company names aren't normalized. "Meta", "Meta Platforms", "Meta Platforms Inc.", and "Facebook" are all the same company in different listings. Add a normalization step.
Pitfall 3: Location parsing
"Remote" can appear as: "Remote", "Remote, United States", "United States (Remote)", "Anywhere", "Work from Home". Use regex rather than exact matching.
Pitfall 4: Not handling pagination correctly
LinkedIn's `start` parameter looks like ordinary offset-based pagination, but it is hard-capped: after roughly 1,000 results LinkedIn stops returning new listings no matter how far you page. Segment your searches by date range, experience level, or job type to get around this.
Comparison: DIY vs. Service
| Factor | DIY Scraping | Apify/Managed Service |
|---|---|---|
| Setup time | 2-4 days | 30 minutes |
| Maintenance | Ongoing (LinkedIn changes) | Provider's problem |
| Cost at 1K jobs/day | ~$0 (just your time) | ~$5-10/month |
| Cost at 100K jobs/day | Proxy costs + devtime | ~$100-300/month |
| Reliability | You manage | SLA-backed |
| Data quality | Raw HTML parsing | Structured, cleaned |
| LinkedIn blocks | You handle | Provider handles |
For teams processing less than 10K jobs/month, DIY is fine. For production pipelines, the operational overhead of maintaining your own LinkedIn scraper typically exceeds the cost of a managed service within 2-3 months of maintenance incidents.
Putting It All Together
The complete pipeline: scrape job listings → enrich with descriptions → analyze for market intelligence → export to your downstream system:
import json
from pathlib import Path
def run_job_intelligence_pipeline(
    search_terms: list[str],
    locations: list[str],
    output_dir: str = "./job_data",
):
    """Run a complete job intelligence collection pipeline.

    For every (search term, location) pair: scrape up to 200 remote
    listings, tag each job with the query that found it, dump everything
    to JSONL, and write one Markdown report per search term.

    Returns the combined list of raw job dicts.
    """
    out_path = Path(output_dir)
    # parents=True so nested output paths don't crash the run.
    out_path.mkdir(parents=True, exist_ok=True)
    scraper = LinkedInJobScraper(delay=2.5)
    all_jobs: list[dict] = []

    for term in search_terms:
        for location in locations:
            print(f"Scraping: {term} in {location}")
            try:
                jobs = scraper.paginate_jobs(
                    term, location,
                    max_results=200,
                    remote=True,
                )
            except RateLimitError:
                print(" → Rate limited, skipping")
                continue
            # Tag each job with the query that produced it, for later grouping.
            for job in jobs:
                job["search_term"] = term
                job["search_location"] = location
            all_jobs.extend(jobs)
            print(f" → {len(jobs)} jobs found")

    # Save raw data as JSON Lines (one job per line); explicit UTF-8 so the
    # dump doesn't depend on the platform's default encoding.
    output_file = out_path / "jobs_raw.jsonl"
    with open(output_file, "w", encoding="utf-8") as f:
        for job in all_jobs:
            f.write(json.dumps(job) + "\n")

    # Analyze and write one Markdown report per search term
    for term in search_terms:
        term_jobs = [j for j in all_jobs if j.get("search_term") == term]
        if term_jobs:
            analysis = analyze_jobs(term_jobs)
            report = generate_market_report(analysis, term)
            report_file = out_path / f"report_{term.replace(' ', '_')}.md"
            report_file.write_text(report, encoding="utf-8")
            print(f"Report saved to {report_file}")
    return all_jobs
# Run it
collected = run_job_intelligence_pipeline(
    search_terms=["python developer", "data engineer", "ml engineer"],
    locations=["United States", "United Kingdom", "Germany"],
    output_dir="./job_market_data",
)
print(f"Total: {len(collected)} jobs collected across all searches")
LinkedIn job scraping in 2026 is technically viable when done respectfully. The key principles: stay in the publicly-accessible data layer, keep request rates human-like, handle failures gracefully, and use managed services when your volume justifies it. Build smart and you'll have a reliable data pipeline that runs for months.
Top comments (1)
the timeout problem is what pushed me away from self-hosted chromium
snapapi.pics handles this externally — you set a timeout in the request, their infra deals with the rest