LinkedIn is the largest professional network with rich company and employee data. While LinkedIn's terms restrict unauthorized scraping, there are legitimate ways to extract public business data for research, lead generation, and market analysis. This guide covers both API-based and scraping approaches.
Legal Disclaimer
Always respect LinkedIn's Terms of Service. Use their official API where possible. The techniques shown here are for educational purposes. For production use, consider LinkedIn's official data products or authorized third-party providers.
Method 1: LinkedIn API (Official)
LinkedIn offers APIs for authorized applications:
import requests
class LinkedInAPI:
    """Thin wrapper around the LinkedIn v2 REST API.

    Requires an OAuth 2.0 access token issued to an authorized
    LinkedIn application.
    """

    BASE_URL = "https://api.linkedin.com/v2"

    def __init__(self, access_token, timeout=10):
        """
        Args:
            access_token: OAuth 2.0 bearer token for an authorized app.
            timeout: Per-request timeout in seconds; prevents a dropped
                connection from hanging the caller forever.
        """
        self.timeout = timeout
        self.headers = {
            "Authorization": f"Bearer {access_token}",
            "Content-Type": "application/json"
        }

    def get_company(self, company_id):
        """Fetch one organization record by its numeric LinkedIn ID.

        Returns:
            Decoded JSON response as a dict.
        Raises:
            requests.HTTPError: on a non-2xx response (bad token,
                missing permission, unknown ID).
        """
        url = f"{self.BASE_URL}/organizations/{company_id}"
        params = {
            # Restrict the response to the fields this guide uses.
            "projection": "(id,name,description,websiteUrl,industries,staffCount,headquarter)"
        }
        response = requests.get(url, headers=self.headers, params=params,
                                timeout=self.timeout)
        # Surface auth/permission failures instead of silently returning
        # an error payload that looks like company data.
        response.raise_for_status()
        return response.json()

    def search_companies(self, keyword, start=0, count=10):
        """Search organizations by keyword with offset pagination.

        Args:
            keyword: Free-text search terms.
            start: Zero-based result offset.
            count: Page size.
        Returns:
            Decoded JSON response as a dict.
        Raises:
            requests.HTTPError: on a non-2xx response.
        """
        url = f"{self.BASE_URL}/search/companies"
        params = {
            "keywords": keyword,
            "start": start,
            "count": count
        }
        response = requests.get(url, headers=self.headers, params=params,
                                timeout=self.timeout)
        response.raise_for_status()
        return response.json()
# Demo usage: replace YOUR_TOKEN with a real OAuth access token.
client = LinkedInAPI(access_token="YOUR_TOKEN")
company = client.get_company("1441")
print(f"Company: {company.get('name')}")
print(f"Staff: {company.get('staffCount')}")
Method 2: Public Profile Scraping
Public LinkedIn pages are accessible without authentication. Routing requests through a proxy service helps you avoid aggressive rate limiting:
import json
from urllib.parse import urlencode

import requests
from bs4 import BeautifulSoup
def scrape_company_page(company_slug, api_key):
    """Scrape public company-page metadata through a rendering proxy.

    Fetches https://www.linkedin.com/company/<slug>/ via ScraperAPI
    (JavaScript rendering enabled) and extracts fields from the page's
    JSON-LD ``Organization`` block, which is far more stable than
    scraping LinkedIn's hashed CSS class names.

    Args:
        company_slug: Slug from the public company URL (e.g. "google").
        api_key: ScraperAPI key.

    Returns:
        dict with name/description/url/employee_count/industry/founded,
        or an empty dict when no Organization JSON-LD is found.

    Raises:
        requests.HTTPError: if the proxy returns a non-2xx response.
    """
    target_url = f"https://www.linkedin.com/company/{company_slug}/"
    # Pass the target URL as a real query parameter so requests
    # percent-encodes it. Interpolating it raw into the query string
    # (as f"...&url={url}&render=true") leaves its "?", "&" and "/"
    # unescaped and corrupts the proxy request.
    response = requests.get(
        "http://api.scraperapi.com",
        params={"api_key": api_key, "url": target_url, "render": "true"},
        timeout=90,  # rendered pages can take a while
    )
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    company_data = {}
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string)
        except (json.JSONDecodeError, TypeError):
            # TypeError covers empty <script> tags (script.string is None).
            continue
        if not isinstance(data, dict) or data.get("@type") != "Organization":
            continue
        # JSON-LD allows numberOfEmployees as either a bare number or a
        # QuantitativeValue object ({"value": N}); handle both.
        employees = data.get("numberOfEmployees")
        if isinstance(employees, dict):
            employees = employees.get("value")
        company_data = {
            "name": data.get("name"),
            "description": data.get("description"),
            "url": data.get("url"),
            "employee_count": employees,
            "industry": data.get("industry"),
            "founded": data.get("foundingDate")
        }
    return company_data
# Example: pull the public metadata for LinkedIn's "google" company page.
company = scrape_company_page("google", "YOUR_API_KEY")
pretty = json.dumps(company, indent=2)
print(pretty)
Method 3: Google Dorking for LinkedIn Data
Google indexes many LinkedIn profiles. You can search Google for LinkedIn pages:
def search_linkedin_via_google(query, api_key):
    """Find LinkedIn company pages via a site-restricted Google search.

    Builds a ``site:linkedin.com/company <query>`` search, fetches the
    results page through ScraperAPI, and returns the LinkedIn hits.

    Args:
        query: Free-text search terms.
        api_key: ScraperAPI key.

    Returns:
        list of {"title": ..., "url": ...} dicts (possibly empty).

    Raises:
        requests.HTTPError: if the proxy returns a non-2xx response.
    """
    search_query = f"site:linkedin.com/company {query}"
    # Encode the Google URL ourselves, then hand the whole thing to the
    # proxy as a query parameter. Splicing it in raw would leave spaces
    # unescaped and let Google's "&num=10" be parsed as a ScraperAPI
    # parameter instead of part of the target URL.
    google_url = "https://www.google.com/search?" + urlencode(
        {"q": search_query, "num": 10}
    )
    response = requests.get(
        "http://api.scraperapi.com",
        params={"api_key": api_key, "url": google_url},
        timeout=60,
    )
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    results = []
    # Organic results live in div.g containers: one link plus an h3 title.
    for item in soup.find_all("div", class_="g"):
        link = item.find("a")
        title = item.find("h3")
        if link and title and "linkedin.com" in link.get("href", ""):
            results.append({
                "title": title.get_text(strip=True),
                "url": link["href"]
            })
    return results
# Example: find AI startups in San Francisco via Google's LinkedIn index.
hits = search_linkedin_via_google("AI startup San Francisco", "YOUR_KEY")
for hit in hits:
    print(f"{hit['title']} - {hit['url']}")
Building a Company Research Tool
import time
import pandas as pd
class CompanyResearcher:
    """Batch-scrapes LinkedIn company pages and exports them to CSV."""

    def __init__(self, api_key, delay=3):
        """
        Args:
            api_key: ScraperAPI key passed to scrape_company_page().
            delay: Seconds to wait between requests (rate limiting).
        """
        self.api_key = api_key
        self.delay = delay
        self.companies = []  # accumulated per-company result dicts

    def research_batch(self, company_slugs):
        """Scrape each slug in turn, collecting results in self.companies.

        Failures are reported and skipped so one bad slug does not abort
        the whole batch.

        Args:
            company_slugs: Iterable of company URL slugs.

        Returns:
            The accumulated list of company dicts.
        """
        for slug in company_slugs:
            try:
                data = scrape_company_page(slug, self.api_key)
                data["slug"] = slug
                self.companies.append(data)
                print(f"Scraped: {data.get('name', slug)}")
            except Exception as e:
                # Best-effort batch: report and move on.
                print(f"Error scraping {slug}: {e}")
            finally:
                # Throttle after failures too, not just successes —
                # otherwise a run of errors hammers the proxy/LinkedIn.
                time.sleep(self.delay)
        return self.companies

    def export(self, filename="companies.csv"):
        """Write the collected companies to CSV and return the DataFrame."""
        df = pd.DataFrame(self.companies)
        df.to_csv(filename, index=False)
        return df
# Demo: scrape five well-known companies and print a summary table.
tool = CompanyResearcher(api_key="YOUR_KEY")
targets = ["google", "microsoft", "apple", "meta", "amazon"]
tool.research_batch(targets)
df = tool.export()
print(df[["name", "employee_count", "industry"]])
Employee Data from Public Posts
def scrape_company_posts(company_slug, api_key):
    """Scrape up to 10 recent public posts from a company's feed page.

    Args:
        company_slug: Slug from the public company URL.
        api_key: ScraperAPI key.

    Returns:
        list of {"author": ..., "content": ...} dicts; content is
        truncated to 200 characters, author falls back to "Company".

    Raises:
        requests.HTTPError: if the proxy returns a non-2xx response.
    """
    target_url = f"https://www.linkedin.com/company/{company_slug}/posts/"
    # Send the target as a real query parameter so it gets percent-encoded
    # instead of being spliced raw (unescaped) into the proxy URL.
    response = requests.get(
        "http://api.scraperapi.com",
        params={"api_key": api_key, "url": target_url, "render": "true"},
        timeout=90,
    )
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    posts = []
    # LinkedIn hashes/rotates its CSS class names, so match on stable
    # substrings rather than exact class values. bool(c) guards against
    # elements with no class attribute.
    post_elements = soup.find_all(
        "div", class_=lambda c: bool(c) and "feed-shared" in c
    )
    for post in post_elements[:10]:
        author = post.find(
            "span", class_=lambda c: bool(c) and "actor" in c.lower()
        )
        content = post.find(
            "div", class_=lambda c: bool(c) and "commentary" in c.lower()
        )
        posts.append({
            "author": author.get_text(strip=True) if author else "Company",
            "content": content.get_text(strip=True)[:200] if content else ""
        })
    return posts
Proxy Strategy for LinkedIn
LinkedIn has some of the most aggressive anti-bot measures online. Here's what works:
- **ScraperAPI with `render=true`** — handles JavaScript rendering and CAPTCHA challenges
- **Residential proxies (e.g. ThorData)** — important for LinkedIn, since datacenter IPs tend to be blocked almost immediately
- **Rate limiting** — never exceed one request per five seconds per profile
- **Session rotation** — use fresh sessions for each batch
Monitoring Your Scrapers
Use ScrapeOps to monitor success rates. LinkedIn scraping typically has lower success rates than other sites, so tracking helps you optimize your approach.
Ethical Guidelines
- Only collect publicly available data
- Don't scrape private profiles or content behind logins
- Respect rate limits and robots.txt
- Don't use data for spam or unsolicited outreach
- Consider using LinkedIn's official API or Sales Navigator for business use
Conclusion
LinkedIn data is valuable for business research, competitive analysis, and market intelligence. Start with the official API, supplement with public page scraping, and always prioritize ethical data collection. The combination of official APIs and careful scraping gives you comprehensive professional data while staying on the right side of platform policies.
Happy scraping!
Top comments (0)