Remote work job boards contain rich data about compensation, skill requirements, and hiring trends. Scraping this data systematically reveals what the market actually pays and demands.
Why Scrape Remote Job Data?
Salary surveys are self-reported and biased. Actual job postings show what companies offer right now. Scraping multiple boards gives you the most accurate picture of remote work compensation and requirements.
Setup
pip install requests beautifulsoup4 pandas
Scraping Remote Job Boards
Here's a scraper framework for remote job boards:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
def scrape_remote_jobs(category="software-dev", pages=5, api_key="YOUR_SCRAPERAPI_KEY"):
    """Scrape We Work Remotely listings for one category via ScraperAPI.

    Args:
        category: WWR category slug, e.g. "software-dev".
        pages: number of listing pages to fetch (1..pages inclusive).
        api_key: ScraperAPI key (previously hard-coded; now injectable).

    Returns:
        list[dict]: one dict per listing with company/title/region/url/source.
        Pages that fail to download are skipped rather than aborting the run.
    """
    jobs = []
    for page in range(1, pages + 1):
        params = {
            "api_key": api_key,
            "url": f"https://weworkremotely.com/categories/{category}?page={page}",
            "render": "false",
        }
        try:
            response = requests.get("https://api.scraperapi.com", params=params, timeout=60)
            response.raise_for_status()  # don't parse error pages as listings
        except requests.RequestException as exc:
            print(f"Error fetching page {page}: {exc}")
            continue
        soup = BeautifulSoup(response.text, "html.parser")
        for listing in soup.select(".feature"):
            company_el = listing.select_one(".company")
            title_el = listing.select_one(".title")
            region_el = listing.select_one(".region")
            link = listing.select_one("a[href*='/remote-jobs/']")
            jobs.append({
                "company": company_el.text.strip() if company_el else "",
                "title": title_el.text.strip() if title_el else "",
                "region": region_el.text.strip() if region_el else "Anywhere",
                "url": f"https://weworkremotely.com{link['href']}" if link else "",
                "source": "WeWorkRemotely",
            })
        time.sleep(1)  # polite delay between pages
    return jobs
def scrape_remoteok(api_key="YOUR_SCRAPERAPI_KEY"):
    """Fetch the RemoteOK public JSON API via ScraperAPI.

    Args:
        api_key: ScraperAPI key (previously hard-coded; now injectable).

    Returns:
        list[dict]: normalized job records, or [] on any network/parse failure
        (best-effort, matching the original contract of returning [] on error).
    """
    params = {
        "api_key": api_key,
        "url": "https://remoteok.com/api",
    }
    try:
        response = requests.get("https://api.scraperapi.com", params=params, timeout=60)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError):
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed; ValueError covers invalid JSON.
        return []
    if not isinstance(data, list):
        return []
    jobs = []
    for item in data[1:]:  # First item is metadata
        jobs.append({
            "company": item.get("company", ""),
            "title": item.get("position", ""),
            "tags": item.get("tags", []),
            "salary_min": item.get("salary_min", 0),
            "salary_max": item.get("salary_max", 0),
            "location": item.get("location", "Remote"),
            "url": item.get("url", ""),
            "date": item.get("date", ""),
            "source": "RemoteOK",
        })
    return jobs
# Pull listings from both boards, then report the haul.
wwr_jobs = scrape_remote_jobs(category="software-dev", pages=5)
rok_jobs = scrape_remoteok()
print(f"Collected {len(wwr_jobs)} + {len(rok_jobs)} jobs")
Extracting Compensation Data
def extract_salary(text):
    """Extract an annual salary range from free-form job-posting text.

    Args:
        text: arbitrary posting text to scan.

    Returns:
        dict with "min", "max", "mid" (floats, annual USD), or None when no
        range is found. Figures under 1000 are assumed to be "k" notation
        and scaled by 1000.
    """
    # Separator between the two figures: "-", "–", or the word "to".
    # The original character class [-–to]+ also matched stray 't'/'o' runs.
    sep = r"\s*(?:-|–|to)\s*"
    patterns = [
        rf"\$([\d,]+)k?{sep}\$?([\d,]+)k?",
        rf"([\d,]+)k?{sep}([\d,]+)k?\s*(?:USD)",  # re.I also covers "usd"
        rf"(?:salary|comp|pay)[:\s]*\$?([\d,]+)k?{sep}\$?([\d,]+)k?",
    ]
    for pattern in patterns:
        match = re.search(pattern, text, re.I)
        if match:
            low = float(match.group(1).replace(",", ""))
            high = float(match.group(2).replace(",", ""))
            # Normalize to annual (handle "k" notation)
            if low < 1000:
                low *= 1000
            if high < 1000:
                high *= 1000
            return {"min": low, "max": high, "mid": (low + high) / 2}
    return None
def enrich_with_details(jobs, scraper_api_key, limit=50, delay=1.0):
    """Fetch each job's detail page and add salary/skill/experience fields.

    Args:
        jobs: list of job dicts; each needs a non-empty "url" to be processed.
        scraper_api_key: ScraperAPI key used for every detail-page fetch.
        limit: max jobs to process (was a hard-coded 50; kept as the default).
        delay: seconds to sleep between requests for rate limiting.

    Returns:
        list[dict]: the successfully enriched jobs (input dicts are mutated
        in place, matching the original behavior).
    """
    enriched = []
    for job in jobs[:limit]:
        if not job.get("url"):
            continue
        params = {
            "api_key": scraper_api_key,
            "url": job["url"],
            "render": "true",
        }
        try:
            response = requests.get("https://api.scraperapi.com", params=params, timeout=60)
            soup = BeautifulSoup(response.text, "html.parser")
            text = soup.get_text()
            salary = extract_salary(text)
            if salary:
                job["salary_min"] = salary["min"]
                job["salary_max"] = salary["max"]
                job["salary_mid"] = salary["mid"]
            # Extract requirements
            job["requires_python"] = bool(re.search(r"\bpython\b", text, re.I))
            job["requires_js"] = bool(re.search(r"\b(javascript|typescript|react|node)\b", text, re.I))
            job["requires_cloud"] = bool(re.search(r"\b(aws|gcp|azure|cloud)\b", text, re.I))
            job["years_exp"] = extract_experience(text)
            enriched.append(job)
        except Exception as e:
            print(f"Error: {e}")
        finally:
            # Sleep even after a failed request; the original skipped the
            # delay on errors, which defeats rate limiting exactly when the
            # target is already unhappy.
            time.sleep(delay)
    return enriched
def extract_experience(text):
    """Return the first "<N>+ years experience" figure in *text*, else None."""
    pattern = r"(\d+)\+?\s*(?:years?|yrs?)\s*(?:of)?\s*(?:experience|exp)"
    found = re.search(pattern, text, re.I)
    if found is None:
        return None
    return int(found.group(1))
Compensation Analysis
def analyze_compensation(df):
    """Print a compensation summary for jobs carrying salary data.

    Args:
        df: DataFrame with at least "title"; salary stats use "salary_mid"
            and the technology premiums use requires_* flag columns when
            present (rows without a flag count as False).

    Prints to stdout; returns None.
    """
    # Guard: "salary_mid" only exists after enrichment found at least one
    # salary; the original raised KeyError on a DataFrame without it.
    if "salary_mid" not in df.columns:
        print("No salary data found")
        return
    salary_df = df[df["salary_mid"] > 0].copy()
    if salary_df.empty:
        print("No salary data found")
        return
    print(f"\nRemote Work Compensation Analysis ({len(salary_df)} jobs with salary)")
    print("=" * 55)
    print(f"\nOverall Range:")
    print(f"  Median: ${salary_df['salary_mid'].median():,.0f}")
    print(f"  Mean: ${salary_df['salary_mid'].mean():,.0f}")
    print(f"  25th percentile: ${salary_df['salary_mid'].quantile(0.25):,.0f}")
    print(f"  75th percentile: ${salary_df['salary_mid'].quantile(0.75):,.0f}")

    # By seniority (from title)
    def classify_seniority(title):
        title = title.lower()
        if any(w in title for w in ["senior", "sr.", "lead", "principal", "staff"]):
            return "Senior"
        elif any(w in title for w in ["junior", "jr.", "entry", "associate"]):
            return "Junior"
        else:
            return "Mid"

    salary_df["seniority"] = salary_df["title"].apply(classify_seniority)
    print("\nBy Seniority:")
    for level in ["Junior", "Mid", "Senior"]:
        subset = salary_df[salary_df["seniority"] == level]
        if not subset.empty:
            print(f"  {level:8s}: ${subset['salary_mid'].median():>10,.0f} median "
                  f"({len(subset)} jobs)")

    # Technology premiums
    print("\nTechnology Salary Premiums:")
    for tech, col in [("Python", "requires_python"),
                      ("JavaScript", "requires_js"),
                      ("Cloud", "requires_cloud")]:
        if col in salary_df.columns:
            # Rows from sources that were never enriched have NaN flags;
            # boolean indexing on NaN raises, so coerce them to False.
            mask = salary_df[col].fillna(False).astype(bool)
            with_tech = salary_df[mask]["salary_mid"].median()
            without = salary_df[~mask]["salary_mid"].median()
            # median() of an empty subset is NaN, which is truthy — the
            # original `if with_tech and without` printed nan premiums.
            if pd.notna(with_tech) and pd.notna(without) and without:
                premium = ((with_tech - without) / without) * 100
                print(f"  {tech}: ${with_tech:,.0f} vs ${without:,.0f} ({premium:+.1f}%)")
# Merge both sources into a single DataFrame and run the salary analysis.
all_jobs = [*wwr_jobs, *rok_jobs]
df = pd.DataFrame(all_jobs)
analyze_compensation(df)
Skills Demand Analysis
def skills_demand(df):
    """Print how often each technology keyword appears in job titles.

    Args:
        df: DataFrame with a "title" column; other columns are ignored.

    Prints a ranked bar chart to stdout; returns None.
    """
    # Guard: the original divided by len(df) and crashed on an empty frame.
    if "title" not in df.columns or df.empty:
        print("No job data to analyze")
        return
    tech_keywords = {
        "Python": r"\bpython\b",
        "JavaScript": r"\bjavascript\b",
        "TypeScript": r"\btypescript\b",
        "React": r"\breact\b",
        "Node.js": r"\bnode\.?js\b",
        "Go": r"\bgolang\b|\bgo\b",
        "Rust": r"\brust\b",
        "AWS": r"\baws\b",
        "Docker": r"\bdocker\b",
        "Kubernetes": r"\bkubernetes\b|\bk8s\b",
        # Non-capturing group: a capturing group here makes pandas'
        # str.contains emit a "match groups" UserWarning.
        "PostgreSQL": r"\bpostgres(?:ql)?\b",
    }
    counts = {}
    for tech, pattern in tech_keywords.items():
        mask = df["title"].str.contains(pattern, case=False, na=False, regex=True)
        counts[tech] = int(mask.sum())
    sorted_techs = sorted(counts.items(), key=lambda x: x[1], reverse=True)
    print("\nMost In-Demand Skills (by job title mentions):")
    for tech, count in sorted_techs[:15]:
        pct = count / len(df) * 100
        bar = "█" * int(pct)
        print(f"  {tech:15s} {count:4d} ({pct:5.1f}%) {bar}")
skills_demand(df)
Scaling Your Job Scraping
- ScraperAPI handles anti-bot measures on major job boards
- ThorData provides residential proxies for accessing region-locked listings
- ScrapeOps tracks scraper performance and uptime
Conclusion
Remote job data reveals what the market actually pays — not what surveys claim. Build your scraper, collect data weekly, and you'll have a compensation database that helps you negotiate better, hire smarter, and spot emerging skill trends before they hit mainstream salary reports.
Top comments (0)