Remote work job boards contain rich data about compensation, skill requirements, and hiring trends. Scraping this data systematically reveals what the market actually pays and demands.
Why Scrape Remote Job Data?
Salary surveys are self-reported and biased. Actual job postings show what companies offer right now. Scraping multiple boards gives you the most accurate picture of remote work compensation and requirements.
Setup
pip install requests beautifulsoup4 pandas
Scraping Remote Job Boards
Here's a scraper framework for remote job boards:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
def scrape_remote_jobs(category="software-dev", pages=5, api_key="YOUR_SCRAPERAPI_KEY"):
    """Scrape We Work Remotely listings for one category via ScraperAPI.

    Args:
        category: WWR category slug, e.g. "software-dev".
        pages: number of listing pages to fetch (1..pages inclusive).
        api_key: ScraperAPI key (previously hard-coded; now injectable).

    Returns:
        list[dict]: one dict per listing with company/title/region/url/source.
        Pages that fail to download are skipped rather than aborting the run.
    """
    jobs = []
    for page in range(1, pages + 1):
        params = {
            "api_key": api_key,
            "url": f"https://weworkremotely.com/categories/{category}?page={page}",
            "render": "false",
        }
        try:
            response = requests.get("https://api.scraperapi.com", params=params, timeout=60)
            response.raise_for_status()  # don't parse error pages as listings
        except requests.RequestException as exc:
            print(f"Error fetching page {page}: {exc}")
            continue
        soup = BeautifulSoup(response.text, "html.parser")
        for listing in soup.select(".feature"):
            company_el = listing.select_one(".company")
            title_el = listing.select_one(".title")
            region_el = listing.select_one(".region")
            link = listing.select_one("a[href*='/remote-jobs/']")
            jobs.append({
                "company": company_el.text.strip() if company_el else "",
                "title": title_el.text.strip() if title_el else "",
                "region": region_el.text.strip() if region_el else "Anywhere",
                "url": f"https://weworkremotely.com{link['href']}" if link else "",
                "source": "WeWorkRemotely",
            })
        time.sleep(1)  # polite delay between pages
    return jobs
def scrape_remoteok(api_key="YOUR_SCRAPERAPI_KEY"):
    """Fetch the RemoteOK public JSON API via ScraperAPI.

    Args:
        api_key: ScraperAPI key (previously hard-coded; now injectable).

    Returns:
        list[dict]: normalized job records, or [] on any network/parse failure
        (best-effort, matching the original contract of returning [] on error).
    """
    params = {
        "api_key": api_key,
        "url": "https://remoteok.com/api",
    }
    try:
        response = requests.get("https://api.scraperapi.com", params=params, timeout=60)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError):
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed; ValueError covers invalid JSON.
        return []
    if not isinstance(data, list):
        return []
    jobs = []
    for item in data[1:]:  # First item is metadata
        jobs.append({
            "company": item.get("company", ""),
            "title": item.get("position", ""),
            "tags": item.get("tags", []),
            "salary_min": item.get("salary_min", 0),
            "salary_max": item.get("salary_max", 0),
            "location": item.get("location", "Remote"),
            "url": item.get("url", ""),
            "date": item.get("date", ""),
            "source": "RemoteOK",
        })
    return jobs
# Pull listings from both boards, then report the haul.
wwr_jobs = scrape_remote_jobs(category="software-dev", pages=5)
rok_jobs = scrape_remoteok()
print(f"Collected {len(wwr_jobs)} + {len(rok_jobs)} jobs")
Extracting Compensation Data
def extract_salary(text):
    """Extract an annual salary range from free-form job-posting text.

    Args:
        text: arbitrary posting text to scan.

    Returns:
        dict with "min", "max", "mid" (floats, annual USD), or None when no
        range is found. Figures under 1000 are assumed to be "k" notation
        and scaled by 1000.
    """
    # Separator between the two figures: "-", "–", or the word "to".
    # The original character class [-–to]+ also matched stray 't'/'o' runs.
    sep = r"\s*(?:-|–|to)\s*"
    patterns = [
        rf"\$([\d,]+)k?{sep}\$?([\d,]+)k?",
        rf"([\d,]+)k?{sep}([\d,]+)k?\s*(?:USD)",  # re.I also covers "usd"
        rf"(?:salary|comp|pay)[:\s]*\$?([\d,]+)k?{sep}\$?([\d,]+)k?",
    ]
    for pattern in patterns:
        match = re.search(pattern, text, re.I)
        if match:
            low = float(match.group(1).replace(",", ""))
            high = float(match.group(2).replace(",", ""))
            # Normalize to annual (handle "k" notation)
            if low < 1000:
                low *= 1000
            if high < 1000:
                high *= 1000
            return {"min": low, "max": high, "mid": (low + high) / 2}
    return None
def enrich_with_details(jobs, scraper_api_key, limit=50, delay=1.0):
    """Fetch each job's detail page and add salary/skill/experience fields.

    Args:
        jobs: list of job dicts; each needs a non-empty "url" to be processed.
        scraper_api_key: ScraperAPI key used for every detail-page fetch.
        limit: max jobs to process (was a hard-coded 50; kept as the default).
        delay: seconds to sleep between requests for rate limiting.

    Returns:
        list[dict]: the successfully enriched jobs (input dicts are mutated
        in place, matching the original behavior).
    """
    enriched = []
    for job in jobs[:limit]:
        if not job.get("url"):
            continue
        params = {
            "api_key": scraper_api_key,
            "url": job["url"],
            "render": "true",
        }
        try:
            response = requests.get("https://api.scraperapi.com", params=params, timeout=60)
            soup = BeautifulSoup(response.text, "html.parser")
            text = soup.get_text()
            salary = extract_salary(text)
            if salary:
                job["salary_min"] = salary["min"]
                job["salary_max"] = salary["max"]
                job["salary_mid"] = salary["mid"]
            # Extract requirements
            job["requires_python"] = bool(re.search(r"\bpython\b", text, re.I))
            job["requires_js"] = bool(re.search(r"\b(javascript|typescript|react|node)\b", text, re.I))
            job["requires_cloud"] = bool(re.search(r"\b(aws|gcp|azure|cloud)\b", text, re.I))
            job["years_exp"] = extract_experience(text)
            enriched.append(job)
        except Exception as e:
            print(f"Error: {e}")
        finally:
            # Sleep even after a failed request; the original skipped the
            # delay on errors, which defeats rate limiting exactly when the
            # target is already unhappy.
            time.sleep(delay)
    return enriched
def extract_experience(text):
    """Return the first "<N>+ years experience" figure in *text*, else None."""
    pattern = r"(\d+)\+?\s*(?:years?|yrs?)\s*(?:of)?\s*(?:experience|exp)"
    found = re.search(pattern, text, re.I)
    if found is None:
        return None
    return int(found.group(1))
Compensation Analysis
def analyze_compensation(df):
    """Print a compensation summary for jobs carrying salary data.

    Args:
        df: DataFrame with at least "title"; salary stats use "salary_mid"
            and the technology premiums use requires_* flag columns when
            present (rows without a flag count as False).

    Prints to stdout; returns None.
    """
    # Guard: "salary_mid" only exists after enrichment found at least one
    # salary; the original raised KeyError on a DataFrame without it.
    if "salary_mid" not in df.columns:
        print("No salary data found")
        return
    salary_df = df[df["salary_mid"] > 0].copy()
    if salary_df.empty:
        print("No salary data found")
        return
    print(f"\nRemote Work Compensation Analysis ({len(salary_df)} jobs with salary)")
    print("=" * 55)
    print(f"\nOverall Range:")
    print(f"  Median: ${salary_df['salary_mid'].median():,.0f}")
    print(f"  Mean: ${salary_df['salary_mid'].mean():,.0f}")
    print(f"  25th percentile: ${salary_df['salary_mid'].quantile(0.25):,.0f}")
    print(f"  75th percentile: ${salary_df['salary_mid'].quantile(0.75):,.0f}")

    # By seniority (from title)
    def classify_seniority(title):
        title = title.lower()
        if any(w in title for w in ["senior", "sr.", "lead", "principal", "staff"]):
            return "Senior"
        elif any(w in title for w in ["junior", "jr.", "entry", "associate"]):
            return "Junior"
        else:
            return "Mid"

    salary_df["seniority"] = salary_df["title"].apply(classify_seniority)
    print("\nBy Seniority:")
    for level in ["Junior", "Mid", "Senior"]:
        subset = salary_df[salary_df["seniority"] == level]
        if not subset.empty:
            print(f"  {level:8s}: ${subset['salary_mid'].median():>10,.0f} median "
                  f"({len(subset)} jobs)")

    # Technology premiums
    print("\nTechnology Salary Premiums:")
    for tech, col in [("Python", "requires_python"),
                      ("JavaScript", "requires_js"),
                      ("Cloud", "requires_cloud")]:
        if col in salary_df.columns:
            # Rows from sources that were never enriched have NaN flags;
            # boolean indexing on NaN raises, so coerce them to False.
            mask = salary_df[col].fillna(False).astype(bool)
            with_tech = salary_df[mask]["salary_mid"].median()
            without = salary_df[~mask]["salary_mid"].median()
            # median() of an empty subset is NaN, which is truthy — the
            # original `if with_tech and without` printed nan premiums.
            if pd.notna(with_tech) and pd.notna(without) and without:
                premium = ((with_tech - without) / without) * 100
                print(f"  {tech}: ${with_tech:,.0f} vs ${without:,.0f} ({premium:+.1f}%)")
# Merge both sources into a single DataFrame and run the salary analysis.
all_jobs = [*wwr_jobs, *rok_jobs]
df = pd.DataFrame(all_jobs)
analyze_compensation(df)
Skills Demand Analysis
def skills_demand(df):
    """Print how often each technology keyword appears in job titles.

    Args:
        df: DataFrame with a "title" column; other columns are ignored.

    Prints a ranked bar chart to stdout; returns None.
    """
    # Guard: the original divided by len(df) and crashed on an empty frame.
    if "title" not in df.columns or df.empty:
        print("No job data to analyze")
        return
    tech_keywords = {
        "Python": r"\bpython\b",
        "JavaScript": r"\bjavascript\b",
        "TypeScript": r"\btypescript\b",
        "React": r"\breact\b",
        "Node.js": r"\bnode\.?js\b",
        "Go": r"\bgolang\b|\bgo\b",
        "Rust": r"\brust\b",
        "AWS": r"\baws\b",
        "Docker": r"\bdocker\b",
        "Kubernetes": r"\bkubernetes\b|\bk8s\b",
        # Non-capturing group: a capturing group here makes pandas'
        # str.contains emit a "match groups" UserWarning.
        "PostgreSQL": r"\bpostgres(?:ql)?\b",
    }
    counts = {}
    for tech, pattern in tech_keywords.items():
        mask = df["title"].str.contains(pattern, case=False, na=False, regex=True)
        counts[tech] = int(mask.sum())
    sorted_techs = sorted(counts.items(), key=lambda x: x[1], reverse=True)
    print("\nMost In-Demand Skills (by job title mentions):")
    for tech, count in sorted_techs[:15]:
        pct = count / len(df) * 100
        bar = "█" * int(pct)
        print(f"  {tech:15s} {count:4d} ({pct:5.1f}%) {bar}")
skills_demand(df)
Scaling Your Job Scraping
- ScraperAPI handles anti-bot measures on major job boards
- ThorData provides residential proxies for accessing region-locked listings
- ScrapeOps tracks scraper performance and uptime
Conclusion
Remote job data reveals what the market actually pays — not what surveys claim. Build your scraper, collect data weekly, and you'll have a compensation database that helps you negotiate better, hire smarter, and spot emerging skill trends before they hit mainstream salary reports.
Top comments (0)