Building a Job Salary Intelligence Tool with Python
Salary transparency is increasing, but data is scattered across dozens of platforms. In this tutorial, we will build a Python tool that aggregates salary data from multiple sources to give you real intelligence for negotiations, hiring, or market research.
Architecture Overview
Our tool will:
- Scrape salary data from public job postings
- Normalize titles and compensation ranges
- Store everything in a structured format
- Generate reports with percentiles and trends
Setup
pip install requests pandas matplotlib schedule
Scraping Job Postings with Salary Data
Many job boards now require salary ranges. Let us start by extracting them:
import requests
import re
import pandas as pd
def extract_salary_range(text):
    """Extract an annual salary range from free-form job-posting text.

    Recognizes "$100,000 - $150,000" and shorthand "120k to 160k" /
    "$120k-$160k" forms.  Returns (low, high) as floats in dollars,
    or (None, None) when no range is found.
    """
    # (pattern, multiplier) pairs.  The "k" form is always thousands.
    # Note: "(?:-|to)" is real alternation; the original "[-to]+"
    # character class also matched strings like "oot".
    patterns = [
        (r"\$([\d,]+)\s*(?:-|to)\s*\$([\d,]+)", None),
        (r"\$?([\d,]+)\s*k\s*(?:-|to)\s*\$?([\d,]+)\s*k", 1000.0),
    ]
    for pattern, multiplier in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if not match:
            continue
        low = float(match.group(1).replace(",", ""))
        high = float(match.group(2).replace(",", ""))
        if multiplier is not None:
            # "k" suffix: always thousands, even for values >= 1000.
            low *= multiplier
            high *= multiplier
        elif high < 1000:
            # Heuristic: "$120 - $160" is shorthand for thousands.  Check
            # the HIGH end so a real range like "$900 - $1,500" is left alone.
            low *= 1000
            high *= 1000
        return low, high
    return None, None
def scrape_salary_data(query, location="remote"):
    """Fetch jobs tagged *query* from the RemoteOK API and return a
    DataFrame of postings that publish both ends of a salary range.

    ``location`` is accepted for interface compatibility; RemoteOK only
    lists remote roles, so it is not used in the request — TODO confirm
    before wiring in other sources.

    Returns an (possibly empty) DataFrame with columns: title, company,
    salary_min, salary_max, salary_mid, location, tags, date.
    """
    results = []
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
    url = f"https://remoteok.com/api?tag={query}"
    try:
        # timeout prevents an unresponsive API from hanging the pipeline
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException:
        # Network failure: return an empty frame with the same schema
        # rather than crashing the whole collection run.
        return pd.DataFrame(results)
    if response.status_code == 200:
        jobs = response.json()
        # The first element of the RemoteOK payload is a legal/metadata
        # notice, not a job posting — skip it.
        for job in jobs[1:]:
            salary_min = job.get("salary_min")
            salary_max = job.get("salary_max")
            if salary_min and salary_max:
                low, high = int(salary_min), int(salary_max)
                results.append({
                    "title": job.get("position", ""),
                    "company": job.get("company", ""),
                    "salary_min": low,
                    "salary_max": high,
                    "salary_mid": (low + high) // 2,
                    "location": job.get("location", "Remote"),
                    "tags": ",".join(job.get("tags", [])),
                    "date": job.get("date", ""),
                })
    return pd.DataFrame(results)
# Pull current postings tagged "python"; df may be empty if the API
# returned no salaried listings or the request failed.
df = scrape_salary_data("python")
print(f"Found {len(df)} jobs with salary data")
Normalizing Job Titles
def normalize_title(title):
    """Map a raw job title onto a coarse seniority bucket.

    Expands common abbreviations (sr, jr, dev, eng, mgr, swe) first,
    then returns one of "Senior", "Junior", "Lead/Staff", or "Mid".
    """
    cleaned = title.lower().strip()
    mappings = {
        "sr": "senior", "jr": "junior", "dev": "developer",
        "eng": "engineer", "mgr": "manager", "swe": "software engineer"
    }
    for short, expanded in mappings.items():
        cleaned = re.sub(rf"\b{short}\b", expanded, cleaned)
    # Seniority wins over everything; "lead"/"principal"/"staff" only
    # apply when no explicit senior/junior marker is present.
    if "senior" in cleaned:
        return "Senior"
    if any(word in cleaned for word in ("junior", "entry")):
        return "Junior"
    if any(word in cleaned for word in ("lead", "principal", "staff")):
        return "Lead/Staff"
    return "Mid"
# Derive a coarse seniority bucket for every row from its raw title.
df["level"] = df["title"].apply(normalize_title)
Generating Salary Reports
import matplotlib.pyplot as plt
def salary_report(df):
    """Print summary salary statistics and save a box plot by level.

    Expects a DataFrame with ``salary_mid`` and ``level`` columns (as
    produced by scrape_salary_data + normalize_title).  Writes
    salary_report.png to the working directory.
    """
    print("=== Salary Intelligence Report ===")
    print(f"Total jobs analyzed: {len(df)}")
    if df.empty:
        # Nothing to aggregate or plot; avoid NaN stats and an empty boxplot.
        print("No salary data available.")
        return
    print("\nOverall Statistics:")
    print(f"  Median salary: ${df['salary_mid'].median():,.0f}")
    print(f"  25th percentile: ${df['salary_mid'].quantile(0.25):,.0f}")
    print(f"  75th percentile: ${df['salary_mid'].quantile(0.75):,.0f}")
    print("\nBy Level:")
    for level, group in df.groupby("level"):
        print(f"  {level}: ${group['salary_mid'].median():,.0f} median ({len(group)} jobs)")
    fig, ax = plt.subplots(figsize=(10, 6))
    df.boxplot(column="salary_mid", by="level", ax=ax)
    ax.set_ylabel("Annual Salary (USD)")
    ax.set_title("Salary Distribution by Level")
    # pandas' boxplot(by=...) adds an automatic "Boxplot grouped by level"
    # suptitle over our own title — blank it out.
    plt.suptitle("")
    plt.savefig("salary_report.png", dpi=150, bbox_inches="tight")
    # Close the figure so repeated report runs don't leak memory.
    plt.close(fig)
    print("\nChart saved to salary_report.png")
# Emit the console report and write salary_report.png for the scraped data.
salary_report(df)
Scaling to Multiple Sources
Use ScraperAPI to scrape sites that require JavaScript rendering or block bots:
def fetch_with_scraperapi(url):
    """Fetch *url* through the ScraperAPI proxy with JS rendering enabled.

    Reads the API key from the SCRAPERAPI_KEY environment variable,
    falling back to the "YOUR_KEY" placeholder so the snippet still runs
    as-is.  Returns the requests.Response from the proxy.
    """
    import os  # local import keeps the snippet copy-paste runnable

    params = {
        # Never commit real keys; pull from the environment instead.
        "api_key": os.environ.get("SCRAPERAPI_KEY", "YOUR_KEY"),
        "url": url,
        "render": "true",
    }
    # Rendered pages are slow; a generous timeout still beats hanging forever.
    return requests.get("https://api.scraperapi.com", params=params, timeout=60)
For scraping regional job boards, ThorData residential proxies help you access geo-restricted content. And ScrapeOps lets you monitor your scraper health and success rates across sources.
Automating Daily Collection
import schedule
import time
from datetime import datetime
def daily_collection():
    """Scrape salary data for a fixed set of queries and write the
    combined results to a dated CSV snapshot (salaries_YYYYMMDD.csv).

    Failures on individual queries are reported and skipped so one bad
    request does not lose the whole day's collection.
    """
    queries = ["python", "javascript", "devops", "data-science"]
    frames = []
    # One timestamp for the whole run so rows collected together match.
    collected_at = datetime.now().isoformat()
    for q in queries:
        try:
            df = scrape_salary_data(q)
        except Exception as exc:
            # Best-effort collection: log and move on to the next query.
            print(f"Skipping query {q!r}: {exc}")
            continue
        df["query"] = q
        df["collected_date"] = collected_at
        frames.append(df)
    if not frames:
        # pd.concat([]) raises; report the empty run explicitly instead.
        print("Collected 0 salary records")
        return
    combined = pd.concat(frames, ignore_index=True)
    combined.to_csv(f"salaries_{datetime.now().strftime('%Y%m%d')}.csv", index=False)
    print(f"Collected {len(combined)} salary records")
schedule.every().day.at("09:00").do(daily_collection)
# schedule only REGISTERS the job; without this polling loop the job
# would never actually fire.  Run this script as a long-lived process.
while True:
    schedule.run_pending()
    time.sleep(60)
What You Can Build With This
- Negotiation tool: Know your market rate before your next review
- Hiring dashboard: Track competitor compensation in real-time
- Trend reports: Identify which roles are seeing the fastest salary growth
- Geographic arbitrage finder: Compare remote salaries across cost-of-living zones
Follow for more Python data engineering tutorials!
Top comments (0)