Building a Job Salary Intelligence Tool with Python
Salary transparency is increasing, but data is scattered across dozens of platforms. In this tutorial, we will build a Python tool that aggregates salary data from multiple sources to give you real intelligence for negotiations, hiring, or market research.
Architecture Overview
Our tool will:
- Scrape salary data from public job postings
- Normalize titles and compensation ranges
- Store everything in a structured format
- Generate reports with percentiles and trends
Setup
pip install requests pandas matplotlib schedule
Scraping Job Postings with Salary Data
Many job boards now require salary ranges. Let us start by extracting them:
import requests
import re
import pandas as pd
def extract_salary_range(text):
    """Extract an annual salary range from free-form job-posting text.

    Recognizes "$100,000 - $150,000" and shorthand "120k to 160k" /
    "$120k-$160k" forms.  Returns (low, high) as floats in dollars,
    or (None, None) when no range is found.
    """
    # (pattern, multiplier) pairs.  The "k" form is always thousands.
    # Note: "(?:-|to)" is real alternation; the original "[-to]+"
    # character class also matched strings like "oot".
    patterns = [
        (r"\$([\d,]+)\s*(?:-|to)\s*\$([\d,]+)", None),
        (r"\$?([\d,]+)\s*k\s*(?:-|to)\s*\$?([\d,]+)\s*k", 1000.0),
    ]
    for pattern, multiplier in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if not match:
            continue
        low = float(match.group(1).replace(",", ""))
        high = float(match.group(2).replace(",", ""))
        if multiplier is not None:
            # "k" suffix: always thousands, even for values >= 1000.
            low *= multiplier
            high *= multiplier
        elif high < 1000:
            # Heuristic: "$120 - $160" is shorthand for thousands.  Check
            # the HIGH end so a real range like "$900 - $1,500" is left alone.
            low *= 1000
            high *= 1000
        return low, high
    return None, None
def scrape_salary_data(query, location="remote"):
    """Fetch jobs tagged *query* from the RemoteOK API and return a
    DataFrame of postings that publish both ends of a salary range.

    ``location`` is accepted for interface compatibility; RemoteOK only
    lists remote roles, so it is not used in the request — TODO confirm
    before wiring in other sources.

    Returns an (possibly empty) DataFrame with columns: title, company,
    salary_min, salary_max, salary_mid, location, tags, date.
    """
    results = []
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
    url = f"https://remoteok.com/api?tag={query}"
    try:
        # timeout prevents an unresponsive API from hanging the pipeline
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException:
        # Network failure: return an empty frame with the same schema
        # rather than crashing the whole collection run.
        return pd.DataFrame(results)
    if response.status_code == 200:
        jobs = response.json()
        # The first element of the RemoteOK payload is a legal/metadata
        # notice, not a job posting — skip it.
        for job in jobs[1:]:
            salary_min = job.get("salary_min")
            salary_max = job.get("salary_max")
            if salary_min and salary_max:
                low, high = int(salary_min), int(salary_max)
                results.append({
                    "title": job.get("position", ""),
                    "company": job.get("company", ""),
                    "salary_min": low,
                    "salary_max": high,
                    "salary_mid": (low + high) // 2,
                    "location": job.get("location", "Remote"),
                    "tags": ",".join(job.get("tags", [])),
                    "date": job.get("date", ""),
                })
    return pd.DataFrame(results)
# Pull current postings tagged "python"; df may be empty if the API
# returned no salaried listings or the request failed.
df = scrape_salary_data("python")
print(f"Found {len(df)} jobs with salary data")
Normalizing Job Titles
def normalize_title(title):
    """Map a raw job title onto a coarse seniority bucket.

    Expands common abbreviations (sr, jr, dev, eng, mgr, swe) first,
    then returns one of "Senior", "Junior", "Lead/Staff", or "Mid".
    """
    cleaned = title.lower().strip()
    mappings = {
        "sr": "senior", "jr": "junior", "dev": "developer",
        "eng": "engineer", "mgr": "manager", "swe": "software engineer"
    }
    for short, expanded in mappings.items():
        cleaned = re.sub(rf"\b{short}\b", expanded, cleaned)
    # Seniority wins over everything; "lead"/"principal"/"staff" only
    # apply when no explicit senior/junior marker is present.
    if "senior" in cleaned:
        return "Senior"
    if any(word in cleaned for word in ("junior", "entry")):
        return "Junior"
    if any(word in cleaned for word in ("lead", "principal", "staff")):
        return "Lead/Staff"
    return "Mid"
# Derive a coarse seniority bucket for every row from its raw title.
df["level"] = df["title"].apply(normalize_title)
Generating Salary Reports
import matplotlib.pyplot as plt
def salary_report(df):
    """Print summary salary statistics and save a box plot by level.

    Expects a DataFrame with ``salary_mid`` and ``level`` columns (as
    produced by scrape_salary_data + normalize_title).  Writes
    salary_report.png to the working directory.
    """
    print("=== Salary Intelligence Report ===")
    print(f"Total jobs analyzed: {len(df)}")
    if df.empty:
        # Nothing to aggregate or plot; avoid NaN stats and an empty boxplot.
        print("No salary data available.")
        return
    print("\nOverall Statistics:")
    print(f"  Median salary: ${df['salary_mid'].median():,.0f}")
    print(f"  25th percentile: ${df['salary_mid'].quantile(0.25):,.0f}")
    print(f"  75th percentile: ${df['salary_mid'].quantile(0.75):,.0f}")
    print("\nBy Level:")
    for level, group in df.groupby("level"):
        print(f"  {level}: ${group['salary_mid'].median():,.0f} median ({len(group)} jobs)")
    fig, ax = plt.subplots(figsize=(10, 6))
    df.boxplot(column="salary_mid", by="level", ax=ax)
    ax.set_ylabel("Annual Salary (USD)")
    ax.set_title("Salary Distribution by Level")
    # pandas' boxplot(by=...) adds an automatic "Boxplot grouped by level"
    # suptitle over our own title — blank it out.
    plt.suptitle("")
    plt.savefig("salary_report.png", dpi=150, bbox_inches="tight")
    # Close the figure so repeated report runs don't leak memory.
    plt.close(fig)
    print("\nChart saved to salary_report.png")
# Emit the console report and write salary_report.png for the scraped data.
salary_report(df)
Scaling to Multiple Sources
Use ScraperAPI to scrape sites that require JavaScript rendering or block bots:
def fetch_with_scraperapi(url):
    """Fetch *url* through the ScraperAPI proxy with JS rendering enabled.

    Reads the API key from the SCRAPERAPI_KEY environment variable,
    falling back to the "YOUR_KEY" placeholder so the snippet still runs
    as-is.  Returns the requests.Response from the proxy.
    """
    import os  # local import keeps the snippet copy-paste runnable

    params = {
        # Never commit real keys; pull from the environment instead.
        "api_key": os.environ.get("SCRAPERAPI_KEY", "YOUR_KEY"),
        "url": url,
        "render": "true",
    }
    # Rendered pages are slow; a generous timeout still beats hanging forever.
    return requests.get("https://api.scraperapi.com", params=params, timeout=60)
For scraping regional job boards, ThorData residential proxies help you access geo-restricted content. And ScrapeOps lets you monitor your scraper health and success rates across sources.
Automating Daily Collection
import schedule
import time
from datetime import datetime
def daily_collection():
    """Scrape salary data for a fixed set of queries and write the
    combined results to a dated CSV snapshot (salaries_YYYYMMDD.csv).

    Failures on individual queries are reported and skipped so one bad
    request does not lose the whole day's collection.
    """
    queries = ["python", "javascript", "devops", "data-science"]
    frames = []
    # One timestamp for the whole run so rows collected together match.
    collected_at = datetime.now().isoformat()
    for q in queries:
        try:
            df = scrape_salary_data(q)
        except Exception as exc:
            # Best-effort collection: log and move on to the next query.
            print(f"Skipping query {q!r}: {exc}")
            continue
        df["query"] = q
        df["collected_date"] = collected_at
        frames.append(df)
    if not frames:
        # pd.concat([]) raises; report the empty run explicitly instead.
        print("Collected 0 salary records")
        return
    combined = pd.concat(frames, ignore_index=True)
    combined.to_csv(f"salaries_{datetime.now().strftime('%Y%m%d')}.csv", index=False)
    print(f"Collected {len(combined)} salary records")
schedule.every().day.at("09:00").do(daily_collection)
# schedule only REGISTERS the job; without this polling loop the job
# would never actually fire.  Run this script as a long-lived process.
while True:
    schedule.run_pending()
    time.sleep(60)
What You Can Build With This
- Negotiation tool: Know your market rate before your next review
- Hiring dashboard: Track competitor compensation in real-time
- Trend reports: Identify which roles are seeing the fastest salary growth
- Geographic arbitrage finder: Compare remote salaries across cost-of-living zones
Follow for more Python data engineering tutorials!
Top comments (0)