Building a Tech Salary Database with Job Posting Data
Salary transparency laws have changed the game. Many job postings now include compensation ranges, creating an unprecedented opportunity to build comprehensive salary databases. Let's scrape and analyze this data.
The Data Sources
Several job boards now display salary ranges thanks to laws in Colorado, NYC, California, and Washington. Remote job boards with salary transparency are particularly valuable.
Setting Up
pip install requests beautifulsoup4 pandas sqlalchemy
ScraperAPI handles JavaScript rendering and proxy rotation for job board scraping.
Scraping Job Postings with Salary Data
import requests
from bs4 import BeautifulSoup
import re
# ScraperAPI credential -- substitute your own key, and keep it out of
# version control (load from an environment variable in real deployments).
API_KEY = "YOUR_SCRAPERAPI_KEY"
def scrape_jobs(query, location="remote", page=1):
    """Fetch one page of Indeed search results via ScraperAPI.

    Args:
        query: Search keywords (e.g. "data engineer").
        location: Location filter; defaults to "remote".
        page: 1-based results page number.

    Returns:
        A list of dicts with keys "title", "company", "salary_raw",
        "location" -- one per job card that displays a salary.

    Raises:
        requests.HTTPError: if ScraperAPI returns an error status.
        requests.Timeout: if the proxied request exceeds the timeout.
    """
    # Indeed's "start" offset is 0-based, so a 1-based page number must map
    # page 1 -> start=0. (The original page*10 skipped the first ten results.)
    url = f"https://www.indeed.com/jobs?q={query}&l={location}&start={(page - 1) * 10}"
    params = {
        "api_key": API_KEY,
        "url": url,
        "render": "true",  # job cards are injected client-side, so render JS
    }
    # A timeout keeps one hung proxy request from stalling the whole scraper;
    # raise_for_status surfaces quota/blocked errors instead of silently
    # parsing an error page into zero jobs.
    resp = requests.get("https://api.scraperapi.com", params=params, timeout=60)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    jobs = []
    for card in soup.select(".job_seen_beacon"):
        title = card.select_one(".jobTitle")
        company = card.select_one("[data-testid='company-name']")
        salary = card.select_one(".salary-snippet-container")
        # Only keep cards that actually display a salary -- the point of
        # the dataset -- and that have a parseable title.
        if title and salary:
            jobs.append({
                "title": title.get_text(strip=True),
                "company": company.get_text(strip=True) if company else None,
                "salary_raw": salary.get_text(strip=True),
                "location": location,
            })
    return jobs
Parsing Salary Ranges
def parse_salary(raw):
    """Parse a raw posted-salary string into a structured dict.

    Args:
        raw: Free-form salary text, e.g. "$120K - $180K a year".

    Returns:
        {"min": float, "max": float, "period": "annual" | "hourly"},
        or None when no numeric values are found.
    """
    raw = raw.replace(",", "").lower()
    # Capture each number together with ITS OWN optional "k" suffix. The old
    # global `"k" in raw` check multiplied every value by 1000 whenever the
    # letter k appeared anywhere in the string (e.g. "a week", "401k match").
    # Requiring at least one digit also fixes a crash where `[\d.]+` could
    # match a lone "." (e.g. in "approx.") and feed float(".").
    matches = re.findall(r"\$?(\d+(?:\.\d+)?)\s*(k)?", raw)
    values = [float(num) * (1000 if suffix else 1) for num, suffix in matches]
    # Explicit period words win; otherwise assume an annual figure, matching
    # the original default for unlabelled ranges.
    if any(w in raw for w in ["year", "annual"]):
        period = "annual"
    elif "hour" in raw:
        period = "hourly"
    else:
        period = "annual"
    if not values:
        return None
    # A single number is treated as a degenerate range (min == max).
    low = values[0]
    high = values[1] if len(values) >= 2 else values[0]
    return {"min": low, "max": high, "period": period}
print(parse_salary("$120K - $180K a year"))
# {'min': 120000.0, 'max': 180000.0, 'period': 'annual'}
Storing in SQLite
from sqlalchemy import create_engine, Column, String, Float, Integer
from sqlalchemy.orm import declarative_base, Session
# SQLite file in the working directory; swap the URL for Postgres/MySQL
# without changing any other code.
engine = create_engine("sqlite:///salaries.db")
Base = declarative_base()
class SalaryRecord(Base):
    """One salary observation scraped from a single job posting."""
    __tablename__ = "salaries"
    id = Column(Integer, primary_key=True)  # surrogate key, autoincremented by SQLite
    title = Column(String)       # job title as displayed on the card
    company = Column(String)     # may be NULL when the card omits the employer
    salary_min = Column(Float)   # lower bound of the posted range
    salary_max = Column(Float)   # equals salary_min for single-figure postings
    period = Column(String)      # "annual" or "hourly", from parse_salary
    location = Column(String)    # the search location, not the posting's own city
# Idempotent: creates the table only if it does not already exist.
Base.metadata.create_all(engine)
def store_jobs(jobs):
    """Parse and persist scraped job dicts, skipping unparseable salaries.

    Args:
        jobs: Iterable of dicts with "title", "company", "salary_raw",
            and "location" keys, as produced by scrape_jobs.
    """
    with Session(engine) as session:
        for entry in jobs:
            salary = parse_salary(entry["salary_raw"])
            if salary is None:
                # No usable numbers in the posting -- nothing to store.
                continue
            session.add(SalaryRecord(
                title=entry["title"],
                company=entry["company"],
                salary_min=salary["min"],
                salary_max=salary["max"],
                period=salary["period"],
                location=entry["location"],
            ))
        # One commit for the whole batch keeps the insert transactional.
        session.commit()
Analysis
import pandas as pd
# Load the entire salaries table into a DataFrame for ad-hoc analysis.
df = pd.read_sql("salaries", engine)
# Midpoint of the posted range: one comparable number per posting.
# NOTE(review): hourly and annual rows are mixed here -- filter on `period`
# (or normalize hourly to annual) before comparing across titles.
df["salary_mid"] = (df["salary_min"] + df["salary_max"]) / 2
# Top ten titles by average midpoint salary.
print(df.groupby("title")["salary_mid"].mean().sort_values(ascending=False).head(10))
# NOTE(review): plotting requires matplotlib to be installed; in
# non-interactive environments call matplotlib.pyplot.show() to display it.
df["salary_mid"].hist(bins=30, figsize=(10, 5))
For residential proxies at scale, use ThorData; monitor scraper health and success rates with ScrapeOps.
Key Takeaways
- Salary transparency laws created massive public compensation datasets
- Pattern matching extracts structured salary data from varied formats
- SQLite provides a lightweight but queryable storage layer
- Regular scraping builds longitudinal salary trend data
Respect robots.txt and rate limits. This data is publicly posted but should be collected responsibly.
Top comments (0)