Job market data from multiple platforms reveals salary trends, in-demand skills, and hiring patterns that no single source can show. Here's how to build a multi-source job market tracker with Python.
Why Track Multiple Job Platforms?
- Salary intelligence: Compare compensation across sources
- Skill demand: Track which skills appear most in job postings
- Market timing: Identify hiring surges and freezes
- Geographic trends: See where jobs are concentrating
- Company analysis: Track which companies are hiring aggressively
Architecture: Multi-Source Aggregator
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import time
from abc import ABC, abstractmethod
from datetime import datetime
class JobScraper(ABC):
    """Abstract interface that every job-platform scraper implements.

    Owns a shared ``requests.Session`` pre-loaded with a desktop
    User-Agent header so all subclasses issue consistent HTTP requests.
    """

    def __init__(self):
        session = requests.Session()
        session.headers['User-Agent'] = (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
        )
        self.session = session

    @abstractmethod
    def search_jobs(self, query, location, pages=1):
        """Return a list of parsed listing dicts for *query* in *location*."""

    @abstractmethod
    def parse_listing(self, raw_data):
        """Convert one raw platform payload into a normalized dict."""
class IndeedScraper(JobScraper):
    """Scraper for Indeed search-result pages.

    Fetches paginated search results and parses each job card into a
    flat dict. Throttles to one request every 3 seconds.
    """

    BASE_URL = "https://www.indeed.com/jobs"

    def search_jobs(self, query, location, pages=3):
        """Collect parsed listings for *query* in *location*.

        Args:
            query: Search keywords (e.g. "python developer").
            location: City/region string, or "Remote".
            pages: Maximum result pages to fetch (Indeed serves
                roughly 10 listings per page).

        Returns:
            List of listing dicts, each tagged with source='indeed'.
        """
        results = []
        for page in range(pages):
            params = {
                'q': query,
                'l': location,
                'start': page * 10,  # Indeed paginates in steps of 10
            }
            resp = self.session.get(self.BASE_URL, params=params)
            # Don't parse error pages (rate limits, captcha
            # interstitials); skip them instead of extracting garbage.
            if resp.status_code != 200:
                time.sleep(3)
                continue
            soup = BeautifulSoup(resp.text, 'html.parser')
            cards = soup.select('.job_seen_beacon')
            if not cards:
                # No job cards: we ran past the last result page (or
                # the layout changed), so stop requesting more pages.
                break
            for card in cards:
                parsed = self.parse_listing(card)
                if parsed:
                    parsed['source'] = 'indeed'
                    results.append(parsed)
            time.sleep(3)  # throttle between page requests
        return results

    def parse_listing(self, card):
        """Extract the fields of one result card (a bs4 Tag).

        Missing elements become empty strings so downstream pandas
        code always sees string-valued columns.
        """
        title = card.select_one('.jobTitle')
        company = card.select_one('[data-testid="company-name"]')
        location = card.select_one('[data-testid="text-location"]')
        salary = card.select_one('.salary-snippet-container')
        snippet = card.select_one('.job-snippet')
        return {
            'title': title.get_text(strip=True) if title else '',
            'company': company.get_text(strip=True) if company else '',
            'location': location.get_text(strip=True) if location else '',
            'salary': salary.get_text(strip=True) if salary else '',
            'description': snippet.get_text(strip=True) if snippet else '',
            'collected_at': datetime.now().isoformat(),
        }
Aggregating Across Platforms
class JobAggregator:
    """Fan one search out over every registered platform scraper and
    pool the results for downstream analysis."""

    def __init__(self):
        # Platform name -> scraper instance; register new platforms here.
        self.scrapers = {
            'indeed': IndeedScraper(),
        }
        self.all_jobs = []

    def search_all(self, query, location, pages=2):
        """Run search across all platforms."""
        for name, scraper in self.scrapers.items():
            try:
                found = scraper.search_jobs(query, location, pages)
            except Exception as e:
                print(f" {name}: Error - {e}")
            else:
                self.all_jobs.extend(found)
                print(f" {name}: {len(found)} listings found")
            time.sleep(5)  # Pause between platforms
        return self.all_jobs

    def deduplicate(self):
        """Remove duplicate listings across platforms."""
        df = pd.DataFrame(self.all_jobs)
        if df.empty:
            return df
        # Two listings count as the same job when title and company
        # match case-insensitively; the first occurrence wins.
        key = df['title'].str.lower() + '|' + df['company'].str.lower()
        return df.loc[~key.duplicated(keep='first')]
Skill Demand Analysis
import re
from collections import Counter
# Skills to scan for in listing text. Multi-word and symbol-bearing
# entries (e.g. 'c++', 'ci/cd') are matched as whole tokens below.
TECH_SKILLS = [
    'python', 'javascript', 'typescript', 'react', 'node.js', 'aws',
    'docker', 'kubernetes', 'sql', 'postgresql', 'mongodb', 'redis',
    'go', 'rust', 'java', 'c++', 'tensorflow', 'pytorch', 'llm',
    'machine learning', 'data science', 'devops', 'ci/cd', 'terraform',
    'graphql', 'rest api', 'microservices', 'agile', 'scrum',
]

def analyze_skill_demand(jobs_df):
    """Extract and rank in-demand skills from job listings.

    Counts, for each skill in TECH_SKILLS, how many listings mention
    it at least once, then prints a ranked bar chart.

    Args:
        jobs_df: DataFrame with string 'title' and 'description'
            columns (missing columns are treated as empty text).

    Returns:
        collections.Counter mapping skill -> number of listings.
    """
    # Whole-token matching: plain substring tests produce false
    # positives ('go' in 'django', 'java' in 'javascript'). The
    # lookarounds reject matches glued to word characters on either
    # side while still allowing '.', '+', '/' inside a skill name.
    # Compiled once here, outside the per-listing loop.
    patterns = {
        skill: re.compile(r'(?<!\w)' + re.escape(skill) + r'(?!\w)')
        for skill in TECH_SKILLS
    }
    skill_counts = Counter()
    total_jobs = len(jobs_df)
    for _, job in jobs_df.iterrows():
        text = f"{job.get('title', '')} {job.get('description', '')}".lower()
        for skill, pattern in patterns.items():
            if pattern.search(text):
                skill_counts[skill] += 1
    print(f"Skill demand analysis ({total_jobs} listings):")
    print("-" * 45)
    for skill, count in skill_counts.most_common(20):
        pct = count / total_jobs * 100
        bar = "█" * int(pct / 2)
        print(f" {skill:20s} {count:4d} ({pct:5.1f}%) {bar}")
    return skill_counts
def analyze_salary_ranges(jobs_df):
    """Parse and analyze salary data from listings.

    Handles "$90,000 - $120,000" style ranges as well as thousands
    shorthand like "$85K". Listings whose salary field is missing,
    empty, or non-string (pandas fills gaps with NaN floats) are
    skipped.

    Args:
        jobs_df: DataFrame of listings; uses the 'salary', 'title',
            'company', and (optionally) 'source' columns.

    Returns:
        DataFrame with one row per listing that had parseable salary
        data (min/max/mid columns), or an empty DataFrame.
    """
    # "$85,000" -> group 1 "85,000"; an optional trailing k/K flags
    # thousands shorthand ("$85k"). Compiled once, outside the loop.
    pattern = re.compile(r'\$(\d{2,3}(?:,\d{3})*)\s*([kK])?')
    salaries = []
    for _, job in jobs_df.iterrows():
        salary_text = job.get('salary', '')
        # NaN (a float) is truthy, so guard on type as well as emptiness
        # to avoid passing a non-string to the regex.
        if not isinstance(salary_text, str) or not salary_text:
            continue
        values = [
            int(num.replace(',', '')) * (1000 if suffix else 1)
            for num, suffix in pattern.findall(salary_text)
        ]
        if values:
            salaries.append({
                'title': job['title'],
                'company': job['company'],
                'min_salary': min(values),
                'max_salary': max(values),
                'mid_salary': sum(values) / len(values),
                'source': job.get('source', ''),
            })
    if not salaries:
        print("No salary data found in listings")
        return pd.DataFrame()
    df = pd.DataFrame(salaries)
    print(f"\nSalary analysis ({len(df)} listings with salary data):")
    print(f" Median salary: ${df['mid_salary'].median():,.0f}")
    print(f" Average salary: ${df['mid_salary'].mean():,.0f}")
    print(f" Range: ${df['min_salary'].min():,.0f} - ${df['max_salary'].max():,.0f}")
    return df
Building the Full Pipeline
def run_job_market_tracker(queries, locations, output_dir='job_data'):
    """Full pipeline: collect, aggregate, analyze."""
    import os

    os.makedirs(output_dir, exist_ok=True)
    day_stamp = datetime.now().strftime('%Y%m%d')
    aggregator = JobAggregator()

    # Collection phase: every query is searched in every location.
    for search_term in queries:
        for place in locations:
            print(f"\nSearching: '{search_term}' in {place}")
            aggregator.search_all(search_term, place, pages=2)
            time.sleep(5)

    # Persistence phase: dedupe and write a dated CSV snapshot.
    unique_jobs = aggregator.deduplicate()
    csv_path = f"{output_dir}/jobs_{day_stamp}.csv"
    unique_jobs.to_csv(csv_path, index=False)
    print(f"\nSaved {len(unique_jobs)} unique listings to {csv_path}")

    # Analysis phase.
    print("\n" + "=" * 50)
    analyze_skill_demand(unique_jobs)
    analyze_salary_ranges(unique_jobs)
    return unique_jobs
# Example usage - guarded so that importing this module does not kick
# off a slow, network-bound scrape as an import side effect.
if __name__ == '__main__':
    df = run_job_market_tracker(
        queries=['python developer', 'data engineer', 'ML engineer'],
        locations=['San Francisco', 'New York', 'Remote'],
    )
Scaling with Cloud Scrapers
For production-grade job market tracking, use cloud-based scrapers that handle anti-bot measures:
- LinkedIn Jobs: The LinkedIn Jobs Scraper on Apify collects job postings, salaries, and company data at scale
- Glassdoor: The Glassdoor Scraper on Apify extracts reviews, salaries, and interview data
These tools handle session management, proxy rotation, and data structuring — critical for platforms with aggressive bot detection.
For reliable proxy infrastructure across all job platforms, ThorData offers residential proxies optimized for high-volume data collection.
Conclusion
A multi-source job market tracker gives you intelligence that no single platform provides. The pipeline above implements Indeed; by extending it with LinkedIn and Glassdoor via the cloud scrapers described earlier, you can track real salary trends, identify in-demand skills, and spot hiring patterns before they become obvious. Start with the Python pipeline above, add cloud scrapers for scale, and run it on a schedule for continuous market intelligence.
Top comments (0)