DEV Community

agenthustler
agenthustler

Posted on

Building a Job Market Tracker: Aggregate LinkedIn, Indeed, and Glassdoor Data

Job market data from multiple platforms reveals salary trends, in-demand skills, and hiring patterns that no single source can show. Here's how to build a multi-source job market tracker with Python.

Why Track Multiple Job Platforms?

  • Salary intelligence: Compare compensation across sources
  • Skill demand: Track which skills appear most in job postings
  • Market timing: Identify hiring surges and freezes
  • Geographic trends: See where jobs are concentrating
  • Company analysis: Track which companies are hiring aggressively

Architecture: Multi-Source Aggregator

import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import time
from abc import ABC, abstractmethod
from datetime import datetime

class JobScraper(ABC):
    """Abstract interface shared by all job-platform scrapers.

    Subclasses implement ``search_jobs`` (fetch and paginate results) and
    ``parse_listing`` (turn one platform-specific raw listing into a dict).
    """

    # Desktop-browser UA string: some job boards serve stripped-down markup
    # (or block outright) when they see the default requests user agent.
    _DEFAULT_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    }

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update(self._DEFAULT_HEADERS)

    @abstractmethod
    def search_jobs(self, query, location, pages=1):
        """Return a list of parsed listing dicts for the query/location."""

    @abstractmethod
    def parse_listing(self, raw_data):
        """Convert one raw listing into a plain dict of fields."""


class IndeedScraper(JobScraper):
    """Scraper for Indeed search-result pages."""

    BASE_URL = "https://www.indeed.com/jobs"
    REQUEST_TIMEOUT = 30  # seconds; without it a stalled request hangs the whole run

    def search_jobs(self, query, location, pages=3):
        """Fetch up to `pages` result pages and return parsed listing dicts.

        Each returned dict is tagged with source='indeed'. Cards that
        cannot be parsed (no title) are skipped.
        """
        results = []
        for page in range(pages):
            params = {
                'q': query,
                'l': location,
                'start': page * 10,  # Indeed paginates in steps of 10
            }
            resp = self.session.get(self.BASE_URL, params=params,
                                    timeout=self.REQUEST_TIMEOUT)
            soup = BeautifulSoup(resp.text, 'html.parser')

            cards = soup.select('.job_seen_beacon')
            for card in cards:
                parsed = self.parse_listing(card)
                if parsed:  # None for decorative/ad cards with no title
                    parsed['source'] = 'indeed'
                    results.append(parsed)

            time.sleep(3)  # polite delay between page fetches
        return results

    def parse_listing(self, card):
        """Parse one result card into a dict, or None when it has no title.

        Previously an all-empty dict was returned for unparseable cards;
        since any non-empty dict is truthy, the caller's `if parsed:` check
        never filtered anything and empty records polluted the dataset.
        """
        title = card.select_one('.jobTitle')
        if title is None:
            return None

        company = card.select_one('[data-testid="company-name"]')
        location = card.select_one('[data-testid="text-location"]')
        salary = card.select_one('.salary-snippet-container')
        snippet = card.select_one('.job-snippet')

        return {
            'title': title.get_text(strip=True),
            'company': company.get_text(strip=True) if company else '',
            'location': location.get_text(strip=True) if location else '',
            'salary': salary.get_text(strip=True) if salary else '',
            'description': snippet.get_text(strip=True) if snippet else '',
            'collected_at': datetime.now().isoformat(),
        }
Enter fullscreen mode Exit fullscreen mode

Aggregating Across Platforms

class JobAggregator:
    """Fan one search out to every registered platform and pool the results."""

    def __init__(self):
        # Platform name -> scraper instance; register new platforms here.
        self.scrapers = {
            'indeed': IndeedScraper(),
        }
        self.all_jobs = []

    def search_all(self, query, location, pages=2):
        """Run the same search on every platform, accumulating into all_jobs."""
        for name, scraper in self.scrapers.items():
            try:
                found = scraper.search_jobs(query, location, pages)
            except Exception as e:
                print(f"  {name}: Error - {e}")
            else:
                self.all_jobs.extend(found)
                print(f"  {name}: {len(found)} listings found")
            time.sleep(5)  # Pause between platforms

        return self.all_jobs

    def deduplicate(self):
        """Return all_jobs as a DataFrame with duplicates removed.

        Two listings count as duplicates when they share the same
        case-insensitive title and company; the first occurrence wins.
        """
        df = pd.DataFrame(self.all_jobs)
        if df.empty:
            return df

        key = df['title'].str.lower() + '|' + df['company'].str.lower()
        deduped = df.assign(dedup_key=key).drop_duplicates('dedup_key', keep='first')
        return deduped.drop(columns='dedup_key')
Enter fullscreen mode Exit fullscreen mode

Skill Demand Analysis

import re
from collections import Counter

TECH_SKILLS = [
    'python', 'javascript', 'typescript', 'react', 'node.js', 'aws',
    'docker', 'kubernetes', 'sql', 'postgresql', 'mongodb', 'redis',
    'go', 'rust', 'java', 'c++', 'tensorflow', 'pytorch', 'llm',
    'machine learning', 'data science', 'devops', 'ci/cd', 'terraform',
    'graphql', 'rest api', 'microservices', 'agile', 'scrum',
]

# Pre-compiled whole-token patterns so that e.g. 'go' does not match inside
# 'golang'/'django' and 'java' does not match inside 'javascript'. \b does
# not work at non-word edges ('c++', 'ci/cd'), so use \w lookarounds instead.
_SKILL_PATTERNS = {
    skill: re.compile(r'(?<!\w)' + re.escape(skill) + r'(?!\w)')
    for skill in TECH_SKILLS
}

def analyze_skill_demand(jobs_df):
    """Extract and rank in-demand skills from job listings.

    For each skill in TECH_SKILLS, counts how many listings mention it
    (title + description, case-insensitive, whole-token match), prints a
    ranked report, and returns the Counter mapping skill -> listing count.
    """
    skill_counts = Counter()
    total_jobs = len(jobs_df)

    print(f"Skill demand analysis ({total_jobs} listings):")
    print("-" * 45)
    if total_jobs == 0:
        # Nothing to analyze; also avoids division by zero below.
        return skill_counts

    for _, job in jobs_df.iterrows():
        text = f"{job.get('title', '')} {job.get('description', '')}".lower()
        for skill, pattern in _SKILL_PATTERNS.items():
            if pattern.search(text):
                skill_counts[skill] += 1

    for skill, count in skill_counts.most_common(20):
        pct = count / total_jobs * 100
        # Was `"" * int(pct / 2)`: multiplying the empty string always
        # produced an empty bar, so the chart never rendered.
        bar = "#" * int(pct / 2)
        print(f"  {skill:20s} {count:4d} ({pct:5.1f}%) {bar}")

    return skill_counts


def analyze_salary_ranges(jobs_df):
    """Parse and analyze salary data from listings.

    Extracts dollar amounts (e.g. "$85,000" and shorthand "$120k") from each
    listing's 'salary' text, prints summary statistics, and returns a
    DataFrame with min/max/mid salary per listing (empty if none found).
    """
    salaries = []
    for _, job in jobs_df.iterrows():
        salary_text = job.get('salary', '')
        # Guard against NaN/None (e.g. after a CSV round-trip, where empty
        # cells come back as float NaN) as well as the empty string.
        if not isinstance(salary_text, str) or not salary_text:
            continue

        # Capture "$85,000" plus the shorthand "$120k"/"$120K" forms.
        matches = re.findall(r'\$(\d{2,3}(?:,\d{3})*)\s*([kK])?', salary_text)
        if matches:
            values = [
                int(num.replace(',', '')) * (1000 if suffix else 1)
                for num, suffix in matches
            ]
            salaries.append({
                'title': job['title'],
                'company': job['company'],
                'min_salary': min(values),
                'max_salary': max(values),
                # Mean of all amounts found in the text, not (min+max)/2.
                'mid_salary': sum(values) / len(values),
                'source': job.get('source', ''),
            })

    if not salaries:
        print("No salary data found in listings")
        return pd.DataFrame()

    df = pd.DataFrame(salaries)
    print(f"\nSalary analysis ({len(df)} listings with salary data):")
    print(f"  Median salary: ${df['mid_salary'].median():,.0f}")
    print(f"  Average salary: ${df['mid_salary'].mean():,.0f}")
    print(f"  Range: ${df['min_salary'].min():,.0f} - ${df['max_salary'].max():,.0f}")

    return df
Enter fullscreen mode Exit fullscreen mode

Building the Full Pipeline

def run_job_market_tracker(queries, locations, output_dir='job_data'):
    """Full pipeline: collect, aggregate, analyze."""
    import os
    os.makedirs(output_dir, exist_ok=True)
    stamp = datetime.now().strftime('%Y%m%d')

    agg = JobAggregator()

    # One search pass per (query, location) combination.
    for q in queries:
        for loc in locations:
            print(f"\nSearching: '{q}' in {loc}")
            agg.search_all(q, loc, pages=2)
            time.sleep(5)

    # Collapse cross-platform duplicates and persist the day's snapshot.
    unique_jobs = agg.deduplicate()
    out_path = f"{output_dir}/jobs_{stamp}.csv"
    unique_jobs.to_csv(out_path, index=False)
    print(f"\nSaved {len(unique_jobs)} unique listings to {out_path}")

    # Run the analyses on the deduplicated set.
    print("\n" + "=" * 50)
    analyze_skill_demand(unique_jobs)
    analyze_salary_ranges(unique_jobs)

    return unique_jobs

# Example usage — guarded so that merely importing this module does not kick
# off a multi-minute network scraping run as a side effect.
if __name__ == "__main__":
    df = run_job_market_tracker(
        queries=['python developer', 'data engineer', 'ML engineer'],
        locations=['San Francisco', 'New York', 'Remote'],
    )
Enter fullscreen mode Exit fullscreen mode

Scaling with Cloud Scrapers

For production-grade job market tracking, use cloud-based scraping services that handle anti-bot measures for you.

Such services take care of session management, proxy rotation, and data structuring — critical for platforms with aggressive bot detection.

For reliable proxy infrastructure across all job platforms, ThorData offers residential proxies optimized for high-volume data collection.

Conclusion

A multi-source job market tracker gives you intelligence that no single platform provides. The pipeline above implements Indeed; by adding LinkedIn and Glassdoor scrapers on the same `JobScraper` base class, you can track real salary trends, identify in-demand skills, and spot hiring patterns before they become obvious. Start with the Python pipeline above, add cloud scrapers for scale, and run it on a schedule for continuous market intelligence.

Top comments (0)