DEV Community

agenthustler
agenthustler

Posted on

Building a Job Market Tracker: Aggregate LinkedIn, Indeed, and Glassdoor Data

Job market data from multiple platforms reveals salary trends, in-demand skills, and hiring patterns that no single source can show. Here's how to build a multi-source job market tracker with Python.

Why Track Multiple Job Platforms?

  • Salary intelligence: Compare compensation across sources
  • Skill demand: Track which skills appear most in job postings
  • Market timing: Identify hiring surges and freezes
  • Geographic trends: See where jobs are concentrating
  • Company analysis: Track which companies are hiring aggressively

Architecture: Multi-Source Aggregator

import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import time
from abc import ABC, abstractmethod
from datetime import datetime

class JobScraper(ABC):
    """Abstract interface shared by all job-platform scrapers.

    Subclasses implement ``search_jobs`` (fetch and paginate results) and
    ``parse_listing`` (turn one platform-specific raw listing into a dict).
    """

    # Desktop-browser UA string: some job boards serve stripped-down markup
    # (or block outright) when they see the default requests user agent.
    _DEFAULT_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    }

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update(self._DEFAULT_HEADERS)

    @abstractmethod
    def search_jobs(self, query, location, pages=1):
        """Return a list of parsed listing dicts for the query/location."""

    @abstractmethod
    def parse_listing(self, raw_data):
        """Convert one raw listing into a plain dict of fields."""


class IndeedScraper(JobScraper):
    """Scraper for Indeed search-result pages."""

    BASE_URL = "https://www.indeed.com/jobs"
    REQUEST_TIMEOUT = 30  # seconds; without it a stalled request hangs the whole run

    def search_jobs(self, query, location, pages=3):
        """Fetch up to `pages` result pages and return parsed listing dicts.

        Each returned dict is tagged with source='indeed'. Cards that
        cannot be parsed (no title) are skipped.
        """
        results = []
        for page in range(pages):
            params = {
                'q': query,
                'l': location,
                'start': page * 10,  # Indeed paginates in steps of 10
            }
            resp = self.session.get(self.BASE_URL, params=params,
                                    timeout=self.REQUEST_TIMEOUT)
            soup = BeautifulSoup(resp.text, 'html.parser')

            cards = soup.select('.job_seen_beacon')
            for card in cards:
                parsed = self.parse_listing(card)
                if parsed:  # None for decorative/ad cards with no title
                    parsed['source'] = 'indeed'
                    results.append(parsed)

            time.sleep(3)  # polite delay between page fetches
        return results

    def parse_listing(self, card):
        """Parse one result card into a dict, or None when it has no title.

        Previously an all-empty dict was returned for unparseable cards;
        since any non-empty dict is truthy, the caller's `if parsed:` check
        never filtered anything and empty records polluted the dataset.
        """
        title = card.select_one('.jobTitle')
        if title is None:
            return None

        company = card.select_one('[data-testid="company-name"]')
        location = card.select_one('[data-testid="text-location"]')
        salary = card.select_one('.salary-snippet-container')
        snippet = card.select_one('.job-snippet')

        return {
            'title': title.get_text(strip=True),
            'company': company.get_text(strip=True) if company else '',
            'location': location.get_text(strip=True) if location else '',
            'salary': salary.get_text(strip=True) if salary else '',
            'description': snippet.get_text(strip=True) if snippet else '',
            'collected_at': datetime.now().isoformat(),
        }
Enter fullscreen mode Exit fullscreen mode

Aggregating Across Platforms

class JobAggregator:
    """Fan one search out to every registered platform and pool the results."""

    def __init__(self):
        # Platform name -> scraper instance; register new platforms here.
        self.scrapers = {
            'indeed': IndeedScraper(),
        }
        self.all_jobs = []

    def search_all(self, query, location, pages=2):
        """Run the same search on every platform, accumulating into all_jobs."""
        for name, scraper in self.scrapers.items():
            try:
                found = scraper.search_jobs(query, location, pages)
            except Exception as e:
                print(f"  {name}: Error - {e}")
            else:
                self.all_jobs.extend(found)
                print(f"  {name}: {len(found)} listings found")
            time.sleep(5)  # Pause between platforms

        return self.all_jobs

    def deduplicate(self):
        """Return all_jobs as a DataFrame with duplicates removed.

        Two listings count as duplicates when they share the same
        case-insensitive title and company; the first occurrence wins.
        """
        df = pd.DataFrame(self.all_jobs)
        if df.empty:
            return df

        key = df['title'].str.lower() + '|' + df['company'].str.lower()
        deduped = df.assign(dedup_key=key).drop_duplicates('dedup_key', keep='first')
        return deduped.drop(columns='dedup_key')
Enter fullscreen mode Exit fullscreen mode

Skill Demand Analysis

import re
from collections import Counter

TECH_SKILLS = [
    'python', 'javascript', 'typescript', 'react', 'node.js', 'aws',
    'docker', 'kubernetes', 'sql', 'postgresql', 'mongodb', 'redis',
    'go', 'rust', 'java', 'c++', 'tensorflow', 'pytorch', 'llm',
    'machine learning', 'data science', 'devops', 'ci/cd', 'terraform',
    'graphql', 'rest api', 'microservices', 'agile', 'scrum',
]

# Pre-compiled whole-token patterns so that e.g. 'go' does not match inside
# 'golang'/'django' and 'java' does not match inside 'javascript'. \b does
# not work at non-word edges ('c++', 'ci/cd'), so use \w lookarounds instead.
_SKILL_PATTERNS = {
    skill: re.compile(r'(?<!\w)' + re.escape(skill) + r'(?!\w)')
    for skill in TECH_SKILLS
}

def analyze_skill_demand(jobs_df):
    """Extract and rank in-demand skills from job listings.

    For each skill in TECH_SKILLS, counts how many listings mention it
    (title + description, case-insensitive, whole-token match), prints a
    ranked report, and returns the Counter mapping skill -> listing count.
    """
    skill_counts = Counter()
    total_jobs = len(jobs_df)

    print(f"Skill demand analysis ({total_jobs} listings):")
    print("-" * 45)
    if total_jobs == 0:
        # Nothing to analyze; also avoids division by zero below.
        return skill_counts

    for _, job in jobs_df.iterrows():
        text = f"{job.get('title', '')} {job.get('description', '')}".lower()
        for skill, pattern in _SKILL_PATTERNS.items():
            if pattern.search(text):
                skill_counts[skill] += 1

    for skill, count in skill_counts.most_common(20):
        pct = count / total_jobs * 100
        # Was `"" * int(pct / 2)`: multiplying the empty string always
        # produced an empty bar, so the chart never rendered.
        bar = "#" * int(pct / 2)
        print(f"  {skill:20s} {count:4d} ({pct:5.1f}%) {bar}")

    return skill_counts


def analyze_salary_ranges(jobs_df):
    """Parse and analyze salary data from listings.

    Extracts dollar amounts (e.g. "$85,000" and shorthand "$120k") from each
    listing's 'salary' text, prints summary statistics, and returns a
    DataFrame with min/max/mid salary per listing (empty if none found).
    """
    salaries = []
    for _, job in jobs_df.iterrows():
        salary_text = job.get('salary', '')
        # Guard against NaN/None (e.g. after a CSV round-trip, where empty
        # cells come back as float NaN) as well as the empty string.
        if not isinstance(salary_text, str) or not salary_text:
            continue

        # Capture "$85,000" plus the shorthand "$120k"/"$120K" forms.
        matches = re.findall(r'\$(\d{2,3}(?:,\d{3})*)\s*([kK])?', salary_text)
        if matches:
            values = [
                int(num.replace(',', '')) * (1000 if suffix else 1)
                for num, suffix in matches
            ]
            salaries.append({
                'title': job['title'],
                'company': job['company'],
                'min_salary': min(values),
                'max_salary': max(values),
                # Mean of all amounts found in the text, not (min+max)/2.
                'mid_salary': sum(values) / len(values),
                'source': job.get('source', ''),
            })

    if not salaries:
        print("No salary data found in listings")
        return pd.DataFrame()

    df = pd.DataFrame(salaries)
    print(f"\nSalary analysis ({len(df)} listings with salary data):")
    print(f"  Median salary: ${df['mid_salary'].median():,.0f}")
    print(f"  Average salary: ${df['mid_salary'].mean():,.0f}")
    print(f"  Range: ${df['min_salary'].min():,.0f} - ${df['max_salary'].max():,.0f}")

    return df
Enter fullscreen mode Exit fullscreen mode

Building the Full Pipeline

def run_job_market_tracker(queries, locations, output_dir='job_data'):
    """Full pipeline: collect, aggregate, analyze."""
    import os
    os.makedirs(output_dir, exist_ok=True)
    stamp = datetime.now().strftime('%Y%m%d')

    agg = JobAggregator()

    # One search pass per (query, location) combination.
    for q in queries:
        for loc in locations:
            print(f"\nSearching: '{q}' in {loc}")
            agg.search_all(q, loc, pages=2)
            time.sleep(5)

    # Collapse cross-platform duplicates and persist the day's snapshot.
    unique_jobs = agg.deduplicate()
    out_path = f"{output_dir}/jobs_{stamp}.csv"
    unique_jobs.to_csv(out_path, index=False)
    print(f"\nSaved {len(unique_jobs)} unique listings to {out_path}")

    # Run the analyses on the deduplicated set.
    print("\n" + "=" * 50)
    analyze_skill_demand(unique_jobs)
    analyze_salary_ranges(unique_jobs)

    return unique_jobs

# Example usage — guarded so that merely importing this module does not kick
# off a multi-minute network scraping run as a side effect.
if __name__ == "__main__":
    df = run_job_market_tracker(
        queries=['python developer', 'data engineer', 'ML engineer'],
        locations=['San Francisco', 'New York', 'Remote'],
    )
Enter fullscreen mode Exit fullscreen mode

Scaling with Cloud Scrapers

For production-grade job market tracking, use cloud-based scraping services that handle anti-bot measures for you.

Such services take care of session management, proxy rotation, and data structuring — critical for platforms with aggressive bot detection.

For reliable proxy infrastructure across all job platforms, ThorData offers residential proxies optimized for high-volume data collection.

Conclusion

A multi-source job market tracker gives you intelligence that no single platform provides. The pipeline above implements Indeed; by adding LinkedIn and Glassdoor scrapers on the same `JobScraper` base class, you can track real salary trends, identify in-demand skills, and spot hiring patterns before they become obvious. Start with the Python pipeline above, add cloud scrapers for scale, and run it on a schedule for continuous market intelligence.

Top comments (0)