DEV Community

agenthustler
agenthustler

Posted on

Building a Job Board Aggregator: Indeed, LinkedIn, and Glassdoor

Why Build a Job Board Aggregator?

Job seekers waste hours checking multiple platforms daily. Recruiters need market intelligence across boards. A job aggregator solves both problems — one API, all listings, structured data.

Let's build a Python aggregator that pulls from Indeed, LinkedIn, and Glassdoor.

Architecture

Our aggregator follows a plugin pattern where each job board gets its own scraper class:

from abc import ABC, abstractmethod
from dataclasses import dataclass, asdict
from typing import List, Optional
import json

@dataclass
class JobListing:
    """Normalized job posting shared by all scrapers.

    Required fields are populated by every board; the optional fields stay
    None when a board does not expose that data.
    """
    title: str
    company: str
    location: str
    salary: Optional[str]  # raw salary text as displayed on the board; not parsed to a number
    url: str  # absolute link to the posting
    source: str  # board identifier, e.g. "indeed" or "linkedin"
    description: Optional[str] = None
    posted_date: Optional[str] = None  # board-formatted date string — format varies per board, TODO confirm

class JobScraper(ABC):
    """Interface each board-specific scraper implements (plugin pattern)."""

    @abstractmethod
    def search(self, query: str, location: str, pages: int = 1) -> List[JobListing]:
        """Return listings matching `query` near `location`, scanning up to `pages` result pages."""
        pass
Enter fullscreen mode Exit fullscreen mode

Indeed Scraper

Indeed is the largest job board. Their listings are rendered server-side, making them relatively easy to parse:

import time
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup

class IndeedScraper(JobScraper):
    """Scrapes Indeed search results, proxied through the ScraperAPI service."""

    def __init__(self, api_key):
        # ScraperAPI credentials; every request goes through base_url.
        self.api_key = api_key
        self.base_url = "https://api.scraperapi.com"

    def search(self, query, location, pages=1):
        """Return JobListings for `query`/`location` across `pages` result pages.

        Indeed paginates in steps of 10 via the `start` query parameter.
        """
        listings = []
        for page in range(pages):
            # URL-encode user-supplied terms so spaces and special
            # characters survive the round trip through the proxy.
            url = (
                "https://www.indeed.com/jobs"
                f"?q={quote_plus(query)}&l={quote_plus(location)}&start={page * 10}"
            )
            html = self._fetch(url)
            listings.extend(self._parse(html))
            time.sleep(2)  # throttle: stay polite between page fetches
        return listings

    def _fetch(self, url):
        """Fetch `url` through ScraperAPI and return the rendered HTML."""
        # render=true asks ScraperAPI to execute JavaScript before responding.
        resp = requests.get(
            self.base_url,
            params={"api_key": self.api_key, "url": url, "render": "true"},
            timeout=60,  # rendered fetches are slow, but never hang forever
        )
        resp.raise_for_status()  # surface 4xx/5xx instead of parsing an error page
        return resp.text

    def _parse(self, html):
        """Extract JobListings from an Indeed search-results page."""
        soup = BeautifulSoup(html, "html.parser")
        jobs = []

        # Selectors cover both current and legacy Indeed result markup.
        for card in soup.select(".job_seen_beacon, .jobsearch-ResultsList > li"):
            title_el = card.select_one(".jobTitle a, h2.jobTitle")
            company_el = card.select_one(".companyName, [data-testid=company-name]")
            location_el = card.select_one(".companyLocation, [data-testid=text-location]")
            salary_el = card.select_one(".salary-snippet, .estimatedSalary")

            # Skip cards missing the essentials (ads, malformed markup).
            if title_el and company_el:
                link = title_el.get("href", "")
                if link.startswith("/"):
                    # Indeed links are site-relative; make them absolute.
                    link = f"https://www.indeed.com{link}"

                jobs.append(JobListing(
                    title=title_el.get_text(strip=True),
                    company=company_el.get_text(strip=True),
                    location=location_el.get_text(strip=True) if location_el else "",
                    salary=salary_el.get_text(strip=True) if salary_el else None,
                    url=link,
                    source="indeed"
                ))
        return jobs
Enter fullscreen mode Exit fullscreen mode

LinkedIn Scraper

LinkedIn's public job listings do not require authentication:

class LinkedInScraper(JobScraper):
    """Scrapes LinkedIn's public (unauthenticated) job search via ScraperAPI."""

    def __init__(self, api_key):
        # ScraperAPI credentials; every request goes through base_url.
        self.api_key = api_key
        self.base_url = "https://api.scraperapi.com"

    def search(self, query, location, pages=1):
        """Return JobListings for `query`/`location` across `pages` result pages.

        LinkedIn paginates in steps of 25 via the `start` query parameter.
        """
        listings = []
        for page in range(pages):
            start = page * 25
            # URL-encode user input so spaces/special characters survive the URL.
            url = (
                "https://www.linkedin.com/jobs/search"
                f"?keywords={quote_plus(query)}&location={quote_plus(location)}&start={start}"
            )
            html = self._fetch(url)
            listings.extend(self._parse(html))
            time.sleep(3)  # LinkedIn is aggressive with rate limiting
        return listings

    def _fetch(self, url):
        """Fetch `url` through ScraperAPI and return the rendered HTML."""
        # render=true makes ScraperAPI execute JavaScript before responding.
        resp = requests.get(
            self.base_url,
            params={"api_key": self.api_key, "url": url, "render": "true"},
            timeout=60,  # bound the request instead of hanging indefinitely
        )
        resp.raise_for_status()  # fail loudly on 4xx/5xx proxy/target errors
        return resp.text

    def _parse(self, html):
        """Extract JobListings from a LinkedIn search-results page."""
        soup = BeautifulSoup(html, "html.parser")
        jobs = []

        for card in soup.select(".base-card, .job-search-card"):
            title_el = card.select_one(".base-search-card__title")
            company_el = card.select_one(".base-search-card__subtitle")
            location_el = card.select_one(".job-search-card__location")
            link_el = card.select_one("a.base-card__full-link")

            # Skip cards missing the essentials (ads, malformed markup).
            if title_el and company_el:
                jobs.append(JobListing(
                    title=title_el.get_text(strip=True),
                    company=company_el.get_text(strip=True),
                    location=location_el.get_text(strip=True) if location_el else "",
                    salary=None,  # LinkedIn search cards do not expose salary
                    url=link_el.get("href", "") if link_el else "",
                    source="linkedin"
                ))
        return jobs
Enter fullscreen mode Exit fullscreen mode

The Aggregator

Combine all scrapers into a unified interface:

class JobAggregator:
    """Fans a search out to every registered scraper and merges the results."""

    def __init__(self, api_key):
        # One scraper instance per supported board, keyed by source name.
        self.scrapers = {
            "indeed": IndeedScraper(api_key),
            "linkedin": LinkedInScraper(api_key),
        }

    def search_all(self, query, location, pages=1):
        """Run the search on every board and return de-duplicated listings.

        A failing scraper is reported and skipped so one broken board does
        not abort the whole run. Duplicates are detected by (title, company),
        case-insensitively; the first occurrence wins.
        """
        all_jobs = []
        for name, scraper in self.scrapers.items():
            print(f"Searching {name}...")
            try:
                jobs = scraper.search(query, location, pages)
                all_jobs.extend(jobs)
                print(f"  Found {len(jobs)} jobs")
            except Exception as e:
                # Best-effort aggregation: log and continue with other boards.
                print(f"  Error: {e}")

        # Deduplicate by title + company
        seen = set()
        unique = []
        for job in all_jobs:
            key = (job.title.lower(), job.company.lower())
            if key not in seen:
                seen.add(key)
                unique.append(job)

        return unique

    def export_json(self, jobs, filename):
        """Write `jobs` to `filename` as a pretty-printed JSON array of dicts."""
        # utf-8 + ensure_ascii=False keeps non-ASCII job titles readable.
        with open(filename, "w", encoding="utf-8") as f:
            json.dump([asdict(j) for j in jobs], f, indent=2, ensure_ascii=False)
        # Bug fix: the message previously omitted the destination file name.
        print(f"Exported {len(jobs)} jobs to {filename}")

# Usage — guarded so importing this module does not trigger network calls.
if __name__ == "__main__":
    agg = JobAggregator(api_key="YOUR_SCRAPERAPI_KEY")
    jobs = agg.search_all("python developer", "San Francisco", pages=3)
    agg.export_json(jobs, "sf_python_jobs.json")
Enter fullscreen mode Exit fullscreen mode

Handling Anti-Bot Protection

Job boards use aggressive bot detection:

  • ScraperAPI handles CAPTCHAs and IP rotation automatically
  • ThorData residential proxies help avoid IP blocks
  • Rate limiting is essential — never send more than one request every 2-3 seconds per site

Monitoring

Track your scraper health with ScrapeOps. Job boards frequently change their HTML structure, and ScrapeOps alerts you when success rates drop.

Taking It Further

  • Add email alerts for new postings matching criteria
  • Build a simple web dashboard with Flask/FastAPI
  • Track salary trends over time
  • Add filters for remote-only, seniority level, etc.
  • Store in PostgreSQL for advanced querying

Conclusion

A job board aggregator is a practical project with real users. Whether for personal job searching or building a commercial product, the ability to unify job data across platforms creates significant value.

Top comments (0)