Why Build a Job Board Aggregator?
Job seekers waste hours checking multiple platforms daily. Recruiters need market intelligence across boards. A job aggregator solves both problems — one API, all listings, structured data.
Let's build a Python aggregator that pulls from Indeed, LinkedIn, and Glassdoor.
Architecture
Our aggregator follows a plugin pattern where each job board gets its own scraper class:
from abc import ABC, abstractmethod
from dataclasses import dataclass, asdict
from typing import List, Optional
import json
@dataclass
class JobListing:
    """A single normalized job posting, regardless of which board it came from."""

    title: str
    company: str
    location: str
    # Free-form salary text as shown on the board (e.g. a range string); not parsed.
    salary: Optional[str]
    url: str
    # Which scraper produced this listing, e.g. "indeed" or "linkedin".
    source: str
    description: Optional[str] = None
    # Posting date as scraped text — format varies by board; not normalized here.
    posted_date: Optional[str] = None
class JobScraper(ABC):
    """Abstract base for one job board's scraper (the plugin interface)."""

    @abstractmethod
    def search(self, query: str, location: str, pages: int = 1) -> List[JobListing]:
        """Return listings matching `query` near `location`, across `pages` result pages."""
        pass
Indeed Scraper
Indeed is the largest job board. Their listings are rendered server-side, making them relatively easy to parse:
import time
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup
class IndeedScraper(JobScraper):
    """Scrapes Indeed search results, routed through the ScraperAPI proxy."""

    def __init__(self, api_key):
        self.api_key = api_key
        self.base_url = "https://api.scraperapi.com"

    def search(self, query, location, pages=1):
        """Return JobListings for `query`/`location` across `pages` result pages."""
        listings = []
        for page in range(pages):
            # Indeed paginates in steps of 10. URL-encode user input so
            # spaces/special characters don't produce a malformed target URL.
            url = (
                "https://www.indeed.com/jobs"
                f"?q={quote_plus(query)}&l={quote_plus(location)}&start={page * 10}"
            )
            html = self._fetch(url)
            listings.extend(self._parse(html))
            time.sleep(2)  # polite per-page delay to avoid rate limiting
        return listings

    def _fetch(self, url):
        """Fetch `url` through ScraperAPI with JS rendering; return the HTML."""
        resp = requests.get(
            self.base_url,
            params={
                "api_key": self.api_key,
                "url": url,
                "render": "true",
            },
            # Rendered fetches are slow, but never hang forever on a dead proxy.
            timeout=60,
        )
        # Fail loudly on proxy/HTTP errors instead of parsing an error page
        # as if it were a results page (which silently yields zero listings).
        resp.raise_for_status()
        return resp.text

    def _parse(self, html):
        """Extract JobListings from one Indeed results page."""
        soup = BeautifulSoup(html, "html.parser")
        jobs = []
        # Two selector variants: Indeed serves different markup over time.
        for card in soup.select(".job_seen_beacon, .jobsearch-ResultsList > li"):
            title_el = card.select_one(".jobTitle a, h2.jobTitle")
            company_el = card.select_one(".companyName, [data-testid=company-name]")
            location_el = card.select_one(".companyLocation, [data-testid=text-location]")
            salary_el = card.select_one(".salary-snippet, .estimatedSalary")
            # Skip ads/partial cards that lack a title or company.
            if title_el and company_el:
                link = title_el.get("href", "")
                if link.startswith("/"):
                    # Indeed uses site-relative links; make them absolute.
                    link = f"https://www.indeed.com{link}"
                jobs.append(JobListing(
                    title=title_el.get_text(strip=True),
                    company=company_el.get_text(strip=True),
                    location=location_el.get_text(strip=True) if location_el else "",
                    salary=salary_el.get_text(strip=True) if salary_el else None,
                    url=link,
                    source="indeed"
                ))
        return jobs
LinkedIn Scraper
LinkedIn's public job listings do not require authentication:
class LinkedInScraper(JobScraper):
    """Scrapes LinkedIn's public job-search pages via the ScraperAPI proxy."""

    def __init__(self, api_key):
        self.api_key = api_key
        self.base_url = "https://api.scraperapi.com"

    def search(self, query, location, pages=1):
        """Return JobListings for `query`/`location` across `pages` result pages."""
        listings = []
        for page in range(pages):
            start = page * 25  # LinkedIn paginates in steps of 25
            # URL-encode user input so spaces/special characters don't
            # produce a malformed target URL.
            url = (
                "https://www.linkedin.com/jobs/search"
                f"?keywords={quote_plus(query)}&location={quote_plus(location)}&start={start}"
            )
            html = self._fetch(url)
            listings.extend(self._parse(html))
            time.sleep(3)  # LinkedIn is aggressive with rate limiting
        return listings

    def _fetch(self, url):
        """Fetch `url` through ScraperAPI with JS rendering; return the HTML."""
        resp = requests.get(
            self.base_url,
            params={
                "api_key": self.api_key,
                "url": url,
                "render": "true",
            },
            # Rendered fetches are slow, but never hang forever on a dead proxy.
            timeout=60,
        )
        # Surface proxy/HTTP errors rather than parsing an error page.
        resp.raise_for_status()
        return resp.text

    def _parse(self, html):
        """Extract JobListings from one LinkedIn results page."""
        soup = BeautifulSoup(html, "html.parser")
        jobs = []
        for card in soup.select(".base-card, .job-search-card"):
            title_el = card.select_one(".base-search-card__title")
            company_el = card.select_one(".base-search-card__subtitle")
            location_el = card.select_one(".job-search-card__location")
            link_el = card.select_one("a.base-card__full-link")
            # Skip partial cards that lack a title or company.
            if title_el and company_el:
                jobs.append(JobListing(
                    title=title_el.get_text(strip=True),
                    company=company_el.get_text(strip=True),
                    location=location_el.get_text(strip=True) if location_el else "",
                    salary=None,  # public LinkedIn cards don't expose salary
                    url=link_el.get("href", "") if link_el else "",
                    source="linkedin"
                ))
        return jobs
The Aggregator
Combine all scrapers into a unified interface:
class JobAggregator:
    """Fans a search out to every registered scraper and merges the results."""

    def __init__(self, api_key):
        # One ScraperAPI key is shared by all board-specific scrapers.
        self.scrapers = {
            "indeed": IndeedScraper(api_key),
            "linkedin": LinkedInScraper(api_key),
        }

    def search_all(self, query, location, pages=1):
        """Search every board; return deduplicated JobListings.

        A failing scraper is reported and skipped so one broken board
        doesn't lose the results from the others.
        """
        all_jobs = []
        for name, scraper in self.scrapers.items():
            print(f"Searching {name}...")
            try:
                jobs = scraper.search(query, location, pages)
                all_jobs.extend(jobs)
                print(f" Found {len(jobs)} jobs")
            except Exception as e:
                print(f" Error: {e}")
        # Deduplicate by (title, company), case-insensitively; the first
        # occurrence (i.e. the first board searched) wins.
        seen = set()
        unique = []
        for job in all_jobs:
            key = (job.title.lower(), job.company.lower())
            if key not in seen:
                seen.add(key)
                unique.append(job)
        return unique

    def export_json(self, jobs, filename):
        """Write `jobs` to `filename` as pretty-printed UTF-8 JSON."""
        # ensure_ascii=False keeps non-ASCII company/title text readable
        # instead of \uXXXX escapes.
        with open(filename, "w", encoding="utf-8") as f:
            json.dump([asdict(j) for j in jobs], f, indent=2, ensure_ascii=False)
        # Fixed: the original printed a broken "(unknown)" placeholder here.
        print(f"Exported {len(jobs)} jobs to {filename}")
# Usage
if __name__ == "__main__":
    # Guard so importing this module doesn't kick off a slow, billable scrape.
    agg = JobAggregator(api_key="YOUR_SCRAPERAPI_KEY")
    jobs = agg.search_all("python developer", "San Francisco", pages=3)
    agg.export_json(jobs, "sf_python_jobs.json")
Handling Anti-Bot Protection
Job boards use aggressive bot detection:
- ScraperAPI handles CAPTCHAs and IP rotation automatically
- ThorData residential proxies help avoid IP blocks
- Rate limiting is essential — send at most one request every 2-3 seconds
Monitoring
Track your scraper health with ScrapeOps. Job boards frequently change their HTML structure, and ScrapeOps alerts you when success rates drop.
Taking It Further
- Add email alerts for new postings matching criteria
- Build a simple web dashboard with Flask/FastAPI
- Track salary trends over time
- Add filters for remote-only, seniority level, etc.
- Store in PostgreSQL for advanced querying
Conclusion
A job board aggregator is a practical project with real users. Whether for personal job searching or building a commercial product, the ability to unify job data across platforms creates significant value.
Top comments (0)