Indeed hosts millions of job listings worldwide, making it a goldmine for job market analysis, salary research, and recruitment automation. This guide shows you how to extract job listings from Indeed using Python, handle pagination, and clean the data for analysis.
What Data Can You Extract?
From Indeed job listings, you can collect:
- Job titles and company names
- Salary ranges (when posted)
- Job descriptions and requirements
- Location and remote/hybrid indicators
- Posting dates and application links
- Company ratings and review counts
Setting Up the Scraper
import requests
from bs4 import BeautifulSoup
import json
import time
import csv
import random
from dataclasses import dataclass, asdict
from typing import Optional
@dataclass
class JobListing:
    """One job posting scraped from a search-results page."""
    title: str  # job title text; "N/A" when the card lacks a title element
    company: str  # company name; "N/A" when missing
    location: str  # location text (may include remote/hybrid wording); "N/A" when missing
    salary: Optional[str]  # raw salary snippet text; None when the posting omits salary
    summary: str  # short description snippet; empty string when missing
    date_posted: str  # relative date text as shown on the card; "N/A" when missing
    job_url: str  # absolute link to the posting
    source: str = "indeed"  # provenance tag, useful when merging multiple job boards
def create_session():
    """Build a requests.Session preloaded with browser-like headers.

    Returns:
        A requests.Session whose default headers mimic a desktop
        Chrome browser (User-Agent, Accept, Accept-Language).
    """
    browser_ua = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
    default_headers = {
        "User-Agent": browser_ua,
        "Accept-Language": "en-US,en;q=0.9",
        "Accept": "text/html,application/xhtml+xml",
    }
    session = requests.Session()
    session.headers.update(default_headers)
    return session
Scraping Job Listings
def scrape_indeed_page(session, query, location, start=0):
    """Scrape a single page of Indeed results.

    Args:
        session: requests.Session with browser-like headers.
        query: Search keywords (the "q" parameter).
        location: Location filter (the "l" parameter).
        start: Result offset for pagination (multiples of 10).

    Returns:
        A list of JobListing objects; empty on HTTP errors or no results.
    """
    response = session.get(
        "https://www.indeed.com/jobs",
        params={
            "q": query,
            "l": location,
            "start": start,
            "fromage": 7,  # restrict to listings from the last 7 days
        },
    )
    if response.status_code != 200:
        print(f"Error: HTTP {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    listings = []
    # Each result card lives under div.job_seen_beacon.
    for card in soup.select("div.job_seen_beacon"):
        try:
            anchor = card.select_one("h2.jobTitle a")
            href = anchor.get("href", "") if anchor else ""
            company_el = card.select_one("span[data-testid='company-name']")
            location_el = card.select_one("div[data-testid='text-location']")
            salary_el = card.select_one("div.salary-snippet-container")
            snippet_el = card.select_one("div.job-snippet")
            date_el = card.select_one("span.date")
            listings.append(JobListing(
                title=anchor.get_text(strip=True) if anchor else "N/A",
                company=company_el.get_text(strip=True) if company_el else "N/A",
                location=location_el.get_text(strip=True) if location_el else "N/A",
                salary=salary_el.get_text(strip=True) if salary_el else None,
                summary=snippet_el.get_text(strip=True) if snippet_el else "",
                date_posted=date_el.get_text(strip=True) if date_el else "N/A",
                # Relative hrefs need the site origin prepended.
                job_url=f"https://www.indeed.com{href}" if href.startswith("/") else href,
            ))
        except Exception as e:
            # Skip malformed cards rather than aborting the whole page.
            print(f"Error parsing job card: {e}")
            continue
    return listings
Handling Pagination
def scrape_indeed_jobs(query, location, max_pages=5):
    """Scrape multiple pages of Indeed results.

    Args:
        query: Search keywords.
        location: Location filter (e.g. "Remote").
        max_pages: Upper bound on pages fetched (10 results per page).

    Returns:
        A URL-deduplicated list of JobListing objects across all pages.
    """
    session = create_session()
    collected = []
    seen_urls = set()
    for page_index in range(max_pages):
        offset = page_index * 10  # Indeed paginates in steps of 10
        print(f"Scraping page {page_index + 1} (start={offset})...")
        page_jobs = scrape_indeed_page(session, query, location, offset)
        if not page_jobs:
            # An empty page means we've run out of results.
            print("No more results found.")
            break
        # Keep only listings whose URL we haven't seen yet.
        fresh = []
        for listing in page_jobs:
            if listing.job_url in seen_urls:
                continue
            seen_urls.add(listing.job_url)
            fresh.append(listing)
        collected.extend(fresh)
        print(f" Found {len(fresh)} new jobs ({len(collected)} total)")
        # Jittered pause between pages to look less bot-like.
        time.sleep(random.uniform(3, 7))
    return collected
Deduplication Strategy
Job boards often show the same listing multiple times. Robust deduplication is critical:
from difflib import SequenceMatcher
def deduplicate_jobs(jobs, similarity_threshold=0.85):
    """Remove near-duplicate job listings.

    Two listings are considered duplicates when the fuzzy similarity of
    their combined "title company location" strings meets the threshold.

    Args:
        jobs: Iterable of JobListing-like objects.
        similarity_threshold: SequenceMatcher ratio at or above which a
            listing counts as a duplicate of one already kept.

    Returns:
        A new list with the first occurrence of each near-duplicate group.
    """
    def fingerprint(listing):
        # Combined comparison key for fuzzy matching.
        return f"{listing.title} {listing.company} {listing.location}"

    unique = []
    kept_keys = []
    for job in jobs:
        key = fingerprint(job)
        already_seen = any(
            SequenceMatcher(None, key, kept).ratio() >= similarity_threshold
            for kept in kept_keys
        )
        if not already_seen:
            unique.append(job)
            kept_keys.append(key)
    print(f"Deduplicated: {len(jobs)} -> {len(unique)} listings")
    return unique
Exporting Results
def export_to_csv(jobs, filename="indeed_jobs.csv"):
    """Write job listings to a CSV file, one row per listing.

    Args:
        jobs: List of JobListing dataclass instances; field names of the
            first element become the CSV header.
        filename: Destination path; overwritten if it already exists.
    """
    if not jobs:
        # Nothing to write; avoid creating an empty/headerless file.
        print("No jobs to export.")
        return
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=asdict(jobs[0]).keys())
        writer.writeheader()
        writer.writerows(asdict(job) for job in jobs)
    # Bug fix: original printed the literal "(unknown)" instead of the path.
    print(f"Exported {len(jobs)} jobs to {filename}")
def export_to_json(jobs, filename="indeed_jobs.json"):
    """Write job listings to a JSON array file (UTF-8, human-readable).

    Args:
        jobs: List of JobListing dataclass instances; serialized via asdict.
        filename: Destination path; overwritten if it already exists.
    """
    with open(filename, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII job titles readable in the output.
        json.dump([asdict(j) for j in jobs], f, indent=2, ensure_ascii=False)
    # Bug fix: original printed the literal "(unknown)" instead of the path.
    print(f"Exported {len(jobs)} jobs to {filename}")
Running the Full Pipeline
if __name__ == "__main__":
    # Full pipeline: scrape -> fuzzy-deduplicate -> export -> summarize.
    results = scrape_indeed_jobs(
        query="python developer",
        location="Remote",
        max_pages=5,
    )
    results = deduplicate_jobs(results)
    export_to_csv(results)
    export_to_json(results)

    # Quick stats on salary coverage.
    with_salary = [job for job in results if job.salary]
    print(f"\nTotal unique jobs: {len(results)}")
    print(f"Jobs with salary info: {len(with_salary)}")
Alternative: Pre-Built Job Scrapers
Building and maintaining a job scraper is time-consuming. For production use, consider tools like the LinkedIn Jobs Scraper on Apify which handles anti-bot measures, pagination, and data normalization out of the box.
Handling Rate Limiting
Indeed aggressively rate-limits scrapers. For reliable data collection, use a proxy service like ScrapeOps which provides rotating proxies and header management tailored for job board scraping.
Legal Considerations
- Indeed's Terms of Service prohibit automated scraping
- The hiQ v. LinkedIn litigation offers only limited precedent: the Ninth Circuit held (reaffirmed in 2022 after a Supreme Court remand) that scraping public data likely does not violate the CFAA, but hiQ ultimately lost on breach-of-contract grounds
- Always consult legal counsel before scraping at scale
- Consider using official APIs or data partnerships when available
Conclusion
Indeed job scraping with Python is straightforward technically but requires careful handling of rate limits, deduplication, and anti-bot measures. Start small, respect rate limits, and consider your legal obligations before scaling up.
Top comments (0)