DEV Community

agenthustler
agenthustler

Posted on

Indeed Job Scraping: Extract Job Listings with Python in 2026

Indeed hosts millions of job listings worldwide, making it a goldmine for job market analysis, salary research, and recruitment automation. This guide shows you how to extract job listings from Indeed using Python, handle pagination, and clean the data for analysis.

What Data Can You Extract?

From Indeed job listings, you can collect:

  • Job titles and company names
  • Salary ranges (when posted)
  • Job descriptions and requirements
  • Location and remote/hybrid indicators
  • Posting dates and application links
  • Company ratings and review counts

Setting Up the Scraper

import requests
from bs4 import BeautifulSoup
import json
import time
import csv
import random
from dataclasses import dataclass, asdict
from typing import Optional

@dataclass
class JobListing:
    """A single normalized job posting scraped from an Indeed results page.

    One instance corresponds to one job card; fields map 1:1 to the CSV/JSON
    export columns produced by export_to_csv / export_to_json.
    """
    title: str                  # job title text from the card heading
    company: str                # employer name ("N/A" if the element was missing)
    location: str               # city/state or "Remote" ("N/A" if missing)
    salary: Optional[str]       # raw salary snippet; None when not posted
    summary: str                # short description snippet from the card
    date_posted: str            # relative date string as shown (e.g. "3 days ago")
    job_url: str                # absolute link to the full posting
    source: str = "indeed"      # provenance tag, useful when merging multiple boards

def create_session():
    """Build a requests.Session with browser-like default headers.

    Reusing one session keeps cookies and connection pooling across
    requests, which both speeds up paging and looks less bot-like.
    """
    browser_headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0.0.0 Safari/537.36"
        ),
        "Accept-Language": "en-US,en;q=0.9",
        "Accept": "text/html,application/xhtml+xml",
    }
    session = requests.Session()
    session.headers.update(browser_headers)
    return session
Enter fullscreen mode Exit fullscreen mode

Scraping Job Listings

def scrape_indeed_page(session, query, location, start=0, timeout=30):
    """Scrape a single page of Indeed search results.

    Args:
        session: a requests.Session (see create_session) used for the GET.
        query: search keywords (mapped to Indeed's "q" parameter).
        location: location filter (mapped to "l"); e.g. "Remote".
        start: result offset for pagination (Indeed pages in steps of 10).
        timeout: per-request timeout in seconds. Without this, a stalled
            connection would hang the whole scraper indefinitely — requests
            has no default timeout.

    Returns:
        A list of JobListing objects; empty on a non-200 response or when
        no job cards are found. Network errors (requests exceptions)
        propagate to the caller.
    """
    params = {
        "q": query,
        "l": location,
        "start": start,
        "fromage": 7,  # only listings posted within the last 7 days
    }

    url = "https://www.indeed.com/jobs"
    # BUGFIX: always pass a timeout — requests.get blocks forever otherwise.
    response = session.get(url, params=params, timeout=timeout)

    if response.status_code != 200:
        print(f"Error: HTTP {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    jobs = []

    # Each result card lives in a div.job_seen_beacon container
    # (selector valid as of this article's markup; Indeed changes it often).
    job_cards = soup.select("div.job_seen_beacon")

    for card in job_cards:
        try:
            # Title + link share the same anchor element.
            title_el = card.select_one("h2.jobTitle a")
            title = title_el.get_text(strip=True) if title_el else "N/A"
            job_link = title_el.get("href", "") if title_el else ""

            # Company
            company_el = card.select_one("span[data-testid='company-name']")
            company = company_el.get_text(strip=True) if company_el else "N/A"

            # Location
            loc_el = card.select_one("div[data-testid='text-location']")
            loc = loc_el.get_text(strip=True) if loc_el else "N/A"

            # Salary is optional — None signals "not posted".
            salary_el = card.select_one("div.salary-snippet-container")
            salary = salary_el.get_text(strip=True) if salary_el else None

            # Summary
            summary_el = card.select_one("div.job-snippet")
            summary = summary_el.get_text(strip=True) if summary_el else ""

            # Date
            date_el = card.select_one("span.date")
            date_posted = date_el.get_text(strip=True) if date_el else "N/A"

            # Card links are site-relative; make them absolute for export.
            full_url = f"https://www.indeed.com{job_link}" if job_link.startswith("/") else job_link

            jobs.append(JobListing(
                title=title,
                company=company,
                location=loc,
                salary=salary,
                summary=summary,
                date_posted=date_posted,
                job_url=full_url,
            ))
        except Exception as e:
            # One malformed card should not abort the whole page.
            print(f"Error parsing job card: {e}")
            continue

    return jobs
Enter fullscreen mode Exit fullscreen mode

Handling Pagination

def scrape_indeed_jobs(query, location, max_pages=5):
    """Scrape up to max_pages pages of Indeed results for a query.

    Args:
        query: search keywords, e.g. "python developer".
        location: location filter, e.g. "Remote".
        max_pages: maximum number of result pages to fetch (10 jobs/page).

    Returns:
        A list of JobListing objects, URL-deduplicated across pages.
        Stops early when a page yields no results.
    """
    session = create_session()
    all_jobs = []
    seen_urls = set()

    for page in range(max_pages):
        start = page * 10  # Indeed paginates in steps of 10
        print(f"Scraping page {page + 1} (start={start})...")

        jobs = scrape_indeed_page(session, query, location, start)

        if not jobs:
            print("No more results found.")
            break

        # Deduplicate by URL across pages (listings repeat between pages).
        new_jobs = []
        for job in jobs:
            if job.job_url not in seen_urls:
                seen_urls.add(job.job_url)
                new_jobs.append(job)

        all_jobs.extend(new_jobs)
        print(f"  Found {len(new_jobs)} new jobs ({len(all_jobs)} total)")

        # Random delay between pages to avoid rate limiting.
        # BUGFIX: don't sleep after the final page — there is no next
        # request to pace, so the old code wasted 3-7 seconds per run.
        if page < max_pages - 1:
            time.sleep(random.uniform(3, 7))

    return all_jobs
Enter fullscreen mode Exit fullscreen mode

Deduplication Strategy

Job boards often show the same listing multiple times. Robust deduplication is critical:

from difflib import SequenceMatcher

def deduplicate_jobs(jobs, similarity_threshold=0.85):
    """Remove near-duplicate job listings via fuzzy key matching.

    Two listings are considered duplicates when the similarity ratio of
    their "title company location" keys meets similarity_threshold
    (0.0-1.0; identical strings score 1.0). The first occurrence wins.

    Performance fix vs. the naive version: each unique listing's key is
    built once and cached, instead of being rebuilt with an f-string for
    every pairwise comparison (which made the inner loop needlessly do
    O(n^2) string construction). An exact-match fast path also skips the
    quadratic SequenceMatcher scan for byte-identical repeats.

    Args:
        jobs: iterable of objects with .title, .company, .location attrs.
        similarity_threshold: minimum ratio to treat two keys as the same.

    Returns:
        A new list of unique listings, in first-seen order.
    """
    unique = []
    unique_keys = []   # cached keys, parallel to `unique`
    exact_seen = set() # keys of listings already kept, for the fast path

    for job in jobs:
        job_key = f"{job.title} {job.company} {job.location}"

        # Fast path: an identical key scores ratio 1.0, so for any sane
        # threshold (<= 1.0) it is guaranteed to be flagged a duplicate.
        if job_key in exact_seen and similarity_threshold <= 1.0:
            continue

        is_duplicate = any(
            SequenceMatcher(None, job_key, existing_key).ratio() >= similarity_threshold
            for existing_key in unique_keys
        )

        if not is_duplicate:
            unique.append(job)
            unique_keys.append(job_key)
            exact_seen.add(job_key)

    print(f"Deduplicated: {len(jobs)} -> {len(unique)} listings")
    return unique
Enter fullscreen mode Exit fullscreen mode

Exporting Results

def export_to_csv(jobs, filename="indeed_jobs.csv"):
    """Write job listings to a CSV file with one row per listing.

    Column names are taken from the dataclass fields of the first job,
    so all jobs are assumed to be instances of the same dataclass.

    Args:
        jobs: list of JobListing dataclass instances (may be empty).
        filename: output path; overwritten if it exists.
    """
    if not jobs:
        print("No jobs to export.")
        return

    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=asdict(jobs[0]).keys())
        writer.writeheader()
        for job in jobs:
            writer.writerow(asdict(job))

    # BUGFIX: the message printed a literal "(unknown)" instead of the
    # actual output path — the filename variable was never interpolated.
    print(f"Exported {len(jobs)} jobs to {filename}")

def export_to_json(jobs, filename="indeed_jobs.json"):
    """Write job listings to a JSON file as a list of objects.

    Args:
        jobs: list of JobListing dataclass instances (may be empty,
            producing an empty JSON array).
        filename: output path; overwritten if it exists.
    """
    with open(filename, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII titles/locations readable.
        json.dump([asdict(j) for j in jobs], f, indent=2, ensure_ascii=False)
    # BUGFIX: the message printed a literal "(unknown)" instead of the
    # actual output path — the filename variable was never interpolated.
    print(f"Exported {len(jobs)} jobs to {filename}")
Enter fullscreen mode Exit fullscreen mode

Running the Full Pipeline

if __name__ == "__main__":
    # Full pipeline: scrape -> fuzzy-dedupe -> export -> report stats.
    scraped = scrape_indeed_jobs(
        query="python developer",
        location="Remote",
        max_pages=5,
    )

    jobs = deduplicate_jobs(scraped)

    export_to_csv(jobs)
    export_to_json(jobs)

    # Quick stats: how many listings actually include salary data.
    with_salary = sum(1 for j in jobs if j.salary)
    print(f"\nTotal unique jobs: {len(jobs)}")
    print(f"Jobs with salary info: {with_salary}")
Enter fullscreen mode Exit fullscreen mode

Alternative: Pre-Built Job Scrapers

Building and maintaining a job scraper is time-consuming. For production use, consider tools like the LinkedIn Jobs Scraper on Apify which handles anti-bot measures, pagination, and data normalization out of the box.

Handling Rate Limiting

Indeed aggressively rate-limits scrapers. For reliable data collection, use a proxy service like ScrapeOps which provides rotating proxies and header management tailored for job board scraping.

Legal Considerations

  • Indeed's Terms of Service prohibit automated scraping
  • The hiQ v. LinkedIn ruling (Ninth Circuit Court of Appeals, 2022, after the Supreme Court vacated and remanded the earlier decision) provides some legal precedent for scraping publicly accessible data
  • Always consult legal counsel before scraping at scale
  • Consider using official APIs or data partnerships when available

Conclusion

Indeed job scraping with Python is straightforward technically but requires careful handling of rate limits, deduplication, and anti-bot measures. Start small, respect rate limits, and consider your legal obligations before scaling up.

Top comments (0)