Indeed is the largest job board in the world with over 350 million unique visitors per month. Whether you're building a job aggregator, tracking salary trends, or doing labor market research, Indeed's data is incredibly valuable.
But Indeed doesn't offer a public API. And their anti-bot systems are among the most aggressive I've tested.
In this guide, I'll show you how to scrape Indeed job listings, salary data, and company reviews using Python — and how to handle the anti-bot measures that will try to stop you.
What Data Can You Extract from Indeed?
Indeed has three main data types worth scraping:
- Job listings: Title, company, location, salary range, job type (full-time/part-time/contract), posted date, job description
- Salary data: Average salaries by job title and location, salary ranges, pay transparency info
- Company reviews: Overall rating, work-life balance, compensation, management, culture scores, review text
The Anti-Bot Problem
Indeed uses multiple layers of protection:
- Cloudflare WAF — blocks suspicious request patterns
- JavaScript challenges — requires browser-like JS execution
- Rate limiting — aggressive throttling after a few dozen requests
- CAPTCHA walls — triggered by unusual traffic patterns
- Session fingerprinting — tracks browser characteristics across requests
A simple requests.get() will get you blocked within 5-10 requests. You need a real strategy.
Method 1: Basic Scraping with requests + BeautifulSoup
This works for small-scale scraping (under 50 pages). You'll need proper headers and delays.
import requests
from bs4 import BeautifulSoup
import time
import random
import json
def scrape_indeed_jobs(query: str, location: str, pages: int = 3) -> list[dict]:
    """Scrape Indeed job listings for a given query and location.

    Args:
        query: Search keywords, e.g. "python developer".
        location: Location string, e.g. "New York, NY".
        pages: Number of result pages to fetch (Indeed serves 10 jobs/page).

    Returns:
        A list of job dicts (title, company, location, salary, job_type,
        snippet, posted, url). Stops early if a request fails.
    """
    jobs: list[dict] = []
    base_url = "https://www.indeed.com/jobs"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/125.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Referer": "https://www.indeed.com/",
        "DNT": "1",
    }
    session = requests.Session()
    session.headers.update(headers)
    # Visit homepage first to get cookies — cookie-less searches get
    # flagged by Indeed's anti-bot layer much sooner.
    session.get("https://www.indeed.com", timeout=10)
    time.sleep(random.uniform(2, 4))
    for page in range(pages):
        params = {
            "q": query,
            "l": location,
            "start": page * 10,  # Indeed paginates in steps of 10
            "sort": "date",
        }
        try:
            response = session.get(base_url, params=params, timeout=15)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Request failed on page {page}: {e}")
            break
        soup = BeautifulSoup(response.text, "html.parser")
        # Indeed embeds job data in a script tag as JSON
        script_tag = soup.find("script", {"id": "mosaic-data"})
        # NOTE: .string is None when the tag has nested children; without
        # this guard json.loads(None) raises an uncaught TypeError.
        if script_tag and script_tag.string:
            jobs.extend(parse_mosaic_data(script_tag.string))
        else:
            # Fallback: parse HTML directly
            job_cards = soup.select("div.job_seen_beacon")
            for card in job_cards:
                job = extract_job_from_card(card)
                if job:
                    jobs.append(job)
        # Random delay between pages (critical for avoiding blocks)
        time.sleep(random.uniform(3, 7))
    return jobs
def extract_job_from_card(card) -> dict | None:
"""Extract job data from an Indeed job card HTML element."""
title_elem = card.select_one("h2.jobTitle a")
company_elem = card.select_one('span[data-testid="company-name"]')
location_elem = card.select_one('div[data-testid="text-location"]')
salary_elem = card.select_one("div.salary-snippet-container")
snippet_elem = card.select_one("div.job-snippet")
date_elem = card.select_one("span.date")
job_type_elem = card.select_one('div[data-testid="attribute_snippet_testid"]')
if not title_elem:
return None
return {
"title": title_elem.get_text(strip=True),
"company": company_elem.get_text(strip=True) if company_elem else None,
"location": location_elem.get_text(strip=True) if location_elem else None,
"salary": salary_elem.get_text(strip=True) if salary_elem else None,
"job_type": job_type_elem.get_text(strip=True) if job_type_elem else None,
"snippet": snippet_elem.get_text(strip=True) if snippet_elem else None,
"posted": date_elem.get_text(strip=True) if date_elem else None,
"url": "https://www.indeed.com" + title_elem["href"] if title_elem.get("href") else None,
}
def parse_mosaic_data(script_content: str) -> list[dict]:
    """Parse the mosaic-data JSON that Indeed embeds in search pages.

    Args:
        script_content: Raw text content of the `mosaic-data` <script> tag.

    Returns:
        A list of job dicts; empty when the payload is missing or malformed.
    """
    jobs: list[dict] = []
    try:
        data = json.loads(script_content)
        results = (
            data.get("mosaic", {})
            .get("providerData", {})
            .get("jobListing", {})
            .get("results", [])
        )
        for result in results:
            # `salarySnippet` can be present but null in real payloads;
            # `.get("salarySnippet", {})` would then return None and the
            # chained .get would raise AttributeError. `or {}` covers both
            # the missing-key and explicit-null cases.
            salary = (result.get("salarySnippet") or {}).get("text")
            job_types = result.get("jobTypes")
            jobs.append({
                "title": result.get("title"),
                "company": result.get("company"),
                "location": result.get("formattedLocation"),
                "salary": salary,
                "job_type": job_types[0] if job_types else None,
                "snippet": result.get("snippet"),
                "posted": result.get("formattedRelativeTime"),
                "url": f"https://www.indeed.com/viewjob?jk={result.get('jobkey')}",
            })
    except (json.JSONDecodeError, KeyError, TypeError, AttributeError):
        # Unexpected payload shape: return whatever parsed cleanly so far.
        pass
    return jobs
# Usage
if __name__ == "__main__":
    found_jobs = scrape_indeed_jobs("python developer", "New York, NY", pages=3)
    print(f"Found {len(found_jobs)} jobs")
    # Preview the first few results.
    for job in found_jobs[:5]:
        print(f" {job['title']} at {job['company']} — {job['salary'] or 'No salary listed'}")
Method 2: Scraping Salary Pages
Indeed has dedicated salary pages at indeed.com/career/{job-title}/salaries. These are less protected than job search:
def scrape_indeed_salaries(job_title: str, location: str = "") -> dict:
    """Scrape salary data for a specific job title from Indeed.

    Fetches indeed.com/career/<slug>/salaries (optionally location-scoped)
    and returns a dict with the average, range, and top-paying companies.
    """
    title_slug = job_title.lower().replace(" ", "-")
    url = f"https://www.indeed.com/career/{title_slug}/salaries"
    if location:
        # Location segment: lowercase, hyphenated, commas dropped,
        # double hyphens collapsed.
        loc_segment = "/" + location.lower().replace(" ", "-").replace(",", "")
        url += loc_segment.replace("--", "-")
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/125.0.0.0 Safari/537.36",
    }
    page = requests.get(url, headers=headers, timeout=15)
    soup = BeautifulSoup(page.text, "html.parser")

    salary_data: dict = {"job_title": job_title, "location": location}
    # Single-valued fields share the same extract-if-present pattern.
    for key, selector in (
        ("average_salary", 'div[data-testid="avg-salary-value"]'),
        ("salary_range", 'div[data-testid="salary-range"]'),
    ):
        node = soup.select_one(selector)
        if node:
            salary_data[key] = node.get_text(strip=True)
    # Top-paying companies (capped at five).
    company_nodes = soup.select('a[data-testid="top-company-salary"] span')
    salary_data["top_companies"] = [n.get_text(strip=True) for n in company_nodes[:5]]
    return salary_data
# Usage
salary = scrape_indeed_salaries("software engineer", "San Francisco CA")
for label, key in (("Average", "average_salary"), ("Range", "salary_range")):
    print(f"{label}: {salary.get(key, 'N/A')}")
Method 3: Scraping Company Reviews
Company reviews live at indeed.com/cmp/{company-name}/reviews:
def scrape_indeed_reviews(company_slug: str, pages: int = 3) -> list[dict]:
    """Scrape company reviews from Indeed (indeed.com/cmp/<slug>/reviews)."""

    def text_or_none(card, selector):
        # Stripped text of the first match inside the card, or None.
        node = card.select_one(selector)
        return node.get_text(strip=True) if node else None

    collected: list[dict] = []
    reviews_url = f"https://www.indeed.com/cmp/{company_slug}/reviews"
    session = requests.Session()
    session.headers.update({
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/125.0.0.0 Safari/537.36",
    })
    for page_num in range(1, pages + 1):
        # Reviews paginate 20 per page via the `start` offset.
        resp = session.get(reviews_url, params={"start": (page_num - 1) * 20}, timeout=15)
        soup = BeautifulSoup(resp.text, "html.parser")
        for card in soup.select('div[data-testid="review-card"]'):
            stars = card.select_one('button[aria-label*="out of 5 stars"]')
            collected.append({
                "title": text_or_none(card, 'h2[data-testid="review-title"]'),
                # Rating lives in the aria-label, e.g. "4.0 out of 5 stars".
                "rating": stars["aria-label"].split()[0] if stars else None,
                "text": text_or_none(card, 'span[data-testid="review-text"]'),
                "author": text_or_none(card, 'span[data-testid="review-author"]'),
                "date": text_or_none(card, 'span[data-testid="review-date"]'),
            })
        time.sleep(random.uniform(3, 6))
    return collected
Scaling Up: When Basic Scraping Isn't Enough
The code above works for small projects — maybe 100-200 pages per day. But if you need to track thousands of job listings across multiple cities, you'll hit Indeed's anti-bot walls fast.
The main problems at scale:
- IP bans after 50-100 requests from the same IP
- CAPTCHA challenges that break automated flows
- JavaScript rendering that `requests` can't handle
- Session invalidation that kills your cookies mid-scrape
Using ScraperAPI for Scale
ScraperAPI handles the hard parts — proxy rotation, CAPTCHA solving, browser fingerprinting — so you can focus on parsing the data:
SCRAPERAPI_KEY = "your_api_key_here"

def scrape_with_scraperapi(url: str) -> str:
    """Fetch a page through ScraperAPI with anti-bot bypass.

    Args:
        url: The target page URL (e.g. an Indeed search URL).

    Returns:
        The rendered HTML of the target page.

    Raises:
        requests.HTTPError: When ScraperAPI reports a failed fetch.
    """
    params = {
        "api_key": SCRAPERAPI_KEY,
        "url": url,
        "render": "true",  # JavaScript rendering
        "country_code": "us",  # US residential IPs
        "premium": "true",  # Premium proxy pool (needed for Indeed)
    }
    # HTTPS, not HTTP: the api_key travels as a query parameter and must
    # not be sent in cleartext.
    response = requests.get("https://api.scraperapi.com", params=params, timeout=60)
    # Fail loudly rather than returning an API error page as "data".
    response.raise_for_status()
    return response.text
# Use it in your scraping function
html = scrape_with_scraperapi("https://www.indeed.com/jobs?q=data+engineer&l=Austin%2C+TX")
soup = BeautifulSoup(html, "html.parser")
# ... parse as before
ScraperAPI automatically rotates through millions of residential IPs, solves CAPTCHAs, and handles JavaScript rendering. You pay per successful request instead of managing proxy infrastructure yourself.
Storing and Analyzing Job Data
Once you have the data, store it properly for analysis:
import csv
from datetime import datetime
def save_jobs_to_csv(jobs: list[dict], filename: str = "indeed_jobs.csv"):
    """Append scraped jobs to a CSV file, stamping each row with scrape time.

    Args:
        jobs: Job dicts as produced by the scraping functions above.
        filename: Target CSV path; created on first call, appended to after.
    """
    fieldnames = [
        "title", "company", "location", "salary",
        "job_type", "snippet", "posted", "url", "scraped_at",
    ]
    with open(filename, "a", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        # Write the header only when the file is empty (first run).
        if f.tell() == 0:
            writer.writeheader()
        for job in jobs:
            # Copy rather than mutating the caller's dict in place.
            writer.writerow({**job, "scraped_at": datetime.now().isoformat()})
    # Fixed: message previously printed a broken "(unknown)" placeholder
    # instead of the actual filename.
    print(f"Saved {len(jobs)} jobs to {filename}")
Key Takeaways
- Start with the embedded JSON — Indeed's `mosaic-data` script tag is more reliable than parsing HTML
- Rotate user agents and add delays — minimum 3-5 seconds between requests
- Use sessions — visit the homepage first to collect cookies before searching
- Salary and review pages are less protected than job search — start there if you need that data
- For anything over 200 pages/day, use a proxy service like ScraperAPI to handle anti-bot detection
Related Tools
If you're doing job market research at scale, check out the LinkedIn Jobs Scraper on Apify — it handles LinkedIn's anti-bot measures and returns structured data you can combine with your Indeed scrapes.
Need to scrape other job boards? I'm building more scrapers on my Apify profile. Follow for updates.
Top comments (0)