Indeed is the largest job aggregator, with over 300 million unique visitors per month. Whether you are building a job board aggregator, analyzing salary trends, or monitoring hiring activity in specific industries, Indeed data is incredibly valuable.
This guide shows you how to extract job listings, salary data, and company reviews from Indeed in 2026 — with working code, honest limitations, and practical workarounds.
What Data Can You Get from Indeed?
Indeed exposes several types of data:
- Job listings: title, company, location, salary (when posted), description, date posted
- Salary data: reported salaries by job title and location
- Company reviews: ratings, pros/cons, CEO approval
- Job trends: hiring velocity by industry and region
The most common use case is job listing aggregation — pulling thousands of postings for a specific role or location and analyzing salary ranges, required skills, or remote work availability.
Method 1: Scraping Indeed Search Results with Requests
Indeed search results load server-side, which means you can get listing data with plain HTTP requests — no browser automation needed for the initial results.
import requests
from bs4 import BeautifulSoup
import time
import re
def scrape_indeed_jobs(query: str, location: str, num_pages: int = 5) -> list:
    """Scrape Indeed job listings for a given query and location.

    Args:
        query: Search keywords, e.g. "python developer".
        location: Location string, e.g. "New York, NY".
        num_pages: Maximum number of result pages to fetch (10 jobs/page).

    Returns:
        A list of dicts with keys: title, url, company, location, salary,
        snippet, posted, job_key. Missing fields are set to None, so
        downstream code can rely on every key being present.
    """
    base_url = "https://www.indeed.com/jobs"
    all_jobs = []
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0.0.0 Safari/537.36"
        ),
        "Accept": "text/html,application/xhtml+xml",
        "Accept-Language": "en-US,en;q=0.9",
    }
    for page in range(num_pages):
        params = {
            "q": query,
            "l": location,
            "start": page * 10,  # Indeed uses 10 results per page
            "sort": "date",
        }
        # Explicit timeout: requests has no default, so a stalled
        # connection would otherwise hang the scraper forever.
        resp = requests.get(
            base_url, params=params, headers=headers, timeout=30
        )
        if resp.status_code != 200:
            print(f"Page {page} failed with status {resp.status_code}")
            break
        soup = BeautifulSoup(resp.text, "html.parser")
        # Indeed wraps each job in a div with data-jk attribute (job key)
        job_cards = soup.find_all("div", attrs={"data-jk": True})
        for card in job_cards:
            # Pre-populate every key so consumers never hit a KeyError
            # when a field is missing from a card.
            job = {
                "title": None,
                "url": None,
                "company": None,
                "location": None,
                "salary": None,
                "snippet": None,
                "posted": None,
                "job_key": card.get("data-jk"),  # for deduplication
            }
            # Job title + link to the detail page
            title_el = card.find("h2", class_=re.compile("jobTitle"))
            if title_el:
                link = title_el.find("a")
                job["title"] = (
                    link.get_text(strip=True) if link
                    else title_el.get_text(strip=True)
                )
                if link and link.get("href"):
                    job["url"] = "https://www.indeed.com" + link["href"]
            # Company name
            company_el = card.find(
                "span", attrs={"data-testid": "company-name"}
            )
            if company_el:
                job["company"] = company_el.get_text(strip=True)
            # Location
            location_el = card.find(
                "div", attrs={"data-testid": "text-location"}
            )
            if location_el:
                job["location"] = location_el.get_text(strip=True)
            # Salary (only present when the poster disclosed it)
            salary_el = card.find("div", class_=re.compile("salary"))
            if salary_el:
                job["salary"] = salary_el.get_text(strip=True)
            # Job snippet / description preview
            snippet_el = card.find("div", class_=re.compile("job-snippet"))
            if snippet_el:
                job["snippet"] = snippet_el.get_text(strip=True)
            # Relative posting date, e.g. "3 days ago"
            date_el = card.find("span", class_=re.compile("date"))
            if date_el:
                job["posted"] = date_el.get_text(strip=True)
            all_jobs.append(job)
        print(f"Page {page + 1}: found {len(job_cards)} jobs")
        time.sleep(2)  # Respect rate limits
    return all_jobs


# Example usage
jobs = scrape_indeed_jobs("python developer", "New York, NY", num_pages=3)
for job in jobs[:5]:
    print(f"{job['title']} at {job['company']}")
    print(f"  Location: {job['location']}")
    print(f"  Salary: {job['salary'] or 'Not listed'}")
    print(f"  Posted: {job['posted']}")
    print()
Getting Full Job Descriptions
The search results only show a snippet. To get the full job description, you need to fetch each individual job page:
def get_full_job_description(job_url: str) -> dict:
    """Fetch the complete job description from an Indeed job page.

    Args:
        job_url: Absolute URL of an Indeed job detail page.

    Returns:
        A dict that may contain "description", "details" (list of
        strings) and "salary_detail"; empty when the request fails.
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36"
        )
    }
    # Timeout so one stalled connection cannot hang the crawl.
    resp = requests.get(job_url, headers=headers, timeout=30)
    result = {}
    # Bail out on error pages (CAPTCHA walls, 404s) instead of
    # parsing an error body as if it were a job posting.
    if resp.status_code != 200:
        return result
    soup = BeautifulSoup(resp.text, "html.parser")
    # Full description
    desc_el = soup.find("div", id="jobDescriptionText")
    if desc_el:
        result["description"] = desc_el.get_text(
            separator="\n", strip=True
        )
    # Job details (type, schedule, benefits)
    details = soup.find_all(
        "div", attrs={"data-testid": re.compile("jobDetail")}
    )
    result["details"] = [d.get_text(strip=True) for d in details]
    # Salary details (sometimes more specific on the detail page)
    salary_section = soup.find("div", id="salaryInfoAndJobType")
    if salary_section:
        result["salary_detail"] = salary_section.get_text(strip=True)
    return result


# Fetch full descriptions for top results
for job in jobs[:3]:
    if job.get("url"):
        details = get_full_job_description(job["url"])
        print(f"\n=== {job['title']} at {job['company']} ===")
        desc = details.get("description", "")
        print(desc[:500] + "..." if len(desc) > 500 else desc)
        time.sleep(2)
Method 2: Playwright for Dynamic Content
Indeed has been progressively moving to client-side rendering for some features. If the requests approach starts returning incomplete data, Playwright is your fallback:
import asyncio
from playwright.async_api import async_playwright
async def scrape_indeed_with_browser(
    query: str, location: str, max_pages: int = 3
) -> list:
    """Scrape Indeed search results with a real (headless) browser.

    Use this fallback when the plain-requests approach starts
    returning incomplete, client-side-rendered pages.

    Args:
        query: Search keywords (spaces/commas are fine — encoded below).
        location: Location string, e.g. "New York, NY" or "Remote".
        max_pages: Number of result pages to visit (10 jobs per page).

    Returns:
        A list of dicts with title, company, location, salary, job_key.
    """
    # Stdlib import kept local so the snippet stays self-contained.
    from urllib.parse import urlencode

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            viewport={"width": 1920, "height": 1080},
            user_agent=(
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/120.0.0.0 Safari/537.36"
            )
        )
        page = await context.new_page()
        all_jobs = []
        for page_num in range(max_pages):
            # Properly URL-encode the query string: raw f-string
            # interpolation breaks on spaces and commas
            # (e.g. "New York, NY").
            qs = urlencode({
                "q": query,
                "l": location,
                "start": page_num * 10,
            })
            url = f"https://www.indeed.com/jobs?{qs}"
            await page.goto(url, wait_until="domcontentloaded")
            await page.wait_for_timeout(2000)  # let JS render the cards
            jobs = await page.evaluate("""
                () => {
                    const cards = document.querySelectorAll('[data-jk]');
                    return Array.from(cards).map(card => {
                        const titleEl = card.querySelector('h2 a, h2 span');
                        const companyEl = card.querySelector(
                            '[data-testid="company-name"]'
                        );
                        const locationEl = card.querySelector(
                            '[data-testid="text-location"]'
                        );
                        const salaryEl = card.querySelector(
                            '[class*="salary"]'
                        );
                        return {
                            title: titleEl?.textContent?.trim() || null,
                            company: companyEl?.textContent?.trim() || null,
                            location: locationEl?.textContent?.trim() || null,
                            salary: salaryEl?.textContent?.trim() || null,
                            job_key: card.getAttribute('data-jk')
                        };
                    });
                }
            """)
            all_jobs.extend(jobs)
            print(f"Page {page_num + 1}: {len(jobs)} jobs")
            await asyncio.sleep(3)
        await browser.close()
        return all_jobs


jobs = asyncio.run(
    scrape_indeed_with_browser("data engineer", "Remote")
)
Scraping Indeed Salary Data
Indeed has a dedicated salary section that aggregates reported salaries by job title and location. This data is useful for market research:
def scrape_indeed_salaries(job_title: str, location: str = "") -> dict:
    """Scrape salary data for a specific job title from Indeed.

    Args:
        job_title: Job title, e.g. "Python Developer".
        location: Optional "City, ST" string to narrow the results.

    Returns:
        A dict with the queried title/location, always containing
        "top_companies" (possibly empty), plus "average_salary" and
        "salary_range" when found on the page.
    """
    # Indeed career pages use hyphenated lowercase slugs.
    slug = job_title.lower().replace(" ", "-")
    url = f"https://www.indeed.com/career/{slug}/salaries"
    if location:
        loc_slug = location.lower().replace(", ", "-").replace(" ", "-")
        url += f"/{loc_slug}"
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36"
        )
    }
    # Timeout keeps a dead connection from blocking the caller.
    resp = requests.get(url, headers=headers, timeout=30)
    salary_data = {"job_title": job_title, "location": location}
    if resp.status_code != 200:
        # Unknown title/location slug or a CAPTCHA wall: return the
        # skeleton rather than parsing an error page.
        salary_data["top_companies"] = []
        return salary_data
    soup = BeautifulSoup(resp.text, "html.parser")
    # Headline average salary
    avg_el = soup.find("div", attrs={"data-testid": "avg-salary"})
    if avg_el:
        salary_data["average_salary"] = avg_el.get_text(strip=True)
    # Salary range widgets (low/high)
    range_els = soup.find_all(
        "div", attrs={"data-testid": re.compile("salary-range")}
    )
    if range_els:
        salary_data["salary_range"] = [
            el.get_text(strip=True) for el in range_els
        ]
    # Top paying companies (capped at 10 entries)
    companies = soup.find_all(
        "div", attrs={"data-testid": "company-salary"}
    )
    salary_data["top_companies"] = []
    for comp in companies[:10]:
        name_el = comp.find("a")
        pay_el = comp.find("span", class_=re.compile("salary"))
        if name_el and pay_el:
            salary_data["top_companies"].append({
                "company": name_el.get_text(strip=True),
                "avg_salary": pay_el.get_text(strip=True)
            })
    return salary_data


# Example: check Python developer salaries in San Francisco
salaries = scrape_indeed_salaries("Python Developer", "San Francisco, CA")
print(f"Average: {salaries.get('average_salary', 'N/A')}")
for comp in salaries.get("top_companies", [])[:5]:
    print(f"  {comp['company']}: {comp['avg_salary']}")
Scraping Company Reviews
Indeed company review pages contain ratings, written reviews, and pros/cons:
def scrape_company_reviews(company_slug: str, num_pages: int = 3) -> list:
    """Scrape Indeed company reviews.

    Args:
        company_slug: Company path segment as used on indeed.com/cmp/,
            e.g. "Google".
        num_pages: Maximum review pages to fetch (20 reviews per page).

    Returns:
        A list of dicts with title, pros, cons (None when absent) and,
        when present, rating (the star button's aria-label text).
    """
    all_reviews = []
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36"
        )
    }
    for page_num in range(num_pages):
        start = page_num * 20  # Indeed paginates reviews 20 at a time
        url = (
            f"https://www.indeed.com/cmp/{company_slug}"
            f"/reviews?start={start}"
        )
        # Timeout so one dead connection cannot stall the whole run.
        resp = requests.get(url, headers=headers, timeout=30)
        if resp.status_code != 200:
            # Blocked, rate-limited, or unknown company slug — stop
            # instead of parsing error pages.
            break
        soup = BeautifulSoup(resp.text, "html.parser")
        reviews = soup.find_all(
            "div", attrs={"data-testid": "review-card"}
        )
        if not reviews:
            # Past the last page (or the markup changed): no point in
            # requesting further pages.
            break
        for review in reviews:
            r = {}
            title_el = review.find("h2")
            r["title"] = (
                title_el.get_text(strip=True) if title_el else None
            )
            # Star rating lives in the button's aria-label,
            # e.g. "4.0 out of 5 stars".
            rating_el = review.find(
                "button", attrs={"aria-label": re.compile("stars")}
            )
            if rating_el:
                r["rating"] = rating_el.get("aria-label", "")
            pros_el = review.find(
                "div", attrs={"data-testid": "pros-text"}
            )
            r["pros"] = (
                pros_el.get_text(strip=True) if pros_el else None
            )
            cons_el = review.find(
                "div", attrs={"data-testid": "cons-text"}
            )
            r["cons"] = (
                cons_el.get_text(strip=True) if cons_el else None
            )
            all_reviews.append(r)
        time.sleep(2)  # Respect rate limits
    return all_reviews


# Scrape Google reviews from Indeed
reviews = scrape_company_reviews("Google")
for r in reviews[:3]:
    print(f"[{r.get('rating', '?')}] {r.get('title', 'No title')}")
    print(f"  Pros: {r.get('pros', 'N/A')}")
    print(f"  Cons: {r.get('cons', 'N/A')}")
    print()
Handling Anti-Bot Protection
Indeed's bot detection is moderate compared to sites like LinkedIn or Zillow. Here is what you will face:
- Rate limiting: More than ~1 request per second from the same IP will trigger CAPTCHAs
- Cookie requirements: Sessions without proper cookies get flagged
- CAPTCHA walls: Automated requests eventually hit a CAPTCHA page
- IP blocking: Persistent scraping from datacenter IPs gets blocked within minutes
Solutions That Work
For small-scale scraping (under 500 pages): Add delays (2-3 seconds between requests), rotate User-Agent strings, and use a residential proxy.
For production-scale scraping: Use a scraping API that handles anti-bot measures automatically. ScraperAPI works well for Indeed — their proxy pool and header rotation handle the CAPTCHA problem:
SCRAPERAPI_KEY = "YOUR_KEY"


def scrape_indeed_via_api(query: str, location: str) -> str:
    """Use ScraperAPI to scrape Indeed without getting blocked.

    Args:
        query: Search keywords (spaces/commas are fine — encoded below).
        location: Location string, e.g. "New York, NY".

    Returns:
        The raw HTML of the Indeed results page, as relayed by the
        ScraperAPI proxy.
    """
    # Stdlib import kept local so the snippet stays self-contained.
    from urllib.parse import urlencode

    # URL-encode the target's query string: raw f-string interpolation
    # breaks on spaces ("python developer") and commas ("New York, NY").
    target_url = (
        "https://www.indeed.com/jobs?"
        + urlencode({"q": query, "l": location})
    )
    # The target URL must itself be percent-encoded when passed as a
    # parameter to the API endpoint — requests does that via `params`.
    resp = requests.get(
        "http://api.scraperapi.com",
        params={"api_key": SCRAPERAPI_KEY, "url": target_url},
        timeout=60,
    )
    return resp.text
For comparing scraping API options (pricing, success rates, speed), ScrapeOps maintains a helpful proxy comparison dashboard that benchmarks different providers against popular targets including Indeed.
Exporting to Structured Data
Here is a complete pipeline that scrapes, cleans, and exports Indeed data:
import csv
import json
from datetime import datetime
def export_jobs(jobs: list, fmt: str = "csv"):
    """Deduplicate, clean, and export scraped jobs to CSV or JSON.

    Args:
        jobs: List of job dicts as produced by the scrapers above
            (field values may be missing or explicitly None).
        fmt: "csv" for CSV output; anything else writes JSON.

    Returns:
        The name of the timestamped file that was written.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # Fixed column order — also makes the CSV writer work when the
    # input list is empty (header-only file instead of an IndexError
    # from clean_jobs[0]).
    fieldnames = [
        "title", "company", "location", "salary",
        "posted", "job_key", "scraped_at",
    ]
    # Clean the data
    clean_jobs = []
    seen_keys = set()
    for job in jobs:
        # Deduplicate on Indeed's job key when one is present.
        key = job.get("job_key")
        if key and key in seen_keys:
            continue
        if key:
            seen_keys.add(key)
        # `or ""` guards against explicit None values, which
        # .get(key, "") does NOT catch — the scrapers store None for
        # missing salaries/companies, and None.strip() would crash.
        salary = job.get("salary") or ""
        if salary:
            # Normalize the non-breaking spaces Indeed embeds in salaries.
            salary = salary.replace("\xa0", " ").strip()
        clean_jobs.append({
            "title": (job.get("title") or "").strip(),
            "company": (job.get("company") or "").strip(),
            "location": (job.get("location") or "").strip(),
            "salary": salary or "Not disclosed",
            "posted": job.get("posted") or "",
            "job_key": key,
            "scraped_at": datetime.now().isoformat()
        })
    if fmt == "csv":
        filename = f"indeed_jobs_{timestamp}.csv"
        with open(filename, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(clean_jobs)
    else:
        filename = f"indeed_jobs_{timestamp}.json"
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(clean_jobs, f, indent=2, ensure_ascii=False)
    # The original printed a literal placeholder here instead of the
    # actual file name.
    print(f"Exported {len(clean_jobs)} jobs to {filename}")
    return filename
# Write the scraped listings to a timestamped CSV file.
output_file = export_jobs(jobs, fmt="csv")
Legal and Ethical Notes
Indeed's robots.txt disallows most scraping paths, and their ToS explicitly prohibits automated data collection. That said:
- Public job listings are widely considered factual, non-copyrightable data
- The hiQ v. LinkedIn precedent suggests that scraping public data is not a CFAA violation
- However, commercial redistribution of Indeed data could trigger a civil lawsuit
- Indeed actively sends cease-and-desist letters to scrapers that redistribute their data
Best practices: Scrape at respectful rates, do not redistribute raw data, use the data for analysis and insights rather than building a competing job board, and consider whether Indeed official APIs or partner programs might serve your needs.
The Easy Path: Pre-Built Job Scrapers
If you need job data without maintaining scraping infrastructure, the Apify Store has ready-to-use job scrapers including our Glassdoor Jobs and Reviews Scraper for a complementary data source. These handle proxy rotation, anti-bot bypasses, and structured output — you just define your search parameters and get clean JSON or CSV output.
For job market analysis, combining data from multiple sources (Indeed + Glassdoor + LinkedIn) gives you the most complete picture of salary ranges and hiring trends.
Building something cool with job data? Drop a comment below or check out our other scrapers on the Apify Store.
Top comments (0)