Not all job postings are created equal. Here's how to scrape job boards and automatically score posting quality to identify red flags.
Why Analyze Job Posting Quality?
Job seekers waste hours on poorly written listings that signal unrealistic expectations, toxic culture, ghost jobs, or salary manipulation. Automating this analysis saves applicants time.
Setup
import random
import re
import time
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup
# ScraperAPI credential used by fetch(); replace with a real key before running.
SCRAPER_KEY = "YOUR_SCRAPERAPI_KEY"
def fetch(url):
    """Fetch *url* through the ScraperAPI proxy (with JS rendering) and parse it.

    Args:
        url: Target page URL to retrieve.

    Returns:
        BeautifulSoup tree of the rendered page HTML.

    Raises:
        requests.HTTPError: if ScraperAPI responds with a non-2xx status.
        requests.Timeout: if the request exceeds 30 seconds.
    """
    resp = requests.get(
        "http://api.scraperapi.com",
        params={"api_key": SCRAPER_KEY, "url": url, "render": "true"},
        timeout=30,
    )
    # Bug fix: without this check, a proxy error page (quota exceeded,
    # blocked, 500) would be silently parsed as if it were the job listing.
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "html.parser")
Scraping Job Listings
def scrape_indeed_jobs(query, location, pages=5):
    """Scrape Indeed search-result cards for *query* in *location*.

    Args:
        query: Job search keywords (URL-encoded before the request).
        location: Location string (URL-encoded before the request).
        pages: Number of result pages to fetch; Indeed serves 10 per page.

    Returns:
        pandas.DataFrame with columns: title, company, snippet, salary.
        Salary is "Not listed" when the card has no salary element.
    """
    jobs = []
    for page in range(pages):
        start = page * 10  # Indeed paginates via a 0-based offset in steps of 10
        # Bug fix: query/location must be URL-encoded; raw spaces or
        # characters like '&' produce a malformed or wrong request URL.
        url = (
            "https://www.indeed.com/jobs"
            f"?q={quote_plus(query)}&l={quote_plus(location)}&start={start}"
        )
        soup = fetch(url)
        cards = soup.find_all("div", class_="job_seen_beacon")
        for card in cards:
            title_el = card.find("h2", class_="jobTitle")
            company_el = card.find("span", attrs={"data-testid": "company-name"})
            snippet_el = card.find("div", class_="job-snippet")
            salary_el = card.find("div", class_="salary-snippet-container")
            jobs.append({
                "title": title_el.text.strip() if title_el else "",
                "company": company_el.text.strip() if company_el else "",
                "snippet": snippet_el.text.strip() if snippet_el else "",
                "salary": salary_el.text.strip() if salary_el else "Not listed",
            })
        time.sleep(random.uniform(2, 5))  # randomized politeness delay between pages
    return pd.DataFrame(jobs)
Red Flag Detection Engine
# Red-flag taxonomy: each category contributes (hits * weight) penalty points.
RED_FLAGS = {
    "unrealistic_requirements": {
        # Two-digit year demands ("10+ years ... experience/exp").
        "patterns": [r"(\d{2}\+?)\s*years?.*(experience|exp)"],
        "weight": 3
    },
    "toxic_culture_signals": {
        "keywords": [
            "rockstar", "ninja", "guru", "unicorn",
            "fast-paced environment", "wear many hats",
            "we work hard and play hard", "like a family",
            "unlimited PTO", "competitive salary"
        ],
        "weight": 2
    },
    "salary_red_flags": {
        "conditions": ["salary_missing", "range_too_wide"],
        "weight": 2
    },
    "ghost_job_signals": {
        "keywords": ["ongoing recruitment", "talent pool", "future opportunities"],
        "weight": 3
    }
}


def _salary_range_too_wide(salary_info):
    """Return True if the advertised salary range spans more than 2x.

    Pulls every number out of the salary string (e.g. "$50,000 - $150,000")
    and compares the extremes; fewer than two numbers means no range to judge.
    """
    numbers = [int(n.replace(",", "")) for n in re.findall(r"\d[\d,]*", salary_info)]
    return len(numbers) >= 2 and min(numbers) > 0 and max(numbers) > 2 * min(numbers)


def score_job_posting(job_text, salary_info):
    """Score a job posting's quality from 0 (worst) to 100 (best).

    Args:
        job_text: Free text of the posting (e.g. title + snippet).
        salary_info: Salary string as scraped, or "Not listed".

    Returns:
        dict with "quality_score" (int, 0-100) and "flags" (list of
        RED_FLAGS category names that triggered at least once).
    """
    flags = []
    total_score = 0
    text_lower = job_text.lower()
    for flag_name, config in RED_FLAGS.items():
        flag_hits = 0
        for pattern in config.get("patterns", []):
            flag_hits += len(re.findall(pattern, text_lower))
        for keyword in config.get("keywords", []):
            if keyword.lower() in text_lower:
                flag_hits += 1
        for condition in config.get("conditions", []):
            if condition == "salary_missing" and salary_info == "Not listed":
                flag_hits += 1
            # Bug fix: "range_too_wide" was declared in RED_FLAGS but never
            # evaluated; an over-wide advertised range is a classic bait tactic.
            elif condition == "range_too_wide" and _salary_range_too_wide(salary_info):
                flag_hits += 1
        if flag_hits > 0:
            flags.append(flag_name)
            total_score += flag_hits * config["weight"]
    # Each weighted hit costs 5 points; clamp the floor at 0.
    return {"quality_score": max(0, 100 - total_score * 5), "flags": flags}
Analyzing Job Markets
def analyze_job_market(query, location):
    """Scrape, score, and summarize postings for a query/location pair.

    Prints a short console report and returns a DataFrame of postings
    (scraped fields plus quality_score and flags), sorted best-first.
    """
    scraped = scrape_indeed_jobs(query, location, pages=3)
    scored = [
        {
            **row.to_dict(),
            **score_job_posting(f"{row['title']} {row['snippet']}", row["salary"]),
        }
        for _, row in scraped.iterrows()
    ]
    report = pd.DataFrame(scored).sort_values("quality_score", ascending=False)
    print(f"\nJob Quality Report: {query} in {location}")
    print(f"Total: {len(report)} | Avg score: {report['quality_score'].mean():.0f}/100")
    print("\nTop Quality:")
    for _, best in report.head(5).iterrows():
        print(f" [{best['quality_score']}] {best['title']} - {best['company']}")
    return report
Proxy Infrastructure
- ScraperAPI — JavaScript rendering for Indeed's dynamic pages
- ThorData — Residential proxies for job sites' anti-bot systems
- ScrapeOps — Monitor scraping success rates across job boards
Conclusion
Automated job posting analysis helps seekers focus on legitimate roles. Red flag detection turns unstructured text into actionable quality scores. Start with one board and category, then scale.
Top comments (0)