Scraping Job Posting Quality: Identifying Red Flags in JDs

#python #tutorial #webdev #programming

Not all job postings are created equal. Here's how to scrape job boards and automatically score posting quality to identify red flags.

Why Analyze Job Posting Quality?

Job seekers waste hours on poorly written listings that signal unrealistic expectations, toxic culture, ghost jobs, or salary manipulation. Automating this analysis saves applicants time.

Setup

import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
import random

# ScraperAPI credential; fetch() sends it as the `api_key` query parameter.
# Replace the placeholder with a real key before running.
SCRAPER_KEY = "YOUR_SCRAPERAPI_KEY"

def fetch(url):
    """Download *url* through ScraperAPI and parse the returned HTML.

    Args:
        url: Address of the page to retrieve. It is forwarded to ScraperAPI
            as the ``url`` query parameter with ``render=true``.

    Returns:
        A ``BeautifulSoup`` tree built from the response body with the
        ``html.parser`` backend.

    Raises:
        requests.HTTPError: If the response has a 4xx/5xx status.
        requests.RequestException: On connection errors or when the
            30-second timeout expires.
    """
    resp = requests.get(
        # HTTPS so the API key in the query string is not sent in clear text.
        "https://api.scraperapi.com",
        params={"api_key": SCRAPER_KEY, "url": url, "render": "true"},
        timeout=30,
    )
    # Fail loudly instead of parsing an error page as if it were job listings.
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "html.parser")

Scraping Job Listings

def scrape_indeed_jobs(query, location, pages=5):
    """Scrape Indeed search results into a DataFrame of job cards.

    Args:
        query: Free-text search terms; URL-encoded before being sent.
        location: Free-text location; URL-encoded before being sent.
        pages: Number of result pages to request. The ``start`` offset
            advances by 10 per page.

    Returns:
        A pandas DataFrame with one row per job card and the columns
        ``title``, ``company``, ``snippet`` and ``salary``. ``salary`` is
        ``"Not listed"`` when the card has no salary element; other missing
        fields are empty strings. The columns are present even when no
        cards were found.
    """
    from urllib.parse import urlencode

    def text_of(element, default=""):
        # Cards frequently lack one of the optional elements.
        return element.text.strip() if element is not None else default

    jobs = []
    for page in range(pages):
        if page:
            # Randomised pause between requests only; no need to wait after
            # the final page.
            time.sleep(random.uniform(2, 5))

        # urlencode escapes spaces, commas, '&' etc. that would otherwise
        # corrupt the query string.
        search = urlencode({"q": query, "l": location, "start": page * 10})
        soup = fetch(f"https://www.indeed.com/jobs?{search}")

        for card in soup.find_all("div", class_="job_seen_beacon"):
            jobs.append({
                "title": text_of(card.find("h2", class_="jobTitle")),
                "company": text_of(
                    card.find("span", attrs={"data-testid": "company-name"})
                ),
                "snippet": text_of(card.find("div", class_="job-snippet")),
                "salary": text_of(
                    card.find("div", class_="salary-snippet-container"),
                    "Not listed",
                ),
            })

    return pd.DataFrame(jobs, columns=["title", "company", "snippet", "salary"])

Red Flag Detection Engine

# Red-flag catalogue used by score_job_posting().
# Each entry may define:
#   "patterns"   - regexes; every match in the lower-cased text is one hit
#   "keywords"   - phrases; each phrase present in the text is one hit
#   "conditions" - named checks on the salary string; each true check is one hit
#   "weight"     - multiplier applied to the entry's hit count
RED_FLAGS = {
    "unrealistic_requirements": {
        # Two-digit year counts ("10+ years ... experience").
        "patterns": [r"(\d{2}\+?)\s*years?.*(experience|exp)"],
        "weight": 3
    },
    "toxic_culture_signals": {
        "keywords": [
            "rockstar", "ninja", "guru", "unicorn",
            "fast-paced environment", "wear many hats",
            "we work hard and play hard", "like a family",
            "unlimited PTO", "competitive salary"
        ],
        "weight": 2
    },
    "salary_red_flags": {
        "conditions": ["salary_missing", "range_too_wide"],
        "weight": 2
    },
    "ghost_job_signals": {
        "keywords": ["ongoing recruitment", "talent pool", "future opportunities"],
        "weight": 3
    }
}

# Heuristic: a range whose upper bound is at least this multiple of the lower
# bound counts as "too wide".
_MAX_SALARY_RANGE_RATIO = 2.0

# "<low> - <high>" or "<low> to <high>", with optional "$", thousands
# separators, decimals and a "k" suffix on either bound.
_SALARY_RANGE_RE = re.compile(
    r"(\d[\d,]*(?:\.\d+)?)\s*(k)?\s*(?:[-\u2013\u2014]|to)\s*\$?\s*"
    r"(\d[\d,]*(?:\.\d+)?)\s*(k)?",
    re.IGNORECASE,
)


def _salary_range_too_wide(salary_text):
    """Return True when *salary_text* holds a range spanning 2x or more."""
    match = _SALARY_RANGE_RE.search(salary_text)
    if match is None:
        return False

    low_raw, low_k, high_raw, high_k = match.groups()
    low = float(low_raw.replace(",", ""))
    high = float(high_raw.replace(",", ""))
    if high_k:
        high *= 1000
    if low_k or high_k:
        # "$80-120k" shorthand: a suffix on the upper bound covers both.
        low *= 1000

    return low > 0 and high >= _MAX_SALARY_RANGE_RATIO * low


def score_job_posting(job_text, salary_info):
    """Score a job posting from 0 (worst) to 100 (no red flags found).

    Args:
        job_text: Posting text to scan. Non-string values (e.g. ``None`` or
            NaN) are treated as empty text.
        salary_info: Salary string for the posting. ``"Not listed"``, an
            empty/blank string, or a non-string value counts as missing.

    Returns:
        A dict with ``"quality_score"`` (``100 - 5 * weighted hits``, floored
        at 0) and ``"flags"`` (names of the RED_FLAGS entries that had at
        least one hit, in RED_FLAGS order).
    """
    text_lower = job_text.lower() if isinstance(job_text, str) else ""
    salary_text = salary_info.strip() if isinstance(salary_info, str) else ""

    salary_missing = not salary_text or salary_text == "Not listed"
    condition_results = {
        "salary_missing": salary_missing,
        "range_too_wide": (
            not salary_missing and _salary_range_too_wide(salary_text)
        ),
    }

    flags = []
    total_score = 0
    for flag_name, config in RED_FLAGS.items():
        flag_hits = sum(
            len(re.findall(pattern, text_lower))
            for pattern in config.get("patterns", [])
        )
        flag_hits += sum(
            keyword.lower() in text_lower
            for keyword in config.get("keywords", [])
        )
        # Unknown condition names are ignored rather than raising.
        flag_hits += sum(
            bool(condition_results.get(condition, False))
            for condition in config.get("conditions", [])
        )

        if flag_hits > 0:
            flags.append(flag_name)
            total_score += flag_hits * config["weight"]

    return {"quality_score": max(0, 100 - total_score * 5), "flags": flags}

Analyzing Job Markets

def analyze_job_market(query, location):
    """Scrape, score and report on job postings for a query and location.

    Scrapes three result pages with ``scrape_indeed_jobs``, scores each
    posting's title and snippet with ``score_job_posting``, prints a short
    report, and returns the scored postings.

    Args:
        query: Search terms passed through to ``scrape_indeed_jobs``.
        location: Location passed through to ``scrape_indeed_jobs``.

    Returns:
        A pandas DataFrame of the scraped columns plus ``quality_score`` and
        ``flags``, sorted by ``quality_score`` descending. It is empty, but
        still carries those two columns, when nothing was scraped.
    """
    jobs_df = scrape_indeed_jobs(query, location, pages=3)

    results = [
        {
            **job,
            **score_job_posting(f"{job['title']} {job['snippet']}", job["salary"]),
        }
        for job in jobs_df.to_dict("records")
    ]

    print(f"\nJob Quality Report: {query} in {location}")

    if not results:
        # Sorting an empty frame by a column it does not have raises KeyError,
        # so report the empty result and return a frame with the right columns.
        print("Total: 0 | No job postings found")
        return pd.DataFrame(columns=[*jobs_df.columns, "quality_score", "flags"])

    results_df = pd.DataFrame(results).sort_values("quality_score", ascending=False)

    print(f"Total: {len(results_df)} | Avg score: {results_df['quality_score'].mean():.0f}/100")

    print("\nTop Quality:")
    for _, job in results_df.head(5).iterrows():
        print(f"  [{job['quality_score']}] {job['title']} - {job['company']}")

    return results_df

Proxy Infrastructure

ScraperAPI — JavaScript rendering for Indeed's dynamic pages
ThorData — Residential proxies for job sites' anti-bot systems
ScrapeOps — Monitor scraping success rates across job boards

Conclusion

Automated job-posting analysis helps job seekers focus on legitimate roles. Red-flag detection turns unstructured posting text into actionable quality scores. Start with one job board and one job category, then scale.