DEV Community

agenthustler
agenthustler

Posted on

Scraping Job Posting Quality: Identifying Red Flags in JDs

Not all job postings are created equal. Here's how to scrape job boards and automatically score posting quality to identify red flags.

Why Analyze Job Posting Quality?

Job seekers waste hours on poorly written listings that signal unrealistic expectations, toxic culture, ghost jobs, or salary manipulation. Automating this analysis saves applicants time.

Setup

import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
import random

# ScraperAPI credential, sent as the `api_key` query parameter by fetch().
# The value below is a placeholder; replace it with a real key before running.
SCRAPER_KEY = "YOUR_SCRAPERAPI_KEY"

def fetch(url):
    """Download *url* through ScraperAPI and return the parsed HTML.

    Args:
        url: Absolute URL of the page to retrieve. It is passed to ScraperAPI
            as the ``url`` query parameter (``requests`` handles the encoding).

    Returns:
        A ``BeautifulSoup`` tree built from the response body with the
        ``html.parser`` backend.

    Raises:
        requests.HTTPError: If the proxy answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    resp = requests.get(
        # HTTPS so the API key in the query string is not sent in clear text.
        "https://api.scraperapi.com/",
        params={"api_key": SCRAPER_KEY, "url": url, "render": "true"},
        timeout=30,
    )
    # Fail loudly instead of parsing an error page as if it were job listings.
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "html.parser")
Enter fullscreen mode Exit fullscreen mode

Scraping Job Listings

def scrape_indeed_jobs(query, location, pages=5):
    """Scrape Indeed search results into a DataFrame of job cards.

    Args:
        query: Free-text search terms (may contain spaces or punctuation).
        location: Free-text location (may contain spaces or commas).
        pages: Number of result pages to request; each page advances the
            ``start`` offset by 10.

    Returns:
        A ``pandas.DataFrame`` with the columns ``title``, ``company``,
        ``snippet`` and ``salary``. The columns are present even when no
        card was found. ``salary`` is ``"Not listed"`` when the card has no
        salary element; the other fields fall back to ``""``.
    """
    # Local import: keeps the block self-contained without touching the
    # file-level import list.
    from urllib.parse import urlencode

    columns = ["title", "company", "snippet", "salary"]

    def text_of(element, default=""):
        # Compare against None explicitly rather than relying on the
        # truthiness of a BeautifulSoup element.
        return element.text.strip() if element is not None else default

    jobs = []

    for page in range(pages):
        # urlencode escapes spaces, commas, '&' etc. so the user's input
        # cannot corrupt the query string.
        params = urlencode({"q": query, "l": location, "start": page * 10})
        soup = fetch(f"https://www.indeed.com/jobs?{params}")

        for card in soup.find_all("div", class_="job_seen_beacon"):
            jobs.append({
                "title": text_of(card.find("h2", class_="jobTitle")),
                "company": text_of(
                    card.find("span", attrs={"data-testid": "company-name"})
                ),
                "snippet": text_of(card.find("div", class_="job-snippet")),
                "salary": text_of(
                    card.find("div", class_="salary-snippet-container"),
                    default="Not listed",
                ),
            })

        # Randomised pause between requests only; nothing follows the last page.
        if page < pages - 1:
            time.sleep(random.uniform(2, 5))

    return pd.DataFrame(jobs, columns=columns)
Enter fullscreen mode Exit fullscreen mode

Red Flag Detection Engine

# Catalogue of red-flag heuristics used by score_job_posting().
# Each category may define:
#   "patterns"   - regexes run against the lower-cased posting text
#   "keywords"   - phrases matched as case-insensitive substrings
#   "conditions" - named checks evaluated against the salary string
#   "weight"     - points charged per hit in that category
RED_FLAGS = {
    "unrealistic_requirements": {
        # A two-digit year count followed, within the same clause (at most
        # 40 characters, no sentence punctuation), by "experience"/"exp".
        # The bounded gap avoids flagging text such as
        # "in business for 25 years. No experience required".
        "patterns": [
            r"\b\d{2}\+?\s*(?:years?|yrs?)\b[^.,;\n]{0,40}?\bexp(?:erience)?\b"
        ],
        "weight": 3
    },
    "toxic_culture_signals": {
        "keywords": [
            "rockstar", "ninja", "guru", "unicorn",
            "fast-paced environment", "wear many hats",
            "we work hard and play hard", "like a family",
            "unlimited PTO", "competitive salary"
        ],
        "weight": 2
    },
    "salary_red_flags": {
        "conditions": ["salary_missing", "range_too_wide"],
        "weight": 2
    },
    "ghost_job_signals": {
        "keywords": ["ongoing recruitment", "talent pool", "future opportunities"],
        "weight": 3
    }
}

# Matches "<low> - <high>" / "<low> to <high>" with optional "$", thousands
# separators, decimals and a "k" suffix on either bound.
_SALARY_RANGE_RE = re.compile(
    r"(\d[\d,]*(?:\.\d+)?)(k\b)?\s*(?:[-\u2013\u2014]|\bto\b)\s*\$?\s*"
    r"(\d[\d,]*(?:\.\d+)?)(k\b)?",
    re.IGNORECASE,
)

# Heuristic threshold: a range counts as "too wide" when its upper bound is
# at least this many times its lower bound.
_WIDE_RANGE_RATIO = 2.0


def _salary_is_missing(salary_info):
    """Return True when no usable salary text was supplied."""
    if not isinstance(salary_info, str):
        # None / NaN coming out of a DataFrame carry no salary information.
        return True
    return salary_info.strip() in ("", "Not listed")


def _salary_range_too_wide(salary_info):
    """Return True when the first salary range spans >= _WIDE_RANGE_RATIO x."""
    if not isinstance(salary_info, str):
        return False
    match = _SALARY_RANGE_RE.search(salary_info)
    if match is None:
        return False

    low_text, low_k, high_text, high_k = match.groups()
    low = float(low_text.replace(",", ""))
    high = float(high_text.replace(",", ""))
    if high_k:
        high *= 1000
    # "$50-80k" style: the suffix on the upper bound applies to both bounds.
    if low_k or (high_k and low < 1000):
        low *= 1000

    return low > 0 and high >= _WIDE_RANGE_RATIO * low


def _salary_condition_met(condition, salary_info):
    """Evaluate one named salary condition; unknown names never match."""
    if condition == "salary_missing":
        return _salary_is_missing(salary_info)
    if condition == "range_too_wide":
        return _salary_range_too_wide(salary_info)
    return False


def score_job_posting(job_text, salary_info):
    """Score a job posting against RED_FLAGS.

    Args:
        job_text: Posting text to scan (title, snippet, ...). Non-string
            values are treated as empty text.
        salary_info: Salary string as scraped, or ``"Not listed"``.

    Returns:
        A dict with ``"quality_score"`` (``100`` minus 5 points per weighted
        hit, never below 0) and ``"flags"`` (names of the categories that
        had at least one hit, in RED_FLAGS order).
    """
    flags = []
    total_score = 0
    text_lower = job_text.lower() if isinstance(job_text, str) else ""

    for flag_name, config in RED_FLAGS.items():
        flag_hits = sum(
            len(re.findall(pattern, text_lower))
            for pattern in config.get("patterns", [])
        )
        flag_hits += sum(
            1 for keyword in config.get("keywords", [])
            if keyword.lower() in text_lower
        )
        flag_hits += sum(
            1 for condition in config.get("conditions", [])
            if _salary_condition_met(condition, salary_info)
        )

        if flag_hits > 0:
            flags.append(flag_name)
            total_score += flag_hits * config["weight"]

    return {"quality_score": max(0, 100 - total_score * 5), "flags": flags}
Enter fullscreen mode Exit fullscreen mode

Analyzing Job Markets

def analyze_job_market(query, location):
    """Scrape, score and report on job postings for one search.

    Scrapes three result pages via ``scrape_indeed_jobs``, scores every
    posting with ``score_job_posting`` (title and snippet as text, the
    scraped salary as salary info), prints a short report and returns the
    scored rows.

    Args:
        query: Search terms forwarded to the scraper.
        location: Location forwarded to the scraper.

    Returns:
        A ``pandas.DataFrame`` holding the scraped columns plus
        ``quality_score`` and ``flags``, sorted by ``quality_score``
        descending. When nothing was scraped, an empty DataFrame with those
        columns is returned.
    """
    jobs_df = scrape_indeed_jobs(query, location, pages=3)

    results = [
        {
            **job,
            **score_job_posting(f"{job['title']} {job['snippet']}", job["salary"]),
        }
        for job in jobs_df.to_dict("records")
    ]

    print(f"\nJob Quality Report: {query} in {location}")

    if not results:
        # Sorting an empty, column-less frame by "quality_score" would raise
        # KeyError, so report the empty result explicitly instead.
        print("Total: 0 | Avg score: n/a")
        return pd.DataFrame(columns=[*jobs_df.columns, "quality_score", "flags"])

    results_df = pd.DataFrame(results).sort_values("quality_score", ascending=False)

    print(f"Total: {len(results_df)} | Avg score: {results_df['quality_score'].mean():.0f}/100")

    print("\nTop Quality:")
    for job in results_df.head(5).to_dict("records"):
        print(f"  [{job['quality_score']}] {job['title']} - {job['company']}")

    return results_df
Enter fullscreen mode Exit fullscreen mode

Proxy Infrastructure

  • ScraperAPI — JavaScript rendering for Indeed's dynamic pages
  • ThorData — Residential proxies for job sites' anti-bot systems
  • ScrapeOps — Monitor scraping success rates across job boards

Conclusion

Automated job posting analysis helps seekers focus on legitimate roles. Red flag detection turns unstructured text into actionable quality scores. Start with one board and category, then scale.

Top comments (0)