DEV Community

agenthustler
agenthustler

Posted on

How to Build a Talent Intelligence Platform with Job Data Scraping

Talent intelligence — understanding who is hiring, for what roles, and at what salaries — is a billion-dollar industry. Products like LinkedIn Talent Insights charge thousands of dollars per seat, yet the raw data is publicly available on job boards.

Here's how to build your own talent intelligence platform by scraping job postings and analyzing hiring trends.

The Business Case

Talent intelligence answers questions like:

  • Which companies are scaling their AI teams?
  • What's the real salary range for senior engineers in Austin?
  • Which skills are trending up vs. declining?
  • How long do positions stay open (time-to-fill)?

Recruiters, HR teams, and investors all pay for these insights.

Setting Up

pip install requests beautifulsoup4 pandas scikit-learn
Enter fullscreen mode Exit fullscreen mode

Scraping Job Listings

We'll target publicly accessible job boards. Use ScraperAPI for proxy rotation:

import os
import re
import time
from collections import Counter
from datetime import datetime, timezone
from urllib.parse import quote_plus, urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Base endpoint of the ScraperAPI proxy service.
SCRAPER_API = "https://api.scraperapi.com"
# Never hard-code secrets: read the key from the environment when set.
# The literal placeholder remains as a fallback for readers running the
# snippet as-is.
API_KEY = os.getenv("SCRAPERAPI_KEY", "YOUR_SCRAPERAPI_KEY")

# Base URL of the job board being scraped; used to build search URLs and to
# resolve relative listing links.
BASE_URL = "https://example-jobboard.com"

def _card_text(card, selector: str) -> str:
    """Return stripped text of the first element matching *selector*, or "".

    Job cards frequently omit optional fields; returning "" instead of
    calling .get_text() on None avoids an AttributeError per missing field.
    """
    el = card.select_one(selector)
    return el.get_text(strip=True) if el else ""

def scrape_job_listings(query: str, location: str, pages: int = 5) -> list[dict]:
    """Scrape job listings from a public job board.

    Args:
        query: Search keywords (e.g. "software engineer"); URL-encoded here.
        location: Location filter (e.g. "austin"); URL-encoded here.
        pages: Number of result pages to fetch (the board paginates by 10).

    Returns:
        A list of job dicts (title, company, location, salary, posted_date,
        description_snippet, url, scraped_at). Missing optional fields come
        back as "" rather than raising.
    """
    jobs: list[dict] = []

    for page in range(pages):
        # quote_plus: queries like "software engineer" contain spaces and
        # must be percent-encoded before being embedded in the target URL.
        url = (
            f"{BASE_URL}/jobs?q={quote_plus(query)}"
            f"&l={quote_plus(location)}&start={page * 10}"
        )

        resp = requests.get(SCRAPER_API, params={
            "api_key": API_KEY,
            "url": url,
            "render": "true"
        }, timeout=60)
        # Skip failed pages instead of parsing an error body as HTML.
        if resp.status_code != 200:
            continue

        soup = BeautifulSoup(resp.text, "html.parser")

        for card in soup.select(".job-card"):
            link = card.select_one("a")
            job = {
                "title": _card_text(card, ".job-title"),
                "company": _card_text(card, ".company-name"),
                "location": _card_text(card, ".location"),
                "salary": extract_salary(card),
                "posted_date": _card_text(card, ".date"),
                "description_snippet": _card_text(card, ".snippet"),
                # Listing hrefs are usually relative; resolve against the
                # board's base URL so stored links are usable.
                "url": urljoin(BASE_URL, link["href"])
                       if link is not None and link.has_attr("href") else None,
                # datetime.utcnow() is deprecated; use an aware UTC stamp.
                "scraped_at": datetime.now(timezone.utc).isoformat(),
            }
            jobs.append(job)

        # Be polite: throttle between page fetches.
        time.sleep(2)

    return jobs

# Salary-parsing helpers, compiled once at import time.
# Period keywords we normalize to ("a year", "per hour", "yearly", ...).
_PERIOD_RE = re.compile(r"\b(year|month|week|day|hour)s?\b", re.IGNORECASE)
# "$80,000 - $120,000" (hyphen or en dash between the two amounts).
_RANGE_RE = re.compile(r"\$(\d[\d,]*)\s*[-–]\s*\$(\d[\d,]*)")
# A single "$100,000"-style amount.
_SINGLE_RE = re.compile(r"\$(\d[\d,]*)")

def extract_salary(card) -> dict:
    """Extract and normalize salary information from a job card.

    Always returns the same schema so downstream aggregation
    (e.g. pd.json_normalize over many listings) never hits missing keys:

        {"min": int|None, "max": int|None, "period": str|None, "raw": str|None}

    The original text is preserved under "raw" whenever a salary element
    exists, even if it could not be parsed.
    """
    salary_el = card.select_one(".salary")
    if not salary_el:
        return {"min": None, "max": None, "period": None, "raw": None}

    text = salary_el.get_text(strip=True)

    # Pull the pay period from anywhere in the text ("a year", "per hour")
    # instead of grabbing the word after the range, which captured "a".
    period_match = _PERIOD_RE.search(text)
    period = period_match.group(1).lower() if period_match else None

    def _to_int(amount: str) -> int:
        return int(amount.replace(",", ""))

    range_match = _RANGE_RE.search(text)
    if range_match:
        return {
            "min": _to_int(range_match.group(1)),
            "max": _to_int(range_match.group(2)),
            "period": period,
            "raw": text,
        }

    # Single-figure salaries ("$100,000 a year"): treat as a point range.
    single_match = _SINGLE_RE.search(text)
    if single_match:
        amount = _to_int(single_match.group(1))
        return {"min": amount, "max": amount, "period": period, "raw": text}

    return {"min": None, "max": None, "period": period, "raw": text}
Enter fullscreen mode Exit fullscreen mode

Skill Extraction with NLP

Extract in-demand skills from job descriptions:

# Keyword taxonomy for skill extraction, grouped by category.
# All entries are lowercase; matching is done against lowercased text.
SKILL_PATTERNS = {
    "languages": ["python", "javascript", "typescript", "go", "rust", "java", "c++", "ruby", "swift", "kotlin"],
    "frameworks": ["react", "django", "fastapi", "flask", "nextjs", "vue", "angular", "spring", "rails"],
    "cloud": ["aws", "gcp", "azure", "kubernetes", "docker", "terraform", "cloudflare"],
    "data": ["postgresql", "mongodb", "redis", "elasticsearch", "kafka", "spark", "airflow"],
    "ai_ml": ["pytorch", "tensorflow", "llm", "transformers", "langchain", "openai", "rag", "fine-tuning"]
}

def extract_skills(description: str) -> dict[str, list[str]]:
    """Extract technical skills from job description text.

    Each keyword in SKILL_PATTERNS is matched as a whole token rather than a
    raw substring, so e.g. "go" no longer matches "google", "java" no longer
    matches "javascript", and "rag" no longer matches "storage".

    Returns:
        Mapping of category -> list of matched skills, in SKILL_PATTERNS
        order; categories with no matches are omitted.
    """
    description_lower = description.lower()
    found: dict[str, list[str]] = {}

    for category, skills in SKILL_PATTERNS.items():
        matches = [
            skill for skill in skills
            # (?<![\w+]) / (?![\w+]) act as word boundaries that also work
            # for tokens containing '+' such as "c++" (plain \b does not).
            if re.search(rf"(?<![\w+]){re.escape(skill)}(?![\w+])", description_lower)
        ]
        if matches:
            found[category] = matches

    return found

def analyze_skill_demand(jobs: list[dict]) -> pd.DataFrame:
    """Aggregate skill demand across all job listings.

    Args:
        jobs: Job dicts as produced by scrape_job_listings; only the
            "description_snippet" field is consulted.

    Returns:
        DataFrame with columns [skill, category, count] sorted by count
        descending. When no skills are found, an empty DataFrame with the
        same columns is returned so callers can still sort/filter safely
        (``pd.DataFrame([])`` would have no "count" column and
        ``sort_values`` would raise KeyError).
    """
    # Count (category, skill) pairs directly instead of packing them into a
    # "category:skill" string and splitting it back apart afterwards.
    counts: Counter = Counter()
    for job in jobs:
        found = extract_skills(job.get("description_snippet", ""))
        for category, skill_list in found.items():
            counts.update((category, skill) for skill in skill_list)

    rows = [
        {"skill": skill, "category": category, "count": n}
        for (category, skill), n in counts.items()
    ]
    if not rows:
        return pd.DataFrame(columns=["skill", "category", "count"])

    return pd.DataFrame(rows).sort_values("count", ascending=False)
Enter fullscreen mode Exit fullscreen mode

Company Hiring Velocity Tracker

def track_hiring_velocity(jobs: list[dict]) -> pd.DataFrame:
    """Calculate hiring velocity per company.

    Args:
        jobs: Job dicts with at least "company", "title", "location" and
            "salary" keys (salary being a dict that may or may not carry
            a "max" entry).

    Returns:
        Top 20 companies by open positions, indexed by company, with columns
        open_positions, unique_roles, locations, avg_salary_max and a
        derived hiring_intensity score. Empty input yields an empty frame
        with the same columns instead of a groupby crash.
    """
    if not jobs:
        return pd.DataFrame(columns=[
            "open_positions", "unique_roles", "locations",
            "avg_salary_max", "hiring_intensity",
        ])

    df = pd.DataFrame(jobs)

    def _avg_max(salaries: pd.Series) -> float:
        # Unparsed listings may carry salary dicts without a "max" key
        # (e.g. {"raw": "..."}); json_normalize(...)["max"] would raise
        # KeyError on such batches. Treat them as missing instead.
        values = [
            s.get("max") for s in salaries
            if isinstance(s, dict) and s.get("max") is not None
        ]
        return float(pd.Series(values).mean()) if values else float("nan")

    velocity = df.groupby("company").agg(
        open_positions=("title", "count"),
        unique_roles=("title", "nunique"),
        locations=("location", "nunique"),
        avg_salary_max=("salary", _avg_max),
    ).sort_values("open_positions", ascending=False)

    # Crude composite score: hiring volume weighted by role breadth.
    velocity["hiring_intensity"] = velocity["open_positions"] * velocity["unique_roles"]

    return velocity.head(20)

def generate_market_report(query: str, location: str):
    """Generate a complete talent intelligence report.

    Scrapes listings for the given query/location, then prints three
    sections: top skills in demand, fastest-hiring companies, and the
    average of the maximum advertised salaries.
    """
    print(f"Scraping jobs for: {query} in {location}")
    jobs = scrape_job_listings(query, location, pages=5)
    print(f"\nFound {len(jobs)} listings")

    # Section 1: which skills appear most across listings.
    skill_df = analyze_skill_demand(jobs)
    print("\n=== Top Skills in Demand ===")
    print(skill_df.head(15).to_string(index=False))

    # Section 2: which companies have the most open roles.
    velocity_df = track_hiring_velocity(jobs)
    print("\n=== Fastest Hiring Companies ===")
    print(velocity_df.head(10).to_string())

    # Section 3: average of the advertised salary maxima, when present.
    salary_frame = pd.json_normalize(pd.DataFrame(jobs)["salary"].tolist())
    if "max" in salary_frame.columns:
        average_max = salary_frame["max"].dropna().mean()
        print(f"\nAverage max salary: ${average_max:,.0f}")

# Guard the demo invocation so importing this module (e.g. to reuse the
# scraping helpers) does not immediately kick off a five-page scrape.
if __name__ == "__main__":
    generate_market_report("software engineer", "san francisco")
Enter fullscreen mode Exit fullscreen mode

Scaling the Platform

For production deployment with hundreds of queries across multiple job boards, ScraperAPI handles JavaScript rendering and CAPTCHA solving. Use ThorData residential proxies for boards that block datacenter IPs.

Monitor scraper health across all your job board sources with ScrapeOps — especially important when boards change their HTML structure.

Monetization Paths

  1. SaaS dashboard for recruiters ($99-499/mo)
  2. API access for HR tech companies
  3. Weekly market reports for hiring managers
  4. Custom research for investors doing due diligence on startups

The talent intelligence market is growing fast, and the barrier to entry is knowing how to scrape and analyze the data. Now you do.

Top comments (0)