Talent intelligence — understanding who is hiring, for what roles, at what salaries — is a billion-dollar industry. Companies like LinkedIn Talent Insights charge thousands per seat. But the raw data is publicly available on job boards.
Here's how to build your own talent intelligence platform by scraping job postings and analyzing hiring trends.
The Business Case
Talent intelligence answers questions like:
- Which companies are scaling their AI teams?
- What's the real salary range for senior engineers in Austin?
- Which skills are trending up vs. declining?
- How long do positions stay open (time-to-fill)?
Recruiters, HR teams, and investors all pay for these insights.
Setting Up
pip install requests beautifulsoup4 pandas
Scraping Job Listings
We'll target publicly accessible job boards. Use ScraperAPI for proxy rotation:
import re
import time
from collections import Counter
from datetime import datetime, timezone
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup
# ScraperAPI proxy endpoint; every job-board request is routed through it
# (see scrape_job_listings, which passes the target URL as a parameter).
SCRAPER_API = "https://api.scraperapi.com"
# NOTE(review): placeholder credential — load from an environment variable
# instead of committing a real key here.
API_KEY = "YOUR_SCRAPERAPI_KEY"
def scrape_job_listings(query: str, location: str, pages: int = 5) -> list[dict]:
    """Scrape job listings from a public job board via the ScraperAPI proxy.

    Args:
        query: Search keywords (e.g. "software engineer").
        location: Location filter (e.g. "austin").
        pages: Number of result pages to fetch (10 listings per page).

    Returns:
        A list of job dicts (title, company, location, salary, posted_date,
        description_snippet, url, scraped_at). Fields whose element is
        missing from a card come back as None instead of crashing the run.

    Raises:
        requests.HTTPError: if ScraperAPI returns an error status.
    """
    def _text(card, selector: str) -> str | None:
        # select_one returns None for absent elements; guard so one
        # malformed card doesn't abort the whole scrape with AttributeError.
        el = card.select_one(selector)
        return el.get_text(strip=True) if el else None

    jobs: list[dict] = []
    for page in range(pages):
        # URL-encode user input so spaces/special characters survive the
        # round trip through the proxy ("software engineer" -> "software+engineer").
        target = (
            "https://example-jobboard.com/jobs"
            f"?q={quote_plus(query)}&l={quote_plus(location)}&start={page * 10}"
        )
        resp = requests.get(SCRAPER_API, params={
            "api_key": API_KEY,
            "url": target,
            "render": "true",
        }, timeout=60)
        # Fail loudly on proxy/board errors rather than parsing an error page.
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")
        for card in soup.select(".job-card"):
            link = card.select_one("a")
            jobs.append({
                "title": _text(card, ".job-title"),
                "company": _text(card, ".company-name"),
                "location": _text(card, ".location"),
                "salary": extract_salary(card),
                "posted_date": _text(card, ".date"),
                "description_snippet": _text(card, ".snippet"),
                "url": link["href"] if link and link.has_attr("href") else None,
                # Timezone-aware UTC; datetime.utcnow() is naive and
                # deprecated since Python 3.12.
                "scraped_at": datetime.now(timezone.utc).isoformat(),
            })
        time.sleep(2)  # polite rate limiting between pages
    return jobs
# Matches "$80,000 - $120,000 a year", "$80,000 to $120,000 per hour", etc.
# Compiled once at import time; extraction runs per job card.
_SALARY_RANGE_RE = re.compile(
    r"\$(\d[\d,]*)\s*(?:[-\u2013]|to)\s*\$(\d[\d,]*)\s*(?:(?:a|an|per)\s+)?(\w+)?"
)


def extract_salary(card) -> dict:
    """Extract and normalize salary information from a job card.

    Args:
        card: A parsed job-card element exposing ``select_one``.

    Returns:
        A dict that always contains "min", "max" and "period" keys (None
        when unknown). Text that doesn't look like a range is preserved
        under an extra "raw" key so no information is lost.
    """
    salary_el = card.select_one(".salary")
    if not salary_el:
        return {"min": None, "max": None, "period": None}
    text = salary_el.get_text(strip=True)
    match = _SALARY_RANGE_RE.search(text)
    if match:
        return {
            "min": int(match.group(1).replace(",", "")),
            "max": int(match.group(2).replace(",", "")),
            # "a year" / "per hour" normalize to "year" / "hour"; the
            # article word is skipped instead of being captured as the period.
            "period": match.group(3),
        }
    # Keep the consistent key shape so downstream aggregation (which reads
    # salary["max"]) never hits a missing key.
    return {"min": None, "max": None, "period": None, "raw": text}
Skill Extraction with NLP
Extract in-demand skills from job descriptions:
# Curated skill taxonomy: category -> list of skill tokens to search for
# in job description text (all lowercase; matching is case-insensitive).
SKILL_PATTERNS = {
    "languages": ["python", "javascript", "typescript", "go", "rust", "java", "c++", "ruby", "swift", "kotlin"],
    "frameworks": ["react", "django", "fastapi", "flask", "nextjs", "vue", "angular", "spring", "rails"],
    "cloud": ["aws", "gcp", "azure", "kubernetes", "docker", "terraform", "cloudflare"],
    "data": ["postgresql", "mongodb", "redis", "elasticsearch", "kafka", "spark", "airflow"],
    "ai_ml": ["pytorch", "tensorflow", "llm", "transformers", "langchain", "openai", "rag", "fine-tuning"],
}


def _skill_pattern(skill: str) -> "re.Pattern[str]":
    # Alphanumeric lookarounds instead of \b so tokens like "c++" and
    # "fine-tuning" still match, while "go" no longer matches inside
    # "django" and "java" no longer matches inside "javascript".
    return re.compile(rf"(?<![a-z0-9]){re.escape(skill)}(?![a-z0-9])")


# Compile every skill pattern once at import time.
_COMPILED_SKILLS = {
    category: [(skill, _skill_pattern(skill)) for skill in skills]
    for category, skills in SKILL_PATTERNS.items()
}


def extract_skills(description: str) -> dict[str, list[str]]:
    """Extract technical skills from job description text.

    Args:
        description: Free-form job description or snippet.

    Returns:
        Mapping of category -> matched skills (in SKILL_PATTERNS order);
        categories with no matches are omitted. Matching is token-bounded,
        so substrings of longer words do not count as hits.
    """
    description_lower = description.lower()
    found: dict[str, list[str]] = {}
    for category, compiled in _COMPILED_SKILLS.items():
        matches = [skill for skill, pattern in compiled if pattern.search(description_lower)]
        if matches:
            found[category] = matches
    return found
def analyze_skill_demand(jobs: list[dict]) -> pd.DataFrame:
    """Aggregate skill demand across all job listings.

    Args:
        jobs: Job dicts as produced by scrape_job_listings.

    Returns:
        DataFrame with columns [skill, category, count], sorted by count
        descending. Returns an empty frame with the same columns (instead
        of raising) when no skills are found.
    """
    counts: Counter = Counter()
    for job in jobs:
        # `or ""` also covers listings whose snippet element was missing
        # and came back as None.
        skills = extract_skills(job.get("description_snippet") or "")
        for category, skill_list in skills.items():
            # Count (category, skill) tuples instead of a joined string key,
            # so skills containing separators can never corrupt the split.
            counts.update((category, skill) for skill in skill_list)
    rows = [
        {"skill": skill, "category": category, "count": n}
        for (category, skill), n in counts.items()
    ]
    # Explicit columns so sort_values doesn't KeyError on an empty result.
    df = pd.DataFrame(rows, columns=["skill", "category", "count"])
    return df.sort_values("count", ascending=False)
Company Hiring Velocity Tracker
def track_hiring_velocity(jobs: list[dict]) -> pd.DataFrame:
    """Calculate hiring velocity per company.

    Args:
        jobs: Job dicts with at least company/title/location/salary keys.

    Returns:
        Top-20 companies by open position count (index = company), with
        unique-role and location breadth, average max salary, and a
        combined hiring_intensity score.
    """
    df = pd.DataFrame(jobs)
    # Flatten the salary dicts once up front. .get tolerates listings where
    # salary parsing only produced a {"raw": ...} payload (no "max" key),
    # which would otherwise KeyError inside a per-group lambda; to_numeric
    # turns the Nones into NaN so the mean aggregates cleanly.
    df["_salary_max"] = pd.to_numeric(
        df["salary"].apply(lambda s: s.get("max") if isinstance(s, dict) else None),
        errors="coerce",
    )
    velocity = df.groupby("company").agg(
        open_positions=("title", "count"),
        unique_roles=("title", "nunique"),
        locations=("location", "nunique"),
        avg_salary_max=("_salary_max", "mean"),
    ).sort_values("open_positions", ascending=False)
    # Breadth x volume: many openings across many distinct roles.
    velocity["hiring_intensity"] = velocity["open_positions"] * velocity["unique_roles"]
    return velocity.head(20)
def generate_market_report(query: str, location: str) -> None:
    """Generate and print a complete talent intelligence report to stdout.

    Scrapes listings for the query/location, then prints skill demand,
    company hiring velocity, and an average-max-salary figure.

    Args:
        query: Search keywords passed to the scraper.
        location: Location filter passed to the scraper.
    """
    print(f"Scraping jobs for: {query} in {location}")
    jobs = scrape_job_listings(query, location, pages=5)
    print(f"\nFound {len(jobs)} listings")
    if not jobs:
        # Empty scrape (blocked, no results, selector drift): bail out
        # instead of crashing on column-less DataFrames below.
        print("No listings found - nothing to report.")
        return
    # Skill analysis
    skills = analyze_skill_demand(jobs)
    print("\n=== Top Skills in Demand ===")
    print(skills.head(15).to_string(index=False))
    # Hiring velocity
    velocity = track_hiring_velocity(jobs)
    print("\n=== Fastest Hiring Companies ===")
    print(velocity.head(10).to_string())
    # Salary analysis across all listings
    df = pd.DataFrame(jobs)
    salary_data = pd.json_normalize(df["salary"].tolist())
    if "max" in salary_data.columns:
        avg_max = salary_data["max"].dropna().mean()
        print(f"\nAverage max salary: ${avg_max:,.0f}")
# NOTE(review): this kicks off a full 5-page scrape at import time; consider
# guarding with `if __name__ == "__main__":` so importing the module stays
# side-effect free.
generate_market_report("software engineer", "san francisco")
Scaling the Platform
For production deployment with hundreds of queries across multiple job boards, ScraperAPI handles JavaScript rendering and CAPTCHA solving. Use ThorData residential proxies for boards that block datacenter IPs.
Monitor scraper health across all your job board sources with ScrapeOps — especially important when boards change their HTML structure.
Monetization Paths
- SaaS dashboard for recruiters ($99-499/mo)
- API access for HR tech companies
- Weekly market reports for hiring managers
- Custom research for investors doing due diligence on startups
The talent intelligence market is growing fast, and the barrier to entry is knowing how to scrape and analyze the data. Now you do.
Top comments (0)