DEV Community

agenthustler
agenthustler

Posted on

Scraping Technical Job Requirements: Skills Demand Over Time

Introduction

Understanding which technical skills employers actually want — and how demand shifts over time — is invaluable for career planning, hiring strategy, and education curriculum design. Job postings contain a goldmine of structured data about required technologies, experience levels, and compensation. In this tutorial, we'll build a scraper that tracks technical skill demand across job boards.

Setup

import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import re
import time
from datetime import datetime
from collections import Counter

# Route requests through a scraping proxy service to handle job-board
# anti-bot measures. Replace the placeholder with your own API key.
SCRAPER_API_KEY = "your_key_here"
# All page fetches go to this endpoint with the real target URL passed
# as the "url" query parameter (see scrape_job_listings below).
BASE_URL = "http://api.scraperapi.com"
Enter fullscreen mode Exit fullscreen mode

Scraping Job Listings

Job boards like Indeed and LinkedIn list thousands of technical positions (Stack Overflow Jobs, once a popular source, was discontinued in 2022):

def scrape_job_listings(query, location="remote", pages=5):
    """Scrape Indeed job listings through the scraping proxy.

    Args:
        query: Search keywords, e.g. "python developer".
        location: Location filter; defaults to "remote".
        pages: Number of result pages to fetch (Indeed paginates in
            steps of 10 via the ``start`` offset).

    Returns:
        A list of dicts with title, company, salary, description preview,
        and scrape metadata. Cards without a title are skipped.
    """
    all_jobs = []

    for page in range(pages):
        url = f"https://www.indeed.com/jobs?q={query}&l={location}&start={page * 10}"

        params = {
            "api_key": SCRAPER_API_KEY,
            "url": url,
            "render": "true"  # execute JS so dynamically-loaded cards appear
        }

        # Fix: without a timeout a stalled proxy request hangs the whole run.
        response = requests.get(BASE_URL, params=params, timeout=60)

        # Fix: only parse successful responses; error pages contain no
        # job cards and would previously be scanned pointlessly.
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "html.parser")

            for card in soup.select(".job_seen_beacon"):
                title = card.select_one(".jobTitle")
                company = card.select_one(".companyName")
                salary = card.select_one(".salary-snippet-container")
                snippet = card.select_one(".job-snippet")

                # A card with no title element is not a usable listing.
                if title:
                    all_jobs.append({
                        "title": title.text.strip(),
                        "company": company.text.strip() if company else "",
                        "salary": salary.text.strip() if salary else "",
                        "description_preview": snippet.text.strip() if snippet else "",
                        "query": query,
                        "location": location,
                        "scraped_at": datetime.now().isoformat()
                    })

        time.sleep(3)  # Respect rate limits

    return all_jobs
Enter fullscreen mode Exit fullscreen mode

Extracting Technical Skills

The key is parsing unstructured job descriptions to identify specific technologies:

# Regex fragments for each skill, grouped by category. Raw strings are used
# wherever a regex metacharacter needs escaping ("next\.js" in a plain
# string is an invalid escape sequence and emits a SyntaxWarning on
# modern Python).
SKILL_PATTERNS = {
    "languages": [
        "python", "javascript", "typescript", "java", "go", "rust",
        r"c\+\+", "c#", "ruby", "php", "swift", "kotlin", "scala"
    ],
    "frameworks": [
        "react", "angular", "vue", "django", "flask", "fastapi",
        "spring", "express", r"next\.js", "rails", "laravel"
    ],
    "databases": [
        "postgresql", "mysql", "mongodb", "redis", "elasticsearch",
        "dynamodb", "cassandra", "sqlite"
    ],
    "cloud": [
        "aws", "azure", "gcp", "docker", "kubernetes",
        "terraform", "ansible", "jenkins", "github actions"
    ],
    "ai_ml": [
        "machine learning", "deep learning", "tensorflow",
        "pytorch", "llm", "langchain", "openai", "hugging face",
        "nlp", "computer vision"
    ]
}

def extract_skills(job_description):
    """Extract technical skills from job description text.

    Args:
        job_description: Free-form job text (matched case-insensitively).

    Returns:
        Dict mapping category name -> list of matched skill names (with
        regex escapes stripped). Categories with no matches are omitted.
    """
    found_skills = {}
    text = job_description.lower()

    for category, skills in SKILL_PATTERNS.items():
        matched = []
        for skill in skills:
            # Fix: \b anchoring fails for skills that end in non-word
            # characters — \b needs a word/non-word transition, so
            # r"\bc\+\+\b" never matched "c++ developer". Lookarounds
            # simply require no adjacent word character.
            pattern = r'(?<!\w)' + skill + r'(?!\w)'
            if re.search(pattern, text):
                matched.append(skill.replace('\\', ''))
        if matched:
            found_skills[category] = matched

    return found_skills

def extract_experience_level(text):
    """Classify job text as "senior", "junior", "mid", or "unspecified".

    Matching is case-insensitive and checked in priority order: senior
    markers win over junior markers, which win over mid markers.
    """
    lowered = text.lower()
    level_markers = (
        ("senior", ("senior", "sr.", "lead", "principal", "staff")),
        ("junior", ("junior", "jr.", "entry level", "entry-level", "associate")),
        ("mid", ("mid-level", "mid level", "intermediate")),
    )
    for level, markers in level_markers:
        if any(marker in lowered for marker in markers):
            return level
    return "unspecified"
Enter fullscreen mode Exit fullscreen mode

Tracking Demand Over Time

def analyze_skill_demand(jobs):
    """Aggregate skill mentions across scraped job listings.

    Args:
        jobs: Job dicts as produced by scrape_job_listings.

    Returns:
        Dict with the total job count, the 20 most common skills overall,
        and the top 10 skills within each category.
    """
    all_skills = Counter()
    category_counts = {}

    for job in jobs:
        # Only a short preview is scraped, so search title + preview text.
        desc = job.get("description_preview", "") + " " + job.get("title", "")
        skills = extract_skills(desc)

        for category, skill_list in skills.items():
            # setdefault replaces the manual "if category not in ..." init.
            counts = category_counts.setdefault(category, Counter())
            for skill in skill_list:
                all_skills[skill] += 1
                counts[skill] += 1

    return {
        "total_jobs_analyzed": len(jobs),
        "top_skills": all_skills.most_common(20),
        "by_category": {
            cat: counts.most_common(10)
            for cat, counts in category_counts.items()
        }
    }

def compare_periods(current_data, previous_data):
    """Compare skill demand between two analysis snapshots.

    Args:
        current_data: analyze_skill_demand() result for the newer period.
        previous_data: analyze_skill_demand() result for the older period.

    Returns:
        Per-skill trend dicts sorted by percentage change, descending.
        A skill absent from the previous period counts as +100%.
    """
    now = dict(current_data["top_skills"])
    before = dict(previous_data["top_skills"])

    def percent_change(cur, prev):
        # Avoid division by zero: brand-new skills are treated as +100%.
        if prev > 0:
            return ((cur - prev) / prev) * 100
        return 100 if cur > 0 else 0

    trends = []
    for skill in set(now) | set(before):
        cur, prev = now.get(skill, 0), before.get(skill, 0)
        trends.append({
            "skill": skill,
            "current": cur,
            "previous": prev,
            "change_pct": round(percent_change(cur, prev), 1)
        })

    trends.sort(key=lambda entry: entry["change_pct"], reverse=True)
    return trends
Enter fullscreen mode Exit fullscreen mode

Data Storage

import sqlite3

def store_job_data(jobs, db_path="job_skills.db"):
    """Store scraped jobs in SQLite for trend analysis.

    Args:
        jobs: Job dicts from scrape_job_listings; skills and experience
            level are derived here before insertion.
        db_path: Path to the SQLite database file.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute("""
            CREATE TABLE IF NOT EXISTS jobs (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                title TEXT, company TEXT, salary TEXT,
                skills TEXT, experience_level TEXT,
                location TEXT, scraped_at TEXT
            )
        """)

        for job in jobs:
            skills = extract_skills(job.get("description_preview", ""))
            level = extract_experience_level(job.get("title", ""))

            conn.execute(
                "INSERT INTO jobs (title, company, salary, skills, experience_level, location, scraped_at) VALUES (?, ?, ?, ?, ?, ?, ?)",
                (job["title"], job["company"], job["salary"],
                 json.dumps(skills), level, job["location"],
                 job["scraped_at"])
            )

        conn.commit()
    finally:
        # Fix: the original leaked the connection if any statement raised.
        conn.close()

if __name__ == "__main__":
    # Collect listings for several representative roles, then summarize
    # overall skill demand as JSON.
    searches = ["python developer", "react developer", "devops engineer", "ml engineer"]
    collected = []
    for query in searches:
        listings = scrape_job_listings(query, pages=3)
        collected.extend(listings)
        print(f"{query}: {len(listings)} listings")
        time.sleep(5)  # pause between queries to stay polite

    print(json.dumps(analyze_skill_demand(collected), indent=2))
Enter fullscreen mode Exit fullscreen mode

Conclusion

Tracking technical skill demand through job posting analysis reveals real market signals — not opinions or surveys. By scraping regularly and comparing periods, you can spot emerging technologies and declining ones before they become obvious. Use ScraperAPI to handle the anti-bot measures on major job boards, and build your own skills intelligence dashboard.

Top comments (0)