DEV Community

agenthustler
agenthustler

Posted on

Scraping Technical Job Requirements: Skills Demand Over Time

Introduction

Understanding which technical skills employers actually want — and how demand shifts over time — is invaluable for career planning, hiring strategy, and education curriculum design. Job postings contain a goldmine of structured data about required technologies, experience levels, and compensation. In this tutorial, we'll build a scraper that tracks technical skill demand across job boards.

Setup

import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import re
import time
from datetime import datetime
from collections import Counter

# Route requests through a scraping proxy service to handle job-board
# anti-bot measures. Replace the placeholder with your own API key.
SCRAPER_API_KEY = "your_key_here"
# All page fetches go to this endpoint with the real target URL passed
# as the "url" query parameter (see scrape_job_listings below).
BASE_URL = "http://api.scraperapi.com"
Enter fullscreen mode Exit fullscreen mode

Scraping Job Listings

Job boards like Indeed and LinkedIn list thousands of technical positions (Stack Overflow Jobs, once a popular source, was discontinued in 2022):

def scrape_job_listings(query, location="remote", pages=5):
    """Scrape Indeed job listings through the scraping proxy.

    Args:
        query: Search keywords, e.g. "python developer".
        location: Location filter; defaults to "remote".
        pages: Number of result pages to fetch (Indeed paginates in
            steps of 10 via the ``start`` offset).

    Returns:
        A list of dicts with title, company, salary, description preview,
        and scrape metadata. Cards without a title are skipped.
    """
    all_jobs = []

    for page in range(pages):
        url = f"https://www.indeed.com/jobs?q={query}&l={location}&start={page * 10}"

        params = {
            "api_key": SCRAPER_API_KEY,
            "url": url,
            "render": "true"  # execute JS so dynamically-loaded cards appear
        }

        # Fix: without a timeout a stalled proxy request hangs the whole run.
        response = requests.get(BASE_URL, params=params, timeout=60)

        # Fix: only parse successful responses; error pages contain no
        # job cards and would previously be scanned pointlessly.
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "html.parser")

            for card in soup.select(".job_seen_beacon"):
                title = card.select_one(".jobTitle")
                company = card.select_one(".companyName")
                salary = card.select_one(".salary-snippet-container")
                snippet = card.select_one(".job-snippet")

                # A card with no title element is not a usable listing.
                if title:
                    all_jobs.append({
                        "title": title.text.strip(),
                        "company": company.text.strip() if company else "",
                        "salary": salary.text.strip() if salary else "",
                        "description_preview": snippet.text.strip() if snippet else "",
                        "query": query,
                        "location": location,
                        "scraped_at": datetime.now().isoformat()
                    })

        time.sleep(3)  # Respect rate limits

    return all_jobs
Enter fullscreen mode Exit fullscreen mode

Extracting Technical Skills

The key is parsing unstructured job descriptions to identify specific technologies:

# Regex fragments for each skill, grouped by category. Raw strings are used
# wherever a regex metacharacter needs escaping ("next\.js" in a plain
# string is an invalid escape sequence and emits a SyntaxWarning on
# modern Python).
SKILL_PATTERNS = {
    "languages": [
        "python", "javascript", "typescript", "java", "go", "rust",
        r"c\+\+", "c#", "ruby", "php", "swift", "kotlin", "scala"
    ],
    "frameworks": [
        "react", "angular", "vue", "django", "flask", "fastapi",
        "spring", "express", r"next\.js", "rails", "laravel"
    ],
    "databases": [
        "postgresql", "mysql", "mongodb", "redis", "elasticsearch",
        "dynamodb", "cassandra", "sqlite"
    ],
    "cloud": [
        "aws", "azure", "gcp", "docker", "kubernetes",
        "terraform", "ansible", "jenkins", "github actions"
    ],
    "ai_ml": [
        "machine learning", "deep learning", "tensorflow",
        "pytorch", "llm", "langchain", "openai", "hugging face",
        "nlp", "computer vision"
    ]
}

def extract_skills(job_description):
    """Extract technical skills from job description text.

    Args:
        job_description: Free-form job text (matched case-insensitively).

    Returns:
        Dict mapping category name -> list of matched skill names (with
        regex escapes stripped). Categories with no matches are omitted.
    """
    found_skills = {}
    text = job_description.lower()

    for category, skills in SKILL_PATTERNS.items():
        matched = []
        for skill in skills:
            # Fix: \b anchoring fails for skills that end in non-word
            # characters — \b needs a word/non-word transition, so
            # r"\bc\+\+\b" never matched "c++ developer". Lookarounds
            # simply require no adjacent word character.
            pattern = r'(?<!\w)' + skill + r'(?!\w)'
            if re.search(pattern, text):
                matched.append(skill.replace('\\', ''))
        if matched:
            found_skills[category] = matched

    return found_skills

def extract_experience_level(text):
    """Classify job text as "senior", "junior", "mid", or "unspecified".

    Matching is case-insensitive and checked in priority order: senior
    markers win over junior markers, which win over mid markers.
    """
    lowered = text.lower()
    level_markers = (
        ("senior", ("senior", "sr.", "lead", "principal", "staff")),
        ("junior", ("junior", "jr.", "entry level", "entry-level", "associate")),
        ("mid", ("mid-level", "mid level", "intermediate")),
    )
    for level, markers in level_markers:
        if any(marker in lowered for marker in markers):
            return level
    return "unspecified"
Enter fullscreen mode Exit fullscreen mode

Tracking Demand Over Time

def analyze_skill_demand(jobs):
    """Aggregate skill mentions across scraped job listings.

    Args:
        jobs: Job dicts as produced by scrape_job_listings.

    Returns:
        Dict with the total job count, the 20 most common skills overall,
        and the top 10 skills within each category.
    """
    all_skills = Counter()
    category_counts = {}

    for job in jobs:
        # Only a short preview is scraped, so search title + preview text.
        desc = job.get("description_preview", "") + " " + job.get("title", "")
        skills = extract_skills(desc)

        for category, skill_list in skills.items():
            # setdefault replaces the manual "if category not in ..." init.
            counts = category_counts.setdefault(category, Counter())
            for skill in skill_list:
                all_skills[skill] += 1
                counts[skill] += 1

    return {
        "total_jobs_analyzed": len(jobs),
        "top_skills": all_skills.most_common(20),
        "by_category": {
            cat: counts.most_common(10)
            for cat, counts in category_counts.items()
        }
    }

def compare_periods(current_data, previous_data):
    """Compare skill demand between two analysis snapshots.

    Args:
        current_data: analyze_skill_demand() result for the newer period.
        previous_data: analyze_skill_demand() result for the older period.

    Returns:
        Per-skill trend dicts sorted by percentage change, descending.
        A skill absent from the previous period counts as +100%.
    """
    now = dict(current_data["top_skills"])
    before = dict(previous_data["top_skills"])

    def percent_change(cur, prev):
        # Avoid division by zero: brand-new skills are treated as +100%.
        if prev > 0:
            return ((cur - prev) / prev) * 100
        return 100 if cur > 0 else 0

    trends = []
    for skill in set(now) | set(before):
        cur, prev = now.get(skill, 0), before.get(skill, 0)
        trends.append({
            "skill": skill,
            "current": cur,
            "previous": prev,
            "change_pct": round(percent_change(cur, prev), 1)
        })

    trends.sort(key=lambda entry: entry["change_pct"], reverse=True)
    return trends
Enter fullscreen mode Exit fullscreen mode

Data Storage

import sqlite3

def store_job_data(jobs, db_path="job_skills.db"):
    """Store scraped jobs in SQLite for trend analysis.

    Args:
        jobs: Job dicts from scrape_job_listings; skills and experience
            level are derived here before insertion.
        db_path: Path to the SQLite database file.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute("""
            CREATE TABLE IF NOT EXISTS jobs (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                title TEXT, company TEXT, salary TEXT,
                skills TEXT, experience_level TEXT,
                location TEXT, scraped_at TEXT
            )
        """)

        for job in jobs:
            skills = extract_skills(job.get("description_preview", ""))
            level = extract_experience_level(job.get("title", ""))

            conn.execute(
                "INSERT INTO jobs (title, company, salary, skills, experience_level, location, scraped_at) VALUES (?, ?, ?, ?, ?, ?, ?)",
                (job["title"], job["company"], job["salary"],
                 json.dumps(skills), level, job["location"],
                 job["scraped_at"])
            )

        conn.commit()
    finally:
        # Fix: the original leaked the connection if any statement raised.
        conn.close()

if __name__ == "__main__":
    # Collect listings for several representative roles, then summarize
    # overall skill demand as JSON.
    searches = ["python developer", "react developer", "devops engineer", "ml engineer"]
    collected = []
    for query in searches:
        listings = scrape_job_listings(query, pages=3)
        collected.extend(listings)
        print(f"{query}: {len(listings)} listings")
        time.sleep(5)  # pause between queries to stay polite

    print(json.dumps(analyze_skill_demand(collected), indent=2))
Enter fullscreen mode Exit fullscreen mode

Conclusion

Tracking technical skill demand through job posting analysis reveals real market signals — not opinions or surveys. By scraping regularly and comparing periods, you can spot emerging technologies and declining ones before they become obvious. Use ScraperAPI to handle the anti-bot measures on major job boards, and build your own skills intelligence dashboard.

Top comments (0)