Scraping Hacker News Job Postings: Tech Hiring Trends Analysis

#python #tutorial #webdev #programming

Every month, Hacker News (HN) runs an "Ask HN: Who is hiring?" thread with hundreds of real job postings. This data reveals which technologies companies actually use and which skills they pay for.

Fetching via the HN API

import requests, re, time
from collections import Counter

class HNJobScraper:
    def __init__(self):
        self.api = "https://hacker-news.firebaseio.com/v0"

    def find_threads(self, months=6, scan_limit=50, timeout=10):
        """Return IDs of "Who is hiring?" threads posted by ``whoishiring``.

        Walks the account's submissions in the order the API lists them and
        keeps the items whose title contains "who is hiring".

        Args:
            months: Maximum number of thread IDs to return.
            scan_limit: How many of the account's submissions to inspect
                before giving up.
            timeout: Seconds to wait for each HTTP response.

        Returns:
            A list of at most ``months`` item IDs. Empty when ``months`` is
            not positive or the account has no readable submissions.

        Raises:
            requests.RequestException: On a timeout, a connection failure or
                an HTTP error status.
        """
        if months <= 0:
            return []

        resp = requests.get(f"{self.api}/user/whoishiring.json", timeout=timeout)
        resp.raise_for_status()
        # Guard against a `null` JSON body, which would break the subscript.
        user = resp.json() or {}

        ids = []
        for sid in user.get("submitted", [])[:scan_limit]:
            resp = requests.get(f"{self.api}/item/{sid}.json", timeout=timeout)
            resp.raise_for_status()
            item = resp.json()
            if item and "who is hiring" in item.get("title", "").lower():
                ids.append(sid)
                if len(ids) >= months:
                    break
            time.sleep(0.2)  # small pause between consecutive item lookups
        return ids

    def scrape_thread(self, tid, timeout=10):
        """Fetch every top-level comment of a thread and parse each one.

        Args:
            tid: Item ID of the thread to read.
            timeout: Seconds to wait for each HTTP response.

        Returns:
            A list holding one ``parse`` result per fetched comment that is
            not flagged as deleted. Empty when the thread body is empty or
            lists no ``kids``.

        Raises:
            requests.RequestException: On a timeout, a connection failure or
                an HTTP error status.
        """
        resp = requests.get(f"{self.api}/item/{tid}.json", timeout=timeout)
        resp.raise_for_status()
        # Guard against a `null` JSON body, which would break `.get` below.
        thread = resp.json() or {}

        posts = []
        for cid in thread.get("kids", []):
            resp = requests.get(f"{self.api}/item/{cid}.json", timeout=timeout)
            resp.raise_for_status()
            c = resp.json()
            if c and not c.get("deleted"):
                # `or ""` also covers a present-but-null text field.
                posts.append(self.parse(c.get("text") or ""))
            time.sleep(0.1)  # small pause between consecutive comment lookups
        return posts

Parsing Postings

    def parse(self, html):
        """Extract the remote flag, salary range and technologies of a posting.

        Args:
            html: The posting body as an HTML fragment. ``None`` is treated
                as an empty string.

        Returns:
            A dict with three keys: ``"technologies"`` (list of labels, in
            the order of the pattern table below), ``"remote"`` (True when
            the text contains "remote", "wfh" or "hybrid" as a whole word)
            and ``"salary"`` (``None`` or ``{"low": int, "high": int}``
            taken from the first dollar range found).
        """
        from bs4 import BeautifulSoup

        # Join text nodes with a space. Without a separator, neighbouring
        # paragraphs are glued together ("Remote<p>Python" -> "RemotePython")
        # and the \b-anchored patterns below silently miss them.
        text = BeautifulSoup(html or "", "html.parser").get_text(separator=" ")
        p = {"technologies": [], "remote": False, "salary": None}
        if re.search(r"\b(remote|wfh|hybrid)\b", text, re.I):
            p["remote"] = True

        # Each amount must start with a digit so that a run of commas can
        # never reach int(). Hyphen, en dash and em dash all count as the
        # range separator.
        m = re.search(
            r"\$(\d[\d,]*)[kK]?\s*[-\u2013\u2014]\s*\$?(\d[\d,]*)[kK]?", text
        )
        if m:
            lo, hi = (int(g.replace(",", "")) for g in m.groups())
            # Amounts below 1000 are read as thousands ("$120k", "$120-150k").
            if lo < 1000:
                lo *= 1000
            if hi < 1000:
                hi *= 1000
            p["salary"] = {"low": lo, "high": hi}

        techs = {
            "Python": r"\bpython\b",
            "Rust": r"\brust\b",
            # "go" is also an everyday verb, so the bare word only counts
            # when capitalised; (?-i:...) switches off re.I for that branch.
            "Go": r"\bgolang\b|(?-i:\bGo\b)",
            "TypeScript": r"\btypescript\b",
            "React": r"\breact\b",
            "PostgreSQL": r"\bpostgres",
            "K8s": r"\bk8s\b|\bkubernetes\b",
            "AWS": r"\baws\b",
            "Docker": r"\bdocker\b",
            "AI/ML": r"\b(machine learning|ml|ai|llm|gpt)\b",
        }
        for t, pat in techs.items():
            if re.search(pat, text, re.I):
                p["technologies"].append(t)
        return p

Analyzing Trends

    def analyze(self, posts):
        """Print technology, remote-work and salary statistics for postings.

        Args:
            posts: A sequence of dicts with the keys ``"technologies"``,
                ``"remote"`` and ``"salary"``, as produced by ``parse``.
                May be empty.

        Returns:
            None. The report is written to standard output: a totals line,
            up to 15 technologies by frequency, and the average salary
            bounds when at least one posting carries a salary.
        """
        tc = Counter()
        salaries, remote = [], 0
        for p in posts:
            tc.update(p["technologies"])
            if p["salary"]:
                salaries.append(p["salary"])
            if p["remote"]:
                remote += 1
        n = len(posts)

        def share(count):
            # With no postings every share is 0 instead of a division by zero.
            return count / n * 100 if n else 0.0

        print(f"Total: {n} | Remote: {remote} ({share(remote):.0f}%)")
        for t, c in tc.most_common(15):
            print(f"  {t:15s} {c:4d} ({share(c):.1f}%)")
        if salaries:
            avg_low = sum(s["low"] for s in salaries) / len(salaries)
            avg_high = sum(s["high"] for s in salaries) / len(salaries)
            print(f"Avg salary: ${avg_low:,.0f} - ${avg_high:,.0f}")

# Run the scrape only when executed as a script, so that importing this
# module to reuse HNJobScraper does not start hundreds of network requests.
if __name__ == "__main__":
    s = HNJobScraper()
    all_posts = []
    for tid in s.find_threads(6):
        all_posts.extend(s.scrape_thread(tid))
        time.sleep(1)  # pause between threads
    s.analyze(all_posts)

Scaling

Use ScraperAPI for reliable requests, ScrapeOps for monitoring, and ThorData for career page proxies.

Insights

- Technology momentum quarter over quarter
- Real salary benchmarks by stack and location
- Remote work trend tracking
- Startup hiring patterns revealed through job posts

DEV Community