DEV Community

agenthustler
agenthustler

Posted on

Scraping Hacker News Job Postings: Tech Hiring Trends Analysis

Every month, Hacker News (HN) runs a "Who is hiring?" thread containing hundreds of real job postings. This data reveals which technologies companies actually use and which skills they pay for.

Fetching via the HN API

import requests, re, time
from collections import Counter

class HNJobScraper:
    def __init__(self):
        self.api = "https://hacker-news.firebaseio.com/v0"

    def find_threads(self, months=6, scan_limit=50):
        """Return item ids of "Who is hiring?" threads from ``whoishiring``.

        Walks the account's submissions in the order the API lists them and
        keeps those whose title contains "who is hiring" (case-insensitive).

        Args:
            months: Maximum number of thread ids to return. Values <= 0
                yield an empty list without touching the network.
            scan_limit: How many of the account's submissions to inspect
                before giving up (default 50).

        Returns:
            A list of at most ``months`` item ids.
        """
        if months <= 0:
            # Nothing was asked for; previously this still returned one id.
            return []

        timeout = 10  # seconds; without it a stalled connection blocks forever
        user = requests.get(
            f"{self.api}/user/whoishiring.json", timeout=timeout
        ).json()
        # The payload may be None (JSON null) or lack the key; treat both
        # as "no submissions" instead of crashing.
        submitted = (user or {}).get("submitted", [])

        ids = []
        for sid in submitted[:scan_limit]:
            item = requests.get(
                f"{self.api}/item/{sid}.json", timeout=timeout
            ).json()
            if item and "who is hiring" in item.get("title", "").lower():
                ids.append(sid)
                if len(ids) >= months:
                    break
            time.sleep(0.2)  # brief pause between item fetches
        return ids

    def scrape_thread(self, tid):
        """Fetch a thread's top-level comments and parse each as a posting.

        Args:
            tid: Item id of the thread to scrape.

        Returns:
            A list of dicts as produced by ``self.parse``, one per usable
            top-level comment. Empty when the thread cannot be loaded or has
            no ``kids``.
        """
        timeout = 10  # seconds; without it a stalled connection blocks forever
        thread = requests.get(
            f"{self.api}/item/{tid}.json", timeout=timeout
        ).json()
        if not thread:
            # A null response would otherwise crash on ``.get`` below.
            return []

        posts = []
        for cid in thread.get("kids", []):
            c = requests.get(
                f"{self.api}/item/{cid}.json", timeout=timeout
            ).json()
            # Skip missing, deleted and dead comments, and comments with no
            # body: none of them carries a posting, and counting them would
            # inflate the totals with empty entries.
            if c and not c.get("deleted") and not c.get("dead") and c.get("text"):
                posts.append(self.parse(c["text"]))
            time.sleep(0.1)  # brief pause between comment fetches
        return posts
Enter fullscreen mode Exit fullscreen mode

Parsing Postings

    def parse(self, html):
        """Extract remote flag, salary range and technologies from a posting.

        Args:
            html: The comment body as an HTML string. Empty or ``None``
                yields the default (empty) result.

        Returns:
            A dict with keys ``"technologies"`` (list of matched technology
            names), ``"remote"`` (bool) and ``"salary"`` (``None`` or a dict
            with integer ``"low"`` and ``"high"``).
        """
        p = {"technologies": [], "remote": False, "salary": None}
        if not html:
            return p

        from bs4 import BeautifulSoup
        # Join text nodes with a space. Without a separator, adjacent
        # paragraphs fuse ("Python<p>Remote" -> "PythonRemote") and the
        # word-boundary patterns below stop matching.
        text = BeautifulSoup(html, "html.parser").get_text(" ")
        if re.search(r"\b(remote|wfh|hybrid)\b", text, re.I):
            p["remote"] = True

        # Each number must start with a digit so a stray run of commas can
        # never reach int(""); the separator may be a hyphen, en dash or
        # em dash.
        m = re.search(
            r"\$(\d[\d,]*)[kK]?\s*[-\u2013\u2014]\s*\$?(\d[\d,]*)[kK]?", text
        )
        if m:
            lo = int(m.group(1).replace(",", ""))
            hi = int(m.group(2).replace(",", ""))
            # Heuristic: values below 1000 are read as thousands ("$120-150k").
            if lo < 1000:
                lo *= 1000
            if hi < 1000:
                hi *= 1000
            p["salary"] = {"low": lo, "high": hi}

        techs = {
            "Python": r"\bpython\b",
            "Rust": r"\brust\b",
            # "go" is also a common English verb, so the bare word is only
            # accepted when capitalised; "golang" stays case-insensitive.
            "Go": r"\bgolang\b|(?-i:\bGo\b)",
            "TypeScript": r"\btypescript\b",
            "React": r"\breact\b",
            "PostgreSQL": r"\bpostgres",
            "K8s": r"\bk8s\b|\bkubernetes\b",
            "AWS": r"\baws\b",
            "Docker": r"\bdocker\b",
            "AI/ML": r"\b(machine learning|ml|ai|llm|gpt)\b",
        }
        for t, pat in techs.items():
            if re.search(pat, text, re.I):
                p["technologies"].append(t)
        return p
Enter fullscreen mode Exit fullscreen mode

Analyzing Trends

    def analyze(self, posts):
        """Print a summary of technology, remote and salary statistics.

        Args:
            posts: A list of dicts shaped like the return value of
                ``parse`` (keys ``"technologies"``, ``"remote"``,
                ``"salary"``). May be empty.

        Returns:
            None. The report is written to stdout.
        """
        tc = Counter(t for p in posts for t in p["technologies"])
        salaries = [p["salary"] for p in posts if p["salary"]]
        remote = sum(1 for p in posts if p["remote"])
        n = len(posts)

        # With no postings there is nothing to divide by; report 0%
        # instead of raising ZeroDivisionError.
        remote_pct = remote / n * 100 if n else 0
        print(f"Total: {n} | Remote: {remote} ({remote_pct:.0f}%)")
        # tc is non-empty only when n > 0, so the division here is safe.
        for t, c in tc.most_common(15):
            print(f"  {t:15s} {c:4d} ({c/n*100:.1f}%)")
        if salaries:
            count = len(salaries)
            avg_low = sum(s["low"] for s in salaries) / count
            avg_high = sum(s["high"] for s in salaries) / count
            print(f"Avg salary: ${avg_low:,.0f} - ${avg_high:,.0f}")

# Driver: gather postings from up to six hiring threads, then print the report.
s = HNJobScraper()
all_posts = []
for tid in s.find_threads(6):
    all_posts += s.scrape_thread(tid)
    # Pause between threads, on top of the per-request sleeps in the scraper.
    time.sleep(1)
s.analyze(all_posts)
Enter fullscreen mode Exit fullscreen mode

Scaling

Use ScraperAPI for reliable requests, ScrapeOps for monitoring, and ThorData for career page proxies.

Insights

  • Technology momentum quarter over quarter
  • Real salary benchmarks by stack and location
  • Remote work trend tracking
  • Startup hiring patterns revealed through job posts

Top comments (0)