DEV Community

agenthustler
agenthustler

Posted on

How to Monitor Infrastructure Changes with Government Data Scraping

Infrastructure Data is Public Gold

Governments publish massive amounts of infrastructure data: building permits, road projects, zoning changes. This data drives investment decisions worth billions.

What to Monitor

  • Building permits signal new construction and growth
  • Zoning changes hint at major developments
  • Transportation projects indicate expansion corridors
  • Federal spending contracts reveal priorities

Setup

pip install requests beautifulsoup4 pandas schedule
Enter fullscreen mode Exit fullscreen mode

Government portals are inconsistent — many are legacy systems that only render their data through JavaScript. A rendering service such as ScraperAPI smooths over those quirks.

Scraping Building Permits

import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re

# ScraperAPI credential; replace the placeholder with a real key before running.
SCRAPER_API_KEY = "YOUR_KEY"

def scrape_building_permits(city_portal_url):
    """Scrape a city building-permit portal (via ScraperAPI) into a DataFrame.

    Each output row carries permit_number, address, type, a parsed dollar
    value, and an ISO scrape timestamp. Rows with fewer than four <td>
    columns are skipped.

    Raises requests.HTTPError if ScraperAPI returns an error status.
    """
    # Let requests build the query string so city_portal_url is URL-encoded —
    # interpolating it raw breaks as soon as the portal URL has its own "?"/"&".
    # render=true asks ScraperAPI to execute JavaScript on legacy portals.
    response = requests.get(
        "https://api.scraperapi.com",
        params={
            "api_key": SCRAPER_API_KEY,
            "url": city_portal_url,
            "render": "true",
        },
        timeout=90,  # rendered government pages can be slow; don't hang forever
    )
    response.raise_for_status()  # fail loudly instead of parsing an error page
    soup = BeautifulSoup(response.text, "html.parser")

    permits = []
    # Selector covers both table-based portals and div-based ".permit-row" ones.
    for row in soup.select("table.permits tr, .permit-row"):
        cols = row.select("td")
        if len(cols) >= 4:
            permits.append({
                "permit_number": cols[0].text.strip(),
                "address": cols[1].text.strip(),
                "type": cols[2].text.strip(),
                "value": parse_currency(cols[3].text),
                "scraped_at": datetime.now().isoformat(),
            })
    return pd.DataFrame(permits)

def parse_currency(text):
    """Extract a numeric dollar amount from a currency string like "$1,234.56".

    Returns 0.0 when the text contains no digits (e.g. "N/A" or "").
    """
    # Require at least one digit: the old pattern [\d.]+ could match a bare
    # "." (e.g. in "N/A."), which made float(".") raise ValueError.
    match = re.search(r"\d+(?:\.\d+)?", text.replace(",", ""))
    return float(match.group()) if match else 0.0
Enter fullscreen mode Exit fullscreen mode

Federal Infrastructure Spending via USASpending API

def scrape_usaspending(keyword, fiscal_year=2026):
    """Query the USASpending API for contract awards matching *keyword*.

    Returns a DataFrame with award_id, recipient, amount, and description
    for up to 100 awards in the given US federal fiscal year.

    Raises requests.HTTPError on a non-2xx API response.
    """
    url = "https://api.usaspending.gov/api/v2/search/spending_by_award/"
    payload = {
        "filters": {
            "keywords": [keyword],
            # US federal fiscal year N runs Oct 1 (N-1) through Sep 30 (N);
            # the calendar-year range previously used here mislabeled awards.
            "time_period": [{
                "start_date": f"{fiscal_year - 1}-10-01",
                "end_date": f"{fiscal_year}-09-30",
            }],
            # A-D are the contract award type codes (definitive contracts, IDVs).
            "award_type_codes": ["A", "B", "C", "D"],
        },
        "fields": ["Award ID", "Recipient Name", "Award Amount", "Description"],
        "limit": 100,
        "page": 1,
    }
    response = requests.post(url, json=payload, timeout=60)
    response.raise_for_status()  # surface API errors instead of KeyError later
    data = response.json()
    contracts = []
    for r in data.get("results", []):
        contracts.append({
            "award_id": r.get("Award ID"),
            "recipient": r.get("Recipient Name"),
            "amount": r.get("Award Amount"),
            "description": r.get("Description"),
        })
    return pd.DataFrame(contracts)
Enter fullscreen mode Exit fullscreen mode

Change Detection Engine

import hashlib
import json
import os

# JSON file persisting per-source content hashes and seen IDs between runs.
STATE_FILE = "infra_state.json"

def detect_changes(current_data, source_name):
    """Diff *current_data* against the persisted state for *source_name*.

    Returns the list of first-column IDs not seen in the previous run, or []
    when the data is unchanged. Updates the state file when a change is seen.
    """
    # md5 is used only as a cheap change fingerprint, not for security.
    current_hash = hashlib.md5(current_data.to_json().encode()).hexdigest()
    prev = load_state()
    prev_entry = prev.get(source_name, {})

    if prev_entry.get("hash") == current_hash:
        return []

    prev_ids = set(prev_entry.get("ids", []))
    # An empty scrape produces a DataFrame with no columns, where
    # iloc[:, 0] raises IndexError — treat it as "no current IDs".
    curr_ids = set() if current_data.empty else set(current_data.iloc[:, 0].tolist())
    new_entries = curr_ids - prev_ids
    prev[source_name] = {"hash": current_hash, "ids": list(curr_ids)}
    save_state(prev)
    return list(new_entries)

def load_state():
    """Return the persisted monitoring state, or an empty dict if none exists."""
    # EAFP: attempt the read and treat a missing file as "no prior state".
    try:
        with open(STATE_FILE) as fh:
            return json.load(fh)
    except FileNotFoundError:
        return {}

def save_state(state):
    """Persist the monitoring state dict to STATE_FILE as indented JSON."""
    with open(STATE_FILE, mode="w") as fh:
        json.dump(state, fh, indent=2)
Enter fullscreen mode Exit fullscreen mode

Scheduled Monitoring

import schedule
import time

def monitoring_job():
    """Scrape each configured source and report newly seen entries."""
    # A URL means "scrape this city portal"; None means "hit USASpending".
    sources = {
        "nyc_permits": "https://data.cityofnewyork.us/permits",
        "federal_infra": None,
    }
    for source_name, portal_url in sources.items():
        try:
            if portal_url:
                frame = scrape_building_permits(portal_url)
            else:
                frame = scrape_usaspending("infrastructure")
            new_entries = detect_changes(frame, source_name)
            if new_entries:
                print(f"New entries in {source_name}: {len(new_entries)}")
        except Exception as exc:
            # One failing source must not abort the whole monitoring cycle.
            print(f"Error {source_name}: {exc}")

# Guard the scheduler so importing this module for its helpers does not
# start an infinite polling loop as a side effect.
if __name__ == "__main__":
    schedule.every(6).hours.do(monitoring_job)
    while True:
        schedule.run_pending()
        # Wake once a minute; run_pending() only fires jobs that are due.
        time.sleep(60)
Enter fullscreen mode Exit fullscreen mode

Proxy Considerations

Government sites vary in sophistication. Use ThorData residential proxies for sites blocking datacenter IPs. Monitor with ScrapeOps.

Conclusion

Infrastructure monitoring gives early market signals. With Python and ScraperAPI, you can track billions in spending as it happens.

Top comments (0)