Infrastructure Data is Public Gold
Governments publish massive amounts of infrastructure data: building permits, road projects, zoning changes. This data drives investment decisions worth billions.
What to Monitor
- Building permits signal new construction and growth
- Zoning changes hint at major developments
- Transportation projects indicate expansion corridors
- Federal spending contracts reveal priorities
Setup
pip install requests beautifulsoup4 pandas schedule
Government sites are inconsistent. ScraperAPI handles legacy portal rendering quirks.
Scraping Building Permits
import re
from datetime import datetime
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup
SCRAPER_API_KEY = "YOUR_KEY"  # ScraperAPI credential; replace before running

def scrape_building_permits(city_portal_url):
    """Scrape a city permit portal (via ScraperAPI with JS rendering) into a DataFrame.

    Parameters
    ----------
    city_portal_url : str
        Public URL of the city's permit listing page.

    Returns
    -------
    pandas.DataFrame
        Columns: permit_number, address, type, value, scraped_at.
        Empty frame when no matching rows are found.

    Raises
    ------
    requests.HTTPError
        If ScraperAPI returns a non-2xx status.
    """
    # URL-encode the target so any query string it carries doesn't get
    # swallowed by *our* query string, and use HTTPS so the API key is
    # not sent in cleartext.
    api_url = (
        "https://api.scraperapi.com"
        f"?api_key={SCRAPER_API_KEY}&url={quote_plus(city_portal_url)}&render=true"
    )
    response = requests.get(api_url, timeout=60)  # rendered fetches can be slow
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    permits = []
    # Two selectors cover the common portal layouts: classic tables and div rows.
    for row in soup.select("table.permits tr, .permit-row"):
        cols = row.select("td")
        if len(cols) >= 4:  # skips header rows and rows missing expected cells
            permits.append({
                "permit_number": cols[0].text.strip(),
                "address": cols[1].text.strip(),
                "type": cols[2].text.strip(),
                "value": parse_currency(cols[3].text),
                "scraped_at": datetime.now().isoformat(),
            })
    return pd.DataFrame(permits)
def parse_currency(text):
    """Parse a currency string such as "$1,234.56" into a float.

    Returns 0 when *text* contains no numeric amount. The previous
    pattern ``[\\d.]+`` could match dot-only or multi-dot runs
    ("...", "1.2.3") and crash ``float()``; this pattern only matches
    a well-formed number (digits with at most one decimal part).
    """
    match = re.search(r"\d+(?:\.\d+)?", text.replace(",", ""))
    return float(match.group()) if match else 0
Federal Infrastructure Spending via USASpending API
def scrape_usaspending(keyword, fiscal_year=2026):
    """Query the USASpending API for contract awards matching *keyword*.

    Parameters
    ----------
    keyword : str
        Search term matched against award text.
    fiscal_year : int
        US federal fiscal year. FY N runs 1 Oct of N-1 through 30 Sep of N.

    Returns
    -------
    pandas.DataFrame
        Columns: award_id, recipient, amount, description.
        First page only (up to 100 rows).

    Raises
    ------
    requests.HTTPError
        If the API returns a non-2xx status.
    """
    url = "https://api.usaspending.gov/api/v2/search/spending_by_award/"
    payload = {
        "filters": {
            "keywords": [keyword],
            # Federal FY N spans Oct 1 of N-1 through Sep 30 of N; the old
            # calendar-year window mislabeled a quarter of each year's awards.
            "time_period": [{
                "start_date": f"{fiscal_year - 1}-10-01",
                "end_date": f"{fiscal_year}-09-30",
            }],
            # A-D are the definitive contract award type codes.
            "award_type_codes": ["A", "B", "C", "D"],
        },
        "fields": ["Award ID", "Recipient Name", "Award Amount", "Description"],
        "limit": 100,  # first page only; add pagination if >100 rows matter
        "page": 1,
    }
    response = requests.post(url, json=payload, timeout=30)
    response.raise_for_status()
    contracts = [
        {
            "award_id": r.get("Award ID"),
            "recipient": r.get("Recipient Name"),
            "amount": r.get("Award Amount"),
            "description": r.get("Description"),
        }
        for r in response.json().get("results", [])
    ]
    return pd.DataFrame(contracts)
Change Detection Engine
import hashlib
import json
import os
STATE_FILE = "infra_state.json"  # persisted between runs; one entry per source


def detect_changes(current_data, source_name):
    """Return record IDs present in *current_data* but absent from the last run.

    The first column of *current_data* is treated as the record ID. A
    content hash plus the known ID set is persisted per source in
    STATE_FILE, so comparisons survive process restarts.

    NOTE(review): the hash covers the full frame, so volatile columns
    (e.g. a scraped_at timestamp) defeat the hash short-circuit and force
    the ID diff every run — harmless, but worth confirming with callers.

    Parameters
    ----------
    current_data : pandas.DataFrame
    source_name : str

    Returns
    -------
    list
        Newly seen IDs; empty when nothing changed. On the first run for
        a source, every current ID is reported as new.
    """
    # Empty frames have no column 0; treat as "no records" rather than
    # letting iloc raise IndexError (the previous behavior).
    curr_ids = set() if current_data.empty else set(current_data.iloc[:, 0].tolist())
    current_hash = hashlib.md5(current_data.to_json().encode()).hexdigest()

    state = load_state()
    source_state = state.get(source_name, {})
    if source_state.get("hash") == current_hash:
        return []

    new_entries = curr_ids - set(source_state.get("ids", []))
    state[source_name] = {"hash": current_hash, "ids": list(curr_ids)}
    save_state(state)
    return list(new_entries)


def load_state():
    """Load the persisted per-source state; empty dict when none exists yet."""
    if os.path.exists(STATE_FILE):
        with open(STATE_FILE) as f:
            return json.load(f)
    return {}


def save_state(state):
    """Overwrite STATE_FILE with *state* as pretty-printed JSON."""
    with open(STATE_FILE, "w") as f:
        json.dump(state, f, indent=2)
Scheduled Monitoring
import schedule
import time
def monitoring_job():
    """Poll every configured source once and report any newly seen entries.

    A source mapped to a URL is scraped as a city permit portal; a source
    mapped to None falls back to the federal USASpending "infrastructure"
    search. Per-source failures are printed and skipped so one broken
    portal cannot abort the rest of the sweep.
    """
    sources = {
        "nyc_permits": "https://data.cityofnewyork.us/permits",
        "federal_infra": None,
    }
    for source_name, portal_url in sources.items():
        try:
            if portal_url:
                frame = scrape_building_permits(portal_url)
            else:
                frame = scrape_usaspending("infrastructure")
            new_entries = detect_changes(frame, source_name)
            if new_entries:
                print(f"New entries in {source_name}: {len(new_entries)}")
        except Exception as e:
            print(f"Error {source_name}: {e}")
# Sweep all sources every 6 hours; run_pending() fires any job that is due.
schedule.every(6).hours.do(monitoring_job)
while True:
    schedule.run_pending()
    time.sleep(60)  # 1-minute wake-up granularity is plenty for a 6-hour cadence
Proxy Considerations
Government sites vary in sophistication. Use ThorData residential proxies for sites blocking datacenter IPs. Monitor with ScrapeOps.
Conclusion
Infrastructure monitoring gives early market signals. With Python and ScraperAPI, you can track billions in spending as it happens.
Top comments (0)