Infrastructure Data is Public Gold
Governments publish massive amounts of infrastructure data: building permits, road projects, zoning changes. This data drives investment decisions worth billions.
What to Monitor
- Building permits signal new construction and growth
- Zoning changes hint at major developments
- Transportation projects indicate expansion corridors
- Federal spending contracts reveal priorities
Setup
pip install requests beautifulsoup4 pandas schedule
Government sites are inconsistent. ScraperAPI handles legacy portal rendering quirks.
Scraping Building Permits
import re
from datetime import datetime
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup
SCRAPER_API_KEY = "YOUR_KEY"  # ScraperAPI credential; replace before running

def scrape_building_permits(city_portal_url):
    """Scrape a city permit portal (via ScraperAPI with JS rendering) into a DataFrame.

    Parameters
    ----------
    city_portal_url : str
        Public URL of the city's permit listing page.

    Returns
    -------
    pandas.DataFrame
        Columns: permit_number, address, type, value, scraped_at.
        Empty frame when no matching rows are found.

    Raises
    ------
    requests.HTTPError
        If ScraperAPI returns a non-2xx status.
    """
    # URL-encode the target so any query string it carries doesn't get
    # swallowed by *our* query string, and use HTTPS so the API key is
    # not sent in cleartext.
    api_url = (
        "https://api.scraperapi.com"
        f"?api_key={SCRAPER_API_KEY}&url={quote_plus(city_portal_url)}&render=true"
    )
    response = requests.get(api_url, timeout=60)  # rendered fetches can be slow
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    permits = []
    # Two selectors cover the common portal layouts: classic tables and div rows.
    for row in soup.select("table.permits tr, .permit-row"):
        cols = row.select("td")
        if len(cols) >= 4:  # skips header rows and rows missing expected cells
            permits.append({
                "permit_number": cols[0].text.strip(),
                "address": cols[1].text.strip(),
                "type": cols[2].text.strip(),
                "value": parse_currency(cols[3].text),
                "scraped_at": datetime.now().isoformat(),
            })
    return pd.DataFrame(permits)
def parse_currency(text):
    """Parse a currency string such as "$1,234.56" into a float.

    Returns 0 when *text* contains no numeric amount. The previous
    pattern ``[\\d.]+`` could match dot-only or multi-dot runs
    ("...", "1.2.3") and crash ``float()``; this pattern only matches
    a well-formed number (digits with at most one decimal part).
    """
    match = re.search(r"\d+(?:\.\d+)?", text.replace(",", ""))
    return float(match.group()) if match else 0
Federal Infrastructure Spending via USASpending API
def scrape_usaspending(keyword, fiscal_year=2026):
    """Query the USASpending API for contract awards matching *keyword*.

    Parameters
    ----------
    keyword : str
        Search term matched against award text.
    fiscal_year : int
        US federal fiscal year. FY N runs 1 Oct of N-1 through 30 Sep of N.

    Returns
    -------
    pandas.DataFrame
        Columns: award_id, recipient, amount, description.
        First page only (up to 100 rows).

    Raises
    ------
    requests.HTTPError
        If the API returns a non-2xx status.
    """
    url = "https://api.usaspending.gov/api/v2/search/spending_by_award/"
    payload = {
        "filters": {
            "keywords": [keyword],
            # Federal FY N spans Oct 1 of N-1 through Sep 30 of N; the old
            # calendar-year window mislabeled a quarter of each year's awards.
            "time_period": [{
                "start_date": f"{fiscal_year - 1}-10-01",
                "end_date": f"{fiscal_year}-09-30",
            }],
            # A-D are the definitive contract award type codes.
            "award_type_codes": ["A", "B", "C", "D"],
        },
        "fields": ["Award ID", "Recipient Name", "Award Amount", "Description"],
        "limit": 100,  # first page only; add pagination if >100 rows matter
        "page": 1,
    }
    response = requests.post(url, json=payload, timeout=30)
    response.raise_for_status()
    contracts = [
        {
            "award_id": r.get("Award ID"),
            "recipient": r.get("Recipient Name"),
            "amount": r.get("Award Amount"),
            "description": r.get("Description"),
        }
        for r in response.json().get("results", [])
    ]
    return pd.DataFrame(contracts)
Change Detection Engine
import hashlib
import json
import os
STATE_FILE = "infra_state.json"  # persisted between runs; one entry per source


def detect_changes(current_data, source_name):
    """Return record IDs present in *current_data* but absent from the last run.

    The first column of *current_data* is treated as the record ID. A
    content hash plus the known ID set is persisted per source in
    STATE_FILE, so comparisons survive process restarts.

    NOTE(review): the hash covers the full frame, so volatile columns
    (e.g. a scraped_at timestamp) defeat the hash short-circuit and force
    the ID diff every run — harmless, but worth confirming with callers.

    Parameters
    ----------
    current_data : pandas.DataFrame
    source_name : str

    Returns
    -------
    list
        Newly seen IDs; empty when nothing changed. On the first run for
        a source, every current ID is reported as new.
    """
    # Empty frames have no column 0; treat as "no records" rather than
    # letting iloc raise IndexError (the previous behavior).
    curr_ids = set() if current_data.empty else set(current_data.iloc[:, 0].tolist())
    current_hash = hashlib.md5(current_data.to_json().encode()).hexdigest()

    state = load_state()
    source_state = state.get(source_name, {})
    if source_state.get("hash") == current_hash:
        return []

    new_entries = curr_ids - set(source_state.get("ids", []))
    state[source_name] = {"hash": current_hash, "ids": list(curr_ids)}
    save_state(state)
    return list(new_entries)


def load_state():
    """Load the persisted per-source state; empty dict when none exists yet."""
    if os.path.exists(STATE_FILE):
        with open(STATE_FILE) as f:
            return json.load(f)
    return {}


def save_state(state):
    """Overwrite STATE_FILE with *state* as pretty-printed JSON."""
    with open(STATE_FILE, "w") as f:
        json.dump(state, f, indent=2)
Scheduled Monitoring
import schedule
import time
def monitoring_job():
    """Poll every configured source once and report any newly seen entries.

    A source mapped to a URL is scraped as a city permit portal; a source
    mapped to None falls back to the federal USASpending "infrastructure"
    search. Per-source failures are printed and skipped so one broken
    portal cannot abort the rest of the sweep.
    """
    sources = {
        "nyc_permits": "https://data.cityofnewyork.us/permits",
        "federal_infra": None,
    }
    for source_name, portal_url in sources.items():
        try:
            if portal_url:
                frame = scrape_building_permits(portal_url)
            else:
                frame = scrape_usaspending("infrastructure")
            new_entries = detect_changes(frame, source_name)
            if new_entries:
                print(f"New entries in {source_name}: {len(new_entries)}")
        except Exception as e:
            print(f"Error {source_name}: {e}")
# Sweep all sources every 6 hours; run_pending() fires any job that is due.
schedule.every(6).hours.do(monitoring_job)
while True:
    schedule.run_pending()
    time.sleep(60)  # 1-minute wake-up granularity is plenty for a 6-hour cadence
Proxy Considerations
Government sites vary in sophistication. Use ThorData residential proxies for sites blocking datacenter IPs. Monitor with ScrapeOps.
Conclusion
Infrastructure monitoring gives early market signals. With Python and ScraperAPI, you can track billions in spending as it happens.
Top comments (0)