Government websites are treasure troves of public data. From business registrations to court filings, this data is public by law, making it one of the most legitimate scraping targets.
Scraping Open Data Portals
Many portals use CKAN or Socrata APIs:
import requests
import pandas as pd
class GovDataScraper:
    """Search the data.gov CKAN catalog and download tabular resources."""

    # Resource formats we treat as machine-readable downloads.
    _USABLE_FORMATS = ('CSV', 'JSON', 'XML')

    def search_data_gov(self, query, rows=50, timeout=30):
        """Search the data.gov catalog via the CKAN ``package_search`` action.

        Args:
            query: Free-text search string.
            rows: Maximum number of datasets to return.
            timeout: Per-request timeout in seconds (requests has no default
                timeout, so an unresponsive server would otherwise hang forever).

        Returns:
            A list of dicts with ``title``, ``org``, and ``resources`` — the
            resources filtered down to CSV/JSON/XML download links.

        Raises:
            requests.HTTPError: if the API responds with a non-2xx status.
        """
        url = 'https://catalog.data.gov/api/3/action/package_search'
        resp = requests.get(url, params={'q': query, 'rows': rows}, timeout=timeout)
        resp.raise_for_status()  # fail loudly instead of JSON-decoding an error page
        data = resp.json()
        datasets = []
        for r in data['result']['results']:
            resources = [
                {'url': res['url'], 'format': res.get('format', 'N/A')}
                for res in r.get('resources', [])
                if res.get('format', '').upper() in self._USABLE_FORMATS
            ]
            # CKAN emits "organization": null for org-less datasets; dict.get
            # only substitutes the default when the key is *absent*, so guard
            # against an explicit None before chaining .get('title').
            org = (r.get('organization') or {}).get('title', 'N/A')
            datasets.append({
                'title': r['title'],
                'org': org,
                'resources': resources,
            })
        return datasets

    def download_csv(self, url):
        """Read a CSV resource URL directly into a pandas DataFrame."""
        return pd.read_csv(url)
Scraping Court Records
from bs4 import BeautifulSoup
class CourtScraper:
    """Search court opinions through the CourtListener v4 REST API."""

    def search(self, query, timeout=30):
        """Search CourtListener opinions (``type: 'o'``) matching *query*.

        Args:
            query: Free-text search string.
            timeout: Per-request timeout in seconds (avoids hanging forever
                on an unresponsive server).

        Returns:
            A list of dicts with ``case``, ``court``, ``date``, ``snippet``.
            Missing fields default to empty strings.

        Raises:
            requests.HTTPError: if the API responds with a non-2xx status.
        """
        url = 'https://www.courtlistener.com/api/rest/v4/search/'
        resp = requests.get(
            url,
            params={'q': query, 'type': 'o', 'format': 'json'},
            timeout=timeout,
        )
        # Without this, a rate-limit or error page would surface as a
        # confusing JSONDecodeError instead of a clear HTTP error.
        resp.raise_for_status()
        return [{
            'case': c.get('caseName', ''),
            'court': c.get('court', ''),
            'date': c.get('dateFiled', ''),
            'snippet': c.get('snippet', '')
        } for c in resp.json().get('results', [])]
Scraping Property Records
class PropertyScraper:
    """Scrape parcel rows from a county assessor's property-search page."""

    def __init__(self, county_url):
        # Normalize the base URL so a trailing slash doesn't yield "//search".
        self.base_url = county_url.rstrip('/')
        # A Session reuses the TCP connection and keeps cookies across searches.
        self.session = requests.Session()

    def search(self, address, timeout=30):
        """Search by street address and return parsed result rows.

        Args:
            address: Street address to search for.
            timeout: Per-request timeout in seconds.

        Returns:
            A list of dicts with ``parcel``, ``address``, ``owner``, ``value``
            (all as stripped text from the results table).

        Raises:
            requests.HTTPError: if the site responds with a non-2xx status.
        """
        resp = self.session.get(
            f"{self.base_url}/search",
            params={'address': address},
            timeout=timeout,
        )
        resp.raise_for_status()
        return self._parse_results(resp.text)

    @staticmethod
    def _parse_results(html):
        """Parse a results page: one dict per data row of ``table.results``."""
        soup = BeautifulSoup(html, 'html.parser')
        results = []
        for row in soup.select('table.results tr')[1:]:  # [1:] skips the header row
            cols = row.select('td')
            if len(cols) < 4:
                continue  # spacer / malformed row — nothing to extract
            results.append({
                'parcel': cols[0].get_text(strip=True),
                'address': cols[1].get_text(strip=True),
                'owner': cols[2].get_text(strip=True),
                'value': cols[3].get_text(strip=True)
            })
        return results
Processing FOIA Documents
import json
from pathlib import Path
class FOIAProcessor:
    """Build a summary inventory of CSV/JSON documents under a directory tree."""

    def process(self, directory):
        """Recursively summarize every ``.csv``/``.json`` file under *directory*.

        Args:
            directory: Root directory to scan (string or path-like).

        Returns:
            A list of per-file summary dicts:
            - CSV:  ``{'file', 'rows', 'cols'}``
            - JSON: ``{'file', 'keys'}`` (top-level keys for an object,
              element count for an array)
            - unreadable files: ``{'file', 'error'}`` so one corrupt
              document doesn't abort the whole scan.
        """
        docs = []
        # rglob + is_file(): glob('**/*') also yields directories, and a
        # directory named e.g. "x.csv" would crash pd.read_csv.
        for fp in Path(directory).rglob('*'):
            if not fp.is_file():
                continue
            suffix = fp.suffix.lower()  # accept .CSV / .Json from FOIA dumps
            try:
                if suffix == '.csv':
                    df = pd.read_csv(fp)
                    docs.append({'file': str(fp), 'rows': len(df), 'cols': list(df.columns)})
                elif suffix == '.json':
                    with open(fp, encoding='utf-8') as f:
                        data = json.load(f)
                    summary = list(data.keys()) if isinstance(data, dict) else len(data)
                    docs.append({'file': str(fp), 'keys': summary})
            except (OSError, ValueError) as e:
                # pandas and json both raise ValueError subclasses on bad content.
                docs.append({'file': str(fp), 'error': str(e)})
        return docs
Best Practices
- Most government data is public by law — but still check each portal's terms of use and any dataset-specific access restrictions
- Respect rate limits on government servers
- Cache aggressively — data changes slowly
- Use APIs when available
For large-scale collection, ScraperAPI handles technical challenges. ThorData provides reliable proxy infrastructure, and ScrapeOps monitors your pipelines.
Conclusion
Government data scraping is valuable and, provided you respect each portal's terms and rate limits, generally rests on solid legal ground. Python gives you all the tools to automate public records collection and analysis.
Top comments (0)