agenthustler

Scraping Government Data: Public Records, APIs, and FOIA Data

Government websites are treasure troves of public data. From business registrations to court filings, this data is public by law, making it one of the most legitimate scraping targets.

Scraping Open Data Portals

Many open data portals run on CKAN or Socrata, both of which expose JSON APIs. Data.gov, for example, is CKAN-based:

import requests
import pandas as pd

class GovDataScraper:
    def search_data_gov(self, query, rows=50):
        # Data.gov runs CKAN; package_search returns dataset metadata as JSON
        url = 'https://catalog.data.gov/api/3/action/package_search'
        resp = requests.get(url, params={'q': query, 'rows': rows}, timeout=30)
        resp.raise_for_status()
        data = resp.json()
        datasets = []
        for r in data['result']['results']:
            # Keep only machine-readable resources
            resources = [
                {'url': res['url'], 'format': res.get('format', 'N/A')}
                for res in r.get('resources', [])
                if res.get('format', '').upper() in ('CSV', 'JSON', 'XML')
            ]
            datasets.append({
                'title': r['title'],
                'org': (r.get('organization') or {}).get('title', 'N/A'),
                'resources': resources
            })
        return datasets

    def download_csv(self, url):
        # Load a CSV resource straight into a DataFrame
        return pd.read_csv(url)
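
Socrata portals expose the SODA API instead, which serves every dataset as JSON under a /resource/ endpoint. Here is a minimal sketch of the same idea; the SocrataScraper class is my own, the NYC Open Data domain is only an example portal, and dataset_id is a placeholder for the dataset's own identifier:

import requests
import pandas as pd

class SocrataScraper:
    def __init__(self, domain='data.cityofnewyork.us'):
        # Any Socrata-backed portal works; NYC Open Data is just an example
        self.base = f'https://{domain}'

    def fetch(self, dataset_id, limit=1000, offset=0):
        # SODA serves datasets at /resource/<dataset_id>.json;
        # $limit and $offset page through large tables
        url = f"{self.base}/resource/{dataset_id}.json"
        resp = requests.get(url, params={'$limit': limit, '$offset': offset}, timeout=30)
        resp.raise_for_status()
        return pd.DataFrame(resp.json())

Registering an app token and sending it as the X-App-Token header lifts Socrata's anonymous throttling, but light use works without one.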

Scraping Court Records

class CourtScraper:
    def search(self, query):
        # CourtListener's search endpoint; type 'o' restricts results to opinions
        url = 'https://www.courtlistener.com/api/rest/v4/search/'
        resp = requests.get(url, params={'q': query, 'type': 'o', 'format': 'json'}, timeout=30)
        resp.raise_for_status()
        return [{
            'case': c.get('caseName', ''),
            'court': c.get('court', ''),
            'date': c.get('dateFiled', ''),
            'snippet': c.get('snippet', '')
        } for c in resp.json().get('results', [])]
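
A single request only returns the first page of results. The sketch below assumes the response carries the usual next link, which is how the API paginates, and keeps following it; the api_token parameter is optional, but an authenticated request (Authorization: Token ...) gets a higher rate limit:

import requests

def search_courtlistener_pages(query, api_token=None, max_pages=3):
    # An API token is optional but raises the rate limit
    headers = {'Authorization': f'Token {api_token}'} if api_token else {}
    url = 'https://www.courtlistener.com/api/rest/v4/search/'
    params = {'q': query, 'type': 'o'}
    results = []
    for _ in range(max_pages):
        resp = requests.get(url, params=params, headers=headers, timeout=30)
        resp.raise_for_status()
        data = resp.json()
        results.extend(data.get('results', []))
        url = data.get('next')   # follow the next-page link from the response
        if not url:
            break
        params = None            # the next URL already carries the query and cursor
    return results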

Scraping Property Records

from bs4 import BeautifulSoup

class PropertyScraper:
    def __init__(self, county_url):
        # County assessor sites differ wildly; adjust the selectors to the site you target
        self.base_url = county_url
        self.session = requests.Session()

    def search(self, address):
        resp = self.session.get(f"{self.base_url}/search", params={'address': address}, timeout=30)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, 'html.parser')
        results = []
        # Skip the header row, then read one parcel per result row
        for row in soup.select('table.results tr')[1:]:
            cols = row.select('td')
            if len(cols) >= 4:
                results.append({
                    'parcel': cols[0].get_text(strip=True),
                    'address': cols[1].get_text(strip=True),
                    'owner': cols[2].get_text(strip=True),
                    'value': cols[3].get_text(strip=True)
                })
        return results
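
County sites are usually far less robust than federal APIs, so throttle yourself. The wrapper below is a minimal sketch on top of the class above; the one-second delay and single retry are arbitrary defaults of mine, not anything a county publishes:

import time
import requests

class PolitePropertyScraper(PropertyScraper):
    def __init__(self, county_url, delay=1.0):
        super().__init__(county_url)
        self.delay = delay  # seconds between requests; arbitrary default

    def search(self, address):
        # Wait before each request and retry once on a transient network error
        time.sleep(self.delay)
        try:
            return super().search(address)
        except requests.RequestException:
            time.sleep(self.delay * 5)
            return super().search(address)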

Processing FOIA Documents

import json
from pathlib import Path

class FOIAProcessor:
    def process(self, directory):
        # Walk the FOIA response directory and summarize each structured file
        docs = []
        for fp in Path(directory).glob('**/*'):
            if fp.suffix == '.csv':
                df = pd.read_csv(fp)
                docs.append({'file': str(fp), 'rows': len(df), 'cols': list(df.columns)})
            elif fp.suffix == '.json':
                with open(fp) as f:
                    data = json.load(f)
                docs.append({'file': str(fp), 'keys': list(data.keys()) if isinstance(data, dict) else len(data)})
        return docs
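
FOIA responses frequently arrive as PDFs rather than CSV or JSON. Assuming you have pdfplumber installed, a text-extraction pass like this sketch slots into the same processor; it only reads embedded text, so scanned pages would still need OCR:

from pathlib import Path
import pdfplumber

class PDFFOIAProcessor(FOIAProcessor):
    def process(self, directory):
        docs = super().process(directory)
        for fp in Path(directory).glob('**/*.pdf'):
            with pdfplumber.open(fp) as pdf:
                # Pull whatever embedded text each page carries
                text = '\n'.join(page.extract_text() or '' for page in pdf.pages)
                docs.append({'file': str(fp), 'pages': len(pdf.pages), 'chars': len(text)})
        return docs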

Best Practices

  • Government data is generally public by law, but check each portal's terms of use
  • Respect rate limits on government servers
  • Cache aggressively; the data changes slowly (see the sketch below)
  • Use APIs when available; they are more stable than scraped HTML
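
Here is a sketch of that caching idea using only the standard library: responses are stored on disk keyed by a hash of the URL and refetched once they go stale. The 24-hour window is an arbitrary default, not something the portals specify:

import hashlib
import time
from pathlib import Path

import requests

def cached_get(url, cache_dir='cache', max_age=24 * 3600):
    # Serve a local copy if it is newer than max_age seconds; otherwise refetch
    cache = Path(cache_dir)
    cache.mkdir(exist_ok=True)
    path = cache / hashlib.sha256(url.encode()).hexdigest()
    if path.exists() and time.time() - path.stat().st_mtime < max_age:
        return path.read_bytes()
    resp = requests.get(url, timeout=60)
    resp.raise_for_status()
    path.write_bytes(resp.content)
    return resp.content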

For large-scale collection, ScraperAPI handles technical challenges. ThorData provides reliable proxy infrastructure, and ScrapeOps monitors your pipelines.

Conclusion

Government data scraping is valuable and, because the underlying records are public, on firmer legal ground than most scraping work. Python gives you all the tools you need to automate public records collection and analysis.
