DEV Community

agenthustler
agenthustler

Posted on

Scraping Nonprofit Financial Data: Form 990 and IRS Records

Scraping Nonprofit Financial Data: Form 990 and IRS Records

Every nonprofit in the United States is required to file Form 990 with the IRS, disclosing executive compensation, revenue, expenses, and program activities. This data is public but notoriously difficult to access in bulk. Let's build a Python scraper to extract and analyze nonprofit financial data systematically.

Why Nonprofit Financial Data Matters

Donors, journalists, and researchers need to evaluate whether nonprofits use funds effectively. Form 990 reveals executive pay ratios, fundraising efficiency, program spending percentages, and financial health indicators.

Data Sources

  • IRS Exempt Organizations BMF — master list of all tax-exempt organizations
  • ProPublica Nonprofit Explorer API — structured 990 data (best free source)
  • IRS 990 XML files — raw filings on AWS (bulk download)
  • State charity registrations — additional compliance data

ProPublica Nonprofit Explorer API

import requests
import time

class NonprofitExplorer:
    """Thin client for the ProPublica Nonprofit Explorer v2 API.

    The API is free but rate-limited, so every call sleeps briefly to stay
    polite. Network failures and non-2xx responses raise ``requests``
    exceptions instead of silently returning garbage.
    """

    BASE_URL = "https://projects.propublica.org/nonprofits/api/v2"
    REQUEST_DELAY = 0.5  # seconds between calls, courtesy to the public API
    TIMEOUT = 30  # seconds; without a timeout a dead connection hangs forever

    def search_organizations(self, query, state=None, page=0):
        """Return a list of organization dicts matching *query*.

        query -- free-text search term (organization name, keyword, etc.)
        state -- optional two-letter state code filter
        page  -- zero-based result page number
        """
        params = {"q": query, "page": page}
        if state:
            params["state[id]"] = state
        response = requests.get(
            f"{self.BASE_URL}/search.json", params=params, timeout=self.TIMEOUT
        )
        # Fail loudly on HTTP errors rather than trying to .json() an error page.
        response.raise_for_status()
        time.sleep(self.REQUEST_DELAY)
        return response.json().get("organizations", [])

    def get_organization(self, ein):
        """Return the full organization record (including filings) for *ein*."""
        response = requests.get(
            f"{self.BASE_URL}/organizations/{ein}.json", timeout=self.TIMEOUT
        )
        response.raise_for_status()
        time.sleep(self.REQUEST_DELAY)
        return response.json().get("organization", {})
Enter fullscreen mode Exit fullscreen mode

IRS Bulk XML Data

import xml.etree.ElementTree as ET

class IRS990XMLParser:
    """Download and parse IRS Form 990 e-file XML from the public AWS bucket."""

    AWS_INDEX_URL = "https://s3.amazonaws.com/irs-form-990"
    TIMEOUT = 60  # seconds; bulk index/XML downloads can be slow but must not hang

    def get_filing_index(self, year):
        """Return the list of filing-index entries for *year* (network call)."""
        url = f"{self.AWS_INDEX_URL}/index_{year}.json"
        response = requests.get(url, timeout=self.TIMEOUT)
        # Surface a 403/404 as an HTTPError instead of a JSON decode error.
        response.raise_for_status()
        return response.json().get(f"Filings{year}", [])

    def parse_990_xml(self, xml_url):
        """Fetch one 990 XML filing and extract headline financial fields.

        Returns a dict with ein/name/tax_year, key dollar amounts, and an
        "officers" list of {name, title, compensation} dicts. Missing text
        fields come back as None; missing numeric fields as 0.
        """
        response = requests.get(xml_url, timeout=self.TIMEOUT)
        response.raise_for_status()
        root = ET.fromstring(response.content)
        # IRS e-file XML is namespaced; derive the "{uri}" prefix from the
        # root tag so the XPath queries below match regardless of schema year.
        ns = root.tag.split("}")[0] + "}" if "}" in root.tag else ""

        data = {
            "ein": self._find_text(root, f".//{ns}EIN"),
            "name": self._find_text(root, f".//{ns}BusinessName/{ns}BusinessNameLine1Txt"),
            "tax_year": self._find_text(root, f".//{ns}TaxYr"),
            "total_revenue": self._find_number(root, f".//{ns}TotalRevenueAmt"),
            "total_expenses": self._find_number(root, f".//{ns}TotalExpensesAmt"),
            "net_assets": self._find_number(root, f".//{ns}NetAssetsOrFundBalancesEOYAmt"),
            "program_service_expenses": self._find_number(root, f".//{ns}TotalProgramServiceExpensesAmt"),
        }
        data["officers"] = self._extract_officers(root, ns)
        return data

    def _find_text(self, root, path):
        """Return the text of the first element matching *path*, or None."""
        el = root.find(path)
        return el.text if el is not None else None

    def _find_number(self, root, path):
        """Return the first matching element's text as a float, or 0.

        Empty/missing/unparseable values all collapse to 0 so downstream
        arithmetic never has to null-check.
        """
        text = self._find_text(root, path)
        try:
            return float(text) if text else 0
        except ValueError:
            return 0

    def _extract_officers(self, root, ns):
        """Return officer/director compensation rows (entries without a name are skipped)."""
        officers = []
        for comp in root.findall(f".//{ns}OfficerDirectorTrusteeEmplGrp"):
            name = self._find_text(comp, f"{ns}PersonNm")
            title = self._find_text(comp, f"{ns}TitleTxt")
            compensation = self._find_number(comp, f"{ns}ReportableCompFromOrgAmt")
            if name:
                officers.append({"name": name, "title": title, "compensation": compensation})
        return officers
Enter fullscreen mode Exit fullscreen mode

Financial Health Analysis

import pandas as pd

class NonprofitAnalyzer:
    """Compute comparable financial-health metrics from parsed 990 filing data."""

    def calculate_metrics(self, filing_data):
        """Return a metrics dict for one filing.

        filing_data -- dict with keys like total_revenue, total_expenses,
                       program_service_expenses, net_assets, officers.
                       Missing numeric keys default to 0.
        Returns ein/name/revenue/expenses plus derived ratios and a
        coarse program_rating bucket.
        """
        revenue = filing_data.get("total_revenue", 0)
        expenses = filing_data.get("total_expenses", 0)
        program_expenses = filing_data.get("program_service_expenses", 0)
        net_assets = filing_data.get("net_assets", 0)

        metrics = {
            "ein": filing_data.get("ein"),
            "name": filing_data.get("name"),
            "revenue": revenue,
            "expenses": expenses,
        }

        # Share of total spending that goes to programs (vs. overhead/fundraising).
        metrics["program_ratio"] = (
            round(program_expenses / expenses * 100, 1) if expenses > 0 else 0
        )

        # How many months the org could operate on its current net assets.
        monthly_expenses = expenses / 12
        metrics["operating_reserve_months"] = (
            round(net_assets / monthly_expenses, 1) if monthly_expenses > 0 else 0
        )

        officers = filing_data.get("officers", [])
        if officers:
            top_comp = max(o["compensation"] for o in officers)
            metrics["top_executive_comp"] = top_comp
            if revenue > 0:
                # Key is intentionally absent when revenue is 0 or no officers
                # are listed — callers must use .get() for these two fields.
                metrics["comp_to_revenue_pct"] = round(top_comp / revenue * 100, 2)

        # Thresholds are strict (>), so exactly 80% rates GOOD, not EXCELLENT.
        metrics["program_rating"] = (
            "EXCELLENT" if metrics["program_ratio"] > 80
            else "GOOD" if metrics["program_ratio"] > 65
            else "FAIR" if metrics["program_ratio"] > 50
            else "POOR"
        )
        return metrics

    def compare_organizations(self, filings_list):
        """Rank filings by program_ratio (descending) as a DataFrame.

        An empty input returns an empty DataFrame; previously this raised
        KeyError because sort_values was called on a frame with no columns.
        """
        df = pd.DataFrame([self.calculate_metrics(f) for f in filings_list])
        if df.empty:
            return df
        return df.sort_values("program_ratio", ascending=False)
Enter fullscreen mode Exit fullscreen mode

Batch Processing

def batch_analyze_sector(sector_keyword, state=None, limit=500):
    """Search ProPublica for orgs matching *sector_keyword*, compute metrics
    for each org's most recent filing, and write the result to a CSV.

    sector_keyword -- free-text search term passed to the ProPublica API
    state          -- optional two-letter state code filter
    limit          -- maximum number of organizations to analyze
    Returns the metrics DataFrame (also saved as
    nonprofit_<keyword>_<state or 'all'>.csv).
    """
    explorer = NonprofitExplorer()
    analyzer = NonprofitAnalyzer()

    # Page through search results until we have enough orgs or run out.
    orgs = []
    page = 0
    while len(orgs) < limit:
        results = explorer.search_organizations(sector_keyword, state=state, page=page)
        if not results:
            break
        orgs.extend(results)
        page += 1

    all_metrics = []
    for org in orgs[:limit]:
        ein = org.get("ein")
        if not ein:
            # Skip records without an EIN; get_organization() would otherwise
            # request an invalid ".../organizations/None.json" URL.
            continue
        details = explorer.get_organization(ein)
        filings = details.get("filings_with_data", [])
        if not filings:
            continue
        latest = filings[0]  # API returns filings newest-first
        # NOTE(review): field mapping looks suspect — "totprgmrevnue" is
        # program service *revenue*, not program expenses, and "totassetsend"
        # is total assets at end of year, not net assets. Confirm against the
        # ProPublica API field reference before trusting the derived ratios.
        metrics = analyzer.calculate_metrics({
            "ein": ein,
            "name": org.get("name"),
            "total_revenue": latest.get("totrevenue", 0),
            "total_expenses": latest.get("totfuncexpns", 0),
            "program_service_expenses": latest.get("totprgmrevnue", 0),
            "net_assets": latest.get("totassetsend", 0),
            "officers": []
        })
        all_metrics.append(metrics)

    df = pd.DataFrame(all_metrics)
    df.to_csv(f"nonprofit_{sector_keyword}_{state or 'all'}.csv", index=False)
    return df
Enter fullscreen mode Exit fullscreen mode

Scaling with Proxies

For large-scale scraping of state charity registrar sites, use ScraperAPI for rendering-heavy portals. ThorData residential proxies avoid rate limiting on government sites. ScrapeOps monitors scraper health.

Use Cases

  • Donor due diligence — vet charities before giving
  • Investigative journalism — find compensation outliers or financial red flags
  • Academic research — study nonprofit sector trends at scale
  • Grant makers — evaluate applicant financial health

Nonprofit financial transparency shouldn't require a forensic accountant. With these tools, anyone can analyze how organizations spend their funding.

Top comments (0)