Scraping Nonprofit Financial Data: Form 990 and IRS Records
Every nonprofit in the United States is required to file Form 990 with the IRS, disclosing executive compensation, revenue, expenses, and program activities. This data is public but notoriously difficult to access in bulk. Let's build a Python scraper to extract and analyze nonprofit financial data systematically.
Why Nonprofit Financial Data Matters
Donors, journalists, and researchers need to evaluate whether nonprofits use funds effectively. Form 990 reveals executive pay ratios, fundraising efficiency, program spending percentages, and financial health indicators.
Data Sources
- IRS Exempt Organizations BMF — master list of all tax-exempt organizations
- ProPublica Nonprofit Explorer API — structured 990 data (best free source)
- IRS 990 XML files — raw filings on AWS (bulk download)
- State charity registrations — additional compliance data
ProPublica Nonprofit Explorer API
import requests
import time
class NonprofitExplorer:
    """Client for the ProPublica Nonprofit Explorer API (v2).

    Wraps the two endpoints used in this article: keyword search and
    single-organization lookup by EIN.  Each call sleeps briefly so the
    free API is not hammered.
    """

    BASE_URL = "https://projects.propublica.org/nonprofits/api/v2"
    # requests has NO default timeout; without one a dead connection
    # blocks the scraper forever.
    TIMEOUT = 30
    # Polite delay (seconds) between calls to the free API.
    DELAY = 0.5

    def search_organizations(self, query, state=None, page=0):
        """Search organizations by keyword.

        Args:
            query: Free-text search term (name, keyword, etc.).
            state: Optional two-letter state code (e.g. "NY").
            page: Zero-based results page.

        Returns:
            list: Organization summary dicts; empty when no matches.

        Raises:
            requests.HTTPError: On a non-2xx API response.
        """
        params = {"q": query, "page": page}
        if state:
            params["state[id]"] = state
        response = requests.get(
            f"{self.BASE_URL}/search.json", params=params, timeout=self.TIMEOUT
        )
        # Fail loudly on API errors instead of feeding an error page to .json().
        response.raise_for_status()
        time.sleep(self.DELAY)
        return response.json().get("organizations", [])

    def get_organization(self, ein):
        """Fetch one organization's full record (profile + filings) by EIN.

        Args:
            ein: Employer Identification Number, digits only.

        Returns:
            dict: Organization record, or {} if the key is absent.

        Raises:
            requests.HTTPError: On a non-2xx API response.
        """
        response = requests.get(
            f"{self.BASE_URL}/organizations/{ein}.json", timeout=self.TIMEOUT
        )
        response.raise_for_status()
        time.sleep(self.DELAY)
        return response.json().get("organization", {})
IRS Bulk XML Data
import xml.etree.ElementTree as ET
class IRS990XMLParser:
    """Parser for raw IRS Form 990 e-file XML hosted on AWS S3.

    Downloads a yearly filing index or an individual filing and extracts
    a flat dict of headline financials plus officer compensation rows.
    """

    AWS_INDEX_URL = "https://s3.amazonaws.com/irs-form-990"
    # Filings can be large; requests otherwise waits forever on a stall.
    TIMEOUT = 60

    def get_filing_index(self, year):
        """Return the list of filing index entries for *year*.

        Raises:
            requests.HTTPError: On a non-2xx response.
        """
        url = f"{self.AWS_INDEX_URL}/index_{year}.json"
        response = requests.get(url, timeout=self.TIMEOUT)
        response.raise_for_status()
        return response.json().get(f"Filings{year}", [])

    def parse_990_xml(self, xml_url):
        """Download one 990 XML filing and extract headline fields.

        Returns:
            dict: ein / name / tax_year strings (None when absent),
            headline dollar amounts (0 when absent), and an "officers"
            list of {name, title, compensation} dicts.

        Raises:
            requests.HTTPError: On a non-2xx response.
            xml.etree.ElementTree.ParseError: On malformed XML.
        """
        response = requests.get(xml_url, timeout=self.TIMEOUT)
        response.raise_for_status()
        root = ET.fromstring(response.content)
        # IRS e-file XML is namespaced; derive the "{uri}" prefix from
        # the root tag so lookups work regardless of the schema year.
        ns = root.tag.split("}")[0] + "}" if "}" in root.tag else ""
        data = {
            "ein": self._find_text(root, f".//{ns}EIN"),
            "name": self._find_text(root, f".//{ns}BusinessName/{ns}BusinessNameLine1Txt"),
            "tax_year": self._find_text(root, f".//{ns}TaxYr"),
            "total_revenue": self._find_number(root, f".//{ns}TotalRevenueAmt"),
            "total_expenses": self._find_number(root, f".//{ns}TotalExpensesAmt"),
            "net_assets": self._find_number(root, f".//{ns}NetAssetsOrFundBalancesEOYAmt"),
            "program_service_expenses": self._find_number(root, f".//{ns}TotalProgramServiceExpensesAmt"),
        }
        data["officers"] = self._extract_officers(root, ns)
        return data

    def _find_text(self, root, path):
        """Text of the first element matching *path*, or None if absent."""
        el = root.find(path)
        return el.text if el is not None else None

    def _find_number(self, root, path):
        """Numeric value of the first element matching *path*; 0 when the
        element is missing, empty, or not parseable as a number."""
        text = self._find_text(root, path)
        try:
            return float(text) if text else 0
        except ValueError:
            return 0

    def _extract_officers(self, root, ns):
        """Collect officer/director/trustee compensation rows.

        Rows without a person name are skipped (the schema also uses the
        group for org-level entries we do not want here).
        """
        officers = []
        for comp in root.findall(f".//{ns}OfficerDirectorTrusteeEmplGrp"):
            name = self._find_text(comp, f"{ns}PersonNm")
            title = self._find_text(comp, f"{ns}TitleTxt")
            compensation = self._find_number(comp, f"{ns}ReportableCompFromOrgAmt")
            if name:
                officers.append({"name": name, "title": title, "compensation": compensation})
        return officers
Financial Health Analysis
import pandas as pd
class NonprofitAnalyzer:
    """Derives comparable financial-health metrics from parsed 990 data."""

    def calculate_metrics(self, filing_data):
        """Compute headline health metrics for one filing.

        Args:
            filing_data: Dict with total_revenue, total_expenses,
                program_service_expenses, net_assets, ein, name, and an
                optional "officers" list of {name, title, compensation}.
                Dollar values may be None (null in the source data).

        Returns:
            dict: ein, name, revenue, expenses, program_ratio (%),
            operating_reserve_months, program_rating, and — when officer
            data exists — top_executive_comp and (if revenue > 0)
            comp_to_revenue_pct.
        """
        # API/XML values are often present-but-null; coalesce None to 0
        # so the arithmetic below cannot raise TypeError.
        revenue = filing_data.get("total_revenue") or 0
        expenses = filing_data.get("total_expenses") or 0
        program_expenses = filing_data.get("program_service_expenses") or 0
        net_assets = filing_data.get("net_assets") or 0
        metrics = {
            "ein": filing_data.get("ein"),
            "name": filing_data.get("name"),
            "revenue": revenue,
            "expenses": expenses,
        }
        # Share of total spending that goes to programs (vs overhead).
        metrics["program_ratio"] = (
            round(program_expenses / expenses * 100, 1) if expenses > 0 else 0
        )
        # How many months the org could operate on current net assets.
        monthly_expenses = expenses / 12
        metrics["operating_reserve_months"] = (
            round(net_assets / monthly_expenses, 1) if monthly_expenses > 0 else 0
        )
        officers = filing_data.get("officers") or []
        if officers:
            # Individual compensation may also be null — treat as 0.
            top_comp = max((o.get("compensation") or 0) for o in officers)
            metrics["top_executive_comp"] = top_comp
            if revenue > 0:
                metrics["comp_to_revenue_pct"] = round(top_comp / revenue * 100, 2)
        # Charity-watchdog-style banding of the program ratio.
        metrics["program_rating"] = (
            "EXCELLENT" if metrics["program_ratio"] > 80
            else "GOOD" if metrics["program_ratio"] > 65
            else "FAIR" if metrics["program_ratio"] > 50
            else "POOR"
        )
        return metrics

    def compare_organizations(self, filings_list):
        """Rank organizations by program ratio, best first.

        Returns:
            pandas.DataFrame: One row per filing, sorted descending by
            program_ratio.  Empty input yields an empty DataFrame
            (sorting an empty frame would raise KeyError on the missing
            column).
        """
        results = [self.calculate_metrics(f) for f in filings_list]
        df = pd.DataFrame(results)
        if df.empty:
            return df
        return df.sort_values("program_ratio", ascending=False)
Batch Processing
def batch_analyze_sector(sector_keyword, state=None, limit=500):
    """Search a nonprofit sector, compute health metrics for each org's
    latest filing summary, and save the results to CSV.

    Args:
        sector_keyword: Search term for the sector (e.g. "food bank").
        state: Optional two-letter state filter.
        limit: Maximum number of organizations to analyze.

    Returns:
        pandas.DataFrame: Computed metrics (possibly empty).
    """
    explorer = NonprofitExplorer()
    analyzer = NonprofitAnalyzer()
    orgs = []
    page = 0
    # Page through search results until we have enough or they run out.
    while len(orgs) < limit:
        results = explorer.search_organizations(sector_keyword, state=state, page=page)
        if not results:
            break
        orgs.extend(results)
        page += 1
    all_metrics = []
    for org in orgs[:limit]:
        ein = org.get("ein")
        if not ein:
            continue  # can't look up a filing without an EIN
        details = explorer.get_organization(ein)
        filings = details.get("filings_with_data") or []
        if not filings:
            continue
        # Assumed newest-first ordering from the API — TODO confirm.
        latest = filings[0]
        # BUG FIX: the original passed "totprgmrevnue" (program service
        # REVENUE) as program service EXPENSES, silently inflating
        # program_ratio.  The filing summary exposes no program-expense
        # field, so pass 0 here; parse the full 990 XML with
        # IRS990XMLParser when an accurate program ratio is needed.
        # Summary values can be null, so coalesce None to 0.
        # NOTE(review): "totassetsend" is total assets at end of year,
        # not net assets (assets minus liabilities) — confirm the field.
        metrics = analyzer.calculate_metrics({
            "ein": ein,
            "name": org.get("name"),
            "total_revenue": latest.get("totrevenue") or 0,
            "total_expenses": latest.get("totfuncexpns") or 0,
            "program_service_expenses": 0,
            "net_assets": latest.get("totassetsend") or 0,
            "officers": [],
        })
        all_metrics.append(metrics)
    df = pd.DataFrame(all_metrics)
    df.to_csv(f"nonprofit_{sector_keyword}_{state or 'all'}.csv", index=False)
    return df
Scaling with Proxies
For large-scale scraping of state charity registrar sites, use ScraperAPI for rendering-heavy portals. ThorData residential proxies avoid rate limiting on government sites. ScrapeOps monitors scraper health.
Use Cases
- Donor due diligence — vet charities before giving
- Investigative journalism — find compensation outliers or financial red flags
- Academic research — study nonprofit sector trends at scale
- Grant makers — evaluate applicant financial health
Nonprofit financial transparency shouldn't require a forensic accountant. With these tools, anyone can analyze how organizations spend their funding.
Top comments (0)