Regulatory enforcement data is gold for compliance teams, fintech companies, and investigative journalists. The SEC, FINRA, and CFTC all publish enforcement actions publicly, but their websites make bulk analysis nearly impossible.
Let me show you how to build scrapers for each regulator and create a unified enforcement database.
Why Scrape Enforcement Actions?
- Compliance teams need to screen counterparties against enforcement histories
- Fintech companies build risk scoring models from regulatory patterns
- Journalists investigate enforcement trends and regulatory capture
- Investors want to know if a broker or advisor has been sanctioned
This data is public but scattered across three different websites with different formats.
Setting Up
pip install requests beautifulsoup4 pandas lxml
Scraping SEC Enforcement Actions
The SEC publishes litigation releases and administrative proceedings at sec.gov:
import re
import time
from datetime import datetime, timezone

import pandas as pd
import requests
from bs4 import BeautifulSoup
SCRAPER_API_KEY = "YOUR_KEY" # Get from scraperapi.com
def scrape_sec_actions(year: int = 2026, max_pages: int = 10) -> list[dict]:
    """Scrape SEC litigation releases for a given year.

    Fetches the SEC's annual litigation-release index page (through
    ScraperAPI) and parses each table row into a flat record.

    Args:
        year: Calendar year whose litigation-release index to scrape.
        max_pages: Retained for backward compatibility; the annual index is
            a single page, so this value is no longer used. (The original
            loop re-fetched the same URL ``max_pages`` times and appended
            duplicate rows.)

    Returns:
        A list of dicts with regulator, release number, date, description,
        absolute URL, and scrape timestamp.
    """
    actions: list[dict] = []
    params = {
        "api_key": SCRAPER_API_KEY,
        # One schedule page per year lists every litigation release.
        "url": f"https://www.sec.gov/litigation/litreleases/{year}sched.htm",
    }
    # ScraperAPI proxies the request so IP blocks on sec.gov don't
    # interrupt the scrape.
    response = requests.get(
        "https://api.scraperapi.com",
        params=params,
        timeout=60,
    )
    if response.status_code != 200:
        # Best-effort: return what we have, mirroring the FINRA scraper.
        return actions
    soup = BeautifulSoup(response.text, "html.parser")
    for row in soup.select("table tr")[1:]:  # first row is the header
        cells = row.select("td")
        if len(cells) < 3:
            continue
        link = cells[0].select_one("a")
        href = link.get("href", "") if link else ""
        # Index links are site-relative; make them absolute for downstream use.
        if href.startswith("/"):
            href = "https://www.sec.gov" + href
        actions.append({
            "regulator": "SEC",
            "release_number": cells[0].get_text(strip=True),
            "date": cells[1].get_text(strip=True),
            "description": cells[2].get_text(strip=True),
            "url": href,
            # utcnow() is deprecated; use an explicit timezone-aware stamp.
            "scraped_at": datetime.now(timezone.utc).isoformat(),
        })
    return actions
Scraping FINRA BrokerCheck
FINRA provides broker disciplinary data through BrokerCheck:
def scrape_finra_actions(max_results: int = 100) -> list[dict]:
    """Scrape FINRA disciplinary actions from the public BrokerCheck API.

    Pages through the BrokerCheck individual-search endpoint in batches of
    20 until ``max_results`` records are collected, a request fails, or the
    result set is exhausted.

    Args:
        max_results: Upper bound on the number of records to pull.

    Returns:
        A list of dicts, one per disciplined individual.
    """
    actions: list[dict] = []
    # BrokerCheck exposes a JSON search endpoint -- no HTML parsing needed.
    url = "https://api.brokercheck.finra.org/search/individual"
    headers = {
        "Accept": "application/json",
        "User-Agent": "Mozilla/5.0 (research bot)"
    }
    offset = 0
    batch_size = 20
    while offset < max_results:
        params = {
            "query": "disclosureFlag:Y",
            "filter": "action_type=disciplinary",
            "hl": "true",
            "nrows": batch_size,
            "start": offset,
            "sort": "bc_action_date desc"
        }
        response = requests.get(url, headers=headers, params=params, timeout=30)
        if response.status_code != 200:
            break  # best-effort: keep whatever was collected so far
        hits = response.json().get("hits", {}).get("hits", [])
        if not hits:
            break  # result set exhausted
        for hit in hits:
            source = hit.get("_source", {})
            # Join only non-empty parts so a missing first or last name
            # doesn't leave stray whitespace (the original concatenation did).
            name = " ".join(
                part
                for part in (
                    source.get("ind_firstname", ""),
                    source.get("ind_lastname", ""),
                )
                if part
            )
            actions.append({
                "regulator": "FINRA",
                "individual_name": name,
                "crd_number": source.get("ind_source_id"),
                "firm": source.get("ind_current_employer"),
                "disclosure_count": source.get("ind_disclosure_fl"),
                "action_date": source.get("bc_action_date"),
                # utcnow() is deprecated; use a timezone-aware stamp.
                "scraped_at": datetime.now(timezone.utc).isoformat(),
            })
        offset += batch_size
        time.sleep(3)  # Respect rate limits
    return actions
Scraping CFTC Enforcement
The CFTC publishes enforcement actions through press releases:
def scrape_cftc_actions(year: int = 2026) -> list[dict]:
    """Scrape CFTC enforcement actions from the press-release listing.

    Fetches the CFTC press-release page filtered to enforcement releases
    (through ScraperAPI) and parses each listing row into a flat record.

    Args:
        year: Calendar year filter passed to the CFTC press-release page.

    Returns:
        A list of dicts with title, absolute URL, date, extracted entity
        names, and scrape timestamp.
    """
    actions: list[dict] = []
    params = {
        "api_key": SCRAPER_API_KEY,
        "url": f"https://www.cftc.gov/PressRoom/PressReleases?field_press_release_types_value=Enforcement&year={year}"
    }
    response = requests.get("https://api.scraperapi.com", params=params, timeout=60)
    if response.status_code != 200:
        return actions  # best-effort, consistent with the other scrapers
    soup = BeautifulSoup(response.text, "html.parser")
    for article in soup.select(".views-row"):
        title_el = article.select_one(".views-field-title a")
        if title_el is None:
            continue
        date_el = article.select_one(".views-field-created")
        title = title_el.get_text(strip=True)
        # .get() avoids a KeyError when an anchor has no href; only prefix
        # site-relative links so absolute URLs aren't corrupted.
        href = title_el.get("href", "")
        if href.startswith("/"):
            href = "https://www.cftc.gov" + href
        actions.append({
            "regulator": "CFTC",
            "title": title,
            "url": href,
            "date": date_el.get_text(strip=True) if date_el else "",
            "entities": extract_entities(title),
            # utcnow() is deprecated; use a timezone-aware stamp.
            "scraped_at": datetime.now(timezone.utc).isoformat(),
        })
    return actions
def extract_entities(title: str) -> list[str]:
    """Pull the named parties out of an enforcement-action headline.

    Matches the common phrasings "against X for", "charges X with", and
    "orders X to" (case-insensitive) and returns every captured name, in
    the order the patterns are tried.

    Args:
        title: Press-release or enforcement-action headline text.

    Returns:
        A list of extracted entity-name strings; empty when nothing matches.
    """
    return [
        name
        for phrase in (
            r"against (.+?) for",
            r"charges (.+?) with",
            r"orders (.+?) to",
        )
        for name in re.findall(phrase, title, re.IGNORECASE)
    ]
Building the Unified Database
def build_enforcement_database():
    """Scrape all three regulators and persist one merged dataset.

    Runs the SEC, FINRA, and CFTC scrapers in sequence, concatenates the
    records into a single DataFrame, and writes it to both CSV and JSON in
    the current working directory.

    Returns:
        pd.DataFrame: one row per enforcement action across all regulators.
    """
    print("Scraping SEC...")
    sec_actions = scrape_sec_actions(year=2026)
    print("Scraping FINRA...")
    finra_actions = scrape_finra_actions(max_results=50)
    print("Scraping CFTC...")
    cftc_actions = scrape_cftc_actions(year=2026)

    # One flat table; the per-source dicts share the "regulator" column.
    combined = pd.DataFrame(sec_actions + finra_actions + cftc_actions)

    # Persist in both formats so downstream tools can pick either.
    combined.to_csv("enforcement_actions.csv", index=False)
    combined.to_json("enforcement_actions.json", orient="records", indent=2)

    print(f"\nDatabase built: {len(combined)} total actions")
    print(f" SEC: {len(sec_actions)} | FINRA: {len(finra_actions)} | CFTC: {len(cftc_actions)}")
    return combined
df = build_enforcement_database()
Scaling for Production
For daily monitoring across all three regulators, use ScraperAPI to handle proxy rotation and avoid IP blocks. Government sites can be particularly aggressive about blocking automated access.
ThorData residential proxies work well for FINRA BrokerCheck lookups where datacenter IPs get flagged. Track your pipeline reliability with ScrapeOps.
What You Can Build With This
- Compliance screening API — charge per lookup against your enforcement database
- Trend analysis dashboards — visualize enforcement patterns by year, type, and regulator
- Alert service — notify subscribers when new actions are filed against watched entities
- Risk scoring models — feed enforcement history into ML models for counterparty risk
Regulatory data scraping is a legitimate, high-value niche where most competitors charge thousands per year for access to the same public information.
Top comments (0)