Regulatory enforcement data is gold for compliance teams, fintech companies, and investigative journalists. The SEC, FINRA, and CFTC all publish enforcement actions publicly, but their websites make bulk analysis nearly impossible.
Let me show you how to build scrapers for each regulator and create a unified enforcement database.
Why Scrape Enforcement Actions?
- Compliance teams need to screen counterparties against enforcement histories
- Fintech companies build risk scoring models from regulatory patterns
- Journalists investigate enforcement trends and regulatory capture
- Investors want to know if a broker or advisor has been sanctioned
This data is public but scattered across three different websites with different formats.
Setting Up
pip install requests beautifulsoup4 pandas lxml
Scraping SEC Enforcement Actions
The SEC publishes litigation releases and administrative proceedings at sec.gov:
import re
import time
from datetime import datetime, timezone

import pandas as pd
import requests
from bs4 import BeautifulSoup
SCRAPER_API_KEY = "YOUR_KEY" # Get from scraperapi.com
def scrape_sec_actions(year: int = 2026, max_pages: int = 10) -> list[dict]:
    """Scrape SEC litigation releases for a given year.

    Fetches the SEC's annual litigation-release index page (through
    ScraperAPI) and parses each table row into a flat record.

    Args:
        year: Calendar year whose litigation-release index to scrape.
        max_pages: Retained for backward compatibility; the annual index is
            a single page, so this value is no longer used. (The original
            loop re-fetched the same URL ``max_pages`` times and appended
            duplicate rows.)

    Returns:
        A list of dicts with regulator, release number, date, description,
        absolute URL, and scrape timestamp.
    """
    actions: list[dict] = []
    params = {
        "api_key": SCRAPER_API_KEY,
        # One schedule page per year lists every litigation release.
        "url": f"https://www.sec.gov/litigation/litreleases/{year}sched.htm",
    }
    # ScraperAPI proxies the request so IP blocks on sec.gov don't
    # interrupt the scrape.
    response = requests.get(
        "https://api.scraperapi.com",
        params=params,
        timeout=60,
    )
    if response.status_code != 200:
        # Best-effort: return what we have, mirroring the FINRA scraper.
        return actions
    soup = BeautifulSoup(response.text, "html.parser")
    for row in soup.select("table tr")[1:]:  # first row is the header
        cells = row.select("td")
        if len(cells) < 3:
            continue
        link = cells[0].select_one("a")
        href = link.get("href", "") if link else ""
        # Index links are site-relative; make them absolute for downstream use.
        if href.startswith("/"):
            href = "https://www.sec.gov" + href
        actions.append({
            "regulator": "SEC",
            "release_number": cells[0].get_text(strip=True),
            "date": cells[1].get_text(strip=True),
            "description": cells[2].get_text(strip=True),
            "url": href,
            # utcnow() is deprecated; use an explicit timezone-aware stamp.
            "scraped_at": datetime.now(timezone.utc).isoformat(),
        })
    return actions
Scraping FINRA BrokerCheck
FINRA provides broker disciplinary data through BrokerCheck:
def scrape_finra_actions(max_results: int = 100) -> list[dict]:
    """Scrape FINRA disciplinary actions from the public BrokerCheck API.

    Pages through the BrokerCheck individual-search endpoint in batches of
    20 until ``max_results`` records are collected, a request fails, or the
    result set is exhausted.

    Args:
        max_results: Upper bound on the number of records to pull.

    Returns:
        A list of dicts, one per disciplined individual.
    """
    actions: list[dict] = []
    # BrokerCheck exposes a JSON search endpoint -- no HTML parsing needed.
    url = "https://api.brokercheck.finra.org/search/individual"
    headers = {
        "Accept": "application/json",
        "User-Agent": "Mozilla/5.0 (research bot)"
    }
    offset = 0
    batch_size = 20
    while offset < max_results:
        params = {
            "query": "disclosureFlag:Y",
            "filter": "action_type=disciplinary",
            "hl": "true",
            "nrows": batch_size,
            "start": offset,
            "sort": "bc_action_date desc"
        }
        response = requests.get(url, headers=headers, params=params, timeout=30)
        if response.status_code != 200:
            break  # best-effort: keep whatever was collected so far
        hits = response.json().get("hits", {}).get("hits", [])
        if not hits:
            break  # result set exhausted
        for hit in hits:
            source = hit.get("_source", {})
            # Join only non-empty parts so a missing first or last name
            # doesn't leave stray whitespace (the original concatenation did).
            name = " ".join(
                part
                for part in (
                    source.get("ind_firstname", ""),
                    source.get("ind_lastname", ""),
                )
                if part
            )
            actions.append({
                "regulator": "FINRA",
                "individual_name": name,
                "crd_number": source.get("ind_source_id"),
                "firm": source.get("ind_current_employer"),
                "disclosure_count": source.get("ind_disclosure_fl"),
                "action_date": source.get("bc_action_date"),
                # utcnow() is deprecated; use a timezone-aware stamp.
                "scraped_at": datetime.now(timezone.utc).isoformat(),
            })
        offset += batch_size
        time.sleep(3)  # Respect rate limits
    return actions
Scraping CFTC Enforcement
The CFTC publishes enforcement actions through press releases:
def scrape_cftc_actions(year: int = 2026) -> list[dict]:
    """Scrape CFTC enforcement actions from the press-release listing.

    Fetches the CFTC press-release page filtered to enforcement releases
    (through ScraperAPI) and parses each listing row into a flat record.

    Args:
        year: Calendar year filter passed to the CFTC press-release page.

    Returns:
        A list of dicts with title, absolute URL, date, extracted entity
        names, and scrape timestamp.
    """
    actions: list[dict] = []
    params = {
        "api_key": SCRAPER_API_KEY,
        "url": f"https://www.cftc.gov/PressRoom/PressReleases?field_press_release_types_value=Enforcement&year={year}"
    }
    response = requests.get("https://api.scraperapi.com", params=params, timeout=60)
    if response.status_code != 200:
        return actions  # best-effort, consistent with the other scrapers
    soup = BeautifulSoup(response.text, "html.parser")
    for article in soup.select(".views-row"):
        title_el = article.select_one(".views-field-title a")
        if title_el is None:
            continue
        date_el = article.select_one(".views-field-created")
        title = title_el.get_text(strip=True)
        # .get() avoids a KeyError when an anchor has no href; only prefix
        # site-relative links so absolute URLs aren't corrupted.
        href = title_el.get("href", "")
        if href.startswith("/"):
            href = "https://www.cftc.gov" + href
        actions.append({
            "regulator": "CFTC",
            "title": title,
            "url": href,
            "date": date_el.get_text(strip=True) if date_el else "",
            "entities": extract_entities(title),
            # utcnow() is deprecated; use a timezone-aware stamp.
            "scraped_at": datetime.now(timezone.utc).isoformat(),
        })
    return actions
def extract_entities(title: str) -> list[str]:
    """Pull the named parties out of an enforcement-action headline.

    Matches the common phrasings "against X for", "charges X with", and
    "orders X to" (case-insensitive) and returns every captured name, in
    the order the patterns are tried.

    Args:
        title: Press-release or enforcement-action headline text.

    Returns:
        A list of extracted entity-name strings; empty when nothing matches.
    """
    return [
        name
        for phrase in (
            r"against (.+?) for",
            r"charges (.+?) with",
            r"orders (.+?) to",
        )
        for name in re.findall(phrase, title, re.IGNORECASE)
    ]
Building the Unified Database
def build_enforcement_database():
    """Scrape all three regulators and persist one merged dataset.

    Runs the SEC, FINRA, and CFTC scrapers in sequence, concatenates the
    records into a single DataFrame, and writes it to both CSV and JSON in
    the current working directory.

    Returns:
        pd.DataFrame: one row per enforcement action across all regulators.
    """
    print("Scraping SEC...")
    sec_actions = scrape_sec_actions(year=2026)
    print("Scraping FINRA...")
    finra_actions = scrape_finra_actions(max_results=50)
    print("Scraping CFTC...")
    cftc_actions = scrape_cftc_actions(year=2026)

    # One flat table; the per-source dicts share the "regulator" column.
    combined = pd.DataFrame(sec_actions + finra_actions + cftc_actions)

    # Persist in both formats so downstream tools can pick either.
    combined.to_csv("enforcement_actions.csv", index=False)
    combined.to_json("enforcement_actions.json", orient="records", indent=2)

    print(f"\nDatabase built: {len(combined)} total actions")
    print(f" SEC: {len(sec_actions)} | FINRA: {len(finra_actions)} | CFTC: {len(cftc_actions)}")
    return combined
df = build_enforcement_database()
Scaling for Production
For daily monitoring across all three regulators, use ScraperAPI to handle proxy rotation and avoid IP blocks. Government sites can be particularly aggressive about blocking automated access.
ThorData residential proxies work well for FINRA BrokerCheck lookups where datacenter IPs get flagged. Track your pipeline reliability with ScrapeOps.
What You Can Build With This
- Compliance screening API — charge per lookup against your enforcement database
- Trend analysis dashboards — visualize enforcement patterns by year, type, and regulator
- Alert service — notify subscribers when new actions are filed against watched entities
- Risk scoring models — feed enforcement history into ML models for counterparty risk
Regulatory data scraping is a legitimate, high-value niche where most competitors charge thousands per year for access to the same public information.
Top comments (0)