The FDA publishes thousands of warning letters and enforcement actions. Scraping and analyzing this data reveals compliance trends, common violations, and which industries face the most scrutiny.
Why FDA Data Matters
FDA warning letters are public records that signal regulatory trends. Pharmaceutical companies, food manufacturers, and supplement brands monitor these to avoid compliance pitfalls. Investors use them to assess risk.
Scraping the FDA Warning Letters Database
import requests
from bs4 import BeautifulSoup
import pandas as pd
API_KEY = "YOUR_SCRAPERAPI_KEY" # ScraperAPI key — replace with your own before running
def scrape_fda_warnings(page=1):
    """Scrape one page of the FDA warning-letters listing via ScraperAPI.

    Args:
        page: 1-based page number of the listing to fetch.

    Returns:
        A list of dicts with keys 'company', 'subject', 'issuing_office',
        'date', and 'link' (the letter's href, or '' when the company cell
        has no anchor). Returns an empty list when the results table is
        absent — callers use that as the end-of-pages signal.
    """
    url = (
        "https://www.fda.gov/inspections-compliance-enforcement-and-criminal-investigations"
        f"/compliance-actions-and-activities/warning-letters?page={page}"
    )
    # Pass the target URL via `params` so requests percent-encodes it.
    # Interpolating it by hand (f"...&url={url}") leaves the embedded
    # '?page=' and '=' unescaped, so the proxy sees a truncated URL.
    response = requests.get(
        "http://api.scraperapi.com",
        params={"api_key": API_KEY, "url": url},
        timeout=60,
    )
    soup = BeautifulSoup(response.text, 'html.parser')

    letters = []
    table = soup.select_one('table.views-table')
    if table:
        for row in table.select('tbody tr'):
            cells = row.select('td')
            if len(cells) >= 4:
                anchor = cells[0].select_one('a')
                letters.append({
                    'company': cells[0].text.strip(),
                    'subject': cells[1].text.strip(),
                    'issuing_office': cells[2].text.strip(),
                    'date': cells[3].text.strip(),
                    'link': anchor['href'] if anchor else ''
                })
    return letters
# Crawl listing pages 1-19, stopping early at the first page with no results.
all_letters = []
page = 1
while page <= 19:
    letters = scrape_fda_warnings(page=page)
    if not letters:
        break
    all_letters.extend(letters)
    print(f"Page {page}: {len(letters)} letters")
    page += 1

# Persist the combined listing to CSV.
df = pd.DataFrame(all_letters)
df.to_csv('fda_warnings.csv', index=False)
print(f"Total: {len(df)} warning letters collected")
Extracting Violation Details
def scrape_letter_detail(letter_url):
    """Fetch one warning-letter detail page and extract violation lines.

    Args:
        letter_url: Absolute URL or site-relative href of the letter page.

    Returns:
        List of body lines mentioning common violation keywords; empty
        list when the body element is not found in the page.
    """
    base = "https://www.fda.gov"
    full_url = letter_url if letter_url.startswith('http') else f"{base}{letter_url}"
    # Let requests percent-encode the target URL; a hand-built query string
    # breaks as soon as the target itself contains '?' or '&'.
    response = requests.get(
        "http://api.scraperapi.com",
        params={"api_key": API_KEY, "url": full_url},
        timeout=60,
    )
    soup = BeautifulSoup(response.text, 'html.parser')

    content = soup.select_one('.field--name-body')
    if content is None:
        return []
    text = content.get_text(separator='\n')
    violations = []
    for line in text.split('\n'):
        # Case-insensitive keyword screen for violation-related language.
        if any(kw in line.lower() for kw in ['violation', 'adulterated', 'misbranded', 'cgmp']):
            violations.append(line.strip())
    return violations
# Enrich the first 50 rows with violation excerpts from each detail page.
for idx, row in df.head(50).iterrows():
    if not row['link']:
        continue
    violations = scrape_letter_detail(row['link'])
    # Cap at five excerpts per letter to keep the CSV cell manageable.
    df.at[idx, 'violations'] = '; '.join(violations[:5])
Analyzing Violation Patterns
from collections import Counter

# Count substring occurrences of each violation-related term across
# the lower-cased concatenation of every subject line.
all_text = ' '.join(df['subject'].fillna('').tolist()).lower()
violation_types = ['cgmp', 'adulterated', 'misbranded', 'unapproved', 'dietary supplement', 'drug', 'device', 'food']
counts = {}
for vtype in violation_types:
    counts[vtype] = all_text.count(vtype)

violation_df = pd.DataFrame(list(counts.items()), columns=['type', 'count'])
print(violation_df.sort_values('count', ascending=False).to_string(index=False))
Building a Monitoring System
Set up automated alerts. The example below reuses the ScraperAPI-backed scraper together with a local SQLite database to track which letters have already been seen:
import sqlite3
import hashlib
# Local SQLite file used to dedupe previously-seen letters.
# NOTE(review): check_new_letters() queries a 'letters' table, but nothing
# shown here creates or populates it — confirm the schema is set up elsewhere.
conn = sqlite3.connect('fda_monitor.db')
def check_new_letters(keywords):
    """Return freshly-scraped warning letters matching any keyword.

    Args:
        keywords: Iterable of substrings matched case-insensitively
            against each letter's subject line.

    Returns:
        List of letter dicts (see scrape_fda_warnings) whose fingerprint
        is not yet in the 'letters' table and whose subject matches at
        least one keyword. Each matching letter appears at most once.
    """
    new_letters = scrape_fda_warnings(page=1)
    alerts = []
    for letter in new_letters:
        # Stable dedup fingerprint; md5 is fine here (not security-sensitive).
        letter_id = hashlib.md5(f"{letter['company']}{letter['date']}".encode()).hexdigest()
        # Parameterized query — never interpolate values into SQL. The
        # original f-string form was injectable and would also break on
        # any value containing a quote.
        existing = pd.read_sql("SELECT * FROM letters WHERE id=?", conn, params=(letter_id,))
        if existing.empty:
            for kw in keywords:
                if kw.lower() in letter['subject'].lower():
                    alerts.append(letter)
                    break  # one alert per letter, even if several keywords match
    # NOTE(review): nothing inserts seen IDs into 'letters' — unless callers
    # record the alerted IDs, the same letters will alert on every run.
    return alerts
# Example run: alert on new letters mentioning these product categories.
alerts = check_new_letters(['supplement', 'cosmetic', 'cbd'])
print(f"{len(alerts)} new relevant warning letters found")
Use Cases
- Investors: Screen portfolio companies for regulatory risk
- Compliance teams: Track violation trends in your industry
- Journalists: Monitor enforcement patterns
- ScraperAPI ensures reliable access to government sites
Top comments (0)