DEV Community

agenthustler
agenthustler

Posted on

How to Scrape FDA Warning Letters and Enforcement Records with Python

The FDA publishes thousands of warning letters and enforcement actions. Scraping and analyzing this data reveals compliance trends, common violations, and which industries face the most scrutiny.

Why FDA Data Matters

FDA warning letters are public records that signal regulatory trends. Pharmaceutical companies, food manufacturers, and supplement brands monitor these to avoid compliance pitfalls. Investors use them to assess risk.

Scraping the FDA Warning Letters Database

import requests
from bs4 import BeautifulSoup
import pandas as pd

API_KEY = "YOUR_SCRAPERAPI_KEY"  # Get one at https://www.scraperapi.com?fp_ref=the52

def scrape_fda_warnings(page=1):
    """Scrape one page of the FDA warning-letters listing.

    Parameters
    ----------
    page : int
        1-based page number of the listing to fetch.

    Returns
    -------
    list[dict]
        One dict per letter with keys 'company', 'subject',
        'issuing_office', 'date', and 'link' ('' when the company cell
        has no anchor). Empty list when the results table is absent.

    Raises
    ------
    requests.HTTPError
        If the proxy or target site returns an error status.
    """
    url = (
        "https://www.fda.gov/inspections-compliance-enforcement-and-criminal-investigations"
        f"/compliance-actions-and-activities/warning-letters?page={page}"
    )
    # Pass api_key/url via `params` so requests percent-encodes the target
    # URL. Embedding it raw in the proxy URL lets the target's own
    # '?page=' query leak into the proxy's query string and be dropped.
    response = requests.get(
        "http://api.scraperapi.com",
        params={"api_key": API_KEY, "url": url},
        timeout=60,
    )
    response.raise_for_status()  # fail loudly instead of parsing an error page
    soup = BeautifulSoup(response.text, 'html.parser')

    letters = []
    table = soup.select_one('table.views-table')
    if table:
        for row in table.select('tbody tr'):
            cells = row.select('td')
            if len(cells) >= 4:
                # Look the anchor up once instead of twice per row.
                link_tag = cells[0].select_one('a')
                letters.append({
                    'company': cells[0].text.strip(),
                    'subject': cells[1].text.strip(),
                    'issuing_office': cells[2].text.strip(),
                    'date': cells[3].text.strip(),
                    'link': link_tag['href'] if link_tag else ''
                })
    return letters

# Walk the listing pages until an empty page signals the end (max 19 pages).
all_letters = []
page = 1
while page < 20:
    batch = scrape_fda_warnings(page=page)
    if not batch:
        break
    all_letters.extend(batch)
    print(f"Page {page}: {len(batch)} letters")
    page += 1

# Persist the combined results for the analysis steps below.
df = pd.DataFrame(all_letters)
df.to_csv('fda_warnings.csv', index=False)
print(f"Total: {len(df)} warning letters collected")
Enter fullscreen mode Exit fullscreen mode

Extracting Violation Details

def scrape_letter_detail(letter_url):
    """Fetch a single warning-letter page and extract violation lines.

    Parameters
    ----------
    letter_url : str
        Absolute URL, or a path relative to fda.gov (as scraped from the
        listing table).

    Returns
    -------
    list[str]
        Lines of the letter body mentioning violation-related keywords;
        empty list when the body element is missing.

    Raises
    ------
    requests.HTTPError
        If the proxy or target site returns an error status.
    """
    base = "https://www.fda.gov"
    full_url = f"{base}{letter_url}" if not letter_url.startswith('http') else letter_url
    # Use `params` so the letter URL (which may carry its own query string)
    # is percent-encoded rather than spliced raw into the proxy URL.
    response = requests.get(
        "http://api.scraperapi.com",
        params={"api_key": API_KEY, "url": full_url},
        timeout=60,
    )
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    content = soup.select_one('.field--name-body')
    if content:
        text = content.get_text(separator='\n')
        violations = []
        for line in text.split('\n'):
            if any(kw in line.lower() for kw in ['violation', 'adulterated', 'misbranded', 'cgmp']):
                violations.append(line.strip())
        return violations
    return []

# Enrich the first 50 rows with up to five extracted violation lines.
for idx, row in df.head(50).iterrows():
    link = row['link']
    if not link:
        continue  # listing row had no anchor; nothing to fetch
    found = scrape_letter_detail(link)
    df.at[idx, 'violations'] = '; '.join(found[:5])
Enter fullscreen mode Exit fullscreen mode

Analyzing Violation Patterns

from collections import Counter

# Count keyword occurrences across all subject lines (substring counts,
# so e.g. 'drug' also matches inside longer words).
subjects = df['subject'].fillna('').tolist()
all_text = ' '.join(subjects).lower()
violation_types = ['cgmp', 'adulterated', 'misbranded', 'unapproved',
                   'dietary supplement', 'drug', 'device', 'food']

counts = {}
for vtype in violation_types:
    counts[vtype] = all_text.count(vtype)

violation_df = pd.DataFrame(list(counts.items()), columns=['type', 'count'])
print(violation_df.sort_values('count', ascending=False).to_string(index=False))
Enter fullscreen mode Exit fullscreen mode

Building a Monitoring System

Set up automated alerts that flag newly published letters matching your keywords. Use ThorData for proxy rotation and ScrapeOps to monitor uptime:

import sqlite3
import hashlib

conn = sqlite3.connect('fda_monitor.db')
# Create the dedup table up front; without it the first SELECT below
# raises "no such table: letters".
conn.execute(
    "CREATE TABLE IF NOT EXISTS letters ("
    "id TEXT PRIMARY KEY, company TEXT, subject TEXT, "
    "issuing_office TEXT, date TEXT, link TEXT)"
)
conn.commit()

def check_new_letters(keywords):
    """Return unseen page-1 letters whose subject matches any keyword.

    Parameters
    ----------
    keywords : list[str]
        Case-insensitive substrings to match against each subject line.

    Returns
    -------
    list[dict]
        Matching letters not previously recorded in the database. Every
        unseen letter (matching or not) is inserted so subsequent runs
        treat it as seen.
    """
    new_letters = scrape_fda_warnings(page=1)
    alerts = []
    for letter in new_letters:
        # MD5 is used only as a stable fingerprint, not for security.
        letter_id = hashlib.md5(
            f"{letter['company']}{letter['date']}".encode()
        ).hexdigest()
        # Parameterized query — never interpolate values into SQL text.
        existing = pd.read_sql(
            "SELECT * FROM letters WHERE id=?", conn, params=(letter_id,)
        )
        if existing.empty:
            # Record the letter so it is not re-alerted on the next run;
            # the original never inserted, so dedup never took effect.
            conn.execute(
                "INSERT OR IGNORE INTO letters VALUES (?, ?, ?, ?, ?, ?)",
                (letter_id, letter['company'], letter['subject'],
                 letter['issuing_office'], letter['date'], letter['link']),
            )
            for kw in keywords:
                if kw.lower() in letter['subject'].lower():
                    alerts.append(letter)
                    break  # avoid duplicate alerts when several keywords match
    conn.commit()
    return alerts

alerts = check_new_letters(['supplement', 'cosmetic', 'cbd'])
print(f"{len(alerts)} new relevant warning letters found")
Enter fullscreen mode Exit fullscreen mode

Use Cases

  • Investors: Screen portfolio companies for regulatory risk
  • Compliance teams: Track violation trends in your industry
  • Journalists: Monitor enforcement patterns
  • ScraperAPI ensures reliable access to government sites

Top comments (0)