The FDA publishes thousands of warning letters and enforcement actions. Scraping and analyzing this data reveals compliance trends, common violations, and which industries face the most scrutiny.
Why FDA Data Matters
FDA warning letters are public records that signal regulatory trends. Pharmaceutical companies, food manufacturers, and supplement brands monitor these to avoid compliance pitfalls. Investors use them to assess risk.
Scraping the FDA Warning Letters Database
import requests
from bs4 import BeautifulSoup
import pandas as pd
API_KEY = "YOUR_SCRAPERAPI_KEY" # ScraperAPI key — replace with your own before running
def scrape_fda_warnings(page=1):
    """Scrape one page of the FDA warning-letters listing via ScraperAPI.

    Args:
        page: 1-based page number of the listing to fetch.

    Returns:
        A list of dicts with keys 'company', 'subject', 'issuing_office',
        'date', and 'link' (the letter's href, or '' when the company cell
        has no anchor). Returns an empty list when the results table is
        absent — callers use that as the end-of-pages signal.
    """
    url = (
        "https://www.fda.gov/inspections-compliance-enforcement-and-criminal-investigations"
        f"/compliance-actions-and-activities/warning-letters?page={page}"
    )
    # Pass the target URL via `params` so requests percent-encodes it.
    # Interpolating it by hand (f"...&url={url}") leaves the embedded
    # '?page=' and '=' unescaped, so the proxy sees a truncated URL.
    response = requests.get(
        "http://api.scraperapi.com",
        params={"api_key": API_KEY, "url": url},
        timeout=60,
    )
    soup = BeautifulSoup(response.text, 'html.parser')

    letters = []
    table = soup.select_one('table.views-table')
    if table:
        for row in table.select('tbody tr'):
            cells = row.select('td')
            if len(cells) >= 4:
                anchor = cells[0].select_one('a')
                letters.append({
                    'company': cells[0].text.strip(),
                    'subject': cells[1].text.strip(),
                    'issuing_office': cells[2].text.strip(),
                    'date': cells[3].text.strip(),
                    'link': anchor['href'] if anchor else ''
                })
    return letters
# Crawl listing pages 1-19, stopping early at the first page with no results.
all_letters = []
page = 1
while page <= 19:
    letters = scrape_fda_warnings(page=page)
    if not letters:
        break
    all_letters.extend(letters)
    print(f"Page {page}: {len(letters)} letters")
    page += 1

# Persist the combined listing to CSV.
df = pd.DataFrame(all_letters)
df.to_csv('fda_warnings.csv', index=False)
print(f"Total: {len(df)} warning letters collected")
Extracting Violation Details
def scrape_letter_detail(letter_url):
    """Fetch one warning-letter detail page and extract violation lines.

    Args:
        letter_url: Absolute URL or site-relative href of the letter page.

    Returns:
        List of body lines mentioning common violation keywords; empty
        list when the body element is not found in the page.
    """
    base = "https://www.fda.gov"
    full_url = letter_url if letter_url.startswith('http') else f"{base}{letter_url}"
    # Let requests percent-encode the target URL; a hand-built query string
    # breaks as soon as the target itself contains '?' or '&'.
    response = requests.get(
        "http://api.scraperapi.com",
        params={"api_key": API_KEY, "url": full_url},
        timeout=60,
    )
    soup = BeautifulSoup(response.text, 'html.parser')

    content = soup.select_one('.field--name-body')
    if content is None:
        return []
    text = content.get_text(separator='\n')
    violations = []
    for line in text.split('\n'):
        # Case-insensitive keyword screen for violation-related language.
        if any(kw in line.lower() for kw in ['violation', 'adulterated', 'misbranded', 'cgmp']):
            violations.append(line.strip())
    return violations
# Enrich the first 50 rows with violation excerpts from each detail page.
for idx, row in df.head(50).iterrows():
    if not row['link']:
        continue
    violations = scrape_letter_detail(row['link'])
    # Cap at five excerpts per letter to keep the CSV cell manageable.
    df.at[idx, 'violations'] = '; '.join(violations[:5])
Analyzing Violation Patterns
from collections import Counter

# Count substring occurrences of each violation-related term across
# the lower-cased concatenation of every subject line.
all_text = ' '.join(df['subject'].fillna('').tolist()).lower()
violation_types = ['cgmp', 'adulterated', 'misbranded', 'unapproved', 'dietary supplement', 'drug', 'device', 'food']
counts = {}
for vtype in violation_types:
    counts[vtype] = all_text.count(vtype)

violation_df = pd.DataFrame(list(counts.items()), columns=['type', 'count'])
print(violation_df.sort_values('count', ascending=False).to_string(index=False))
Building a Monitoring System
Set up automated alerts. The example below reuses the ScraperAPI-backed scraper together with a local SQLite database to track which letters have already been seen:
import sqlite3
import hashlib
# Local SQLite file used to dedupe previously-seen letters.
# NOTE(review): check_new_letters() queries a 'letters' table, but nothing
# shown here creates or populates it — confirm the schema is set up elsewhere.
conn = sqlite3.connect('fda_monitor.db')
def check_new_letters(keywords):
    """Return freshly-scraped warning letters matching any keyword.

    Args:
        keywords: Iterable of substrings matched case-insensitively
            against each letter's subject line.

    Returns:
        List of letter dicts (see scrape_fda_warnings) whose fingerprint
        is not yet in the 'letters' table and whose subject matches at
        least one keyword. Each matching letter appears at most once.
    """
    new_letters = scrape_fda_warnings(page=1)
    alerts = []
    for letter in new_letters:
        # Stable dedup fingerprint; md5 is fine here (not security-sensitive).
        letter_id = hashlib.md5(f"{letter['company']}{letter['date']}".encode()).hexdigest()
        # Parameterized query — never interpolate values into SQL. The
        # original f-string form was injectable and would also break on
        # any value containing a quote.
        existing = pd.read_sql("SELECT * FROM letters WHERE id=?", conn, params=(letter_id,))
        if existing.empty:
            for kw in keywords:
                if kw.lower() in letter['subject'].lower():
                    alerts.append(letter)
                    break  # one alert per letter, even if several keywords match
    # NOTE(review): nothing inserts seen IDs into 'letters' — unless callers
    # record the alerted IDs, the same letters will alert on every run.
    return alerts
# Example run: alert on new letters mentioning these product categories.
alerts = check_new_letters(['supplement', 'cosmetic', 'cbd'])
print(f"{len(alerts)} new relevant warning letters found")
Use Cases
- Investors: Screen portfolio companies for regulatory risk
- Compliance teams: Track violation trends in your industry
- Journalists: Monitor enforcement patterns
- ScraperAPI ensures reliable access to government sites
Top comments (0)