DEV Community

agenthustler
agenthustler

Posted on

Scraping Healthcare Data: CMS, Clinical Trials, and Drug Approvals

Healthcare data is increasingly available through public APIs and government databases. By accessing CMS data, clinical trials, and FDA drug approvals, you can build powerful health analytics tools for research and business intelligence.

Why Scrape Healthcare Data?

  • Track clinical trial trends and outcomes
  • Monitor FDA drug approvals and safety alerts
  • Analyze CMS hospital quality metrics
  • Build health tech and research applications

Setting Up

pip install requests pandas
Enter fullscreen mode Exit fullscreen mode

ClinicalTrials.gov API

ClinicalTrials.gov provides a free, well-documented API:

import requests
import pandas as pd

def search_clinical_trials(condition, status="RECRUITING", max_results=50):
    """Search the ClinicalTrials.gov v2 API for studies matching a condition.

    Args:
        condition: Condition or disease search term (sent as ``query.cond``).
        status: Overall status to filter on (sent as
            ``filter.overallStatus``).
        max_results: Page size requested from the API. Only a single page is
            fetched, so at most this many trials are returned.

    Returns:
        A list of dicts with the keys ``nct_id``, ``title``, ``status`` and
        ``start_date``. Fields missing from the response become ``""``.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status code.
        requests.Timeout: If the API does not answer within the timeout.
    """
    url = "https://clinicaltrials.gov/api/v2/studies"
    params = {
        "query.cond": condition,
        "filter.overallStatus": status,
        "pageSize": max_results,
        "fields": "NCTId,BriefTitle,OverallStatus,StartDate,Condition"
    }

    # requests has no default timeout; without one a stalled connection
    # would block the caller forever.
    response = requests.get(url, params=params, timeout=30)
    # Fail loudly on HTTP errors instead of parsing an error body as results.
    response.raise_for_status()
    data = response.json()

    trials = []
    for study in data.get("studies", []):
        # `or {}` also covers sections that are present but null.
        protocol = study.get("protocolSection") or {}
        id_mod = protocol.get("identificationModule") or {}
        status_mod = protocol.get("statusModule") or {}
        start_struct = status_mod.get("startDateStruct") or {}

        trials.append({
            "nct_id": id_mod.get("nctId", ""),
            "title": id_mod.get("briefTitle", ""),
            "status": status_mod.get("overallStatus", ""),
            "start_date": start_struct.get("date", "")
        })

    return trials

# Example usage: fetch trials for "diabetes" with the default status filter
# ("RECRUITING") and print the NCT ID and title of the first five.
# The printed count is the number of trials returned by this single call,
# which requests at most max_results (default 50) studies.
trials = search_clinical_trials("diabetes")
print(f"Found {len(trials)} recruiting trials")
for t in trials[:5]:
    print(f"  {t['nct_id']}: {t['title']}")
Enter fullscreen mode Exit fullscreen mode

FDA Drug Approvals (openFDA)

def search_drug_approvals(drug_name, limit=10):
    """Look up FDA submission records for a brand-name drug via openFDA.

    Args:
        drug_name: Brand name to search for; matched as an exact phrase
            against ``openfda.brand_name``.
        limit: Maximum number of application records requested from the API.

    Returns:
        A list with one dict per submission, holding the keys
        ``brand_name``, ``manufacturer``, ``submission_type`` and
        ``submission_status``. An empty list is returned when the search has
        no matches.

    Raises:
        requests.HTTPError: If the API answers with an error status other
            than 404.
        requests.Timeout: If the API does not answer within the timeout.
    """
    url = "https://api.fda.gov/drug/drugsfda.json"
    params = {
        "search": f'openfda.brand_name:"{drug_name}"',
        "limit": limit
    }

    # requests has no default timeout; set one so a stalled call cannot hang.
    response = requests.get(url, params=params, timeout=30)
    # openFDA signals "no matches" with HTTP 404; that is an empty result,
    # not a failure. Any other error status is raised to the caller.
    if response.status_code == 404:
        return []
    response.raise_for_status()
    data = response.json()

    approvals = []
    for result in data.get("results", []):
        openfda = result.get("openfda") or {}
        # These values are the same for every submission of a record, so
        # resolve them once. `or [...]` also guards against an empty list,
        # on which indexing with [0] would raise IndexError.
        brand_name = (openfda.get("brand_name") or ["N/A"])[0]
        manufacturer = (openfda.get("manufacturer_name") or ["N/A"])[0]

        for submission in result.get("submissions", []):
            approvals.append({
                "brand_name": brand_name,
                "manufacturer": manufacturer,
                "submission_type": submission.get("submission_type", ""),
                "submission_status": submission.get("submission_status", "")
            })

    return approvals

# Example usage: look up "Ozempic" and print brand name, submission type and
# submission status for the first three submission records returned.
approvals = search_drug_approvals("Ozempic")
for a in approvals[:3]:
    print(f"{a['brand_name']} - {a['submission_type']} - {a['submission_status']}")
Enter fullscreen mode Exit fullscreen mode

Drug Adverse Events

def search_adverse_events(drug_name, limit=10):
    """Fetch adverse event reports that mention a drug from openFDA.

    Args:
        drug_name: Product name to search for; matched as an exact phrase
            against ``patient.drug.medicinalproduct``.
        limit: Maximum number of reports requested from the API.

    Returns:
        A list of dicts with the keys ``drug`` (the ``drug_name`` argument),
        ``reactions`` (list of ``reactionmeddrapt`` values), ``serious`` and
        ``date`` (the report's ``receivedate``). An empty list is returned
        when the search has no matches.

    Raises:
        requests.HTTPError: If the API answers with an error status other
            than 404.
        requests.Timeout: If the API does not answer within the timeout.
    """
    url = "https://api.fda.gov/drug/event.json"
    params = {
        "search": f'patient.drug.medicinalproduct:"{drug_name}"',
        "limit": limit
    }

    # requests has no default timeout; set one so a stalled call cannot hang.
    response = requests.get(url, params=params, timeout=30)
    # openFDA signals "no matches" with HTTP 404; that is an empty result,
    # not a failure. Any other error status is raised to the caller.
    if response.status_code == 404:
        return []
    response.raise_for_status()
    data = response.json()

    events = []
    for result in data.get("results", []):
        patient = result.get("patient") or {}
        # Skip reaction entries without a term instead of raising KeyError.
        reactions = [
            r["reactionmeddrapt"]
            for r in patient.get("reaction") or []
            if "reactionmeddrapt" in r
        ]
        events.append({
            "drug": drug_name,
            "reactions": reactions,
            "serious": result.get("serious", 0),
            "date": result.get("receivedate", "")
        })

    return events

# Example usage: fetch adverse event reports for "metformin" (default limit
# of 10) and print how many reports came back.
events = search_adverse_events("metformin")
print(f"Found {len(events)} adverse event reports")
Enter fullscreen mode Exit fullscreen mode

CMS Hospital Data

def get_cms_data(dataset_id, limit=100, offset=0):
    """Query a CMS provider-data datastore and return its result rows.

    Args:
        dataset_id: Identifier appended to the datastore query URL.
        limit: Maximum number of rows to request.
        offset: Number of rows to skip. Defaults to 0 (the first page);
            increase it by ``limit`` to page through a larger dataset.

    Returns:
        The ``results`` list from the JSON response, or an empty list when
        the response has no ``results`` key.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status code.
        requests.Timeout: If the API does not answer within the timeout.
    """
    url = f"https://data.cms.gov/provider-data/api/1/datastore/query/{dataset_id}"
    params = {"limit": limit, "offset": offset}
    # requests has no default timeout; set one so a stalled call cannot hang.
    response = requests.get(url, params=params, timeout=30)
    # Surface HTTP errors (e.g. an unknown dataset id) instead of returning
    # an empty list that looks like "no data".
    response.raise_for_status()
    return response.json().get("results", [])
Enter fullscreen mode Exit fullscreen mode

Healthcare Dashboard

import sqlite3
from datetime import datetime

def init_health_db(db_path="healthcare.db"):
    """Open the snapshot database and make sure its schema exists.

    Args:
        db_path: Path of the SQLite database file. Defaults to
            ``"healthcare.db"`` in the current working directory; pass
            ``":memory:"`` for a throwaway in-memory database.

    Returns:
        An open ``sqlite3.Connection``. The caller is responsible for
        closing it.

    Raises:
        sqlite3.Error: If the database cannot be opened or the table cannot
            be created.
    """
    conn = sqlite3.connect(db_path)
    try:
        # IF NOT EXISTS makes repeated calls on the same file harmless.
        conn.execute("""
        CREATE TABLE IF NOT EXISTS trial_snapshots (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            condition TEXT, trial_count INTEGER,
            snapshot_date TEXT
        )
    """)
        conn.commit()
    except sqlite3.Error:
        # Do not leak the connection when schema creation fails.
        conn.close()
        raise
    return conn

def snapshot_trials(conn, condition):
    """Store today's trial count for ``condition`` in ``trial_snapshots``.

    The count is the length of the list returned by
    ``search_clinical_trials(condition)`` called with its default arguments,
    so it is bounded by that function's page size.

    Args:
        conn: Open SQLite connection whose database contains the
            ``trial_snapshots`` table.
        condition: Condition name passed to the trial search and stored in
            the ``condition`` column.
    """
    insert_sql = "INSERT INTO trial_snapshots (condition, trial_count, snapshot_date) VALUES (?, ?, ?)"

    found = search_clinical_trials(condition)
    db_cursor = conn.cursor()
    # Local date of the snapshot, formatted as YYYY-MM-DD.
    today = datetime.now().strftime("%Y-%m-%d")
    row = (condition, len(found), today)

    db_cursor.execute(insert_sql, row)
    conn.commit()

# Take one snapshot per tracked condition, then release the database handle.
conn = init_health_db()
try:
    for condition in ["diabetes", "cancer", "alzheimer"]:
        snapshot_trials(conn, condition)
finally:
    # Close the connection even if one of the snapshots fails, so the
    # SQLite file handle is not leaked.
    conn.close()
Enter fullscreen mode Exit fullscreen mode

Scraping Additional Sources

For healthcare sites requiring JavaScript rendering, use ScraperAPI. For large-scale collection, ThorData provides residential proxies and ScrapeOps offers pipeline monitoring.

Conclusion

Collecting healthcare data opens up valuable insights for researchers and health tech builders. These public APIs provide a wealth of information for evidence-based decision-making.


Happy building! Follow for more health tech tutorials.

Top comments (0)