Healthcare data is increasingly available through public APIs and government databases. By combining CMS hospital data, ClinicalTrials.gov trial records, and FDA drug approval and safety data, you can build powerful health analytics tools for research and business intelligence.
Why Scrape Healthcare Data?
- Track clinical trial trends and outcomes
- Monitor FDA drug approvals and safety alerts
- Analyze CMS hospital quality metrics
- Build health tech and research applications
Setting Up
pip install requests pandas
ClinicalTrials.gov API
ClinicalTrials.gov provides a free, well-documented API:
import requests
import pandas as pd
def search_clinical_trials(condition, status="RECRUITING", max_results=50, timeout=30):
    """Search the ClinicalTrials.gov v2 API for studies matching a condition.

    Args:
        condition: Condition/disease search term, sent as ``query.cond``.
        status: Overall recruitment status, sent as ``filter.overallStatus``.
        max_results: Page size requested from the API. Only one page is
            fetched, so this is also the upper bound on returned trials.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of dicts with the keys ``nct_id``, ``title``, ``status`` and
        ``start_date``. Fields missing from a study become empty strings.

    Raises:
        requests.RequestException: On a network failure, a timeout, or an
            HTTP error status.
    """
    url = "https://clinicaltrials.gov/api/v2/studies"
    params = {
        "query.cond": condition,
        "filter.overallStatus": status,
        "pageSize": max_results,
        "fields": "NCTId,BriefTitle,OverallStatus,StartDate,Condition",
    }
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url, params=params, timeout=timeout)
    # Surface HTTP errors instead of reporting them as "zero trials found".
    response.raise_for_status()
    data = response.json()

    trials = []
    for study in data.get("studies", []):
        protocol = study.get("protocolSection", {})
        id_mod = protocol.get("identificationModule", {})
        status_mod = protocol.get("statusModule", {})
        trials.append({
            "nct_id": id_mod.get("nctId", ""),
            "title": id_mod.get("briefTitle", ""),
            "status": status_mod.get("overallStatus", ""),
            "start_date": status_mod.get("startDateStruct", {}).get("date", ""),
        })
    return trials
# Demo: fetch recruiting diabetes trials and preview the first five.
trials = search_clinical_trials("diabetes")
trial_total = len(trials)
print(f"Found {trial_total} recruiting trials")
for trial in trials[:5]:
    nct_id = trial['nct_id']
    title = trial['title']
    print(f" {nct_id}: {title}")
FDA Drug Approvals (openFDA)
def search_drug_approvals(drug_name, limit=10, timeout=30):
    """Look up FDA submission records for a brand-name drug via openFDA.

    Args:
        drug_name: Brand name, matched against ``openfda.brand_name``.
        limit: Maximum number of application records requested. Each record
            may contribute several submissions to the returned list.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list with one dict per submission, holding the keys
        ``brand_name``, ``manufacturer``, ``submission_type`` and
        ``submission_status``. Empty when nothing matches.

    Raises:
        requests.RequestException: On a network failure, a timeout, or an
            HTTP error status other than the "no matches" 404.
    """
    url = "https://api.fda.gov/drug/drugsfda.json"
    params = {
        "search": f'openfda.brand_name:"{drug_name}"',
        "limit": limit,
    }
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url, params=params, timeout=timeout)
    # openFDA reports "no matches" as HTTP 404; keep returning an empty list
    # for that case and raise only on genuine failures.
    if response.status_code == 404:
        return []
    response.raise_for_status()
    data = response.json()

    approvals = []
    for result in data.get("results", []):
        openfda = result.get("openfda", {})
        # `or` covers both a missing key and an empty list, so [0] is safe.
        brand_name = (openfda.get("brand_name") or ["N/A"])[0]
        manufacturer = (openfda.get("manufacturer_name") or ["N/A"])[0]
        for submission in result.get("submissions", []):
            approvals.append({
                "brand_name": brand_name,
                "manufacturer": manufacturer,
                "submission_type": submission.get("submission_type", ""),
                "submission_status": submission.get("submission_status", ""),
            })
    return approvals
# Demo: print the first three submission records found for Ozempic.
approvals = search_drug_approvals("Ozempic")
for approval in approvals[:3]:
    brand = approval['brand_name']
    sub_type = approval['submission_type']
    sub_status = approval['submission_status']
    print(f"{brand} - {sub_type} - {sub_status}")
Drug Adverse Events
def search_adverse_events(drug_name, limit=10, timeout=30):
    """Fetch adverse event reports mentioning a drug from openFDA.

    Args:
        drug_name: Product name, matched against
            ``patient.drug.medicinalproduct``.
        limit: Maximum number of reports requested.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of dicts with the keys ``drug`` (the search term),
        ``reactions`` (list of reaction terms), ``serious`` and ``date``.
        Empty when nothing matches.

    Raises:
        requests.RequestException: On a network failure, a timeout, or an
            HTTP error status other than the "no matches" 404.
    """
    url = "https://api.fda.gov/drug/event.json"
    params = {
        "search": f'patient.drug.medicinalproduct:"{drug_name}"',
        "limit": limit,
    }
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url, params=params, timeout=timeout)
    # openFDA reports "no matches" as HTTP 404; keep returning an empty list
    # for that case and raise only on genuine failures.
    if response.status_code == 404:
        return []
    response.raise_for_status()
    data = response.json()

    events = []
    for result in data.get("results", []):
        # Skip reaction entries without a term instead of raising KeyError.
        reactions = [
            reaction["reactionmeddrapt"]
            for reaction in result.get("patient", {}).get("reaction", [])
            if "reactionmeddrapt" in reaction
        ]
        events.append({
            "drug": drug_name,
            "reactions": reactions,
            "serious": result.get("serious", 0),
            "date": result.get("receivedate", ""),
        })
    return events
# Demo: count the adverse event reports returned for metformin.
events = search_adverse_events("metformin")
event_total = len(events)
print(f"Found {event_total} adverse event reports")
CMS Hospital Data
def get_cms_data(dataset_id, limit=100, offset=0, timeout=30):
    """Query rows from a CMS Provider Data datastore.

    Args:
        dataset_id: Identifier appended to the datastore query URL.
            NOTE(review): confirm whether this endpoint expects a
            distribution id or a dataset id plus index.
        limit: Maximum number of rows requested.
        offset: Number of rows to skip; raise it to page through a dataset.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The list under the ``results`` key of the JSON response, or an
        empty list when that key is absent.

    Raises:
        requests.RequestException: On a network failure, a timeout, or an
            HTTP error status.
    """
    url = f"https://data.cms.gov/provider-data/api/1/datastore/query/{dataset_id}"
    params = {"limit": limit, "offset": offset}
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url, params=params, timeout=timeout)
    # Surface HTTP errors instead of reporting them as "no rows".
    response.raise_for_status()
    return response.json().get("results", [])
Healthcare Dashboard
import sqlite3
from datetime import datetime
def init_health_db(db_path="healthcare.db"):
    """Open the snapshot database and make sure its schema exists.

    Args:
        db_path: SQLite database location. Defaults to ``healthcare.db``
            in the current working directory; pass ``":memory:"`` for a
            throwaway in-memory database.

    Returns:
        An open ``sqlite3.Connection``. The caller is responsible for
        closing it.

    Raises:
        sqlite3.Error: If the database cannot be opened or the table
            cannot be created.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute(
            """
            CREATE TABLE IF NOT EXISTS trial_snapshots (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                condition TEXT,
                trial_count INTEGER,
                snapshot_date TEXT
            )
            """
        )
        conn.commit()
    except Exception:
        # Do not leak the handle when schema setup fails.
        conn.close()
        raise
    return conn
def snapshot_trials(conn, condition):
    """Store today's trial count for *condition* in ``trial_snapshots``.

    Args:
        conn: Open SQLite connection whose database contains the
            ``trial_snapshots`` table.
        condition: Condition search term passed to
            ``search_clinical_trials``.

    Returns:
        None. One row is inserted and committed.
    """
    trial_count = len(search_clinical_trials(condition))
    # The snapshot date is the local calendar date at insert time.
    row = (condition, trial_count, datetime.now().strftime("%Y-%m-%d"))
    conn.execute(
        "INSERT INTO trial_snapshots (condition, trial_count, snapshot_date) VALUES (?, ?, ?)",
        row,
    )
    conn.commit()
# Record one snapshot per tracked condition, then release the database handle.
conn = init_health_db()
try:
    for condition in ["diabetes", "cancer", "alzheimer"]:
        snapshot_trials(conn, condition)
finally:
    # Close the connection even if a snapshot fails part-way through.
    conn.close()
Scraping Additional Sources
For healthcare sites requiring JavaScript rendering, use ScraperAPI. For large-scale collection, ThorData provides residential proxies and ScrapeOps offers pipeline monitoring.
Conclusion
Collecting healthcare data from these public APIs opens up valuable insights for researchers and health tech builders, and provides a wealth of information for evidence-based decision making.
Happy building! Follow for more health tech tutorials.
Top comments (0)