DEV Community

agenthustler

Scraping Scientific Data: Climate Datasets, NASA APIs, and NOAA

Scientific data is increasingly open and accessible. NASA, NOAA, and other agencies provide APIs and datasets that you can programmatically access for research, visualization, or building climate-aware applications.

NASA APIs

NASA offers over 30 free APIs. No scraping needed — just an API key:

```bash
pip install requests pandas matplotlib
```

Astronomy Picture of the Day

```python
import requests

NASA_KEY = "DEMO_KEY"  # Get your own at api.nasa.gov

def get_apod(date=None):
    url = "https://api.nasa.gov/planetary/apod"
    params = {"api_key": NASA_KEY}
    if date:
        params["date"] = date

    response = requests.get(url, params=params)
    data = response.json()
    return {
        "title": data["title"],
        "date": data["date"],
        "explanation": data["explanation"],
        "image_url": data.get("hdurl", data.get("url"))
    }

apod = get_apod()
print(f"{apod['title']}\n{apod['explanation'][:200]}...")
```
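If you want the image itself rather than just its URL, a small follow-up sketch could download it to disk. The `slugify` and `save_apod_image` helpers here are my own names, not part of the NASA API, and the `.jpg` extension is an assumption (APOD images are usually JPEGs, but not always):

```python
import re
import requests

def slugify(title):
    # Turn an APOD title into a filesystem-safe filename stem
    return re.sub(r"[^a-z0-9]+", "-", title.lower()).strip("-")

def save_apod_image(apod, directory="."):
    # Download the image referenced by a get_apod() result and save it locally.
    # Assumes a JPEG; check the URL's real extension for robustness.
    url = apod["image_url"]
    path = f"{directory}/{apod['date']}-{slugify(apod['title'])}.jpg"
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    with open(path, "wb") as f:
        f.write(response.content)
    return path
```

On some dates APOD is a video, in which case `image_url` points at an embed rather than a downloadable image, so treat this as a sketch rather than production code.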

Near Earth Objects (Asteroids)

```python
def get_near_earth_objects(start_date, end_date):
    url = "https://api.nasa.gov/neo/rest/v1/feed"
    params = {
        "start_date": start_date,
        "end_date": end_date,
        "api_key": NASA_KEY
    }
    response = requests.get(url, params=params)
    data = response.json()

    asteroids = []
    for date, objects in data["near_earth_objects"].items():
        for obj in objects:
            asteroids.append({
                "name": obj["name"],
                "date": date,
                "diameter_min_m": obj["estimated_diameter"]["meters"]["estimated_diameter_min"],
                "diameter_max_m": obj["estimated_diameter"]["meters"]["estimated_diameter_max"],
                "velocity_kmh": float(obj["close_approach_data"][0]["relative_velocity"]["kilometers_per_hour"]),
                "miss_distance_km": float(obj["close_approach_data"][0]["miss_distance"]["kilometers"]),
                "is_hazardous": obj["is_potentially_hazardous_asteroid"]
            })

    return asteroids

neos = get_near_earth_objects("2026-03-01", "2026-03-07")
print(f"Found {len(neos)} near-Earth objects")
hazardous = [a for a in neos if a["is_hazardous"]]
print(f"Potentially hazardous: {len(hazardous)}")
```
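A natural next step is ranking the results. As a sketch working on the dicts built above (these helper names are mine, not part of the NASA API; 384,400 km is the average Earth-Moon distance):

```python
LUNAR_DISTANCE_KM = 384_400  # average Earth-Moon distance

def closest_approaches(asteroids, top_n=5):
    # Sort the dicts produced by get_near_earth_objects(), nearest first
    return sorted(asteroids, key=lambda a: a["miss_distance_km"])[:top_n]

def in_lunar_distances(miss_distance_km):
    # Express a miss distance as multiples of the Earth-Moon distance,
    # which is easier to reason about than raw kilometers
    return miss_distance_km / LUNAR_DISTANCE_KM
```

Lunar distances make headlines like "passed within 2 LD of Earth" easy to reproduce from the raw feed.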

NOAA Climate Data

```python
def get_noaa_data(dataset_id, location_id, start_date, end_date, token):
    url = "https://www.ncdc.noaa.gov/cdo-web/api/v2/data"
    headers = {"token": token}
    params = {
        "datasetid": dataset_id,
        "locationid": location_id,
        "startdate": start_date,
        "enddate": end_date,
        "limit": 1000,
        "units": "metric"
    }

    response = requests.get(url, headers=headers, params=params)
    if response.status_code == 200:
        return response.json().get("results", [])
    return []

data = get_noaa_data(
    dataset_id="GHCND",
    location_id="CITY:US360019",
    start_date="2025-01-01",
    end_date="2025-12-31",
    token="YOUR_NOAA_TOKEN"
)
```
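The CDO API caps each response at 1,000 records, so a full year of daily data for a city usually needs pagination via the `offset` parameter. A sketch (the `cdo_params` helper is my own factoring, and the 0.25 s sleep is a conservative nod to NOAA's documented request-rate limits):

```python
import time
import requests

def cdo_params(dataset_id, location_id, start_date, end_date, offset=1, limit=1000):
    # Build the query parameters for one page of the CDO /data endpoint
    return {
        "datasetid": dataset_id,
        "locationid": location_id,
        "startdate": start_date,
        "enddate": end_date,
        "units": "metric",
        "limit": limit,
        "offset": offset,
    }

def get_noaa_data_paginated(dataset_id, location_id, start_date, end_date, token):
    url = "https://www.ncdc.noaa.gov/cdo-web/api/v2/data"
    headers = {"token": token}
    results, offset = [], 1
    while True:
        response = requests.get(
            url, headers=headers,
            params=cdo_params(dataset_id, location_id, start_date, end_date, offset),
        )
        response.raise_for_status()
        page = response.json().get("results", [])
        results.extend(page)
        if len(page) < 1000:  # a short page means we've reached the end
            break
        offset += 1000
        time.sleep(0.25)  # stay well under NOAA's rate limit
    return results
```

The same loop works for any CDO dataset, not just GHCND.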

EPA Air Quality Data

```python
def get_air_quality(zip_code):
    # The "current" endpoint returns the latest observations and does not
    # take a date parameter; use /aq/observation/zipCode/historical/ for
    # past dates.
    url = "https://www.airnowapi.org/aq/observation/zipCode/current/"
    params = {
        "format": "application/json",
        "zipCode": zip_code,
        "distance": 25,
        "API_KEY": "YOUR_AIRNOW_KEY"
    }
    response = requests.get(url, params=params)
    return response.json()

aqi_data = get_air_quality("10001")
for reading in aqi_data:
    print(f"{reading['ParameterName']}: AQI {reading['AQI']} ({reading['Category']['Name']})")
```
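AirNow reports one observation per pollutant, so a dashboard usually wants the worst of them. A small sketch over the response shape shown above (the helper names are mine; the threshold of 101 is where the EPA's AQI scale enters "Unhealthy for Sensitive Groups"):

```python
def worst_reading(aqi_data):
    # The observation with the highest AQI, or None if the list is empty
    return max(aqi_data, key=lambda r: r["AQI"], default=None)

def needs_alert(aqi_data, threshold=101):
    # AQI 101+ means at least "Unhealthy for Sensitive Groups" on the EPA scale
    worst = worst_reading(aqi_data)
    return worst is not None and worst["AQI"] >= threshold
```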

Building a Climate Dashboard

```python
import pandas as pd
import matplotlib.pyplot as plt

def plot_temperature_trend(data):
    df = pd.DataFrame(data)
    # .copy() avoids pandas' SettingWithCopyWarning when we mutate below
    temp_data = df[df["datatype"] == "TAVG"].copy()

    if temp_data.empty:
        print("No temperature data found")
        return

    temp_data["date"] = pd.to_datetime(temp_data["date"])
    # With units="metric" the CDO API already returns degrees Celsius;
    # divide by 10 only if you fetch raw GHCND values (tenths of a degree).

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(temp_data["date"], temp_data["value"], linewidth=0.8)
    ax.set_xlabel("Date")
    ax.set_ylabel("Temperature (°C)")
    ax.set_title("Daily Average Temperature")
    ax.grid(True, alpha=0.3)
    plt.savefig("climate_trend.png", dpi=150, bbox_inches="tight")
    print("Chart saved to climate_trend.png")
```

Scraping Non-API Sources

Some scientific data is only published as HTML pages rather than through an API. For those, a rendering service like ScraperAPI can handle JavaScript-heavy data portals, ThorData proxies help with geo-restricted datasets, and ScrapeOps can monitor the health of your collection pipeline.
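However the page is fetched, you still have to parse it. Here is a stdlib-only sketch that pulls a simple HTML data table into a list of dicts; in practice BeautifulSoup or lxml is more robust against messy markup, but this shows the shape of the task without extra dependencies:

```python
from html.parser import HTMLParser

class TableParser(HTMLParser):
    # Collects the text of <td>/<th> cells into rows, one list per <tr>
    def __init__(self):
        super().__init__()
        self.rows, self._row, self._cell = [], None, None

    def handle_starttag(self, tag, attrs):
        if tag == "tr":
            self._row = []
        elif tag in ("td", "th"):
            self._cell = []

    def handle_endtag(self, tag):
        if tag in ("td", "th") and self._cell is not None:
            self._row.append("".join(self._cell).strip())
            self._cell = None
        elif tag == "tr" and self._row is not None:
            self.rows.append(self._row)
            self._row = None

    def handle_data(self, data):
        if self._cell is not None:
            self._cell.append(data)

def parse_data_table(html):
    # Treat the first row as headers and zip the remaining rows into dicts
    p = TableParser()
    p.feed(html)
    if not p.rows:
        return []
    headers, *body = p.rows
    return [dict(zip(headers, r)) for r in body if len(r) == len(headers)]
```

This assumes the first table row holds the column headers, which is common on government data portals but worth verifying per site.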

Data Sources Reference

| Source | Type | Auth | URL |
| --- | --- | --- | --- |
| NASA APIs | REST API | Free key | api.nasa.gov |
| NOAA CDO | REST API | Free token | ncdc.noaa.gov |
| EPA AirNow | REST API | Free key | airnowapi.org |
| USGS Earthquakes | GeoJSON | None | earthquake.usgs.gov |
| OpenWeather | REST API | Free tier | openweathermap.org |
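The USGS feed in the table is a good first stop because it needs no key at all. A sketch against its real FDSN event endpoint (the `quake_time` helper is my own; USGS timestamps are milliseconds since the Unix epoch):

```python
from datetime import datetime, timezone
import requests

def quake_time(ms):
    # USGS "time" fields are milliseconds since the Unix epoch, UTC
    return datetime.fromtimestamp(ms / 1000, tz=timezone.utc)

def get_earthquakes(start, end, min_magnitude=4.5):
    # USGS FDSN event service: public, no API key required
    url = "https://earthquake.usgs.gov/fdsnws/event/1/query"
    params = {
        "format": "geojson",
        "starttime": start,
        "endtime": end,
        "minmagnitude": min_magnitude,
    }
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()
    return [
        {
            "magnitude": f["properties"]["mag"],
            "place": f["properties"]["place"],
            "time": quake_time(f["properties"]["time"]),
        }
        for f in response.json()["features"]
    ]
```

Dates go in as ISO strings, e.g. `get_earthquakes("2026-03-01", "2026-03-07")`.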

Tips

  1. Cache aggressively — scientific data rarely changes retroactively
  2. Use bulk downloads for historical data instead of API calls
  3. Check data licenses — most government data is public domain
  4. Handle missing values — sensor outages create gaps in climate data
  5. Use netCDF/HDF5 for large gridded datasets, not CSV
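Tip 1 is easy to act on with a small disk cache. This is a sketch, not a library function: `cached_get_json` is a hypothetical helper that wraps any `(url, params) -> dict` fetcher, so it works for NASA, NOAA, or AirNow alike:

```python
import hashlib
import json
import os

CACHE_DIR = ".api_cache"

def cached_get_json(url, params, fetch):
    # Cache JSON responses on disk, keyed by a hash of url + sorted params.
    # `fetch` is any callable (url, params) -> dict, e.g. a requests wrapper.
    os.makedirs(CACHE_DIR, exist_ok=True)
    key = hashlib.sha256(
        (url + json.dumps(params, sort_keys=True)).encode()
    ).hexdigest()
    path = os.path.join(CACHE_DIR, key + ".json")
    if os.path.exists(path):
        with open(path) as f:
            return json.load(f)  # cache hit: no network call
    data = fetch(url, params)
    with open(path, "w") as f:
        json.dump(data, f)
    return data
```

Because historical climate records rarely change, there is usually no need for cache expiry; delete the directory if you ever want fresh data.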

Follow for more Python data science tutorials!
