DEV Community

agenthustler
agenthustler

Posted on

How to Scrape Satellite Data: Sentinel Hub and NASA Earthdata

Satellite imagery and Earth observation data are freely available from NASA and ESA. These datasets power everything from climate research to precision agriculture. Let's build Python tools to access and process this data.

Available Satellite Data Sources

  • Sentinel Hub (ESA): Sentinel-1 (radar), Sentinel-2 (optical), Sentinel-3 (ocean/land)
  • NASA Earthdata: Landsat, MODIS, VIIRS, GPM (precipitation)
  • NOAA: Weather satellites, ocean data
  • Planet (commercial): Daily global imagery

Setting Up

pip install requests rasterio numpy matplotlib sentinelhub
Enter fullscreen mode Exit fullscreen mode

Accessing NASA Earthdata

NASA's CMR (Common Metadata Repository) API lets you search their entire catalog:

import requests
import json

def search_earthdata(keyword, start_date, end_date, bbox=None):
    """Search NASA Earthdata for datasets."""
    url = "https://cmr.earthdata.nasa.gov/search/granules.json"
    params = {
        "keyword": keyword,
        "temporal[]": f"{start_date},{end_date}",
        "page_size": 50,
        "sort_key": "-start_date"
    }

    if bbox:  # [west, south, east, north]
        params["bounding_box[]"] = ",".join(map(str, bbox))

    resp = requests.get(url, params=params, timeout=30)
    data = resp.json()

    results = []
    for entry in data.get("feed", {}).get("entry", []):
        results.append({
            "title": entry.get("title", ""),
            "dataset": entry.get("dataset_id", ""),
            "time_start": entry.get("time_start", ""),
            "time_end": entry.get("time_end", ""),
            "links": [
                l["href"] for l in entry.get("links", [])
                if l.get("rel") == "http://esipfed.org/ns/fedsearch/1.1/data#"
            ]
        })
    return results

# Search for Landsat imagery over California
results = search_earthdata(
    "Landsat",
    "2025-01-01", "2025-03-01",
    bbox=[-124.48, 32.53, -114.13, 42.01]
)
print(f"Found {len(results)} granules")
Enter fullscreen mode Exit fullscreen mode

Downloading Sentinel-2 Data

from sentinelhub import (
    SHConfig, BBox, CRS, SentinelHubRequest,
    DataCollection, MimeType, bbox_to_dimensions
)

def download_sentinel2(bbox_coords, time_interval, resolution=10):
    """Download Sentinel-2 imagery for an area."""
    config = SHConfig()
    config.sh_client_id = "YOUR_CLIENT_ID"
    config.sh_client_secret = "YOUR_CLIENT_SECRET"

    bbox = BBox(bbox=bbox_coords, crs=CRS.WGS84)
    size = bbox_to_dimensions(bbox, resolution=resolution)

    evalscript = """
    //VERSION=3
    function setup() {
        return {
            input: [{ bands: ["B04", "B03", "B02", "B08"] }],
            output: { bands: 4 }
        };
    }
    function evaluatePixel(sample) {
        return [sample.B04, sample.B03, sample.B02, sample.B08];
    }
    """

    request = SentinelHubRequest(
        evalscript=evalscript,
        input_data=[
            SentinelHubRequest.input_data(
                data_collection=DataCollection.SENTINEL2_L2A,
                time_interval=time_interval
            )
        ],
        responses=[
            SentinelHubRequest.output_response("default", MimeType.TIFF)
        ],
        bbox=bbox,
        size=size,
        config=config
    )

    data = request.get_data()
    return data[0]  # numpy array

# Download imagery for a farm area
image = download_sentinel2(
    bbox_coords=[-122.5, 37.7, -122.3, 37.9],
    time_interval=("2025-01-01", "2025-01-31"),
    resolution=10
)
print(f"Image shape: {image.shape}")
Enter fullscreen mode Exit fullscreen mode

Computing Vegetation Index (NDVI)

import numpy as np
import matplotlib.pyplot as plt

def compute_ndvi(sentinel_image):
    """Calculate NDVI from Sentinel-2 bands."""
    red = sentinel_image[:, :, 0].astype(float)
    nir = sentinel_image[:, :, 3].astype(float)

    ndvi = np.where(
        (nir + red) == 0, 0,
        (nir - red) / (nir + red)
    )
    return ndvi

def plot_ndvi(ndvi, title="NDVI Map"):
    """Visualize NDVI with color mapping."""
    fig, ax = plt.subplots(figsize=(10, 8))
    im = ax.imshow(ndvi, cmap="RdYlGn", vmin=-1, vmax=1)
    plt.colorbar(im, ax=ax, label="NDVI")
    ax.set_title(title)
    plt.savefig("ndvi_map.png", dpi=150, bbox_inches="tight")
    plt.show()

ndvi = compute_ndvi(image)
plot_ndvi(ndvi, "NDVI - San Francisco Bay Area")
Enter fullscreen mode Exit fullscreen mode

Time Series Analysis

Track vegetation health over time:

def ndvi_time_series(bbox_coords, start_date, end_date, interval_days=16):
    """Generate NDVI time series for an area."""
    from datetime import datetime, timedelta

    start = datetime.strptime(start_date, "%Y-%m-%d")
    end = datetime.strptime(end_date, "%Y-%m-%d")
    current = start

    series = []
    while current < end:
        next_date = current + timedelta(days=interval_days)
        time_range = (
            current.strftime("%Y-%m-%d"),
            next_date.strftime("%Y-%m-%d")
        )

        try:
            image = download_sentinel2(bbox_coords, time_range)
            ndvi = compute_ndvi(image)
            series.append({
                "date": current.strftime("%Y-%m-%d"),
                "mean_ndvi": float(np.nanmean(ndvi)),
                "max_ndvi": float(np.nanmax(ndvi)),
                "min_ndvi": float(np.nanmin(ndvi))
            })
        except Exception as e:
            print(f"No data for {current.strftime('%Y-%m-%d')}: {e}")

        current = next_date

    return pd.DataFrame(series)
Enter fullscreen mode Exit fullscreen mode

Scraping Supplementary Data

Combine satellite data with scraped ground-truth data for better analysis. Use ScraperAPI to collect weather station data, crop reports, and market prices. ThorData helps when scraping agricultural data from international sources, and ScrapeOps monitors your data pipeline.

Practical Applications

def crop_health_report(farm_bbox, date_range):
    """Generate a crop health report for a farm."""
    image = download_sentinel2(farm_bbox, date_range)
    ndvi = compute_ndvi(image)

    total_pixels = ndvi.size
    healthy = np.sum(ndvi > 0.6) / total_pixels * 100
    stressed = np.sum((ndvi > 0.2) & (ndvi <= 0.6)) / total_pixels * 100
    bare = np.sum(ndvi <= 0.2) / total_pixels * 100

    report = {
        "healthy_vegetation": f"{healthy:.1f}%",
        "stressed_vegetation": f"{stressed:.1f}%",
        "bare_soil": f"{bare:.1f}%",
        "mean_ndvi": f"{np.nanmean(ndvi):.3f}",
        "assessment": "Good" if healthy > 70 else "Needs attention"
    }
    return report
Enter fullscreen mode Exit fullscreen mode

Conclusion

Satellite data scraping opens up powerful possibilities in agriculture, environmental monitoring, urban planning, and climate research. NASA and ESA provide free data. The challenge is processing it efficiently. Start with Sentinel-2 optical data (easiest to work with), master NDVI analysis, then expand to radar data and multi-temporal analysis.

The combination of satellite imagery and ground-truth web scraping creates datasets that would cost millions to collect manually.

Top comments (0)