DEV Community

agenthustler
agenthustler

Posted on

How to Scrape Satellite Imagery Metadata at Scale

Introduction

Satellite imagery metadata powers applications from agricultural monitoring to urban planning and disaster response. Platforms like NASA Earthdata, Copernicus, and USGS Earth Explorer provide vast catalogs of imagery metadata that can be scraped and analyzed programmatically. This tutorial shows you how to build scalable satellite metadata scrapers.

Setup

# Third-party libraries used throughout the tutorial:
# requests/BeautifulSoup for HTTP + HTML, pandas for tabular output,
# json/time/datetime from the standard library.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import time
from datetime import datetime, timedelta

# For accessing protected catalogs at scale
# Get your API key: https://www.scraperapi.com?fp_ref=the52
# NOTE(review): these two constants are not referenced by any function
# below — presumably intended for web-based catalogs needing a proxy.
SCRAPER_API_KEY = "your_key_here"
BASE_URL = "http://api.scraperapi.com"
Enter fullscreen mode Exit fullscreen mode

Querying NASA Earthdata CMR API

NASA's Common Metadata Repository provides a free, powerful API:

def search_nasa_granules(collection, bbox, start_date, end_date, limit=100):
    """Search NASA CMR for satellite imagery granules.

    Args:
        collection: CMR collection concept ID (e.g. "C2021957657-LPCLOUD").
        bbox: Bounding box string "west,south,east,north".
        start_date: Temporal filter start (ISO date string).
        end_date: Temporal filter end (ISO date string).
        limit: Maximum granules to request; CMR caps page_size at 2000.

    Returns:
        List of dicts with id/title/time_start/time_end/cloud_cover/bbox/
        download_url keys. Empty list on a non-200 response or a network
        failure.
    """
    url = "https://cmr.earthdata.nasa.gov/search/granules.json"

    params = {
        "collection_concept_id": collection,
        "bounding_box": bbox,
        "temporal": f"{start_date},{end_date}",
        "page_size": min(limit, 2000),  # 2000 is CMR's per-page maximum
        "sort_key": "-start_date",      # newest granules first
    }

    try:
        # Timeout keeps an unresponsive endpoint from hanging the scraper;
        # connection/read errors degrade to an empty result like non-200s do.
        response = requests.get(url, params=params, timeout=30)
    except requests.RequestException:
        return []

    if response.status_code != 200:
        return []

    data = response.json()
    granules = []
    for entry in data.get("feed", {}).get("entry", []):
        granules.append({
            "id": entry.get("id"),
            "title": entry.get("title"),
            "time_start": entry.get("time_start"),
            "time_end": entry.get("time_end"),
            "cloud_cover": entry.get("cloud_cover"),
            "bbox": entry.get("boxes", []),
            "download_url": extract_download_url(entry),
        })
    return granules

def extract_download_url(entry):
    """Return the first data-download href in a granule's links, or None.

    CMR marks downloadable assets with the ESIP federated-search "data#"
    link relation; any other relations (browse, metadata, ...) are skipped.
    """
    data_rel = "http://esipfed.org/ns/fedsearch/1.1/data#"
    return next(
        (
            link.get("href")
            for link in entry.get("links", [])
            if link.get("rel") == data_rel
        ),
        None,
    )
Enter fullscreen mode Exit fullscreen mode

Querying the Copernicus Data Space Ecosystem

ESA's Copernicus Data Space Ecosystem (the successor to the retired Open Access Hub) provides Sentinel satellite data via an OData API:

def search_copernicus(bbox, start_date, end_date, platform="Sentinel-2"):
    """Search the Copernicus Data Space OData catalog for Sentinel products.

    Args:
        bbox: Bounding box (currently unused in the OData filter —
              NOTE(review): spatial filtering is not applied; confirm whether
              an OData.CSC.Intersects clause should be added).
        start_date: Inclusive start date, "YYYY-MM-DD".
        end_date: Inclusive end date, "YYYY-MM-DD".
        platform: Collection name, e.g. "Sentinel-1" / "Sentinel-2".

    Returns:
        List of dicts with id/name/date/cloud_cover/size/platform keys.
        Empty list on a non-200 response or a network failure.
    """
    url = "https://catalogue.dataspace.copernicus.eu/odata/v1/Products"

    # OData $filter: restrict by collection and acquisition-start window.
    filter_parts = [
        f"Collection/Name eq '{platform}'",
        f"ContentDate/Start gt {start_date}T00:00:00.000Z",
        f"ContentDate/Start lt {end_date}T23:59:59.999Z",
    ]
    filter_str = " and ".join(filter_parts)

    params = {
        "$filter": filter_str,
        "$orderby": "ContentDate/Start desc",  # newest products first
        "$top": 100,
    }

    try:
        # Timeout prevents an unresponsive portal from blocking the pipeline.
        response = requests.get(url, params=params, timeout=30)
    except requests.RequestException:
        return []

    if response.status_code != 200:
        return []

    data = response.json()
    products = []
    for item in data.get("value", []):
        products.append({
            # .get() rather than [] so one malformed record doesn't
            # abort the whole result set with a KeyError.
            "id": item.get("Id"),
            "name": item.get("Name"),
            "date": item.get("ContentDate", {}).get("Start"),
            "cloud_cover": item.get("CloudCover"),
            "size": item.get("ContentLength"),
            "platform": platform,
        })
    return products
Enter fullscreen mode Exit fullscreen mode

USGS Earth Explorer Scraping

def search_usgs_landsat(bbox, start_date, end_date):
    """Search USGS M2M for Landsat Collection 2 Level-2 scene metadata.

    Args:
        bbox: Sequence [west, south, east, north] in degrees.
        start_date: "YYYY-MM-DD" start of the temporal filter.
        end_date: "YYYY-MM-DD" end of the temporal filter.

    Returns:
        List of scene-result dicts as returned by the API, or an empty
        list on a non-200 response or a network failure.

    NOTE(review): the M2M API normally requires an auth token header;
    this unauthenticated request may be rejected — verify before relying
    on it in production.
    """
    url = "https://m2m.cr.usgs.gov/api/api/json/stable/scene-search"

    payload = {
        "datasetName": "landsat_ot_c2_l2",  # Landsat 8/9 OLI-TIRS C2 L2
        "spatialFilter": {
            "filterType": "mbr",  # minimum bounding rectangle
            "lowerLeft": {"latitude": bbox[1], "longitude": bbox[0]},
            "upperRight": {"latitude": bbox[3], "longitude": bbox[2]},
        },
        "temporalFilter": {
            "startDate": start_date,
            "endDate": end_date,
        },
        "maxResults": 100,
    }

    try:
        # Timeout so a stalled endpoint cannot hang the scraper.
        response = requests.post(url, json=payload, timeout=30)
    except requests.RequestException:
        return []

    if response.status_code != 200:
        return []

    data = response.json()
    return data.get("data", {}).get("results", [])
Enter fullscreen mode Exit fullscreen mode

Building a Multi-Source Catalog

import sqlite3

def _normalize_record(source, rec):
    """Map a source-specific metadata dict onto the unified table schema.

    NASA granules use title/time_start/boxes while Copernicus products use
    name/date — without this mapping the combined DataFrame's columns do
    not match the `imagery` table and the append fails.
    """
    bbox = rec.get("bbox")
    return {
        "source": source,
        "id": rec.get("id"),
        # NASA exposes "title", Copernicus exposes "name".
        "name": rec.get("name") or rec.get("title"),
        # NASA exposes "time_start", Copernicus exposes "date".
        "date": rec.get("date") or rec.get("time_start"),
        "cloud_cover": rec.get("cloud_cover"),
        # Serialize list-valued bboxes: SQLite cannot store Python lists.
        "bbox": json.dumps(bbox) if bbox is not None else None,
        "platform": rec.get("platform"),
    }


def build_imagery_catalog(region_bbox, date_range, db_path="satellite.db"):
    """Build a unified satellite imagery catalog from multiple sources.

    Args:
        region_bbox: Bounding box string "west,south,east,north".
        date_range: (start_date, end_date) tuple of "YYYY-MM-DD" strings.
        db_path: SQLite file to append results into.

    Returns:
        Total number of records cataloged in this run.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute("""
            CREATE TABLE IF NOT EXISTS imagery (
                source TEXT, id TEXT, name TEXT,
                date TEXT, cloud_cover REAL,
                bbox TEXT, platform TEXT,
                scraped_at TEXT
            )
        """)
        conn.commit()

        start, end = date_range

        nasa_results = search_nasa_granules(
            "C2021957657-LPCLOUD", region_bbox, start, end
        )
        print(f"NASA: {len(nasa_results)} granules")

        cop_results = search_copernicus(region_bbox, start, end)
        print(f"Copernicus: {len(cop_results)} products")
        time.sleep(2)  # be polite between catalog requests

        # Normalize both sources to the table schema, tagging provenance.
        rows = (
            [_normalize_record("nasa_cmr", r) for r in nasa_results]
            + [_normalize_record("copernicus", r) for r in cop_results]
        )
        if rows:
            df = pd.DataFrame(rows)
            df["scraped_at"] = datetime.now().isoformat()
            df.to_sql("imagery", conn, if_exists="append", index=False)

        return len(rows)
    finally:
        # Close the connection even if a search or the insert raises.
        conn.close()

if __name__ == "__main__":
    # San Francisco Bay area, first two months of 2026.
    region = "-122.5,37.5,-122.0,38.0"
    total = build_imagery_catalog(region, ("2026-01-01", "2026-03-01"))
    print(f"Cataloged {total} imagery products")
Enter fullscreen mode Exit fullscreen mode

Conclusion

Satellite imagery metadata is abundant and freely available through NASA, ESA, and USGS APIs. By building multi-source scrapers, you can create comprehensive catalogs for any region of interest. Use ScraperAPI when accessing web-based catalogs that require JavaScript rendering, and consider ThorData proxies for accessing region-specific data portals reliably.

Top comments (0)