Introduction
Satellite imagery metadata powers applications from agricultural monitoring to urban planning and disaster response. Platforms like NASA Earthdata, Copernicus, and USGS Earth Explorer provide vast catalogs of imagery metadata that can be scraped and analyzed programmatically. This tutorial shows you how to build scalable satellite metadata scrapers.
Setup
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import time
from datetime import datetime, timedelta
# For accessing protected catalogs at scale
# Get your API key: https://www.scraperapi.com?fp_ref=the52
SCRAPER_API_KEY = "your_key_here"
BASE_URL = "http://api.scraperapi.com"
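The catalog APIs below are open and don't need a proxy, but for HTML catalog pages that require JavaScript rendering you can route requests through ScraperAPI's endpoint. A minimal sketch using the constants above (fetch_via_proxy is our own helper name, not part of any SDK):

def fetch_via_proxy(target_url, render=False):
    """Fetch a URL through the ScraperAPI endpoint defined above."""
    params = {
        "api_key": SCRAPER_API_KEY,
        "url": target_url,
        # render=true asks ScraperAPI to execute JavaScript first
        "render": "true" if render else "false",
    }
    return requests.get(BASE_URL, params=params, timeout=70)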
Querying the NASA Earthdata CMR API
NASA's Common Metadata Repository (CMR) provides a free, powerful search API; no key is required for metadata queries:
def search_nasa_granules(collection, bbox, start_date, end_date, limit=100):
    """Search NASA CMR for satellite imagery granules."""
    url = "https://cmr.earthdata.nasa.gov/search/granules.json"
    params = {
        "collection_concept_id": collection,
        "bounding_box": bbox,
        "temporal": f"{start_date},{end_date}",
        "page_size": min(limit, 2000),  # CMR caps page_size at 2000
        "sort_key": "-start_date",
    }
    response = requests.get(url, params=params)
    if response.status_code == 200:
        data = response.json()
        granules = []
        for entry in data.get("feed", {}).get("entry", []):
            granules.append({
                "id": entry.get("id"),
                "title": entry.get("title"),
                "time_start": entry.get("time_start"),
                "time_end": entry.get("time_end"),
                "cloud_cover": entry.get("cloud_cover"),
                "bbox": entry.get("boxes", []),
                "download_url": extract_download_url(entry),
            })
        return granules
    return []
def extract_download_url(entry):
    """Extract the data download URL from granule metadata."""
    for link in entry.get("links", []):
        if link.get("rel") == "http://esipfed.org/ns/fedsearch/1.1/data#":
            return link.get("href")
    return None
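For example, pulling recent granules for the Bay Area bounding box used later in this tutorial (CMR expects the box as min_lon,min_lat,max_lon,max_lat; the concept ID is the same one the catalog builder below uses):

granules = search_nasa_granules(
    collection="C2021957657-LPCLOUD",
    bbox="-122.5,37.5,-122.0,38.0",  # min_lon,min_lat,max_lon,max_lat
    start_date="2026-01-01T00:00:00Z",
    end_date="2026-03-01T00:00:00Z",
)
for g in granules[:5]:
    print(g["title"], g["time_start"], g["cloud_cover"])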
Querying the Copernicus Data Space Ecosystem
ESA retired the Copernicus Open Access Hub in late 2023; Sentinel metadata is now served by the Copernicus Data Space Ecosystem's OData catalogue:
def search_copernicus(bbox, start_date, end_date, platform="SENTINEL-2"):
    """Search the Copernicus Data Space catalogue for Sentinel imagery."""
    # Use reliable proxies for EU data portals
    # ThorData: https://thordata.com/?via=the-data
    url = "https://catalogue.dataspace.copernicus.eu/odata/v1/Products"
    # bbox arrives as "min_lon,min_lat,max_lon,max_lat"; the OData
    # spatial filter wants a closed WKT polygon
    min_lon, min_lat, max_lon, max_lat = map(float, bbox.split(","))
    polygon = (
        f"POLYGON(({min_lon} {min_lat},{max_lon} {min_lat},"
        f"{max_lon} {max_lat},{min_lon} {max_lat},{min_lon} {min_lat}))"
    )
    filter_parts = [
        f"Collection/Name eq '{platform}'",  # collection names are uppercase
        f"OData.CSC.Intersects(area=geography'SRID=4326;{polygon}')",
        f"ContentDate/Start gt {start_date}T00:00:00.000Z",
        f"ContentDate/Start lt {end_date}T23:59:59.999Z",
    ]
    filter_str = " and ".join(filter_parts)
    params = {
        "$filter": filter_str,
        "$orderby": "ContentDate/Start desc",
        "$top": 100,
        # Cloud cover is not a top-level product field; it lives in the
        # expandable Attributes collection
        "$expand": "Attributes",
    }
    response = requests.get(url, params=params)
    if response.status_code == 200:
        data = response.json()
        products = []
        for item in data.get("value", []):
            cloud_cover = next(
                (a.get("Value") for a in item.get("Attributes", [])
                 if a.get("Name") == "cloudCover"),
                None,
            )
            products.append({
                "id": item["Id"],
                "name": item["Name"],
                "date": item.get("ContentDate", {}).get("Start"),
                "cloud_cover": cloud_cover,
                "size": item.get("ContentLength"),
                "platform": platform,
            })
        return products
    return []
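Calling it with the same bounding-box string:

products = search_copernicus("-122.5,37.5,-122.0,38.0",
                             "2026-01-01", "2026-03-01")
for p in products[:5]:
    print(p["name"], p["date"], p["cloud_cover"])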
USGS Earth Explorer Scraping
Earth Explorer's catalog is exposed programmatically through the USGS Machine-to-Machine (M2M) API. Unlike CMR and the Copernicus catalogue, it requires a registered account and an auth token; a login sketch follows the function below.
def search_usgs_landsat(bbox, start_date, end_date, api_token):
    """Search USGS M2M for Landsat Collection 2 Level-2 scene metadata.

    bbox is a (min_lon, min_lat, max_lon, max_lat) tuple; dates are
    YYYY-MM-DD strings; api_token comes from the M2M login endpoint.
    """
    url = "https://m2m.cr.usgs.gov/api/api/json/stable/scene-search"
    payload = {
        "datasetName": "landsat_ot_c2_l2",
        "spatialFilter": {
            "filterType": "mbr",
            "lowerLeft": {"latitude": bbox[1], "longitude": bbox[0]},
            "upperRight": {"latitude": bbox[3], "longitude": bbox[2]},
        },
        "temporalFilter": {
            "startDate": start_date,
            "endDate": end_date,
        },
        "maxResults": 100,
    }
    # M2M rejects unauthenticated requests; the token goes in a header
    response = requests.post(url, json=payload,
                             headers={"X-Auth-Token": api_token})
    if response.status_code == 200:
        data = response.json()
        return data.get("data", {}).get("results", [])
    return []
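A minimal login sketch, assuming your USGS account has M2M access approved (the credentials are placeholders; the API key returned by the login endpoint is short-lived and is sent as the X-Auth-Token header on every request):

def usgs_login(username, password):
    """Exchange USGS credentials for a short-lived M2M API token."""
    url = "https://m2m.cr.usgs.gov/api/api/json/stable/login"
    response = requests.post(url, json={"username": username,
                                        "password": password})
    response.raise_for_status()
    return response.json()["data"]  # the token string

token = usgs_login("your_username", "your_password")
scenes = search_usgs_landsat((-122.5, 37.5, -122.0, 38.0),
                             "2026-01-01", "2026-03-01", token)
print(f"USGS: {len(scenes)} scenes")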
Building a Multi-Source Catalog
import sqlite3
def build_imagery_catalog(region_bbox, date_range, db_path="satellite.db"):
    """Build a unified satellite imagery catalog from multiple sources."""
    # Monitor scraping performance
    # https://scrapeops.io/?fpr=the-data28
    conn = sqlite3.connect(db_path)
    conn.execute("""
        CREATE TABLE IF NOT EXISTS imagery (
            source TEXT, id TEXT, name TEXT,
            date TEXT, cloud_cover REAL,
            bbox TEXT, platform TEXT,
            scraped_at TEXT
        )
    """)
    start, end = date_range
    nasa_results = search_nasa_granules(
        "C2021957657-LPCLOUD", region_bbox, start, end
    )
    print(f"NASA: {len(nasa_results)} granules")
    time.sleep(2)  # be polite between catalog requests
    cop_results = search_copernicus(region_bbox, start, end)
    print(f"Copernicus: {len(cop_results)} products")
    # Normalize both sources to the table schema; appending DataFrames
    # whose columns don't match the table would make to_sql fail
    rows = []
    for g in nasa_results:
        rows.append({
            "source": "nasa_cmr", "id": g["id"], "name": g["title"],
            "date": g["time_start"], "cloud_cover": g["cloud_cover"],
            "bbox": json.dumps(g["bbox"]), "platform": None,
        })
    for p in cop_results:
        rows.append({
            "source": "copernicus", "id": p["id"], "name": p["name"],
            "date": p["date"], "cloud_cover": p["cloud_cover"],
            "bbox": None, "platform": p["platform"],
        })
    if rows:
        df = pd.DataFrame(rows)
        df["scraped_at"] = datetime.now().isoformat()
        df.to_sql("imagery", conn, if_exists="append", index=False)
    conn.close()
    return len(rows)
if __name__ == "__main__":
    bbox = "-122.5,37.5,-122.0,38.0"
    count = build_imagery_catalog(bbox, ("2026-01-01", "2026-03-01"))
    print(f"Cataloged {count} imagery products")
Conclusion
Satellite imagery metadata is abundant and freely available through NASA, ESA, and USGS APIs. By building multi-source scrapers, you can create comprehensive catalogs for any region of interest. Use ScraperAPI when accessing web-based catalogs that require JavaScript rendering, and consider ThorData proxies for accessing region-specific data portals reliably.