Scraping OpenStreetMap: Geographic and POI Data at Scale
OpenStreetMap (OSM) is the world's largest open geographic database — a Wikipedia for maps. With data on roads, buildings, and POIs for the entire planet, it's invaluable for location-based apps. Completely free and open.
Overpass API
import requests, json, time
from datetime import datetime
class OSMScraper:
    """Collect POIs from the Overpass API and geocode via Nominatim.

    Uses a single requests.Session with a descriptive User-Agent, as
    required by the OSM/Nominatim usage policies.
    """

    # HTTPS endpoints; both services are public, shared, and rate-limited.
    OVERPASS = "https://overpass-api.de/api/interpreter"
    NOMINATIM = "https://nominatim.openstreetmap.org"

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': 'OSMCollector/1.0'})

    @staticmethod
    def _quote(value):
        # Escape backslashes and double quotes so caller-supplied strings
        # cannot break out of the quoted Overpass QL tag filter
        # (query-injection hardening).
        return str(value).replace('\\', '\\\\').replace('"', '\\"')

    def query(self, q, timeout=60):
        """POST an Overpass QL query and return the decoded JSON.

        Raises requests.HTTPError on a non-2xx response instead of trying
        to JSON-decode an HTML error page.
        """
        resp = self.session.post(self.OVERPASS, data={'data': q}, timeout=timeout)
        resp.raise_for_status()
        return resp.json()

    def pois_in_bbox(self, bbox, amenity):
        """Return raw Overpass JSON for nodes/ways with the given amenity.

        bbox is (south, west, north, east); values are coerced to float so
        non-numeric input fails fast rather than corrupting the query.
        """
        s, w, n, e = (float(x) for x in bbox)
        a = self._quote(amenity)
        q = f'''[out:json][timeout:60];
(node["amenity"="{a}"]({s},{w},{n},{e});
way["amenity"="{a}"]({s},{w},{n},{e}););
out center body;'''
        return self.query(q)

    def city_amenity(self, city, amenity):
        """Return parsed POIs of one amenity type inside the named area."""
        a = self._quote(amenity)
        q = f'''[out:json][timeout:120];
area["name"="{self._quote(city)}"]->.a;
(node["amenity"="{a}"](area.a);
way["amenity"="{a}"](area.a););
out center body;'''
        # HTTP timeout must exceed the server-side QL timeout (120 s),
        # otherwise we abandon queries the server is still running.
        result = self.query(q, timeout=130)
        return self._parse(result)

    def _parse(self, result):
        """Flatten Overpass elements into dicts.

        Each POI carries id/type/tags/name plus lat/lon when available:
        nodes have coordinates directly; ways use the precomputed 'center'
        (requested via `out center`). Ways without a center get no lat/lon.
        """
        pois = []
        for el in result.get('elements', []):
            tags = el.get('tags', {})
            poi = {'id': el['id'], 'type': el['type'],
                   'tags': tags, 'name': tags.get('name', 'Unnamed')}
            if el['type'] == 'node':
                poi['lat'], poi['lon'] = el['lat'], el['lon']
            elif 'center' in el:
                poi['lat'] = el['center']['lat']
                poi['lon'] = el['center']['lon']
            pois.append(poi)
        return pois

    # --- Geocoding with Nominatim ---

    def geocode(self, query, limit=5):
        """Forward-geocode a free-text query via Nominatim /search.

        Sleeps 1 s after the request: Nominatim's usage policy caps
        clients at 1 request/second.
        """
        resp = self.session.get(f"{self.NOMINATIM}/search",
                                params={'q': query, 'format': 'json',
                                        'limit': limit, 'addressdetails': 1},
                                timeout=30)
        resp.raise_for_status()
        time.sleep(1)  # Required: 1 req/sec
        return resp.json()

    def reverse_geocode(self, lat, lon):
        """Reverse-geocode a coordinate pair via Nominatim /reverse."""
        resp = self.session.get(f"{self.NOMINATIM}/reverse",
                                params={'lat': lat, 'lon': lon,
                                        'format': 'json', 'addressdetails': 1},
                                timeout=30)
        resp.raise_for_status()
        time.sleep(1)  # Required: 1 req/sec
        return resp.json()
Bulk POI Collection
class BulkCollector:
    """Collect many amenity types for a city through an OSMScraper."""

    def __init__(self, scraper):
        self.scraper = scraper

    def collect_amenities(self, city, types, pause=5.0):
        """Fetch POIs for every amenity type in `types`.

        pause: seconds to wait between Overpass queries (be polite to the
        shared public API; default matches the original 5 s behavior).
        Returns {amenity_type: [poi, ...]}.
        """
        all_pois = {}
        last = len(types) - 1
        for i, amenity in enumerate(types):
            print(f"Collecting {amenity} in {city}...")
            pois = self.scraper.city_amenity(city, amenity)
            all_pois[amenity] = pois
            print(f" Found {len(pois)}")
            if i < last:  # no point pausing after the final query
                time.sleep(pause)
        return all_pois

    def city_profile(self, city, pause=5.0):
        """Collect a fixed set of common amenity types and summarize counts.

        Returns (summary, data) where summary holds per-type and total
        counts and data holds the full POI lists.
        """
        types = ['restaurant', 'cafe', 'bar', 'hospital', 'school',
                 'university', 'pharmacy', 'bank', 'fuel', 'library']
        data = self.collect_amenities(city, types, pause=pause)
        summary = {'city': city,
                   'total': sum(len(v) for v in data.values()),
                   'breakdown': {k: len(v) for k, v in data.items()}}
        return summary, data
Export to GeoJSON
def to_geojson(pois, output='pois.geojson'):
    """Write POIs (dicts as produced by OSMScraper._parse) as GeoJSON.

    POIs lacking coordinates (e.g. ways Overpass returned without a
    computed center) are skipped. OSM tags are merged into each feature's
    properties alongside name and osm_id.
    """
    features = []
    for p in pois:
        # Require both coordinates, not just 'lat', before emitting geometry.
        if 'lat' not in p or 'lon' not in p:
            continue
        features.append({
            'type': 'Feature',
            # GeoJSON coordinate order is [longitude, latitude] (RFC 7946).
            'geometry': {'type': 'Point',
                         'coordinates': [p['lon'], p['lat']]},
            'properties': {'name': p.get('name', 'Unnamed'),
                           'osm_id': p['id'],
                           **p.get('tags', {})},
        })
    gj = {'type': 'FeatureCollection', 'features': features}
    # utf-8 + ensure_ascii=False keeps non-ASCII place names human-readable.
    with open(output, 'w', encoding='utf-8') as f:
        json.dump(gj, f, indent=2, ensure_ascii=False)
    print(f"Exported {len(features)} POIs to {output}")
if __name__ == '__main__':
    # Demo: guard the script body so importing this module does not fire
    # network requests against the public Overpass API.
    scraper = OSMScraper()
    restaurants = scraper.city_amenity("Berlin", "restaurant")
    to_geojson(restaurants, 'berlin_restaurants.geojson')
OSM's APIs are free but rate-limited: the public Overpass instance throttles heavy query load, and Nominatim's usage policy allows at most one request per second with a descriptive User-Agent. For supplementary scraping of other sites, ScraperAPI handles anti-bot measures, ThorData provides proxy rotation, and ScrapeOps offers monitoring.
Follow for more Python GIS tutorials.
Top comments (0)