DEV Community

agenthustler
agenthustler

Posted on

Scraping OpenStreetMap: Geographic and POI Data at Scale

Scraping OpenStreetMap: Geographic and POI Data at Scale

OpenStreetMap (OSM) is the world's largest open geographic database — a Wikipedia for maps. With data on roads, buildings, and POIs for the entire planet, it's invaluable for location-based apps. Completely free and open.

Querying POIs with the Overpass API

import requests, json, time
from datetime import datetime

class OSMScraper:
    """Collect geographic/POI data from OpenStreetMap.

    Uses two public, shared-capacity services:
      - Overpass API for bulk tag queries (POIs by amenity, bbox, area)
      - Nominatim for geocoding (used by methods defined elsewhere)
    Both require a descriptive User-Agent per OSM usage policies.
    """

    OVERPASS = "http://overpass-api.de/api/interpreter"
    NOMINATIM = "https://nominatim.openstreetmap.org"

    def __init__(self):
        # One session for connection reuse; the User-Agent identifies the
        # client as required by the Overpass/Nominatim usage policies.
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': 'OSMCollector/1.0'})

    def query(self, q, timeout=60):
        """POST a raw Overpass QL query and return the decoded JSON dict.

        Raises requests.HTTPError on non-2xx responses (e.g. 429 Too Many
        Requests or 504 when the server is overloaded) instead of failing
        later with a cryptic JSON decode error on an HTML error page.
        """
        resp = self.session.post(self.OVERPASS, data={'data': q}, timeout=timeout)
        resp.raise_for_status()  # fail fast on rate-limit / gateway errors
        return resp.json()

    def pois_in_bbox(self, bbox, amenity):
        """Return the raw Overpass result for nodes/ways tagged with the
        given amenity inside bbox = (south, west, north, east).

        `out center` makes ways carry a computed 'center' point.
        """
        s, w, n, e = bbox
        q = f'''[out:json][timeout:60];
        (node["amenity"="{amenity}"]({s},{w},{n},{e});
         way["amenity"="{amenity}"]({s},{w},{n},{e}););
        out center body;'''
        return self.query(q)

    def city_amenity(self, city, amenity):
        """Return parsed POIs with the given amenity tag inside the named
        city (resolved via an Overpass area lookup on the "name" tag).

        NOTE(review): `city`/`amenity` are interpolated into Overpass QL;
        values containing double quotes would break the query.
        """
        q = f'''[out:json][timeout:120];
        area["name"="{city}"]->.a;
        (node["amenity"="{amenity}"](area.a);
         way["amenity"="{amenity}"](area.a););
        out center body;'''
        result = self.query(q)
        return self._parse(result)

    def _parse(self, result):
        """Flatten Overpass 'elements' into dicts with id/type/tags/name.

        Nodes carry lat/lon directly; ways get them from the computed
        'center' (present because queries use `out center`). Elements with
        no usable point are kept but without lat/lon keys.
        """
        pois = []
        for el in result.get('elements', []):
            poi = {'id': el['id'], 'type': el['type'],
                   'tags': el.get('tags', {}),
                   'name': el.get('tags', {}).get('name', 'Unnamed')}
            if el['type'] == 'node':
                poi['lat'], poi['lon'] = el['lat'], el['lon']
            elif 'center' in el:
                poi['lat'] = el['center']['lat']
                poi['lon'] = el['center']['lon']
            pois.append(poi)
        return pois
Enter fullscreen mode Exit fullscreen mode

Geocoding with Nominatim

    def geocode(self, query, limit=5):
        """Forward-geocode a free-text query via Nominatim.

        Returns a list of up to `limit` candidate matches (dicts with
        lat/lon/display_name/address). Sleeps 1s after every request to
        honour Nominatim's absolute 1 request/second usage policy, then
        raises requests.HTTPError on non-2xx responses (e.g. 403/429
        policy blocks) instead of returning an opaque error body.
        """
        resp = self.session.get(f"{self.NOMINATIM}/search",
            params={'q': query, 'format': 'json', 'limit': limit, 'addressdetails': 1})
        time.sleep(1)  # Required: 1 req/sec — throttle even on error responses
        resp.raise_for_status()
        return resp.json()

    def reverse_geocode(self, lat, lon):
        """Reverse-geocode a coordinate pair to an address via Nominatim.

        Returns the Nominatim response dict (display_name, address parts).
        Sleeps 1s after every request per Nominatim's 1 request/second
        policy, then raises requests.HTTPError on non-2xx responses so
        policy blocks surface immediately.
        """
        resp = self.session.get(f"{self.NOMINATIM}/reverse",
            params={'lat': lat, 'lon': lon, 'format': 'json', 'addressdetails': 1})
        time.sleep(1)  # throttle even on error responses
        resp.raise_for_status()
        return resp.json()
Enter fullscreen mode Exit fullscreen mode

Bulk POI Collection

class BulkCollector:
    """Batch-collect several amenity types for a city through a scraper.

    `scraper` may be any object exposing city_amenity(city, amenity) -> list.
    """

    def __init__(self, scraper):
        self.scraper = scraper

    def collect_amenities(self, city, types, delay=5):
        """Fetch POIs for each amenity type in `types` within `city`.

        delay: seconds to pause *between* queries as a courtesy throttle
        for the shared Overpass server (default keeps the original 5s).
        No pause is wasted after the final query.
        Returns {amenity_type: [poi, ...]}.
        """
        all_pois = {}
        for i, t in enumerate(types):
            print(f"Collecting {t} in {city}...")
            pois = self.scraper.city_amenity(city, t)
            all_pois[t] = pois
            print(f"  Found {len(pois)}")
            if i < len(types) - 1:  # skip the pointless sleep after the last query
                time.sleep(delay)
        return all_pois

    def city_profile(self, city, delay=5):
        """Collect a standard set of amenity types and summarize counts.

        Returns (summary, data): summary holds the city name, total POI
        count, and a per-type breakdown; data is the full per-type dict.
        """
        types = ['restaurant','cafe','bar','hospital','school',
                 'university','pharmacy','bank','fuel','library']
        data = self.collect_amenities(city, types, delay=delay)
        summary = {'city': city,
                   'total': sum(len(v) for v in data.values()),
                   'breakdown': {k: len(v) for k,v in data.items()}}
        return summary, data
Enter fullscreen mode Exit fullscreen mode

Export to GeoJSON

def to_geojson(pois, output='pois.geojson'):
    """Write POI dicts (as produced by OSMScraper._parse) to a GeoJSON file.

    POIs without coordinates (e.g. ways returned without a 'center') are
    skipped. RFC 7946 requires GeoJSON to be UTF-8, so the file is written
    with an explicit utf-8 encoding regardless of the platform default,
    and ensure_ascii=False keeps non-ASCII POI names human-readable.
    """
    features = []
    for p in pois:
        if 'lat' not in p or 'lon' not in p:
            continue  # no usable point geometry
        features.append({
            'type': 'Feature',
            # GeoJSON positions are [longitude, latitude], in that order
            'geometry': {'type': 'Point', 'coordinates': [p['lon'], p['lat']]},
            'properties': {'name': p['name'], 'osm_id': p['id'], **p.get('tags', {})}
        })
    gj = {'type': 'FeatureCollection', 'features': features}
    with open(output, 'w', encoding='utf-8') as f:
        json.dump(gj, f, indent=2, ensure_ascii=False)
    print(f"Exported {len(features)} POIs to {output}")

if __name__ == "__main__":
    # Example run: export all Berlin restaurants to a GeoJSON file.
    # Guarded so importing this module does not trigger network calls.
    scraper = OSMScraper()
    restaurants = scraper.city_amenity("Berlin", "restaurant")
    to_geojson(restaurants, 'berlin_restaurants.geojson')
Enter fullscreen mode Exit fullscreen mode

OSM's APIs are free but rate-limited: Nominatim enforces a strict one-request-per-second policy (with a required User-Agent), and the public Overpass server is shared capacity that will return 429/504 errors under load. For supplementary scraping of other sources, ScraperAPI handles anti-bot measures, ThorData provides proxy rotation, and ScrapeOps offers job monitoring.


Follow for more Python GIS tutorials.

Top comments (0)