Local business data is the backbone of hyperlocal SEO strategies. Whether you're building a local directory, analyzing competitor presence, or helping small businesses improve their visibility, programmatic access to business listings gives you a massive edge.
In this tutorial, I'll show you how to scrape local business data from public directories using Python, structure it for SEO analysis, and identify gaps in local market coverage.
Why Hyperlocal SEO Data Matters
Hyperlocal SEO targets users searching within a specific neighborhood or city block. Think "coffee shop near Union Square" rather than "best coffee shops." The businesses that win these searches have:
- Consistent NAP (Name, Address, Phone) across directories
- Rich category metadata
- Reviews and ratings signals
- Accurate hours and service descriptions
Scraping this data lets you audit consistency, find unclaimed listings, and spot opportunities competitors miss.
Setting Up the Scraper
First, install dependencies:
pip install requests beautifulsoup4 pandas
We'll use ScraperAPI to handle proxies and anti-bot measures, which is essential for directory sites that aggressively block scrapers.
import json
import time
from urllib.parse import quote_plus, urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
SCRAPER_API_KEY = "YOUR_SCRAPERAPI_KEY"  # replace with your own ScraperAPI key
BASE_URL = "https://api.scraperapi.com"  # ScraperAPI proxy endpoint all requests go through
def scrape_business_listing(url: str) -> dict:
    """Scrape a single business listing page.

    Fetches *url* through the ScraperAPI proxy endpoint (with JS
    rendering enabled) and extracts the core NAP/SEO fields.

    Args:
        url: Absolute URL of the business detail page.

    Returns:
        A dict with name, address, city, phone, category, rating,
        review_count, hours, website, and source_url keys; fields
        missing on the page come back as empty strings.

    Raises:
        requests.HTTPError: If the proxy request returns an error status.
    """
    params = {
        "api_key": SCRAPER_API_KEY,
        "url": url,
        "render": "true",  # directory detail pages often need JS rendering
    }
    response = requests.get(BASE_URL, params=params, timeout=60)
    # Fail fast on blocked/error responses instead of silently parsing
    # an error page into a record full of empty strings.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    return {
        "name": extract_text(soup, "h1.business-name"),
        "address": extract_text(soup, ".street-address"),
        "city": extract_text(soup, ".locality"),
        "phone": extract_text(soup, ".phone"),
        "category": extract_text(soup, ".category"),
        "rating": extract_text(soup, ".rating-value"),
        "review_count": extract_text(soup, ".review-count"),
        "hours": extract_text(soup, ".business-hours"),
        "website": extract_attr(soup, "a.website-link", "href"),
        "source_url": url,
    }
def extract_text(soup, selector: str) -> str:
    """Return the stripped text of the first element matching *selector*.

    Falls back to an empty string when nothing matches.
    """
    match = soup.select_one(selector)
    if match is None:
        return ""
    return match.get_text(strip=True)
def extract_attr(soup, selector: str, attr: str) -> str:
    """Return attribute *attr* of the first element matching *selector*.

    Falls back to an empty string when the element or attribute is absent.
    """
    match = soup.select_one(selector)
    return "" if match is None else match.get(attr, "")
Crawling Category Pages
Most directories organize businesses by category and location. Here's how to crawl through listing pages:
def scrape_category(category: str, location: str, max_pages: int = 5) -> list:
    """Scrape all businesses in a category for a location.

    Walks the directory's paginated search results and scrapes every
    business detail page found, stopping early when a page has no listings.

    Args:
        category: Search term, e.g. "restaurants".
        location: Directory location slug, e.g. "brooklyn-ny".
        max_pages: Upper bound on result pages to crawl.

    Returns:
        A list of business dicts as produced by scrape_business_listing().

    Raises:
        requests.HTTPError: If a proxy request returns an error status.
    """
    businesses = []
    for page in range(1, max_pages + 1):
        # quote_plus() protects against categories/locations containing
        # spaces or other reserved URL characters.
        search_url = (
            "https://example-directory.com/search"
            f"?q={quote_plus(category)}&loc={quote_plus(location)}&page={page}"
        )
        params = {
            "api_key": SCRAPER_API_KEY,
            "url": search_url,
        }
        response = requests.get(BASE_URL, params=params, timeout=60)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        listings = soup.select(".business-card")
        if not listings:
            break  # ran past the last results page
        for listing in listings:
            anchor = listing.select_one("a")
            if anchor is None or not anchor.get("href"):
                continue  # card without a detail link — skip, don't crash
            # Directory links are often relative; resolve against the page URL.
            detail_url = urljoin(search_url, anchor["href"])
            businesses.append(scrape_business_listing(detail_url))
            time.sleep(1)  # Be respectful
        print(f"Page {page}: found {len(listings)} listings")
        time.sleep(2)
    return businesses
NAP Consistency Analysis
Once you have data from multiple directories, cross-reference it to find inconsistencies:
def analyze_nap_consistency(datasets: dict[str, pd.DataFrame]) -> pd.DataFrame:
    """Compare NAP data across multiple directory sources.

    For every business name in the first source, looks the name up in
    every other source and records each pairwise address/phone mismatch.

    Args:
        datasets: Mapping of source label -> DataFrame with at least
            "name", "address", and "phone" columns.

    Returns:
        A DataFrame of mismatches with columns "business", "field",
        and one "<source>_value" column per conflicting source pair.
        Empty when no conflicts (or fewer than two sources) are found.
    """
    issues = []
    if not datasets:
        return pd.DataFrame(issues)
    # Names come from the first source only; use fuzzy matching
    # (e.g. token-based similarity) in production.
    reference = next(iter(datasets.values()))
    for name in reference["name"].dropna().unique():
        records = {}
        for source, df in datasets.items():
            # regex=False: business names frequently contain regex
            # metacharacters like "(" or "+" — match them literally.
            match = df[df["name"].str.contains(name, case=False, na=False, regex=False)]
            if not match.empty:
                records[source] = match.iloc[0]
        if len(records) < 2:
            continue  # need at least two sources to compare
        sources = list(records.keys())
        for i in range(len(sources)):
            for j in range(i + 1, len(sources)):
                r1, r2 = records[sources[i]], records[sources[j]]
                for field in ("address", "phone"):
                    # Only flag when both sources have a value and they disagree.
                    if r1[field] and r2[field] and r1[field] != r2[field]:
                        issues.append({
                            "business": name,
                            "field": field,
                            f"{sources[i]}_value": r1[field],
                            f"{sources[j]}_value": r2[field],
                        })
    return pd.DataFrame(issues)
Generating SEO Reports
Finally, generate actionable reports:
def generate_seo_report(businesses: list[dict], location: str) -> dict:
    """Summarize scraped listings into an SEO opportunity report.

    Args:
        businesses: List of business dicts (as returned by
            scrape_business_listing); may be empty.
        location: Human-readable location label for the report header.

    Returns:
        A dict with totals, website coverage, average rating
        (None when no parseable ratings), category counts, and
        count of listings missing hours. Also prints a short summary.
    """
    df = pd.DataFrame(businesses)
    if df.empty:
        # Guard: an empty DataFrame has no columns, so the field
        # lookups below would raise KeyError.
        report = {
            "location": location,
            "total_businesses": 0,
            "with_website": 0,
            "without_website": 0,
            "avg_rating": None,
            "categories": {},
            "missing_hours": 0,
        }
    else:
        has_website = df["website"].astype(bool)
        # to_numeric(errors="coerce") tolerates scraped junk such as
        # "" or "4.5 stars" instead of raising on astype(float).
        ratings = pd.to_numeric(df["rating"], errors="coerce").dropna()
        report = {
            "location": location,
            "total_businesses": len(df),
            "with_website": int(has_website.sum()),
            "without_website": int((~has_website).sum()),
            "avg_rating": ratings.mean() if not ratings.empty else None,
            "categories": df["category"].value_counts().to_dict(),
            "missing_hours": int((df["hours"] == "").sum()),
        }
    print(f"\n=== Hyperlocal SEO Report: {location} ===")
    print(f"Total businesses: {report['total_businesses']}")
    print(f"Without website ({report['without_website']}): SEO opportunity!")
    print(f"Missing hours ({report['missing_hours']}): Incomplete listings")
    return report
# Run the full pipeline
# Crawls three result pages of Brooklyn restaurant listings (performs
# live HTTP requests through ScraperAPI) and summarizes the scraped
# records into an SEO opportunity report.
businesses = scrape_category("restaurants", "brooklyn-ny", max_pages=3)
report = generate_seo_report(businesses, "Brooklyn, NY")
Scaling with Proxy Infrastructure
For production scraping across multiple cities, you need reliable proxy rotation. ScraperAPI handles this automatically with geo-targeted proxies. For residential proxy needs, ThorData offers competitive pricing on rotating residential IPs.
To monitor your scraping jobs and track success rates, ScrapeOps provides dashboards that show you exactly where your scrapers are failing.
Key Takeaways
- Hyperlocal business data reveals SEO gaps that generic tools miss
- NAP consistency audits across directories are high-value services
- Always use proxy rotation for directory scraping at scale
- Businesses without websites represent direct outreach opportunities
Local SEO consulting powered by scraped data is a real revenue stream. The businesses you find without websites? Those are your first clients.
Top comments (0)