How to Scrape Yelp: Businesses, Reviews, and Ratings
Yelp data is invaluable for market research, lead generation, and competitive analysis. This guide shows you how to extract business listings, reviews, and ratings programmatically.
Use Cases
- Lead generation: Find businesses in specific categories and locations
- Reputation monitoring: Track reviews for your business or competitors
- Market analysis: Compare ratings and review counts across categories
- Sentiment analysis: Analyze customer feedback at scale
Setup
pip install requests beautifulsoup4 pandas
Business Search Scraper
import requests
from bs4 import BeautifulSoup
import json
import time
import re
class YelpScraper:
    """Scrape business listings from Yelp search result pages.

    Pages are requested directly, or through the ScraperAPI proxy endpoint
    when an API key is supplied to the constructor.
    """

    # Seconds to wait for a single HTTP response before giving up.
    REQUEST_TIMEOUT = 30

    def __init__(self, api_key=None):
        """Create the HTTP session.

        Args:
            api_key: Optional ScraperAPI key. When set, every request is
                routed through the proxy instead of hitting Yelp directly.
        """
        self.session = requests.Session()
        self.proxy_key = api_key
        self.base_url = "https://www.yelp.com"

    def fetch(self, url):
        """Issue a GET request for ``url`` and return the response object.

        Args:
            url: Absolute URL of the page to download.

        Returns:
            The response returned by ``self.session.get``.
        """
        if self.proxy_key:
            # Pass the target through ``params`` so it gets percent-encoded.
            # Interpolating it raw lets the target's own ``&key=value`` pairs
            # be read as parameters of the proxy request.
            return self.session.get(
                "http://api.scraperapi.com",
                params={"api_key": self.proxy_key, "url": url},
                timeout=self.REQUEST_TIMEOUT,
            )
        return self.session.get(
            url,
            headers={
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"
            },
            # Same bound as the proxy branch so a stalled socket cannot hang
            # the scraper forever.
            timeout=self.REQUEST_TIMEOUT,
        )

    def search_businesses(self, category, location, max_pages=3, delay=2):
        """Collect businesses from up to ``max_pages`` search result pages.

        Paging stops early on the first non-200 response or the first page
        that yields no businesses.

        Args:
            category: Search term placed in the ``find_desc`` parameter.
            location: Place name placed in the ``find_loc`` parameter.
            max_pages: Maximum number of result pages to request.
            delay: Seconds to pause between consecutive page requests.

        Returns:
            A list of business dicts as produced by ``parse_search``.
        """
        from urllib.parse import urlencode

        all_businesses = []
        for page in range(max_pages):
            if page:
                # Pause only between requests, never after the last one.
                time.sleep(delay)
            # urlencode escapes spaces, commas and ampersands in user input.
            query = urlencode({
                "find_desc": category,
                "find_loc": location,
                "start": page * 10,
            })
            response = self.fetch(f"{self.base_url}/search?{query}")
            if response.status_code != 200:
                break
            businesses = self.parse_search(response.text)
            if not businesses:
                break
            all_businesses.extend(businesses)
        return all_businesses

    def parse_search(self, html):
        """Extract ``LocalBusiness`` entries from JSON-LD script tags.

        Args:
            html: Raw HTML of a search result page.

        Returns:
            A list of dicts with the keys ``name``, ``rating``, ``reviews``,
            ``address`` and ``phone``. Script tags that are empty or contain
            invalid JSON are skipped.
        """
        soup = BeautifulSoup(html, "html.parser")
        businesses = []
        for script in soup.find_all("script", type="application/ld+json"):
            try:
                data = json.loads(script.string)
            except (json.JSONDecodeError, TypeError):
                # TypeError covers a tag with no string content (None).
                continue
            # JSON-LD may be a single object or an array of objects.
            items = data if isinstance(data, list) else [data]
            for item in items:
                if not isinstance(item, dict):
                    continue
                if item.get("@type") != "LocalBusiness":
                    continue
                # Nested fields can be null or a plain string; fall back to
                # an empty mapping instead of raising AttributeError.
                rating = item.get("aggregateRating")
                if not isinstance(rating, dict):
                    rating = {}
                address = item.get("address")
                if not isinstance(address, dict):
                    address = {}
                businesses.append({
                    "name": item.get("name", ""),
                    "rating": rating.get("ratingValue", 0),
                    "reviews": rating.get("reviewCount", 0),
                    "address": address.get("streetAddress", ""),
                    "phone": item.get("telephone", ""),
                })
        return businesses
Review Scraper
class ReviewScraper(YelpScraper):
    """Scrape review text and star ratings from a Yelp business page."""

    def get_reviews(self, business_url, max_pages=5, delay=2):
        """Collect reviews from up to ``max_pages`` pages of a business.

        Paging stops early on the first non-200 response or the first page
        that yields no reviews.

        Args:
            business_url: URL of the business page to read reviews from.
            max_pages: Maximum number of review pages to request.
            delay: Seconds to pause between consecutive page requests.

        Returns:
            A list of review dicts as produced by ``parse_reviews``.
        """
        # Extend an existing query string instead of adding a second "?".
        separator = "&" if "?" in business_url else "?"
        all_reviews = []
        for page in range(max_pages):
            if page:
                # Pause only between requests, never after the last one.
                time.sleep(delay)
            start = page * 10
            url = f"{business_url}{separator}start={start}&sort_by=date_desc"
            response = self.fetch(url)
            # Do not try to parse an error or block page as reviews.
            if response.status_code != 200:
                break
            reviews = self.parse_reviews(response.text)
            if not reviews:
                break
            all_reviews.extend(reviews)
        return all_reviews

    def parse_reviews(self, html):
        """Extract review text and rating from a business page.

        Args:
            html: Raw HTML of a business page.

        Returns:
            A list of dicts with the keys ``text`` and ``rating``. ``rating``
            is 0 when no number is found in the star element's aria-label.
            Blocks without a ``p[lang]`` element are skipped.
        """
        soup = BeautifulSoup(html, "html.parser")
        reviews = []
        for block in soup.select("[data-testid*=review]"):
            text_el = block.select_one("p[lang]")
            if not text_el:
                continue
            rating = 0
            rating_el = block.select_one("[aria-label*=star]")
            if rating_el:
                # Require at least one digit: a pattern like [0-9.]+ can match
                # a lone "." and make float() raise ValueError.
                match = re.search(r"\d+(?:\.\d+)?", rating_el.get("aria-label", ""))
                if match:
                    rating = float(match.group(0))
            reviews.append({
                "text": text_el.text.strip(),
                "rating": rating,
            })
        return reviews
Data Analysis
import pandas as pd
# Example run: search for restaurants, rank the results and export them.
scraper = YelpScraper()
businesses = scraper.search_businesses("restaurants", "San Francisco, CA")
df = pd.DataFrame(businesses)
print(f"Found {len(df)} businesses")
if df.empty:
    # An empty frame has no "rating"/"reviews" columns, so ranking it would
    # raise KeyError. This happens whenever the search returns nothing.
    print("No businesses were returned; nothing to rank or export.")
else:
    # The scraped values may be strings; nlargest needs numeric columns.
    for column in ("rating", "reviews"):
        df[column] = pd.to_numeric(df[column], errors="coerce")
    print("\nTop rated:")
    print(df.nlargest(5, "rating")[["name", "rating", "reviews"]])
    print("\nMost reviewed:")
    print(df.nlargest(5, "reviews")[["name", "rating", "reviews"]])
    df.to_csv("yelp_data.csv", index=False)
Handling Anti-Scraping
Yelp actively blocks bots. Solutions:
- ScraperAPI - handles proxy rotation and CAPTCHAs automatically
- ThorData - residential proxies for realistic traffic
- ScrapeOps - monitor scraper health and success rates
Ethical Guidelines
- Respect rate limits and robots.txt
- Consider the Yelp Fusion API for authorized access
- Do not scrape personal user data
- Use collected data responsibly
Conclusion
Yelp scraping enables powerful local business intelligence. Combine it with sentiment analysis for deeper insights into customer satisfaction trends.
Follow for more web scraping tutorials!
Top comments (0)