How to Scrape Yelp: Businesses, Reviews, and Ratings

#python #tutorial #webdev #programming

How to Scrape Yelp: Businesses, Reviews, and Ratings

Yelp data is invaluable for market research, lead generation, and competitive analysis. This guide shows you how to extract business listings, reviews, and ratings programmatically.

Use Cases

Lead generation: Find businesses in specific categories and locations
Reputation monitoring: Track reviews for your business or competitors
Market analysis: Compare ratings and review counts across categories
Sentiment analysis: Analyze customer feedback at scale

Setup

pip install requests beautifulsoup4 pandas

Business Search Scraper

import json
import re
import time
from urllib.parse import urlencode

import requests
from bs4 import BeautifulSoup

class YelpScraper:
    """Fetch Yelp search pages and extract business listings from them.

    Listings are read from the ``application/ld+json`` script tags embedded
    in the search-result HTML. Requests go either directly to Yelp or, when
    an ``api_key`` is supplied, through the ScraperAPI proxy endpoint.
    """

    def __init__(self, api_key=None):
        """Create a scraper.

        Args:
            api_key: Optional ScraperAPI key. When falsy, pages are fetched
                directly with a browser-like User-Agent header.
        """
        self.session = requests.Session()
        self.proxy_key = api_key
        self.base_url = "https://www.yelp.com"

    def fetch(self, url):
        """GET ``url`` and return the ``requests`` response object.

        Args:
            url: Absolute URL of the page to download.

        Returns:
            The response returned by ``self.session.get``; callers are
            expected to check ``status_code`` themselves.
        """
        if self.proxy_key:
            # Pass the target URL via ``params`` so it is percent-encoded.
            # Interpolating it raw would let its own "&find_loc=...&start=..."
            # be parsed as parameters of the proxy request instead.
            # NOTE(review): the key travels over plain HTTP here; switch to
            # https if the provider supports it -- confirm.
            return self.session.get(
                "http://api.scraperapi.com",
                params={"api_key": self.proxy_key, "url": url},
                timeout=30,
            )
        return self.session.get(
            url,
            headers={
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"
            },
            # Same timeout as the proxy path so a stalled connection cannot
            # hang the scraper forever.
            timeout=30,
        )

    def _search_url(self, category, location, start):
        """Build the search URL with properly encoded query parameters."""
        query = urlencode(
            {"find_desc": category, "find_loc": location, "start": start}
        )
        return f"{self.base_url}/search?{query}"

    def search_businesses(self, category, location, max_pages=3):
        """Collect business listings across up to ``max_pages`` result pages.

        Paging stops early on the first non-200 response or the first page
        that yields no listings.

        Args:
            category: Search term, sent as ``find_desc``.
            location: Location string, sent as ``find_loc``.
            max_pages: Maximum number of result pages to request.

        Returns:
            A list of dicts as produced by :meth:`parse_search`.
        """
        all_businesses = []
        for page in range(max_pages):
            if page:
                # Pause between consecutive requests only; there is nothing
                # to wait for after the final page.
                time.sleep(2)
            # The offset advances by 10 per page, as in the original URL scheme.
            url = self._search_url(category, location, page * 10)
            response = self.fetch(url)
            if response.status_code != 200:
                break
            businesses = self.parse_search(response.text)
            if not businesses:
                break
            all_businesses.extend(businesses)
        return all_businesses

    def parse_search(self, html):
        """Extract ``LocalBusiness`` entries from the JSON-LD in ``html``.

        Args:
            html: Raw HTML of a search-result page.

        Returns:
            A list of dicts with keys ``name``, ``rating``, ``reviews``,
            ``address`` and ``phone``. Script tags whose content is missing
            or is not valid JSON are skipped.
        """
        soup = BeautifulSoup(html, "html.parser")
        businesses = []
        for script in soup.find_all("script", type="application/ld+json"):
            try:
                # ``script.string`` is None for empty tags -> TypeError.
                data = json.loads(script.string)
            except (json.JSONDecodeError, TypeError):
                continue
            businesses.extend(self._businesses_from_jsonld(data))
        return businesses

    @staticmethod
    def _businesses_from_jsonld(data):
        """Turn one decoded JSON-LD payload into a list of business dicts.

        Accepts either a list of objects or a single object. Anything that
        is not a dict typed ``LocalBusiness`` is ignored, and missing or
        null nested objects fall back to the same defaults as missing keys.
        """
        items = data if isinstance(data, list) else [data]
        businesses = []
        for item in items:
            if not isinstance(item, dict):
                continue
            kind = item.get("@type")
            # JSON-LD permits "@type" to be a string or a list of strings.
            kinds = kind if isinstance(kind, list) else [kind]
            if "LocalBusiness" not in kinds:
                continue
            rating_info = item.get("aggregateRating")
            if not isinstance(rating_info, dict):
                rating_info = {}
            address_info = item.get("address")
            if not isinstance(address_info, dict):
                address_info = {}
            businesses.append({
                "name": item.get("name", ""),
                "rating": rating_info.get("ratingValue", 0),
                "reviews": rating_info.get("reviewCount", 0),
                "address": address_info.get("streetAddress", ""),
                "phone": item.get("telephone", ""),
            })
        return businesses

Review Scraper

class ReviewScraper(YelpScraper):
    """Scrape review text and star ratings from a Yelp business page."""

    def get_reviews(self, business_url, max_pages=5):
        """Collect reviews across up to ``max_pages`` pages, newest first.

        Paging stops early on the first non-200 response or the first page
        from which no reviews could be parsed.

        Args:
            business_url: URL of the business page; may already carry a
                query string.
            max_pages: Maximum number of review pages to request.

        Returns:
            A list of dicts as produced by :meth:`parse_reviews`.
        """
        all_reviews = []
        # Append to an existing query string rather than starting a second one.
        separator = "&" if "?" in business_url else "?"
        for page in range(max_pages):
            if page:
                # Pause between consecutive requests only.
                time.sleep(2)
            start = page * 10
            url = f"{business_url}{separator}start={start}&sort_by=date_desc"
            response = self.fetch(url)
            # Same guard as search_businesses: do not parse error or block
            # pages as if they were review listings.
            if response.status_code != 200:
                break
            reviews = self.parse_reviews(response.text)
            if not reviews:
                break
            all_reviews.extend(reviews)
        return all_reviews

    def parse_reviews(self, html):
        """Extract reviews from the HTML of a business page.

        Args:
            html: Raw HTML of one review page.

        Returns:
            A list of ``{"text": str, "rating": number}`` dicts. ``rating``
            is 0 when no star label with a number is found in the block.
            Blocks without a ``p[lang]`` element are skipped.
        """
        soup = BeautifulSoup(html, "html.parser")
        reviews = []
        for block in soup.select("[data-testid*=review]"):
            text_el = block.select_one("p[lang]")
            if not text_el:
                continue
            rating = 0
            rating_el = block.select_one("[aria-label*=star]")
            if rating_el:
                # Require at least one digit: the looser "[0-9.]+" could
                # match a lone "." and make float() raise ValueError.
                match = re.search(
                    r"(\d+(?:\.\d+)?)", rating_el.get("aria-label", "")
                )
                if match:
                    rating = float(match.group(1))
            reviews.append({
                "text": text_el.text.strip(),
                "rating": rating,
            })
        return reviews

Data Analysis

import pandas as pd

# Scrape restaurant listings for San Francisco and summarise them.
scraper = YelpScraper()
businesses = scraper.search_businesses("restaurants", "San Francisco, CA")
df = pd.DataFrame(businesses)

print(f"Found {len(df)} businesses")
if df.empty:
    # An empty result produces a DataFrame with no columns, so the
    # nlargest() calls below would raise KeyError. Report and stop instead.
    print("No listings were parsed; the request may have been blocked.")
else:
    # JSON-LD values may arrive as strings; nlargest() needs numeric dtypes.
    for column in ("rating", "reviews"):
        df[column] = pd.to_numeric(df[column], errors="coerce")

    print("\nTop rated:")
    print(df.nlargest(5, "rating")[["name", "rating", "reviews"]])
    print("\nMost reviewed:")
    print(df.nlargest(5, "reviews")[["name", "rating", "reviews"]])

    df.to_csv("yelp_data.csv", index=False)

Handling Anti-Scraping

Yelp actively blocks bots, so plain requests are often rejected or served a CAPTCHA. Services that can help:

ScraperAPI - handles proxy rotation and CAPTCHAs automatically
ThorData - residential proxies for realistic traffic
ScrapeOps - monitor scraper health and success rates

Ethical Guidelines

Respect rate limits and robots.txt
Consider the Yelp Fusion API for authorized access
Do not scrape personal user data
Use collected data responsibly

Conclusion

Yelp scraping enables powerful local business intelligence. Combine it with sentiment analysis for deeper insights into customer satisfaction trends.

Follow for more web scraping tutorials!

DEV Community

How to Scrape Yelp: Businesses, Reviews, and Ratings

How to Scrape Yelp: Businesses, Reviews, and Ratings

Use Cases

Setup

Business Search Scraper

Review Scraper

Data Analysis

Handling Anti-Scraping

Ethical Guidelines

Conclusion

Top comments (0)