DEV Community

agenthustler
agenthustler

Posted on

Zillow Real Estate Scraping: Property Data, Prices, and Estimates with Python

Zillow is the largest real estate marketplace in the US, with data on over 100 million properties. Extracting property data, Zestimates, and market trends enables powerful real estate analysis. Here's how to do it with Python in 2026.

What Data Can You Extract?

  • Property addresses, prices, and Zestimates
  • Square footage, bedrooms, bathrooms
  • Listing status (for sale, sold, pending)
  • Price history and tax records
  • Neighborhood statistics
  • Photos and virtual tour links

Using Zillow's Hidden API

Zillow loads data through internal API endpoints that return JSON. This is more reliable than parsing HTML:

import requests
import json

def search_zillow_api(location, page=1, map_bounds=None):
    """Search Zillow using their internal search API.

    Args:
        location: Human-readable location, e.g. "San Francisco CA".
            Only used to build a plausible Referer header.
        page: 1-based results page number.
        map_bounds: Optional dict with "west"/"east"/"south"/"north"
            keys bounding the search area. Defaults to the original
            hard-coded San Francisco bounds for backward compatibility.

    Returns:
        Parsed JSON dict on HTTP 200, otherwise None (an error is
        printed to stdout).
    """
    url = "https://www.zillow.com/search/GetSearchPageState.htm"

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36",
        "Accept": "application/json",
        "Referer": f"https://www.zillow.com/{location.lower().replace(' ', '-')}/",
    }

    # Previously hard-coded to San Francisco regardless of `location`;
    # now overridable via the new keyword argument.
    if map_bounds is None:
        map_bounds = {
            "west": -122.5,
            "east": -122.3,
            "south": 37.7,
            "north": 37.85,
        }

    # Search parameters mirroring what the Zillow web UI sends.
    search_query = {
        "pagination": {"currentPage": page},
        "mapBounds": map_bounds,
        "filterState": {
            "isForSaleByAgent": {"value": True},
            "isForSaleByOwner": {"value": True},
            "isNewConstruction": {"value": False},
            "isForSaleForeclosure": {"value": False},
            "isComingSoon": {"value": False},
            "isAuction": {"value": False},
        },
    }

    params = {
        "searchQueryState": json.dumps(search_query),
        "wants": json.dumps({"cat1": ["listResults", "mapResults"]}),
        "requestId": 1,
    }

    try:
        # Bug fix: requests.get without a timeout can hang indefinitely
        # on a stalled connection.
        response = requests.get(url, headers=headers, params=params, timeout=15)
    except requests.RequestException as exc:
        print(f"Error: request failed ({exc})")
        return None

    if response.status_code == 200:
        return response.json()

    print(f"Error: {response.status_code}")
    return None
Enter fullscreen mode Exit fullscreen mode

Parsing Search Results

def parse_search_results(api_response):
    """Turn a raw Zillow search API response into a list of flat dicts.

    Args:
        api_response: Decoded JSON from the search endpoint, or None.

    Returns:
        A list of property dicts (possibly empty); each includes a
        computed "price_per_sqft" when price and area are available.
    """
    if not api_response:
        return []

    listings = (
        api_response.get("cat1", {})
                    .get("searchResults", {})
                    .get("listResults", [])
    )

    parsed = []
    for item in listings:
        coords = item.get("latLong", {})
        record = {
            "zpid": item.get("zpid"),
            "address": item.get("address"),
            "price": item.get("price"),
            "unformatted_price": item.get("unformattedPrice"),
            "beds": item.get("beds"),
            "baths": item.get("baths"),
            "area_sqft": item.get("area"),
            "latitude": coords.get("latitude"),
            "longitude": coords.get("longitude"),
            "status": item.get("statusText"),
            "listing_url": item.get("detailUrl"),
            "broker": item.get("brokerName"),
            "zestimate": item.get("zestimate"),
            "price_per_sqft": None,
        }

        # Derive $/sqft only when both numerator and denominator exist;
        # tolerate bad data (non-numeric or zero area).
        price, sqft = record["unformatted_price"], record["area_sqft"]
        if price and sqft:
            try:
                record["price_per_sqft"] = round(price / sqft, 2)
            except (TypeError, ZeroDivisionError):
                pass

        parsed.append(record)

    return parsed
Enter fullscreen mode Exit fullscreen mode

Scraping Individual Property Pages

from playwright.sync_api import sync_playwright
import re

def scrape_property_details(zpid):
    """Scrape detailed property information from Zillow.

    Launches headless Chromium via Playwright, loads the property
    detail page for *zpid*, and pulls data first from the embedded
    Next.js JSON payload, then from visible page elements as a
    fallback.

    Args:
        zpid: Zillow property ID, interpolated into the detail-page URL.

    Returns:
        dict that may contain: "raw_data", "price", "beds", "baths",
        "sqft", "zestimate", "price_history". Values are None (or the
        key is absent) when the corresponding element is not found.
    """
    # NOTE(review): this snippet uses `json` but only imports playwright
    # and re — confirm a module-level `import json` is in scope.
    url = f"https://www.zillow.com/homedetails/{zpid}_zpid/"

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        context = browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                       "AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36"
        )
        page = context.new_page()

        # Wait until the network goes idle so client-rendered data exists.
        page.goto(url, wait_until="networkidle")

        details = {}

        # Extract from Next.js data: the page embeds its props as JSON
        # inside <script type="application/json"> tags; take the first
        # one exposing props.pageProps.componentProps.
        scripts = page.query_selector_all('script[type="application/json"]')
        for script in scripts:
            try:
                data = json.loads(script.inner_text())
                if isinstance(data, dict) and "props" in data:
                    page_props = data["props"].get("pageProps", {})
                    if "componentProps" in page_props:
                        details["raw_data"] = page_props["componentProps"]
                        break
            except json.JSONDecodeError:
                continue

        # Fallback: extract from visible elements
        price_el = page.query_selector('span[data-testid="price"]')
        details['price'] = price_el.inner_text() if price_el else None

        # Property facts — classify each fact span by keyword match.
        facts = page.query_selector_all('span[data-testid="bed-bath-beyond"]')
        for fact in facts:
            text = fact.inner_text()
            if "bed" in text.lower():
                details['beds'] = text
            elif "bath" in text.lower():
                details['baths'] = text
            elif "sqft" in text.lower():
                details['sqft'] = text

        # Zestimate (Zillow's automated value estimate), if displayed.
        zestimate_el = page.query_selector('div[data-testid="zestimate-text"]')
        details['zestimate'] = zestimate_el.inner_text() if zestimate_el else None

        # Price history (delegates to the sibling helper).
        details['price_history'] = extract_price_history(page)

        browser.close()
        return details

def extract_price_history(page):
    """Extract price history rows from a Zillow listing page.

    Args:
        page: Playwright Page already navigated to a property detail page.

    Returns:
        List of {"date", "event", "price"} dicts in page order; empty
        when no price-history table is found.
    """
    # Idiom fix: import hoisted from mid-function to the top of the
    # function (kept local so the snippet stays self-contained).
    import time

    history = []

    # Expand the collapsed history section if the toggle is present.
    expand_btn = page.query_selector('button:has-text("See complete price history")')
    if expand_btn:
        expand_btn.click()
        time.sleep(2)  # crude wait for the expanded rows to render

    rows = page.query_selector_all('table[class*="price-history"] tr')
    for row in rows[1:]:  # Skip header
        cells = row.query_selector_all('td')
        if len(cells) >= 3:
            history.append({
                "date": cells[0].inner_text(),
                "event": cells[1].inner_text(),
                "price": cells[2].inner_text(),
            })

    return history
Enter fullscreen mode Exit fullscreen mode

Market Analysis

import statistics

def analyze_market(properties):
    """Analyze a set of properties for market insights.

    Args:
        properties: List of dicts as produced by parse_search_results.
            Reads the "unformatted_price", "price_per_sqft", and "beds"
            keys; falsy/missing values are excluded from statistics.

    Returns:
        dict with overall price stats, price-per-sqft stats, and a
        per-bedroom-count breakdown. Individual stats are None when
        there is not enough data (e.g. stdev needs >= 2 prices).
    """
    # Idiom fix: import hoisted from the middle of the function body.
    from collections import defaultdict

    prices = [p["unformatted_price"] for p in properties if p["unformatted_price"]]
    sqft_prices = [p["price_per_sqft"] for p in properties if p["price_per_sqft"]]

    analysis = {
        "total_listings": len(properties),
        "price_stats": {
            "median": statistics.median(prices) if prices else None,
            "mean": round(statistics.mean(prices)) if prices else None,
            "min": min(prices) if prices else None,
            "max": max(prices) if prices else None,
            "stdev": round(statistics.stdev(prices)) if len(prices) > 1 else None,
        },
        "price_per_sqft_stats": {
            "median": round(statistics.median(sqft_prices)) if sqft_prices else None,
            "mean": round(statistics.mean(sqft_prices)) if sqft_prices else None,
        },
        "by_bedrooms": {},
    }

    # Group listing prices by bedroom count; listings without a bed
    # count or price are skipped.
    by_beds = defaultdict(list)
    for p in properties:
        beds = p.get("beds")
        if beds and p.get("unformatted_price"):
            by_beds[beds].append(p["unformatted_price"])

    for beds, bed_prices in sorted(by_beds.items()):
        analysis["by_bedrooms"][f"{beds}_bed"] = {
            "count": len(bed_prices),
            "median_price": statistics.median(bed_prices),
        }

    return analysis
Enter fullscreen mode Exit fullscreen mode

Data Export

import csv

def export_properties(properties, filename="zillow_data.csv"):
    """Write a list of property dicts to a CSV file.

    The keys of the first dict become the CSV header, so all dicts are
    expected to share the same keys. Returns early (writing nothing)
    when the list is empty.

    Args:
        properties: List of uniform property dicts.
        filename: Output CSV path.
    """
    if not properties:
        return

    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=properties[0].keys())
        writer.writeheader()
        writer.writerows(properties)

    # Bug fix: the original printed a garbled placeholder instead of
    # the actual output filename.
    print(f"Exported {len(properties)} properties to {filename}")

def export_analysis(analysis, filename="market_analysis.json"):
    """Serialize the market analysis dict to a JSON file.

    Args:
        analysis: JSON-serializable dict (as built by analyze_market).
        filename: Output JSON path.
    """
    # Robustness: explicit encoding instead of the platform default.
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(analysis, f, indent=2)
    # Bug fix: the original printed a garbled placeholder instead of
    # the actual output filename.
    print(f"Analysis saved to {filename}")
Enter fullscreen mode Exit fullscreen mode

Running the Pipeline

if __name__ == "__main__":
    import time

    # Search for properties
    print("Searching Zillow...")
    api_data = search_zillow_api("San Francisco CA")
    properties = parse_search_results(api_data)
    print(f"Found {len(properties)} properties")

    # Export raw data
    export_properties(properties)

    # Market analysis
    analysis = analyze_market(properties)
    export_analysis(analysis)

    print(f"\nMarket Summary:")
    print(f"  Median Price: ${analysis['price_stats']['median']:,.0f}")
    print(f"  Median $/sqft: ${analysis['price_per_sqft_stats']['median']:,}")
    print(f"  Listings: {analysis['total_listings']}")
Enter fullscreen mode Exit fullscreen mode

Proxy Recommendations

Zillow has aggressive anti-scraping measures. For reliable data collection, use ScrapeOps which provides proxy rotation and request management optimized for real estate sites.

Legal Disclaimer

Zillow's Terms of Service prohibit automated data collection. This guide is for educational purposes. For production use, consider Zillow's official API or licensed data partnerships.

Conclusion

Zillow real estate scraping enables powerful property analysis and market intelligence. Start with the hidden API endpoints for search results, then use browser automation for individual property details. Always use proper proxy rotation and respect rate limits.

Top comments (0)