DEV Community

Vhub Systems
Vhub Systems

Posted on

How to Scrape Real Estate Listings: Zillow, Redfin, Realtor.com in 2026

Real estate data is worth billions — but most of it is siloed behind platforms charging $500-5,000/month for API access. Zillow, Redfin, Realtor.com, and Trulia all have the same listing data. Here's how to extract it.

What's available

Real estate platforms expose:

  • Property listings: price, beds/baths, square footage, lot size
  • Property details: year built, HOA fees, tax history, price history
  • Photos and virtual tours
  • Agent/broker contact info
  • Days on market, listing status
  • Neighborhood data: schools, walkability, commute times
  • Recent sold comps

Method 1: Zillow (most data, aggressive anti-bot)

Zillow has the most comprehensive US property data but blocks scrapers aggressively:

from playwright.async_api import async_playwright
import asyncio
import json

async def scrape_zillow_listing(url: str) -> dict:
    """Scrape a single Zillow listing page with headless Chromium.

    Args:
        url: Full Zillow /homedetails/ listing URL.

    Returns:
        dict with two keys:
          "structured" — first JSON-LD object on the page whose @type is
          'SingleFamilyResidence' or 'Residence', else None.
          "raw" — Zillow's Next.js ``gdpClientCache`` payload from the
          __NEXT_DATA__ script tag, else None.

    NOTE(review): hits the live site; Zillow blocks automation aggressively,
    so runs without proxies may be served a CAPTCHA page instead of data.
    """
    async with async_playwright() as p:
        # Launch flag hides the navigator.webdriver automation signal.
        browser = await p.chromium.launch(
            headless=True,
            args=["--disable-blink-features=AutomationControlled"]
        )
        # Realistic UA string + viewport reduce the obvious-headless fingerprint.
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/122.0.0.0 Safari/537.36",
            viewport={"width": 1440, "height": 900}
        )
        page = await context.new_page()

        # Abort image/font requests (png/jpg/.../woff2). This saves bandwidth
        # and speeds up page load; despite the original intent, it does NOT
        # block tracking scripts.
        await page.route("**/*.{png,jpg,jpeg,gif,svg,woff,woff2}", lambda r: r.abort())
        await page.goto(url, wait_until="domcontentloaded")

        # Zillow embeds JSON-LD structured data in <script type="application/ld+json">;
        # scan them all and return the first residence-typed object.
        data = await page.evaluate("""
            () => {
                const scripts = document.querySelectorAll('script[type="application/ld+json"]');
                for (const s of scripts) {
                    try {
                        const d = JSON.parse(s.innerText);
                        if (d['@type'] === 'SingleFamilyResidence' || d['@type'] === 'Residence') {
                            return d;
                        }
                    } catch(e) {}
                }
                return null;
            }
        """)

        # Also extract the Next.js page state; optional chaining yields null
        # when the gdpClientCache key is absent from the props tree.
        next_data = await page.evaluate("""
            () => {
                const el = document.getElementById('__NEXT_DATA__');
                if (el) {
                    try {
                        const d = JSON.parse(el.innerText);
                        return d.props?.pageProps?.gdpClientCache || null;
                    } catch(e) {}
                }
                return null;
            }
        """)

        await browser.close()
        return {"structured": data, "raw": next_data}

# Test on a listing.
# NOTE(review): this runs at import time and performs a live network call;
# wrap it in `if __name__ == "__main__":` before reusing this snippet as a module.
listing = asyncio.run(scrape_zillow_listing(
    "https://www.zillow.com/homedetails/123-main-st-austin-tx-78701/12345678_zpid/"
))
Enter fullscreen mode Exit fullscreen mode

Method 2: Redfin (more scraper-friendly)

Redfin has an unofficial API that returns JSON:

import requests

def search_redfin_listings(city: str, state: str = "CA") -> list:
    """Look up Redfin region candidates for a city via the unofficial
    location-autocomplete endpoint.

    Args:
        city: City name, e.g. "Austin".
        state: Two-letter state code appended to the query (default "CA").

    Returns:
        A list of region-row dicts parsed from the response, or an empty
        list when the request fails or the payload cannot be parsed.

    NOTE: unofficial endpoint that changes frequently — expect breakage;
    use a maintained scraper for production reliability.
    """
    import json

    search_url = "https://www.redfin.com/stingray/do/location-autocomplete"
    params = {"location": f"{city}, {state}", "start": 0, "count": 10, "v": 2}
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/122.0.0.0"}

    try:
        r = requests.get(search_url, params=params, headers=headers, timeout=15)
        r.raise_for_status()
    except requests.RequestException:
        return []

    # Redfin's stingray endpoints prefix their JSON with "{}&&" (an
    # anti-JSON-hijacking guard); strip it before parsing.
    body = r.text
    if body.startswith("{}&&"):
        body = body[4:]
    try:
        payload = json.loads(body)
    except json.JSONDecodeError:
        return []

    # Assumed shape: payload["payload"]["sections"][i]["rows"] holds the
    # region candidates — TODO confirm against a live response.
    sections = payload.get("payload", {}).get("sections", [])
    rows: list = []
    for section in sections:
        rows.extend(section.get("rows", []))
    return rows

def get_redfin_listing_details(listing_url: str) -> dict:
    """Extract headline stats (price/beds/baths/sqft) from a Redfin
    listing page's HTML.

    Args:
        listing_url: Full Redfin listing URL.

    Returns:
        A dict with keys "price", "beds", "baths", "sqft" (values are
        stripped display strings, or None when the element is missing),
        or an empty dict when the request fails or returns non-200.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/122.0.0.0"}

    try:
        # Timeout prevents a hung connection from stalling the caller forever.
        response = requests.get(listing_url, headers=headers, timeout=15)
    except requests.RequestException:
        return {}

    if response.status_code != 200:
        return {}

    from bs4 import BeautifulSoup
    soup = BeautifulSoup(response.text, "html.parser")

    def _stat(selector: str):
        # Returns the element's whitespace-stripped text, or None if absent.
        el = soup.select_one(selector)
        return el.get_text(strip=True) if el else None

    # Redfin marks its "above the page" stats with data-rf-test-id hooks.
    return {
        "price": _stat("[data-rf-test-id='abp-price']"),
        "beds": _stat("[data-rf-test-id='abp-beds']"),
        "baths": _stat("[data-rf-test-id='abp-baths']"),
        "sqft": _stat("[data-rf-test-id='abp-sqFt']"),
    }
Enter fullscreen mode Exit fullscreen mode

Method 3: Realtor.com (structured data available)

Realtor.com includes JSON-LD structured data on listing pages:

import requests
from bs4 import BeautifulSoup
import json

def scrape_realtor_listing(url: str) -> dict:
    """Extract listing fields from a Realtor.com page's JSON-LD blocks.

    Args:
        url: Full Realtor.com listing URL.

    Returns:
        A dict with name/price/address/city/state/beds/description for the
        first JSON-LD object typed SingleFamilyResidence or
        RealEstateListing, or an empty dict when the request fails or no
        matching block is found.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) Chrome/122.0.0.0"}

    try:
        # Timeout so a stalled connection can't hang the caller.
        r = requests.get(url, headers=headers, timeout=15)
    except requests.RequestException:
        return {}
    if r.status_code != 200:
        return {}

    soup = BeautifulSoup(r.text, "html.parser")

    # Walk every JSON-LD script; skip empty/malformed blocks instead of crashing.
    for script in soup.find_all("script", type="application/ld+json"):
        raw = script.string
        if not raw:
            # script.string is None for empty/nested tags — json.loads(None)
            # would raise an uncaught TypeError in the original version.
            continue
        try:
            data = json.loads(raw)
        except json.JSONDecodeError:
            continue
        if not isinstance(data, dict):
            continue
        if data.get("@type") in ("SingleFamilyResidence", "RealEstateListing"):
            # `or {}` / `or ""` guard against keys present but explicitly null,
            # which .get(key, default) does NOT cover.
            offers = data.get("offers") or {}
            address = data.get("address") or {}
            return {
                "name": data.get("name"),
                "price": offers.get("price"),
                "address": address.get("streetAddress"),
                "city": address.get("addressLocality"),
                "state": address.get("addressRegion"),
                "beds": data.get("numberOfRooms"),
                "description": (data.get("description") or "")[:300],
            }

    return {}
Enter fullscreen mode Exit fullscreen mode

Method 4: Pre-built real estate tracker

The Real Estate Tracker on Apify handles Zillow, Redfin, and Realtor.com with proxy rotation and anti-bot.

Input: city/neighborhood, property type, price range filters
Output: standardized property listings with price history

48+ production runs. Pay-per-result pricing.

Building a price tracker

Track price changes on properties you're monitoring:

import sqlite3
from datetime import datetime

def init_db(db_path: str = "real_estate.db") -> sqlite3.Connection:
    """Open (creating if needed) the tracking database and ensure the
    listings table exists.

    The table deliberately has no primary key: each scrape appends a new
    snapshot row per property, which is how price history is kept.

    Args:
        db_path: SQLite file path; defaults to the original hard-coded
            "real_estate.db". Pass ":memory:" for an ephemeral database.

    Returns:
        An open sqlite3.Connection.
    """
    conn = sqlite3.connect(db_path)
    conn.execute("""
        CREATE TABLE IF NOT EXISTS listings (
            zpid TEXT,
            address TEXT,
            price REAL,
            beds INTEGER,
            baths REAL,
            sqft INTEGER,
            status TEXT,
            scraped_at TIMESTAMP
        )
    """)
    conn.commit()
    return conn

def track_listing(conn, listing_data: dict) -> None:
    """Append a snapshot row for a listing.

    History is preserved by always inserting — never updating — so price
    changes over time remain queryable.

    Args:
        conn: Open sqlite3 connection with the ``listings`` table created.
        listing_data: Dict with zpid/address/price/beds/baths/sqft/status
            keys; missing keys are stored as NULL.
    """
    # Name the columns explicitly so the insert no longer silently depends
    # on the table's column order (the original used bare VALUES (?,...)).
    conn.execute(
        """
        INSERT INTO listings
            (zpid, address, price, beds, baths, sqft, status, scraped_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        """,
        (
            listing_data.get("zpid"),
            listing_data.get("address"),
            listing_data.get("price"),
            listing_data.get("beds"),
            listing_data.get("baths"),
            listing_data.get("sqft"),
            listing_data.get("status"),
            # NOTE: naive local time, matching the original; switch to
            # datetime.now(timezone.utc) if cross-timezone runs matter.
            datetime.now().isoformat()
        )
    )
    conn.commit()

def get_price_history(conn, zpid: str) -> list:
    """Return every recorded (price, status, scraped_at) snapshot for the
    given Zillow property id, ordered oldest first."""
    query = (
        "SELECT price, status, scraped_at FROM listings "
        "WHERE zpid = ? ORDER BY scraped_at"
    )
    return [row for row in conn.execute(query, (zpid,))]
Enter fullscreen mode Exit fullscreen mode

Use cases

  • Investment analysis: Monitor properties matching your criteria for price drops
  • Market research: Track median prices and days-on-market by neighborhood
  • Lead generation: Agent contact info from listings in specific price ranges
  • Price prediction: Build ML models on historical price change data
  • Rental arbitrage: Compare purchase price vs estimated rental income

n8n AI Automation Pack ($39) — 5 production-ready workflows

Skip the setup

Apify Scrapers Bundle — $29 one-time

Includes the Real Estate Tracker and 34+ other production scrapers. Instant download.

Top comments (0)