DEV Community

agenthustler
agenthustler

Posted on

How to Build a Real Estate Comps Engine with Python

Introduction

Real estate comparable sales (comps) are the backbone of property valuation. Professional appraisers and investors use comps to determine fair market value, but accessing this data typically requires expensive MLS subscriptions. In this tutorial, we'll build a Python-based comps engine that collects and analyzes property data from public sources.

Setup

import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import json
import time
from datetime import datetime
from math import radians, cos, sin, asin, sqrt

# Handle real estate site anti-bot protection
# Get your API key: https://www.scraperapi.com?fp_ref=the52
SCRAPER_API_KEY = "your_key_here"
BASE_URL = "http://api.scraperapi.com"
Enter fullscreen mode Exit fullscreen mode

Scraping Property Listings

Public county assessor records and listing sites contain property data:

def scrape_property_listings(zipcode, property_type="single_family"):
    """Scrape recently-sold property listings for a ZIP code.

    Args:
        zipcode: ZIP code string to search.
        property_type: Property category; one of the keys of the
            type-slug map below. Defaults to "single_family".
            Unknown values fall back to the single-family search,
            preserving the previous behavior.

    Returns:
        List of dicts with keys price, address, beds, baths, sqft,
        zipcode, and scraped_at (ISO-8601 timestamp). Cards missing
        either a price or an address are skipped.
    """
    # BUG FIX: property_type was previously ignored — the URL always
    # searched single-family homes regardless of the argument. Map the
    # parameter to the site's URL slug instead.
    type_slugs = {
        "single_family": "single-family-home",
        "condo": "condo",
        "townhome": "townhome",
        "multi_family": "multi-family-home",
    }
    slug = type_slugs.get(property_type, "single-family-home")
    url = (
        f"https://www.realtor.com/realestateandhomes-search/{zipcode}"
        f"/type-{slug}/show-recently-sold"
    )

    params = {
        "api_key": SCRAPER_API_KEY,
        "url": url,
        "render": "true"
    }

    # BUG FIX: no timeout means a stalled proxy connection hangs the
    # pipeline forever. Rendered pages are slow, so allow a generous
    # window.
    response = requests.get(BASE_URL, params=params, timeout=60)
    soup = BeautifulSoup(response.text, "html.parser")

    properties = []
    for card in soup.select(".property-card"):
        price = card.select_one(".price")
        address = card.select_one(".address")
        beds = card.select_one(".beds")
        baths = card.select_one(".baths")
        sqft = card.select_one(".sqft")

        # Price and address are the minimum viable record; skip cards
        # missing either.
        if price and address:
            properties.append({
                "price": parse_price(price.text),
                "address": address.text.strip(),
                "beds": parse_number(beds),
                "baths": parse_number(baths),
                "sqft": parse_number(sqft),
                "zipcode": zipcode,
                "scraped_at": datetime.now().isoformat()
            })

    return properties

def parse_price(text):
    """Convert a display price such as "$1,250,000" to an int.

    Returns None when the remaining text is not a valid integer
    (e.g. "Contact agent" or an empty string).
    """
    # Strip currency formatting in a single pass, then trim whitespace.
    stripped = text.translate(str.maketrans("", "", "$,")).strip()
    try:
        return int(stripped)
    except ValueError:
        return None

def parse_number(element):
    """Pull a float out of a soup element's text.

    Keeps only digits and decimal points (so "1,800 sqft" -> 1800.0,
    "2.5 baths" -> 2.5). Returns None for a missing element or when
    no parsable number remains.
    """
    if not element:
        return None
    kept = "".join(ch for ch in element.text.strip() if ch.isdigit() or ch == ".")
    try:
        return float(kept)
    except ValueError:
        return None
Enter fullscreen mode Exit fullscreen mode

County Assessor Data

def scrape_assessor_data(county, parcel_id):
    """Fetch parcel details from a supported county assessor portal.

    Args:
        county: County name — "los_angeles", "cook", or "harris"
            (case-insensitive).
        parcel_id: County parcel / PIN / account identifier, inserted
            verbatim into the portal URL.

    Returns:
        Dict with parcel_id, county, assessed_value, year_built,
        lot_size, and zoning (individual values may be None when the
        field is not found on the page), or None for an unsupported
        county.
    """
    county_urls = {
        "los_angeles": f"https://portal.assessor.lacounty.gov/parceldetail/{parcel_id}",
        "cook": f"https://www.cookcountyassessor.com/pin/{parcel_id}",
        "harris": f"https://public.hcad.org/records/details.asp?cession=1&search=acct&acct={parcel_id}"
    }

    url = county_urls.get(county.lower())
    if not url:
        return None

    params = {"api_key": SCRAPER_API_KEY, "url": url, "render": "true"}
    # BUG FIX: the original call had no timeout, so a hung government
    # site (or proxy) would block the caller indefinitely.
    response = requests.get(BASE_URL, params=params, timeout=60)
    soup = BeautifulSoup(response.text, "html.parser")

    # Field values are located by keyword match against table rows;
    # see extract_field for the matching rules.
    return {
        "parcel_id": parcel_id,
        "county": county,
        "assessed_value": extract_field(soup, "assessed"),
        "year_built": extract_field(soup, "year"),
        "lot_size": extract_field(soup, "lot"),
        "zoning": extract_field(soup, "zone")
    }

def extract_field(soup, keyword):
    """Return the value text of the first assessor-page row mentioning *keyword*.

    Rows are matched case-insensitively across common layouts
    (table rows plus .detail-row / .info-row divs); the value is taken
    from the row's last cell, .value element, or first span.
    Returns None when no matching row yields a value.
    """
    needle = keyword.lower()
    for row in soup.select("tr, .detail-row, .info-row"):
        if needle not in row.text.lower():
            continue
        cell = row.select_one("td:last-child, .value, span")
        if cell:
            return cell.text.strip()
    return None
Enter fullscreen mode Exit fullscreen mode

Comparable Sales Algorithm

The core of the comps engine — finding and scoring similar properties:

def haversine_distance(lat1, lon1, lat2, lon2):
    """Great-circle distance in miles between two lat/lon points (haversine)."""
    earth_radius_miles = 3956
    phi1, lam1, phi2, lam2 = (radians(v) for v in (lat1, lon1, lat2, lon2))
    # Haversine of the central angle between the two points.
    half_chord = (
        sin((phi2 - phi1) / 2) ** 2
        + cos(phi1) * cos(phi2) * sin((lam2 - lam1) / 2) ** 2
    )
    return 2 * earth_radius_miles * asin(sqrt(half_chord))

def find_comparable_sales(subject, candidates, max_comps=5):
    """Rank candidate sales by similarity to the subject property.

    Candidates scoring zero are discarded; the rest are returned as
    {"property", "score"} dicts, best first, capped at max_comps.
    """
    ranked = [
        {"property": candidate, "score": similarity}
        for candidate in candidates
        if (similarity := calculate_comp_score(subject, candidate)) > 0
    ]
    best_first = sorted(ranked, key=lambda entry: entry["score"], reverse=True)
    return best_first[:max_comps]

def calculate_comp_score(subject, comp):
    """Score how comparable two properties are, from 0 to 100.

    Starts at a perfect 100 and accumulates capped penalties:
    square footage up to 30, bedrooms up to 15, bathrooms up to 10,
    and price-per-sqft up to 25. A factor missing (or zero) on either
    side is skipped entirely.
    """
    penalty = 0

    sub_sqft, cmp_sqft = subject.get("sqft"), comp.get("sqft")
    if sub_sqft and cmp_sqft:
        # Relative size difference, as a percentage, capped at 30.
        penalty += min(30, abs(sub_sqft - cmp_sqft) / sub_sqft * 100)

    sub_beds, cmp_beds = subject.get("beds"), comp.get("beds")
    if sub_beds and cmp_beds:
        # 7.5 points per bedroom of difference, capped at 15.
        penalty += min(15, abs(sub_beds - cmp_beds) * 7.5)

    sub_baths, cmp_baths = subject.get("baths"), comp.get("baths")
    if sub_baths and cmp_baths:
        # 5 points per bathroom of difference, capped at 10.
        penalty += min(10, abs(sub_baths - cmp_baths) * 5)

    sub_price, cmp_price = subject.get("price"), comp.get("price")
    if sub_price and sub_sqft and cmp_price and cmp_sqft:
        # Relative price-per-sqft difference, capped at 25.
        sub_ppsf = sub_price / sub_sqft
        cmp_ppsf = cmp_price / cmp_sqft
        penalty += min(25, abs(sub_ppsf - cmp_ppsf) / sub_ppsf * 100)

    return max(0, 100 - penalty)
Enter fullscreen mode Exit fullscreen mode

Valuation Estimate

def estimate_value(subject, comps):
    """Produce a score-weighted valuation from comparable sales.

    Each usable comp contributes its price-per-sqft applied to the
    subject's square footage, weighted by its similarity score.
    Comps lacking price or sqft (or a subject lacking sqft) are
    skipped for the estimate but still count toward comps_used and
    the confidence denominator. Returns None when no usable comps
    exist.
    """
    if not comps:
        return None

    subject_sqft = subject.get("sqft")
    weighted_sum = 0
    total_weight = 0

    for entry in comps:
        prop = entry["property"]
        price, sqft = prop.get("price"), prop.get("sqft")
        if price and sqft and subject_sqft:
            weighted_sum += price / sqft * subject_sqft * entry["score"]
            total_weight += entry["score"]

    if total_weight <= 0:
        return None

    # At least one comp contributed above, so this list is non-empty.
    priced = [e["property"]["price"] for e in comps if e["property"].get("price")]
    return {
        "estimated_value": round(weighted_sum / total_weight),
        # Average score across all comps, expressed as a percentage.
        "confidence": round(total_weight / (len(comps) * 100) * 100, 1),
        "comps_used": len(comps),
        "price_range": {
            "low": min(priced),
            "high": max(priced)
        }
    }
Enter fullscreen mode Exit fullscreen mode

Running the Comps Engine

def run_comps_analysis(zipcode, subject_property):
    """End-to-end pipeline: scrape sales, pick comps, estimate value.

    Prints progress and (when a valuation is produced) a summary to
    stdout. Returns {"comps": <scored comparables>, "valuation":
    <estimate dict or None>}.
    """
    print(f"Collecting recent sales in {zipcode}...")
    recent_sales = scrape_property_listings(zipcode)
    print(f"Found {len(recent_sales)} properties")

    print("Finding comparable sales...")
    selected_comps = find_comparable_sales(subject_property, recent_sales)

    print("Estimating value...")
    appraisal = estimate_value(subject_property, selected_comps)

    if appraisal:
        price_range = appraisal['price_range']
        print(f"Estimated value: ${appraisal['estimated_value']:,}")
        print(f"Confidence: {appraisal['confidence']}%")
        print(f"Range: ${price_range['low']:,} - ${price_range['high']:,}")

    return {"comps": selected_comps, "valuation": appraisal}

if __name__ == "__main__":
    # Example subject property. Price is None because it is the
    # unknown the engine estimates.
    subject = {
        "address": "123 Main St",
        "beds": 3,
        "baths": 2,
        "sqft": 1800,
        "price": None,
    }
    result = run_comps_analysis("90210", subject)
Enter fullscreen mode Exit fullscreen mode

Conclusion

A comps engine turns raw property data into actionable valuations. By scraping public records and listing sites, applying similarity scoring, and weighting comparable sales, you can build a tool that rivals commercial appraisal software. Use ScraperAPI for reliable access to real estate sites with heavy bot protection, and ThorData for residential proxies when accessing county assessor portals.

Top comments (0)