Introduction
Real estate comparable sales (comps) are the backbone of property valuation. Professional appraisers and investors use comps to determine fair market value, but accessing this data typically requires expensive MLS subscriptions. In this tutorial, we'll build a Python-based comps engine that collects and analyzes property data from public sources.
Setup
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import json
import time
from datetime import datetime
from math import radians, cos, sin, asin, sqrt
# Route requests through a scraping proxy service to get past real estate
# sites' anti-bot protection (Cloudflare, PerimeterX, etc.).
# Sign up at https://www.scraperapi.com to obtain an API key.
SCRAPER_API_KEY = "your_key_here"  # placeholder — replace with your real key
BASE_URL = "http://api.scraperapi.com"  # all scrape requests are proxied through this endpoint
Scraping Property Listings
Public county assessor records and listing sites contain property data:
def scrape_property_listings(zipcode, property_type="single_family"):
    """Scrape recently sold listings for a ZIP code from public sources.

    Args:
        zipcode: ZIP code string to search (e.g. "90210").
        property_type: one of "single_family", "condo", "townhome",
            "multi_family". Unknown values fall back to single-family,
            preserving the previous behavior.

    Returns:
        A list of dicts with price, address, beds, baths, sqft, zipcode,
        and an ISO-format scraped_at timestamp. Cards missing a price or
        address are skipped.

    Raises:
        requests.HTTPError: if the proxy returns a non-2xx status.
    """
    # Fix: property_type was previously accepted but ignored — the URL was
    # hard-coded to single-family homes regardless of the argument.
    type_slugs = {
        "single_family": "single-family-home",
        "condo": "condo",
        "townhome": "townhome",
        "multi_family": "multi-family-home",
    }
    slug = type_slugs.get(property_type, "single-family-home")
    url = (
        f"https://www.realtor.com/realestateandhomes-search/"
        f"{zipcode}/type-{slug}/show-recently-sold"
    )
    params = {
        "api_key": SCRAPER_API_KEY,
        "url": url,
        "render": "true",  # JS rendering is needed for the listing cards
    }
    # Timeout keeps a stalled proxy request from hanging the pipeline;
    # raise_for_status avoids silently parsing an error page as "no results".
    response = requests.get(BASE_URL, params=params, timeout=90)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    properties = []
    for card in soup.select(".property-card"):
        price = card.select_one(".price")
        address = card.select_one(".address")
        if not (price and address):
            continue  # a card without both price and address is unusable
        properties.append({
            "price": parse_price(price.text),
            "address": address.text.strip(),
            "beds": parse_number(card.select_one(".beds")),
            "baths": parse_number(card.select_one(".baths")),
            "sqft": parse_number(card.select_one(".sqft")),
            "zipcode": zipcode,
            "scraped_at": datetime.now().isoformat(),
        })
    return properties
def parse_price(text):
    """Convert a display price such as "$425,000" to an integer dollar amount.

    Returns None when the cleaned text is not a whole number (e.g. "Call
    for price" or a "$1.2M" style abbreviation).
    """
    # Strip all dollar signs and thousands separators in one pass.
    cleaned = text.translate(str.maketrans("", "", "$,")).strip()
    try:
        return int(cleaned)
    except ValueError:
        return None
def parse_number(element):
    """Extract the numeric portion of an element's text as a float.

    "3 beds" -> 3.0, "2.5 baths" -> 2.5. Returns None for a missing
    element or text with no parseable number.
    """
    if not element:
        return None
    raw = element.text.strip()
    # Keep only digits and decimal points; everything else is label text.
    numeric_part = "".join(ch for ch in raw if ch.isdigit() or ch == ".")
    try:
        return float(numeric_part)
    except ValueError:
        # No digits at all, or malformed input like "1.2.3".
        return None
County Assessor Data
def scrape_assessor_data(county, parcel_id):
    """Fetch property details from a county assessor portal.

    Args:
        county: county name; "los_angeles", "cook", or "harris"
            (case-insensitive).
        parcel_id: the county's parcel/PIN/account identifier.

    Returns:
        A dict with parcel_id, county, assessed_value, year_built,
        lot_size, and zoning (individual fields may be None if not found
        on the page), or None for an unsupported county.

    Raises:
        requests.HTTPError: if the proxy returns a non-2xx status.
    """
    # NOTE(review): government portals often block datacenter IPs — a
    # residential proxy pool may be required in practice.
    county_urls = {
        "los_angeles": f"https://portal.assessor.lacounty.gov/parceldetail/{parcel_id}",
        "cook": f"https://www.cookcountyassessor.com/pin/{parcel_id}",
        "harris": f"https://public.hcad.org/records/details.asp?cession=1&search=acct&acct={parcel_id}"
    }
    url = county_urls.get(county.lower())
    if not url:
        return None  # unsupported county — caller must handle None
    params = {"api_key": SCRAPER_API_KEY, "url": url, "render": "true"}
    # Fix: add a timeout so a slow portal cannot hang indefinitely, and
    # fail loudly on HTTP errors instead of parsing an error page into a
    # dict of all-None fields.
    response = requests.get(BASE_URL, params=params, timeout=90)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    return {
        "parcel_id": parcel_id,
        "county": county,
        "assessed_value": extract_field(soup, "assessed"),
        "year_built": extract_field(soup, "year"),
        "lot_size": extract_field(soup, "lot"),
        "zoning": extract_field(soup, "zone")
    }
def extract_field(soup, keyword):
    """Return the value cell of the first row whose text contains *keyword*.

    Scans table rows and detail/info divs; matching is case-insensitive.
    Returns None when no row matches or the matching row has no value cell.
    """
    needle = keyword.lower()
    for row in soup.select("tr, .detail-row, .info-row"):
        if needle not in row.text.lower():
            continue
        cell = row.select_one("td:last-child, .value, span")
        if cell:
            return cell.text.strip()
    return None
Comparable Sales Algorithm
The core of the comps engine — finding and scoring similar properties:
def haversine_distance(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in miles between two lat/lon points."""
    earth_radius_miles = 3956
    phi1, lam1, phi2, lam2 = (radians(v) for v in (lat1, lon1, lat2, lon2))
    # Haversine formula: a is the squared half-chord length between the points.
    half_dphi = (phi2 - phi1) / 2
    half_dlam = (lam2 - lam1) / 2
    a = sin(half_dphi) ** 2 + cos(phi1) * cos(phi2) * sin(half_dlam) ** 2
    return 2 * earth_radius_miles * asin(sqrt(a))
def find_comparable_sales(subject, candidates, max_comps=5):
    """Rank candidate sales by similarity to the subject property.

    Candidates scoring 0 are discarded; the remainder are sorted by
    score, highest first, and truncated to max_comps entries. Each
    result is a dict with "property" and "score" keys.
    """
    pairs = [(calculate_comp_score(subject, cand), cand) for cand in candidates]
    ranked = sorted(
        (pair for pair in pairs if pair[0] > 0),
        key=lambda pair: pair[0],
        reverse=True,
    )
    return [{"property": cand, "score": score} for score, cand in ranked[:max_comps]]
def calculate_comp_score(subject, comp):
    """Rate how comparable *comp* is to *subject* on a 0-100 scale.

    Starts from 100 and applies capped penalties for differences in
    square footage (max 30), bedrooms (max 15), bathrooms (max 10), and
    price per square foot (max 25). Any criterion missing (or zero) on
    either side is skipped rather than penalized.
    """
    penalty = 0
    sub_sqft, comp_sqft = subject.get("sqft"), comp.get("sqft")
    # Square footage similarity: 1% relative difference costs 1 point.
    if sub_sqft and comp_sqft:
        penalty += min(30, abs(sub_sqft - comp_sqft) / sub_sqft * 100)
    # Bedrooms: 7.5 points per bedroom of difference.
    sub_beds, comp_beds = subject.get("beds"), comp.get("beds")
    if sub_beds and comp_beds:
        penalty += min(15, abs(sub_beds - comp_beds) * 7.5)
    # Bathrooms: 5 points per bathroom of difference.
    sub_baths, comp_baths = subject.get("baths"), comp.get("baths")
    if sub_baths and comp_baths:
        penalty += min(10, abs(sub_baths - comp_baths) * 5)
    # Price per sqft: 1% relative difference costs 1 point.
    sub_price, comp_price = subject.get("price"), comp.get("price")
    if sub_price and sub_sqft and comp_price and comp_sqft:
        sub_ppsf = sub_price / sub_sqft
        ppsf_gap = abs(sub_ppsf - comp_price / comp_sqft) / sub_ppsf
        penalty += min(25, ppsf_gap * 100)
    return max(0, 100 - penalty)
Valuation Estimate
def estimate_value(subject, comps):
    """Produce a weighted price-per-sqft valuation from scored comps.

    Each comp with a price and square footage contributes an estimate
    (its $/sqft applied to the subject's sqft), weighted by its
    similarity score. Returns a dict with estimated_value, confidence,
    comps_used, and the min/max comp price range — or None when there
    are no comps or no usable weights.
    """
    if not comps:
        return None
    weighted_sum = 0.0
    total_weight = 0
    for entry in comps:
        prop = entry["property"]
        weight = entry["score"]
        if prop.get("price") and prop.get("sqft") and subject.get("sqft"):
            # Apply the comp's $/sqft to the subject's square footage.
            estimate = prop["price"] / prop["sqft"] * subject["sqft"]
            weighted_sum += estimate * weight
            total_weight += weight
    if total_weight <= 0:
        return None
    priced = [e["property"]["price"] for e in comps if e["property"].get("price")]
    return {
        "estimated_value": round(weighted_sum / total_weight),
        # Confidence is the mean comp score (each score maxes at 100).
        "confidence": round(total_weight / (len(comps) * 100) * 100, 1),
        "comps_used": len(comps),
        "price_range": {"low": min(priced), "high": max(priced)},
    }
Running the Comps Engine
def run_comps_analysis(zipcode, subject_property):
    """Run the full pipeline: scrape sales, rank comps, estimate value.

    Prints progress and the valuation summary, and returns a dict with
    the selected "comps" and the "valuation" result (None if no
    valuation could be computed).
    """
    print(f"Collecting recent sales in {zipcode}...")
    recent_sales = scrape_property_listings(zipcode)
    print(f"Found {len(recent_sales)} properties")
    print("Finding comparable sales...")
    top_comps = find_comparable_sales(subject_property, recent_sales)
    print("Estimating value...")
    valuation = estimate_value(subject_property, top_comps)
    if valuation:
        print(f"Estimated value: ${valuation['estimated_value']:,}")
        print(f"Confidence: {valuation['confidence']}%")
        print(f"Range: ${valuation['price_range']['low']:,} - ${valuation['price_range']['high']:,}")
    return {"comps": top_comps, "valuation": valuation}
if __name__ == "__main__":
    # Example run: value a 3-bed / 2-bath, 1,800 sqft home in ZIP 90210.
    # price is None because the subject's value is what we're estimating.
    subject = {
        "address": "123 Main St",
        "beds": 3,
        "baths": 2,
        "sqft": 1800,
        "price": None,
    }
    result = run_comps_analysis("90210", subject)
Conclusion
A comps engine turns raw property data into actionable valuations. By scraping public records and listing sites, applying similarity scoring, and weighting comparable sales, you can build a tool that rivals commercial appraisal software. Use ScraperAPI for reliable access to real estate sites with heavy bot protection, and ThorData for residential proxies when accessing county assessor portals.
Top comments (0)