Why Real Estate Data Pipelines Matter
Real estate investors and analysts need fresh, structured data from multiple listing sites. Building an automated pipeline saves hours of manual research and lets you spot deals before competitors.
In this guide, we'll build a Python pipeline that collects property data from major real estate platforms, normalizes it, and stores it for analysis.
Architecture Overview
Our pipeline follows a simple ETL pattern:
- Extract — Fetch listing pages via API proxy
- Transform — Parse HTML into structured data
- Load — Store in SQLite for querying
Setting Up the Scraper
First, install the dependencies:
pip install requests beautifulsoup4 pandas
Note: sqlite3 ships with the Python standard library, so there is nothing to install for it.
The Core Scraper Class
import requests
from bs4 import BeautifulSoup
import pandas as pd
import sqlite3
import time
import json
class RealEstatePipeline:
    """ETL pipeline for real estate listings.

    Extract: fetch listing pages through a proxy API (handles blocks).
    Transform: parse the HTML cards into normalized dicts.
    Load: append the rows to a local SQLite database.
    """

    def __init__(self, api_key):
        """
        Args:
            api_key: ScraperAPI key used to authenticate proxy requests.
        """
        self.session = requests.Session()
        self.api_key = api_key
        self.base_url = "https://api.scraperapi.com"
        # Local store; the file is created on first use.
        self.db = sqlite3.connect("real_estate.db")
        self._create_tables()

    def _create_tables(self):
        """Create the listings table if this is a fresh database."""
        self.db.execute("""
            CREATE TABLE IF NOT EXISTS listings (
                id TEXT PRIMARY KEY,
                source TEXT,
                address TEXT,
                price REAL,
                bedrooms INTEGER,
                bathrooms REAL,
                sqft INTEGER,
                listing_url TEXT,
                scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """)
        self.db.commit()

    def fetch_page(self, url, timeout=60):
        """Fetch a page through ScraperAPI to handle blocks.

        Args:
            url: Target page to scrape.
            timeout: Seconds to wait for the proxy; rendered pages are
                slow, so the default is generous. Without a timeout a
                stuck proxy would hang the whole pipeline.

        Returns:
            BeautifulSoup of the response body.

        Raises:
            requests.HTTPError: On a non-2xx response.
        """
        params = {
            "api_key": self.api_key,
            "url": url,
            "render": "true"  # ask the proxy to execute JavaScript
        }
        resp = self.session.get(self.base_url, params=params, timeout=timeout)
        resp.raise_for_status()
        return BeautifulSoup(resp.text, "html.parser")

    def parse_listings(self, soup, source):
        """Extract listing dicts from parsed HTML.

        Cards missing a price or an address, or with a non-numeric
        price ("Contact for price"), are skipped rather than crashing
        the run.
        """
        listings = []
        # Selector alternatives cover the card markup of the supported sites.
        cards = soup.select(".property-card, .HomeCard, .listing-card")
        for card in cards:
            price_el = card.select_one(".price, [data-test=property-card-price]")
            addr_el = card.select_one(".address, [data-test=property-card-addr]")
            if not (price_el and addr_el):
                continue
            price = self._parse_price(price_el.get_text())
            if price is None:
                continue  # unpriced listing — not usable for analysis
            listings.append({
                "source": source,
                "address": addr_el.get_text(strip=True),
                "price": price,
                "bedrooms": self._extract_beds(card),
                "bathrooms": self._extract_baths(card),
                "sqft": self._extract_sqft(card),
            })
        return listings

    def _parse_price(self, text):
        """Parse "$1,250,000" -> 1250000.0; return None for non-numeric text."""
        try:
            return float(text.replace("$", "").replace(",", "").strip())
        except ValueError:
            return None

    def _extract_field(self, card, selector, convert):
        """Locate *selector* in *card* and convert its first token.

        Returns None when the element is absent, empty, or unparseable
        (IndexError covers empty text, ValueError covers bad tokens).
        """
        el = card.select_one(selector)
        if el is None:
            return None
        try:
            return convert(el.get_text().replace(",", "").split()[0])
        except (ValueError, IndexError):
            return None

    def _extract_beds(self, card):
        """Bedroom count as int, or None."""
        return self._extract_field(card, ".beds, [data-test=bed]", int)

    def _extract_baths(self, card):
        """Bathroom count as float (half baths), or None."""
        return self._extract_field(card, ".baths, [data-test=bath]", float)

    def _extract_sqft(self, card):
        """Square footage as int, or None."""
        return self._extract_field(card, ".sqft, [data-test=sqft]", int)

    def save_listings(self, listings):
        """Append *listings* to the database; a no-op for an empty batch.

        Guarding the empty case avoids handing pandas a column-less
        DataFrame, which to_sql cannot insert.
        """
        if not listings:
            print("Saved 0 listings")
            return
        df = pd.DataFrame(listings)
        df.to_sql("listings", self.db, if_exists="append", index=False)
        print(f"Saved {len(listings)} listings")

    def run(self, urls):
        """Scrape every source and persist the results.

        Args:
            urls: Mapping of source name -> search-results URL.
        """
        for source, url in urls.items():
            print(f"Scraping {source}...")
            soup = self.fetch_page(url)
            listings = self.parse_listings(soup, source)
            self.save_listings(listings)
            time.sleep(2)  # Be respectful

    def close(self):
        """Close the SQLite connection; reconnects happen via __init__."""
        self.db.close()
Running the Pipeline
# Search-results pages to scrape, keyed by source name.
urls = {
    "zillow": "https://www.zillow.com/homes/for_sale/Austin-TX/",
    "redfin": "https://www.redfin.com/city/30818/TX/Austin",
    "realtor": "https://www.realtor.com/realestateandhomes-search/Austin_TX",
}

pipeline = RealEstatePipeline(api_key="YOUR_SCRAPERAPI_KEY")
pipeline.run(urls)
Handling Anti-Bot Protections
Real estate sites use aggressive bot detection. Using a proxy API like ScraperAPI handles CAPTCHAs, IP rotation, and browser fingerprinting automatically.
For high-volume scraping, ThorData provides residential proxies that blend in with normal traffic.
Scheduling and Monitoring
Use cron or a task scheduler to run your pipeline daily:
# Run every day at 6 AM
0 6 * * * cd /home/user/pipeline && python3 scraper.py >> logs/scrape.log 2>&1
Monitor your scraping jobs with ScrapeOps — it tracks success rates, response times, and alerts you when something breaks.
Analyzing the Data
import pandas as pd
import sqlite3

db = sqlite3.connect("real_estate.db")
# Require sqft > 0 as well: rows with NULL/zero sqft would turn the
# price-per-sqft ratio below into NaN or inf.
df = pd.read_sql("SELECT * FROM listings WHERE price > 0 AND sqft > 0", db)
db.close()  # the DataFrame is in memory; the connection is no longer needed

# Price distribution by source
print(df.groupby("source")["price"].describe())

# Best deals: lowest price per sqft
df["price_per_sqft"] = df["price"] / df["sqft"]
best_deals = df.nsmallest(10, "price_per_sqft")
print(best_deals[["address", "price", "sqft", "price_per_sqft"]])
Key Takeaways
- Use proxy APIs to avoid getting blocked by real estate sites
- Normalize data from different sources into a common schema
- Store everything in a database for historical trend analysis
- Schedule runs to catch new listings early
- Always respect robots.txt and rate-limit your requests
Building a real estate data pipeline is one of the most practical scraping projects — the data has direct monetary value for investors and analysts.
Top comments (0)