Insurance comparison sites aggregate quotes from dozens of providers. Scraping this data reveals pricing patterns, regional differences, and competitive dynamics in the insurance market.
The Insurance Data Opportunity
Insurance comparison sites like Policygenius, The Zebra, and NerdWallet publish rate ranges, provider rankings, and coverage details. This data is valuable for market research and competitive analysis.
Setting Up the Scraper
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
# ScraperAPI key shared by every fetch below (proxying + JS rendering).
API_KEY = "YOUR_SCRAPERAPI_KEY" # Get one at https://www.scraperapi.com?fp_ref=the52
def scrape_insurance_comparator(insurance_type, state):
    """Scrape provider quote cards for one insurance type in one state.

    Fetches The Zebra's comparison page through the ScraperAPI proxy
    (with JavaScript rendering) and extracts one record per visible card.

    Args:
        insurance_type: URL slug such as 'auto-insurance'.
        state: URL slug such as 'california'.

    Returns:
        List of dicts with keys insurance_type, state, company, rate,
        and an ISO-8601 scraped_at timestamp. Empty list when the page
        cannot be fetched.
    """
    from urllib.parse import quote

    url = f"https://www.thezebra.com/{insurance_type}/{state}/"
    # Fix: the target URL must be percent-encoded — otherwise any '&' or
    # '?' inside it would be parsed as part of ScraperAPI's own query string.
    proxy_url = (
        f"http://api.scraperapi.com?api_key={API_KEY}"
        f"&url={quote(url, safe='')}&render=true"
    )
    try:
        response = requests.get(proxy_url, timeout=60)
        # Fix: without this, a 4xx/5xx error page would be parsed as if it
        # were a real results page and silently yield zero quotes.
        response.raise_for_status()
    except requests.RequestException as exc:
        # Best-effort scraping: report and continue rather than abort the run.
        print(f"Failed to fetch {url}: {exc}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    quotes = []
    for card in soup.select('.company-card, .rate-card'):
        company = card.select_one('.company-name, h3')
        rate = card.select_one('.rate, .price')
        if company:  # skip cards with no recognizable company name
            quotes.append({
                'insurance_type': insurance_type,
                'state': state,
                'company': company.text.strip(),
                'rate': rate.text.strip() if rate else 'N/A',
                'scraped_at': datetime.now().isoformat(),
            })
    return quotes
# Sweep every (insurance type, state) pair, collect all provider quotes,
# and persist the combined result to CSV.
states = ['california', 'texas', 'florida', 'new-york', 'illinois']
types = ['auto-insurance', 'home-insurance', 'life-insurance']

all_quotes = []
for ins_type in types:
    for state in states:
        quotes = scrape_insurance_comparator(ins_type, state)
        print(f"{ins_type}/{state}: {len(quotes)} providers")
        all_quotes += quotes

df = pd.DataFrame(all_quotes)
df.to_csv('insurance_quotes.csv', index=False)
Parsing and Normalizing Rates
import re

# Compiled once: a dollar sign followed by a number with optional decimals.
# Thousands separators are stripped before matching, so no ',' is needed here.
_RATE_RE = re.compile(r'\$(\d+(?:\.\d+)?)')

def parse_monthly_rate(rate_str):
    """Extract the first dollar amount from a scraped rate string.

    Handles values like '$120/mo', '$1,234.56', or 'N/A'.

    Args:
        rate_str: Raw rate text from a quote card. Non-string values
            (e.g. pandas NaN) are tolerated and yield None.

    Returns:
        The amount as a float, or None when no '$<number>' is present.
    """
    # Fix: the original crashed with AttributeError on non-string input.
    if not isinstance(rate_str, str):
        return None
    # Strip thousands separators first so '$1,234.56' parses as 1234.56.
    match = _RATE_RE.search(rate_str.replace(',', ''))
    return float(match.group(1)) if match else None
# Normalize the scraped rate strings to floats, then summarize the
# distribution per (insurance type, state) segment.
df['monthly_rate'] = df['rate'].map(parse_monthly_rate)
summary = (
    df.groupby(['insurance_type', 'state'])['monthly_rate']
      .agg(['mean', 'min', 'max'])
)
print(summary.round(2).to_string())
Cross-Site Comparison
The same ScraperAPI proxy setup works for other comparison sites — for example, Policygenius:
def scrape_policygenius(insurance_type):
    """Scrape provider names from Policygenius for one insurance type.

    Args:
        insurance_type: URL slug such as 'auto-insurance'.

    Returns:
        List of provider name strings; empty list on fetch failure.
    """
    from urllib.parse import quote

    url = f"https://www.policygenius.com/{insurance_type}/"
    # Fix: percent-encode the target URL so its characters are not read
    # as part of ScraperAPI's own query string.
    proxy_url = (
        f"http://api.scraperapi.com?api_key={API_KEY}"
        f"&url={quote(url, safe='')}&render=true"
    )
    try:
        response = requests.get(proxy_url, timeout=60)
        # Fix: surface HTTP errors instead of parsing an error page.
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Failed to fetch {url}: {exc}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    providers = []
    for item in soup.select('.provider-listing'):
        name = item.select_one('.provider-name')
        if name:  # listings without a name node are skipped
            providers.append(name.text.strip())
    return providers
Visualizing Regional Differences
import matplotlib.pyplot as plt

# Compare mean auto-insurance rates per company across states and save
# the chart to disk.
auto_rates = df.loc[df['insurance_type'] == 'auto-insurance']
pivot = auto_rates.pivot_table(
    values='monthly_rate', index='state', columns='company', aggfunc='mean'
)
ax = pivot.plot(kind='bar', figsize=(14, 6), title='Auto Insurance Rates by State')
ax.set_ylabel('Monthly Rate ($)')
plt.tight_layout()
plt.savefig('insurance_rates.png', dpi=150)
Key Findings
- Auto insurance rates vary up to 300% between states for identical coverage
- Seasonal patterns exist — rates increase before winter in northern states
- Insurtechs consistently price 10-20% below incumbents initially
- Track with ScrapeOps and use ScraperAPI for JS rendering
Top comments (0)