Why Build a Price Comparison Tool?
Price comparison tools are one of the most practical web scraping projects you can build. Whether it's for personal use (finding the best deals) or a business application (competitive pricing intelligence), the fundamentals are the same: scrape prices from multiple sources, normalize the data, and present the results.
In this tutorial, I'll walk you through building a multi-site price scraper from scratch.
Architecture Overview
Our price comparison tool has four components:
- Scrapers — Site-specific modules that extract product data
- Normalizer — Cleans and standardizes data across sources
- Storage — SQLite database for price history
- Reporter — Generates comparison output
┌─────────────┐ ┌────────────┐ ┌──────────┐ ┌──────────┐
│ Scraper A │────▶│ │────▶│ │────▶│ │
│ Scraper B │────▶│ Normalizer │────▶│ SQLite │────▶│ Reporter │
│ Scraper C │────▶│ │────▶│ │────▶│ │
└─────────────┘ └────────────┘ └──────────┘ └──────────┘
Setting Up
pip install requests beautifulsoup4 playwright pandas
playwright install chromium
(Playwright and its Chromium download are optional for this tutorial — the scrapers below use only requests and BeautifulSoup. Install Playwright if you plan to extend the tool to JavaScript-rendered sites.)
The Base Scraper Class
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Optional
import hashlib
@dataclass
class Product:
    """A normalized product record produced by any site scraper.

    The required fields are what every scraper must fill in; ``image_url``
    and ``in_stock`` are optional extras with sensible defaults.
    """

    name: str
    price: float
    currency: str
    url: str
    source: str
    image_url: Optional[str] = None
    in_stock: bool = True

    @property
    def product_id(self):
        """Deterministic identifier derived from the (source, url) pair.

        The same listing scraped twice hashes to the same id, which lets the
        database accumulate a price history per product.
        """
        key = f'{self.source}:{self.url}'
        return hashlib.md5(key.encode()).hexdigest()
class BaseScraper(ABC):
    """Common base for site scrapers: owns the HTTP session and optional proxy.

    Subclasses implement the site-specific parsing in search()/get_product().
    """

    def __init__(self, proxy_url=None):
        self.proxy_url = proxy_url
        self.session = requests.Session()
        if proxy_url:
            # Route both plain-HTTP and HTTPS traffic through the same proxy.
            self.session.proxies = {scheme: proxy_url for scheme in ('http', 'https')}

    @abstractmethod
    def search(self, query: str) -> list[Product]:
        """Return all products matching *query* on this site."""

    @abstractmethod
    def get_product(self, url: str) -> Optional[Product]:
        """Fetch a single product page; return None if it cannot be parsed."""
Building Site-Specific Scrapers
Amazon Scraper
import requests
from bs4 import BeautifulSoup
import re
import time
import random
class AmazonScraper(BaseScraper):
    """Scrapes Amazon search results and individual product pages.

    NOTE(review): Amazon blocks bots aggressively; without rotating proxies
    expect 503s/captcha pages. raise_for_status() below surfaces those as
    exceptions instead of silently parsing an error page as "no results".
    """

    BASE_URL = 'https://www.amazon.com'
    # One shared header set so search() and get_product() present the same
    # browser fingerprint (previously get_product omitted Accept-Language).
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept-Language': 'en-US,en;q=0.9',
    }

    def search(self, query: str) -> list[Product]:
        """Return products parsed from the first page of search results."""
        url = f'{self.BASE_URL}/s?k={query.replace(" ", "+")}'
        # timeout prevents a hung connection from stalling the whole run.
        response = self.session.get(url, headers=self.HEADERS, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        products = []
        for item in soup.select('[data-component-type="s-search-result"]'):
            try:
                name_el = item.select_one('h2 a span')
                price_whole = item.select_one('.a-price-whole')
                price_frac = item.select_one('.a-price-fraction')
                link = item.select_one('h2 a')
                img = item.select_one('.s-image')
                if not (name_el and price_whole):
                    continue  # ad tiles / unavailable items often lack a price
                # Whole and fractional parts are separate elements; recombine.
                price_str = price_whole.get_text(strip=True).replace(',', '')
                frac = price_frac.get_text(strip=True) if price_frac else '00'
                price = float(f'{price_str}.{frac}')
                products.append(Product(
                    name=name_el.get_text(strip=True),
                    price=price,
                    currency='USD',
                    url=self.BASE_URL + link.get('href') if link else '',
                    source='amazon',
                    image_url=img.get('src') if img else None,
                ))
            except (ValueError, AttributeError):
                continue  # skip tiles whose markup doesn't match expectations
        return products

    def get_product(self, url):
        """Fetch one product page; return None when the layout can't be parsed."""
        response = self.session.get(url, headers=self.HEADERS, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        name = soup.select_one('#productTitle')
        price = soup.select_one('.a-price .a-offscreen')
        if not (name and price):
            return None
        try:
            price_val = float(re.sub(r'[^\d.]', '', price.get_text()))
        except ValueError:
            # e.g. a price range leaves multiple dots after stripping symbols
            return None
        return Product(
            name=name.get_text(strip=True),
            price=price_val,
            currency='USD',
            url=url,
            source='amazon',
        )
eBay Scraper
class EbayScraper(BaseScraper):
    """Scrapes eBay search results; get_product is intentionally a stub."""

    BASE_URL = 'https://www.ebay.com'
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    def search(self, query: str) -> list[Product]:
        """Return products parsed from the first page of search results."""
        url = f'{self.BASE_URL}/sch/i.html?_nkw={query.replace(" ", "+")}'
        # timeout + raise_for_status surface blocks instead of parsing them.
        response = self.session.get(url, headers=self.HEADERS, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        products = []
        for item in soup.select('.s-item'):
            name_el = item.select_one('.s-item__title')
            price_el = item.select_one('.s-item__price')
            link_el = item.select_one('.s-item__link')
            img_el = item.select_one('.s-item__image-img')
            if not (name_el and price_el):
                continue
            name = name_el.get_text(strip=True)
            # eBay injects a template card titled "Shop on eBay" at the top of
            # the result list; it is not a real listing, so drop it.
            if name == 'Shop on eBay':
                continue
            price_text = price_el.get_text(strip=True)
            # Ranges like "$10.00 to $15.00" match on the first (lowest) price.
            price_match = re.search(r'[\d,]+\.\d{2}', price_text)
            if not price_match:
                continue
            products.append(Product(
                name=name,
                price=float(price_match.group().replace(',', '')),
                currency='USD',
                url=link_el.get('href') if link_el else '',
                source='ebay',
                image_url=img_el.get('src') if img_el else None,
            ))
        return products

    def get_product(self, url):
        """Not implemented for eBay in this tutorial; always returns None."""
        return None  # Simplified for this tutorial
Price Database
import sqlite3
from datetime import datetime
class PriceDatabase:
    """Append-only SQLite store of price observations.

    Every save_product() call inserts a new row, so the table doubles as a
    per-product price history keyed by the deterministic product_id.
    Usable as a context manager so the connection is always closed.
    """

    def __init__(self, db_path='prices.db'):
        self.conn = sqlite3.connect(db_path)
        self.create_tables()

    def create_tables(self):
        """Create the prices table and its lookup index if missing (idempotent)."""
        self.conn.execute('''
            CREATE TABLE IF NOT EXISTS prices (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                product_id TEXT,
                name TEXT,
                price REAL,
                currency TEXT,
                source TEXT,
                url TEXT,
                scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')
        # get_price_history() filters on product_id; without an index every
        # lookup is a full table scan that slows down as history grows.
        self.conn.execute(
            'CREATE INDEX IF NOT EXISTS idx_prices_product_id ON prices (product_id)'
        )
        self.conn.commit()

    def save_product(self, product: 'Product'):
        """Insert one observation; scraped_at defaults to the current UTC time."""
        self.conn.execute(
            'INSERT INTO prices (product_id, name, price, currency, source, url) VALUES (?, ?, ?, ?, ?, ?)',
            (product.product_id, product.name, product.price, product.currency, product.source, product.url)
        )
        self.conn.commit()

    def get_price_history(self, product_id):
        """Return (price, scraped_at) rows for one product, oldest first."""
        cursor = self.conn.execute(
            'SELECT price, scraped_at FROM prices WHERE product_id = ? ORDER BY scraped_at',
            (product_id,)
        )
        return cursor.fetchall()

    def get_best_prices(self, name_query):
        """Return up to 20 cheapest rows whose name contains *name_query*."""
        cursor = self.conn.execute('''
            SELECT name, price, source, url, scraped_at
            FROM prices
            WHERE name LIKE ?
            ORDER BY price ASC
            LIMIT 20
        ''', (f'%{name_query}%',))
        return cursor.fetchall()

    def close(self):
        """Close the underlying connection; safe to call more than once."""
        self.conn.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        # Always release the connection, even when the body raised.
        self.close()
The Comparison Engine
import pandas as pd
class PriceComparer:
    """Fans a query out to every registered scraper and aggregates results."""

    def __init__(self):
        self.scrapers = [
            AmazonScraper(),
            EbayScraper(),
        ]
        self.db = PriceDatabase()

    def compare(self, query: str) -> pd.DataFrame:
        """Search every site for *query*, persist each hit, and return the
        20 cheapest results as a DataFrame (empty DataFrame when nothing
        was found on any site).
        """
        all_products = []
        for i, scraper in enumerate(self.scrapers):
            try:
                products = scraper.search(query)
                for p in products:
                    self.db.save_product(p)
                all_products.extend(products)
                print(f'{scraper.__class__.__name__}: found {len(products)} results')
            except Exception as e:
                # One failing site must not abort the whole comparison.
                print(f'{scraper.__class__.__name__} failed: {e}')
            # Jittered delay between sites to avoid a burst of requests;
            # skip it after the final scraper — there is nothing left to wait for.
            if i < len(self.scrapers) - 1:
                time.sleep(random.uniform(1, 3))
        if not all_products:
            return pd.DataFrame()
        df = pd.DataFrame([
            {'name': p.name, 'price': p.price, 'source': p.source,
             'url': p.url, 'in_stock': p.in_stock}
            for p in all_products
        ])
        return df.sort_values('price').head(20)
# Run comparison
comparer = PriceComparer()
results = comparer.compare('wireless mouse')
# compare() returns an empty DataFrame with no columns when every scraper
# fails, so selecting columns unguarded would raise a KeyError.
if results.empty:
    print('No results found')
else:
    # to_string() prints the full frame without pandas' column truncation.
    print(results[['name', 'price', 'source']].to_string())
Scaling with Proxies
When scraping multiple sites, you'll quickly hit rate limits. Using a proxy service like ThorData with rotating residential IPs solves this:
# Example rotating-proxy gateway URL; replace user:pass with real credentials.
proxy_url = 'http://user:pass@proxy.thordata.com:9000'
# Every scraper shares the same gateway; per the provider description above,
# the gateway rotates the outbound residential IP between requests.
scrapers = [
AmazonScraper(proxy_url=proxy_url),
EbayScraper(proxy_url=proxy_url),
]
Scheduling Price Checks
import schedule
def daily_check():
    """Compare prices for a fixed watchlist and report the cheapest hit of each."""
    watchlist = ('wireless mouse', 'mechanical keyboard', 'usb-c hub')
    comparer = PriceComparer()
    for query in watchlist:
        results = comparer.compare(query)
        if not results.empty:
            # Rows come back sorted by price, so the first row is the cheapest.
            top = results.iloc[0]
            print(f'Best {query}: ${top["price"]:.2f} at {top["source"]}')
        # Pause between queries so back-to-back searches don't hammer the sites.
        time.sleep(5)
# Register the job to run once a day at 09:00 local time.
schedule.every().day.at('09:00').do(daily_check)
# schedule only fires jobs when run_pending() is called, so poll in a loop;
# sleeping 60 s keeps CPU usage negligible between checks.
while True:
schedule.run_pending()
time.sleep(60)
Conclusion
A price comparison tool is a great way to learn multi-site scraping. The key challenges are normalizing data across sources and handling anti-bot measures. For serious scraping workloads, a reliable proxy service like ThorData keeps your scrapers running smoothly across all target sites.
The full code from this tutorial gives you a foundation to build on — add more retailers, implement price alerts, or build a web dashboard.
Top comments (0)