DEV Community

agenthustler
agenthustler

Posted on

Build a Price Comparison Tool with Python Web Scraping

Why Build a Price Comparison Tool?

Price comparison tools are one of the most practical web scraping projects you can build. Whether it's for personal use (finding the best deals) or a business application (competitive pricing intelligence), the fundamentals are the same: scrape prices from multiple sources, normalize the data, and present the results.

In this tutorial, I'll walk you through building a multi-site price scraper from scratch.

Architecture Overview

Our price comparison tool has four components:

  1. Scrapers — Site-specific modules that extract product data
  2. Normalizer — Cleans and standardizes data across sources
  3. Storage — SQLite database for price history
  4. Reporter — Generates comparison output
┌─────────────┐     ┌────────────┐     ┌──────────┐     ┌──────────┐
│  Scraper A  │────▶│            │────▶│          │────▶│          │
│  Scraper B  │────▶│ Normalizer │────▶│  SQLite  │────▶│ Reporter │
│  Scraper C  │────▶│            │────▶│          │────▶│          │
└─────────────┘     └────────────┘     └──────────┘     └──────────┘
Enter fullscreen mode Exit fullscreen mode

Setting Up

pip install requests beautifulsoup4 playwright pandas
playwright install chromium
Enter fullscreen mode Exit fullscreen mode

The Base Scraper Class

from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Optional
import hashlib

@dataclass
class Product:
    """A single product listing scraped from one retail site.

    Two listings that share the same ``source`` and ``url`` are treated as
    the same product (see ``product_id``), which is what lets price history
    accumulate across repeated scrapes.
    """

    name: str                        # product title as shown on the site
    price: float                     # parsed numeric price
    currency: str                    # currency code, e.g. 'USD'
    url: str                         # product page URL
    source: str                      # short site identifier, e.g. 'amazon'
    image_url: Optional[str] = None  # thumbnail URL when the site exposes one
    in_stock: bool = True            # availability flag; assumed available

    @property
    def product_id(self):
        """Stable hex identifier derived from (source, url).

        MD5 is used purely as a fast, stable fingerprint — it is not a
        security boundary here.
        """
        key = f'{self.source}:{self.url}'
        return hashlib.md5(key.encode()).hexdigest()

class BaseScraper(ABC):
    """Common interface for all site-specific scrapers.

    Subclasses implement `search` and `get_product`; this base owns only
    the shared HTTP session and optional proxy wiring.
    """

    def __init__(self, proxy_url=None):
        self.proxy_url = proxy_url
        # One session per scraper so TCP connections are reused across requests.
        self.session = requests.Session()
        if proxy_url:
            # Route both plain and TLS traffic through the same proxy endpoint.
            self.session.proxies = {scheme: proxy_url for scheme in ('http', 'https')}

    @abstractmethod
    def search(self, query: str) -> list[Product]:
        """Return products matching `query` from this site's search page."""

    @abstractmethod
    def get_product(self, url: str) -> Optional[Product]:
        """Fetch a single product page; None when it cannot be parsed."""
Enter fullscreen mode Exit fullscreen mode

Building Site-Specific Scrapers

Amazon Scraper

import requests
from bs4 import BeautifulSoup
import re
import time
import random

class AmazonScraper(BaseScraper):
    """Scraper for amazon.com search results and product pages."""

    BASE_URL = 'https://www.amazon.com'
    # Browser-like headers shared by every request; previously duplicated
    # inline in each method (and get_product was missing Accept-Language).
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept-Language': 'en-US,en;q=0.9',
    }

    def search(self, query: str) -> list[Product]:
        """Scrape the first page of search results for `query`.

        Malformed result cards are skipped individually rather than
        aborting the whole page. Raises on HTTP error responses so the
        caller's per-scraper error handling can log the failure.
        """
        url = f'{self.BASE_URL}/s?k={query.replace(" ", "+")}'
        response = self.session.get(url, headers=self.HEADERS)
        response.raise_for_status()  # surface blocked/captcha responses as errors
        soup = BeautifulSoup(response.text, 'html.parser')
        products = []

        for item in soup.select('[data-component-type="s-search-result"]'):
            try:
                name_el = item.select_one('h2 a span')
                price_whole = item.select_one('.a-price-whole')
                price_frac = item.select_one('.a-price-fraction')
                link = item.select_one('h2 a')
                img = item.select_one('.s-image')

                if not (name_el and price_whole):
                    continue  # e.g. cards without a rendered price

                price_str = price_whole.get_text(strip=True).replace(',', '')
                frac = price_frac.get_text(strip=True) if price_frac else '00'
                price = float(f'{price_str}.{frac}')

                # href can be absent on some card layouts; the old
                # `BASE_URL + link.get('href')` raised an uncaught TypeError
                # (TypeError is not in the except tuple below) when it was.
                href = link.get('href') if link else None

                products.append(Product(
                    name=name_el.get_text(strip=True),
                    price=price,
                    currency='USD',
                    url=self.BASE_URL + href if href else '',
                    source='amazon',
                    image_url=img.get('src') if img else None,
                ))
            except (ValueError, AttributeError):
                continue  # skip cards with unexpected markup

        return products

    def get_product(self, url):
        """Fetch one product page; return None when it cannot be parsed."""
        response = self.session.get(url, headers=self.HEADERS)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        name = soup.select_one('#productTitle')
        price = soup.select_one('.a-price .a-offscreen')

        if not (name and price):
            return None

        # Extract the first numeric price instead of stripping all non-digit
        # characters: the old re.sub approach turned range texts like
        # "$10.99 - $12.99" into "10.9912.99" and crashed float().
        match = re.search(r'\d[\d,]*(?:\.\d+)?', price.get_text())
        if not match:
            return None
        price_val = float(match.group().replace(',', ''))

        return Product(
            name=name.get_text(strip=True),
            price=price_val,
            currency='USD',
            url=url,
            source='amazon',
        )
Enter fullscreen mode Exit fullscreen mode

eBay Scraper

class EbayScraper(BaseScraper):
    """Scraper for ebay.com search result listings."""

    BASE_URL = 'https://www.ebay.com'
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    }

    def search(self, query: str) -> list[Product]:
        """Scrape the first page of eBay search results for `query`.

        Raises on HTTP error responses; listings that cannot be parsed
        are skipped individually.
        """
        url = f'{self.BASE_URL}/sch/i.html?_nkw={query.replace(" ", "+")}'
        response = self.session.get(url, headers=self.HEADERS)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        products = []

        for item in soup.select('.s-item'):
            name_el = item.select_one('.s-item__title')
            price_el = item.select_one('.s-item__price')
            link_el = item.select_one('.s-item__link')
            img_el = item.select_one('.s-item__image-img')

            if not (name_el and price_el):
                continue

            name = name_el.get_text(strip=True)
            # eBay injects a template "Shop on eBay" card as the first
            # .s-item; it is not a real listing, so it must be dropped or
            # it pollutes the database with a fake product every run.
            if name == 'Shop on eBay':
                continue

            price_text = price_el.get_text(strip=True)
            # For range prices ("$12.99 to $24.99") this keeps the lower bound.
            price_match = re.search(r'[\d,]+\.\d{2}', price_text)
            if not price_match:
                continue

            # href may be missing; coerce to '' so url is always a str.
            href = link_el.get('href') if link_el else None
            products.append(Product(
                name=name,
                price=float(price_match.group().replace(',', '')),
                currency='USD',
                url=href or '',
                source='ebay',
                image_url=img_el.get('src') if img_el else None,
            ))

        return products

    def get_product(self, url):
        """Not implemented for eBay; comparisons rely on search() only."""
        return None  # Simplified for this tutorial
Enter fullscreen mode Exit fullscreen mode

Price Database

import sqlite3
from datetime import datetime

class PriceDatabase:
    """SQLite-backed store of scraped price observations.

    Every scrape appends a new row, so the table doubles as a price
    history; `product_id` groups observations of the same listing.
    Usable as a context manager so the connection is not leaked.
    """

    def __init__(self, db_path='prices.db'):
        """Open (or create) the database at `db_path` and ensure the schema."""
        self.conn = sqlite3.connect(db_path)
        self.create_tables()

    def create_tables(self):
        """Create the prices table and its lookup index if missing."""
        self.conn.execute('''
            CREATE TABLE IF NOT EXISTS prices (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                product_id TEXT,
                name TEXT,
                price REAL,
                currency TEXT,
                source TEXT,
                url TEXT,
                scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')
        # get_price_history filters on product_id and sorts by scraped_at;
        # without this index every lookup is a full table scan as the
        # history grows.
        self.conn.execute('''
            CREATE INDEX IF NOT EXISTS idx_prices_product_time
            ON prices (product_id, scraped_at)
        ''')
        self.conn.commit()

    def save_product(self, product: Product):
        """Append one price observation for `product`."""
        self.conn.execute(
            'INSERT INTO prices (product_id, name, price, currency, source, url) VALUES (?, ?, ?, ?, ?, ?)',
            (product.product_id, product.name, product.price, product.currency, product.source, product.url)
        )
        self.conn.commit()

    def get_price_history(self, product_id):
        """Return [(price, scraped_at), ...] for one product, oldest first."""
        cursor = self.conn.execute(
            'SELECT price, scraped_at FROM prices WHERE product_id = ? ORDER BY scraped_at',
            (product_id,)
        )
        return cursor.fetchall()

    def get_best_prices(self, name_query):
        """Return up to 20 cheapest rows whose name contains `name_query`.

        Uses SQLite LIKE, which is case-insensitive for ASCII by default.
        """
        cursor = self.conn.execute('''
            SELECT name, price, source, url, scraped_at
            FROM prices
            WHERE name LIKE ?
            ORDER BY price ASC
            LIMIT 20
        ''', (f'%{name_query}%',))
        return cursor.fetchall()

    def close(self):
        """Release the underlying SQLite connection (was previously leaked)."""
        self.conn.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()
Enter fullscreen mode Exit fullscreen mode

The Comparison Engine

import pandas as pd

class PriceComparer:
    """Fan a query out to every registered scraper and rank the results."""

    def __init__(self, proxy_url=None):
        """Build the scraper set.

        `proxy_url` (optional, default None = direct connection) is
        forwarded to every scraper, matching how the proxy section of
        this tutorial constructs them.
        """
        self.scrapers = [
            AmazonScraper(proxy_url=proxy_url),
            EbayScraper(proxy_url=proxy_url),
        ]
        self.db = PriceDatabase()

    def compare(self, query: str) -> pd.DataFrame:
        """Search every source for `query` and return the 20 cheapest hits.

        Every scraped product is also persisted to the price database.
        A scraper failure is printed and skipped so one blocked site
        cannot abort the whole comparison. Returns an empty DataFrame
        when no source produced results.
        """
        all_products = []
        last = len(self.scrapers) - 1

        for i, scraper in enumerate(self.scrapers):
            try:
                products = scraper.search(query)
                for p in products:
                    self.db.save_product(p)
                all_products.extend(products)
                print(f'{scraper.__class__.__name__}: found {len(products)} results')
            except Exception as e:
                # Deliberate best-effort: log and move on to the next site.
                print(f'{scraper.__class__.__name__} failed: {e}')
            # Polite jittered delay between sites — but not after the last
            # one, where it previously just delayed the caller by 1-3 s.
            if i < last:
                time.sleep(random.uniform(1, 3))

        if not all_products:
            return pd.DataFrame()

        df = pd.DataFrame([
            {'name': p.name, 'price': p.price, 'source': p.source,
             'url': p.url, 'in_stock': p.in_stock}
            for p in all_products
        ])

        return df.sort_values('price').head(20)

# Run a one-off comparison: query both sites, persist every hit to the
# price database, and print the 20 cheapest matches as a plain-text table.
comparer = PriceComparer()
results = comparer.compare('wireless mouse')
# NOTE(review): this indexes columns unconditionally — if every scraper
# failed, compare() returns an empty DataFrame with no columns and the
# column lookup below raises KeyError. Guard with `results.empty` first.
print(results[['name', 'price', 'source']].to_string())
Enter fullscreen mode Exit fullscreen mode

Scaling with Proxies

When scraping multiple sites, you'll quickly hit rate limits. Using a proxy service like ThorData with rotating residential IPs solves this:

# Proxy gateway endpoint; credentials are embedded in the URL
# (http://USER:PASS@HOST:PORT), the scheme requests' proxies dict expects.
proxy_url = 'http://user:pass@proxy.thordata.com:9000'

# Each scraper's session routes all traffic through the same gateway;
# per the provider description above, exit IPs rotate on their side.
scrapers = [
    AmazonScraper(proxy_url=proxy_url),
    EbayScraper(proxy_url=proxy_url),
]
Enter fullscreen mode Exit fullscreen mode

Scheduling Price Checks

import schedule

def daily_check():
    """Compare prices for a fixed watchlist and print the best deal per query."""
    engine = PriceComparer()
    watchlist = ('wireless mouse', 'mechanical keyboard', 'usb-c hub')

    for term in watchlist:
        table = engine.compare(term)
        if not table.empty:
            cheapest = table.iloc[0]  # compare() returns rows sorted by price
            print(f'Best {term}: ${cheapest["price"]:.2f} at {cheapest["source"]}')
        # Brief pause between queries to stay polite with the target sites.
        time.sleep(5)

# Register the daily job. `schedule` only records due times; jobs run
# when run_pending() is called, so a polling loop is required.
schedule.every().day.at('09:00').do(daily_check)

while True:
    schedule.run_pending()
    time.sleep(60)  # wake once a minute — plenty for a once-a-day job
Enter fullscreen mode Exit fullscreen mode

Conclusion

A price comparison tool is a great way to learn multi-site scraping. The key challenges are normalizing data across sources and handling anti-bot measures. For serious scraping workloads, a reliable proxy service like ThorData keeps your scrapers running smoothly across all target sites.

The full code from this tutorial gives you a foundation to build on — add more retailers, implement price alerts, or build a web dashboard.

Top comments (0)