DEV Community

agenthustler
agenthustler

Posted on

How to Monitor Website Changes and Get Alerts with Python

How to Monitor Website Changes and Get Alerts with Python

Websites change constantly. Whether you track pricing, regulatory updates, or stock availability, automated change detection saves hours of manual checking.

Core Monitor

import requests
from bs4 import BeautifulSoup
import hashlib, json, sqlite3, difflib
from datetime import datetime
import time, re

class WebMonitor:
    def __init__(self, db_path='monitor.db', api_key=None):
        """Open the SQLite store and prepare a reusable HTTP session.

        db_path: SQLite file backing the watch list and change history.
        api_key: optional ScraperAPI key; when set, fetches are proxied.
        """
        self.api_key = api_key
        # One shared session so connection pooling and the UA header apply to every fetch.
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': 'ChangeMonitor/1.0'})
        self.db = sqlite3.connect(db_path)
        self._init_db()

    def _init_db(self):
        self.db.executescript('''
            CREATE TABLE IF NOT EXISTS pages (
                url TEXT PRIMARY KEY, selector TEXT,
                last_hash TEXT, last_content TEXT,
                last_checked DATETIME, interval INTEGER DEFAULT 3600);
            CREATE TABLE IF NOT EXISTS changes (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                url TEXT, old TEXT, new TEXT,
                change_type TEXT, detected_at DATETIME, summary TEXT);
        ''')

    def _fetch(self, url):
        if self.api_key:
            return self.session.get(
                f"http://api.scraperapi.com?api_key={self.api_key}&url={url}&render=true")
        return self.session.get(url)

    def add_watch(self, url, selector=None, interval=3600):
        self.db.execute('INSERT OR REPLACE INTO pages (url,selector,interval) VALUES (?,?,?)',
                       (url, selector, interval))
        self.db.commit()
        self._check(url, selector)
Enter fullscreen mode Exit fullscreen mode

Smart Change Detection

    def _extract(self, html, selector=None):
        """Return normalized visible text from *html*, scoped to *selector* if given."""
        soup = BeautifulSoup(html, 'html.parser')
        # Drop elements whose text content is noise rather than page content.
        for junk in soup.select('script, style, noscript, iframe'):
            junk.decompose()
        if not selector:
            text = soup.get_text(strip=True)
        else:
            matches = soup.select(selector)
            text = '\n'.join(node.get_text(strip=True) for node in matches)
        # Collapse whitespace runs so cosmetic reflow doesn't register as a change.
        return re.sub(r'\s+', ' ', text).strip()

    def _check(self, url, selector=None):
        try:
            resp = self._fetch(url)
            content = self._extract(resp.text, selector)
            h = hashlib.sha256(content.encode()).hexdigest()
            row = self.db.execute(
                'SELECT last_hash, last_content FROM pages WHERE url=?', (url,)
            ).fetchone()
            if row and row[0] and row[0] != h:
                change = self._analyze(row[1] or '', content)
                self.db.execute(
                    'INSERT INTO changes (url,old,new,change_type,detected_at,summary) VALUES (?,?,?,?,?,?)',
                    (url, (row[1] or '')[:1000], content[:1000], change['type'],
                     datetime.now().isoformat(), change['summary']))
                self.db.execute('UPDATE pages SET last_hash=?,last_content=?,last_checked=? WHERE url=?',
                    (h, content, datetime.now().isoformat(), url))
                self.db.commit()
                return change
            self.db.execute('UPDATE pages SET last_hash=?,last_content=?,last_checked=? WHERE url=?',
                (h, content, datetime.now().isoformat(), url))
            self.db.commit()
            return None
        except Exception as e:
            return {'error': str(e)}

    def _analyze(self, old, new):
        old_s = old.split('. ')
        new_s = new.split('. ')
        diff = list(difflib.unified_diff(old_s, new_s, lineterm=''))
        added = [l[1:] for l in diff if l.startswith('+') and not l.startswith('+++')]
        removed = [l[1:] for l in diff if l.startswith('-') and not l.startswith('---')]
        total = max(len(old), len(new), 1)
        ratio = (len(''.join(added)) + len(''.join(removed))) / total
        if ratio > 0.5: t = 'major_rewrite'
        elif ratio > 0.1: t = 'significant'
        elif ratio > 0.01: t = 'minor'
        else: t = 'trivial'
        return {'type': t, 'ratio': round(ratio, 4),
                'summary': f"+{len(added)}/-{len(removed)} sentences"}
Enter fullscreen mode Exit fullscreen mode

Alert System

class Alerts:
    """Fan a change notification out to a list of webhook URLs."""

    def __init__(self):
        self.webhooks = []

    def add_webhook(self, url):
        """Register a webhook endpoint (e.g. a Slack incoming-webhook URL)."""
        self.webhooks.append(url)

    def send(self, data):
        """POST a one-line summary of *data* to every registered webhook.

        Delivery is best-effort: a failing endpoint is skipped so one bad
        webhook can't block the others or crash the monitor loop.
        """
        msg = f"Change: {data.get('url','')} | {data.get('type','')} | {data.get('summary','')}"
        for wh in self.webhooks:
            try:
                requests.post(wh, json={'text': msg}, timeout=10)
            except requests.RequestException:
                # Was a bare `except:`, which also swallowed KeyboardInterrupt
                # and SystemExit; narrowed to HTTP/transport errors only.
                pass
Enter fullscreen mode Exit fullscreen mode

Running the Monitor

class Runner:
    """Drives the monitor: polls pages that are due and pushes alerts for real changes."""

    def __init__(self, monitor, alerts):
        self.monitor = monitor
        self.alerts = alerts

    def cycle(self):
        """Check every page whose polling interval has elapsed; return the alert count."""
        due = self.monitor.db.execute('''
            SELECT url, selector FROM pages
            WHERE last_checked IS NULL
            OR (julianday('now')-julianday(last_checked))*86400 > interval
        ''').fetchall()
        alerted = 0
        for page_url, page_selector in due:
            result = self.monitor._check(page_url, page_selector)
            # Alert only on successfully analyzed, non-trivial changes.
            if result and 'error' not in result and result.get('type') != 'trivial':
                result['url'] = page_url
                self.alerts.send(result)
                alerted += 1
            # Pause between pages so target sites aren't hammered back-to-back.
            time.sleep(2)
        return alerted

    def run_forever(self, interval=60):
        """Loop indefinitely, running one cycle every *interval* seconds."""
        while True:
            n = self.cycle()
            print(f"[{datetime.now()}] {n} changes found")
            time.sleep(interval)

if __name__ == '__main__':
    # Guarded entry point: previously these statements ran at import time,
    # opening the database, hitting the network, and looping forever whenever
    # the module was merely imported. Replace the key/webhook placeholders
    # with real credentials before running.
    monitor = WebMonitor(api_key='YOUR_KEY')
    alerts = Alerts()
    alerts.add_webhook('https://hooks.slack.com/services/YOUR/WEBHOOK')
    monitor.add_watch('https://example.com/pricing', '.pricing-table', 3600)
    Runner(monitor, alerts).run_forever()
Enter fullscreen mode Exit fullscreen mode

For hundreds of pages, ScraperAPI handles JS rendering. ThorData provides proxy rotation. Monitor health with ScrapeOps.


Follow for more Python monitoring tutorials.

Top comments (0)