How to Monitor Website Changes and Get Alerts with Python
Websites change constantly. Whether you track pricing, regulatory updates, or stock availability, automated change detection saves hours of manual checking.
Core Monitor
import requests
from bs4 import BeautifulSoup
import hashlib, json, sqlite3, difflib
from datetime import datetime
import time, re
class WebMonitor:
    """Tracks web pages for content changes, persisting state in SQLite."""

    def __init__(self, db_path='monitor.db', api_key=None):
        """Open (or create) the state database and prepare an HTTP session.

        db_path: SQLite file holding the watch list and change history.
        api_key: optional ScraperAPI key; when set, fetches are proxied
                 through the rendering API instead of hitting the URL directly.
        """
        self.db = sqlite3.connect(db_path)
        self.api_key = api_key
        self.session = requests.Session()
        # Identify the bot explicitly; some sites reject blank user agents.
        self.session.headers.update({'User-Agent': 'ChangeMonitor/1.0'})
        self._init_db()

    def _init_db(self):
        """Create the `pages` (watch list) and `changes` (history) tables.

        Idempotent thanks to IF NOT EXISTS, so re-opening an existing
        database is safe.
        """
        self.db.executescript('''
            CREATE TABLE IF NOT EXISTS pages (
                url TEXT PRIMARY KEY, selector TEXT,
                last_hash TEXT, last_content TEXT,
                last_checked DATETIME, interval INTEGER DEFAULT 3600);
            CREATE TABLE IF NOT EXISTS changes (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                url TEXT, old TEXT, new TEXT,
                change_type TEXT, detected_at DATETIME, summary TEXT);
        ''')

    def _fetch(self, url):
        """GET a page, optionally through the ScraperAPI rendering proxy.

        Fix: the target URL is now passed via `params` so requests
        URL-encodes it — the original f-string produced a broken request
        for any URL containing '&' or '#'. Also switched to HTTPS and
        added timeouts so a hung server cannot stall the monitor loop.
        """
        if self.api_key:
            return self.session.get(
                'https://api.scraperapi.com',
                params={'api_key': self.api_key, 'url': url, 'render': 'true'},
                timeout=60)
        return self.session.get(url, timeout=30)

    def add_watch(self, url, selector=None, interval=3600):
        """Register (or re-register) a page and take an immediate baseline.

        url:      page to monitor (primary key — re-adding resets its state).
        selector: optional CSS selector restricting what is monitored.
        interval: minimum seconds between checks (default one hour).
        """
        self.db.execute('INSERT OR REPLACE INTO pages (url,selector,interval) VALUES (?,?,?)',
                        (url, selector, interval))
        self.db.commit()
        # Baseline check so the next cycle can diff against something.
        self._check(url, selector)
Smart Change Detection
def _extract(self, html, selector=None):
soup = BeautifulSoup(html, 'html.parser')
for tag in soup.select('script, style, noscript, iframe'):
tag.decompose()
if selector:
els = soup.select(selector)
content = '\n'.join(e.get_text(strip=True) for e in els)
else:
content = soup.get_text(strip=True)
return re.sub(r'\s+', ' ', content).strip()
def _check(self, url, selector=None):
try:
resp = self._fetch(url)
content = self._extract(resp.text, selector)
h = hashlib.sha256(content.encode()).hexdigest()
row = self.db.execute(
'SELECT last_hash, last_content FROM pages WHERE url=?', (url,)
).fetchone()
if row and row[0] and row[0] != h:
change = self._analyze(row[1] or '', content)
self.db.execute(
'INSERT INTO changes (url,old,new,change_type,detected_at,summary) VALUES (?,?,?,?,?,?)',
(url, (row[1] or '')[:1000], content[:1000], change['type'],
datetime.now().isoformat(), change['summary']))
self.db.execute('UPDATE pages SET last_hash=?,last_content=?,last_checked=? WHERE url=?',
(h, content, datetime.now().isoformat(), url))
self.db.commit()
return change
self.db.execute('UPDATE pages SET last_hash=?,last_content=?,last_checked=? WHERE url=?',
(h, content, datetime.now().isoformat(), url))
self.db.commit()
return None
except Exception as e:
return {'error': str(e)}
def _analyze(self, old, new):
old_s = old.split('. ')
new_s = new.split('. ')
diff = list(difflib.unified_diff(old_s, new_s, lineterm=''))
added = [l[1:] for l in diff if l.startswith('+') and not l.startswith('+++')]
removed = [l[1:] for l in diff if l.startswith('-') and not l.startswith('---')]
total = max(len(old), len(new), 1)
ratio = (len(''.join(added)) + len(''.join(removed))) / total
if ratio > 0.5: t = 'major_rewrite'
elif ratio > 0.1: t = 'significant'
elif ratio > 0.01: t = 'minor'
else: t = 'trivial'
return {'type': t, 'ratio': round(ratio, 4),
'summary': f"+{len(added)}/-{len(removed)} sentences"}
Alert System
class Alerts:
    """Fans change notifications out to registered webhook endpoints."""

    def __init__(self):
        # Webhook URLs that receive a POST for every alert.
        self.webhooks = []

    def add_webhook(self, url):
        """Register a webhook URL (e.g. a Slack incoming webhook)."""
        self.webhooks.append(url)

    def send(self, data):
        """POST a one-line summary of *data* to every registered webhook.

        Delivery is best-effort: a failing endpoint is skipped so one bad
        webhook cannot block the rest. Fix: the original bare `except:`
        also swallowed KeyboardInterrupt/SystemExit; now only HTTP-request
        failures are ignored.
        """
        msg = f"Change: {data.get('url','')} | {data.get('type','')} | {data.get('summary','')}"
        for wh in self.webhooks:
            try:
                requests.post(wh, json={'text': msg}, timeout=10)
            except requests.RequestException:
                pass
Running the Monitor
class Runner:
    """Drives the polling loop: checks due pages and fans out alerts."""

    def __init__(self, monitor, alerts):
        self.monitor = monitor
        self.alerts = alerts

    def cycle(self):
        """Run one scheduling pass over every page whose interval elapsed.

        Pages never checked (last_checked IS NULL) are always due. Alerts
        fire only for successful checks that found a non-trivial change.
        Returns the number of alerts sent this pass.
        """
        due = self.monitor.db.execute('''
SELECT url, selector FROM pages
WHERE last_checked IS NULL
OR (julianday('now')-julianday(last_checked))*86400 > interval
''')
        alerted = 0
        for url, selector in due.fetchall():
            result = self.monitor._check(url, selector)
            if result and 'error' not in result and result.get('type') != 'trivial':
                result['url'] = url
                self.alerts.send(result)
                alerted += 1
            # Small pause between fetches to stay polite to target sites.
            time.sleep(2)
        return alerted

    def run_forever(self, interval=60):
        """Loop indefinitely, sleeping *interval* seconds between passes."""
        while True:
            found = self.cycle()
            print(f"[{datetime.now()}] {found} changes found")
            time.sleep(interval)
# --- Example wiring ----------------------------------------------------------
# NOTE(review): this runs at import time and blocks forever; wrap it in an
# `if __name__ == "__main__":` guard before reusing it as a module.
# Replace 'YOUR_KEY' with a real ScraperAPI key, or pass api_key=None to
# fetch pages directly.
monitor = WebMonitor(api_key='YOUR_KEY')
alerts = Alerts()
# Point alerts at your Slack incoming-webhook URL.
alerts.add_webhook('https://hooks.slack.com/services/YOUR/WEBHOOK')
# Watch only the pricing table, re-checking at most once per hour.
monitor.add_watch('https://example.com/pricing', '.pricing-table', 3600)
# One scheduling pass per minute (the default interval=60).
Runner(monitor, alerts).run_forever()
When you scale to hundreds of pages, ScraperAPI handles JavaScript rendering, ThorData provides proxy rotation, and ScrapeOps monitors the health of your scrapers.
Follow for more Python monitoring tutorials.
Top comments (0)