Most competitive intelligence workflows look like this: open browser, visit competitor site, manually check for changes, copy data to spreadsheet, repeat tomorrow. This guide automates that entire workflow with an AI agent that runs on a schedule.
The agent: scrapes multiple competitor pages daily, uses Claude to summarize what changed, and sends you a Telegram or Slack message with only the information that matters.
What the Agent Does
- Scrapes a list of URLs on a schedule
- Detects changes by comparing to yesterday's snapshot
- Summarizes changes in plain English using Claude
- Filters noise — only notifies you when something materially changed
- Delivers a daily digest to Telegram or Slack
This replaces about 2 hours of daily manual monitoring.
Project Structure
research_agent/
├── agent.py # Main orchestration
├── scraper.py # Page fetching and change detection
├── summarizer.py # Claude-powered summarization
├── notifier.py # Telegram/Slack delivery
├── storage.py # SQLite snapshots
├── config.yaml # Target URLs and settings
└── requirements.txt
Step 1: Storage (Page Snapshots)
# storage.py
import hashlib
import sqlite3
from datetime import datetime, timezone
from typing import Optional
class SnapshotDB:
    """SQLite-backed store of page snapshots, keyed by (url, scraped_at)."""

    def __init__(self, db_path: str = "snapshots.db"):
        # check_same_thread=False allows reuse across threads; callers are
        # responsible for serializing writes.
        self.conn = sqlite3.connect(db_path, check_same_thread=False)
        self._init_db()

    def _init_db(self):
        """Create the snapshots table and its lookup index if missing."""
        self.conn.execute("""
            CREATE TABLE IF NOT EXISTS snapshots (
                url TEXT,
                content_hash TEXT,
                content TEXT,
                scraped_at TIMESTAMP,
                PRIMARY KEY (url, scraped_at)
            )
        """)
        self.conn.execute("""
            CREATE INDEX IF NOT EXISTS idx_url_time
            ON snapshots(url, scraped_at DESC)
        """)
        self.conn.commit()

    def save_snapshot(self, url: str, content: str) -> str:
        """Save a snapshot of *content* for *url*; return its 16-hex-char hash."""
        hash_ = hashlib.sha256(content.encode()).hexdigest()[:16]
        # Store a timezone-aware ISO-8601 string: datetime.utcnow() and the
        # implicit sqlite3 datetime adapter are both deprecated (Python 3.12),
        # and ISO strings sort correctly under ORDER BY.
        scraped_at = datetime.now(timezone.utc).isoformat()
        self.conn.execute(
            "INSERT INTO snapshots VALUES (?, ?, ?, ?)",
            (url, hash_, content, scraped_at)
        )
        self.conn.commit()
        return hash_

    def get_previous(self, url: str, skip_latest: bool = True) -> Optional[str]:
        """Return stored snapshot content for *url*.

        With skip_latest=True (default), return the snapshot *before* the most
        recent one; with skip_latest=False, return the most recent snapshot.
        Returns None when no such snapshot exists.
        """
        cursor = self.conn.execute(
            # rowid breaks ties deterministically if timestamps ever collide
            "SELECT content FROM snapshots WHERE url=? "
            "ORDER BY scraped_at DESC, rowid DESC LIMIT ?",
            (url, 2 if skip_latest else 1)
        )
        rows = cursor.fetchall()
        if skip_latest:
            return rows[1][0] if len(rows) >= 2 else None
        return rows[0][0] if rows else None

    def get_latest_hash(self, url: str) -> Optional[str]:
        """Return the content hash of the most recent snapshot, or None."""
        cursor = self.conn.execute(
            "SELECT content_hash FROM snapshots WHERE url=? "
            "ORDER BY scraped_at DESC, rowid DESC LIMIT 1",
            (url,)
        )
        row = cursor.fetchone()
        return row[0] if row else None
Step 2: Scraper
# scraper.py
import httpx
from selectolax.parser import HTMLParser
import asyncio
class PageScraper:
    """Async HTTP fetcher that returns cleaned, boilerplate-free page text."""

    def __init__(self):
        # Browser-like headers reduce the chance of trivial bot blocking.
        self.client = httpx.AsyncClient(
            headers={
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0",
                "Accept-Language": "en-US,en;q=0.5",
            },
            follow_redirects=True,
            timeout=20,
        )

    async def fetch_text(self, url: str) -> str:
        """Fetch *url* and return its cleaned text.

        Errors are reported in-band as a "FETCH_ERROR: ..." string so the
        caller's loop never has to handle exceptions per target.
        """
        try:
            response = await self.client.get(url)
            response.raise_for_status()
            return self._clean(response.text)
        except Exception as exc:
            return f"FETCH_ERROR: {exc}"

    def _clean(self, html: str) -> str:
        """Strip scripts, styles, and page chrome; return the main text."""
        tree = HTMLParser(html)
        for node in tree.css("script, style, nav, footer, header, .cookie-banner"):
            node.decompose()
        # Prefer a dedicated content region when the page declares one.
        candidates = ("main", "article", ".content", "#main-content")
        region = next(
            (n for n in map(tree.css_first, candidates) if n is not None),
            None,
        )
        if region is not None:
            return region.text(separator="\n", strip=True)
        return tree.body.text(separator="\n", strip=True) if tree.body else ""

    def has_changed(self, url: str, current_content: str, db: 'SnapshotDB') -> bool:
        """True when *current_content* hashes differently from the latest stored hash."""
        import hashlib
        digest = hashlib.sha256(current_content.encode()).hexdigest()[:16]
        return db.get_latest_hash(url) != digest

    async def close(self):
        """Dispose of the underlying HTTP client."""
        await self.client.aclose()
Step 3: AI Summarizer
# summarizer.py
import anthropic
from typing import Optional
class ChangeSummarizer:
    """Uses Claude to turn before/after page text into short change summaries."""

    # Sentinel the model is instructed to emit when nothing meaningful changed.
    NO_CHANGE_SENTINEL = "NO_MATERIAL_CHANGE"

    def __init__(self, model: str = "claude-3-haiku-20240307"):
        # Anthropic() reads ANTHROPIC_API_KEY from the environment.
        self.client = anthropic.Anthropic()
        self.model = model

    @staticmethod
    def _is_no_change(summary: str) -> bool:
        """True when *summary* is the no-change sentinel.

        Models sometimes wrap the sentinel in quotes or add punctuation, so an
        exact-equality check would leak "no change" replies into the digest.
        The length guard avoids misfiring on a real summary that merely
        mentions the sentinel.
        """
        return (ChangeSummarizer.NO_CHANGE_SENTINEL in summary
                and len(summary) < 40)

    def summarize_change(self, url: str, before: str, after: str,
                         context: str = "") -> Optional[str]:
        """Summarize the diff between two page versions with Claude.

        Returns None when the model reports no material change. Both versions
        are truncated to 3,000 characters to bound token cost.
        """
        before_trunc = before[:3000]
        after_trunc = after[:3000]
        prompt = f"""Compare these two versions of a webpage and summarize what changed.
URL: {url}
{f'Context: {context}' if context else ''}
BEFORE:
{before_trunc}
AFTER:
{after_trunc}
Instructions:
- Focus only on meaningful business/content changes (pricing, features, team changes, new products)
- Ignore trivial changes (whitespace, dates in timestamps, view counts)
- If nothing meaningful changed, respond with exactly: "NO_MATERIAL_CHANGE"
- Keep summary to 2-3 sentences maximum
- Be specific: mention exact numbers, names, or features that changed"""
        message = self.client.messages.create(
            model=self.model,
            max_tokens=256,
            messages=[{"role": "user", "content": prompt}]
        )
        summary = message.content[0].text.strip()
        if self._is_no_change(summary):
            return None
        return summary

    def summarize_new_page(self, url: str, content: str,
                           what_to_watch: str) -> str:
        """Produce a baseline summary the first time a URL is monitored.

        Content is truncated to 4,000 characters to bound token cost.
        """
        prompt = f"""Summarize the key information from this webpage that's relevant to: {what_to_watch}
URL: {url}
Content: {content[:4000]}
Return a 2-3 sentence summary focusing on: {what_to_watch}"""
        message = self.client.messages.create(
            model=self.model,
            max_tokens=256,
            messages=[{"role": "user", "content": prompt}]
        )
        return message.content[0].text.strip()
Step 4: Notifier
# notifier.py
import httpx
import os
class TelegramNotifier:
    """Delivers digest messages through the Telegram Bot API."""

    # Hard limit on sendMessage text length imposed by the Telegram API.
    MAX_LEN = 4096

    def __init__(self, token: str, chat_id: str):
        self.token = token
        self.chat_id = chat_id
        self.base = f"https://api.telegram.org/bot{token}"

    def send(self, message: str):
        """Send *message*, splitting it into API-sized chunks.

        Raises httpx.HTTPStatusError on a non-2xx response (e.g. bad token or
        chat id) instead of failing silently.
        """
        for chunk in self._split(message):
            resp = httpx.post(f"{self.base}/sendMessage", json={
                "chat_id": self.chat_id,
                "text": chunk,
                "parse_mode": "HTML",
                "disable_web_page_preview": True
            }, timeout=15)
            resp.raise_for_status()

    @staticmethod
    def _split(text: str, limit: int = MAX_LEN) -> list:
        """Split *text* into chunks of at most *limit* chars.

        Prefers newline boundaries so digest entries (and their HTML tags,
        which Telegram rejects when cut mid-entity) stay intact.
        """
        chunks = []
        while len(text) > limit:
            cut = text.rfind("\n", 0, limit)
            if cut <= 0:  # no usable newline in window: hard cut
                cut = limit
            chunks.append(text[:cut])
            text = text[cut:].lstrip("\n")
        if text:
            chunks.append(text)
        return chunks
class SlackNotifier:
    """Delivers digest messages to a Slack incoming webhook."""

    def __init__(self, webhook_url: str):
        self.webhook_url = webhook_url

    def send(self, message: str):
        """POST *message* to the webhook.

        Raises httpx.HTTPStatusError on a non-2xx response (e.g. a revoked
        webhook URL) instead of failing silently.
        """
        resp = httpx.post(self.webhook_url, json={"text": message}, timeout=15)
        resp.raise_for_status()
Step 5: Main Agent
# agent.py
import asyncio
import yaml
from datetime import datetime
from scraper import PageScraper
from summarizer import ChangeSummarizer
from storage import SnapshotDB
from notifier import TelegramNotifier
async def run_agent():
    """Run one monitoring pass: scrape targets, summarize changes, send digest."""
    from datetime import timezone  # local import: file only imports `datetime`

    with open("config.yaml") as f:
        config = yaml.safe_load(f)

    db = SnapshotDB()
    scraper = PageScraper()
    summarizer = ChangeSummarizer(model=config.get("model", "claude-3-haiku-20240307"))
    notifier = TelegramNotifier(
        token=config["telegram_token"],
        chat_id=config["telegram_chat_id"]
    )

    changes = []
    try:
        for target in config["targets"]:
            url = target["url"]
            context = target.get("watch_for", "")
            print(f"Checking: {url}")

            # Fetch current content
            current = await scraper.fetch_text(url)
            # startswith, not `in`: a page whose body legitimately contains
            # the words "FETCH_ERROR" must not be skipped.
            if current.startswith("FETCH_ERROR"):
                print(f" Error: {current}")
                continue

            # Most recent stored snapshot (i.e. the previous run's content)
            previous = db.get_previous(url, skip_latest=False)
            if previous is None:
                # First time seeing this URL: baseline summary, nothing to diff.
                summary = summarizer.summarize_new_page(url, current, context)
                db.save_snapshot(url, current)
                changes.append(f"📍 <b>New monitor: {target['name']}</b>\n{summary}\n{url}")
            elif scraper.has_changed(url, current, db):
                # Content changed - get AI summary
                summary = summarizer.summarize_change(url, previous, current, context)
                db.save_snapshot(url, current)
                if summary:  # None means "no material change"
                    changes.append(f"🔔 <b>{target['name']}</b>\n{summary}\n<a href='{url}'>View</a>")
            else:
                # No change detected; snapshot anyway so "previous" stays fresh.
                db.save_snapshot(url, current)
    finally:
        # Close the HTTP client even if a target raises mid-loop.
        await scraper.close()

    # Send digest only when something material happened
    if changes:
        # datetime.utcnow() is deprecated; use an aware UTC timestamp.
        date = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
        message = f"<b>🤖 Research Agent Digest — {date}</b>\n\n"
        message += "\n\n".join(changes)
        notifier.send(message)
        print(f"Sent digest with {len(changes)} changes")
    else:
        print("No material changes detected")


# Guarded entry point: importing this module (e.g. from tests or another
# scheduler) must not trigger a scrape run.
if __name__ == "__main__":
    asyncio.run(run_agent())
Config File
# config.yaml
# Claude model used for summarization (Haiku keeps per-run cost minimal).
model: claude-3-haiku-20240307
# Telegram bot credentials; chat_id identifies the digest recipient.
telegram_token: "your-bot-token"
telegram_chat_id: "your-chat-id"
# Pages to monitor. `watch_for` is passed to Claude as context so summaries
# focus on the changes you actually care about.
targets:
  - name: "Competitor A — Pricing Page"
    url: "https://competitor-a.com/pricing"
    watch_for: "pricing, plan names, feature limits"
  - name: "Competitor B — Jobs"
    url: "https://competitor-b.com/jobs"
    watch_for: "new job postings, especially engineering and sales roles"
  - name: "Industry News — TechCrunch Tag"
    url: "https://techcrunch.com/tag/your-industry/"
    watch_for: "new funding rounds, acquisitions, product launches"
  - name: "Competitor A — Changelog"
    url: "https://competitor-a.com/changelog"
    watch_for: "new features, API changes, deprecations"
Running on a Schedule
# crontab -e
# Run every day at 8am UTC
0 8 * * * cd /opt/research-agent && python agent.py >> logs/agent.log 2>&1
Or with Docker:
# docker-compose.yml
services:
  agent:
    build: .
    # Keep the container alive; ofelia execs `python agent.py` on schedule.
    # (Running the agent as the container command with restart: unless-stopped
    # would re-run it in a tight loop, independent of the scheduler.)
    command: sleep infinity
    volumes:
      - ./snapshots.db:/app/snapshots.db
    environment:
      - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}
    restart: unless-stopped
    # ofelia job-exec labels belong on the TARGET container, not on the
    # ofelia service itself.
    labels:
      ofelia.enabled: "true"
      ofelia.job-exec.agent.schedule: "@daily"
      ofelia.job-exec.agent.command: "python agent.py"

  scheduler:
    image: mcuadros/ofelia:latest
    command: daemon --docker
    # ofelia discovers labeled containers through the Docker socket.
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
    depends_on:
      - agent
Cost at Scale
Monitoring 20 competitor pages daily with Claude Haiku:
- Per page: ~1,500 tokens in + 200 tokens out ≈ $0.0006 (at Haiku's $0.25/MTok input, $1.25/MTok output)
- 20 pages × $0.0006 ≈ $0.013/day
- Monthly: ~$0.38
Less than the cost of a cup of coffee per month for continuous competitive intelligence.
Data Collection Layer
If you want structured data (pricing tables, job listings, product specs) rather than raw text summaries, use pre-built scrapers as the data collection layer in your agent pipeline.
Apify Scrapers Bundle — €29 — 35 production actors for structured data extraction. Feed their output into Claude for analysis and alerting.
Top comments (0)