Media monitoring — tracking brand mentions, competitor coverage, and industry news — is a service companies pay thousands of dollars for every month. Here's how to build your own with Python and web scraping.
What We'll Build
A tool that:
- Monitors multiple news sources for keyword mentions
- Extracts article metadata (title, date, author, snippet)
- Deduplicates across sources
- Sends alerts for new mentions
Setup
pip install requests beautifulsoup4 pandas feedparser newspaper3k
RSS Feed Monitor
The fastest approach starts with RSS feeds, as many news sites still publish them:
import feedparser
from datetime import datetime
class RSSMonitor:
    """Watch RSS feeds for entries that mention any of a set of keywords."""

    def __init__(self, keywords):
        # Lower-case once so matching is case-insensitive everywhere.
        self.keywords = [k.lower() for k in keywords]
        # URLs already reported, so repeat polls don't re-alert.
        self.seen_urls = set()

    def check_feed(self, feed_url, source_name):
        """Check an RSS feed for keyword mentions."""
        parsed = feedparser.parse(feed_url)
        hits = []
        for entry in parsed.entries:
            title = entry.get("title", "")
            summary = entry.get("summary", "")
            link = entry.get("link", "")
            if link in self.seen_urls:
                continue
            haystack = f"{title} {summary}".lower()
            matched = [k for k in self.keywords if k in haystack]
            if not matched:
                continue
            self.seen_urls.add(link)
            hits.append({
                "source": source_name,
                "title": title,
                "url": link,
                "summary": summary[:300],
                "keywords_matched": matched,
                "published": entry.get("published", ""),
                "found_at": datetime.now().isoformat(),
            })
        return hits
Web Scraping for Sites Without RSS
import requests
from bs4 import BeautifulSoup
# ScraperAPI credential placeholder — replace with a real key
# (preferably loaded from an environment variable, not hard-coded).
SCRAPER_API_KEY = "YOUR_KEY"
class WebMonitor:
    """Scrape HTML news listing pages (via ScraperAPI) for keyword mentions."""

    def __init__(self, keywords):
        # Lower-case once so matching is case-insensitive everywhere.
        self.keywords = [k.lower() for k in keywords]
        # URLs already reported, so repeat polls don't re-alert.
        self.seen_urls = set()

    def scrape_news_site(self, base_url, source_name, article_selector, title_selector):
        """Scrape a news site for articles matching keywords.

        Args:
            base_url: Listing page to fetch; also used to resolve relative links.
            source_name: Human-readable label stored with each mention.
            article_selector: CSS selector matching one article container.
            title_selector: CSS selector for the headline inside a container.

        Returns:
            List of mention dicts for previously-unseen matching articles.

        Raises:
            requests.HTTPError: if the proxy API responds with an error status.
        """
        from urllib.parse import urlencode, urljoin

        # urlencode the target URL: query strings or special characters in
        # base_url would otherwise corrupt the ScraperAPI request.
        api_url = "http://api.scraperapi.com?" + urlencode(
            {"api_key": SCRAPER_API_KEY, "url": base_url}
        )
        response = requests.get(api_url, timeout=30)
        # Fail loudly instead of silently parsing an error page as news.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        mentions = []
        for article in soup.select(article_selector):
            title_el = article.select_one(title_selector)
            link_el = article.select_one("a[href]")
            if not title_el or not link_el:
                continue
            title = title_el.text.strip()
            # urljoin handles absolute, root-relative, AND page-relative
            # hrefs; the old rstrip("/") + href concatenation produced broken
            # URLs for page-relative links (no leading "/").
            url = urljoin(base_url, link_el["href"])
            if url in self.seen_urls:
                continue
            matched = [k for k in self.keywords if k in title.lower()]
            if matched:
                self.seen_urls.add(url)
                mentions.append({
                    "source": source_name,
                    "title": title,
                    "url": url,
                    "keywords_matched": matched,
                    "found_at": datetime.now().isoformat(),
                })
        return mentions
Full Article Extraction
from newspaper import Article
def extract_full_article(url):
    """Download one article and return its text plus basic metadata.

    Returns a dict with title/authors/publish date/text/top image/word
    count on success, or {"error": <message>} if anything fails
    (best-effort: callers should check for the "error" key).
    """
    try:
        piece = Article(url)
        piece.download()
        piece.parse()
        published = piece.publish_date
        return {
            "title": piece.title,
            "authors": piece.authors,
            "publish_date": str(published) if published else "",
            "text": piece.text,
            "top_image": piece.top_image,
            "word_count": len(piece.text.split()),
        }
    except Exception as exc:
        return {"error": str(exc)}
The Monitoring Pipeline
import time
import json
# (source label, RSS feed URL) pairs polled via RSSMonitor.
NEWS_FEEDS = [
("TechCrunch", "https://techcrunch.com/feed/"),
("The Verge", "https://www.theverge.com/rss/index.xml"),
("Ars Technica", "https://feeds.arstechnica.com/arstechnica/index"),
]
# (source label, listing URL, article CSS selector, title CSS selector)
# tuples scraped via WebMonitor for sites without usable RSS.
NEWS_SITES = [
("Reuters Tech", "https://www.reuters.com/technology/", "article", "h3"),
]
def run_monitoring(keywords, interval_minutes=30):
    """Poll all configured sources for keyword mentions until interrupted.

    Args:
        keywords: Terms matched case-insensitively in titles/summaries.
        interval_minutes: Minutes to sleep between polling cycles.

    Side effects: prints new mentions and rewrites mentions.json each cycle.
    Runs forever (infinite loop); stop with Ctrl-C.
    """
    rss = RSSMonitor(keywords)
    web = WebMonitor(keywords)
    all_mentions = []
    while True:
        print(f"\n[{datetime.now()}] Checking sources...")
        for name, url in NEWS_FEEDS:
            # Fix: feed checks now get the same error isolation as web
            # scrapes did, so one unreachable feed can't kill the loop.
            try:
                mentions = rss.check_feed(url, name)
            except Exception as e:
                print(f" Error with {name}: {e}")
                continue
            all_mentions.extend(mentions)
            for m in mentions:
                print(f" NEW: [{m['source']}] {m['title']}")
        for name, url, art_sel, title_sel in NEWS_SITES:
            try:
                mentions = web.scrape_news_site(url, name, art_sel, title_sel)
                all_mentions.extend(mentions)
                for m in mentions:
                    print(f" NEW: [{m['source']}] {m['title']}")
            except Exception as e:
                print(f" Error with {name}: {e}")
            # Politeness delay between site scrapes.
            time.sleep(3)
        with open("mentions.json", "w") as f:
            json.dump(all_mentions, f, indent=2)
        print(f"Total mentions: {len(all_mentions)}")
        time.sleep(interval_minutes * 60)
# Example: monitor AI-industry keywords. Note this blocks forever
# (run_monitoring loops until interrupted).
run_monitoring(["openai", "anthropic", "google ai", "llm"])
Scaling Considerations
For production media monitoring, ScraperAPI handles proxy rotation across news sites reliably. ThorData residential proxies help with geo-restricted content. Track your scraper health metrics with ScrapeOps.
Conclusion
A custom media monitoring tool gives you real-time awareness without expensive SaaS subscriptions. Start with RSS feeds for speed, add web scraping for coverage, and build alerts from there.
Top comments (0)