Media monitoring — tracking brand mentions, competitor coverage, and industry news — is a service companies pay thousands for monthly. Here's how to build your own with Python and web scraping.
What We'll Build
A tool that:
- Monitors multiple news sources for keyword mentions
- Extracts article metadata (title, date, author, snippet)
- Deduplicates across sources
- Sends alerts for new mentions
Setup
pip install requests beautifulsoup4 pandas feedparser newspaper3k
RSS Feed Monitor
The fastest approach starts with RSS feeds, as many news sites still publish them:
import feedparser
from datetime import datetime
class RSSMonitor:
    """Polls RSS feeds and collects entries that mention tracked keywords."""

    def __init__(self, keywords):
        # Lowercase once so matching in check_feed is case-insensitive.
        self.keywords = [kw.lower() for kw in keywords]
        self.seen_urls = set()  # links already reported, to avoid duplicates

    def check_feed(self, feed_url, source_name):
        """Check an RSS feed for keyword mentions."""
        parsed = feedparser.parse(feed_url)
        hits = []
        for item in parsed.entries:
            title = item.get("title", "")
            summary = item.get("summary", "")
            link = item.get("link", "")
            if link in self.seen_urls:
                continue
            haystack = f"{title} {summary}".lower()
            matched = [kw for kw in self.keywords if kw in haystack]
            if not matched:
                continue
            self.seen_urls.add(link)
            hits.append({
                "source": source_name,
                "title": title,
                "url": link,
                "summary": summary[:300],
                "keywords_matched": matched,
                "published": item.get("published", ""),
                "found_at": datetime.now().isoformat(),
            })
        return hits
Web Scraping for Sites Without RSS
from urllib.parse import quote_plus, urljoin

import requests
from bs4 import BeautifulSoup
SCRAPER_API_KEY = "YOUR_KEY"

class WebMonitor:
    """Scrapes HTML news pages (sites without RSS) for keyword mentions in headlines."""

    def __init__(self, keywords):
        # Lowercase once so matching below is case-insensitive.
        self.keywords = [k.lower() for k in keywords]
        self.seen_urls = set()  # already-reported article URLs

    def scrape_news_site(self, base_url, source_name, article_selector, title_selector):
        """Scrape a news site for articles matching keywords.

        Args:
            base_url: page to scrape; also the base for resolving relative links.
            source_name: label recorded with each mention.
            article_selector: CSS selector for article containers.
            title_selector: CSS selector for the headline inside a container.

        Returns:
            List of mention dicts for newly-seen articles whose title matches
            at least one keyword.

        Raises:
            requests.HTTPError: if the scraping API returns an error status.
        """
        # quote_plus keeps any query string in base_url from being parsed as
        # parameters of the ScraperAPI request itself.
        api_url = (
            f"http://api.scraperapi.com?api_key={SCRAPER_API_KEY}"
            f"&url={quote_plus(base_url)}"
        )
        response = requests.get(api_url, timeout=30)
        response.raise_for_status()  # fail loudly instead of parsing an error page
        soup = BeautifulSoup(response.text, "html.parser")
        mentions = []
        for article in soup.select(article_selector):
            title_el = article.select_one(title_selector)
            link_el = article.select_one("a[href]")
            if not title_el or not link_el:
                continue
            title = title_el.text.strip()
            # urljoin resolves absolute, root-relative ("/x"), page-relative
            # ("x"), and protocol-relative ("//host/x") hrefs correctly; the
            # previous string concatenation mishandled everything but "/x".
            url = urljoin(base_url, link_el["href"])
            if url in self.seen_urls:
                continue
            matched = [k for k in self.keywords if k in title.lower()]
            if matched:
                self.seen_urls.add(url)
                mentions.append({
                    "source": source_name,
                    "title": title,
                    "url": url,
                    "keywords_matched": matched,
                    "found_at": datetime.now().isoformat(),
                })
        return mentions
Full Article Extraction
from newspaper import Article
def extract_full_article(url):
    """Download *url* and return the article body plus key metadata.

    On any failure (download, parse, or missing page) returns a dict with a
    single "error" key instead of raising, so callers can treat extraction
    as best-effort.
    """
    try:
        art = Article(url)
        art.download()
        art.parse()
        published = str(art.publish_date) if art.publish_date else ""
        return {
            "title": art.title,
            "authors": art.authors,
            "publish_date": published,
            "text": art.text,
            "top_image": art.top_image,
            "word_count": len(art.text.split()),
        }
    except Exception as exc:
        # Surface the failure to the caller rather than crashing the pipeline.
        return {"error": str(exc)}
The Monitoring Pipeline
import time
import json
# (source_name, feed_url) pairs polled by RSSMonitor.check_feed each cycle.
NEWS_FEEDS = [
    ("TechCrunch", "https://techcrunch.com/feed/"),
    ("The Verge", "https://www.theverge.com/rss/index.xml"),
    ("Ars Technica", "https://feeds.arstechnica.com/arstechnica/index"),
]
# (source_name, page_url, article_css_selector, title_css_selector) tuples
# passed to WebMonitor.scrape_news_site for sites without RSS feeds.
NEWS_SITES = [
    ("Reuters Tech", "https://www.reuters.com/technology/", "article", "h3"),
]
def run_monitoring(keywords, interval_minutes=30, max_cycles=None):
    """Poll all configured sources for *keywords* until interrupted.

    Args:
        keywords: terms to watch for (matched case-insensitively).
        interval_minutes: pause between polling cycles.
        max_cycles: stop after this many cycles; None (default) runs forever.

    Results are written to mentions.json after every cycle, and once more on
    Ctrl-C so an interrupt never loses the current cycle's findings.
    """
    rss = RSSMonitor(keywords)
    web = WebMonitor(keywords)
    all_mentions = []

    def _save():
        # Rewrite the full result set each time; cheap at this scale and
        # keeps the file valid JSON even if we die mid-run.
        with open("mentions.json", "w") as f:
            json.dump(all_mentions, f, indent=2)

    def _report(mentions):
        all_mentions.extend(mentions)
        for m in mentions:
            print(f"  NEW: [{m['source']}] {m['title']}")

    cycles = 0
    try:
        while max_cycles is None or cycles < max_cycles:
            cycles += 1
            print(f"\n[{datetime.now()}] Checking sources...")
            for name, url in NEWS_FEEDS:
                # A single unreachable feed must not kill the whole loop.
                try:
                    _report(rss.check_feed(url, name))
                except Exception as e:
                    print(f"  Error with {name}: {e}")
            for name, url, art_sel, title_sel in NEWS_SITES:
                try:
                    _report(web.scrape_news_site(url, name, art_sel, title_sel))
                except Exception as e:
                    print(f"  Error with {name}: {e}")
                time.sleep(3)  # be polite between scraped sites
            _save()
            print(f"Total mentions: {len(all_mentions)}")
            if max_cycles is None or cycles < max_cycles:
                time.sleep(interval_minutes * 60)
    except KeyboardInterrupt:
        # Persist whatever this cycle collected before exiting.
        _save()
        print(f"\nStopped. Total mentions: {len(all_mentions)}")
# Run it — guarded so importing this module doesn't start an infinite loop.
if __name__ == "__main__":
    run_monitoring(["openai", "anthropic", "google ai", "llm"])
Scaling Considerations
For production media monitoring, ScraperAPI handles proxy rotation across news sites reliably. ThorData residential proxies help with geo-restricted content. Track your scraper health metrics with ScrapeOps.
Conclusion
A custom media monitoring tool gives you real-time awareness without expensive SaaS subscriptions. Start with RSS feeds for speed, add web scraping for coverage, and build alerts from there.
Top comments (1)
Some comments may only be visible to logged-in visitors. Sign in to view all comments.