DEV Community

agenthustler
agenthustler

Posted on

Building a Media Monitoring Tool: Mentions Across News Sites

Media monitoring — tracking brand mentions, competitor coverage, and industry news — is a service companies pay thousands for monthly. Here's how to build your own with Python and web scraping.

What We'll Build

A tool that:

  • Monitors multiple news sources for keyword mentions
  • Extracts article metadata (title, date, author, snippet)
  • Deduplicates across sources
  • Sends alerts for new mentions

Setup

pip install requests beautifulsoup4 pandas feedparser newspaper3k
Enter fullscreen mode Exit fullscreen mode

RSS Feed Monitor

The fastest approach starts with RSS feeds, as many news sites still publish them:

import feedparser
from datetime import datetime

class RSSMonitor:
    """Watch RSS/Atom feeds for entries whose title or summary mentions a keyword."""

    def __init__(self, keywords):
        # Lowercase once up front so every match is case-insensitive.
        self.keywords = [kw.lower() for kw in keywords]
        # URLs already reported — keeps repeat polls from re-alerting.
        self.seen_urls = set()

    def check_feed(self, feed_url, source_name):
        """Check an RSS feed for keyword mentions."""
        hits = []

        for entry in feedparser.parse(feed_url).entries:
            link = entry.get("link", "")
            if link in self.seen_urls:
                continue

            title = entry.get("title", "")
            summary = entry.get("summary", "")

            # Match against title + summary, case-folded.
            haystack = (title + " " + summary).lower()
            matched = [kw for kw in self.keywords if kw in haystack]
            if not matched:
                continue

            self.seen_urls.add(link)
            hits.append({
                "source": source_name,
                "title": title,
                "url": link,
                "summary": summary[:300],
                "keywords_matched": matched,
                "published": entry.get("published", ""),
                "found_at": datetime.now().isoformat(),
            })

        return hits
Enter fullscreen mode Exit fullscreen mode

Web Scraping for Sites Without RSS

import requests
from bs4 import BeautifulSoup

SCRAPER_API_KEY = "YOUR_KEY"  # ScraperAPI credential — replace before running (better: read from an env var)

class WebMonitor:
    """Scrape article-listing pages of news sites (no RSS) for keyword mentions."""

    def __init__(self, keywords):
        # Lowercase once up front so every match is case-insensitive.
        self.keywords = [k.lower() for k in keywords]
        # URLs already reported — prevents duplicate alerts across polls.
        self.seen_urls = set()

    def scrape_news_site(self, base_url, source_name, article_selector, title_selector):
        """Scrape a news site listing page for articles matching keywords.

        Args:
            base_url: Listing page to fetch; also the base for relative links.
            source_name: Human-readable label stored with each mention.
            article_selector: CSS selector for one article container.
            title_selector: CSS selector for the title inside a container.

        Returns:
            List of mention dicts for newly seen, keyword-matching articles.

        Raises:
            requests.HTTPError: if the proxied request returns a non-2xx status.
        """
        from urllib.parse import urljoin  # stdlib; local so the snippet stays self-contained

        api_url = (
            f"http://api.scraperapi.com?api_key={SCRAPER_API_KEY}"
            f"&url={base_url}"
        )
        response = requests.get(api_url, timeout=30)
        # Fail loudly instead of silently parsing an error page as "articles".
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        mentions = []
        for article in soup.select(article_selector):
            title_el = article.select_one(title_selector)
            link_el = article.select_one("a[href]")

            if not title_el or not link_el:
                continue

            title = title_el.text.strip()
            # BUG FIX: the old `base_url.rstrip("/") + url` mangled root-relative
            # paths (resolved them under the listing path instead of the host root),
            # broke hrefs without a leading slash, and broke protocol-relative
            # "//host/..." links. urljoin handles all RFC 3986 cases correctly
            # and leaves absolute http(s) URLs untouched.
            url = urljoin(base_url, link_el["href"])

            if url in self.seen_urls:
                continue

            # Listing pages only expose the headline, so match on title alone.
            matched = [k for k in self.keywords if k in title.lower()]

            if matched:
                self.seen_urls.add(url)
                mentions.append({
                    "source": source_name,
                    "title": title,
                    "url": url,
                    "keywords_matched": matched,
                    "found_at": datetime.now().isoformat()
                })

        return mentions
Enter fullscreen mode Exit fullscreen mode

Full Article Extraction

from newspaper import Article

def extract_full_article(url):
    """Download *url* and return the article's full text plus basic metadata.

    On any failure (download, parse) a dict with a single "error" key is
    returned instead of raising, so callers can log and move on.
    """
    try:
        piece = Article(url)
        piece.download()
        piece.parse()

        body = piece.text
        return {
            "title": piece.title,
            "authors": piece.authors,
            "publish_date": str(piece.publish_date) if piece.publish_date else "",
            "text": body,
            "top_image": piece.top_image,
            "word_count": len(body.split()),
        }
    except Exception as exc:
        return {"error": str(exc)}
Enter fullscreen mode Exit fullscreen mode

The Monitoring Pipeline

import time
import json

# (source label, RSS/Atom feed URL) — polled by RSSMonitor.check_feed.
NEWS_FEEDS = [
    ("TechCrunch", "https://techcrunch.com/feed/"),
    ("The Verge", "https://www.theverge.com/rss/index.xml"),
    ("Ars Technica", "https://feeds.arstechnica.com/arstechnica/index"),
]

# (source label, listing URL, article CSS selector, title CSS selector)
# — scraped by WebMonitor.scrape_news_site for sites without a feed.
NEWS_SITES = [
    ("Reuters Tech", "https://www.reuters.com/technology/", "article", "h3"),
]

def run_monitoring(keywords, interval_minutes=30):
    """Poll all configured sources forever, printing and persisting mentions.

    Args:
        keywords: Terms to look for (matched case-insensitively).
        interval_minutes: Pause between polling rounds. Defaults to 30.

    Side effects:
        Rewrites "mentions.json" after every round with every mention found
        since startup. Blocks indefinitely; stop with Ctrl-C.
    """
    rss = RSSMonitor(keywords)
    web = WebMonitor(keywords)
    all_mentions = []

    while True:
        print(f"\n[{datetime.now()}] Checking sources...")

        for name, url in NEWS_FEEDS:
            # Guard each feed so one bad source can't kill the whole round
            # (the web-scrape loop below already did this; now consistent).
            try:
                mentions = rss.check_feed(url, name)
            except Exception as e:
                print(f"  Error with {name}: {e}")
                continue
            all_mentions.extend(mentions)
            for m in mentions:
                print(f"  NEW: [{m['source']}] {m['title']}")

        for name, url, art_sel, title_sel in NEWS_SITES:
            try:
                mentions = web.scrape_news_site(url, name, art_sel, title_sel)
                all_mentions.extend(mentions)
                for m in mentions:
                    print(f"  NEW: [{m['source']}] {m['title']}")
            except Exception as e:
                print(f"  Error with {name}: {e}")
            # Be polite between scraped sites.
            time.sleep(3)

        # utf-8 + ensure_ascii=False keeps non-ASCII headlines readable on disk.
        with open("mentions.json", "w", encoding="utf-8") as f:
            json.dump(all_mentions, f, indent=2, ensure_ascii=False)

        print(f"Total mentions: {len(all_mentions)}")
        time.sleep(interval_minutes * 60)

# Run it — blocks forever, polling every 30 minutes; stop with Ctrl-C.
run_monitoring(["openai", "anthropic", "google ai", "llm"])
Enter fullscreen mode Exit fullscreen mode

Scaling Considerations

For production media monitoring, ScraperAPI handles proxy rotation across news sites reliably. ThorData residential proxies help with geo-restricted content. Track your scraper health metrics with ScrapeOps.

Conclusion

A custom media monitoring tool gives you real-time awareness without expensive SaaS subscriptions. Start with RSS feeds for speed, add web scraping for coverage, and build alerts from there.

Top comments (0)