DEV Community

agenthustler
agenthustler

Posted on

How to Build a Brand Mention Tracker with Web Scraping

Knowing when and where your brand gets mentioned online is crucial for reputation management, competitive analysis, and PR. Commercial tools like Mention or Brand24 charge hundreds of dollars per month. Let's build our own brand mention tracker with Python.

Why Track Brand Mentions?

  • Reputation management — respond to negative mentions before they spiral
  • PR monitoring — know when media covers your brand
  • Competitive intelligence — track competitor mentions too
  • Customer feedback — find unsolicited reviews and opinions
  • Link building — find unlinked mentions for SEO outreach

Architecture Overview

Our tracker will:

  1. Monitor Google search results for brand mentions
  2. Scrape news sites and forums
  3. Check social media platforms
  4. Analyze sentiment of mentions
  5. Send alerts for new or negative mentions

Setting Up

pip install requests beautifulsoup4 pandas textblob schedule
Enter fullscreen mode Exit fullscreen mode

Core Brand Mention Scraper

import requests
from bs4 import BeautifulSoup
from dataclasses import dataclass
from datetime import datetime
from typing import Optional
import hashlib

@dataclass
class BrandMention:
    """One discovered mention of a tracked brand on some source.

    `mention_hash` is a deduplication key derived from url+title; it is
    always recomputed in __post_init__, so the default is just a placeholder.
    """
    brand: str        # brand name this mention matched
    source: str       # human-readable origin, e.g. "Hacker News" or "r/python"
    title: str
    url: str
    snippet: str      # short excerpt of the surrounding text
    sentiment: float  # polarity score, filled in later by analysis
    found_at: str     # ISO-8601 timestamp of discovery
    mention_hash: str = ""

    def __post_init__(self):
        # md5 is used purely as a dedupe fingerprint, not for security.
        fingerprint = f"{self.url}{self.title}".encode()
        self.mention_hash = hashlib.md5(fingerprint).hexdigest()

class BrandTracker:
    """Collects brand mentions from multiple sources and dedupes them across scans."""

    def __init__(self, brand_name, api_key):
        self.brand = brand_name
        self.api_key = api_key      # ScraperAPI key used by fetch()
        self.mentions = []          # all unique mentions accumulated so far
        self.seen_hashes = set()    # mention_hash values already recorded

    def fetch(self, url):
        """Fetch `url` through the ScraperAPI proxy and return the raw Response.

        Bug fix: the target URL must be percent-encoded before being embedded
        as a query parameter — otherwise any `&` or `?` inside `url` is parsed
        as a parameter of the proxy request and the target's query string is
        silently dropped.
        """
        from urllib.parse import quote_plus  # local import keeps the fix self-contained
        proxy_url = f"http://api.scraperapi.com?api_key={self.api_key}&url={quote_plus(url)}"
        response = requests.get(proxy_url, timeout=30)
        return response
Enter fullscreen mode Exit fullscreen mode

Google News Monitoring

from urllib.parse import quote_plus

def search_google_news(self, days_back=7):
    """Scrape Google News search results that mention the brand.

    days_back: restricts results via Google's `when:Nd` operator.
    Returns a list of BrandMention with sentiment left at 0.0 (scored later).
    """
    query = quote_plus(f'"{self.brand}" when:{days_back}d')
    response = self.fetch(f"https://www.google.com/search?q={query}&tbm=nws")
    soup = BeautifulSoup(response.text, "html.parser")

    # Google periodically renames its result container classes; try the
    # newer class first and fall back to the older one.
    containers = soup.find_all("div", class_="SoaBEf") or soup.find_all("div", class_="g")

    found = []
    for item in containers:
        title_node = item.find("div", class_="MBeuO") or item.find("h3")
        if not title_node:
            continue  # not a usable result card
        link_node = item.find("a")
        snippet_node = item.find("div", class_="GI74Re") or item.find("span", class_="aCOpRe")
        source_node = item.find("div", class_="CEMjEf") or item.find("span", class_="UPmit")
        found.append(BrandMention(
            brand=self.brand,
            source=source_node.get_text(strip=True) if source_node else "Google News",
            title=title_node.get_text(strip=True),
            url=link_node["href"] if link_node else "",
            snippet=snippet_node.get_text(strip=True) if snippet_node else "",
            sentiment=0.0,
            found_at=datetime.now().isoformat(),
        ))
    return found

BrandTracker.search_google_news = search_google_news
Enter fullscreen mode Exit fullscreen mode

Reddit Monitoring

def search_reddit(self, subreddits=None, limit=25):
    """Search Reddit's public JSON endpoint for recent posts mentioning the brand.

    subreddits: optional iterable of subreddit names to keep (case-insensitive);
                None keeps every subreddit.
    limit: maximum number of posts requested from the API.
    Returns a list of BrandMention with sentiment left at 0.0.
    """
    query = quote_plus(self.brand)
    url = f"https://www.reddit.com/search.json?q={query}&sort=new&limit={limit}"

    # Reddit rejects requests without a descriptive User-Agent.
    headers = {"User-Agent": "BrandTracker/1.0"}
    response = requests.get(url, headers=headers, timeout=15)
    data = response.json()

    # Normalize the filter once instead of rebuilding the list per post.
    allowed = {s.lower() for s in subreddits} if subreddits else None

    mentions = []
    for post in data.get("data", {}).get("children", []):
        post_data = post["data"]
        if allowed is not None and post_data["subreddit"].lower() not in allowed:
            continue

        mentions.append(BrandMention(
            brand=self.brand,
            # Bug fix: `subreddit` and `permalink` were bare names (NameError
            # at runtime); they are string keys of the post payload.
            source=f"r/{post_data['subreddit']}",
            title=post_data["title"],
            url=f"https://reddit.com{post_data['permalink']}",
            snippet=post_data.get("selftext", "")[:300],
            sentiment=0.0,
            found_at=datetime.now().isoformat()
        ))
    return mentions

BrandTracker.search_reddit = search_reddit
Enter fullscreen mode Exit fullscreen mode

Hacker News Monitoring

def search_hackernews(self):
    """Search Hacker News stories (via the Algolia API) mentioning the brand.

    Returns a list of BrandMention, newest first, sentiment left at 0.0.
    """
    url = f"https://hn.algolia.com/api/v1/search_by_date?query={quote_plus(self.brand)}&tags=story"
    response = requests.get(url, timeout=15)
    data = response.json()

    mentions = []
    for hit in data.get("hits", []):
        mentions.append(BrandMention(
            brand=self.brand,
            source="Hacker News",
            title=hit.get("title", ""),
            # Bug fix: `objectID` was a bare name (NameError); it is a string
            # key of the Algolia hit. Also use `or` instead of .get's default:
            # text-only posts (Ask HN) carry "url": null, which .get would
            # return as-is — fall back to the HN item page in that case too.
            url=hit.get("url") or f"https://news.ycombinator.com/item?id={hit['objectID']}",
            snippet=hit.get("story_text", "")[:300] if hit.get("story_text") else "",
            sentiment=0.0,
            found_at=hit.get("created_at", datetime.now().isoformat())
        ))
    return mentions

BrandTracker.search_hackernews = search_hackernews
Enter fullscreen mode Exit fullscreen mode

Sentiment Analysis

from textblob import TextBlob

def analyze_mentions(self):
    """Score every collected mention with TextBlob polarity and summarize.

    Mutates each mention's `sentiment` in place (polarity in [-1, 1], rounded
    to 3 decimal places), then buckets mentions at a +/-0.1 neutrality band.
    Returns a dict with total/positive/negative/neutral counts and the mean
    sentiment (0 when no mentions have been collected).
    """
    for mention in self.mentions:
        combined = f"{mention.title} {mention.snippet}"
        mention.sentiment = round(TextBlob(combined).sentiment.polarity, 3)

    buckets = {"positive": 0, "negative": 0, "neutral": 0}
    for m in self.mentions:
        if m.sentiment > 0.1:
            buckets["positive"] += 1
        elif m.sentiment < -0.1:
            buckets["negative"] += 1
        else:
            buckets["neutral"] += 1

    total = len(self.mentions)
    return {
        "total": total,
        "positive": buckets["positive"],
        "negative": buckets["negative"],
        "neutral": buckets["neutral"],
        "avg_sentiment": sum(m.sentiment for m in self.mentions) / total if total else 0,
    }

BrandTracker.analyze_mentions = analyze_mentions
Enter fullscreen mode Exit fullscreen mode

Full Monitoring Pipeline

import time

def run_full_scan(self):
    print(f"Scanning for {self.brand} mentions...")

    news = self.search_google_news()
    print(f"  Google News: {len(news)} mentions")
    time.sleep(2)

    reddit = self.search_reddit()
    print(f"  Reddit: {len(reddit)} mentions")
    time.sleep(1)

    hn = self.search_hackernews()
    print(f"  Hacker News: {len(hn)} mentions")

    all_mentions = news + reddit + hn
    new_mentions = []
    for mention in all_mentions:
        if mention.mention_hash not in self.seen_hashes:
            self.seen_hashes.add(mention.mention_hash)
            new_mentions.append(mention)

    self.mentions.extend(new_mentions)
    print(f"  New mentions: {len(new_mentions)}")

    analysis = self.analyze_mentions()
    print(f"  Sentiment: +{analysis[positive]} / -{analysis[negative]} / ~{analysis[neutral]}")

    return new_mentions

BrandTracker.run_full_scan = run_full_scan

# Example usage: requires a real ScraperAPI key and performs live network
# requests against Google, Reddit, and Hacker News.
tracker = BrandTracker("ScraperAPI", api_key="YOUR_KEY")
new = tracker.run_full_scan()
Enter fullscreen mode Exit fullscreen mode

Alerts for Negative Mentions

import smtplib
from email.mime.text import MIMEText

def send_alerts(mentions, threshold=-0.2, email="you@example.com"):
    """Report mentions whose sentiment falls below `threshold`.

    mentions: objects with .sentiment, .source, .title, .url attributes.
    threshold: sentiment cutoff; strictly-below values trigger an alert.
    email: intended recipient. NOTE(review): smtplib/MIMEText are imported at
        module level but no SMTP send is wired up yet, so `email` is currently
        unused — confirm and add SMTP configuration before relying on it.
    Returns the alert body text (None when nothing crossed the threshold) so
    a caller can hand it to an emailer.
    """
    negative = [m for m in mentions if m.sentiment < threshold]
    if not negative:
        return None

    body = f"Found {len(negative)} negative mentions:\n\n"
    for m in negative:
        body += f"- [{m.source}] {m.title}\n  Sentiment: {m.sentiment}\n  {m.url}\n\n"

    print(f"ALERT: {len(negative)} negative mentions detected!")
    for m in negative:
        print(f"  {m.source}: {m.title} (sentiment: {m.sentiment})")
    # Bug fix: `body` was built but never used (dead code); return it.
    return body
Enter fullscreen mode Exit fullscreen mode

Automated Scheduling

import schedule

def daily_brand_monitor():
    """Scan a fixed list of brands and alert on negative mentions.

    Each brand gets a fresh BrandTracker, so dedupe state does not persist
    across invocations of this function. Replace "YOUR_KEY" with a real
    ScraperAPI key before scheduling.
    """
    brands = ["YourBrand", "CompetitorA", "CompetitorB"]
    for brand in brands:
        tracker = BrandTracker(brand, api_key="YOUR_KEY")
        new = tracker.run_full_scan()
        send_alerts(new)
        time.sleep(10)  # space out brands to limit request rate

# Run the monitor every 4 hours. NOTE: this loop blocks the process
# indefinitely — run it as a dedicated service/daemon.
schedule.every(4).hours.do(daily_brand_monitor)

while True:
    schedule.run_pending()
    time.sleep(60)  # poll the scheduler once a minute
Enter fullscreen mode Exit fullscreen mode

Exporting Data

import pandas as pd

def export_mentions(mentions, filename="brand_mentions.csv"):
    """Write mentions to a CSV file and print a per-source summary.

    mentions: objects whose attributes become CSV columns (via vars()).
    filename: output path for the CSV.
    Returns the full DataFrame of mentions.
    """
    df = pd.DataFrame([vars(m) for m in mentions])
    df.to_csv(filename, index=False)

    # Per-source mention counts and average sentiment, busiest sources first.
    per_source = (
        df.groupby("source")
        .agg(count=("title", "count"), avg_sentiment=("sentiment", "mean"))
        .sort_values("count", ascending=False)
    )
    print(per_source)
    return df

export_mentions(tracker.mentions)
Enter fullscreen mode Exit fullscreen mode

Scaling Your Tracker

When monitoring multiple brands across many sources:

  1. ScraperAPI handles Google News scraping with automatic CAPTCHA solving and IP rotation
  2. ThorData provides geo-targeted residential proxies for location-specific brand monitoring
  3. ScrapeOps monitors your scraper fleet to ensure you never miss mentions due to blocked requests

Conclusion

A custom brand mention tracker gives you the same capabilities as expensive SaaS tools at a fraction of the cost. The combination of Google News, Reddit, and Hacker News covers the most important sources for tech brands. Add more sources as needed — Twitter/X, product review sites, or industry forums. The modular architecture makes it easy to extend.

Happy scraping!

Top comments (0)