DEV Community

agenthustler
agenthustler

Posted on

How to Build a Brand Safety Monitor with Web Scraping

Brand safety is a $500M+ industry problem. When your ads appear next to harmful content or your brand gets mentioned in negative contexts, the damage can be immediate. Here's how to build your own monitoring system.

What Brand Safety Monitoring Covers

  • Ad placement verification -- are your ads showing on safe pages?
  • Brand mention sentiment -- what context surrounds your brand name?
  • Competitor defamation detection -- is someone attacking your brand?
  • Domain spoofing alerts -- are fake sites using your brand?

Core Monitor Implementation

pip install requests beautifulsoup4 textblob pandas
import urllib.parse
from datetime import datetime

import requests
from bs4 import BeautifulSoup
from textblob import TextBlob

class BrandSafetyMonitor:
    """Monitor brand mentions, page safety, and domain spoofing via a scraping proxy.

    All HTTP fetches go through ScraperAPI using the supplied API key.
    """

    def __init__(self, brand_name, api_key):
        self.brand = brand_name
        self.api_key = api_key
        # Content categories considered unsafe for ad placement / mentions.
        self.unsafe_categories = [
            "violence", "hate speech", "adult content",
            "illegal", "misinformation", "extremism"
        ]

    def _proxy_url(self, target_url):
        """Build the ScraperAPI request URL for *target_url*.

        The target URL is percent-encoded: interpolating it raw would let any
        '&', '?' or '#' in the target leak into ScraperAPI's own query string
        and silently truncate the request.
        """
        encoded = urllib.parse.quote(target_url, safe="")
        return f"http://api.scraperapi.com?api_key={self.api_key}&url={encoded}"

    def _fetch_soup(self, target_url):
        """GET *target_url* through the proxy and return parsed HTML.

        Raises requests.HTTPError on a non-2xx response rather than silently
        parsing an error page as if it were real content.
        """
        resp = requests.get(self._proxy_url(target_url), timeout=30)
        resp.raise_for_status()
        return BeautifulSoup(resp.text, "html.parser")

    def search_brand_mentions(self, search_url):
        """Scrape *search_url* and return a list of dicts for results that
        mention the brand, each with title, snippet, url, sentiment polarity,
        a negative-sentiment flag, and a timestamp.
        """
        soup = self._fetch_soup(search_url)

        mentions = []
        for result in soup.select(".result, .search-result, article"):
            title = result.select_one("h2, h3, .title")
            snippet = result.select_one("p, .snippet, .description")
            link = result.select_one("a")

            if not title:
                continue
            snippet_text = snippet.text if snippet else ""
            # Case-insensitive brand match against title + snippet combined.
            if self.brand.lower() not in (title.text + snippet_text).lower():
                continue

            sentiment = TextBlob(f"{title.text} {snippet_text}").sentiment
            mentions.append({
                "title": title.text.strip(),
                "snippet": snippet_text.strip(),
                "url": link.get("href", "") if link else "",
                "sentiment": round(sentiment.polarity, 2),
                # -0.2 threshold: mildly negative wording is tolerated.
                "is_negative": sentiment.polarity < -0.2,
                "timestamp": str(datetime.now())
            })
        return mentions

    def check_page_safety(self, url):
        """Fetch *url* and flag keyword matches per unsafe category.

        Returns {"url", "is_safe", "flags"}; a page is safe when no category
        keyword appears anywhere in its text. NOTE: plain substring matching
        can false-positive (e.g. "assault" inside a longer word).
        """
        page_text = self._fetch_soup(url).get_text().lower()

        keyword_map = {
            "violence": ["murder", "assault", "shooting"],
            "hate speech": ["racist", "bigot", "supremacist"],
            "misinformation": ["hoax", "conspiracy", "fake news"],
        }
        flags = []
        for category, keywords in keyword_map.items():
            matches = [kw for kw in keywords if kw in page_text]
            if matches:
                flags.append({"category": category, "matches": matches})

        return {"url": url, "is_safe": not flags, "flags": flags}

    def detect_domain_spoofing(self, legit_domain):
        """Probe common lookalike variations of *legit_domain* and return the
        ones that respond (status < 400), i.e. are live and potentially spoofed.
        """
        name = legit_domain.split(".")[0]
        variations = [f"{name}s.com", f"{name}-official.com", f"get{name}.com"]
        spoofed = []
        for domain in variations:
            try:
                resp = requests.head(f"http://{domain}", timeout=5)
                if resp.status_code < 400:
                    spoofed.append(domain)
            except requests.RequestException:
                # Unreachable domain: not registered / not serving — ignore.
                pass
        return spoofed

# Usage example: scan one source and report how many mentions are negative.
monitor = BrandSafetyMonitor("YourBrand", "YOUR_SCRAPERAPI_KEY")
mentions = monitor.search_brand_mentions("https://news.ycombinator.com/newest")
negative = list(filter(lambda m: m["is_negative"], mentions))
print(f"Found {len(mentions)} mentions, {len(negative)} negative")

Proxy Infrastructure

Brand monitoring requires checking many diverse sites. ScraperAPI handles the complexity of different anti-bot systems. For monitoring across geographies, ThorData provides location-specific proxies. Track all monitoring jobs with ScrapeOps.

Automated Alerting

import schedule

def hourly_brand_check():
    """Scan each configured source and alert on strongly negative mentions."""
    monitor = BrandSafetyMonitor("YourBrand", "YOUR_KEY")
    for source in ["https://news.ycombinator.com/newest"]:
        # Only escalate strongly negative mentions (polarity below -0.5).
        critical = [
            mention
            for mention in monitor.search_brand_mentions(source)
            if mention["sentiment"] < -0.5
        ]
        if critical:
            send_urgent_alert(critical)

schedule.every().hour.do(hourly_brand_check)

Conclusion

Brand safety monitoring protects your reputation 24/7. Combine mention tracking, page safety analysis, and domain spoofing detection for comprehensive coverage. Start with your most important channels and expand from there.

Top comments (0)