DEV Community

agenthustler

Posted on

Building a News Sentiment Analyzer with Web Scraping

Building a News Sentiment Analyzer with Web Scraping

Sentiment analysis on news articles can predict market movements, track brand reputation, and identify trending narratives. Here is how to build one from scratch with Python.

What We Are Building

  1. News scraper that collects articles from multiple sources
  2. Sentiment engine using TextBlob and VADER
  3. Trend tracker that monitors sentiment over time
  4. Dashboard to visualize results

Setup

pip install requests beautifulsoup4 textblob vaderSentiment pandas matplotlib
python -m textblob.download_corpora

The News Scraper

import time
from datetime import datetime
from urllib.parse import quote, quote_plus

import requests
from bs4 import BeautifulSoup

class NewsScraper:
    """Fetches Google News search results and article bodies.

    Requests go out directly by default; when *proxy_key* is supplied,
    they are routed through the ScraperAPI proxy instead.
    """

    def __init__(self, proxy_key=None):
        self.session = requests.Session()
        self.proxy_key = proxy_key
        # A desktop browser UA avoids the most naive bot blocking.
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
        })

    def fetch(self, url):
        """GET *url*, via ScraperAPI when a proxy key is configured.

        Returns the `requests.Response`; raises on network errors/timeouts.
        """
        if self.proxy_key:
            # BUG FIX: the target URL must be percent-encoded before being
            # embedded as a query parameter — otherwise its own `?`/`&`/`=`
            # characters corrupt the ScraperAPI request.
            api_url = (
                "http://api.scraperapi.com"
                f"?api_key={self.proxy_key}&url={quote(url, safe='')}"
            )
            # Proxy round-trips (with JS rendering) are slower; allow longer.
            return self.session.get(api_url, timeout=30)
        return self.session.get(url, timeout=15)

    def scrape_google_news(self, query, num_results=20):
        """Return up to *num_results* article dicts for *query*.

        Each dict has: title, source, published (ISO string or ""),
        url, scraped_at.
        """
        # BUG FIX: encode the user query so spaces, '&' and non-ASCII
        # characters survive URL interpolation.
        url = f"https://news.google.com/search?q={quote_plus(query)}&hl=en-US&gl=US"
        response = self.fetch(url)
        soup = BeautifulSoup(response.text, "html.parser")

        articles = []
        for article in soup.select("article")[:num_results]:
            title_el = article.select_one("h3, h4")
            source_el = article.select_one("[data-n-tid]")
            time_el = article.select_one("time")
            link_el = article.select_one("a[href]")

            if title_el:
                # Google News hrefs are relative ("./articles/...");
                # drop the leading "." and prefix the host.
                link = (
                    "https://news.google.com" + link_el["href"][1:]
                    if link_el else ""
                )
                articles.append({
                    "title": title_el.text.strip(),
                    "source": source_el.text.strip() if source_el else "Unknown",
                    "published": time_el.get("datetime", "") if time_el else "",
                    "url": link,
                    "scraped_at": datetime.now().isoformat()
                })
        return articles

    def scrape_article_content(self, url):
        """Best-effort plain-text extraction of an article body.

        Returns up to 5000 characters of paragraph text, or "" on any
        failure (network error, parse error) — callers treat "" as
        "no content available".
        """
        try:
            response = self.fetch(url)
            soup = BeautifulSoup(response.text, "html.parser")

            # Strip boilerplate containers before collecting paragraphs.
            for tag in soup(["script", "style", "nav", "footer", "header"]):
                tag.decompose()

            paragraphs = soup.find_all("p")
            # Short <p> elements are usually captions/bylines; skip them.
            content = " ".join(
                p.text.strip() for p in paragraphs if len(p.text.strip()) > 50
            )
            return content[:5000]
        except Exception:
            # Deliberate best-effort: an unreachable article should not
            # abort a whole batch.
            return ""

Sentiment Analysis Engine

from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

class SentimentAnalyzer:
    """Scores text sentiment by averaging VADER and TextBlob polarity."""

    def __init__(self):
        self.vader = SentimentIntensityAnalyzer()

    def analyze(self, text):
        """Return a sentiment dict for *text*.

        Empty/None input yields a neutral result with zero confidence.
        The compound score is the mean of VADER's compound and TextBlob's
        polarity, both in [-1, 1]; labels use the conventional +/-0.05
        neutrality band.
        """
        if not text:
            return {"compound": 0, "label": "neutral", "confidence": 0}

        vader_scores = self.vader.polarity_scores(text)
        blob = TextBlob(text)

        compound = (vader_scores["compound"] + blob.sentiment.polarity) / 2

        if compound >= 0.05:
            label = "positive"
        elif compound <= -0.05:
            label = "negative"
        else:
            label = "neutral"

        # Distance from neutral doubles as a crude confidence measure.
        confidence = abs(compound)

        return {
            "compound": round(compound, 4),
            "label": label,
            "confidence": round(confidence, 4),
            "vader": vader_scores,
            "textblob": {
                "polarity": round(blob.sentiment.polarity, 4),
                "subjectivity": round(blob.sentiment.subjectivity, 4)
            }
        }

    def analyze_batch(self, articles, scraper=None):
        """Score each article dict, returning copies with a "sentiment" key.

        If *scraper* is given and an article lacks "content", the full
        article body is fetched; otherwise the headline is scored.
        """
        results = []
        for article in articles:
            content = article.get("content", article.get("title", ""))
            if scraper and not article.get("content"):
                fetched = scraper.scrape_article_content(article.get("url", ""))
                # BUG FIX: scrape_article_content returns "" on failure;
                # fall back to the headline rather than scoring empty text
                # (which would silently come out neutral).
                if fetched:
                    content = fetched
                time.sleep(1)  # be polite between article fetches

            sentiment = self.analyze(content)
            results.append({**article, "sentiment": sentiment})
        return results

Trend Tracking

import pandas as pd
import matplotlib.pyplot as plt

class SentimentTracker:
    """Appends scored articles to a CSV history file and plots the trend."""

    def __init__(self, data_file="sentiment_history.csv"):
        self.data_file = data_file

    def save_results(self, results):
        """Append one CSV row per analyzed article to the history file.

        Writes the header only when the file does not exist yet, so
        repeated runs accumulate rows.
        """
        import os  # local import: keeps the module's import surface unchanged

        # Robustness: an empty batch would otherwise write a stray
        # header-only DataFrame dump.
        if not results:
            return

        rows = []
        for r in results:
            rows.append({
                "title": r["title"],
                "source": r["source"],
                "compound": r["sentiment"]["compound"],
                "label": r["sentiment"]["label"],
                "scraped_at": r["scraped_at"]
            })

        df = pd.DataFrame(rows)
        # BUG FIX: pd.io.common.file_exists is a private pandas API that has
        # been removed in newer releases; os.path.exists is the supported way.
        df.to_csv(
            self.data_file,
            mode="a",
            header=not os.path.exists(self.data_file),
            index=False,
        )

    def plot_sentiment_trend(self, query):
        """Plot the mean daily compound score and save it as a PNG.

        NOTE(review): the history CSV is read in full — rows from other
        queries are not filtered out, so *query* only labels the chart.
        Confirm whether per-query history files are intended.
        """
        df = pd.read_csv(self.data_file)
        df["scraped_at"] = pd.to_datetime(df["scraped_at"])

        # Average all articles scraped on the same calendar day.
        daily = df.groupby(df["scraped_at"].dt.date)["compound"].mean()

        plt.figure(figsize=(12, 6))
        plt.plot(daily.index, daily.values, marker="o")
        plt.axhline(y=0, color="gray", linestyle="--")
        # Shade positive stretches green and negative stretches red.
        plt.fill_between(daily.index, daily.values, alpha=0.3,
                        where=[v >= 0 for v in daily.values], color="green")
        plt.fill_between(daily.index, daily.values, alpha=0.3,
                        where=[v < 0 for v in daily.values], color="red")
        plt.title(f"News Sentiment Trend: {query}")
        plt.xlabel("Date")
        plt.ylabel("Sentiment Score")
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig("sentiment_trend.png")
        print("Chart saved to sentiment_trend.png")

Putting It All Together

def run_analysis(query):
    """Scrape news for *query*, score sentiment, persist, and print a summary."""
    scraper = NewsScraper()
    analyzer = SentimentAnalyzer()
    tracker = SentimentTracker()

    print(f"Scraping news for: {query}")
    articles = scraper.scrape_google_news(query)
    print(f"Found {len(articles)} articles")

    # BUG FIX: bail out early when the scrape returns nothing (blocked
    # request, layout change, ...) — otherwise the average below divides
    # by zero.
    if not articles:
        print("No articles found; nothing to analyze.")
        return

    print("Analyzing sentiment...")
    results = analyzer.analyze_batch(articles, scraper)

    tracker.save_results(results)

    positive = sum(1 for r in results if r["sentiment"]["label"] == "positive")
    negative = sum(1 for r in results if r["sentiment"]["label"] == "negative")
    neutral = sum(1 for r in results if r["sentiment"]["label"] == "neutral")

    print(f"\nResults for {query}:")
    print(f"  Positive: {positive}")
    print(f"  Negative: {negative}")
    print(f"  Neutral: {neutral}")

    avg_score = sum(r["sentiment"]["compound"] for r in results) / len(results)
    print(f"  Average Sentiment: {avg_score:.4f}")

if __name__ == "__main__":
    # Guard the demo runs so importing this module does not trigger
    # live network scraping as a side effect.
    run_analysis("artificial intelligence")
    run_analysis("cryptocurrency regulation")

Scaling Up

  • ScraperAPI for reliable news site scraping with JS rendering
  • ThorData for geo-specific news from different regions
  • ScrapeOps to monitor your news pipeline health

Conclusion

A news sentiment analyzer combines web scraping with NLP to surface actionable insights from the daily news cycle. Extend it with topic modeling, named entity recognition, or connect it to trading signals.


Follow for more Python NLP and scraping tutorials!

Top comments (0)