Building a News Sentiment Analyzer with Web Scraping
Sentiment analysis on news articles can predict market movements, track brand reputation, and identify trending narratives. Here is how to build one from scratch with Python.
What We Are Building
- News scraper that collects articles from multiple sources
- Sentiment engine using TextBlob and VADER
- Trend tracker that monitors sentiment over time
- Dashboard to visualize results
Setup
pip install requests beautifulsoup4 textblob vaderSentiment pandas matplotlib
python -m textblob.download_corpora
The News Scraper
import time
from datetime import datetime
from urllib.parse import quote, quote_plus

import requests
from bs4 import BeautifulSoup
class NewsScraper:
    """Fetch Google News search results and best-effort article bodies.

    Optionally routes every request through ScraperAPI when a ``proxy_key``
    is supplied (useful for sites that block datacenter IPs).
    """

    def __init__(self, proxy_key=None):
        self.session = requests.Session()
        self.proxy_key = proxy_key
        # A desktop UA avoids the bare python-requests default being blocked.
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
        })

    def fetch(self, url):
        """GET *url*, via ScraperAPI when a proxy key is configured.

        Returns the ``requests.Response``; raises ``requests`` exceptions on
        network failure (callers decide how to handle them).
        """
        if self.proxy_key:
            # The target URL must be fully percent-encoded (safe="") so its
            # own query string is not interpreted as ScraperAPI parameters.
            api_url = (
                "http://api.scraperapi.com"
                f"?api_key={self.proxy_key}&url={quote(url, safe='')}"
            )
            # Proxied requests render slower; allow a longer timeout.
            return self.session.get(api_url, timeout=30)
        return self.session.get(url, timeout=15)

    def scrape_google_news(self, query, num_results=20):
        """Return up to *num_results* article dicts for a search *query*.

        Each dict has: title, source, published (ISO string or ""), url,
        and scraped_at (local-time ISO timestamp).
        """
        # quote_plus() so multi-word queries ("cryptocurrency regulation")
        # and special characters survive as a valid query string.
        url = f"https://news.google.com/search?q={quote_plus(query)}&hl=en-US&gl=US"
        response = self.fetch(url)
        soup = BeautifulSoup(response.text, "html.parser")
        articles = []
        for article in soup.select("article")[:num_results]:
            title_el = article.select_one("h3, h4")
            source_el = article.select_one("[data-n-tid]")
            time_el = article.select_one("time")
            link_el = article.select_one("a[href]")
            if title_el:
                articles.append({
                    "title": title_el.text.strip(),
                    "source": source_el.text.strip() if source_el else "Unknown",
                    "published": time_el.get("datetime", "") if time_el else "",
                    # Hrefs are relative ("./articles/..."); strip the dot
                    # and prefix the host to get an absolute URL.
                    "url": "https://news.google.com" + link_el["href"][1:] if link_el else "",
                    "scraped_at": datetime.now().isoformat()
                })
        return articles

    def scrape_article_content(self, url):
        """Best-effort plain-text extraction of an article body.

        Returns up to 5000 characters of paragraph text, or "" on any
        failure — this is deliberately lenient so one bad page never
        aborts a batch run.
        """
        try:
            response = self.fetch(url)
            soup = BeautifulSoup(response.text, "html.parser")
            # Drop boilerplate containers before collecting paragraphs.
            for tag in soup(["script", "style", "nav", "footer", "header"]):
                tag.decompose()
            paragraphs = soup.find_all("p")
            # Short <p> tags are usually captions/bylines; keep real prose.
            content = " ".join(
                p.text.strip() for p in paragraphs if len(p.text.strip()) > 50
            )
            return content[:5000]
        except Exception:
            # Intentional broad catch: any network/parse error => empty body.
            return ""
Sentiment Analysis Engine
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
class SentimentAnalyzer:
    """Score text sentiment by averaging VADER and TextBlob polarity."""

    def __init__(self):
        self.vader = SentimentIntensityAnalyzer()

    def analyze(self, text):
        """Return a sentiment dict for *text*.

        Keys: compound (mean of VADER compound and TextBlob polarity),
        label (positive/negative/neutral via the standard +/-0.05 VADER
        thresholds), confidence (|compound|), plus the raw per-engine scores.
        Empty/None input yields a neutral zero-confidence result.
        """
        if not text:
            return {"compound": 0, "label": "neutral", "confidence": 0}
        vader_scores = self.vader.polarity_scores(text)
        blob = TextBlob(text)
        # Both engines emit scores in [-1, 1]; a simple mean blends them.
        compound = (vader_scores["compound"] + blob.sentiment.polarity) / 2
        if compound >= 0.05:
            label = "positive"
        elif compound <= -0.05:
            label = "negative"
        else:
            label = "neutral"
        return {
            "compound": round(compound, 4),
            "label": label,
            # Distance from zero doubles as a rough confidence proxy.
            "confidence": round(abs(compound), 4),
            "vader": vader_scores,
            "textblob": {
                "polarity": round(blob.sentiment.polarity, 4),
                "subjectivity": round(blob.sentiment.subjectivity, 4)
            }
        }

    def analyze_batch(self, articles, scraper=None):
        """Score each article dict, returning copies with a "sentiment" key.

        When *scraper* is given and an article lacks "content", its full
        text is fetched; if that fetch yields nothing we fall back to the
        headline instead of silently scoring empty text as neutral.
        """
        results = []
        for article in articles:
            content = article.get("content") or article.get("title", "")
            if scraper and not article.get("content"):
                fetched = scraper.scrape_article_content(article.get("url", ""))
                if fetched:
                    content = fetched
                time.sleep(1)  # polite delay between article fetches
            sentiment = self.analyze(content)
            results.append({**article, "sentiment": sentiment})
        return results
Trend Tracking
import pandas as pd
import matplotlib.pyplot as plt
class SentimentTracker:
    """Append scored articles to a CSV and plot the daily sentiment trend."""

    def __init__(self, data_file="sentiment_history.csv"):
        self.data_file = data_file

    def save_results(self, results):
        """Append one CSV row per scored article to the history file.

        Writes the header only when the file does not exist yet. A no-op
        for an empty *results* list (avoids emitting a malformed row).
        """
        from pathlib import Path
        if not results:
            return
        rows = [
            {
                "title": r["title"],
                "source": r["source"],
                "compound": r["sentiment"]["compound"],
                "label": r["sentiment"]["label"],
                "scraped_at": r["scraped_at"],
            }
            for r in results
        ]
        # pd.io.common.file_exists is a private pandas API; use pathlib
        # for the existence check instead.
        write_header = not Path(self.data_file).exists()
        pd.DataFrame(rows).to_csv(
            self.data_file, mode="a", header=write_header, index=False
        )

    def plot_sentiment_trend(self, query):
        """Plot the daily mean compound score and save it as a PNG.

        NOTE(review): the history file is not segmented by query — *query*
        only labels the chart title; the plot aggregates every saved row.
        """
        df = pd.read_csv(self.data_file)
        df["scraped_at"] = pd.to_datetime(df["scraped_at"])
        daily = df.groupby(df["scraped_at"].dt.date)["compound"].mean()
        plt.figure(figsize=(12, 6))
        plt.plot(daily.index, daily.values, marker="o")
        plt.axhline(y=0, color="gray", linestyle="--")
        # Shade positive days green and negative days red.
        plt.fill_between(daily.index, daily.values, alpha=0.3,
                         where=[v >= 0 for v in daily.values], color="green")
        plt.fill_between(daily.index, daily.values, alpha=0.3,
                         where=[v < 0 for v in daily.values], color="red")
        plt.title(f"News Sentiment Trend: {query}")
        plt.xlabel("Date")
        plt.ylabel("Sentiment Score")
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig("sentiment_trend.png")
        plt.close()  # release the figure so repeated calls don't leak memory
        print("Chart saved to sentiment_trend.png")
Putting It All Together
def run_analysis(query):
    """Scrape news for *query*, score sentiment, persist, and print a summary.

    Side effects: network requests, appends to the tracker's CSV, prints
    a per-label breakdown and the mean compound score.
    """
    scraper = NewsScraper()
    analyzer = SentimentAnalyzer()
    tracker = SentimentTracker()
    print(f"Scraping news for: {query}")
    articles = scraper.scrape_google_news(query)
    print(f"Found {len(articles)} articles")
    if not articles:
        # Bail out early: averaging over zero results would raise
        # ZeroDivisionError below.
        print("No articles found; skipping analysis.")
        return
    print("Analyzing sentiment...")
    results = analyzer.analyze_batch(articles, scraper)
    tracker.save_results(results)
    labels = [r["sentiment"]["label"] for r in results]
    print(f"\nResults for {query}:")
    print(f"  Positive: {labels.count('positive')}")
    print(f"  Negative: {labels.count('negative')}")
    print(f"  Neutral: {labels.count('neutral')}")
    avg_score = sum(r["sentiment"]["compound"] for r in results) / len(results)
    print(f"  Average Sentiment: {avg_score:.4f}")
# Guard the demo runs so importing this module does not trigger network I/O.
if __name__ == "__main__":
    run_analysis("artificial intelligence")
    run_analysis("cryptocurrency regulation")
Scaling Up
- ScraperAPI for reliable news site scraping with JS rendering
- ThorData for geo-specific news from different regions
- ScrapeOps to monitor your news pipeline health
Conclusion
A news sentiment analyzer combines web scraping with NLP to surface actionable insights from the daily news cycle. Extend it with topic modeling, named entity recognition, or connect it to trading signals.
Follow for more Python NLP and scraping tutorials!
Top comments (0)