Building a News Sentiment Analyzer with Web Scraping
Sentiment analysis on news articles can predict market movements, track brand reputation, and identify trending narratives. Here is how to build one from scratch with Python.
What We Are Building
- News scraper that collects articles from multiple sources
- Sentiment engine using TextBlob and VADER
- Trend tracker that monitors sentiment over time
- Dashboard to visualize results
Setup
pip install requests beautifulsoup4 textblob vaderSentiment pandas matplotlib
python -m textblob.download_corpora
The News Scraper
import time
from datetime import datetime
from urllib.parse import quote, quote_plus

import requests
from bs4 import BeautifulSoup
class NewsScraper:
    """Fetch Google News search results and best-effort article bodies.

    Optionally routes every request through ScraperAPI when a ``proxy_key``
    is supplied (useful for sites that block datacenter IPs).
    """

    def __init__(self, proxy_key=None):
        self.session = requests.Session()
        self.proxy_key = proxy_key
        # A desktop UA avoids the bare python-requests default being blocked.
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
        })

    def fetch(self, url):
        """GET *url*, via ScraperAPI when a proxy key is configured.

        Returns the ``requests.Response``; raises ``requests`` exceptions on
        network failure (callers decide how to handle them).
        """
        if self.proxy_key:
            # The target URL must be fully percent-encoded (safe="") so its
            # own query string is not interpreted as ScraperAPI parameters.
            api_url = (
                "http://api.scraperapi.com"
                f"?api_key={self.proxy_key}&url={quote(url, safe='')}"
            )
            # Proxied requests render slower; allow a longer timeout.
            return self.session.get(api_url, timeout=30)
        return self.session.get(url, timeout=15)

    def scrape_google_news(self, query, num_results=20):
        """Return up to *num_results* article dicts for a search *query*.

        Each dict has: title, source, published (ISO string or ""), url,
        and scraped_at (local-time ISO timestamp).
        """
        # quote_plus() so multi-word queries ("cryptocurrency regulation")
        # and special characters survive as a valid query string.
        url = f"https://news.google.com/search?q={quote_plus(query)}&hl=en-US&gl=US"
        response = self.fetch(url)
        soup = BeautifulSoup(response.text, "html.parser")
        articles = []
        for article in soup.select("article")[:num_results]:
            title_el = article.select_one("h3, h4")
            source_el = article.select_one("[data-n-tid]")
            time_el = article.select_one("time")
            link_el = article.select_one("a[href]")
            if title_el:
                articles.append({
                    "title": title_el.text.strip(),
                    "source": source_el.text.strip() if source_el else "Unknown",
                    "published": time_el.get("datetime", "") if time_el else "",
                    # Hrefs are relative ("./articles/..."); strip the dot
                    # and prefix the host to get an absolute URL.
                    "url": "https://news.google.com" + link_el["href"][1:] if link_el else "",
                    "scraped_at": datetime.now().isoformat()
                })
        return articles

    def scrape_article_content(self, url):
        """Best-effort plain-text extraction of an article body.

        Returns up to 5000 characters of paragraph text, or "" on any
        failure — this is deliberately lenient so one bad page never
        aborts a batch run.
        """
        try:
            response = self.fetch(url)
            soup = BeautifulSoup(response.text, "html.parser")
            # Drop boilerplate containers before collecting paragraphs.
            for tag in soup(["script", "style", "nav", "footer", "header"]):
                tag.decompose()
            paragraphs = soup.find_all("p")
            # Short <p> tags are usually captions/bylines; keep real prose.
            content = " ".join(
                p.text.strip() for p in paragraphs if len(p.text.strip()) > 50
            )
            return content[:5000]
        except Exception:
            # Intentional broad catch: any network/parse error => empty body.
            return ""
Sentiment Analysis Engine
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
class SentimentAnalyzer:
    """Score text sentiment by averaging VADER and TextBlob polarity."""

    def __init__(self):
        self.vader = SentimentIntensityAnalyzer()

    def analyze(self, text):
        """Return a sentiment dict for *text*.

        Keys: compound (mean of VADER compound and TextBlob polarity),
        label (positive/negative/neutral via the standard +/-0.05 VADER
        thresholds), confidence (|compound|), plus the raw per-engine scores.
        Empty/None input yields a neutral zero-confidence result.
        """
        if not text:
            return {"compound": 0, "label": "neutral", "confidence": 0}
        vader_scores = self.vader.polarity_scores(text)
        blob = TextBlob(text)
        # Both engines emit scores in [-1, 1]; a simple mean blends them.
        compound = (vader_scores["compound"] + blob.sentiment.polarity) / 2
        if compound >= 0.05:
            label = "positive"
        elif compound <= -0.05:
            label = "negative"
        else:
            label = "neutral"
        return {
            "compound": round(compound, 4),
            "label": label,
            # Distance from zero doubles as a rough confidence proxy.
            "confidence": round(abs(compound), 4),
            "vader": vader_scores,
            "textblob": {
                "polarity": round(blob.sentiment.polarity, 4),
                "subjectivity": round(blob.sentiment.subjectivity, 4)
            }
        }

    def analyze_batch(self, articles, scraper=None):
        """Score each article dict, returning copies with a "sentiment" key.

        When *scraper* is given and an article lacks "content", its full
        text is fetched; if that fetch yields nothing we fall back to the
        headline instead of silently scoring empty text as neutral.
        """
        results = []
        for article in articles:
            content = article.get("content") or article.get("title", "")
            if scraper and not article.get("content"):
                fetched = scraper.scrape_article_content(article.get("url", ""))
                if fetched:
                    content = fetched
                time.sleep(1)  # polite delay between article fetches
            sentiment = self.analyze(content)
            results.append({**article, "sentiment": sentiment})
        return results
Trend Tracking
import pandas as pd
import matplotlib.pyplot as plt
class SentimentTracker:
    """Append scored articles to a CSV and plot the daily sentiment trend."""

    def __init__(self, data_file="sentiment_history.csv"):
        self.data_file = data_file

    def save_results(self, results):
        """Append one CSV row per scored article to the history file.

        Writes the header only when the file does not exist yet. A no-op
        for an empty *results* list (avoids emitting a malformed row).
        """
        from pathlib import Path
        if not results:
            return
        rows = [
            {
                "title": r["title"],
                "source": r["source"],
                "compound": r["sentiment"]["compound"],
                "label": r["sentiment"]["label"],
                "scraped_at": r["scraped_at"],
            }
            for r in results
        ]
        # pd.io.common.file_exists is a private pandas API; use pathlib
        # for the existence check instead.
        write_header = not Path(self.data_file).exists()
        pd.DataFrame(rows).to_csv(
            self.data_file, mode="a", header=write_header, index=False
        )

    def plot_sentiment_trend(self, query):
        """Plot the daily mean compound score and save it as a PNG.

        NOTE(review): the history file is not segmented by query — *query*
        only labels the chart title; the plot aggregates every saved row.
        """
        df = pd.read_csv(self.data_file)
        df["scraped_at"] = pd.to_datetime(df["scraped_at"])
        daily = df.groupby(df["scraped_at"].dt.date)["compound"].mean()
        plt.figure(figsize=(12, 6))
        plt.plot(daily.index, daily.values, marker="o")
        plt.axhline(y=0, color="gray", linestyle="--")
        # Shade positive days green and negative days red.
        plt.fill_between(daily.index, daily.values, alpha=0.3,
                         where=[v >= 0 for v in daily.values], color="green")
        plt.fill_between(daily.index, daily.values, alpha=0.3,
                         where=[v < 0 for v in daily.values], color="red")
        plt.title(f"News Sentiment Trend: {query}")
        plt.xlabel("Date")
        plt.ylabel("Sentiment Score")
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig("sentiment_trend.png")
        plt.close()  # release the figure so repeated calls don't leak memory
        print("Chart saved to sentiment_trend.png")
Putting It All Together
def run_analysis(query):
    """Scrape news for *query*, score sentiment, persist, and print a summary.

    Side effects: network requests, appends to the tracker's CSV, prints
    a per-label breakdown and the mean compound score.
    """
    scraper = NewsScraper()
    analyzer = SentimentAnalyzer()
    tracker = SentimentTracker()
    print(f"Scraping news for: {query}")
    articles = scraper.scrape_google_news(query)
    print(f"Found {len(articles)} articles")
    if not articles:
        # Bail out early: averaging over zero results would raise
        # ZeroDivisionError below.
        print("No articles found; skipping analysis.")
        return
    print("Analyzing sentiment...")
    results = analyzer.analyze_batch(articles, scraper)
    tracker.save_results(results)
    labels = [r["sentiment"]["label"] for r in results]
    print(f"\nResults for {query}:")
    print(f"  Positive: {labels.count('positive')}")
    print(f"  Negative: {labels.count('negative')}")
    print(f"  Neutral: {labels.count('neutral')}")
    avg_score = sum(r["sentiment"]["compound"] for r in results) / len(results)
    print(f"  Average Sentiment: {avg_score:.4f}")
# Guard the demo runs so importing this module does not trigger network I/O.
if __name__ == "__main__":
    run_analysis("artificial intelligence")
    run_analysis("cryptocurrency regulation")
Scaling Up
- ScraperAPI for reliable news site scraping with JS rendering
- ThorData for geo-specific news from different regions
- ScrapeOps to monitor your news pipeline health
Conclusion
A news sentiment analyzer combines web scraping with NLP to surface actionable insights from the daily news cycle. Extend it with topic modeling, named entity recognition, or connect it to trading signals.
Follow for more Python NLP and scraping tutorials!
Top comments (0)