Media monitoring, sentiment analysis, and content aggregation all depend on reliable news data extraction. In 2026, you have two main approaches: RSS feeds and HTML scraping. Each has trade-offs. Let's build both and compare.
RSS Feeds: The Clean Approach
RSS feeds provide structured, machine-readable news data. Most major publications still offer them.
import feedparser
from datetime import datetime
from dataclasses import dataclass, asdict
from typing import List, Optional
import json
@dataclass
class NewsArticle:
    """One news item normalized from an RSS/Atom feed entry."""

    title: str              # entry headline ("" if the feed omitted it)
    url: str                # link to the full article
    source: str             # human-readable feed label (e.g. "BBC")
    published: str          # raw publication date string as given by the feed
    summary: Optional[str]  # short description/teaser from the feed
    author: Optional[str]   # byline, None when the feed has no author field
    categories: List[str]   # tag/category terms attached to the entry
def parse_rss_feed(feed_url, source_name):
    """Fetch an RSS/Atom feed and convert its entries to NewsArticle records.

    Args:
        feed_url: URL (or any input accepted by ``feedparser.parse``).
        source_name: Human-readable label stored on every resulting article.

    Returns:
        A list of NewsArticle, one per feed entry. feedparser is lenient:
        even malformed feeds yield whatever entries could be recovered.
    """
    feed = feedparser.parse(feed_url)
    articles = []
    for entry in feed.entries:
        articles.append(NewsArticle(
            title=entry.get("title", ""),
            url=entry.get("link", ""),
            source=source_name,
            # Atom feeds may only carry "updated"; fall back to it so the
            # published field isn't blank for those sources.
            published=entry.get("published", entry.get("updated", "")),
            summary=entry.get("summary", ""),
            author=entry.get("author"),
            categories=[tag.get("term", "") for tag in entry.get("tags", [])],
        ))
    return articles
# Example: major news sources
# Maps source label -> RSS feed URL; consumed by parse_rss_feed(url, label).
# NOTE(review): feeds.reuters.com was publicly retired by Reuters — verify
# this URL still resolves before relying on it.
NEWS_FEEDS = {
    "Reuters": "https://feeds.reuters.com/reuters/topNews",
    "BBC": "https://feeds.bbci.co.uk/news/rss.xml",
    "TechCrunch": "https://techcrunch.com/feed/",
    "Hacker News": "https://news.ycombinator.com/rss",
    "ArsTechnica": "https://feeds.arstechnica.com/arstechnica/index",
}
# Pull every configured feed and pool the results into one list.
all_articles = []
for feed_source, feed_url in NEWS_FEEDS.items():
    fetched = parse_rss_feed(feed_url, feed_source)
    all_articles.extend(fetched)
    print(f"{feed_source}: {len(fetched)} articles")
print(f"\nTotal: {len(all_articles)} articles from {len(NEWS_FEEDS)} sources")
HTML Scraping: The Flexible Approach
When RSS feeds don't have enough data or aren't available, HTML scraping gets you the full article content:
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin
def extract_article_content(url):
    """Download a news page and extract its main article text plus metadata.

    Args:
        url: Absolute URL of the article page.

    Returns:
        Dict with keys: url, title, description, author, content, word_count.
        Missing metadata fields are None; content may be "" when no
        sufficiently long paragraphs are found.

    Raises:
        requests.HTTPError: on a 4xx/5xx response (previously the function
            silently extracted text from error pages).
        requests.RequestException: on network failure or timeout.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                      "AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36",
    }
    response = requests.get(url, headers=headers, timeout=15)
    # Fail loudly instead of scraping the body of a 404/500 page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Remove boilerplate elements that would pollute the extracted text.
    for tag in soup.find_all(["script", "style", "nav", "header",
                              "footer", "aside", "form"]):
        tag.decompose()

    # Try common article containers, most specific first.
    selectors = [
        "article",
        'div[class*="article-body"]',
        'div[class*="post-content"]',
        'div[class*="entry-content"]',
        'div[itemprop="articleBody"]',
        "main",
    ]
    article = None
    for selector in selectors:
        article = soup.select_one(selector)
        if article:
            break
    if not article:
        article = soup.find("body")

    # Keep only substantial paragraphs (> 30 chars) to skip captions,
    # bylines, and widget stubs; call get_text once per paragraph.
    paragraphs = article.find_all("p") if article else []
    chunks = []
    for p in paragraphs:
        chunk = p.get_text(strip=True)
        if len(chunk) > 30:
            chunks.append(chunk)
    text = "\n\n".join(chunks)

    # Page-level metadata from <title> and the standard <meta> tags.
    title = soup.find("title")
    meta_desc = soup.find("meta", attrs={"name": "description"})
    meta_author = soup.find("meta", attrs={"name": "author"})
    return {
        "url": url,
        "title": title.get_text(strip=True) if title else None,
        "description": meta_desc.get("content") if meta_desc else None,
        "author": meta_author.get("content") if meta_author else None,
        "content": text,
        "word_count": len(text.split()),
    }
Building a News Aggregator
Combine RSS discovery with HTML scraping for full content:
import time
import random
class NewsAggregator:
    """Collects article metadata via RSS and optionally enriches it with
    full page content scraped from each article URL."""

    def __init__(self, feeds):
        # feeds: mapping of source label -> RSS feed URL.
        self.feeds = feeds
        self.articles = []

    def collect_from_rss(self):
        """Collect article metadata from all configured RSS feeds,
        accumulating into self.articles."""
        for source, url in self.feeds.items():
            articles = parse_rss_feed(url, source)
            self.articles.extend(articles)
            print(f" [{source}] {len(articles)} articles")
        print(f"Total RSS articles: {len(self.articles)}")

    def enrich_with_content(self, max_articles=20):
        """Fetch full page content for the first ``max_articles`` articles.

        Returns:
            List of dicts: the NewsArticle fields plus ``full_content`` and
            ``word_count``. Articles that fail to fetch are logged and
            skipped, never re-raised.
        """
        enriched = []
        for article in self.articles[:max_articles]:
            try:
                content = extract_article_content(article.url)
                enriched.append({
                    **asdict(article),
                    "full_content": content.get("content", ""),
                    "word_count": content.get("word_count", 0),
                })
                # Randomized delay between requests to stay polite.
                time.sleep(random.uniform(1, 3))
            except Exception as e:
                print(f" Error fetching {article.url}: {e}")
        return enriched

    def export(self, articles, filename="news_data.json"):
        """Write articles to a UTF-8 JSON file.

        Bug fix: the success message previously printed a literal
        "(unknown)" placeholder instead of the target filename.
        """
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(articles, f, indent=2, ensure_ascii=False, default=str)
        print(f"Exported {len(articles)} articles to {filename}")
# Usage: discover article URLs via RSS, scrape full text for the first
# ten (network-bound, rate-limited internally), then persist to JSON.
aggregator = NewsAggregator(NEWS_FEEDS)
aggregator.collect_from_rss()
enriched = aggregator.enrich_with_content(max_articles=10)
aggregator.export(enriched)
Sentiment Analysis on News Data
from collections import Counter
# Simple keyword-based sentiment (for demo purposes)
POSITIVE_WORDS = {"growth", "profit", "success", "innovation", "breakthrough", "surge", "gain"}
NEGATIVE_WORDS = {"crash", "decline", "loss", "crisis", "recession", "failure", "plunge"}


def analyze_sentiment(text):
    """Score text against the tiny positive/negative keyword lexicons.

    Bug fix: splitting on whitespace alone missed lexicon words followed by
    punctuation ("profit," / "crash."), so tokenize with a regex instead.
    Counts distinct matched words, as before.

    Returns:
        Tuple of (label, margin): label is "positive", "negative", or
        "neutral"; margin is the non-negative count difference.
    """
    words = set(re.findall(r"[a-z']+", text.lower()))
    pos = len(words & POSITIVE_WORDS)
    neg = len(words & NEGATIVE_WORDS)
    if pos > neg:
        return "positive", pos - neg
    if neg > pos:
        return "negative", neg - pos
    return "neutral", 0
def analyze_news_sentiment(articles):
    """Tally sentiment labels across a batch of article dicts.

    Prefers full scraped text, falling back to the RSS summary when the
    full content is missing OR empty (the original ``dict.get`` default
    only covered the missing-key case, so an empty "full_content" string
    shadowed a usable summary). A None summary is treated as empty text.

    Returns:
        Dict of counts: {"positive": int, "negative": int, "neutral": int}.
    """
    results = {"positive": 0, "negative": 0, "neutral": 0}
    for article in articles:
        text = article.get("full_content") or article.get("summary") or ""
        sentiment, _score = analyze_sentiment(text)
        results[sentiment] += 1
    return results
RSS vs HTML Scraping: When to Use Which
| Feature | RSS Feeds | HTML Scraping |
|---|---|---|
| Setup complexity | Low | Medium-High |
| Data structure | Structured | Unstructured |
| Full article text | Usually no | Yes |
| Real-time updates | Near real-time | On-demand |
| Rate limiting risk | Very low | High |
| Maintenance | Low | High |
| Legal risk | Low | Gray area |
Proxy Infrastructure
When scraping news sites at scale, you'll need reliable proxy infrastructure. ScrapeOps provides proxy rotation and header management optimized for content sites, with built-in rate limiting to keep you under the radar.
Conclusion
For most news monitoring projects, start with RSS feeds for discovery and URLs. Add HTML scraping only when you need full article content. This hybrid approach gives you the best balance of reliability, speed, and data richness.
Top comments (0)