Media monitoring, sentiment analysis, and content aggregation all depend on reliable news data extraction. In 2026, you have two main approaches: RSS feeds and HTML scraping. Each has trade-offs. Let's build both and compare.
RSS Feeds: The Clean Approach
RSS feeds provide structured, machine-readable news data. Most major publications still offer them.
import feedparser
from datetime import datetime
from dataclasses import dataclass, asdict
from typing import List, Optional
import json
@dataclass
class NewsArticle:
    """One news item normalized from an RSS/Atom feed entry."""

    title: str              # entry headline ("" if the feed omitted it)
    url: str                # link to the full article
    source: str             # human-readable feed label (e.g. "BBC")
    published: str          # raw publication date string as given by the feed
    summary: Optional[str]  # short description/teaser from the feed
    author: Optional[str]   # byline, None when the feed has no author field
    categories: List[str]   # tag/category terms attached to the entry
def parse_rss_feed(feed_url, source_name):
    """Fetch an RSS/Atom feed and convert its entries to NewsArticle records.

    Args:
        feed_url: URL (or any input accepted by ``feedparser.parse``).
        source_name: Human-readable label stored on every resulting article.

    Returns:
        A list of NewsArticle, one per feed entry. feedparser is lenient:
        even malformed feeds yield whatever entries could be recovered.
    """
    feed = feedparser.parse(feed_url)
    articles = []
    for entry in feed.entries:
        articles.append(NewsArticle(
            title=entry.get("title", ""),
            url=entry.get("link", ""),
            source=source_name,
            # Atom feeds may only carry "updated"; fall back to it so the
            # published field isn't blank for those sources.
            published=entry.get("published", entry.get("updated", "")),
            summary=entry.get("summary", ""),
            author=entry.get("author"),
            categories=[tag.get("term", "") for tag in entry.get("tags", [])],
        ))
    return articles
# Example: major news sources
# Maps source label -> RSS feed URL; consumed by parse_rss_feed(url, label).
# NOTE(review): feeds.reuters.com was publicly retired by Reuters — verify
# this URL still resolves before relying on it.
NEWS_FEEDS = {
    "Reuters": "https://feeds.reuters.com/reuters/topNews",
    "BBC": "https://feeds.bbci.co.uk/news/rss.xml",
    "TechCrunch": "https://techcrunch.com/feed/",
    "Hacker News": "https://news.ycombinator.com/rss",
    "ArsTechnica": "https://feeds.arstechnica.com/arstechnica/index",
}
# Pull every configured feed and pool the results into one list.
all_articles = []
for feed_source, feed_url in NEWS_FEEDS.items():
    fetched = parse_rss_feed(feed_url, feed_source)
    all_articles.extend(fetched)
    print(f"{feed_source}: {len(fetched)} articles")
print(f"\nTotal: {len(all_articles)} articles from {len(NEWS_FEEDS)} sources")
HTML Scraping: The Flexible Approach
When RSS feeds don't have enough data or aren't available, HTML scraping gets you the full article content:
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin
def extract_article_content(url):
    """Download a news page and extract its main article text plus metadata.

    Args:
        url: Absolute URL of the article page.

    Returns:
        Dict with keys: url, title, description, author, content, word_count.
        Missing metadata fields are None; content may be "" when no
        sufficiently long paragraphs are found.

    Raises:
        requests.HTTPError: on a 4xx/5xx response (previously the function
            silently extracted text from error pages).
        requests.RequestException: on network failure or timeout.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                      "AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36",
    }
    response = requests.get(url, headers=headers, timeout=15)
    # Fail loudly instead of scraping the body of a 404/500 page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Remove boilerplate elements that would pollute the extracted text.
    for tag in soup.find_all(["script", "style", "nav", "header",
                              "footer", "aside", "form"]):
        tag.decompose()

    # Try common article containers, most specific first.
    selectors = [
        "article",
        'div[class*="article-body"]',
        'div[class*="post-content"]',
        'div[class*="entry-content"]',
        'div[itemprop="articleBody"]',
        "main",
    ]
    article = None
    for selector in selectors:
        article = soup.select_one(selector)
        if article:
            break
    if not article:
        article = soup.find("body")

    # Keep only substantial paragraphs (> 30 chars) to skip captions,
    # bylines, and widget stubs; call get_text once per paragraph.
    paragraphs = article.find_all("p") if article else []
    chunks = []
    for p in paragraphs:
        chunk = p.get_text(strip=True)
        if len(chunk) > 30:
            chunks.append(chunk)
    text = "\n\n".join(chunks)

    # Page-level metadata from <title> and the standard <meta> tags.
    title = soup.find("title")
    meta_desc = soup.find("meta", attrs={"name": "description"})
    meta_author = soup.find("meta", attrs={"name": "author"})
    return {
        "url": url,
        "title": title.get_text(strip=True) if title else None,
        "description": meta_desc.get("content") if meta_desc else None,
        "author": meta_author.get("content") if meta_author else None,
        "content": text,
        "word_count": len(text.split()),
    }
Building a News Aggregator
Combine RSS discovery with HTML scraping for full content:
import time
import random
class NewsAggregator:
    """Collects article metadata via RSS and optionally enriches it with
    full page content scraped from each article URL."""

    def __init__(self, feeds):
        # feeds: mapping of source label -> RSS feed URL.
        self.feeds = feeds
        self.articles = []

    def collect_from_rss(self):
        """Collect article metadata from all configured RSS feeds,
        accumulating into self.articles."""
        for source, url in self.feeds.items():
            articles = parse_rss_feed(url, source)
            self.articles.extend(articles)
            print(f" [{source}] {len(articles)} articles")
        print(f"Total RSS articles: {len(self.articles)}")

    def enrich_with_content(self, max_articles=20):
        """Fetch full page content for the first ``max_articles`` articles.

        Returns:
            List of dicts: the NewsArticle fields plus ``full_content`` and
            ``word_count``. Articles that fail to fetch are logged and
            skipped, never re-raised.
        """
        enriched = []
        for article in self.articles[:max_articles]:
            try:
                content = extract_article_content(article.url)
                enriched.append({
                    **asdict(article),
                    "full_content": content.get("content", ""),
                    "word_count": content.get("word_count", 0),
                })
                # Randomized delay between requests to stay polite.
                time.sleep(random.uniform(1, 3))
            except Exception as e:
                print(f" Error fetching {article.url}: {e}")
        return enriched

    def export(self, articles, filename="news_data.json"):
        """Write articles to a UTF-8 JSON file.

        Bug fix: the success message previously printed a literal
        "(unknown)" placeholder instead of the target filename.
        """
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(articles, f, indent=2, ensure_ascii=False, default=str)
        print(f"Exported {len(articles)} articles to {filename}")
# Usage: discover article URLs via RSS, scrape full text for the first
# ten (network-bound, rate-limited internally), then persist to JSON.
aggregator = NewsAggregator(NEWS_FEEDS)
aggregator.collect_from_rss()
enriched = aggregator.enrich_with_content(max_articles=10)
aggregator.export(enriched)
Sentiment Analysis on News Data
from collections import Counter
# Simple keyword-based sentiment (for demo purposes)
POSITIVE_WORDS = {"growth", "profit", "success", "innovation", "breakthrough", "surge", "gain"}
NEGATIVE_WORDS = {"crash", "decline", "loss", "crisis", "recession", "failure", "plunge"}


def analyze_sentiment(text):
    """Score text against the tiny positive/negative keyword lexicons.

    Bug fix: splitting on whitespace alone missed lexicon words followed by
    punctuation ("profit," / "crash."), so tokenize with a regex instead.
    Counts distinct matched words, as before.

    Returns:
        Tuple of (label, margin): label is "positive", "negative", or
        "neutral"; margin is the non-negative count difference.
    """
    words = set(re.findall(r"[a-z']+", text.lower()))
    pos = len(words & POSITIVE_WORDS)
    neg = len(words & NEGATIVE_WORDS)
    if pos > neg:
        return "positive", pos - neg
    if neg > pos:
        return "negative", neg - pos
    return "neutral", 0
def analyze_news_sentiment(articles):
    """Tally sentiment labels across a batch of article dicts.

    Prefers full scraped text, falling back to the RSS summary when the
    full content is missing OR empty (the original ``dict.get`` default
    only covered the missing-key case, so an empty "full_content" string
    shadowed a usable summary). A None summary is treated as empty text.

    Returns:
        Dict of counts: {"positive": int, "negative": int, "neutral": int}.
    """
    results = {"positive": 0, "negative": 0, "neutral": 0}
    for article in articles:
        text = article.get("full_content") or article.get("summary") or ""
        sentiment, _score = analyze_sentiment(text)
        results[sentiment] += 1
    return results
RSS vs HTML Scraping: When to Use Which
| Feature | RSS Feeds | HTML Scraping |
|---|---|---|
| Setup complexity | Low | Medium-High |
| Data structure | Structured | Unstructured |
| Full article text | Usually no | Yes |
| Real-time updates | Near real-time | On-demand |
| Rate limiting risk | Very low | High |
| Maintenance | Low | High |
| Legal risk | Low | Gray area |
Proxy Infrastructure
When scraping news sites at scale, you'll need reliable proxy infrastructure. ScrapeOps provides proxy rotation and header management optimized for content sites, with built-in rate limiting to keep you under the radar.
Conclusion
For most news monitoring projects, start with RSS feeds for discovery and URLs. Add HTML scraping only when you need full article content. This hybrid approach gives you the best balance of reliability, speed, and data richness.
Top comments (0)