DEV Community

agenthustler
agenthustler

Posted on

News Article Scraping: RSS Feeds vs HTML Scraping in 2026

Media monitoring, sentiment analysis, and content aggregation all depend on reliable news data extraction. In 2026, you have two main approaches: RSS feeds and HTML scraping. Each has trade-offs. Let's build both and compare.

RSS Feeds: The Clean Approach

RSS feeds provide structured, machine-readable news data. Most major publications still offer them.

import feedparser
from datetime import datetime
from dataclasses import dataclass, asdict
from typing import List, Optional
import json

@dataclass
class NewsArticle:
    """A single news article parsed from one RSS feed entry."""
    title: str
    url: str
    source: str  # human-readable feed name (the NEWS_FEEDS key), not derived from the URL
    published: str  # raw date string exactly as the feed provides it; not normalized
    summary: Optional[str]  # feed-provided summary/description, if any
    author: Optional[str]  # None when the feed entry carries no author
    categories: List[str]  # "term" values of the entry's tags; empty list when untagged

def parse_rss_feed(feed_url, source_name):
    """Fetch an RSS feed and return a NewsArticle for each entry.

    Missing entry fields fall back to empty strings (None for author),
    since feedparser does not guarantee any particular keys per entry.
    """
    parsed = feedparser.parse(feed_url)
    return [
        NewsArticle(
            title=entry.get("title", ""),
            url=entry.get("link", ""),
            source=source_name,
            published=entry.get("published", ""),
            summary=entry.get("summary", ""),
            author=entry.get("author", None),
            categories=[tag.get("term", "") for tag in entry.get("tags", [])],
        )
        for entry in parsed.entries
    ]

# Example: major news sources
NEWS_FEEDS = {
    "Reuters": "https://feeds.reuters.com/reuters/topNews",
    "BBC": "https://feeds.bbci.co.uk/news/rss.xml",
    "TechCrunch": "https://techcrunch.com/feed/",
    "Hacker News": "https://news.ycombinator.com/rss",
    "ArsTechnica": "https://feeds.arstechnica.com/arstechnica/index",
}

# Pull every feed once, reporting a per-source count as we go.
all_articles = []
for feed_source, feed_url in NEWS_FEEDS.items():
    batch = parse_rss_feed(feed_url, feed_source)
    print(f"{feed_source}: {len(batch)} articles")
    all_articles.extend(batch)

print(f"\nTotal: {len(all_articles)} articles from {len(NEWS_FEEDS)} sources")
Enter fullscreen mode Exit fullscreen mode

HTML Scraping: The Flexible Approach

When RSS feeds don't have enough data or aren't available, HTML scraping gets you the full article content:

import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin

def extract_article_content(url):
    """Extract full article text and basic metadata from a news page.

    Args:
        url: Absolute URL of the article page.

    Returns:
        dict with keys url, title, description, author, content,
        word_count. Metadata values are None when the corresponding tag
        is absent; content may be "" if no suitable paragraphs exist.

    Raises:
        requests.HTTPError: on a 4xx/5xx response.
        requests.RequestException: on network failure or timeout.
    """
    headers = {
        # Browser-like UA: many news sites serve stripped pages (or block)
        # requests that identify themselves as bots.
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                      "AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36",
    }

    response = requests.get(url, headers=headers, timeout=15)
    # BUG FIX: fail loudly on HTTP errors instead of silently parsing a
    # 404/500 error page as if it were article content.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Strip boilerplate elements that would pollute the extracted text.
    for tag in soup.find_all(["script", "style", "nav", "header",
                              "footer", "aside", "form"]):
        tag.decompose()

    # Try common article containers, most specific first; <main> is the
    # broadest fallback before giving up and using <body>.
    article = None
    selectors = [
        "article",
        'div[class*="article-body"]',
        'div[class*="post-content"]',
        'div[class*="entry-content"]',
        'div[itemprop="articleBody"]',
        "main",
    ]

    for selector in selectors:
        article = soup.select_one(selector)
        if article:
            break

    if not article:
        article = soup.find("body")

    # Keep only paragraphs long enough to be prose; the >30 char cutoff
    # filters out captions, bylines, and navigation fragments.
    paragraphs = article.find_all("p") if article else []
    text = "\n\n".join(
        p.get_text(strip=True)
        for p in paragraphs
        if len(p.get_text(strip=True)) > 30
    )

    # Page-level metadata from <title> and the standard meta tags.
    title = soup.find("title")
    meta_desc = soup.find("meta", attrs={"name": "description"})
    meta_author = soup.find("meta", attrs={"name": "author"})

    return {
        "url": url,
        "title": title.get_text(strip=True) if title else None,
        "description": meta_desc.get("content") if meta_desc else None,
        "author": meta_author.get("content") if meta_author else None,
        "content": text,
        "word_count": len(text.split()),
    }
Enter fullscreen mode Exit fullscreen mode

Building a News Aggregator

Combine RSS discovery with HTML scraping for full content:

import time
import random

class NewsAggregator:
    """Collects article metadata from RSS feeds and optionally enriches
    the first N entries with full page content via HTML scraping."""

    def __init__(self, feeds):
        # feeds: mapping of source name -> RSS feed URL
        self.feeds = feeds
        self.articles = []

    def collect_from_rss(self):
        """Collect article metadata from every configured RSS feed."""
        for source, url in self.feeds.items():
            articles = parse_rss_feed(url, source)
            self.articles.extend(articles)
            print(f"  [{source}] {len(articles)} articles")

        print(f"Total RSS articles: {len(self.articles)}")

    def enrich_with_content(self, max_articles=20):
        """Fetch full content for the first max_articles collected articles.

        Returns a list of dicts (the article's fields plus full_content and
        word_count). Articles that fail to fetch are skipped, not retried.
        """
        enriched = []
        for article in self.articles[:max_articles]:
            try:
                content = extract_article_content(article.url)
                enriched.append({
                    **asdict(article),
                    "full_content": content.get("content", ""),
                    "word_count": content.get("word_count", 0),
                })
                # Jittered delay so we don't hammer any one news site.
                time.sleep(random.uniform(1, 3))
            except Exception as e:
                # Best-effort: report and continue so one bad URL doesn't
                # abort the whole enrichment pass.
                print(f"  Error fetching {article.url}: {e}")

        return enriched

    def export(self, articles, filename="news_data.json"):
        """Write articles to filename as pretty-printed UTF-8 JSON."""
        with open(filename, "w", encoding="utf-8") as f:
            # default=str so datetimes and other odd values serialize.
            json.dump(articles, f, indent=2, ensure_ascii=False, default=str)
        # BUG FIX: original printed the literal "(unknown)" instead of the
        # actual output filename.
        print(f"Exported {len(articles)} articles to {filename}")

# Usage: discover articles via RSS, then fetch full content for the first 10.
news = NewsAggregator(NEWS_FEEDS)
news.collect_from_rss()
full_articles = news.enrich_with_content(max_articles=10)
news.export(full_articles)
Enter fullscreen mode Exit fullscreen mode

Sentiment Analysis on News Data

from collections import Counter

# Simple keyword-based sentiment (for demo purposes)
POSITIVE_WORDS = {"growth", "profit", "success", "innovation", "breakthrough", "surge", "gain"}
NEGATIVE_WORDS = {"crash", "decline", "loss", "crisis", "recession", "failure", "plunge"}

def analyze_sentiment(text):
    """Classify text by counting positive vs. negative keyword hits.

    Returns a (label, margin) tuple where label is "positive", "negative",
    or "neutral", and margin is the absolute difference in hit counts
    (0 for neutral). Matching is case-insensitive, whole-word only.
    """
    tokens = set(text.lower().split())
    positive_hits = len(tokens & POSITIVE_WORDS)
    negative_hits = len(tokens & NEGATIVE_WORDS)

    if positive_hits == negative_hits:
        return "neutral", 0
    if positive_hits > negative_hits:
        return "positive", positive_hits - negative_hits
    return "negative", negative_hits - positive_hits

def analyze_news_sentiment(articles):
    """Tally sentiment labels across a list of article dicts.

    Prefers full_content when present and non-empty, otherwise falls back
    to the RSS summary; articles with no usable text count as neutral.

    Returns:
        dict mapping "positive"/"negative"/"neutral" to article counts.
    """
    results = {"positive": 0, "negative": 0, "neutral": 0}

    for article in articles:
        # BUG FIX: use an `or` chain instead of .get() defaults. The old
        # nested-default form passed None to analyze_sentiment when the
        # key existed with a None value (summary is Optional upstream),
        # crashing on .lower(); it also let an empty full_content shadow
        # a usable summary.
        text = article.get("full_content") or article.get("summary") or ""
        sentiment, _score = analyze_sentiment(text)
        results[sentiment] += 1

    return results
Enter fullscreen mode Exit fullscreen mode

RSS vs HTML Scraping: When to Use Which

| Feature | RSS Feeds | HTML Scraping |
| --- | --- | --- |
| Setup complexity | Low | Medium-High |
| Data structure | Structured | Unstructured |
| Full article text | Usually no | Yes |
| Real-time updates | Near real-time | On-demand |
| Rate limiting risk | Very low | High |
| Maintenance | Low | High |
| Legal risk | None | Gray area |

Proxy Infrastructure

When scraping news sites at scale, you'll need reliable proxy infrastructure. ScrapeOps provides proxy rotation and header management optimized for content sites, with built-in rate limiting to keep you under the radar.

Conclusion

For most news monitoring projects, start with RSS feeds for discovery and URLs. Add HTML scraping only when you need full article content. This hybrid approach gives you the best balance of reliability, speed, and data richness.

Top comments (0)