How to Build an AI News Monitor That Summarizes and Scores Industry Stories
Manually reading industry news is expensive: 30-60 minutes/day tracking tech, competitor, and market stories. Most of it is irrelevant; only a few articles are highly relevant.
Here's how to build an automated news monitor that scrapes, summarizes, and scores news relevance — using Python and LLMs.
What We're Building
A pipeline that:
- Scrapes news from TechCrunch, Reuters, Google News, and custom RSS feeds
- Filters by keywords/topics relevant to your industry
- Summarizes each article in 2-3 sentences
- Scores relevance (1-10) and sentiment (positive/negative/neutral)
- Delivers a daily digest via email or Slack
Step 1: Scraping News Sources
Option A: RSS Feeds (Simplest)
import feedparser
import requests
from datetime import datetime, timedelta
import time
# RSS feeds for common tech/business news.
# Maps a human-readable source name -> feed URL; consumed by the fetch loop
# below, which passes each URL to fetch_recent_articles().
RSS_FEEDS = {
    "TechCrunch": "https://techcrunch.com/feed/",
    # NOTE(review): Reuters retired its public feeds.reuters.com RSS endpoints —
    # confirm this URL still resolves before relying on it.
    "Reuters Technology": "https://feeds.reuters.com/reuters/technologyNews",
    "Hacker News": "https://hnrss.org/frontpage",
    "Product Hunt": "https://www.producthunt.com/feed",
    "VentureBeat": "https://feeds.feedburner.com/venturebeat/SZYF",
}
def fetch_recent_articles(feed_url, hours_back=24):
    """Fetch articles published in the last *hours_back* hours from an RSS feed.

    Args:
        feed_url: URL of an RSS/Atom feed parseable by feedparser.
        hours_back: Look-back window in hours (default 24).

    Returns:
        list[dict]: One dict per recent entry with keys 'title', 'url',
        'summary' (truncated to 500 chars), 'published' (the feed's own
        date string), and 'source'. Entries with no parseable date are
        kept, since we cannot tell whether they are stale.
    """
    feed = feedparser.parse(feed_url)
    # feedparser normalizes 'published_parsed' to a UTC struct_time, so the
    # cutoff must also be UTC — comparing against local datetime.now() (as
    # the original did) skews the window by the machine's UTC offset.
    cutoff = datetime.utcnow() - timedelta(hours=hours_back)
    articles = []
    for entry in feed.entries:
        published = entry.get('published_parsed')
        if published:
            pub_date = datetime(*published[:6])  # naive UTC, matches cutoff
            if pub_date < cutoff:
                continue
        articles.append({
            'title': entry.get('title', ''),
            'url': entry.get('link', ''),
            'summary': entry.get('summary', '')[:500],
            # Store the feed's human-readable date string, not the repr of
            # a time.struct_time tuple (which the original stored).
            'published': entry.get('published', ''),
            'source': feed.feed.get('title', 'Unknown'),
        })
    return articles
# Pull recent items from every configured feed, pacing requests politely.
all_articles = []
for source, feed_url in RSS_FEEDS.items():
    try:
        articles = fetch_recent_articles(feed_url, hours_back=24)
    except Exception as e:
        # A single broken feed must not abort the whole run.
        print(f"Error fetching {source}: {e}")
        continue
    all_articles.extend(articles)
    print(f"{source}: {len(articles)} articles")
    time.sleep(0.5)  # brief pause between feeds

print(f"\nTotal articles: {len(all_articles)}")
Option B: Google News (More Sources)
from urllib.parse import quote_plus
import requests
from bs4 import BeautifulSoup
def scrape_google_news(query, max_results=20):
    """Scrape Google News RSS search results for a query.

    Args:
        query: Search phrase; URL-encoded into the RSS search endpoint.
        max_results: Maximum number of entries to return (default 20).

    Returns:
        list[dict]: Up to max_results dicts with 'title', 'url',
        'summary' (truncated to 300 chars), 'published', 'source'.

    Raises:
        requests.HTTPError: If Google returns a non-2xx status.
        requests.RequestException: On network failure or timeout.
    """
    url = f"https://news.google.com/rss/search?q={quote_plus(query)}&hl=en-US&gl=US&ceid=US:en"
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/124.0.0.0"
    }
    r = requests.get(url, headers=headers, timeout=15)
    # Fail loudly on HTTP errors instead of silently feeding an error page
    # to feedparser, which would just return zero entries.
    r.raise_for_status()
    feed = feedparser.parse(r.content)
    return [
        {
            'title': entry.get('title', ''),
            'url': entry.get('link', ''),
            'summary': entry.get('summary', '')[:300],
            'published': entry.get('published', ''),
            'source': 'Google News',
        }
        for entry in feed.entries[:max_results]
    ]
# Monitor specific topics: run one Google News RSS search per watched phrase,
# pausing between queries to avoid hammering the endpoint.
topics = ["AI startup funding", "web scraping", "data privacy GDPR", "Python releases"]
for topic in topics:
    results = scrape_google_news(topic)
    print(f"'{topic}': {len(results)} articles")
    time.sleep(1)  # pause between search queries
Option C: Apify Actor (Multi-Source Without Code)
For news monitoring across dozens of sources without managing scrapers:
import requests
import time

# One shared auth header instead of three copies of the same literal.
APIFY_HEADERS = {"Authorization": "Bearer YOUR_APIFY_TOKEN"}

# Start the actor run.
run = requests.post(
    "https://api.apify.com/v2/acts/lanky_quantifier~ai-news-summarizer/runs",
    headers=APIFY_HEADERS,
    json={
        "sources": ["techcrunch", "reuters", "google-news", "hackernews"],
        "keywords": ["AI", "automation", "web scraping", "Python"],
        "hoursBack": 24,
        "includeSentiment": True,
        "includeEntities": True,
    },
    timeout=30,
).json()["data"]

# Poll until the run reaches a terminal state. Apify also reports
# ABORTED and TIMED-OUT as terminal — the original loop only checked
# SUCCEEDED/FAILED and would spin forever on those.
while True:
    status = requests.get(
        f"https://api.apify.com/v2/actor-runs/{run['id']}",
        headers=APIFY_HEADERS,
        timeout=30,
    ).json()["data"]["status"]
    if status in ("SUCCEEDED", "FAILED", "ABORTED", "TIMED-OUT"):
        break
    time.sleep(5)

# Fetch the run's dataset items.
articles = requests.get(
    f"https://api.apify.com/v2/actor-runs/{run['id']}/dataset/items",
    headers=APIFY_HEADERS,
    timeout=30,
).json()

for a in articles[:3]:
    # .get() on both optional fields — items may omit sentiment/summary,
    # and the original a['sentiment'] would raise KeyError.
    print(f"[{a.get('sentiment', 'n/a')}] {a['title']}")
    print(f"  {a.get('summary', '')[:150]}")
Step 2: Filter by Relevance
def is_relevant(article, keywords, min_keyword_hits=1):
    """Return True if the article matches at least min_keyword_hits keywords.

    Matching is case-insensitive substring search over title + summary,
    so short keywords can over-match (e.g. "API" matches "capital").

    Args:
        article: Dict with optional 'title' and 'summary' string fields.
        keywords: Iterable of keyword strings.
        min_keyword_hits: Minimum number of matching keywords (default 1).
    """
    # .get() keeps this safe for articles missing either field — the
    # original raised KeyError on partial dicts (e.g. feeds with no summary).
    text = (article.get('title', '') + ' ' + article.get('summary', '')).lower()
    hits = sum(1 for kw in keywords if kw.lower() in text)
    return hits >= min_keyword_hits
# Keywords that make an article worth keeping — tune these per industry.
RELEVANT_KEYWORDS = [
    "artificial intelligence",
    "machine learning",
    "automation",
    "web scraping",
    "data extraction",
    "API",
    "Python",
    # Add competitors:
    "competitor_name",
    "industry_term",
]

# Keep only articles that mention at least one watched keyword.
filtered = [a for a in all_articles if is_relevant(a, RELEVANT_KEYWORDS)]
print(f"Filtered: {len(filtered)} relevant from {len(all_articles)} total")
Step 3: Summarize and Score with LLM
import openai
import json
# Single shared OpenAI client for all analysis calls.
client = openai.OpenAI()


def analyze_article(article):
    """Summarize one article and score its relevance with GPT-4o-mini.

    Returns the model's JSON analysis as a dict with keys: summary,
    relevance_score, sentiment, key_entities, action_required, tags.
    """
    instructions = f"""Analyze this article for a developer-focused SaaS company.
Article: {article['title']}
Content: {article['summary']}
Return JSON:
{{
"summary": "2-3 sentence summary",
"relevance_score": 1-10, // How relevant to developer/SaaS/automation topics
"sentiment": "positive|negative|neutral",
"key_entities": ["company1", "person1"],
"action_required": true|false, // Does this require immediate attention?
"tags": ["funding", "release", "competitor", "regulation"]
}}"""
    # json_object mode makes the model emit a single parseable JSON object.
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": instructions}],
        response_format={"type": "json_object"},
        max_tokens=300,
    )
    raw = completion.choices[0].message.content
    return json.loads(raw)
# Run the LLM analysis over the filtered articles and merge each analysis
# into its source article dict.
results = []
for article in filtered[:20]:  # cap the batch to keep API spend bounded
    results.append({**article, **analyze_article(article)})
    time.sleep(0.1)  # small delay to stay under the API rate limit

# Highest-relevance articles first.
results.sort(key=lambda x: x.get('relevance_score', 0), reverse=True)
Step 4: Deliver as Daily Digest
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
def send_digest(articles, recipient_email):
    """Send the top articles as an HTML email digest.

    Articles scoring >= 7 go into a "High Priority" section (top 5); the
    rest into "Other Articles" (top 10). All scraped text (titles,
    summaries, URLs, source names) is HTML-escaped before being embedded —
    the original interpolated it raw, letting a malicious article title
    inject markup into the email.

    Args:
        articles: Analyzed article dicts (keys: 'url', 'title', and
            optionally 'relevance_score', 'sentiment', 'summary', 'source').
        recipient_email: Destination email address.
    """
    from html import escape  # local import: only needed for digest rendering

    lines = ["<h2>Your Daily Tech News Digest</h2>", "<hr>"]
    high_priority = [a for a in articles if a.get('relevance_score', 0) >= 7]
    normal = [a for a in articles if a.get('relevance_score', 0) < 7]

    if high_priority:
        lines.append("<h3>🔥 High Priority</h3>")
        for a in high_priority[:5]:
            sentiment_emoji = {"positive": "📈", "negative": "📉", "neutral": "📰"}.get(a.get('sentiment', ''), "📰")
            # escape() with default quote=True also escapes the single
            # quotes used by the href attribute.
            lines.append(f"<p><b>{sentiment_emoji} <a href='{escape(a['url'])}'>{escape(a['title'])}</a></b>")
            lines.append(f"<br><i>{escape(a.get('summary', ''))}</i></p>")

    if normal:
        lines.append("<h3>📋 Other Articles</h3>")
        for a in normal[:10]:
            lines.append(f"<p>• <a href='{escape(a['url'])}'>{escape(a['title'])}</a> ({escape(a.get('source', ''))})</p>")

    body = "\n".join(lines)
    msg = MIMEMultipart('alternative')
    msg['Subject'] = f"News Digest: {len(high_priority)} high-priority items"
    msg['From'] = "digest@yourdomain.com"
    msg['To'] = recipient_email
    msg.attach(MIMEText(body, 'html'))

    # SMTP_SSL as a context manager closes the connection even on failure.
    with smtplib.SMTP_SSL('smtp.gmail.com', 465) as server:
        server.login("your@gmail.com", "app_password")
        server.send_message(msg)
    print(f"Digest sent to {recipient_email}")

send_digest(results, "you@company.com")
Alternative: Slack Delivery
import requests
def send_to_slack(articles, webhook_url):
    """Post the top 5 articles to a Slack channel via an incoming webhook.

    Args:
        articles: Analyzed article dicts sorted by relevance (keys:
            'url', 'title', and optionally 'relevance_score', 'summary').
        webhook_url: Slack incoming-webhook URL.
    """
    def _mrkdwn_escape(s):
        # Slack mrkdwn requires &, < and > to be entity-escaped; an
        # unescaped '>' in a scraped title would break the <url|label>
        # link markup the original built.
        return s.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")

    blocks = [
        {"type": "header", "text": {"type": "plain_text", "text": "📰 Daily News Digest"}},
        {"type": "divider"},
    ]
    for a in articles[:5]:
        score = a.get('relevance_score', 0)
        emoji = "🔥" if score >= 8 else "📌" if score >= 6 else "📎"
        blocks.append({
            "type": "section",
            "text": {
                "type": "mrkdwn",
                "text": f"{emoji} *<{a['url']}|{_mrkdwn_escape(a['title'])}>*\n{_mrkdwn_escape(a.get('summary', '')[:200])}"
            }
        })
    # Explicit timeout so a hung webhook cannot stall the whole pipeline
    # (the original call could block indefinitely).
    requests.post(webhook_url, json={"blocks": blocks}, timeout=15)

send_to_slack(results, "https://hooks.slack.com/services/YOUR_WEBHOOK")
Cost and Scheduling
| Component | Cost |
|---|---|
| RSS scraping | Free |
| Google News scraping | Free |
| GPT-4o-mini analysis (100 articles/day) | ~$0.05/day |
| Apify multi-source actor (30 runs/month, one per day) | ~$2-3/month |
Total: ~$3-5/month for daily monitoring of 100+ sources.
Schedule with a cron job or n8n to run each morning at 7am:
# crontab -e
0 7 * * * /usr/bin/python3 /path/to/news_digest.py
Use Cases
Startup monitoring: Track competitor funding, launches, and press coverage automatically
Due diligence: Monitor news about companies you're evaluating before meetings
Content marketing: Find trending topics in your space before they peak
Market intelligence: Track regulatory changes, technology shifts, and industry events
Investor research: Follow portfolio companies and their competitors
The key insight: with LLM filtering, you can monitor 500+ sources and surface only the 5-10 articles that actually matter each day.
Save hours on scraping setup: The $29 Apify Scrapers Bundle includes 35+ production-ready actors — Google SERP, LinkedIn, Amazon, TikTok, contact info, and more. Pre-configured inputs, working on day one.
Top comments (0)