Substack has become the go-to platform for independent writers, with thousands of newsletters covering tech, politics, culture, and more. Scraping Substack data lets you analyze newsletter trends, track content strategies, and build competitive intelligence dashboards.
Here's how to extract Substack posts and subscriber data with Python.
Why Scrape Substack?
- Newsletter intelligence: Track what topics are trending
- Competitive analysis: Monitor competing newsletters in your niche
- Content research: Find high-performing post formats and topics
- Market sizing: Estimate subscriber counts across categories
- Trend detection: Spot emerging topics before they go mainstream
Substack's API Structure
Substack provides a semi-public API that makes scraping relatively straightforward:
import requests
import json
import time
def get_newsletter_posts(subdomain, limit=50):
    """Fetch up to *limit* posts from a Substack newsletter via its public API.

    Args:
        subdomain: The newsletter's subdomain, e.g. "platformer" for
            platformer.substack.com.
        limit: Maximum number of posts to return.

    Returns:
        A list of dicts with post metadata (title, subtitle, slug, date,
        paid flag, word count, heart reactions, comment count, URL). May be
        shorter than *limit* if the archive is smaller or an HTTP error
        stops pagination early.
    """
    page_size = 12  # the archive endpoint serves posts in pages of 12
    posts = []
    offset = 0
    while len(posts) < limit:
        response = requests.get(
            f"https://{subdomain}.substack.com/api/v1/posts",
            params={"offset": offset, "limit": page_size},
            headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"},
            timeout=30,  # never hang indefinitely on a stalled connection
        )
        if response.status_code != 200:
            print(f"Error: {response.status_code}")
            break
        data = response.json()
        if not data:
            break
        for post in data:
            posts.append({
                "title": post.get("title", ""),
                "subtitle": post.get("subtitle", ""),
                "slug": post.get("slug", ""),
                "post_date": post.get("post_date", ""),
                "type": post.get("type", ""),
                "is_paid": post.get("audience") == "only_paid",
                "word_count": post.get("wordcount", 0),
                # Reactions are keyed by emoji; "\u2764" is the heart.
                "reactions": post.get("reactions", {}).get("\u2764", 0),
                "comment_count": post.get("comment_count", 0),
                "url": f"https://{subdomain}.substack.com/p/{post.get('slug', '')}",
            })
        if len(data) < page_size:
            break  # short page: end of archive reached, skip the empty follow-up request
        offset += page_size
        time.sleep(1)  # polite rate limiting between pages
    return posts[:limit]
Extracting Newsletter Metadata
def get_newsletter_info(subdomain):
    """Fetch publication-level metadata for a Substack newsletter.

    Args:
        subdomain: The newsletter's subdomain on substack.com.

    Returns:
        A dict of metadata (name, author, description, subscriber count,
        URLs, creation date), or None when the request fails.
        Note: `subscriber_count` comes straight from the API payload and
        may be None if the publication does not expose it.
    """
    url = f"https://{subdomain}.substack.com/api/v1/publication"
    response = requests.get(
        url,
        headers={"User-Agent": "Mozilla/5.0"},
        timeout=30,  # never hang indefinitely on a stalled connection
    )
    if response.status_code != 200:
        return None
    data = response.json()
    return {
        "name": data.get("name", ""),
        "subdomain": subdomain,
        "author_name": data.get("author_name", ""),
        "description": data.get("description", ""),
        "subscriber_count": data.get("subscriber_count"),
        "base_url": data.get("base_url", ""),
        "logo_url": data.get("logo_url", ""),
        "created_at": data.get("created_at", ""),
    }
Building a Newsletter Tracker
from bs4 import BeautifulSoup
import csv
def analyze_newsletter(subdomain):
    """Run a full engagement analysis for one Substack newsletter.

    Merges publication metadata with aggregate statistics over the 100
    most recent posts. Returns None when either lookup comes back empty.
    """
    info = get_newsletter_info(subdomain)
    posts = get_newsletter_posts(subdomain, limit=100)
    if not info or not posts:
        return None

    post_count = len(posts)
    reaction_total = sum(p["reactions"] for p in posts)
    comment_total = sum(p["comment_count"] for p in posts)
    word_total = sum(p["word_count"] for p in posts)
    paid_count = sum(1 for p in posts if p["is_paid"])
    by_reactions = sorted(posts, key=lambda p: p["reactions"], reverse=True)

    summary = dict(info)
    summary.update({
        "total_posts": post_count,
        "avg_reactions": reaction_total / post_count,
        "avg_comments": comment_total / post_count,
        "avg_word_count": round(word_total / post_count),
        "paid_content_ratio": round(paid_count / post_count, 2),
        "top_posts": by_reactions[:5],
    })
    return summary
# Newsletters to analyze, identified by their Substack subdomains.
newsletters = ["platformer", "lennysnewsletter", "thegeneralist"]

results = []
for subdomain in newsletters:
    print(f"Analyzing {subdomain}...")
    analysis = analyze_newsletter(subdomain)
    # Skip newsletters whose metadata or posts could not be fetched.
    if analysis:
        results.append(analysis)
    time.sleep(2)  # pause between newsletters to stay well under rate limits
Scaling with Managed Scrapers
For large-scale Substack scraping, the Substack Scraper on Apify handles pagination, rate limiting, and data normalization automatically.
For handling anti-bot protections on any platform, ScrapeOps provides proxy rotation and header management that keeps your scrapers running smoothly.
Export to Database
import sqlite3
def save_to_database(newsletters_data, db_path="substack_data.db"):
    """Persist newsletter analyses to a SQLite database.

    Creates the `newsletters` table on first use and upserts one row per
    newsletter, keyed on subdomain.

    Args:
        newsletters_data: Iterable of analysis dicts as produced by
            analyze_newsletter() (must contain subdomain, name, total_posts,
            avg_reactions, paid_content_ratio; subscriber_count optional).
        db_path: Path to the SQLite database file.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute('''
            CREATE TABLE IF NOT EXISTS newsletters (
                subdomain TEXT PRIMARY KEY,
                name TEXT,
                subscriber_count INTEGER,
                total_posts INTEGER,
                avg_reactions REAL,
                paid_content_ratio REAL,
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')
        # executemany batches all rows in one C-level call.
        conn.executemany('''
            INSERT OR REPLACE INTO newsletters
            (subdomain, name, subscriber_count, total_posts, avg_reactions, paid_content_ratio)
            VALUES (?, ?, ?, ?, ?, ?)
        ''', [
            (nl["subdomain"], nl["name"], nl.get("subscriber_count"),
             nl["total_posts"], nl["avg_reactions"], nl["paid_content_ratio"])
            for nl in newsletters_data
        ])
        conn.commit()
    finally:
        # Close even when an insert raises, so the DB file isn't left locked.
        conn.close()
Best Practices
- Use the API first: Substack's API is more reliable than HTML scraping
- Rate limit: 1-2 seconds between requests to avoid blocks
- Proxy rotation: For large-scale scraping, use ScrapeOps proxy management
- Cache aggressively: Post content rarely changes after publication
- Respect paywalled content: Only extract metadata from paid posts
Conclusion
Substack's semi-public API makes it one of the easier platforms to scrape for newsletter intelligence. Whether you're building a custom tracker or using the Substack Scraper on Apify for production workloads, the data can power valuable insights into the newsletter ecosystem.
Happy scraping!
Top comments (0)