Substack has become the go-to platform for independent writers, with thousands of newsletters covering tech, politics, culture, and more. Scraping Substack data lets you analyze newsletter trends, track content strategies, and build competitive intelligence dashboards.
Here's how to extract Substack posts and subscriber data with Python.
Why Scrape Substack?
- Newsletter intelligence: Track what topics are trending
- Competitive analysis: Monitor competing newsletters in your niche
- Content research: Find high-performing post formats and topics
- Market sizing: Estimate subscriber counts across categories
- Trend detection: Spot emerging topics before they go mainstream
Substack's API Structure
Substack provides a semi-public API that makes scraping relatively straightforward:
import requests
import json
import time
def get_newsletter_posts(subdomain, limit=50):
    """Fetch up to *limit* posts from a Substack newsletter via its public API.

    Args:
        subdomain: The newsletter's subdomain, e.g. "platformer" for
            platformer.substack.com.
        limit: Maximum number of posts to return.

    Returns:
        A list of dicts with post metadata (title, subtitle, slug, date,
        paid flag, word count, heart reactions, comment count, URL). May be
        shorter than *limit* if the archive is smaller or an HTTP error
        stops pagination early.
    """
    page_size = 12  # the archive endpoint serves posts in pages of 12
    posts = []
    offset = 0
    while len(posts) < limit:
        response = requests.get(
            f"https://{subdomain}.substack.com/api/v1/posts",
            params={"offset": offset, "limit": page_size},
            headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"},
            timeout=30,  # never hang indefinitely on a stalled connection
        )
        if response.status_code != 200:
            print(f"Error: {response.status_code}")
            break
        data = response.json()
        if not data:
            break
        for post in data:
            posts.append({
                "title": post.get("title", ""),
                "subtitle": post.get("subtitle", ""),
                "slug": post.get("slug", ""),
                "post_date": post.get("post_date", ""),
                "type": post.get("type", ""),
                "is_paid": post.get("audience") == "only_paid",
                "word_count": post.get("wordcount", 0),
                # Reactions are keyed by emoji; "\u2764" is the heart.
                "reactions": post.get("reactions", {}).get("\u2764", 0),
                "comment_count": post.get("comment_count", 0),
                "url": f"https://{subdomain}.substack.com/p/{post.get('slug', '')}",
            })
        if len(data) < page_size:
            break  # short page: end of archive reached, skip the empty follow-up request
        offset += page_size
        time.sleep(1)  # polite rate limiting between pages
    return posts[:limit]
Extracting Newsletter Metadata
def get_newsletter_info(subdomain):
    """Fetch publication-level metadata for a Substack newsletter.

    Args:
        subdomain: The newsletter's subdomain on substack.com.

    Returns:
        A dict of metadata (name, author, description, subscriber count,
        URLs, creation date), or None when the request fails.
        Note: `subscriber_count` comes straight from the API payload and
        may be None if the publication does not expose it.
    """
    url = f"https://{subdomain}.substack.com/api/v1/publication"
    response = requests.get(
        url,
        headers={"User-Agent": "Mozilla/5.0"},
        timeout=30,  # never hang indefinitely on a stalled connection
    )
    if response.status_code != 200:
        return None
    data = response.json()
    return {
        "name": data.get("name", ""),
        "subdomain": subdomain,
        "author_name": data.get("author_name", ""),
        "description": data.get("description", ""),
        "subscriber_count": data.get("subscriber_count"),
        "base_url": data.get("base_url", ""),
        "logo_url": data.get("logo_url", ""),
        "created_at": data.get("created_at", ""),
    }
Building a Newsletter Tracker
from bs4 import BeautifulSoup
import csv
def analyze_newsletter(subdomain):
    """Run a full engagement analysis for one Substack newsletter.

    Merges publication metadata with aggregate statistics over the 100
    most recent posts. Returns None when either lookup comes back empty.
    """
    info = get_newsletter_info(subdomain)
    posts = get_newsletter_posts(subdomain, limit=100)
    if not info or not posts:
        return None

    post_count = len(posts)
    reaction_total = sum(p["reactions"] for p in posts)
    comment_total = sum(p["comment_count"] for p in posts)
    word_total = sum(p["word_count"] for p in posts)
    paid_count = sum(1 for p in posts if p["is_paid"])
    by_reactions = sorted(posts, key=lambda p: p["reactions"], reverse=True)

    summary = dict(info)
    summary.update({
        "total_posts": post_count,
        "avg_reactions": reaction_total / post_count,
        "avg_comments": comment_total / post_count,
        "avg_word_count": round(word_total / post_count),
        "paid_content_ratio": round(paid_count / post_count, 2),
        "top_posts": by_reactions[:5],
    })
    return summary
# Newsletters to analyze, identified by their Substack subdomains.
newsletters = ["platformer", "lennysnewsletter", "thegeneralist"]

results = []
for subdomain in newsletters:
    print(f"Analyzing {subdomain}...")
    analysis = analyze_newsletter(subdomain)
    # Skip newsletters whose metadata or posts could not be fetched.
    if analysis:
        results.append(analysis)
    time.sleep(2)  # pause between newsletters to stay well under rate limits
Scaling with Managed Scrapers
For large-scale Substack scraping, the Substack Scraper on Apify handles pagination, rate limiting, and data normalization automatically.
For handling anti-bot protections on any platform, ScrapeOps provides proxy rotation and header management that keeps your scrapers running smoothly.
Export to Database
import sqlite3
def save_to_database(newsletters_data, db_path="substack_data.db"):
    """Persist newsletter analyses to a SQLite database.

    Creates the `newsletters` table on first use and upserts one row per
    newsletter, keyed on subdomain.

    Args:
        newsletters_data: Iterable of analysis dicts as produced by
            analyze_newsletter() (must contain subdomain, name, total_posts,
            avg_reactions, paid_content_ratio; subscriber_count optional).
        db_path: Path to the SQLite database file.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute('''
            CREATE TABLE IF NOT EXISTS newsletters (
                subdomain TEXT PRIMARY KEY,
                name TEXT,
                subscriber_count INTEGER,
                total_posts INTEGER,
                avg_reactions REAL,
                paid_content_ratio REAL,
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')
        # executemany batches all rows in one C-level call.
        conn.executemany('''
            INSERT OR REPLACE INTO newsletters
            (subdomain, name, subscriber_count, total_posts, avg_reactions, paid_content_ratio)
            VALUES (?, ?, ?, ?, ?, ?)
        ''', [
            (nl["subdomain"], nl["name"], nl.get("subscriber_count"),
             nl["total_posts"], nl["avg_reactions"], nl["paid_content_ratio"])
            for nl in newsletters_data
        ])
        conn.commit()
    finally:
        # Close even when an insert raises, so the DB file isn't left locked.
        conn.close()
Best Practices
- Use the API first: Substack's API is more reliable than HTML scraping
- Rate limit: 1-2 seconds between requests to avoid blocks
- Proxy rotation: For large-scale scraping, use ScrapeOps proxy management
- Cache aggressively: Post content rarely changes after publication
- Respect paywalled content: Only extract metadata from paid posts
Conclusion
Substack's semi-public API makes it one of the easier platforms to scrape for newsletter intelligence. Whether you're building a custom tracker or using the Substack Scraper on Apify for production workloads, the data can power valuable insights into the newsletter ecosystem.
Happy scraping!
Top comments (0)