agenthustler
Reddit Data Mining: Scrape Posts, Comments, and Subreddits with Python

Reddit is one of the richest sources of user-generated content on the internet. With thousands of active communities, it's a goldmine for sentiment analysis, market research, trend detection, and competitive intelligence. Here's how to extract Reddit data with Python.

Why Mine Reddit Data?

  • Sentiment analysis: Gauge public opinion on products, brands, or topics
  • Market research: Discover pain points and feature requests
  • Trend detection: Spot emerging topics before they hit mainstream
  • Content ideas: Find what questions people are asking in your niche
  • Academic research: Study community dynamics and information spread

Method 1: Reddit's JSON API

Reddit exposes a simple JSON API: append .json to almost any Reddit URL and you get that page's data back as JSON. No credentials are needed, but a descriptive User-Agent is required:

import requests
import time

HEADERS = {"User-Agent": "DataCollector/1.0 (research project)"}

def get_subreddit_posts(subreddit, sort="hot", limit=100):
    """Fetch posts from a subreddit using Reddit's JSON API."""
    posts = []
    after = None

    while len(posts) < limit:
        url = f"https://www.reddit.com/r/{subreddit}/{sort}.json"
        params = {"limit": 25, "after": after}

        response = requests.get(url, headers=HEADERS, params=params, timeout=10)

        if response.status_code == 429:
            print("Rate limited, waiting 60s...")
            time.sleep(60)
            continue

        if response.status_code != 200:
            print(f"Error: {response.status_code}")
            break

        data = response.json()
        children = data["data"]["children"]

        if not children:
            break

        for child in children:
            post = child["data"]
            posts.append({
                "title": post["title"],
                "author": post["author"],
                "score": post["score"],
                "upvote_ratio": post["upvote_ratio"],
                "num_comments": post["num_comments"],
                "created_utc": post["created_utc"],
                "url": post["url"],
                "selftext": post["selftext"][:500],
                "permalink": f"https://reddit.com{post['permalink']}",
                "is_self": post["is_self"],
                "flair": post.get("link_flair_text", ""),
            })

        after = data["data"]["after"]
        if not after:
            break

        time.sleep(2)  # Respect rate limits

    return posts[:limit]
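
Each post dict carries created_utc as a Unix timestamp, which makes time-based filtering easy. As a quick sketch (filter_recent_posts is my own helper name, not part of Reddit's API), here's how you might keep only posts from the last N days:

```python
import time

def filter_recent_posts(posts, max_age_days=7):
    """Keep only posts newer than max_age_days, based on created_utc."""
    cutoff = time.time() - max_age_days * 86400  # seconds per day
    return [p for p in posts if p["created_utc"] >= cutoff]
```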

Method 2: PRAW (Python Reddit API Wrapper)

PRAW handles OAuth, pagination, and rate limiting for you. Install it with pip install praw, then create a "script" app at reddit.com/prefs/apps to get a client ID and secret:

import praw

reddit = praw.Reddit(
    client_id="YOUR_CLIENT_ID",
    client_secret="YOUR_CLIENT_SECRET",
    user_agent="DataCollector/1.0"
)

def get_posts_with_comments(subreddit_name, limit=50):
    """Get posts and their top comments."""
    subreddit = reddit.subreddit(subreddit_name)
    results = []

    for post in subreddit.hot(limit=limit):
        post.comments.replace_more(limit=0)
        top_comments = []
        for comment in post.comments[:10]:
            top_comments.append({
                "author": str(comment.author),
                "body": comment.body[:300],
                "score": comment.score,
            })

        results.append({
            "title": post.title,
            "score": post.score,
            "comments": top_comments,
            "url": post.url,
        })

    return results
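
The nested results structure is easy to post-process. For example, a hypothetical helper (not part of PRAW) that ranks posts by the mean score of their captured top comments:

```python
def rank_by_comment_score(results):
    """Sort posts by the average score of their top comments, descending."""
    def avg_score(post):
        comments = post["comments"]
        if not comments:
            return 0
        return sum(c["score"] for c in comments) / len(comments)
    return sorted(results, key=avg_score, reverse=True)
```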

Scraping Comments at Scale

The JSON API also works for individual posts: append .json to a post's permalink and you get back the post plus its full comment tree, which you can walk recursively:

def scrape_post_comments(permalink):
    """Get all comments from a specific post."""
    url = f"https://www.reddit.com{permalink}.json"
    response = requests.get(url, headers=HEADERS, timeout=10)

    if response.status_code != 200:
        return []

    data = response.json()
    comments = []

    def parse_comments(comment_data, depth=0):
        if not isinstance(comment_data, dict):
            return
        if comment_data.get("kind") != "t1":
            return

        c = comment_data["data"]
        comments.append({
            "author": c.get("author", "[deleted]"),
            "body": c.get("body", ""),
            "score": c.get("score", 0),
            "created_utc": c.get("created_utc", 0),
            "depth": depth,
        })

        # Recurse into replies
        replies = c.get("replies", "")
        if isinstance(replies, dict):
            for reply in replies.get("data", {}).get("children", []):
                parse_comments(reply, depth + 1)

    # Parse top-level comments
    if len(data) > 1:
        for child in data[1]["data"]["children"]:
            parse_comments(child)

    return comments
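
Since each comment dict records its depth, you can summarize the shape of a thread with a small helper (comment_depth_stats is my own name, shown as a sketch over the flat list the function above returns):

```python
from collections import Counter

def comment_depth_stats(comments):
    """Summarize a flat comment list produced by scrape_post_comments."""
    depths = Counter(c["depth"] for c in comments)
    return {
        "total": len(comments),
        "top_level": depths.get(0, 0),
        "max_depth": max(depths) if depths else 0,
    }
```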

Searching Across Subreddits

Reddit's search endpoint is also available as JSON, either site-wide or restricted to a single subreddit:

def search_reddit(query, subreddit=None, sort="relevance", time_filter="month", limit=100):
    """Search Reddit for specific terms."""
    if subreddit:
        url = f"https://www.reddit.com/r/{subreddit}/search.json"
    else:
        url = "https://www.reddit.com/search.json"

    params = {
        "q": query,
        "sort": sort,
        "t": time_filter,
        "limit": 25,
    }
    if subreddit:
        params["restrict_sr"] = "true"

    results = []
    after = None

    while len(results) < limit:
        params["after"] = after
        response = requests.get(url, headers=HEADERS, params=params, timeout=10)

        if response.status_code != 200:
            break

        data = response.json()
        children = data["data"]["children"]

        for child in children:
            post = child["data"]
            results.append({
                "title": post["title"],
                "subreddit": post["subreddit"],
                "score": post["score"],
                "num_comments": post["num_comments"],
                "permalink": f"https://reddit.com{post['permalink']}",
            })

        after = data["data"]["after"]
        if not after:
            break
        time.sleep(2)

    return results[:limit]
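
Site-wide searches often return the same post cross-posted to several subreddits. A quick sketch of deduplication by permalink (dedupe_results is my own helper name):

```python
def dedupe_results(results):
    """Drop duplicate posts (same permalink), keeping the first occurrence."""
    seen = set()
    unique = []
    for post in results:
        if post["permalink"] not in seen:
            seen.add(post["permalink"])
            unique.append(post)
    return unique
```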

Production-Ready Reddit Scraping

For large-scale Reddit data mining — monitoring dozens of subreddits, tracking brand mentions, or building training datasets — the Reddit Scraper on Apify provides a robust solution with built-in pagination, rate limit handling, and proxy rotation.

For reliable proxy infrastructure that works well with Reddit, ThorData offers residential proxies that avoid Reddit's IP-based blocks.

Sentiment Analysis Example

from collections import Counter

def analyze_sentiment(posts):
    """Basic sentiment analysis using score and engagement."""
    total_score = sum(p["score"] for p in posts)
    avg_score = total_score / len(posts) if posts else 0
    avg_comments = sum(p["num_comments"] for p in posts) / len(posts) if posts else 0

    # Find most discussed topics
    all_words = " ".join(p["title"].lower() for p in posts).split()
    common_words = Counter(all_words).most_common(20)

    return {
        "total_posts": len(posts),
        "avg_score": round(avg_score, 1),
        "avg_comments": round(avg_comments, 1),
        "top_keywords": common_words,
        "highest_scored": sorted(posts, key=lambda x: x["score"], reverse=True)[:5],
    }
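
Raw word counts over titles are dominated by stopwords like "the" and "to". A sketch of keyword extraction with a small hand-rolled stopword set (the set below is my own illustrative list, not exhaustive):

```python
from collections import Counter

STOPWORDS = {"the", "a", "an", "to", "of", "and", "in", "is", "are",
             "for", "on", "with", "how", "what", "why", "this", "that"}

def top_keywords(posts, n=20):
    """Count title words, skipping stopwords and very short tokens."""
    words = " ".join(p["title"].lower() for p in posts).split()
    cleaned = [w.strip(".,!?:;\"'()") for w in words]
    kept = [w for w in cleaned if len(w) > 2 and w not in STOPWORDS]
    return Counter(kept).most_common(n)
```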

Best Practices

  1. Use a descriptive User-Agent: Reddit blocks generic user agents
  2. Rate limit: Maximum 60 requests per minute — use 2-second delays
  3. Use ThorData proxies for large-scale collection to avoid IP blocks
  4. Cache results: Reddit data doesn't change frequently
  5. Respect deleted content: Don't try to recover [deleted] posts

Conclusion

Reddit is one of the best platforms for mining community insights and sentiment data. From the simple JSON API for small projects to the Reddit Scraper on Apify for production workloads, there's an approach for every scale.

Happy mining!
