Reddit is one of the richest sources of user-generated content on the internet. With thousands of active communities, it's a goldmine for sentiment analysis, market research, trend detection, and competitive intelligence. Here's how to extract Reddit data with Python.
Why Mine Reddit Data?
- Sentiment analysis: Gauge public opinion on products, brands, or topics
- Market research: Discover pain points and feature requests
- Trend detection: Spot emerging topics before they hit mainstream
- Content ideas: Find what questions people are asking in your niche
- Academic research: Study community dynamics and information spread
Method 1: Reddit's JSON API
Reddit offers a simple JSON API: append .json to almost any Reddit URL and you get the page's data back as structured JSON:
```python
import requests
import time

HEADERS = {"User-Agent": "DataCollector/1.0 (research project)"}

def get_subreddit_posts(subreddit, sort="hot", limit=100):
    """Fetch posts from a subreddit using Reddit's JSON API."""
    posts = []
    after = None
    while len(posts) < limit:
        url = f"https://www.reddit.com/r/{subreddit}/{sort}.json"
        params = {"limit": 25, "after": after}
        response = requests.get(url, headers=HEADERS, params=params)
        if response.status_code == 429:
            print("Rate limited, waiting 60s...")
            time.sleep(60)
            continue
        if response.status_code != 200:
            print(f"Error: {response.status_code}")
            break
        data = response.json()
        children = data["data"]["children"]
        if not children:
            break
        for child in children:
            post = child["data"]
            posts.append({
                "title": post["title"],
                "author": post["author"],
                "score": post["score"],
                "upvote_ratio": post["upvote_ratio"],
                "num_comments": post["num_comments"],
                "created_utc": post["created_utc"],
                "url": post["url"],
                "selftext": post["selftext"][:500],
                "permalink": f"https://reddit.com{post['permalink']}",
                "is_self": post["is_self"],
                "flair": post.get("link_flair_text", ""),
            })
        after = data["data"]["after"]
        if not after:
            break
        time.sleep(2)  # Respect rate limits
    return posts[:limit]
```
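To try it out, here is a minimal usage sketch; the subreddit and output filename are arbitrary examples:

```python
import csv

# Example: fetch 50 "hot" posts from r/python and save them to CSV
posts = get_subreddit_posts("python", sort="hot", limit=50)
if posts:
    with open("python_posts.csv", "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=posts[0].keys())
        writer.writeheader()
        writer.writerows(posts)
    print(f"Saved {len(posts)} posts")
```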
Method 2: PRAW (Python Reddit API Wrapper)
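PRAW handles OAuth, pagination, and rate limiting for you. Install it with `pip install praw` and create a script-type app at https://www.reddit.com/prefs/apps to get a client ID and secret: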
```python
import praw

reddit = praw.Reddit(
    client_id="YOUR_CLIENT_ID",
    client_secret="YOUR_CLIENT_SECRET",
    user_agent="DataCollector/1.0",
)

def get_posts_with_comments(subreddit_name, limit=50):
    """Get posts and their top comments."""
    subreddit = reddit.subreddit(subreddit_name)
    results = []
    for post in subreddit.hot(limit=limit):
        post.comments.replace_more(limit=0)  # Drop "load more comments" stubs
        top_comments = []
        for comment in post.comments[:10]:
            top_comments.append({
                "author": str(comment.author),
                "body": comment.body[:300],
                "score": comment.score,
            })
        results.append({
            "title": post.title,
            "score": post.score,
            "comments": top_comments,
            "url": post.url,
        })
    return results
```
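Calling it is straightforward; the subreddit here is just an example:

```python
# Example: print each hot post in r/learnpython with its top comment
for post in get_posts_with_comments("learnpython", limit=5):
    print(f"{post['title']} ({post['score']} points)")
    if post["comments"]:
        print("  top comment:", post["comments"][0]["body"][:80])
```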
Scraping Comments at Scale
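Appending .json to a post's permalink returns a two-element array: the first listing is the post itself, the second is the comment tree. A recursive walk flattens it: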
```python
def scrape_post_comments(permalink):
    """Get all comments from a specific post."""
    url = f"https://www.reddit.com{permalink}.json"
    response = requests.get(url, headers=HEADERS)
    if response.status_code != 200:
        return []
    data = response.json()
    comments = []

    def parse_comments(comment_data, depth=0):
        if not isinstance(comment_data, dict):
            return
        if comment_data.get("kind") != "t1":  # "t1" = comment; skips "more" stubs
            return
        c = comment_data["data"]
        comments.append({
            "author": c.get("author", "[deleted]"),
            "body": c.get("body", ""),
            "score": c.get("score", 0),
            "created_utc": c.get("created_utc", 0),
            "depth": depth,
        })
        # Recurse into replies (an empty string when there are none)
        replies = c.get("replies", "")
        if isinstance(replies, dict):
            for reply in replies.get("data", {}).get("children", []):
                parse_comments(reply, depth + 1)

    # Parse top-level comments (data[1] is the comment listing)
    if len(data) > 1:
        for child in data[1]["data"]["children"]:
            parse_comments(child)
    return comments
```
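The permalinks collected by get_subreddit_posts can be fed straight in; a quick sketch:

```python
# Example: pull the full comment tree of the current top post in r/python
posts = get_subreddit_posts("python", limit=1)
if posts:
    permalink = posts[0]["permalink"].replace("https://reddit.com", "")
    comments = scrape_post_comments(permalink)
    print(f"Collected {len(comments)} comments")
```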
Searching Across Subreddits
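Reddit's search endpoint returns the same listing format, so the pagination loop is identical. Setting `restrict_sr` to true limits results to the given subreddit: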
```python
def search_reddit(query, subreddit=None, sort="relevance", time_filter="month", limit=100):
    """Search Reddit for specific terms."""
    if subreddit:
        url = f"https://www.reddit.com/r/{subreddit}/search.json"
    else:
        url = "https://www.reddit.com/search.json"
    params = {
        "q": query,
        "sort": sort,
        "t": time_filter,
        "limit": 25,
    }
    if subreddit:
        params["restrict_sr"] = "true"
    results = []
    after = None
    while len(results) < limit:
        params["after"] = after
        response = requests.get(url, headers=HEADERS, params=params)
        if response.status_code != 200:
            break
        data = response.json()
        children = data["data"]["children"]
        if not children:  # No more results
            break
        for child in children:
            post = child["data"]
            results.append({
                "title": post["title"],
                "subreddit": post["subreddit"],
                "score": post["score"],
                "num_comments": post["num_comments"],
                "permalink": f"https://reddit.com{post['permalink']}",
            })
        after = data["data"]["after"]
        if not after:
            break
        time.sleep(2)
    return results[:limit]
```
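For example, tracking brand mentions over the past week (the query string is a placeholder):

```python
# Hypothetical example: find the most-discussed mentions of a product name
mentions = search_reddit("your product name", time_filter="week", limit=50)
for m in sorted(mentions, key=lambda x: x["num_comments"], reverse=True)[:5]:
    print(f"{m['num_comments']:>4} comments | r/{m['subreddit']} | {m['title']}")
```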
Production-Ready Reddit Scraping
For large-scale Reddit data mining — monitoring dozens of subreddits, tracking brand mentions, or building training datasets — the Reddit Scraper on Apify provides a robust solution with built-in pagination, rate limit handling, and proxy rotation.
For reliable proxy infrastructure that works well with Reddit, ThorData offers residential proxies that avoid Reddit's IP-based blocks.
Sentiment Analysis Example
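Once you have posts, a quick first pass is to summarize engagement and recurring title keywords: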
```python
from collections import Counter

def analyze_sentiment(posts):
    """Summarize engagement and keyword frequency as a rough sentiment proxy."""
    total_score = sum(p["score"] for p in posts)
    avg_score = total_score / len(posts) if posts else 0
    avg_comments = sum(p["num_comments"] for p in posts) / len(posts) if posts else 0
    # Find most discussed topics (raw counts; consider filtering stopwords)
    all_words = " ".join(p["title"].lower() for p in posts).split()
    common_words = Counter(all_words).most_common(20)
    return {
        "total_posts": len(posts),
        "avg_score": round(avg_score, 1),
        "avg_comments": round(avg_comments, 1),
        "top_keywords": common_words,
        "highest_scored": sorted(posts, key=lambda x: x["score"], reverse=True)[:5],
    }
```
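Note that scores and comment counts measure engagement, not polarity. For actual sentiment, you can run titles (or comment bodies) through a lexicon-based model. A minimal sketch using NLTK's VADER, assuming `nltk` is installed:

```python
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download("vader_lexicon", quiet=True)  # One-time lexicon download
sia = SentimentIntensityAnalyzer()

def average_sentiment(texts):
    """Average VADER compound score across texts (-1 = negative, +1 = positive)."""
    scores = [sia.polarity_scores(t)["compound"] for t in texts]
    return sum(scores) / len(scores) if scores else 0.0

posts = get_subreddit_posts("python", limit=25)
print("Average title sentiment:", round(average_sentiment([p["title"] for p in posts]), 3))
```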
Best Practices
- Use a descriptive User-Agent: Reddit blocks generic user agents
- Rate limit: Maximum 60 requests per minute — use 2-second delays
- Use ThorData proxies for large-scale collection to avoid IP blocks
- Cache results: Reddit data doesn't change frequently; a simple on-disk cache is sketched below
- Respect deleted content: Don't try to recover [deleted] posts
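For the caching point above, a minimal on-disk sketch (the cache directory and one-hour expiry are arbitrary choices):

```python
import json
import os
import time

CACHE_DIR = "reddit_cache"  # Arbitrary cache location

def cached(key, fetch_fn, max_age=3600):
    """Return cached JSON for `key` if newer than max_age seconds, else refetch."""
    os.makedirs(CACHE_DIR, exist_ok=True)
    path = os.path.join(CACHE_DIR, f"{key}.json")
    if os.path.exists(path) and time.time() - os.path.getmtime(path) < max_age:
        with open(path, encoding="utf-8") as f:
            return json.load(f)
    data = fetch_fn()
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f)
    return data

# Example: cached wrapper around the JSON API function from earlier
posts = cached("python_hot", lambda: get_subreddit_posts("python", limit=50))
```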
Conclusion
Reddit is one of the best platforms for mining community insights and sentiment data. From the simple JSON API for small projects to the Reddit Scraper on Apify for production workloads, there's an approach for every scale.
Happy mining!