Medium hosts millions of articles with engagement data. Scraping it helps analyze content strategies and benchmark authors.
Hidden JSON Endpoints
Append ?format=json to URLs. The response is prefixed with an anti-JSON-hijacking guard, ])}while(1);</x>, which must be stripped before parsing:
import requests, json, time
from datetime import datetime
class MediumScraper:
    """Scrape public Medium profile and post stats via the hidden ?format=json endpoints."""

    def __init__(self):
        # One shared session so headers and cookies persist across requests.
        self.s = requests.Session()
        self.s.headers.update({"User-Agent": "Mozilla/5.0", "Accept": "application/json"})

    def _json(self, text):
        """Parse a Medium JSON response, stripping the anti-JSON-hijacking guard.

        Medium's canonical guard is "])}while(1);</x>"; the variant missing the
        ")" is also accepted for robustness. Raises json.JSONDecodeError on
        malformed payloads.
        """
        for pfx in ("])}while(1);</x>", "]}while(1);</x>"):
            if text.startswith(pfx):
                text = text[len(pfx):]
                break
        return json.loads(text)

    def profile(self, user):
        """Return {"name", "followers"} for the given @user handle."""
        r = self.s.get(f"https://medium.com/@{user}?format=json", timeout=15)
        u = self._json(r.text).get("payload", {}).get("user", {})
        return {
            "name": u.get("name"),
            "followers": u.get("socialStats", {}).get("followerCount", 0),
        }

    def posts(self, user, limit=25):
        """Return up to *limit* recent posts for @user, sorted by claps descending.

        Each post dict has: title, url, claps, read_time (minutes), tags.
        """
        r = self.s.get(
            f"https://medium.com/@{user}/latest?format=json&limit={limit}", timeout=15
        )
        d = self._json(r.text)
        articles = []
        # The post map is keyed by post id; only the values are needed.
        for p in d.get("payload", {}).get("references", {}).get("Post", {}).values():
            v = p.get("virtuals", {})
            articles.append({
                "title": p.get("title", ""),
                "url": f"https://medium.com/@{user}/{p.get('uniqueSlug', '')}",
                "claps": v.get("totalClapCount", 0),
                "read_time": v.get("readingTime", 0),
                "tags": [t.get("slug") for t in v.get("tags", [])],
            })
        return sorted(articles, key=lambda a: a["claps"], reverse=True)
Author Analysis
def analyze(self, user):
    """Print a summary report for one author: totals, top posts, best tags by avg claps."""
    author = self.profile(user)
    articles = self.posts(user, 50)
    if not articles:
        return
    clap_sum = sum(item["claps"] for item in articles)
    print(f"\n{author['name']} | {author['followers']:,} followers | {len(articles)} articles")
    print(f"Total claps: {clap_sum:,} | Avg: {clap_sum/len(articles):,.0f}")
    # Five most-clapped posts; self.posts already sorts by claps descending.
    for item in articles[:5]:
        print(f" {item['claps']:>6,} | {item['read_time']:.0f}min | {item['title'][:55]}")
    # Accumulate total claps ("c") and post count ("n") per tag slug.
    per_tag = {}
    for item in articles:
        for slug in item["tags"]:
            stats = per_tag.setdefault(slug, {"c": 0, "n": 0})
            stats["c"] += item["claps"]
            stats["n"] += 1
    # Rank tags by average claps per post; show the top eight.
    ranked = sorted(per_tag.items(), key=lambda kv: kv[1]["c"] / kv[1]["n"], reverse=True)
    for slug, stats in ranked[:8]:
        print(f" #{slug}: {stats['c']/stats['n']:,.0f} avg ({stats['n']} posts)")
def main():
    """Analyze a couple of sample authors, throttling between requests."""
    scraper = MediumScraper()
    for user in ["topolsky", "nikitonsky"]:
        scraper.analyze(user)
        time.sleep(2)  # be polite: pause between authors to avoid rate limits


# Guard so importing this module does not trigger network requests.
if __name__ == "__main__":
    main()
Scaling
To scale beyond a handful of authors, use ScraperAPI for proxy rotation, ThorData for sticky session proxies, and ScrapeOps for request throttling and monitoring.
Top comments (0)