DEV Community

agenthustler
agenthustler

Posted on

How to Scrape Medium Articles and Author Stats with Python

Medium hosts millions of articles with engagement data. Scraping it helps analyze content strategies and benchmark authors.

Hidden JSON Endpoints

Append ?format=json to profile URLs. The response body is prefixed with the anti-JSON-hijacking guard ])}while(1);</x>, which must be stripped before parsing:

import requests, json, time
from datetime import datetime

class MediumScraper:
    """Scrape public Medium profile and post stats via the hidden ?format=json endpoints."""

    def __init__(self):
        # One Session reuses TCP connections and carries shared headers across requests.
        self.s = requests.Session()
        self.s.headers.update({"User-Agent": "Mozilla/5.0", "Accept": "application/json"})

    def _json(self, text):
        """Parse a Medium JSON response, stripping the anti-hijacking guard.

        Medium prefixes JSON payloads with "])}while(1);</x>" to defeat
        JSON hijacking. The original code looked for the literal
        "]}while(1);</x>" (missing the ")"), so the real prefix was never
        stripped and json.loads would raise. Scanning for the closing
        "</x>" marker handles the guard regardless of its exact leading bytes.
        """
        marker = "</x>"
        idx = text.find(marker)
        if idx != -1:
            text = text[idx + len(marker):]
        return json.loads(text)

    def profile(self, user):
        """Return {"name", "followers"} for the Medium handle @user."""
        d = self._json(self.s.get(f"https://medium.com/@{user}?format=json").text)
        u = d.get("payload", {}).get("user", {})
        return {
            "name": u.get("name"),
            "followers": u.get("socialStats", {}).get("followerCount", 0),
        }

    def posts(self, user, limit=25):
        """Return up to `limit` recent posts for @user, sorted by claps descending.

        Each item: title, url, claps, read_time (minutes), tags (slug list).
        Missing fields default to empty/zero so partial API payloads don't raise.
        """
        d = self._json(self.s.get(f"https://medium.com/@{user}/latest?format=json&limit={limit}").text)
        articles = []
        # Post references are keyed by post id; only the values are needed.
        for p in d.get("payload", {}).get("references", {}).get("Post", {}).values():
            v = p.get("virtuals", {})
            articles.append({
                "title": p.get("title", ""),
                "url": f"https://medium.com/@{user}/{p.get('uniqueSlug', '')}",
                "claps": v.get("totalClapCount", 0),
                "read_time": v.get("readingTime", 0),
                "tags": [t.get("slug") for t in v.get("tags", [])],
            })
        return sorted(articles, key=lambda a: a["claps"], reverse=True)
Enter fullscreen mode Exit fullscreen mode

Author Analysis

    def analyze(self, user):
        """Print a summary for @user: profile line, top-5 posts, and per-tag averages.

        Fetches the profile and up to 50 recent posts, then reports total and
        average claps, the five most-clapped posts, and the eight tags with the
        highest average claps per post. Prints nothing if no posts are found.
        """
        author = self.profile(user)
        articles = self.posts(user, 50)
        if not articles:
            return

        total_claps = sum(item["claps"] for item in articles)
        avg_claps = total_claps / len(articles)
        print(f"\n{author['name']} | {author['followers']:,} followers | {len(articles)} articles")
        print(f"Total claps: {total_claps:,} | Avg: {avg_claps:,.0f}")
        for item in articles[:5]:
            print(f"  {item['claps']:>6,} | {item['read_time']:.0f}min | {item['title'][:55]}")

        # Accumulate (total claps, post count) per tag slug.
        tag_stats = {}
        for item in articles:
            for slug in item["tags"]:
                claps_sum, count = tag_stats.get(slug, (0, 0))
                tag_stats[slug] = (claps_sum + item["claps"], count + 1)

        ranked = sorted(tag_stats.items(), key=lambda kv: kv[1][0] / kv[1][1], reverse=True)
        for slug, (claps_sum, count) in ranked[:8]:
            print(f"  #{slug}: {claps_sum/count:,.0f} avg ({count} posts)")

# Guard the demo so importing this module doesn't fire live network requests.
if __name__ == "__main__":
    s = MediumScraper()
    for u in ["topolsky", "nikitonsky"]:
        s.analyze(u)
        # Pause between authors to stay polite to Medium's servers.
        time.sleep(2)
Enter fullscreen mode Exit fullscreen mode

Scaling

For larger crawls, consider ScraperAPI for proxy rotation, ThorData for sticky session proxies, and ScrapeOps for request-throttle monitoring.

Top comments (0)