DEV Community

agenthustler
agenthustler

Posted on

How to Scrape Medium Articles and Author Stats with Python

Medium hosts millions of articles with engagement data. Scraping it helps analyze content strategies and benchmark authors.

Hidden JSON Endpoints

Append ?format=json to profile URLs. The response body is prefixed with the anti-JSON-hijacking guard ])}while(1);</x>, which must be stripped before parsing:

import requests, json, time
from datetime import datetime

class MediumScraper:
    """Scrape public Medium profile and post stats via the hidden ?format=json endpoints."""

    def __init__(self):
        # One Session reuses TCP connections and carries shared headers across requests.
        self.s = requests.Session()
        self.s.headers.update({"User-Agent": "Mozilla/5.0", "Accept": "application/json"})

    def _json(self, text):
        """Parse a Medium JSON response, stripping the anti-hijacking guard.

        Medium prefixes JSON payloads with "])}while(1);</x>" to defeat
        JSON hijacking. The original code looked for the literal
        "]}while(1);</x>" (missing the ")"), so the real prefix was never
        stripped and json.loads would raise. Scanning for the closing
        "</x>" marker handles the guard regardless of its exact leading bytes.
        """
        marker = "</x>"
        idx = text.find(marker)
        if idx != -1:
            text = text[idx + len(marker):]
        return json.loads(text)

    def profile(self, user):
        """Return {"name", "followers"} for the Medium handle @user."""
        d = self._json(self.s.get(f"https://medium.com/@{user}?format=json").text)
        u = d.get("payload", {}).get("user", {})
        return {
            "name": u.get("name"),
            "followers": u.get("socialStats", {}).get("followerCount", 0),
        }

    def posts(self, user, limit=25):
        """Return up to `limit` recent posts for @user, sorted by claps descending.

        Each item: title, url, claps, read_time (minutes), tags (slug list).
        Missing fields default to empty/zero so partial API payloads don't raise.
        """
        d = self._json(self.s.get(f"https://medium.com/@{user}/latest?format=json&limit={limit}").text)
        articles = []
        # Post references are keyed by post id; only the values are needed.
        for p in d.get("payload", {}).get("references", {}).get("Post", {}).values():
            v = p.get("virtuals", {})
            articles.append({
                "title": p.get("title", ""),
                "url": f"https://medium.com/@{user}/{p.get('uniqueSlug', '')}",
                "claps": v.get("totalClapCount", 0),
                "read_time": v.get("readingTime", 0),
                "tags": [t.get("slug") for t in v.get("tags", [])],
            })
        return sorted(articles, key=lambda a: a["claps"], reverse=True)
Enter fullscreen mode Exit fullscreen mode

Author Analysis

    def analyze(self, user):
        """Print a summary for @user: profile line, top-5 posts, and per-tag averages.

        Fetches the profile and up to 50 recent posts, then reports total and
        average claps, the five most-clapped posts, and the eight tags with the
        highest average claps per post. Prints nothing if no posts are found.
        """
        author = self.profile(user)
        articles = self.posts(user, 50)
        if not articles:
            return

        total_claps = sum(item["claps"] for item in articles)
        avg_claps = total_claps / len(articles)
        print(f"\n{author['name']} | {author['followers']:,} followers | {len(articles)} articles")
        print(f"Total claps: {total_claps:,} | Avg: {avg_claps:,.0f}")
        for item in articles[:5]:
            print(f"  {item['claps']:>6,} | {item['read_time']:.0f}min | {item['title'][:55]}")

        # Accumulate (total claps, post count) per tag slug.
        tag_stats = {}
        for item in articles:
            for slug in item["tags"]:
                claps_sum, count = tag_stats.get(slug, (0, 0))
                tag_stats[slug] = (claps_sum + item["claps"], count + 1)

        ranked = sorted(tag_stats.items(), key=lambda kv: kv[1][0] / kv[1][1], reverse=True)
        for slug, (claps_sum, count) in ranked[:8]:
            print(f"  #{slug}: {claps_sum/count:,.0f} avg ({count} posts)")

# Guard the demo so importing this module doesn't fire live network requests.
if __name__ == "__main__":
    s = MediumScraper()
    for u in ["topolsky", "nikitonsky"]:
        s.analyze(u)
        # Pause between authors to stay polite to Medium's servers.
        time.sleep(2)
Enter fullscreen mode Exit fullscreen mode

Scaling

For larger crawls, consider ScraperAPI for proxy rotation, ThorData for sticky session proxies, and ScrapeOps for request-throttle monitoring.

Top comments (0)