DEV Community

agenthustler
agenthustler

Posted on

Scraping Twitch: Stream Data, Viewers, and Clip Analytics

Scraping Twitch: Stream Data, Viewers, and Clip Analytics

Twitch dominates live streaming with 140+ million monthly active users. Whether building analytics dashboards or tracking content trends, Twitch data is incredibly valuable.

Setting Up the Data Collector

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import sqlite3, json, time

class TwitchCollector:
    """Collect Twitch stream and clip data via the Helix API and web scraping.

    Results are persisted to a local SQLite database (``twitch.db``).

    Args:
        client_id: Twitch application client ID for Helix API calls.
        token: OAuth bearer token paired with ``client_id``.
        api_key: Optional ScraperAPI key; when set, ``web_fetch`` routes
            page requests through ScraperAPI with JS rendering enabled.
    """

    def __init__(self, client_id=None, token=None, api_key=None):
        self.api_key = api_key
        # Session for authenticated Helix API requests.
        self.api = requests.Session()
        if client_id and token:
            self.api.headers.update({
                'Client-ID': client_id, 'Authorization': f'Bearer {token}'})
        # Separate session for plain page scraping with a browser-like UA.
        self.web = requests.Session()
        self.web.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'})
        self.db = sqlite3.connect('twitch.db')
        self._init_db()

    def _init_db(self):
        """Create the streams/clips tables if they do not exist yet."""
        self.db.executescript('''
            CREATE TABLE IF NOT EXISTS streams (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                channel TEXT, game TEXT, title TEXT,
                viewers INTEGER, timestamp DATETIME DEFAULT CURRENT_TIMESTAMP);
            CREATE TABLE IF NOT EXISTS clips (
                id TEXT PRIMARY KEY, channel TEXT, title TEXT,
                views INTEGER, duration REAL, created_at TEXT, game TEXT);
        ''')

    def web_fetch(self, url, timeout=30):
        """Fetch *url*, optionally proxied through ScraperAPI.

        Fix: the original interpolated *url* directly into the ScraperAPI
        query string over plain http, so any ``?``/``&`` in the target URL
        was split off and treated as extra ScraperAPI parameters. Passing
        the target via ``params`` lets requests percent-encode it, and the
        request now uses https and a timeout instead of hanging forever.
        """
        if self.api_key:
            return self.web.get(
                'https://api.scraperapi.com/',
                params={'api_key': self.api_key, 'url': url, 'render': 'true'},
                timeout=timeout)
        return self.web.get(url, timeout=timeout)
Enter fullscreen mode Exit fullscreen mode

Collecting via Helix API

    def get_streams(self, game_id=None, lang='en', first=100):
        params = {'first': first, 'language': lang}
        if game_id: params['game_id'] = game_id
        resp = self.api.get('https://api.twitch.tv/helix/streams', params=params)
        streams = resp.json().get('data', [])
        for s in streams:
            self.db.execute(
                'INSERT INTO streams (channel,game,title,viewers) VALUES (?,?,?,?)',
                (s['user_name'], s['game_name'], s['title'], s['viewer_count']))
        self.db.commit()
        return streams

    def get_clips(self, broadcaster_id, first=20):
        resp = self.api.get('https://api.twitch.tv/helix/clips',
                           params={'broadcaster_id': broadcaster_id, 'first': first})
        clips = resp.json().get('data', [])
        for c in clips:
            self.db.execute('''INSERT OR REPLACE INTO clips
                (id,channel,title,views,duration,created_at,game)
                VALUES (?,?,?,?,?,?,?)''',
                (c['id'], c['broadcaster_name'], c['title'],
                 c['view_count'], c['duration'], c['created_at'], c['game_id']))
        self.db.commit()
        return clips
Enter fullscreen mode Exit fullscreen mode

Scraping Category Trends

    def scrape_category(self, slug):
        """Scrape the public directory page for one game category.

        Args:
            slug: The category's URL slug, e.g. ``'fortnite'``.

        Returns:
            A list of dicts with keys 'channel', 'title', 'viewers', 'tags'.

        NOTE(review): the CSS selectors here track Twitch's current markup
        (``data-a-target`` attributes) and will need updating if it changes.
        """
        resp = self.web_fetch(f"https://www.twitch.tv/directory/game/{slug}")
        soup = BeautifulSoup(resp.text, 'html.parser')
        streams = []
        for card in soup.select('[data-a-target="preview-card"]'):
            channel = card.select_one('[data-a-target="preview-card-channel-link"]')
            title = card.select_one('h3')
            viewers = card.select_one('[data-a-target="preview-card-viewer-count"]')
            viewer_count = 0
            if viewers:
                viewer_count = self._parse_viewer_count(
                    viewers.get_text(strip=True))
            streams.append({
                'channel': channel.get_text(strip=True) if channel else '',
                'title': title.get_text(strip=True) if title else '',
                'viewers': viewer_count,
                'tags': [tag.get_text(strip=True) for tag in card.select('.tag')]
            })
        return streams

    @staticmethod
    def _parse_viewer_count(text):
        """Convert Twitch's abbreviated viewer label ('1.2K', '1.5M') to an int.

        Fix: the original only handled the 'K' suffix and raised ValueError
        on 'M' counts or any stray text (e.g. '1.2K viewers'); unparseable
        labels now fall back to 0 instead of crashing the scrape.
        """
        t = text.upper().replace(',', '').replace('VIEWERS', '').strip()
        try:
            if t.endswith('M'):
                return int(float(t[:-1]) * 1_000_000)
            if t.endswith('K'):
                return int(float(t[:-1]) * 1_000)
            return int(t or 0)
        except ValueError:
            return 0
Enter fullscreen mode Exit fullscreen mode

Analytics Reports

class TwitchAnalytics:
    """Read-only reporting queries over the collector's SQLite database."""

    def __init__(self, db):
        # An open sqlite3 connection whose schema matches TwitchCollector's.
        self.db = db

    def growth_report(self, channel, days=30):
        """Per-day viewer statistics for one channel over the last *days* days.

        Returns a list of dicts (oldest day first) with keys
        'date', 'avg', 'peak', and 'samples'.
        """
        rows = self.db.execute('''
            SELECT DATE(timestamp) as d, AVG(viewers), MAX(viewers), COUNT(*)
            FROM streams WHERE channel=? AND timestamp > datetime('now',?)
            GROUP BY d ORDER BY d
        ''', (channel, f'-{days} days')).fetchall()
        report = []
        for day, avg_viewers, peak_viewers, sample_count in rows:
            report.append({
                'date': day,
                'avg': round(avg_viewers),
                'peak': peak_viewers,
                'samples': sample_count,
            })
        return report

    def trending(self, hours=6):
        """Top 20 games by total tracked viewers in the last *hours* hours.

        Returns a list of dicts with keys 'game', 'total', 'streams', 'avg',
        ordered by total viewers descending.
        """
        rows = self.db.execute('''
            SELECT game, SUM(viewers), COUNT(DISTINCT channel), AVG(viewers)
            FROM streams WHERE timestamp > datetime('now',?)
            GROUP BY game ORDER BY SUM(viewers) DESC LIMIT 20
        ''', (f'-{hours} hours',)).fetchall()
        return [
            {'game': game, 'total': total_viewers,
             'streams': channel_count, 'avg': round(avg_viewers)}
            for game, total_viewers, channel_count, avg_viewers in rows
        ]
Enter fullscreen mode Exit fullscreen mode

Continuous Monitoring

def monitor(collector, channels, interval=300, cycles=None):
    """Poll Twitch on a schedule and print a progress summary each cycle.

    Generalization: the original loop could only run forever; the new
    optional ``cycles`` parameter (default ``None`` = run forever, the
    original behavior) allows a bounded run for scripts and tests.

    Args:
        collector: TwitchCollector used for API calls and storage.
        channels: Iterable of broadcaster IDs whose clips are refreshed.
        interval: Seconds to sleep between polling cycles.
        cycles: Optional number of polling cycles before returning.
    """
    analytics = TwitchAnalytics(collector.db)
    completed = 0
    while cycles is None or completed < cycles:
        streams = collector.get_streams()
        print(f"[{datetime.now()}] Tracked {len(streams)} streams")
        for ch in channels:
            clips = collector.get_clips(ch)
            print(f"  {ch}: {len(clips)} clips")
        trending = analytics.trending()
        print("Top:", [t['game'] for t in trending[:5]])
        completed += 1
        time.sleep(interval)
Enter fullscreen mode Exit fullscreen mode

When scaling to thousands of channels, ScraperAPI can handle JavaScript rendering, ThorData can supply residential proxies, and ScrapeOps can monitor scraper performance.


Follow for more Python scraping and analytics tutorials.

Top comments (0)