YouTube is a goldmine of data for market researchers and content creators. From video metadata to channel performance metrics, there's enormous value in structured YouTube data.
Using the YouTube Data API
import requests
class YouTubeScraper:
    """Search YouTube videos and fetch their statistics via the Data API v3."""

    def __init__(self, api_key):
        # API key from the Google Cloud console; sent with every request.
        self.key = api_key
        self.base = 'https://www.googleapis.com/youtube/v3'

    def search(self, query, max_results=50):
        """Return videos matching ``query`` as a list of dicts.

        Args:
            query: free-text search string.
            max_results: number of results to request; the search endpoint
                caps a single call at 50, so values above that are clamped.

        Returns:
            List of dicts with 'id', 'title', 'channel', 'published' keys.

        Raises:
            requests.HTTPError: on a non-2xx response (bad key, quota
                exhausted) — previously such errors silently produced [].
        """
        resp = requests.get(f'{self.base}/search', params={
            'part': 'snippet', 'q': query, 'type': 'video',
            'maxResults': min(max_results, 50), 'key': self.key
        }, timeout=10)  # avoid hanging forever on a stalled connection
        resp.raise_for_status()
        return [{
            'id': i['id']['videoId'],
            'title': i['snippet']['title'],
            'channel': i['snippet']['channelTitle'],
            'published': i['snippet']['publishedAt']
        } for i in resp.json().get('items', [])]

    def get_stats(self, video_ids):
        """Map each id in ``video_ids`` to its view/like/comment counts.

        Counters the API omits (e.g. hidden like counts) default to 0.
        NOTE(review): the videos endpoint accepts at most 50 ids per call —
        callers with more ids should chunk them; verify against usage.

        Raises:
            requests.HTTPError: on a non-2xx response.
        """
        resp = requests.get(f'{self.base}/videos', params={
            'part': 'statistics', 'id': ','.join(video_ids), 'key': self.key
        }, timeout=10)
        resp.raise_for_status()
        return {
            i['id']: {
                'views': int(i['statistics'].get('viewCount', 0)),
                'likes': int(i['statistics'].get('likeCount', 0)),
                'comments': int(i['statistics'].get('commentCount', 0))
            } for i in resp.json().get('items', [])
        }
Scraping Comments
class CommentScraper:
    """Collect top-level comments on a video via the commentThreads endpoint."""

    def __init__(self, api_key):
        self.key = api_key
        self.base = 'https://www.googleapis.com/youtube/v3'

    def get_comments(self, video_id, max_comments=200):
        """Return up to ``max_comments`` top-level comments for ``video_id``.

        Follows ``nextPageToken`` pagination until enough comments are
        gathered or the API runs out of pages.

        Returns:
            List of dicts with 'author', 'text', and 'likes' keys.

        Raises:
            requests.HTTPError: on a non-2xx response (comments disabled,
                quota exhausted) — previously this silently returned
                whatever had been collected so far, or [].
        """
        comments, page_token = [], None
        while len(comments) < max_comments:
            params = {
                'part': 'snippet', 'videoId': video_id,
                # Only request what we still need (endpoint cap: 100/page);
                # asking for a full page regardless wastes API quota.
                'maxResults': min(100, max_comments - len(comments)),
                'key': self.key
            }
            if page_token:
                params['pageToken'] = page_token
            resp = requests.get(f'{self.base}/commentThreads', params=params,
                                timeout=10)  # never hang on a stalled connection
            resp.raise_for_status()
            data = resp.json()
            for item in data.get('items', []):
                c = item['snippet']['topLevelComment']['snippet']
                comments.append({
                    'author': c['authorDisplayName'],
                    'text': c['textDisplay'],
                    'likes': c['likeCount']
                })
            page_token = data.get('nextPageToken')
            if not page_token:
                break
        return comments[:max_comments]
Channel Analytics
class ChannelAnalyzer:
    """Fetch summary statistics for a single YouTube channel."""

    def __init__(self, api_key):
        self.key = api_key
        self.base = 'https://www.googleapis.com/youtube/v3'

    def get_channel(self, channel_id):
        """Return a summary dict for ``channel_id``.

        Returns:
            Dict with 'name', 'subs', 'views', 'videos', and 'uploads'
            (the id of the channel's uploads playlist).

        Raises:
            ValueError: if the API returns no channel for ``channel_id``
                (previously this surfaced as a cryptic KeyError/IndexError).
            requests.HTTPError: on a non-2xx response.
        """
        resp = requests.get(f'{self.base}/channels', params={
            'part': 'snippet,statistics,contentDetails',
            'id': channel_id, 'key': self.key
        }, timeout=10)  # avoid hanging forever on a stalled connection
        resp.raise_for_status()
        items = resp.json().get('items', [])
        if not items:
            raise ValueError(f'No channel found for id {channel_id!r}')
        ch = items[0]
        s = ch['statistics']
        return {
            'name': ch['snippet']['title'],
            # subscriberCount is absent when the channel hides it; default 0.
            'subs': int(s.get('subscriberCount', 0)),
            'views': int(s.get('viewCount', 0)),
            'videos': int(s.get('videoCount', 0)),
            'uploads': ch['contentDetails']['relatedPlaylists']['uploads']
        }
Simple Sentiment Analysis
import re
def analyze_sentiment(comments, pos_words=None, neg_words=None):
    """Classify each comment as positive, negative, or neutral by keywords.

    A comment is labelled by whichever lexicon it shares more distinct
    words with; ties (including zero matches) count as neutral.

    Args:
        comments: iterable of dicts, each with a 'text' key.
        pos_words: optional set of lowercase positive keywords; defaults
            to a small built-in English lexicon.
        neg_words: optional set of lowercase negative keywords; defaults
            to a small built-in English lexicon.

    Returns:
        Dict with 'positive', 'negative', and 'neutral' counts.
    """
    if pos_words is None:
        pos_words = {'love', 'great', 'awesome', 'amazing', 'excellent', 'best'}
    if neg_words is None:
        neg_words = {'hate', 'bad', 'worst', 'terrible', 'boring', 'awful'}
    results = {'positive': 0, 'negative': 0, 'neutral': 0}
    for c in comments:
        # Distinct words only: repeating a keyword doesn't change the label.
        words = set(re.findall(r'\w+', c['text'].lower()))
        p, n = len(words & pos_words), len(words & neg_words)
        if p > n:
            results['positive'] += 1
        elif n > p:
            results['negative'] += 1
        else:
            results['neutral'] += 1
    return results
Scaling
When you need to go beyond API quotas, third-party services can fill the gap: ScraperAPI handles anti-bot protection, ThorData provides residential proxies, and ScrapeOps monitors your scraping infrastructure.
Conclusion
Start with the official API for structured data, complement with web scraping when needed, and always respect rate limits.
Top comments (0)