DEV Community

agenthustler
agenthustler

Posted on

How to Scrape YouTube: Videos, Comments, and Channel Analytics

YouTube is a goldmine of data for market researchers and content creators. From video metadata to channel performance metrics, there's enormous value in structured YouTube data.

Using the YouTube Data API

import requests

class YouTubeScraper:
    """Thin wrapper over the YouTube Data API v3 for video search and stats.

    Requires an API key from the Google Cloud console. All methods raise
    requests.HTTPError on non-2xx responses (quota exceeded, bad key, ...).
    """

    def __init__(self, api_key):
        self.key = api_key
        self.base = 'https://www.googleapis.com/youtube/v3'

    def search(self, query, max_results=50):
        """Search for videos matching `query`.

        Returns a list of dicts with 'id', 'title', 'channel' and
        'published' keys, newest-relevance order as returned by the API.
        """
        resp = requests.get(f'{self.base}/search', params={
            'part': 'snippet', 'q': query, 'type': 'video',
            # The API caps maxResults at 50 per request.
            'maxResults': min(max_results, 50), 'key': self.key
        }, timeout=10)  # never hang forever on a stalled connection
        # Without this, API errors silently became an empty result list.
        resp.raise_for_status()
        return [{
            'id': i['id']['videoId'],
            'title': i['snippet']['title'],
            'channel': i['snippet']['channelTitle'],
            'published': i['snippet']['publishedAt']
        } for i in resp.json().get('items', [])]

    def get_stats(self, video_ids):
        """Fetch view/like/comment counts for up to 50 video ids.

        Returns a dict mapping video id -> {'views', 'likes', 'comments'}.
        Counts the API withholds (e.g. hidden likes) default to 0.
        """
        if not video_ids:
            return {}  # avoid a useless request with an empty 'id' param
        resp = requests.get(f'{self.base}/videos', params={
            'part': 'statistics', 'id': ','.join(video_ids), 'key': self.key
        }, timeout=10)
        resp.raise_for_status()
        return {
            i['id']: {
                'views': int(i['statistics'].get('viewCount', 0)),
                'likes': int(i['statistics'].get('likeCount', 0)),
                'comments': int(i['statistics'].get('commentCount', 0))
            } for i in resp.json().get('items', [])
        }
Enter fullscreen mode Exit fullscreen mode

Scraping Comments

class CommentScraper:
    """Paginated top-level comment fetcher using the commentThreads endpoint."""

    def __init__(self, api_key):
        self.key = api_key
        self.base = 'https://www.googleapis.com/youtube/v3'

    def get_comments(self, video_id, max_comments=200):
        """Collect up to `max_comments` top-level comments for `video_id`.

        Returns a list of dicts with 'author', 'text' and 'likes' keys.
        Raises requests.HTTPError on API errors — notably a 403 when the
        video has comments disabled (previously this silently returned []).
        """
        comments, page_token = [], None
        while len(comments) < max_comments:
            params = {
                'part': 'snippet', 'videoId': video_id,
                # Don't request more than we still need (API max is 100).
                'maxResults': min(100, max_comments - len(comments)),
                'key': self.key
            }
            if page_token:
                params['pageToken'] = page_token
            resp = requests.get(f'{self.base}/commentThreads', params=params,
                                timeout=10)  # bound each page request
            resp.raise_for_status()
            data = resp.json()
            for item in data.get('items', []):
                c = item['snippet']['topLevelComment']['snippet']
                comments.append({
                    'author': c['authorDisplayName'],
                    'text': c['textDisplay'],
                    'likes': c['likeCount']
                })
            page_token = data.get('nextPageToken')
            if not page_token:
                break  # no more pages
        return comments[:max_comments]
Enter fullscreen mode Exit fullscreen mode

Channel Analytics

class ChannelAnalyzer:
    """Fetches headline statistics for a single channel."""

    def __init__(self, api_key):
        self.key = api_key
        self.base = 'https://www.googleapis.com/youtube/v3'

    def get_channel(self, channel_id):
        """Return name, subscriber/view/video counts and the uploads playlist id.

        Raises requests.HTTPError on API failures and ValueError when the
        channel id is unknown (previously an opaque KeyError/IndexError).
        Subscriber count defaults to 0 when the channel hides it.
        """
        resp = requests.get(f'{self.base}/channels', params={
            'part': 'snippet,statistics,contentDetails',
            'id': channel_id, 'key': self.key
        }, timeout=10)
        resp.raise_for_status()
        items = resp.json().get('items', [])
        if not items:
            raise ValueError(f'channel not found: {channel_id}')
        ch = items[0]
        s = ch['statistics']
        return {
            'name': ch['snippet']['title'],
            'subs': int(s.get('subscriberCount', 0)),
            'views': int(s.get('viewCount', 0)),
            'videos': int(s.get('videoCount', 0)),
            'uploads': ch['contentDetails']['relatedPlaylists']['uploads']
        }
Enter fullscreen mode Exit fullscreen mode

Simple Sentiment Analysis

import re

def analyze_sentiment(comments):
    """Classify comments as positive/negative/neutral via keyword matching.

    A comment counts as positive when it contains more distinct words from
    the positive lexicon than from the negative one, negative in the
    reverse case, and neutral on a tie (including zero matches). Returns
    a dict of counts keyed by 'positive', 'negative', 'neutral'.
    """
    positive = {'love', 'great', 'awesome', 'amazing', 'excellent', 'best'}
    negative = {'hate', 'bad', 'worst', 'terrible', 'boring', 'awful'}
    tally = {'positive': 0, 'negative': 0, 'neutral': 0}
    for comment in comments:
        # Distinct lowercase word tokens; duplicates don't add weight.
        tokens = set(re.findall(r'\w+', comment['text'].lower()))
        pos_hits = len(tokens & positive)
        neg_hits = len(tokens & negative)
        if pos_hits > neg_hits:
            label = 'positive'
        elif neg_hits > pos_hits:
            label = 'negative'
        else:
            label = 'neutral'
        tally[label] += 1
    return tally
Enter fullscreen mode Exit fullscreen mode

Scaling

When you need to go beyond API quotas, ScraperAPI handles anti-bot protection. ThorData provides residential proxies, and ScrapeOps monitors your infrastructure.

Conclusion

Start with the official API for structured data, complement with web scraping when needed, and always respect rate limits.

Top comments (0)