DEV Community

agenthustler
agenthustler

Posted on

How to Scrape YouTube Data: Videos, Comments, and Channel Stats in 2026

YouTube Data: What's Available and How to Get It

YouTube is the world's second-largest search engine with over 800 million videos. Whether you're building a content analytics tool, tracking brand mentions, analyzing comment sentiment, or researching trending topics, YouTube data is essential.

This guide covers the YouTube Data API, web scraping techniques, and practical code for extracting videos, comments, and channel statistics.

Method 1: YouTube Data API v3

The official API is the most reliable starting point. It's free (with quota limits of 10,000 units/day).

Setup

pip install google-api-python-client
Enter fullscreen mode Exit fullscreen mode
from googleapiclient.discovery import build
import json

# One shared Data API v3 client; every helper below reuses this global.
API_KEY = 'YOUR_YOUTUBE_API_KEY'  # Get from console.cloud.google.com
youtube = build('youtube', 'v3', developerKey=API_KEY)

def search_videos(query, max_results=50):
    """Search YouTube for videos matching *query* via the Data API.

    Returns at most *max_results* dicts with video_id, title,
    description, channel, channel_id, published_at and thumbnail URL.
    """
    collected = []
    # The API caps maxResults at 50 per page; pagination covers the rest.
    page = youtube.search().list(
        part='snippet',
        q=query,
        maxResults=min(max_results, 50),
        type='video',
        order='relevance',
    )

    # Walk result pages until enough items are gathered or pages run out
    # (list_next returns None after the last page).
    while page and len(collected) < max_results:
        data = page.execute()

        for entry in data['items']:
            snippet = entry['snippet']
            collected.append({
                'video_id': entry['id']['videoId'],
                'title': snippet['title'],
                'description': snippet['description'],
                'channel': snippet['channelTitle'],
                'channel_id': snippet['channelId'],
                'published_at': snippet['publishedAt'],
                'thumbnail': snippet['thumbnails']['high']['url'],
            })

        page = youtube.search().list_next(page, data)

    return collected[:max_results]

# Demo: fetch 20 results and show the first five title/channel pairs.
results = search_videos('python web scraping tutorial', max_results=20)
for v in results[:5]:
    print(f"{v['title']} - {v['channel']}")
Enter fullscreen mode Exit fullscreen mode

Get Video Statistics

def get_video_stats(video_ids):
    """Fetch view/like/comment counts, duration and title for video IDs.

    The videos.list endpoint accepts at most 50 IDs per call, so the
    input list is processed in chunks of 50.
    """
    results = []

    for start in range(0, len(video_ids), 50):
        chunk = video_ids[start:start + 50]
        response = youtube.videos().list(
            part='statistics,contentDetails,snippet',
            id=','.join(chunk),
        ).execute()

        for entry in response['items']:
            counts = entry['statistics']
            results.append({
                'video_id': entry['id'],
                'title': entry['snippet']['title'],
                # Counts may be absent (e.g. hidden like counts) — default to 0.
                'views': int(counts.get('viewCount', 0)),
                'likes': int(counts.get('likeCount', 0)),
                'comments': int(counts.get('commentCount', 0)),
                'duration': entry['contentDetails']['duration'],
                'published': entry['snippet']['publishedAt'],
            })

    return results

# Get stats for search results
# Demo: rank the earlier search results by view count, print the top five.
ids = [v['video_id'] for v in results]
stats = get_video_stats(ids)
for s in sorted(stats, key=lambda x: x['views'], reverse=True)[:5]:
    print(f"{s['title']}: {s['views']:,} views, {s['likes']:,} likes")
Enter fullscreen mode Exit fullscreen mode

Extract Comments

def get_video_comments(video_id, max_comments=200):
    """Collect top-level comment threads for *video_id*, relevance-ordered.

    Returns at most *max_comments* dicts with author, text, likes,
    published timestamp and reply count. Stops early (best-effort) when
    the API errors, e.g. when comments are disabled on the video.
    """
    collected = []

    page = youtube.commentThreads().list(
        part='snippet',
        videoId=video_id,
        maxResults=100,
        order='relevance',
    )

    while page and len(collected) < max_comments:
        try:
            data = page.execute()
        except Exception as e:
            # Deliberately broad: comments disabled / quota errors just
            # end collection instead of crashing the caller.
            print(f'Comments disabled or error: {e}')
            break

        for thread in data['items']:
            top = thread['snippet']['topLevelComment']['snippet']
            collected.append({
                'author': top['authorDisplayName'],
                'text': top['textDisplay'],
                'likes': top['likeCount'],
                'published': top['publishedAt'],
                'reply_count': thread['snippet']['totalReplyCount'],
            })

        page = youtube.commentThreads().list_next(page, data)

    return collected[:max_comments]

# Demo: pull comments from the first search result and preview three.
comments = get_video_comments(results[0]['video_id'])
print(f'Got {len(comments)} comments')
for c in comments[:3]:
    print(f"  {c['author']}: {c['text'][:80]}... ({c['likes']} likes)")
Enter fullscreen mode Exit fullscreen mode

Channel Analytics

def get_channel_stats(channel_id):
    """Return summary statistics for one channel, or None if not found."""
    response = youtube.channels().list(
        part='statistics,snippet,contentDetails',
        id=channel_id,
    ).execute()

    items = response['items']
    if not items:
        return None

    channel = items[0]
    snippet = channel['snippet']
    counts = channel['statistics']

    return {
        'name': snippet['title'],
        'description': snippet['description'][:200],
        # Counts may be hidden by the channel owner — default to 0.
        'subscribers': int(counts.get('subscriberCount', 0)),
        'total_views': int(counts.get('viewCount', 0)),
        'video_count': int(counts.get('videoCount', 0)),
        'created': snippet['publishedAt'],
        # The "uploads" playlist lists the channel's public videos;
        # feed it to get_channel_videos().
        'playlist_id': channel['contentDetails']['relatedPlaylists']['uploads'],
    }

def get_channel_videos(playlist_id, max_videos=100):
    """List videos from a playlist (typically a channel's uploads playlist).

    Returns at most *max_videos* dicts with video_id, title and
    published timestamp.
    """
    found = []
    page = youtube.playlistItems().list(
        part='snippet',
        playlistId=playlist_id,
        maxResults=50,
    )

    # list_next returns None after the final page, ending the loop.
    while page and len(found) < max_videos:
        data = page.execute()
        found.extend(
            {
                'video_id': entry['snippet']['resourceId']['videoId'],
                'title': entry['snippet']['title'],
                'published': entry['snippet']['publishedAt'],
            }
            for entry in data['items']
        )
        page = youtube.playlistItems().list_next(page, data)

    return found[:max_videos]
Enter fullscreen mode Exit fullscreen mode

Method 2: Web Scraping (When API Isn't Enough)

The API has daily quotas (10K units). For large-scale data collection, scraping supplements the API:

import json
import re
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup

def scrape_youtube_search(query, max_results=20):
    """Scrape a YouTube search results page without the Data API.

    Parses the ``ytInitialData`` JSON blob that YouTube embeds in a
    <script> tag on the results page.

    Args:
        query: Search terms (any characters; properly URL-encoded).
        max_results: Cap on the number of videos returned.

    Returns:
        Up to *max_results* dicts with video_id, title, views (display
        text, e.g. "1.2M views") and url. Returns [] when the blob
        cannot be found or parsed.

    Raises:
        requests.HTTPError: on a non-2xx response.
    """
    # quote_plus handles spaces AND special characters (&, ?, unicode),
    # unlike the naive space->'+' replacement.
    url = f'https://www.youtube.com/results?search_query={quote_plus(query)}'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept-Language': 'en-US,en;q=0.9',
    }

    # Fail fast on network stalls / error pages instead of hanging or
    # silently parsing an error body.
    response = requests.get(url, headers=headers, timeout=15)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    marker = 'var ytInitialData = '
    for script in soup.find_all('script'):
        text = script.string
        if not text or marker not in text:
            continue

        # raw_decode stops at the end of the first JSON object, so the
        # trailing ';' and any JS after it are ignored — more robust
        # than rstrip(';'), which breaks on trailing whitespace/code.
        json_text = text.split(marker, 1)[1].lstrip()
        try:
            data, _ = json.JSONDecoder().raw_decode(json_text)
        except json.JSONDecodeError:
            return []

        # Navigate YouTube's deeply nested search-results structure;
        # every step defaults to an empty container so missing keys
        # yield [] rather than a KeyError.
        contents = (data
            .get('contents', {})
            .get('twoColumnSearchResultsRenderer', {})
            .get('primaryContents', {})
            .get('sectionListRenderer', {})
            .get('contents', [{}])[0]
            .get('itemSectionRenderer', {})
            .get('contents', []))

        videos = []
        for item in contents:
            renderer = item.get('videoRenderer', {})
            if not renderer:
                # Non-video items (ads, shelves) are skipped.
                continue
            video_id = renderer.get('videoId', '')
            videos.append({
                'video_id': video_id,
                'title': renderer.get('title', {}).get('runs', [{}])[0].get('text', ''),
                'views': renderer.get('viewCountText', {}).get('simpleText', '0'),
                'url': f'https://youtube.com/watch?v={video_id}',
            })

        return videos[:max_results]

    return []
Enter fullscreen mode Exit fullscreen mode

Building a YouTube Analytics Pipeline

import pandas as pd
import sqlite3

class YouTubeAnalytics:
    """Collect YouTube video statistics into a local SQLite database.

    NOTE(review): track_topic delegates to the module-level
    search_videos / get_video_stats helpers, which use the module-level
    ``youtube`` client rather than ``self.youtube`` — confirm both are
    built with the same API key.

    Supports use as a context manager so the SQLite connection is
    always released (the original leaked it).
    """

    def __init__(self, api_key, db_path='youtube_data.db'):
        self.youtube = build('youtube', 'v3', developerKey=api_key)
        self.conn = sqlite3.connect(db_path)
        self.setup_db()

    def setup_db(self):
        """Create the videos/channels tables if they do not exist yet."""
        self.conn.executescript('''
            CREATE TABLE IF NOT EXISTS videos (
                video_id TEXT PRIMARY KEY, title TEXT, channel TEXT,
                views INTEGER, likes INTEGER, comments INTEGER,
                published TEXT, collected_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            );
            CREATE TABLE IF NOT EXISTS channels (
                channel_id TEXT PRIMARY KEY, name TEXT,
                subscribers INTEGER, total_views INTEGER, video_count INTEGER,
                collected_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            );
        ''')

    def track_topic(self, query, max_videos=50):
        """Search *query*, fetch stats for each hit, and upsert into SQLite.

        Returns the list of stats dicts that were stored.
        """
        videos = search_videos(query, max_videos)
        ids = [v['video_id'] for v in videos]
        stats = get_video_stats(ids)

        # executemany replaces the per-row execute loop: one C-level
        # batch inside a single transaction instead of N round-trips.
        self.conn.executemany('''
            INSERT OR REPLACE INTO videos 
            (video_id, title, views, likes, comments, published)
            VALUES (?, ?, ?, ?, ?, ?)
        ''', [(s['video_id'], s['title'], s['views'], s['likes'],
               s['comments'], s['published']) for s in stats])

        self.conn.commit()
        return stats

    def close(self):
        """Release the SQLite connection; safe to call more than once."""
        self.conn.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        # Guarantee the DB handle is closed even on error paths.
        self.close()

# Demo: collect and store stats for one topic (replace the API key first).
analytics = YouTubeAnalytics('YOUR_API_KEY')
analytics.track_topic('web scraping python 2026')
Enter fullscreen mode Exit fullscreen mode

The Managed Approach

For production-scale YouTube data collection that goes beyond API quotas, the YouTube Scraper on Apify handles large-scale video, comment, and channel data extraction with built-in proxy rotation and anti-bot handling.

Handling API Quotas

YouTube API quota costs:

  • Search: 100 units per request
  • Videos list: 1 unit per request
  • Comments: 1 unit per request
  • Channels: 1 unit per request

With 10K daily units, you can make ~100 searches or ~10K stat lookups. Plan accordingly.

For high-volume needs, use ScraperAPI to supplement the official API with web scraping that handles YouTube's JavaScript rendering.

Conclusion

YouTube data extraction works best as a hybrid approach — use the official API for structured queries within quota, and supplement with web scraping or managed tools like the YouTube Scraper on Apify for larger-scale needs. For reliable proxy infrastructure, ScraperAPI handles YouTube's anti-bot measures effectively.

Start with the API, hit the quota limits, then decide whether to scrape or use managed infrastructure.

Top comments (0)