YouTube Data: What's Available and How to Get It
YouTube is the world's second-largest search engine with over 800 million videos. Whether you're building a content analytics tool, tracking brand mentions, analyzing comment sentiment, or researching trending topics, YouTube data is essential.
This guide covers the YouTube Data API, web scraping techniques, and practical code for extracting videos, comments, and channel statistics.
Method 1: YouTube Data API v3
The official API is the most reliable starting point. It's free (with quota limits of 10,000 units/day).
Setup
pip install google-api-python-client
from googleapiclient.discovery import build
import json
# Credential for the YouTube Data API v3.
API_KEY = 'YOUR_YOUTUBE_API_KEY' # Get from console.cloud.google.com
# Module-level API client shared by every helper function below.
youtube = build('youtube', 'v3', developerKey=API_KEY)
def search_videos(query, max_results=50):
    """Search YouTube for videos matching *query*.

    Follows pagination until *max_results* videos are collected or the
    API runs out of pages. Each result is a dict with video_id, title,
    description, channel, channel_id, published_at, and thumbnail keys.
    """
    collected = []
    # maxResults is capped at 50 per page by the API.
    page_request = youtube.search().list(
        part='snippet',
        q=query,
        maxResults=min(max_results, 50),
        type='video',
        order='relevance',
    )
    while page_request and len(collected) < max_results:
        page = page_request.execute()
        for entry in page['items']:
            snippet = entry['snippet']
            collected.append({
                'video_id': entry['id']['videoId'],
                'title': snippet['title'],
                'description': snippet['description'],
                'channel': snippet['channelTitle'],
                'channel_id': snippet['channelId'],
                'published_at': snippet['publishedAt'],
                'thumbnail': snippet['thumbnails']['high']['url'],
            })
        # list_next returns None once the final page has been fetched.
        page_request = youtube.search().list_next(page_request, page)
    return collected[:max_results]
# Demo: run a search and show the first few hits.
results = search_videos('python web scraping tutorial', max_results=20)
for video in results[:5]:
    print(f"{video['title']} - {video['channel']}")
Get Video Statistics
def get_video_stats(video_ids):
    """Fetch view/like/comment counts plus duration for the given video IDs.

    Requests are batched because the videos.list endpoint accepts at most
    50 comma-separated IDs per call. Count fields missing from the API
    response (e.g. hidden like counts) default to 0.
    """
    BATCH_SIZE = 50  # API accepts up to 50 IDs per request
    results = []
    for start in range(0, len(video_ids), BATCH_SIZE):
        chunk = video_ids[start:start + BATCH_SIZE]
        response = youtube.videos().list(
            part='statistics,contentDetails,snippet',
            id=','.join(chunk),
        ).execute()
        for entry in response['items']:
            counters = entry['statistics']
            snippet = entry['snippet']
            results.append({
                'video_id': entry['id'],
                'title': snippet['title'],
                'views': int(counters.get('viewCount', 0)),
                'likes': int(counters.get('likeCount', 0)),
                'comments': int(counters.get('commentCount', 0)),
                'duration': entry['contentDetails']['duration'],
                'published': snippet['publishedAt'],
            })
    return results
# Demo: pull stats for the search results and rank them by view count.
ids = [video['video_id'] for video in results]
stats = get_video_stats(ids)
most_viewed = sorted(stats, key=lambda row: row['views'], reverse=True)
for s in most_viewed[:5]:
    print(f"{s['title']}: {s['views']:,} views, {s['likes']:,} likes")
Extract Comments
def get_video_comments(video_id, max_comments=200):
    """Collect up to *max_comments* top-level comments for *video_id*.

    Best-effort: videos with comments disabled raise from execute(), which
    is reported and treated as "no more comments" rather than propagated.
    """
    gathered = []
    page_request = youtube.commentThreads().list(
        part='snippet',
        videoId=video_id,
        maxResults=100,
        order='relevance',
    )
    while page_request and len(gathered) < max_comments:
        try:
            page = page_request.execute()
        except Exception as e:
            # Deliberately broad: comments disabled surfaces as an API error.
            print(f'Comments disabled or error: {e}')
            break
        for thread in page['items']:
            top = thread['snippet']['topLevelComment']['snippet']
            gathered.append({
                'author': top['authorDisplayName'],
                'text': top['textDisplay'],
                'likes': top['likeCount'],
                'published': top['publishedAt'],
                'reply_count': thread['snippet']['totalReplyCount'],
            })
        page_request = youtube.commentThreads().list_next(page_request, page)
    return gathered[:max_comments]
# Demo: fetch comments from the first search result.
comments = get_video_comments(results[0]['video_id'])
print(f'Got {len(comments)} comments')
for entry in comments[:3]:
    print(f" {entry['author']}: {entry['text'][:80]}... ({entry['likes']} likes)")
Channel Analytics
def get_channel_stats(channel_id):
    """Return summary statistics for one channel, or None if it doesn't exist.

    The returned dict includes the channel's uploads playlist ID, which can
    be fed to get_channel_videos() to enumerate the channel's videos.
    """
    response = youtube.channels().list(
        part='statistics,snippet,contentDetails',
        id=channel_id,
    ).execute()
    items = response['items']
    if not items:
        return None
    channel = items[0]
    snippet = channel['snippet']
    counters = channel['statistics']
    return {
        'name': snippet['title'],
        'description': snippet['description'][:200],
        'subscribers': int(counters.get('subscriberCount', 0)),
        'total_views': int(counters.get('viewCount', 0)),
        'video_count': int(counters.get('videoCount', 0)),
        'created': snippet['publishedAt'],
        'playlist_id': channel['contentDetails']['relatedPlaylists']['uploads'],
    }
def get_channel_videos(playlist_id, max_videos=100):
    """List up to *max_videos* entries from a channel's uploads playlist.

    *playlist_id* is the uploads playlist returned by get_channel_stats().
    """
    collected = []
    page_request = youtube.playlistItems().list(
        part='snippet',
        playlistId=playlist_id,
        maxResults=50,
    )
    while page_request and len(collected) < max_videos:
        page = page_request.execute()
        collected.extend(
            {
                'video_id': item['snippet']['resourceId']['videoId'],
                'title': item['snippet']['title'],
                'published': item['snippet']['publishedAt'],
            }
            for item in page['items']
        )
        page_request = youtube.playlistItems().list_next(page_request, page)
    return collected[:max_videos]
Method 2: Web Scraping (When API Isn't Enough)
The API has daily quotas (10K units). For large-scale data collection, scraping supplements the API:
import requests
from bs4 import BeautifulSoup
import re
import json
def scrape_youtube_search(query, max_results=20):
    """Scrape the YouTube search-results page for *query* without the API.

    Parses the ytInitialData JSON blob that YouTube embeds in a <script>
    tag. Returns a list of dicts (video_id, title, views, url); returns an
    empty list when the blob cannot be located (markup change, bot wall).

    Raises requests.HTTPError on a non-2xx response.
    """
    # Fix: the original replaced only spaces with '+', so queries containing
    # '&', '#', '?' or non-ASCII characters produced a broken URL.
    from urllib.parse import quote_plus
    url = f'https://www.youtube.com/results?search_query={quote_plus(query)}'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept-Language': 'en-US,en;q=0.9',
    }
    # Fix: timeout prevents an indefinite hang; raise_for_status avoids
    # silently parsing an error/consent page as search results.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    # YouTube embeds data in a script tag
    for script in soup.find_all('script'):
        text = script.string or ''
        if 'var ytInitialData' not in text:
            continue
        # Fix: strip whitespace before removing the trailing ';' — the
        # original rstrip(';') fails when a newline follows the semicolon.
        json_text = text.split('var ytInitialData = ', 1)[1].strip().rstrip(';')
        data = json.loads(json_text)
        # Navigate the nested structure (layout observed as of writing;
        # NOTE(review): YouTube changes this markup without notice).
        contents = (data
            .get('contents', {})
            .get('twoColumnSearchResultsRenderer', {})
            .get('primaryContents', {})
            .get('sectionListRenderer', {})
            .get('contents', [{}])[0]
            .get('itemSectionRenderer', {})
            .get('contents', []))
        videos = []
        for item in contents:
            renderer = item.get('videoRenderer', {})
            if not renderer:
                continue  # ads, shelves, and other non-video items
            video_id = renderer.get('videoId', '')
            videos.append({
                'video_id': video_id,
                'title': renderer.get('title', {}).get('runs', [{}])[0].get('text', ''),
                'views': renderer.get('viewCountText', {}).get('simpleText', '0'),
                'url': f'https://youtube.com/watch?v={video_id}',
            })
        return videos[:max_results]
    return []
Building a YouTube Analytics Pipeline
import pandas as pd
import sqlite3
class YouTubeAnalytics:
    """Pipeline that stores YouTube video statistics in a local SQLite DB.

    Relies on the module-level search_videos()/get_video_stats() helpers
    (NOTE(review): those use the module-level `youtube` client, not the
    instance's self.youtube — verify which key should win).
    """

    def __init__(self, api_key, db_path='youtube_data.db'):
        self.youtube = build('youtube', 'v3', developerKey=api_key)
        self.conn = sqlite3.connect(db_path)
        self.setup_db()

    def setup_db(self):
        """Create the videos and channels tables if they don't exist."""
        self.conn.executescript('''
        CREATE TABLE IF NOT EXISTS videos (
        video_id TEXT PRIMARY KEY, title TEXT, channel TEXT,
        views INTEGER, likes INTEGER, comments INTEGER,
        published TEXT, collected_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        );
        CREATE TABLE IF NOT EXISTS channels (
        channel_id TEXT PRIMARY KEY, name TEXT,
        subscribers INTEGER, total_views INTEGER, video_count INTEGER,
        collected_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        );
        ''')

    def track_topic(self, query, max_videos=50):
        """Search *query*, fetch per-video stats, and upsert them.

        Returns the list of stats dicts that were written.
        """
        videos = search_videos(query, max_videos)
        ids = [v['video_id'] for v in videos]
        stats = get_video_stats(ids)
        rows = [(s['video_id'], s['title'], s['views'], s['likes'],
                 s['comments'], s['published']) for s in stats]
        # Fix: batch all rows in one executemany instead of one execute
        # (and one round-trip) per row.
        self.conn.executemany('''
        INSERT OR REPLACE INTO videos
        (video_id, title, views, likes, comments, published)
        VALUES (?, ?, ?, ?, ?, ?)
        ''', rows)
        self.conn.commit()
        return stats

    def close(self):
        """Release the SQLite connection (fix: it was never closed)."""
        self.conn.close()
# Demo: collect stats for one topic into the local database.
analytics = YouTubeAnalytics('YOUR_API_KEY')
analytics.track_topic('web scraping python 2026')
The Managed Approach
For production-scale YouTube data collection that goes beyond API quotas, the YouTube Scraper on Apify handles large-scale video, comment, and channel data extraction with built-in proxy rotation and anti-bot handling.
Handling API Quotas
YouTube API quota costs:
- Search: 100 units per request
- Videos list: 1 unit per request
- Comments: 1 unit per request
- Channels: 1 unit per request
With 10K daily units, you can make ~100 searches (100 units each) or ~10K videos.list requests — and each of those requests can cover up to 50 video IDs. Plan accordingly.
For high-volume needs, use ScraperAPI to supplement the official API with web scraping that handles YouTube's JavaScript rendering.
Conclusion
YouTube data extraction works best as a hybrid approach — use the official API for structured queries within quota, and supplement with web scraping or managed tools like the YouTube Scraper on Apify for larger-scale needs. For reliable proxy infrastructure, ScraperAPI handles YouTube's anti-bot measures effectively.
Start with the API, hit the quota limits, then decide whether to scrape or use managed infrastructure.
Top comments (0)