YouTube is a goldmine of data for market researchers and content creators. From video metadata to channel performance metrics, there's enormous value in structured YouTube data.
Using the YouTube Data API
import requests
class YouTubeScraper:
    """Search YouTube videos and fetch their statistics via the Data API v3."""

    def __init__(self, api_key):
        # API key from the Google Cloud console; sent with every request.
        self.key = api_key
        self.base = 'https://www.googleapis.com/youtube/v3'

    def search(self, query, max_results=50):
        """Return videos matching ``query`` as a list of dicts.

        Args:
            query: free-text search string.
            max_results: number of results to request; the search endpoint
                caps a single call at 50, so values above that are clamped.

        Returns:
            List of dicts with 'id', 'title', 'channel', 'published' keys.

        Raises:
            requests.HTTPError: on a non-2xx response (bad key, quota
                exhausted) — previously such errors silently produced [].
        """
        resp = requests.get(f'{self.base}/search', params={
            'part': 'snippet', 'q': query, 'type': 'video',
            'maxResults': min(max_results, 50), 'key': self.key
        }, timeout=10)  # avoid hanging forever on a stalled connection
        resp.raise_for_status()
        return [{
            'id': i['id']['videoId'],
            'title': i['snippet']['title'],
            'channel': i['snippet']['channelTitle'],
            'published': i['snippet']['publishedAt']
        } for i in resp.json().get('items', [])]

    def get_stats(self, video_ids):
        """Map each id in ``video_ids`` to its view/like/comment counts.

        Counters the API omits (e.g. hidden like counts) default to 0.
        NOTE(review): the videos endpoint accepts at most 50 ids per call —
        callers with more ids should chunk them; verify against usage.

        Raises:
            requests.HTTPError: on a non-2xx response.
        """
        resp = requests.get(f'{self.base}/videos', params={
            'part': 'statistics', 'id': ','.join(video_ids), 'key': self.key
        }, timeout=10)
        resp.raise_for_status()
        return {
            i['id']: {
                'views': int(i['statistics'].get('viewCount', 0)),
                'likes': int(i['statistics'].get('likeCount', 0)),
                'comments': int(i['statistics'].get('commentCount', 0))
            } for i in resp.json().get('items', [])
        }
Scraping Comments
class CommentScraper:
    """Collect top-level comments on a video via the commentThreads endpoint."""

    def __init__(self, api_key):
        self.key = api_key
        self.base = 'https://www.googleapis.com/youtube/v3'

    def get_comments(self, video_id, max_comments=200):
        """Return up to ``max_comments`` top-level comments for ``video_id``.

        Follows ``nextPageToken`` pagination until enough comments are
        gathered or the API runs out of pages.

        Returns:
            List of dicts with 'author', 'text', and 'likes' keys.

        Raises:
            requests.HTTPError: on a non-2xx response (comments disabled,
                quota exhausted) — previously this silently returned
                whatever had been collected so far, or [].
        """
        comments, page_token = [], None
        while len(comments) < max_comments:
            params = {
                'part': 'snippet', 'videoId': video_id,
                # Only request what we still need (endpoint cap: 100/page);
                # asking for a full page regardless wastes API quota.
                'maxResults': min(100, max_comments - len(comments)),
                'key': self.key
            }
            if page_token:
                params['pageToken'] = page_token
            resp = requests.get(f'{self.base}/commentThreads', params=params,
                                timeout=10)  # never hang on a stalled connection
            resp.raise_for_status()
            data = resp.json()
            for item in data.get('items', []):
                c = item['snippet']['topLevelComment']['snippet']
                comments.append({
                    'author': c['authorDisplayName'],
                    'text': c['textDisplay'],
                    'likes': c['likeCount']
                })
            page_token = data.get('nextPageToken')
            if not page_token:
                break
        return comments[:max_comments]
Channel Analytics
class ChannelAnalyzer:
    """Fetch summary statistics for a single YouTube channel."""

    def __init__(self, api_key):
        self.key = api_key
        self.base = 'https://www.googleapis.com/youtube/v3'

    def get_channel(self, channel_id):
        """Return a summary dict for ``channel_id``.

        Returns:
            Dict with 'name', 'subs', 'views', 'videos', and 'uploads'
            (the id of the channel's uploads playlist).

        Raises:
            ValueError: if the API returns no channel for ``channel_id``
                (previously this surfaced as a cryptic KeyError/IndexError).
            requests.HTTPError: on a non-2xx response.
        """
        resp = requests.get(f'{self.base}/channels', params={
            'part': 'snippet,statistics,contentDetails',
            'id': channel_id, 'key': self.key
        }, timeout=10)  # avoid hanging forever on a stalled connection
        resp.raise_for_status()
        items = resp.json().get('items', [])
        if not items:
            raise ValueError(f'No channel found for id {channel_id!r}')
        ch = items[0]
        s = ch['statistics']
        return {
            'name': ch['snippet']['title'],
            # subscriberCount is absent when the channel hides it; default 0.
            'subs': int(s.get('subscriberCount', 0)),
            'views': int(s.get('viewCount', 0)),
            'videos': int(s.get('videoCount', 0)),
            'uploads': ch['contentDetails']['relatedPlaylists']['uploads']
        }
Simple Sentiment Analysis
import re
def analyze_sentiment(comments, pos_words=None, neg_words=None):
    """Classify each comment as positive, negative, or neutral by keywords.

    A comment is labelled by whichever lexicon it shares more distinct
    words with; ties (including zero matches) count as neutral.

    Args:
        comments: iterable of dicts, each with a 'text' key.
        pos_words: optional set of lowercase positive keywords; defaults
            to a small built-in English lexicon.
        neg_words: optional set of lowercase negative keywords; defaults
            to a small built-in English lexicon.

    Returns:
        Dict with 'positive', 'negative', and 'neutral' counts.
    """
    if pos_words is None:
        pos_words = {'love', 'great', 'awesome', 'amazing', 'excellent', 'best'}
    if neg_words is None:
        neg_words = {'hate', 'bad', 'worst', 'terrible', 'boring', 'awful'}
    results = {'positive': 0, 'negative': 0, 'neutral': 0}
    for c in comments:
        # Distinct words only: repeating a keyword doesn't change the label.
        words = set(re.findall(r'\w+', c['text'].lower()))
        p, n = len(words & pos_words), len(words & neg_words)
        if p > n:
            results['positive'] += 1
        elif n > p:
            results['negative'] += 1
        else:
            results['neutral'] += 1
    return results
Scaling
When you need to go beyond API quotas, third-party services can fill the gap: ScraperAPI handles anti-bot protection, ThorData provides residential proxies, and ScrapeOps monitors your scraping infrastructure.
Conclusion
Start with the official API for structured data, complement with web scraping when needed, and always respect rate limits.
Top comments (0)