TikTok's rapid growth makes it a prime target for data analysis. This guide covers practical approaches to collecting TikTok data for research and analytics.
The Challenge with TikTok
TikTok has aggressive anti-scraping measures:
- Heavy JavaScript rendering
- Device fingerprinting
- Encrypted API parameters
- Frequent anti-bot updates
Approach 1: Web Endpoint Data Extraction
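TikTok's web app embeds its page data as JSON inside a `<script id="__UNIVERSAL_DATA_FOR_REHYDRATION__">` tag in the profile page's HTML, which you can extract with a plain HTTP request: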
import requests, re, json, time
class TikTokScraper:
    """Fetch TikTok profile pages over HTTP and extract their embedded JSON.

    The profile HTML is searched for a
    ``<script id="__UNIVERSAL_DATA_FOR_REHYDRATION__">`` tag, whose content
    is parsed as JSON and returned as-is.
    """

    BASE_URL = "https://www.tiktok.com"
    # Seconds to wait for a response; requests applies no timeout by default,
    # so a stalled connection would otherwise block forever.
    DEFAULT_TIMEOUT = 10
    # DOTALL lets ``.`` cross newlines, so a payload that spans several lines
    # is still captured instead of being silently missed.
    _REHYDRATION_RE = re.compile(
        r'<script id="__UNIVERSAL_DATA_FOR_REHYDRATION__"[^>]*>(.*?)</script>',
        re.DOTALL,
    )

    def __init__(self, timeout=DEFAULT_TIMEOUT):
        """Create a session that sends a desktop browser User-Agent and Referer.

        Args:
            timeout: Timeout in seconds passed to every request.
        """
        self.timeout = timeout
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                          "AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36",
            "Referer": "https://www.tiktok.com/",
        })

    def get_user_info(self, username):
        """Return the JSON payload embedded in a user's profile page.

        Args:
            username: TikTok handle without the leading ``@``.

        Returns:
            The decoded JSON object, or ``None`` when the page does not
            contain the rehydration script tag.

        Raises:
            json.JSONDecodeError: If the script tag is present but its
                content is not valid JSON.
        """
        url = f"{self.BASE_URL}/@{username}"
        response = self.session.get(url, timeout=self.timeout)
        match = self._REHYDRATION_RE.search(response.text)
        if match is None:
            return None
        return json.loads(match.group(1))
Approach 2: Playwright for Dynamic Content
from playwright.sync_api import sync_playwright
import time
def scrape_tiktok_profile(username, max_videos=20):
    """Collect video links and descriptions from a TikTok profile page.

    Opens the profile in headless Chromium, scrolls to the bottom three
    times, then reads the elements marked ``data-e2e="user-post-item"``.

    Args:
        username: TikTok handle without the leading ``@``.
        max_videos: Maximum number of post items to read from the page.

    Returns:
        A list of dicts with ``"url"`` and ``"description"`` keys; a value is
        an empty string when the corresponding element is missing.
    """
    videos = []
    with sync_playwright() as pw:
        browser = pw.chromium.launch(headless=True)
        try:
            context = browser.new_context(
                viewport={"width": 1920, "height": 1080},
                user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                           "AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36",
            )
            page = context.new_page()
            page.goto(f"https://www.tiktok.com/@{username}", wait_until="networkidle")
            # wait_for_timeout (milliseconds) is used instead of time.sleep so
            # Playwright keeps processing page events while waiting.
            page.wait_for_timeout(5000)
            for _ in range(3):
                page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                page.wait_for_timeout(2000)
            video_elements = page.query_selector_all('[data-e2e="user-post-item"]')
            for el in video_elements[:max_videos]:
                link = el.query_selector("a")
                desc = el.query_selector('[data-e2e="video-desc"]')
                videos.append({
                    "url": link.get_attribute("href") if link else "",
                    "description": desc.inner_text() if desc else "",
                })
        finally:
            # Close the browser even when navigation or a selector call raises.
            browser.close()
    return videos
Scraping Trending Content
def scrape_trending_hashtags():
    """Read the trending items listed on TikTok's discover page.

    Opens ``https://www.tiktok.com/discover`` in headless Chromium and reads
    every element marked ``data-e2e="trending-item"``.

    Returns:
        A list of dicts with ``"hashtag"`` (text of the item's ``h3``) and
        ``"views"`` (text of the item's first ``span``, or ``"N/A"`` when
        there is none). Items without an ``h3`` are skipped.
    """
    hashtags = []
    with sync_playwright() as pw:
        browser = pw.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            page.goto("https://www.tiktok.com/discover", wait_until="networkidle")
            # wait_for_timeout (milliseconds) is used instead of time.sleep so
            # Playwright keeps processing page events while waiting.
            page.wait_for_timeout(5000)
            elements = page.query_selector_all('[data-e2e="trending-item"]')
            for el in elements:
                name = el.query_selector("h3")
                views = el.query_selector("span")
                if name:
                    hashtags.append({
                        "hashtag": name.inner_text(),
                        "views": views.inner_text() if views else "N/A",
                    })
        finally:
            # Close the browser even when navigation or a selector call raises.
            browser.close()
    return hashtags
Data Storage and Analysis
import csv
from datetime import datetime
from collections import Counter
def save_tiktok_data(videos, filename="tiktok_data.csv"):
    """Write scraped video records to a CSV file, stamping each row.

    Args:
        videos: Sequence of dicts. The ``"url"`` and ``"description"`` keys
            are written; any other keys are ignored, and missing keys are
            written as empty cells.
        filename: Destination path; an existing file is overwritten.

    The input dicts are left unmodified. Each row gets a ``scraped_at``
    column holding the local time, in ISO 8601 format, at which it was written.
    """
    fieldnames = ["url", "description", "scraped_at"]
    with open(filename, "w", newline="", encoding="utf-8") as f:
        # extrasaction="ignore" keeps one record with an unexpected key from
        # aborting the whole export with ValueError.
        writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction="ignore")
        writer.writeheader()
        for video in videos:
            # Build a new row dict so the caller's record is not mutated.
            writer.writerow({**video, "scraped_at": datetime.now().isoformat()})
    print(f"Saved {len(videos)} videos to {filename}")
def analyze_content_themes(videos):
    """Return the 20 most frequent hashtags found in video descriptions.

    Args:
        videos: Iterable of dicts; each one's ``"description"`` value is
            scanned for ``#word`` tokens (a missing key counts as empty text).

    Returns:
        A list of ``(hashtag, count)`` tuples, without the leading ``#``,
        ordered from most to least common.
    """
    import re

    hashtag_pattern = re.compile(r"#(\w+)")
    tally = Counter(
        tag
        for entry in videos
        for tag in hashtag_pattern.findall(entry.get("description", ""))
    )
    return tally.most_common(20)
Handling Anti-Bot Measures
Scraping TikTok at any meaningful scale requires sophisticated proxy rotation. ScraperAPI provides JavaScript rendering with automatic proxy rotation. For residential IPs, ThorData is a solid choice.
Monitor your TikTok scraper's health with ScrapeOps — TikTok changes its defenses frequently, so you'll want immediate alerts when your scraper breaks.
Ethical Considerations
- Only scrape publicly available content
- Never scrape private accounts or DMs
- Respect rate limits
- Don't use scraped data for harassment
- Consider using TikTok's official Research API if you qualify
- Comply with GDPR and CCPA
Conclusion
TikTok scraping is technically challenging but possible with the right tools. Use browser automation for reliability, rotate proxies for sustainability, and always respect both the platform and its users' privacy.
Top comments (0)