IMDB is the world's most comprehensive movie database with data on millions of titles. Whether you're building a recommendation engine, analyzing box office trends, or creating a movie app, you need IMDB data. Let's compare scraping vs the official API and build working examples of both.
IMDB Data Sources in 2026
1. IMDB Datasets (Free, Official)
IMDB offers free TSV datasets at datasets.imdbws.com (licensed for personal and non-commercial use) with basic title info, ratings, names, and crew data. The files are refreshed daily.
2. IMDB API (Paid)
The official IMDB API (via AWS Data Exchange) provides structured data but requires a paid subscription.
3. Web Scraping (Free, Unofficial)
Scraping IMDB directly gives you the richest data but requires maintenance.
Approach 1: IMDB Free Datasets
import pandas as pd
import gzip
import urllib.request
def download_imdb_dataset(dataset_name, *, base_url="https://datasets.imdbws.com", dest_dir=None):
    """Download and parse an IMDB dataset.

    Args:
        dataset_name: Dataset file stem, e.g. ``"title.basics"``. The
            ``.tsv.gz`` suffix is appended automatically.
        base_url: Location the ``.tsv.gz`` files are served from. Defaults to
            IMDB's public dataset host; any URL ``urllib`` can open works,
            such as a mirror or a ``file://`` directory.
        dest_dir: Directory the compressed file is saved into. Defaults to
            the platform's temporary directory.

    Returns:
        A pandas DataFrame with one row per record, where IMDB's ``\\N``
        placeholder is read as a missing value.

    Raises:
        urllib.error.URLError: If the file cannot be fetched.
    """
    # Function-scope imports keep this snippet runnable when copied alone.
    import csv
    import os
    import tempfile

    file_name = f"{dataset_name}.tsv.gz"
    url = f"{base_url.rstrip('/')}/{file_name}"
    target = os.path.join(dest_dir or tempfile.gettempdir(), file_name)

    print(f"Downloading {dataset_name}...")
    filepath, _ = urllib.request.urlretrieve(url, target)

    print("Parsing...")
    df = pd.read_csv(
        filepath,
        sep="\t",
        na_values="\\N",
        # IMDB never quotes fields, and some titles contain a stray '"'.
        # Default quote handling would swallow the following rows into one
        # field, so quote characters are treated as ordinary text.
        quoting=csv.QUOTE_NONE,
        low_memory=False,
    )
    print(f"Loaded {len(df)} records")
    return df
# Fetch the two tables needed for a ratings leaderboard
titles = download_imdb_dataset("title.basics")
ratings = download_imdb_dataset("title.ratings")

# Keep feature films only and attach each one's rating row
movies = pd.merge(
    titles.loc[titles["titleType"].eq("movie")],
    ratings,
    how="inner",
    on="tconst",
)

# Leaderboard: the 20 best-rated films with at least 50k votes
top_movies = movies.loc[movies["numVotes"].ge(50000)].nlargest(
    n=20, columns="averageRating"
)
leaderboard_columns = ["primaryTitle", "startYear", "averageRating", "numVotes"]
print(top_movies[leaderboard_columns])
Approach 2: Web Scraping for Rich Data
The datasets lack reviews, box office data, and detailed cast info. Scraping fills those gaps:
import requests
from bs4 import BeautifulSoup
import json
import time
def scrape_imdb_movie(imdb_id, timeout=10):
    """Scrape detailed movie info from IMDB.

    Args:
        imdb_id: IMDB title identifier, e.g. ``"tt1375666"``.
        timeout: Seconds to wait for IMDB before giving up. Without a
            timeout, ``requests`` can block indefinitely.

    Returns:
        A dict of whatever could be extracted. JSON-LD fields (``title``,
        ``description``, ``rating``, ``vote_count``, ``genres``,
        ``director``, ``duration``, ``date_published``, ``content_rating``)
        are present when the page embeds a JSON-LD object. Box office
        entries are added under keys derived from their on-page labels
        (lower-cased, spaces replaced by underscores).

    Raises:
        requests.HTTPError: If IMDB answers with a 4xx/5xx status, for
            example when the request is blocked.
        requests.RequestException: On connection errors or timeouts.
        json.JSONDecodeError: If the embedded JSON-LD is malformed.
    """
    url = f"https://www.imdb.com/title/{imdb_id}/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9",
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on a block/error page instead of returning an empty dict.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    movie = {}

    # Extract JSON-LD structured data (most reliable)
    script = soup.find("script", type="application/ld+json")
    # .string is None for an empty tag; json.loads(None) would raise TypeError.
    raw_json = script.string if script else None
    ld_data = json.loads(raw_json) if raw_json else None
    if isinstance(ld_data, dict):
        # "aggregateRating" can be absent or null for unrated titles.
        rating_info = ld_data.get("aggregateRating") or {}
        movie["title"] = ld_data.get("name")
        movie["description"] = ld_data.get("description")
        movie["rating"] = rating_info.get("ratingValue")
        movie["vote_count"] = rating_info.get("ratingCount")
        movie["genres"] = ld_data.get("genre", [])
        movie["director"] = ld_data.get("director", [{}])
        movie["duration"] = ld_data.get("duration")
        movie["date_published"] = ld_data.get("datePublished")
        movie["content_rating"] = ld_data.get("contentRating")

    # Box office data
    box_office_section = soup.find("div", {"data-testid": "title-boxoffice-section"})
    if box_office_section:
        for item in box_office_section.find_all("li"):
            label = item.find("span", class_="ipc-metadata-list-item__label")
            value = item.find("span", class_="ipc-metadata-list-item__list-content-item")
            if label and value:
                key = label.get_text(strip=True).lower().replace(" ", "_")
                movie[key] = value.get_text(strip=True)

    return movie
# Example usage: look up Inception by its IMDB title ID and pretty-print it
movie = scrape_imdb_movie("tt1375666")
pretty_movie = json.dumps(movie, indent=2)
print(pretty_movie)
Scraping Top 250 Movies
def scrape_imdb_top250(timeout=10):
    """Scrape the IMDB Top 250 list.

    Args:
        timeout: Seconds to wait for IMDB before giving up. Without a
            timeout, ``requests`` can block indefinitely.

    Returns:
        A list of dicts with the keys ``rank``, ``title``, ``url``,
        ``rating``, ``votes`` and ``description``, in the order the entries
        appear in the page's JSON-LD. The list is empty when the page
        carries no usable JSON-LD object.

    Raises:
        requests.HTTPError: If IMDB answers with a 4xx/5xx status, for
            example when the request is blocked.
        requests.RequestException: On connection errors or timeouts.
        json.JSONDecodeError: If the embedded JSON-LD is malformed.
    """
    url = "https://www.imdb.com/chart/top/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9",
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on a block/error page instead of returning an empty list.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Find JSON-LD data
    script = soup.find("script", type="application/ld+json")
    # .string is None for an empty tag; json.loads(None) would raise TypeError.
    raw_json = script.string if script else None
    if not raw_json:
        return []

    data = json.loads(raw_json)
    if not isinstance(data, dict):
        return []

    movies = []
    for item in data.get("itemListElement") or []:
        movie = item.get("item") or {}
        # "aggregateRating" can be absent or null for an entry.
        rating_info = movie.get("aggregateRating") or {}
        movies.append({
            "rank": item.get("position"),
            "title": movie.get("name"),
            "url": movie.get("url"),
            "rating": rating_info.get("ratingValue"),
            "votes": rating_info.get("ratingCount"),
            "description": movie.get("description"),
        })
    return movies
# Fetch the chart once, then show only the ten highest-ranked entries
top250 = scrape_imdb_top250()
first_ten = top250[:10]
for m in first_ten:
    summary_line = f"#{m['rank']} {m['title']} — {m['rating']}/10 ({m['votes']} votes)"
    print(summary_line)
Extracting Reviews
def scrape_imdb_reviews(imdb_id, max_reviews=25, timeout=10):
    """Scrape user reviews for a movie.

    Args:
        imdb_id: IMDB title identifier, e.g. ``"tt1375666"``.
        max_reviews: Upper bound on the number of reviews returned. Only
            the first page of reviews is fetched, so fewer may come back.
        timeout: Seconds to wait for IMDB before giving up. Without a
            timeout, ``requests`` can block indefinitely.

    Returns:
        A list of dicts, each with the keys ``title``, ``rating``,
        ``content`` and ``date``. A value is ``None`` when the matching
        element is missing from the review.

    Raises:
        requests.HTTPError: If IMDB answers with a 4xx/5xx status, for
            example when the request is blocked.
        requests.RequestException: On connection errors or timeouts.
    """
    url = f"https://www.imdb.com/title/{imdb_id}/reviews/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36",
        # Same language preference as the other scrapers in this file.
        "Accept-Language": "en-US,en;q=0.9",
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on a block/error page instead of returning an empty list.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    def text_or_none(element):
        # Collapse the repeated "text if found else None" pattern.
        return element.get_text(strip=True) if element else None

    reviews = []
    review_containers = soup.find_all("div", class_="review-container")
    for container in review_containers[:max_reviews]:
        rating_el = container.find("span", class_="rating-other-user-rating")
        # Many reviews carry no star rating; the inner span may be absent too.
        score_el = rating_el.find("span") if rating_el else None
        reviews.append({
            "title": text_or_none(container.find("a", class_="title")),
            # Always present, so every review dict has the same keys.
            "rating": text_or_none(score_el),
            "content": text_or_none(container.find("div", class_="text")),
            "date": text_or_none(container.find("span", class_="review-date")),
        })
    return reviews
Pre-Built Alternative
For production IMDB data extraction without maintaining scrapers, check out the IMDB Scraper on Apify. It handles anti-bot measures, pagination, and outputs structured JSON ready for analysis.
Comparison Table: Datasets vs Scraping vs API
| Feature | Free Datasets | Web Scraping | Official API |
|---|---|---|---|
| Cost | Free | Free + proxy costs | Paid subscription |
| Data freshness | Daily updates | Real-time | Real-time |
| Reviews | No | Yes | Yes |
| Box office | No | Yes | Yes |
| Cast photos | No | Yes | Yes |
| Rate limits | None | Aggressive | Quota-based |
| Maintenance | None | High | Low |
| Legal risk | None | Gray area | None |
Proxy Management
IMDB actively blocks scraping bots. For reliable access, use residential proxies from ThorData, which provides rotating IPs that avoid detection.
Conclusion
For most projects, start with IMDB's free datasets for bulk data. Add web scraping for reviews, box office, and details not in the datasets. Use the official API only if your budget supports it and you need guaranteed uptime.
Top comments (0)