IMDb remains the go-to database for movie and TV information. With over 10 million titles and hundreds of millions of ratings, it's a goldmine for data projects — recommendation engines, market analysis, sentiment tracking, and more.
In this guide, I'll show you how to scrape IMDb effectively in 2026 with working Python code.
IMDb's Structure in 2026
IMDb is still primarily server-rendered HTML, which makes it easier to scrape than heavily JavaScript-dependent sites. However, they've added more dynamic loading and anti-bot protections over the years.
Key pages you'll want to scrape:
- Title pages (`/title/tt1234567/`) — movie/show details, ratings, cast
- Search results (`/find/`) — finding titles by name
- Charts (`/chart/`) — top rated, most popular
- Reviews (`/title/tt1234567/reviews`) — user reviews and ratings
- Name pages (`/name/nm1234567/`) — actor/director filmography
Basic Scraping with BeautifulSoup
Let's start with extracting movie details from a title page:
import requests
from bs4 import BeautifulSoup
import json
def scrape_imdb_title(title_id: str) -> dict:
    """Scrape core metadata for one IMDb title page.

    Reads the JSON-LD block IMDb embeds for SEO instead of scraping
    fragile HTML selectors.

    Args:
        title_id: IMDb title identifier, e.g. "tt0111161".

    Returns:
        Dict with title, type, year, rating, rating_count, genres,
        description, directors, actors, duration and content_rating,
        or {} when no JSON-LD block is present (e.g. a blocked page).

    Raises:
        requests.HTTPError: on a non-2xx response.
    """
    url = f"https://www.imdb.com/title/{title_id}/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept-Language": "en-US,en;q=0.9",
    }
    # Always set a timeout: requests has none by default and can hang forever.
    response = requests.get(url, headers=headers, timeout=15)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # IMDb embeds structured data as JSON-LD
    json_ld = soup.find("script", {"type": "application/ld+json"})
    if not json_ld:
        return {}
    data = json.loads(json_ld.string)

    def _as_list(value):
        # JSON-LD fields like "director", "actor" and "genre" may be a
        # single object/string or a list — normalize all to lists.
        if value is None:
            return []
        return value if isinstance(value, list) else [value]

    # `or {}` / `or ""` also guard against keys present with a None value,
    # which would crash the chained .get() / slicing below.
    aggregate = data.get("aggregateRating") or {}
    return {
        "title": data.get("name"),
        "type": data.get("@type"),
        "year": (data.get("datePublished") or "")[:4],
        "rating": aggregate.get("ratingValue"),
        "rating_count": aggregate.get("ratingCount"),
        "genres": _as_list(data.get("genre")),
        "description": data.get("description"),
        "directors": [
            d["name"] for d in _as_list(data.get("director"))
            if isinstance(d, dict) and "name" in d
        ],
        "actors": [
            a["name"] for a in _as_list(data.get("actor"))[:5]
            if isinstance(a, dict) and "name" in a
        ],
        "duration": data.get("duration"),
        "content_rating": data.get("contentRating"),
    }
# Example: pull the metadata for The Shawshank Redemption.
movie = scrape_imdb_title("tt0111161")
pretty = json.dumps(movie, indent=2)
print(pretty)
The JSON-LD approach is robust because IMDb uses it for SEO — they're unlikely to remove it.
Scraping IMDb Search Results
To find movies by keyword or title:
def search_imdb(query: str, max_results: int = 10) -> list:
    """Search IMDb titles by name via the /find/ page.

    Args:
        query: Search text, e.g. "Inception".
        max_results: Maximum number of result dicts to return.

    Returns:
        List of dicts with title_id, title, year and url keys.

    Raises:
        requests.HTTPError: on a non-2xx response.
    """
    url = "https://www.imdb.com/find/"
    params = {
        "q": query,
        "s": "tt",  # Search titles only
        "exact": "true",
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept-Language": "en-US,en;q=0.9",
    }
    response = requests.get(url, params=params, headers=headers, timeout=15)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    results = []
    # IMDb's search results use a specific section structure
    for item in soup.select(".ipc-metadata-list-summary-item")[:max_results]:
        # Attribute values containing "/" must be quoted in CSS selectors;
        # the unquoted form a[href*=/title/] raises SelectorSyntaxError.
        link = item.select_one('a[href*="/title/"]')
        if not link:
            continue
        href = link["href"]
        title_id = href.split("/title/")[1].split("/")[0]
        year_el = item.select_one(".ipc-metadata-list-summary-item__li")
        results.append({
            "title_id": title_id,
            "title": link.get_text(strip=True),
            "year": year_el.get_text(strip=True) if year_el else None,
            "url": f"https://www.imdb.com/title/{title_id}/",
        })
    return results
# Example: print the search hits for "Inception".
results = search_imdb("Inception")
for r in results:
    # Use single quotes for the keys: nested double quotes inside an
    # f-string are a SyntaxError before Python 3.12.
    print(f"{r['title']} ({r['year']}) — {r['url']}")
Extracting Ratings and Reviews
User reviews are valuable for sentiment analysis:
import time
import random
def scrape_imdb_reviews(title_id: str, max_reviews: int = 25) -> list:
    """Scrape user reviews from a title's /reviews page.

    Args:
        title_id: IMDb title identifier, e.g. "tt0111161".
        max_reviews: Maximum number of reviews to return.

    Returns:
        List of dicts with review_title, content, rating, date and
        helpful keys; each field is None when missing from the page.

    Raises:
        requests.HTTPError: on a non-2xx response.
    """
    url = f"https://www.imdb.com/title/{title_id}/reviews"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept-Language": "en-US,en;q=0.9",
    }
    response = requests.get(url, headers=headers, timeout=15)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    reviews = []
    for review in soup.select(".review-container")[:max_reviews]:
        title_el = review.select_one(".title")
        content_el = review.select_one(".text")
        rating_el = review.select_one(".rating-other-user-rating span")
        date_el = review.select_one(".review-date")
        helpful_el = review.select_one(".actions")
        # Guard the int() cast — the rating span can hold non-numeric text.
        rating_text = rating_el.get_text(strip=True) if rating_el else ""
        reviews.append({
            "review_title": title_el.get_text(strip=True) if title_el else None,
            "content": content_el.get_text(strip=True) if content_el else None,
            "rating": int(rating_text) if rating_text.isdigit() else None,
            "date": date_el.get_text(strip=True) if date_el else None,
            "helpful": helpful_el.get_text(strip=True) if helpful_el else None,
        })
    return reviews
# Example: show the first three reviews of The Shawshank Redemption.
reviews = scrape_imdb_reviews("tt0111161")
for r in reviews[:3]:
    # Single-quoted keys: nested double quotes inside an f-string are a
    # SyntaxError before Python 3.12.
    print(f"★{r['rating']}/10 — {r['review_title']}")
    # `or ''` guards against a review with no extracted body text.
    print(f" {(r['content'] or '')[:100]}...")
    print()
Scraping Top Charts
IMDb's charts (Top 250, Most Popular) are great for building recommendation datasets:
def scrape_top_250() -> list:
    """Scrape IMDb's Top 250 chart from its JSON-LD structured data.

    Returns:
        List of dicts with rank, title, url, rating, rating_count and
        description keys; [] when the JSON-LD block is absent.

    Raises:
        requests.HTTPError: on a non-2xx response.
    """
    url = "https://www.imdb.com/chart/top/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept-Language": "en-US,en;q=0.9",
    }
    response = requests.get(url, headers=headers, timeout=15)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Extract from JSON-LD structured data
    json_ld = soup.find("script", {"type": "application/ld+json"})
    if not json_ld:
        return []
    data = json.loads(json_ld.string)
    movies = []
    for item in data.get("itemListElement", []):
        # `or {}` also covers keys present with an explicit None value.
        movie = item.get("item") or {}
        aggregate = movie.get("aggregateRating") or {}
        movies.append({
            "rank": item.get("position"),
            "title": movie.get("name"),
            "url": movie.get("url"),
            "rating": aggregate.get("ratingValue"),
            "rating_count": aggregate.get("ratingCount"),
            "description": movie.get("description"),
        })
    return movies
# Example: print the current top ten.
top_movies = scrape_top_250()
for m in top_movies[:10]:
    # Single-quoted keys: nested double quotes inside an f-string are a
    # SyntaxError before Python 3.12.
    print(f"#{m['rank']} {m['title']} — ★{m['rating']}")
Handling Anti-Bot Protection
IMDb has gotten stricter about automated access. Here's what you need:
Proxy Rotation
Scraping IMDb at scale without proxies will get your IP blocked. I recommend ScrapeOps — they provide a proxy API specifically optimized for web scraping. You just route your requests through their endpoint:
def get_with_scrapeops(url: str, api_key: str) -> str:
    """Fetch *url* through the ScrapeOps proxy API and return the HTML.

    Args:
        url: Target page to fetch.
        api_key: Your ScrapeOps API key.

    Returns:
        The response body as text.

    Raises:
        requests.HTTPError: on a non-2xx response from the proxy.
    """
    response = requests.get(
        "https://proxy.scrapeops.io/v1/",
        params={
            "api_key": api_key,
            "url": url,
            "render_js": "false",
        },
        # Proxy round-trips are slower than direct fetches — allow headroom,
        # but never wait forever (requests has no default timeout).
        timeout=60,
    )
    # Fail loudly instead of silently returning an error page's body.
    response.raise_for_status()
    return response.text
# Example: fetch a title page through the proxy endpoint.
html = get_with_scrapeops(
    url="https://www.imdb.com/title/tt0111161/",
    api_key="YOUR_SCRAPEOPS_KEY",
)
For higher volume scraping, ThorData residential proxies give you a rotating pool of real residential IPs:
# Define headers here too — this snippet runs standalone, and `headers`
# was previously only a local variable inside the functions above.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Accept-Language": "en-US,en;q=0.9",
}
# Route both schemes through the same rotating residential endpoint.
proxies = {
    "http": "http://user:pass@proxy.thordata.com:9090",
    "https": "http://user:pass@proxy.thordata.com:9090",
}
response = requests.get(
    "https://www.imdb.com/chart/top/",
    headers=headers,
    proxies=proxies,
    timeout=30,  # proxied requests are slower; still never wait forever
)
Rate Limiting
Always add delays between requests:
import time
import random
def polite_request(url: str, headers: dict, *, timeout: float = 15) -> requests.Response:
    """GET *url* after a random 2-5 second pause.

    The jittered delay keeps request spacing irregular so the scraper
    doesn't hammer the server at a fixed cadence.

    Args:
        url: Page to fetch.
        headers: Request headers (User-Agent etc.).
        timeout: Per-request timeout in seconds (requests has no default,
            so an unset timeout can hang forever).

    Returns:
        The requests.Response object.
    """
    time.sleep(random.uniform(2, 5))  # Random delay
    return requests.get(url, headers=headers, timeout=timeout)
Building a Complete Movie Dataset
Here's how to combine everything into a dataset builder:
import csv
import time
import random
def build_movie_dataset(title_ids: list, output_file: str):
    """Scrape each title in *title_ids* and write the results to a CSV.

    Failures on individual titles are printed and skipped so one bad page
    doesn't abort the whole run.

    Args:
        title_ids: IMDb title ids, e.g. ["tt0111161", ...].
        output_file: Path of the CSV file to create.
    """
    fieldnames = [
        "title_id", "title", "year", "rating", "rating_count",
        "genres", "directors", "actors", "duration", "content_rating",
    ]
    # encoding="utf-8" keeps non-ASCII titles intact on every platform.
    with open(output_file, "w", newline="", encoding="utf-8") as f:
        # scrape_imdb_title also returns "type" and "description", which are
        # not in fieldnames; without extrasaction="ignore", DictWriter
        # raises ValueError on the very first row.
        writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction="ignore")
        writer.writeheader()
        for title_id in title_ids:
            try:
                movie = scrape_imdb_title(title_id)
                if movie:
                    movie["title_id"] = title_id
                    # Flatten list fields so they fit a single CSV cell.
                    movie["genres"] = ", ".join(movie.get("genres", []))
                    movie["directors"] = ", ".join(movie.get("directors", []))
                    movie["actors"] = ", ".join(movie.get("actors", []))
                    writer.writerow(movie)
                    # Single-quoted key: nested double quotes inside an
                    # f-string are a SyntaxError before Python 3.12.
                    print(f"✓ {movie.get('title', title_id)}")
                time.sleep(random.uniform(2, 4))
            except Exception as e:
                print(f"✗ {title_id}: {e}")
                continue
# Build a dataset from the first 50 entries of the Top 250.
top_movies = scrape_top_250()
title_ids = []
for m in top_movies[:50]:
    # Turn ".../title/tt0111161/" back into the bare "tt0111161" id.
    title_ids.append(m["url"].split("/title/")[1].rstrip("/"))
build_movie_dataset(title_ids, "imdb_top50.csv")
The Easy Way: Pre-Built IMDb Scraper
Maintaining an IMDb scraper means keeping up with their HTML changes. If you need reliable data extraction without the maintenance, check out the IMDb Scraper on Apify. It handles all the parsing, proxy rotation, and anti-bot detection automatically.
from apify_client import ApifyClient

# Run the hosted scraper and stream its dataset back as clean JSON items.
client = ApifyClient("YOUR_APIFY_TOKEN")
run = client.actor("cryptosignals/imdb-scraper").call(
    run_input={
        "searchTerms": ["inception", "interstellar"],
        "maxItems": 100,
        "includeReviews": True,
    }
)
for item in client.dataset(run["defaultDatasetId"]).iterate_items():
    # Single-quoted keys: nested double quotes inside an f-string are a
    # SyntaxError before Python 3.12.
    print(f"{item['title']} ({item['year']}) — ★{item['rating']}")
It returns clean JSON with all the fields you need — no HTML parsing required.
Legal Considerations
A few things to keep in mind:
- Respect robots.txt — IMDb's robots.txt restricts certain paths. Check it before scraping.
- Rate limit your requests — don't hammer their servers. 1 request per 2-5 seconds is reasonable.
- Don't redistribute copyrighted content — movie descriptions and reviews have copyright protections.
- Use data for analysis, not replication — building a competing movie database from scraped data could be problematic.
- IMDb has an official API — for commercial use, consider their data licensing options.
Conclusion
IMDb is one of the more scraper-friendly sites thanks to its server-rendered HTML and rich JSON-LD data. The BeautifulSoup examples above should handle most use cases.
For production scraping, use ScrapeOps or ThorData proxies to avoid IP blocks. And if you want to skip the code entirely, the IMDb Scraper on Apify does everything out of the box.
Happy scraping!
Top comments (0)