How to Scrape IMDb: Movies, Ratings, and Box Office Data
IMDb is the definitive database for movies, TV shows, and entertainment data. Whether you are building a recommendation engine, analyzing box office trends, or researching film history, IMDb data is invaluable.
What Data Is Available?
- Movie details: title, year, runtime, genre, plot
- Ratings: IMDb score, number of votes, Metascore
- Box office: domestic and worldwide gross
- Cast and crew: directors, writers, actors
- Release info: dates, countries, languages
Setup
pip install requests beautifulsoup4 pandas cinemagoer
Method 1: Using Cinemagoer (IMDbPY)
Cinemagoer (formerly IMDbPY) is a community-maintained, open-source Python package for accessing IMDb data; it is not an official IMDb product:
from imdb import Cinemagoer
# Module-level Cinemagoer client; get_movie_details() below uses it for
# searching and for fetching title details.
ia = Cinemagoer()
def get_movie_details(title):
    """Search IMDb for *title* and summarize the first search hit.

    Args:
        title: Free-text movie title passed to ``ia.search_movie``.

    Returns:
        A dict with the keys ``title``, ``year``, ``rating``, ``votes``,
        ``genres``, ``runtime``, ``directors``, ``cast`` (at most the first
        five names), ``plot`` and ``box_office``; or ``None`` when the
        search returns no results.
    """
    results = ia.search_movie(title)
    if not results:
        return None

    movie = results[0]
    # Only request info sets Cinemagoer knows about. "box office" is not one
    # of them (NOTE(review): the "box office" key, when available, appears to
    # be filled in by the "main" set -- confirm against the Cinemagoer docs).
    ia.update(movie, info=["main", "vote details"])

    # `or` also covers a "runtimes" key that is present but empty, which
    # would otherwise raise IndexError on the [0] lookup.
    runtimes = movie.get("runtimes") or ["N/A"]

    return {
        "title": movie.get("title"),
        "year": movie.get("year"),
        "rating": movie.get("rating"),
        "votes": movie.get("votes"),
        "genres": movie.get("genres", []),
        "runtime": runtimes[0],
        "directors": [d.get("name") for d in movie.get("directors", [])],
        "cast": [a.get("name") for a in movie.get("cast", [])[:5]],
        "plot": movie.get("plot outline", ""),
        "box_office": movie.get("box office", {}),
    }
# Example: look up one title and print a short summary.
movie = get_movie_details("The Shawshank Redemption")
if movie is None:
    # get_movie_details() returns None when the search has no results;
    # subscripting it would raise TypeError.
    print("No results found")
else:
    votes = movie["votes"]
    # The "," format spec cannot be applied to None, so fall back to text.
    votes_text = f"{votes:,}" if votes is not None else "N/A"
    print(f"{movie['title']} ({movie['year']})")
    print(f"Rating: {movie['rating']}/10 ({votes_text} votes)")
    print(f"Genres: {', '.join(movie['genres'])}")
Method 2: Direct HTML Scraping
For data not available through Cinemagoer:
import requests
from bs4 import BeautifulSoup
import re
import json
def scrape_imdb_top250(timeout=30):
    """Scrape the IMDb Top 250 chart from the page's JSON-LD metadata.

    Args:
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout, ``requests.get`` can block indefinitely.

    Returns:
        A list of dicts with the keys ``rank``, ``title``, ``url``,
        ``rating``, ``votes`` and ``description``. The list is empty when
        the page contains no usable ``application/ld+json`` script tag.
    """
    url = "https://www.imdb.com/chart/top/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept-Language": "en-US,en;q=0.9"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")

    script = soup.find("script", {"type": "application/ld+json"})
    # `script.string` is None for an empty tag or one with several children;
    # passing None to json.loads would raise TypeError.
    if script is None or not script.string:
        return []

    data = json.loads(script.string)
    movies = []
    for item in data.get("itemListElement", []):
        movie_data = item.get("item", {})
        aggregate = movie_data.get("aggregateRating", {})
        movies.append({
            "rank": item.get("position"),
            "title": movie_data.get("name"),
            "url": movie_data.get("url"),
            "rating": aggregate.get("ratingValue"),
            "votes": aggregate.get("ratingCount"),
            "description": movie_data.get("description", ""),
        })
    return movies
# Example: print rank, title and rating for the first ten chart entries.
top250 = scrape_imdb_top250()
for entry in top250[:10]:
    print(f"#{entry['rank']}: {entry['title']} - {entry['rating']}/10")
Box Office Data
def scrape_box_office(imdb_id, timeout=30):
    """Scrape the box-office section of an IMDb title page.

    Args:
        imdb_id: Title identifier inserted into the page URL
            (the example in this file uses ``"tt0111161"``).
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout, ``requests.get`` can block indefinitely.

    Returns:
        A dict mapping each label's text to its value's text. The dict is
        empty when the box-office section is not found on the page.
    """
    url = f"https://www.imdb.com/title/{imdb_id}/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept-Language": "en-US,en;q=0.9"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")

    box_office = {}
    bo_section = soup.find("div", {"data-testid": "title-boxoffice-section"})
    if bo_section is None:
        return box_office

    # Compile the class-name patterns once instead of once per list item.
    label_class = re.compile("label")
    value_class = re.compile("value")
    for item in bo_section.find_all("li"):
        label = item.find("span", class_=label_class)
        value = item.find("span", class_=value_class)
        if label and value:
            box_office[label.get_text(strip=True)] = value.get_text(strip=True)
    return box_office
# Example: print every scraped box-office entry for one title.
bo = scrape_box_office("tt0111161")
for label, amount in bo.items():
    print(f"{label}: {amount}")
Bulk Movie Data Collection
import pandas as pd
import time
def collect_genre_data(genre, count=50):
    """Build a DataFrame of details for a genre's top-50 list.

    Args:
        genre: Genre name passed to ``get_top50_movies_by_genres``.
        count: Maximum number of entries to take from the returned list.

    Returns:
        A ``pandas.DataFrame`` with the columns ``title``, ``year``,
        ``rating``, ``votes``, ``genres`` (comma-separated string) and
        ``runtime`` (first ``runtimes`` entry, ``"0"`` when the key is
        missing).
    """
    client = Cinemagoer()
    top_titles = client.get_top50_movies_by_genres(genre)

    rows = []
    for entry in top_titles[:count]:
        # Fetch the detail data for this entry before reading its fields.
        client.update(entry)

        row = {}
        row["title"] = entry.get("title")
        row["year"] = entry.get("year")
        row["rating"] = entry.get("rating")
        row["votes"] = entry.get("votes", 0)
        row["genres"] = ", ".join(entry.get("genres", []))
        row["runtime"] = entry.get("runtimes", ["0"])[0]
        rows.append(row)

        # Pause between detail requests.
        time.sleep(0.5)

    return pd.DataFrame(rows)
# Example: collect the "action" list with the default count and report the
# mean of the DataFrame's "rating" column, formatted to one decimal place.
action_movies = collect_genre_data("action")
print(f"Average action movie rating: {action_movies['rating'].mean():.1f}")
Handling IMDb Anti-Scraping
IMDb blocks aggressive scraping. For reliable access at scale, use ScraperAPI which renders JavaScript and handles CAPTCHAs:
def imdb_via_proxy(url, api_key="YOUR_SCRAPERAPI_KEY", timeout=70):
    """Fetch *url* through the ScraperAPI endpoint with rendering enabled.

    Args:
        url: Target page URL, forwarded as the ``url`` query parameter.
        api_key: ScraperAPI key. The default is the placeholder from the
            original snippet and must be replaced with a real key.
        timeout: Seconds to wait for the response. Without a timeout,
            ``requests.get`` can block indefinitely; the default is generous
            because ``render=true`` is requested.

    Returns:
        The ``requests.Response`` returned by the proxy endpoint.
    """
    params = {
        "api_key": api_key,
        "url": url,
        "render": "true",
    }
    return requests.get("https://api.scraperapi.com", params=params, timeout=timeout)
ThorData residential proxies help when accessing localized IMDb pages for regional box office data. ScrapeOps monitors your scraper performance and alerts on blocked requests.
IMDb Datasets (Official)
IMDb provides free downloadable datasets at datasets.imdbws.com:
def load_imdb_dataset(dataset_name, base_url="https://datasets.imdbws.com"):
    """Load one of the gzip-compressed IMDb TSV datasets into a DataFrame.

    Args:
        dataset_name: Dataset file name without the ``.tsv.gz`` suffix,
            e.g. ``"title.ratings"``.
        base_url: Location that hosts the ``*.tsv.gz`` files. Defaults to
            ``datasets.imdbws.com``, the host named in the accompanying text
            (the previous ``datasets.imdb.com`` was the wrong host). Any URL
            scheme that ``pandas.read_csv`` accepts works, which allows
            pointing at a local mirror.

    Returns:
        A ``pandas.DataFrame`` in which the ``\\N`` placeholder is read as
        a missing value.
    """
    url = f"{base_url.rstrip('/')}/{dataset_name}.tsv.gz"
    return pd.read_csv(url, sep="\t", na_values="\\N", compression="gzip")
# Example: load the ratings dataset and print its row count.
ratings = load_imdb_dataset("title.ratings")
print(f"Total rated titles: {len(ratings):,}")
# Keep rows with at least 1000 votes, then take the ten rows with the
# largest averageRating and print the selected columns without the index.
top = ratings[ratings["numVotes"] >= 1000].nlargest(10, "averageRating")
print(top[["tconst", "averageRating", "numVotes"]].to_string(index=False))
Tips
- Use IMDb Datasets for bulk analysis instead of scraping
- Cinemagoer is your friend — it handles most common use cases
- Cache movie details — they rarely change
- Respect rate limits — IMDb will block your IP
- Check the robots.txt — IMDb specifies allowed crawl paths
Follow for more Python data collection and analysis tutorials!
Top comments (0)