In December 2020, Goodreads stopped issuing new API keys and began retiring its public API, leaving developers with no official way to access book data programmatically. But the data is still there on every public book page — ratings, reviews, genres, author info — all rendered in HTML that's straightforward to parse.
In this guide, I'll show you how to scrape Goodreads book pages using Python and BeautifulSoup to extract the data you need for book recommendation systems, reading analytics, and publisher research.
Why Scrape Goodreads?
Goodreads remains the largest book review platform with over 150 million members and 2.5 billion books shelved. It's the single best source for:
- Book ratings and reviews — aggregated from millions of readers
- Genre classification — community-driven shelving data
- Author profiles — bibliographies, follower counts, influences
- Reading trends — what's popular, what's being discussed
Since the API shutdown, web scraping is the only programmatic option. Let's build it.
Understanding Goodreads Page Structure
Goodreads book pages follow a consistent HTML structure. Here's what you'll find:
- Title and author — in structured heading elements
- Rating — the average rating and total count
- Genres — listed as shelf/genre links
- Description — the book blurb
- Publication info — publisher, date, ISBN, page count
- Similar books — recommendations
The key insight: most of this data lives in well-structured HTML elements and JSON-LD schema markup, making extraction reliable.
1. Searching for Books
Goodreads search works via a simple URL pattern:
import requests
from bs4 import BeautifulSoup
def search_books(query):
    """Search Goodreads for books matching *query*.

    Scrapes the public search-results page and returns a list of dicts
    with ``title``, ``author``, ``url``, and ``rating`` (the raw
    "minirating" text, or None when absent).

    Raises requests.HTTPError on a non-2xx response and
    requests.Timeout if Goodreads does not answer in time.
    """
    url = "https://www.goodreads.com/search"
    params = {"q": query, "search_type": "books"}
    headers = {
        # A browser-like UA; the default python-requests UA is often blocked.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }
    # timeout keeps a stalled connection from hanging the caller forever;
    # raise_for_status surfaces HTTP errors instead of silently parsing
    # an error page into zero results.
    response = requests.get(url, params=params, headers=headers, timeout=15)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    results = []
    # Each result row carries schema.org Book microdata — a stable hook.
    for row in soup.select("tr[itemtype='http://schema.org/Book']"):
        title_el = row.select_one("a.bookTitle")
        author_el = row.select_one("a.authorName")
        rating_el = row.select_one("span.minirating")
        if title_el and author_el:
            results.append({
                "title": title_el.get_text(strip=True),
                "author": author_el.get_text(strip=True),
                "url": "https://www.goodreads.com" + title_el["href"],
                "rating": rating_el.get_text(strip=True) if rating_el else None
            })
    return results
# Demo: search for a title and print the first five hits.
books = search_books("atomic habits")
for book in books[:5]:
    print(f"{book['title']} by {book['author']} - {book['rating']}")
2. Extracting Book Details
Once you have a book URL, you can extract rich metadata:
import requests
from bs4 import BeautifulSoup
import json
def get_book_details(url):
    """Scrape a Goodreads book page and return its core metadata.

    Returns a dict with ``title``, ``author``, ``rating``,
    ``rating_count``, ``description`` (truncated to 300 chars),
    ``genres``, ``pages``, and ``published``. Fields missing from the
    HTML fall back to the page's JSON-LD schema block where available,
    otherwise None.

    Raises requests.HTTPError on a non-2xx response and
    requests.Timeout if Goodreads does not answer in time.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }
    response = requests.get(url, headers=headers, timeout=15)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # JSON-LD schema markup is the most stable data source; parse it once
    # and use it as a fallback for fields the HTML selectors miss.
    schema_data = {}
    schema = soup.find("script", type="application/ld+json")
    if schema and schema.string:
        try:
            schema_data = json.loads(schema.string)
        except json.JSONDecodeError:
            schema_data = {}  # malformed markup: rely on HTML selectors only
    agg_rating = schema_data.get("aggregateRating") or {}

    # Primary extraction from the current (data-testid based) page layout.
    title = soup.select_one("h1[data-testid='bookTitle']")
    author = soup.select_one("span[data-testid='name']")
    rating = soup.select_one("div.RatingStatistics__rating")
    rating_count = soup.select_one("span[data-testid='ratingsCount']")
    description = soup.select_one("div[data-testid='description']")

    # Genres come from the community shelf buttons.
    genres = []
    genre_elements = soup.select("span.BookPageMetadataSection__genreButton a")
    for g in genre_elements:
        genres.append(g.get_text(strip=True))

    # Page count and publication info.
    pages_el = soup.select_one("p[data-testid='pagesFormat']")
    pub_el = soup.select_one("p[data-testid='publicationInfo']")

    return {
        "title": title.get_text(strip=True) if title else schema_data.get("name"),
        "author": author.get_text(strip=True) if author else None,
        "rating": rating.get_text(strip=True) if rating else agg_rating.get("ratingValue"),
        "rating_count": rating_count.get_text(strip=True) if rating_count else agg_rating.get("ratingCount"),
        "description": description.get_text(strip=True)[:300] if description else None,
        "genres": genres,
        "pages": pages_el.get_text(strip=True) if pages_el else schema_data.get("numberOfPages"),
        "published": pub_el.get_text(strip=True) if pub_el else None,
    }
# Demo: fetch one book page and print every extracted field.
book = get_book_details("https://www.goodreads.com/book/show/40121378-atomic-habits")
for key, value in book.items():
    print(f"{key}: {value}")
3. Getting Author Profiles
Author pages contain bibliographies and follower data:
import requests
from bs4 import BeautifulSoup
def get_author_info(author_url):
    """Scrape a Goodreads author page.

    Returns a dict with ``name``, ``bio`` (truncated to 300 chars),
    ``followers`` (raw follower-count text), and ``books`` — up to ten
    entries of ``{"title", "rating"}`` from the author's bibliography.
    Missing fields are None.

    Raises requests.HTTPError on a non-2xx response and
    requests.Timeout if Goodreads does not answer in time.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }
    # timeout + raise_for_status: fail loudly instead of scraping an
    # error page or hanging on a dead connection.
    response = requests.get(author_url, headers=headers, timeout=15)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    name = soup.select_one("h1.authorName")
    bio = soup.select_one("div.aboutAuthorInfo")
    followers = soup.select_one("div.followCount")

    # Bibliography rows carry schema.org Book microdata; cap at ten.
    books = []
    for book_row in soup.select("tr[itemtype='http://schema.org/Book']")[:10]:
        title_el = book_row.select_one("a.bookTitle")
        rating_el = book_row.select_one("span.minirating")
        if title_el:
            books.append({
                "title": title_el.get_text(strip=True),
                "rating": rating_el.get_text(strip=True) if rating_el else None
            })

    return {
        "name": name.get_text(strip=True) if name else None,
        "bio": bio.get_text(strip=True)[:300] if bio else None,
        "followers": followers.get_text(strip=True) if followers else None,
        "books": books
    }
# Demo: print an author's name, follower count, and bibliography.
author = get_author_info("https://www.goodreads.com/author/show/7327.James_Clear")
print(f"{author['name']} ({author['followers']})")
for b in author["books"]:
    print(f" - {b['title']} {b['rating']}")
Building a Book Research Pipeline
Here's how to combine everything into a research workflow:
import requests
from bs4 import BeautifulSoup
import time
import csv
def research_genre(genre, max_books=20):
    """Collect books from a Goodreads genre shelf and enrich each with
    full details via ``get_book_details``.

    ``genre`` is the shelf slug (e.g. "science-fiction");
    ``max_books`` caps how many shelf entries are processed.
    Returns a list of dicts; books whose detail page fails to load are
    kept with only their shelf-level ``title`` and ``url``.

    Raises requests.HTTPError / requests.Timeout only for the shelf
    page itself; per-book failures are skipped (best-effort).
    """
    url = f"https://www.goodreads.com/shelf/show/{genre}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }
    response = requests.get(url, headers=headers, timeout=15)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    books = []
    for el in soup.select("div.leftAlignedImage")[:max_books]:
        link = el.select_one("a.leftAlignedImage")
        title = el.select_one("a[title]")
        if link and title:
            books.append({
                "title": title.get("title", ""),
                "url": "https://www.goodreads.com" + link["href"]
            })

    # Enrich with details; one bad page should not abort the whole run.
    for book in books:
        time.sleep(2)  # Be respectful: throttle between requests
        try:
            details = get_book_details(book["url"])
        except requests.RequestException:
            continue  # keep the shelf-level entry, move on
        book.update(details)
    return books
# Demo: research science-fiction books and summarize each result.
books = research_genre("science-fiction", max_books=5)
for b in books:
    print(f"{b.get('title', 'N/A')}: {b.get('rating', 'N/A')} - {', '.join(b.get('genres', []))}")
Best Practices for Goodreads Scraping
Since there's no API, you need to be extra careful:
- Respect robots.txt — check which paths are allowed
- Add 2-3 second delays between requests
- Use proper User-Agent headers — don't use default Python UA
- Cache aggressively — book data changes slowly
- Handle rate limiting — back off on 429 responses
- Monitor HTML changes — selectors can break with redesigns
When DIY Isn't Enough
Building your own scraper works great for small projects. But for production use cases — thousands of books, daily updates, reliable data pipelines — you'll spend more time maintaining the scraper than using the data.
For production workloads, Goodreads Scraper on Apify handles the complexity: proxy rotation, HTML change detection, structured output, and automatic retries.
Use Cases Worth Building
With Goodreads data, you can create:
- Personalized book recommendations based on genre and rating patterns
- Reading list analyzers that show your taste profile
- Publisher research tools tracking what's trending
- Book club selectors with rating and length filters
- Genre trend dashboards showing what's rising or falling
The Goodreads API may be gone, but the data is still there. With careful scraping and respect for their servers, you can build powerful book data applications.
Top comments (0)