Are you trying to extract book data from Goodreads? Whether you're building a recommendation engine, analyzing reading trends, or automating your reading list — Goodreads is one of the richest sources of book metadata on the web.
In this guide, I'll show you how to scrape Goodreads in 2026 for books, reviews, author profiles, and ratings using Python.
Why Scrape Goodreads?
Goodreads hosts a vast catalog of book records with reviews, ratings, genres, and author profiles. Common use cases include:
- Book recommendation systems — build datasets of ratings and reviews
- Author analytics — track an author's catalog, average ratings, and review volume
- Market research — analyze trends in genres, publishing dates, and reader sentiment
- Reading list automation — programmatically extract shelves and lists
Setting Up Your Environment
import requests
from bs4 import BeautifulSoup
import json
import time
import random
# Default request headers: a desktop Chrome User-Agent plus an
# Accept-Language header so requests resemble ordinary browser traffic.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
}
Install the dependencies:
pip install requests beautifulsoup4
Scraping Book Details
Each Goodreads book page contains the title, author, rating, number of reviews, genres, description, and publication info.
def scrape_book(book_url):
    """Scrape details from a Goodreads book page.

    Args:
        book_url: Full URL of a Goodreads /book/show/ page.

    Returns:
        A dict with title, author, rating, review_count, isbn,
        page_count, and a description truncated to 200 characters,
        or None when the page has no parseable JSON-LD block.
    """
    response = requests.get(book_url, headers=HEADERS)
    soup = BeautifulSoup(response.text, "html.parser")
    # Goodreads embeds schema.org Book data as JSON-LD; parsing that is
    # far more stable than scraping individual HTML elements.
    script = soup.find("script", {"type": "application/ld+json"})
    if not script or not script.string:
        return None
    try:
        data = json.loads(script.string)
    except json.JSONDecodeError:
        # Error pages / bot-check pages can contain truncated markup.
        return None
    # JSON-LD "author" may be a single object OR a list of objects;
    # the original code crashed on the single-object form.
    authors = data.get("author") or []
    if isinstance(authors, dict):
        authors = [authors]
    # Keys can be present but null, so `or {}` / `or ""` guards are
    # needed in addition to .get() defaults.
    rating_info = data.get("aggregateRating") or {}
    description = data.get("description") or ""
    return {
        "title": data.get("name"),
        "author": authors[0].get("name") if authors else None,
        "rating": rating_info.get("ratingValue"),
        "review_count": rating_info.get("reviewCount"),
        "isbn": data.get("isbn"),
        "page_count": data.get("numberOfPages"),
        "description": description[:200],
    }
# Usage: fetch one book page and pretty-print the extracted fields.
book = scrape_book("https://www.goodreads.com/book/show/5907.The_Hobbit")
print(json.dumps(book, indent=2))
Pro tip: Goodreads embeds rich JSON-LD data in most book pages. Parsing structured data is far more reliable than scraping individual HTML elements.
Scraping Author Profiles
Author pages contain their bio, book count, average rating, and full bibliography.
def scrape_author(author_url):
    """Scrape an author's name and up to ten listed books from Goodreads.

    Args:
        author_url: Full URL of a Goodreads /author/show/ page.

    Returns:
        A dict with the author's name (or None if not found) and a list
        of book dicts (title, absolute url, raw mini-rating text).
    """
    page = requests.get(author_url, headers=HEADERS)
    soup = BeautifulSoup(page.text, "html.parser")

    heading = soup.find("h1", class_="authorName")
    author_name = heading.get_text(strip=True) if heading else None

    # Each bibliography entry is a table row marked up as a schema.org Book.
    bibliography = []
    for row in soup.select("tr[itemtype='http://schema.org/Book']"):
        title_link = row.select_one("a.bookTitle")
        if not title_link:
            continue
        mini_rating = row.select_one("span.minirating")
        bibliography.append({
            "title": title_link.get_text(strip=True),
            "url": "https://www.goodreads.com" + title_link["href"],
            "rating_info": mini_rating.get_text(strip=True) if mini_rating else None,
        })

    return {"name": author_name, "books": bibliography[:10]}
# Usage: scrape Tolkien's profile and report how many books came back.
author = scrape_author("https://www.goodreads.com/author/show/656983.J_R_R_Tolkien")
print(f"{author['name']} - {len(author['books'])} books scraped")
Mining Reviews
Reviews are where the real value is. Sentiment analysis on book reviews can reveal patterns that ratings alone miss.
def scrape_reviews(book_id, page=1):
    """Scrape the reviews visible on a Goodreads book page.

    Args:
        book_id: Numeric Goodreads book id (e.g. "5907").
        page: Review page number. The original code accepted this but
            silently ignored it; it is now forwarded as a query
            parameter for pages beyond the first. NOTE(review): the
            current React-based book page loads further reviews
            dynamically, so ?page may not paginate server-side —
            verify against a live page.

    Returns:
        List of dicts with rating (count of filled star icons),
        text (truncated to 500 chars), and reviewer display name.
    """
    url = f"https://www.goodreads.com/book/show/{book_id}"
    if page > 1:
        # Only append for non-default pages so page=1 requests the
        # exact same URL as before.
        url = f"{url}?page={page}"
    response = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(response.text, "html.parser")
    reviews = []
    for review_card in soup.select("[data-testid='reviewCard']"):
        # Star rating is rendered as one SVG per filled star.
        stars = review_card.select("svg.RatingStar__star--filled")
        text_el = review_card.select_one("[data-testid='contentContainer']")
        reviewer = review_card.select_one("a[href*='/user/show/']")
        reviews.append({
            "rating": len(stars) if stars else None,
            "text": text_el.get_text(strip=True)[:500] if text_el else "",
            "reviewer": reviewer.get_text(strip=True) if reviewer else "Anonymous",
        })
    return reviews
# Usage: print a short preview of the first three scraped reviews.
reviews = scrape_reviews("5907")
for r in reviews[:3]:
    print(f"Rating: {r['rating']} - {r['text'][:80]}...")
Scraping Book Lists and Shelves
Goodreads lists are goldmines for curated book datasets:
def scrape_list(list_url, max_pages=3):
    """Collect book entries from a paginated Goodreads list.

    Args:
        list_url: Base URL of a Goodreads /list/show/ page.
        max_pages: Number of pages to fetch (1..max_pages inclusive).

    Returns:
        List of dicts with title, author, and raw mini-rating text.
    """
    collected = []
    for page_num in range(1, max_pages + 1):
        page_url = f"{list_url}?page={page_num}"
        html = requests.get(page_url, headers=HEADERS).text
        soup = BeautifulSoup(html, "html.parser")

        # Each list entry is a table row marked up as a schema.org Book.
        for row in soup.select("tr[itemtype='http://schema.org/Book']"):
            title_link = row.select_one("a.bookTitle")
            if not title_link:
                continue
            author_link = row.select_one("a.authorName")
            mini = row.select_one("span.minirating")
            collected.append({
                "title": title_link.get_text(strip=True),
                "author": author_link.get_text(strip=True) if author_link else None,
                "rating": mini.get_text(strip=True) if mini else None,
            })

        # Randomized polite delay between page fetches.
        time.sleep(random.uniform(1.5, 3.0))
    return collected
# Usage: scrape the first pages of the "Best Books Ever" list.
books = scrape_list("https://www.goodreads.com/list/show/1.Best_Books_Ever")
print(f"Scraped {len(books)} books from list")
Handling Anti-Scraping Measures
Goodreads uses several protections. Here is how to handle them responsibly:
- Rate limiting — Add delays between requests (1-3 seconds minimum)
- User-Agent rotation — Rotate browser User-Agent strings
- IP blocking — Use a residential proxy service
For proxy rotation, I recommend ScrapeOps for its proxy aggregator that automatically rotates through providers, or ThorData for reliable residential proxies with good Goodreads success rates.
def get_with_proxy(url, proxy_url):
    """Fetch *url* via the given proxy (applied to both http and https).

    Args:
        url: Target URL to fetch.
        proxy_url: Proxy endpoint, e.g. "http://user:pass@host:port".

    Returns:
        The requests.Response object (30-second timeout).
    """
    return requests.get(
        url,
        headers=HEADERS,
        proxies={"http": proxy_url, "https": proxy_url},
        timeout=30,
    )
The Easier Way: Use a Pre-Built Scraper
If you don't want to maintain scraping code yourself, the Goodreads Scraper on Apify handles all of this out of the box — anti-bot bypassing, pagination, structured JSON output, and scheduled runs.
It supports scraping:
- Book details (title, author, ISBN, rating, description)
- Author profiles and bibliographies
- Reviews with ratings and text
- Book lists and shelves
- Search results by keyword
You get clean JSON output ready for your database or analysis pipeline.
Storing Your Data
Once scraped, store the data in a structured format:
import csv
def save_to_csv(books, filename="goodreads_books.csv"):
    """Save scraped book dicts to a CSV file.

    Args:
        books: List of dicts. The union of keys across all rows
            (in first-seen order) becomes the CSV header, so rows
            with extra fields no longer crash DictWriter.
        filename: Output path; overwritten if it exists. Does nothing
            when `books` is empty.
    """
    if not books:
        return
    # Union of keys across every row, preserving first-seen order —
    # taking keys from only the first row raised ValueError when a
    # later row carried an extra field.
    fieldnames = list(dict.fromkeys(key for book in books for key in book))
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(books)
    # Bug fix: the original message never interpolated the filename.
    print(f"Saved {len(books)} books to {filename}")
Legal and Ethical Considerations
- Respect robots.txt — Goodreads disallows crawling of some paths
- Do not scrape private user data or profiles without consent
- Add reasonable delays between requests
- Cache responses to avoid hitting the same page twice
- Use the data for analysis and research, not republishing copyrighted reviews
Conclusion
Goodreads is an excellent source of book data for recommendation engines, market research, and reading analytics. Whether you build a custom scraper with Python and BeautifulSoup, or use the Goodreads Scraper on Apify for a managed solution, you can extract structured book data at scale.
For production scraping, pair your setup with a proxy service like ScrapeOps or ThorData to handle IP rotation and avoid blocks.
Happy scraping!
Top comments (0)