Goodreads hosts one of the largest book databases on the web — billions of shelved books with ratings, reviews, and reader metadata. Extracting this data lets you build recommendation engines, reading trackers, and literary analysis tools.
What Book Data Can You Extract?
- Book titles, authors, ISBNs, and descriptions
- Average ratings and rating distributions
- Review text and review counts
- Genre/shelf categorization
- Author profiles and bibliographies
- Similar book recommendations
Setting Up the Scraper
import requests
from bs4 import BeautifulSoup
import json
import time
import re
class GoodreadsScraper:
    """Scrape book details and search results from Goodreads HTML pages."""

    BASE_URL = "https://www.goodreads.com"
    # Hard cap per request so a stalled response never hangs the scraper.
    REQUEST_TIMEOUT = 15

    def __init__(self):
        # One shared Session reuses TCP connections and applies the same
        # headers to every request.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)',
            'Accept': 'text/html,application/xhtml+xml',
        })

    def get_book_details(self, book_url):
        """Extract detailed book information.

        Args:
            book_url: Absolute URL of a Goodreads book page.

        Returns:
            dict with keys title, author, rating, ratings_count,
            description, isbn, pages. Missing fields fall back to ''
            (rating falls back to 0.0).

        Raises:
            requests.HTTPError: on a 4xx/5xx response.
        """
        resp = self.session.get(book_url, timeout=self.REQUEST_TIMEOUT)
        # Fail loudly instead of silently parsing an error page.
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, 'html.parser')

        # Extract JSON-LD structured data. Guard both a missing tag and an
        # empty/malformed payload (script.string can be None).
        ld_data = {}
        script = soup.find('script', type='application/ld+json')
        if script and script.string:
            try:
                ld_data = json.loads(script.string)
            except json.JSONDecodeError:
                ld_data = {}

        title = soup.select_one('h1[data-testid="bookTitle"]')
        author = soup.select_one('.ContributorLink__name')
        rating = soup.select_one('.RatingStatistics__rating')
        ratings_count = soup.select_one('[data-testid="ratingsCount"]')
        description = soup.select_one('[data-testid="description"]')

        # Rating text may be absent or non-numeric on some page variants.
        try:
            rating_value = float(rating.get_text(strip=True)) if rating else 0.0
        except ValueError:
            rating_value = 0.0

        return {
            'title': title.get_text(strip=True) if title else '',
            'author': author.get_text(strip=True) if author else '',
            'rating': rating_value,
            'ratings_count': ratings_count.get_text(strip=True) if ratings_count else '',
            'description': description.get_text(strip=True) if description else '',
            'isbn': ld_data.get('isbn', ''),
            'pages': ld_data.get('numberOfPages', ''),
        }

    def search_books(self, query, pages=1):
        """Search for books by title, author, or ISBN.

        Args:
            query: Search string (title, author name, or ISBN).
            pages: Number of result pages to fetch (1 page ~= 20 results).

        Returns:
            List of dicts with keys title, url, author, rating_info.
        """
        results = []
        for page in range(1, pages + 1):
            url = f"{self.BASE_URL}/search"
            params = {'q': query, 'page': page}
            resp = self.session.get(url, params=params,
                                    timeout=self.REQUEST_TIMEOUT)
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, 'html.parser')
            # Search results are rendered as schema.org Book table rows.
            book_rows = soup.select('tr[itemtype="http://schema.org/Book"]')
            for row in book_rows:
                title_el = row.select_one('a.bookTitle')
                author_el = row.select_one('a.authorName')
                rating_el = row.select_one('.minirating')
                if title_el:
                    results.append({
                        'title': title_el.get_text(strip=True),
                        'url': self.BASE_URL + title_el['href'],
                        'author': author_el.get_text(strip=True) if author_el else '',
                        'rating_info': rating_el.get_text(strip=True) if rating_el else '',
                    })
            # Polite delay between pages to avoid hammering the site.
            time.sleep(2)
        return results
Building a Reading Tracker
import pandas as pd
from datetime import datetime
class ReadingTracker:
    """Track a personal reading list backed by a headerless CSV file."""

    # Column order used both when appending and when reading the CSV.
    COLUMNS = ['added_date', 'title', 'author', 'rating',
               'pages', 'status', 'url']

    def __init__(self, scraper, db_file='reading_list.csv'):
        # scraper: object exposing search_books(query, pages) and
        # get_book_details(url) (e.g. GoodreadsScraper).
        self.scraper = scraper
        self.db_file = db_file

    def add_book(self, query):
        """Search and add a book to tracking list.

        Takes the first search hit for *query*, fetches its details, and
        appends one row to the CSV database.

        Returns:
            The appended record dict, or None when nothing was found.
        """
        results = self.scraper.search_books(query, pages=1)
        if not results:
            print(f"No books found for '{query}'")
            return None
        book = results[0]
        details = self.scraper.get_book_details(book['url'])
        # Key order must match COLUMNS so the headerless CSV stays aligned
        # with the names= used in get_recommendations().
        record = {
            'added_date': datetime.now().isoformat(),
            'title': details['title'],
            'author': details['author'],
            'rating': details['rating'],
            'pages': details['pages'],
            'status': 'to_read',
            'url': book['url'],
        }
        df = pd.DataFrame([record])
        # mode='a' creates the file on first use; header=False keeps the
        # file a plain headerless CSV.
        df.to_csv(self.db_file, mode='a', header=False, index=False)
        print(f"Added: {details['title']} by {details['author']} ({details['rating']}⭐)")
        return record

    def get_recommendations(self, min_rating=4.0, genre=None):
        """Get book recommendations based on reading history.

        Args:
            min_rating: Only authors with at least one book rated this
                high are used as recommendation seeds.
            genre: Reserved for future genre filtering; currently unused.

        Returns:
            Up to 10 search-result dicts for books not already tracked.
        """
        try:
            df = pd.read_csv(self.db_file, names=self.COLUMNS)
        except FileNotFoundError:
            # Nothing tracked yet — no history to recommend from.
            return []
        # Find highly rated authors in your list.
        top_authors = df[df.rating >= min_rating]['author'].unique()
        print(f"Your top-rated authors: {', '.join(top_authors[:5])}")
        # Search for more books by those authors, skipping titles already
        # tracked and deduplicating across authors.
        seen_titles = set(df['title'].values)
        recommendations = []
        for author in top_authors[:3]:
            results = self.scraper.search_books(author, pages=1)
            for book in results:
                if book['title'] not in seen_titles:
                    seen_titles.add(book['title'])
                    recommendations.append(book)
            # Polite delay between searches.
            time.sleep(2)
        return recommendations[:10]
Building a Simple Recommendation Engine
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def build_recommendation_engine(books_data):
    """Build content-based recommendations from book descriptions."""
    frame = pd.DataFrame(books_data)

    # Turn each description into a sparse TF-IDF row vector.
    vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    doc_matrix = vectorizer.fit_transform(frame['description'].fillna(''))

    # Pairwise cosine similarity between every pair of books.
    sim_matrix = cosine_similarity(doc_matrix)

    def get_similar_books(title, top_n=5):
        matches = frame[frame['title'] == title].index
        if len(matches) == 0:
            return []
        row = matches[0]
        ranked = sorted(enumerate(sim_matrix[row]),
                        key=lambda pair: pair[1], reverse=True)
        # Skip position 0: the book is always most similar to itself.
        top = ranked[1:top_n + 1]
        return [(frame.iloc[pos]['title'], score) for pos, score in top]

    return get_similar_books
# Usage
# `books_data` is assumed to be a list of book dicts (each with a
# 'description' key) collected by the scraper above — TODO confirm it is
# populated before this runs.
recommender = build_recommendation_engine(books_data)
similar = recommender("Project Hail Mary")
for title, score in similar:
    print(f" {title} (similarity: {score:.2f})")
Scaling with Cloud Solutions
For large-scale book data collection — entire genres, bestseller lists, or author catalogs — the Goodreads Scraper on Apify handles the heavy lifting, managing sessions and anti-bot detection automatically.
For reliable proxy rotation during data collection, ScraperAPI simplifies proxy management behind a single API endpoint.
Conclusion
Goodreads data opens up powerful applications — from personal reading trackers to recommendation engines to literary trend analysis. The combination of structured book metadata and rich review text makes it ideal for both simple tracking tools and sophisticated ML-powered recommendations. Start with manual scraping for your personal library, then scale to cloud scrapers when you need broader coverage.
Top comments (0)