Goodreads hosts one of the largest book databases on the web — billions of shelved books with ratings, reviews, and reader metadata. Extracting this data lets you build recommendation engines, reading trackers, and literary analysis tools.
What Book Data Can You Extract?
- Book titles, authors, ISBNs, and descriptions
- Average ratings and rating distributions
- Review text and review counts
- Genre/shelf categorization
- Author profiles and bibliographies
- Similar book recommendations
Setting Up the Scraper
import requests
from bs4 import BeautifulSoup
import json
import time
import re
class GoodreadsScraper:
    """Scrape book details and search results from Goodreads HTML pages."""

    BASE_URL = "https://www.goodreads.com"
    # Hard cap per request so a stalled response never hangs the scraper.
    REQUEST_TIMEOUT = 15

    def __init__(self):
        # One shared Session reuses TCP connections and applies the same
        # headers to every request.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)',
            'Accept': 'text/html,application/xhtml+xml',
        })

    def get_book_details(self, book_url):
        """Extract detailed book information.

        Args:
            book_url: Absolute URL of a Goodreads book page.

        Returns:
            dict with keys title, author, rating, ratings_count,
            description, isbn, pages. Missing fields fall back to ''
            (rating falls back to 0.0).

        Raises:
            requests.HTTPError: on a 4xx/5xx response.
        """
        resp = self.session.get(book_url, timeout=self.REQUEST_TIMEOUT)
        # Fail loudly instead of silently parsing an error page.
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, 'html.parser')

        # Extract JSON-LD structured data. Guard both a missing tag and an
        # empty/malformed payload (script.string can be None).
        ld_data = {}
        script = soup.find('script', type='application/ld+json')
        if script and script.string:
            try:
                ld_data = json.loads(script.string)
            except json.JSONDecodeError:
                ld_data = {}

        title = soup.select_one('h1[data-testid="bookTitle"]')
        author = soup.select_one('.ContributorLink__name')
        rating = soup.select_one('.RatingStatistics__rating')
        ratings_count = soup.select_one('[data-testid="ratingsCount"]')
        description = soup.select_one('[data-testid="description"]')

        # Rating text may be absent or non-numeric on some page variants.
        try:
            rating_value = float(rating.get_text(strip=True)) if rating else 0.0
        except ValueError:
            rating_value = 0.0

        return {
            'title': title.get_text(strip=True) if title else '',
            'author': author.get_text(strip=True) if author else '',
            'rating': rating_value,
            'ratings_count': ratings_count.get_text(strip=True) if ratings_count else '',
            'description': description.get_text(strip=True) if description else '',
            'isbn': ld_data.get('isbn', ''),
            'pages': ld_data.get('numberOfPages', ''),
        }

    def search_books(self, query, pages=1):
        """Search for books by title, author, or ISBN.

        Args:
            query: Search string (title, author name, or ISBN).
            pages: Number of result pages to fetch (1 page ~= 20 results).

        Returns:
            List of dicts with keys title, url, author, rating_info.
        """
        results = []
        for page in range(1, pages + 1):
            url = f"{self.BASE_URL}/search"
            params = {'q': query, 'page': page}
            resp = self.session.get(url, params=params,
                                    timeout=self.REQUEST_TIMEOUT)
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, 'html.parser')
            # Search results are rendered as schema.org Book table rows.
            book_rows = soup.select('tr[itemtype="http://schema.org/Book"]')
            for row in book_rows:
                title_el = row.select_one('a.bookTitle')
                author_el = row.select_one('a.authorName')
                rating_el = row.select_one('.minirating')
                if title_el:
                    results.append({
                        'title': title_el.get_text(strip=True),
                        'url': self.BASE_URL + title_el['href'],
                        'author': author_el.get_text(strip=True) if author_el else '',
                        'rating_info': rating_el.get_text(strip=True) if rating_el else '',
                    })
            # Polite delay between pages to avoid hammering the site.
            time.sleep(2)
        return results
Building a Reading Tracker
import pandas as pd
from datetime import datetime
class ReadingTracker:
    """Track a personal reading list backed by a headerless CSV file."""

    # Column order used both when appending and when reading the CSV.
    COLUMNS = ['added_date', 'title', 'author', 'rating',
               'pages', 'status', 'url']

    def __init__(self, scraper, db_file='reading_list.csv'):
        # scraper: object exposing search_books(query, pages) and
        # get_book_details(url) (e.g. GoodreadsScraper).
        self.scraper = scraper
        self.db_file = db_file

    def add_book(self, query):
        """Search and add a book to tracking list.

        Takes the first search hit for *query*, fetches its details, and
        appends one row to the CSV database.

        Returns:
            The appended record dict, or None when nothing was found.
        """
        results = self.scraper.search_books(query, pages=1)
        if not results:
            print(f"No books found for '{query}'")
            return None
        book = results[0]
        details = self.scraper.get_book_details(book['url'])
        # Key order must match COLUMNS so the headerless CSV stays aligned
        # with the names= used in get_recommendations().
        record = {
            'added_date': datetime.now().isoformat(),
            'title': details['title'],
            'author': details['author'],
            'rating': details['rating'],
            'pages': details['pages'],
            'status': 'to_read',
            'url': book['url'],
        }
        df = pd.DataFrame([record])
        # mode='a' creates the file on first use; header=False keeps the
        # file a plain headerless CSV.
        df.to_csv(self.db_file, mode='a', header=False, index=False)
        print(f"Added: {details['title']} by {details['author']} ({details['rating']}⭐)")
        return record

    def get_recommendations(self, min_rating=4.0, genre=None):
        """Get book recommendations based on reading history.

        Args:
            min_rating: Only authors with at least one book rated this
                high are used as recommendation seeds.
            genre: Reserved for future genre filtering; currently unused.

        Returns:
            Up to 10 search-result dicts for books not already tracked.
        """
        try:
            df = pd.read_csv(self.db_file, names=self.COLUMNS)
        except FileNotFoundError:
            # Nothing tracked yet — no history to recommend from.
            return []
        # Find highly rated authors in your list.
        top_authors = df[df.rating >= min_rating]['author'].unique()
        print(f"Your top-rated authors: {', '.join(top_authors[:5])}")
        # Search for more books by those authors, skipping titles already
        # tracked and deduplicating across authors.
        seen_titles = set(df['title'].values)
        recommendations = []
        for author in top_authors[:3]:
            results = self.scraper.search_books(author, pages=1)
            for book in results:
                if book['title'] not in seen_titles:
                    seen_titles.add(book['title'])
                    recommendations.append(book)
            # Polite delay between searches.
            time.sleep(2)
        return recommendations[:10]
Building a Simple Recommendation Engine
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def build_recommendation_engine(books_data):
    """Build content-based recommendations from book descriptions."""
    frame = pd.DataFrame(books_data)

    # Turn each description into a sparse TF-IDF row vector.
    vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    doc_matrix = vectorizer.fit_transform(frame['description'].fillna(''))

    # Pairwise cosine similarity between every pair of books.
    sim_matrix = cosine_similarity(doc_matrix)

    def get_similar_books(title, top_n=5):
        matches = frame[frame['title'] == title].index
        if len(matches) == 0:
            return []
        row = matches[0]
        ranked = sorted(enumerate(sim_matrix[row]),
                        key=lambda pair: pair[1], reverse=True)
        # Skip position 0: the book is always most similar to itself.
        top = ranked[1:top_n + 1]
        return [(frame.iloc[pos]['title'], score) for pos, score in top]

    return get_similar_books
# Usage
# `books_data` is assumed to be a list of book dicts (each with a
# 'description' key) collected by the scraper above — TODO confirm it is
# populated before this runs.
recommender = build_recommendation_engine(books_data)
similar = recommender("Project Hail Mary")
for title, score in similar:
    print(f" {title} (similarity: {score:.2f})")
Scaling with Cloud Solutions
For large-scale book data collection — entire genres, bestseller lists, or author catalogs — the Goodreads Scraper on Apify handles the heavy lifting, managing sessions and anti-bot detection automatically.
For reliable proxy rotation during data collection, ScraperAPI simplifies proxy management behind a single API endpoint.
Conclusion
Goodreads data opens up powerful applications — from personal reading trackers to recommendation engines to literary trend analysis. The combination of structured book metadata and rich review text makes it ideal for both simple tracking tools and sophisticated ML-powered recommendations. Start with manual scraping for your personal library, then scale to cloud scrapers when you need broader coverage.
Top comments (0)