Steam Game Data Scraping: Build a Game Database with Python

#python #webdev #tutorial #webscraping

Steam is the largest PC gaming marketplace, and its data is perfect for building game databases, tracking price histories, analyzing trends, and finding deals. Here's how to extract it all with Python.

What Data Can You Extract?

Game titles, descriptions, and genres
Current prices (and price history, once you record them over time)
User reviews and ratings
Player count statistics
DLC and bundle information
Tag and category data

Using the Steam API

Steam provides a public API for some data. Start here before scraping:

import requests
import time
import json

class SteamDataCollector:
    """Collect game data from Steam's public endpoints.

    Every request goes through one shared ``requests.Session`` and carries
    an explicit timeout. Non-2xx responses raise ``requests.HTTPError``
    (via ``raise_for_status``) instead of being parsed as if they were data.
    """

    API_BASE = "https://store.steampowered.com/api"
    STORE_BASE = "https://store.steampowered.com"
    APP_LIST_URL = "https://api.steampowered.com/ISteamApps/GetAppList/v2/"
    # Upper bound this collector asks for in a single reviews request.
    MAX_REVIEWS_PER_PAGE = 100

    def __init__(self, timeout=10):
        """Create the HTTP session.

        Args:
            timeout: Seconds to wait on each request before giving up.
                Without a timeout a stalled connection blocks forever.
        """
        self.session = requests.Session()
        self.timeout = timeout

    def _get_json(self, url, params=None):
        """GET *url* and return the decoded JSON body.

        Raises whatever the session raises for transport problems, and
        ``requests.HTTPError`` for non-2xx status codes.
        """
        resp = self.session.get(url, params=params, timeout=self.timeout)
        resp.raise_for_status()
        return resp.json()

    def get_app_details(self, app_id):
        """Get detailed info for a specific game.

        Args:
            app_id: Steam application ID.

        Returns:
            The ``data`` dict for the app, or ``None`` when the response
            does not contain a successful entry for ``app_id``.
        """
        url = f"{self.API_BASE}/appdetails"
        params = {'appids': app_id, 'cc': 'us', 'l': 'en'}

        data = self._get_json(url, params)

        # The body is not guaranteed to be a JSON object (it may decode to
        # None), so check the shape before indexing into it.
        entry = data.get(str(app_id)) if isinstance(data, dict) else None
        if isinstance(entry, dict) and entry.get('success'):
            return entry.get('data')
        return None

    def get_app_reviews(self, app_id, count=100):
        """Get user reviews for a game.

        Requests pages of up to ``MAX_REVIEWS_PER_PAGE`` reviews and follows
        the ``cursor`` value returned with each page until ``count`` reviews
        have been gathered or no new reviews arrive. Pages are fetched back
        to back; callers that ask for many reviews should budget for the
        extra requests.

        Args:
            app_id: Steam application ID.
            count: Maximum number of reviews to return.

        Returns:
            A list of at most ``count`` review dicts (possibly empty).
        """
        if count <= 0:
            return []

        url = f"{self.STORE_BASE}/appreviews/{app_id}"
        reviews = []
        seen_ids = set()
        cursor = '*'  # marker for the first page

        while len(reviews) < count:
            params = {
                'json': 1,
                'num_per_page': min(count - len(reviews),
                                    self.MAX_REVIEWS_PER_PAGE),
                'purchase_type': 'all',
                'language': 'english',
                'cursor': cursor,
            }
            payload = self._get_json(url, params)
            if not isinstance(payload, dict):
                break

            added = 0
            for review in payload.get('reviews') or []:
                review_id = review.get('recommendationid')
                if review_id is not None:
                    # Skip reviews already collected from an earlier page.
                    if review_id in seen_ids:
                        continue
                    seen_ids.add(review_id)
                reviews.append(review)
                added += 1

            next_cursor = payload.get('cursor')
            # Stop when the page brought nothing new or the cursor did not
            # advance; otherwise the loop could spin on the same page.
            if not added or not next_cursor or next_cursor == cursor:
                break
            cursor = next_cursor

        return reviews[:count]

    def get_all_apps(self):
        """Get the full list of Steam app IDs and names.

        Returns:
            A dict mapping ``appid`` to ``name``; entries with an empty
            name are left out.
        """
        payload = self._get_json(self.APP_LIST_URL)
        apps = payload['applist']['apps']
        return {app['appid']: app['name'] for app in apps if app['name']}

Building a Game Database

import sqlite3
import pandas as pd

# Column order of the ``games`` table; also used to give an empty result
# a well-defined schema.
_GAME_COLUMNS = [
    'app_id', 'name', 'type', 'description', 'developers', 'publishers',
    'price_cents', 'price_formatted', 'discount_percent', 'metacritic',
    'genres', 'release_date', 'positive_reviews',
]


def _game_row(app_id, details):
    """Flatten one app-details dict into a row for the ``games`` table."""
    # ``or {}`` / ``or []`` also covers keys that are present but null.
    price_data = details.get('price_overview') or {}
    metacritic = details.get('metacritic') or {}
    release = details.get('release_date') or {}
    recommendations = details.get('recommendations') or {}
    genres = details.get('genres') or []

    return {
        'app_id': app_id,
        'name': details.get('name', ''),
        'type': details.get('type', ''),
        'description': details.get('short_description', ''),
        'developers': ', '.join(details.get('developers') or []),
        'publishers': ', '.join(details.get('publishers') or []),
        'price_cents': price_data.get('final', 0),
        'price_formatted': price_data.get('final_formatted', 'Free'),
        'discount_percent': price_data.get('discount_percent', 0),
        'metacritic': metacritic.get('score'),
        'genres': ', '.join(g['description'] for g in genres),
        'release_date': release.get('date', ''),
        # NOTE(review): this stores ``recommendations.total``; confirm it
        # really counts positive reviews only before relying on the name.
        'positive_reviews': recommendations.get('total', 0),
    }


def build_game_database(collector, app_ids, db_path='steam_games.db', delay=1.5):
    """Collect and store data for multiple games.

    Args:
        collector: Object with a ``get_app_details(app_id)`` method that
            returns a details dict, or a falsy value when nothing is found.
        app_ids: Iterable of Steam application IDs.
        db_path: SQLite file that receives the ``games`` table. An existing
            ``games`` table is replaced.
        delay: Seconds to pause between consecutive requests. The default
            of 1.5 matches a budget of ~200 requests per 5 minutes.

    Returns:
        A ``pandas.DataFrame`` with one row per app that returned details.
        It has the full column set even when no app returned anything.
    """
    app_ids = list(app_ids)
    total = len(app_ids)

    # Connect first so an unusable db_path fails before any scraping starts.
    conn = sqlite3.connect(db_path)
    try:
        games = []
        for i, app_id in enumerate(app_ids):
            # Pause before every request except the first. Failed lookups
            # count against the rate limit too, so the pause must not be
            # skipped for them, and no pause is needed after the last one.
            if i and delay > 0:
                time.sleep(delay)

            details = collector.get_app_details(app_id)
            if details:
                games.append(_game_row(app_id, details))

            if (i + 1) % 10 == 0:
                print(f"Collected {i + 1}/{total} games")

        # Explicit columns keep the table schema stable when games is empty.
        df = pd.DataFrame(games, columns=_GAME_COLUMNS)
        df.to_sql('games', conn, if_exists='replace', index=False)
    finally:
        conn.close()

    print(f"Stored {len(games)} games in database")
    return df

# Seed the database with a handful of popular titles
popular_ids = [730, 570, 440, 578080, 1172470, 892970, 1245620]
collector = SteamDataCollector()
db = build_game_database(collector, app_ids=popular_ids)

Price Tracking

Track game prices to find the best deals:

def track_prices(collector, watchlist, history_file='price_history.jsonl',
                 delay=1.5, deal_threshold=50):
    """Record current prices for watched games.

    Appends one JSON line per game to ``history_file`` and prints a deal
    alert for every game discounted by more than ``deal_threshold`` percent.

    Args:
        collector: Object with a ``get_app_details(app_id)`` method that
            returns a details dict, or a falsy value when nothing is found.
        watchlist: Iterable of Steam application IDs to check.
        history_file: Path of the JSON Lines file to append to.
        delay: Seconds to pause between consecutive requests. The default
            of 1.5 matches a budget of ~200 requests per 5 minutes.
        deal_threshold: Discount percentage above which a deal is announced.

    Returns:
        The list of records written during this call, in order.
    """
    from datetime import datetime
    # One timestamp for the whole run so its records can be grouped later.
    timestamp = datetime.now().isoformat()

    records = []
    for index, app_id in enumerate(watchlist):
        # Pause before every request except the first; failed lookups used
        # up a request too, so they must not skip the pause.
        if index and delay > 0:
            time.sleep(delay)

        details = collector.get_app_details(app_id)
        if not details:
            continue

        price_data = details.get('price_overview') or {}
        name = details.get('name', '')
        discount = price_data.get('discount_percent', 0)
        record = {
            'timestamp': timestamp,
            'app_id': app_id,
            'name': name,
            'price_cents': price_data.get('final', 0),
            'discount': discount,
        }

        # Append per record so earlier entries survive a failure later on.
        with open(history_file, 'a', encoding='utf-8') as f:
            f.write(json.dumps(record) + '\n')
        records.append(record)

        if discount > deal_threshold:
            # Use the same fallback name as the record; indexing
            # details['name'] here would raise when the key is missing.
            print(f"🔥 DEAL: {name} is {discount}% off!")

    return records

# Record the current prices of the games on your wishlist
track_prices(collector, watchlist=[730, 570, 578080, 1172470])

Review Sentiment Analysis

from collections import Counter

def analyze_reviews(collector, app_id, count=100):
    """Analyze review sentiment and common themes.

    Counts positive and negative reviews, prints the approval rate and the
    ten most frequent whitespace-separated words in the negative reviews.

    Args:
        collector: Object with a ``get_app_reviews(app_id, count)`` method
            returning a list of review dicts. ``voted_up`` marks a positive
            review and ``review`` holds its text.
        app_id: Steam application ID.
        count: Number of reviews to request.

    Returns:
        A dict with the keys ``positive``, ``negative`` and ``reviews``.
    """
    reviews = collector.get_app_reviews(app_id, count)
    total = len(reviews)

    positive = sum(1 for r in reviews if r.get('voted_up'))
    negative = total - positive

    # Word frequencies across all negative reviews; a review without text
    # contributes nothing instead of raising KeyError.
    negative_text = ' '.join(
        r.get('review') or '' for r in reviews if not r.get('voted_up')
    )
    common_complaints = Counter(negative_text.lower().split()).most_common(10)

    print(f"Positive: {positive}, Negative: {negative}")
    if total:
        print(f"Approval rate: {positive/total*100:.1f}%")
    else:
        # No reviews came back, so there is no rate to compute.
        print("Approval rate: n/a (no reviews returned)")
    print(f"Common complaint words: {common_complaints}")

    return {'positive': positive, 'negative': negative, 'reviews': reviews}

Scaling with Cloud Solutions

For comprehensive Steam data across thousands of games, the Steam Scraper on Apify automates the entire collection process — handling rate limits, pagination, and data structuring.

When collecting data at scale, reliable proxies are key. ThorData provides rotating residential proxies that prevent IP blocks during large-scale collection runs.

Conclusion

Steam's data is remarkably accessible through its API and web store. Whether you're building a deal tracker, a game database, or a review analyzer, start with the API for structured data, then supplement with web scraping for additional details. The combination of Steam's public endpoints and Python's data analysis tools makes it straightforward to build powerful gaming data applications.

Skip the Build

You don't have to reinvent this. We maintain a production-grade scraper as an Apify actor — proxies, anti-bot, retries, and schema all handled. You can run it on a pay-per-result basis and get clean JSON without writing a single line of scraping code.

Steam Scraper on Apify