Steam is the largest PC gaming marketplace, and its data is perfect for building game databases, tracking price histories, analyzing trends, and finding deals. Here's how to extract it all with Python.
What Data Can You Extract?
- Game titles, descriptions, and genres
- Current prices (and a price history, built by recording them over time)
- User reviews and ratings
- Player count statistics
- DLC and bundle information
- Tag and category data
Using the Steam API
Steam provides a public API for some data. Start here before scraping:
import requests
import time
import json
class SteamDataCollector:
    """Collect game data from Steam's public endpoints.

    All requests go through one shared ``requests.Session`` and use
    ``REQUEST_TIMEOUT`` so a stalled connection cannot hang the caller.
    HTTP error statuses are raised as ``requests.HTTPError`` instead of
    being parsed as if they were valid payloads.
    """

    API_BASE = "https://store.steampowered.com/api"
    STORE_BASE = "https://store.steampowered.com"
    # Seconds to wait for a response; without a timeout requests waits forever.
    REQUEST_TIMEOUT = 10
    # Largest page size this class asks the reviews endpoint for.
    MAX_REVIEWS_PER_PAGE = 100

    def __init__(self):
        self.session = requests.Session()

    def _get_json(self, url, params=None):
        """GET *url* and return the decoded JSON body.

        Raises:
            requests.HTTPError: if the server answers with a 4xx/5xx status.
            requests.RequestException: on connection errors or timeouts.
        """
        resp = self.session.get(url, params=params, timeout=self.REQUEST_TIMEOUT)
        resp.raise_for_status()
        return resp.json()

    def get_app_details(self, app_id):
        """Get detailed info for a specific game.

        Args:
            app_id: Steam app ID (int or str).

        Returns:
            The ``data`` dict for the app, or ``None`` when the payload is
            not a dict, has no entry for ``app_id``, or the entry is not
            flagged ``success``.
        """
        url = f"{self.API_BASE}/appdetails"
        params = {'appids': app_id, 'cc': 'us', 'l': 'en'}
        payload = self._get_json(url, params)
        # The body can decode to something other than a dict (e.g. JSON null).
        if not isinstance(payload, dict):
            return None
        entry = payload.get(str(app_id))
        if not isinstance(entry, dict) or not entry.get('success'):
            return None
        return entry.get('data')

    def get_app_reviews(self, app_id, count=100):
        """Get user reviews for a game.

        Args:
            app_id: Steam app ID.
            count: Number of reviews wanted. Only a single page is fetched,
                so values above ``MAX_REVIEWS_PER_PAGE`` are capped to it.

        Returns:
            The list under the ``reviews`` key of the response, or an empty
            list when that key is absent or the payload is not a dict.
        """
        url = f"{self.STORE_BASE}/appreviews/{app_id}"
        params = {
            'json': 1,
            'num_per_page': min(count, self.MAX_REVIEWS_PER_PAGE),
            'purchase_type': 'all',
            'language': 'english',
        }
        payload = self._get_json(url, params)
        if not isinstance(payload, dict):
            return []
        return payload.get('reviews', [])

    def get_all_apps(self):
        """Get the full list of Steam app IDs and names.

        Returns:
            A dict mapping ``appid`` to ``name``; entries with an empty or
            missing name are dropped.

        Raises:
            KeyError: if the response lacks the ``applist``/``apps`` keys.
        """
        url = "https://api.steampowered.com/ISteamApps/GetAppList/v2/"
        apps = self._get_json(url)['applist']['apps']
        return {app['appid']: app['name'] for app in apps if app.get('name')}
Building a Game Database
import sqlite3
import pandas as pd
def _game_row(app_id, details):
    """Flatten one app-details payload into a single flat database row."""
    # `or {}` / `or []` also covers keys that are present but set to None.
    price_data = details.get('price_overview') or {}
    return {
        'app_id': app_id,
        'name': details.get('name', ''),
        'type': details.get('type', ''),
        'description': details.get('short_description', ''),
        'developers': ', '.join(details.get('developers') or []),
        'publishers': ', '.join(details.get('publishers') or []),
        'price_cents': price_data.get('final', 0),
        'price_formatted': price_data.get('final_formatted', 'Free'),
        'discount_percent': price_data.get('discount_percent', 0),
        'metacritic': (details.get('metacritic') or {}).get('score', None),
        'genres': ', '.join(
            g['description'] for g in (details.get('genres') or [])
        ),
        'release_date': (details.get('release_date') or {}).get('date', ''),
        # NOTE(review): this stores `recommendations.total`; confirm that the
        # value really is positive-only before relying on the column name.
        'positive_reviews': (details.get('recommendations') or {}).get('total', 0),
    }


def build_game_database(collector, app_ids, db_path='steam_games.db', delay=1.5):
    """Collect and store data for multiple games.

    Args:
        collector: Object exposing ``get_app_details(app_id)`` that returns a
            details dict, or a falsy value when the app cannot be fetched.
        app_ids: Sequence of Steam app IDs to look up.
        db_path: SQLite database file; its ``games`` table is replaced.
        delay: Seconds to pause after every lookup (successful or not), so
            failed lookups also count against the rate limit. ``0`` disables
            the pause.

    Returns:
        A ``pandas.DataFrame`` with one row per app that returned details.
        The frame always carries the full column set, so an empty run still
        writes a well-formed (empty) ``games`` table.
    """
    columns = [
        'app_id', 'name', 'type', 'description', 'developers', 'publishers',
        'price_cents', 'price_formatted', 'discount_percent', 'metacritic',
        'genres', 'release_date', 'positive_reviews',
    ]
    games = []
    conn = sqlite3.connect(db_path)
    try:
        for i, app_id in enumerate(app_ids):
            details = collector.get_app_details(app_id)
            if details:
                games.append(_game_row(app_id, details))
            if (i + 1) % 10 == 0:
                print(f"Collected {i + 1}/{len(app_ids)} games")
            if delay > 0:
                time.sleep(delay)  # Steam rate limit: ~200 requests per 5 minutes
        df = pd.DataFrame(games, columns=columns)
        df.to_sql('games', conn, if_exists='replace', index=False)
    finally:
        # Release the database handle even if a lookup or the write fails.
        conn.close()
    print(f"Stored {len(games)} games in database")
    return df
# Example run: build the database for a handful of well-known app IDs.
popular_ids = [
    730,
    570,
    440,
    578080,
    1172470,
    892970,
    1245620,
]
# One collector is created here and reused for every lookup.
collector = SteamDataCollector()
db = build_game_database(collector, popular_ids)
Price Tracking
Track game prices to find the best deals:
def track_prices(collector, watchlist, history_file='price_history.jsonl',
                 delay=1, deal_threshold=50):
    """Record current prices for watched games.

    One JSON object per game is appended to ``history_file`` (JSON Lines).
    All records written by a single call share the same timestamp.

    Args:
        collector: Object exposing ``get_app_details(app_id)`` that returns a
            details dict, or a falsy value when the app cannot be fetched.
        watchlist: Iterable of Steam app IDs to check.
        history_file: Path of the JSON Lines file to append to.
        delay: Seconds to pause after every lookup (successful or not).
            ``0`` disables the pause.
        deal_threshold: A deal message is printed when the discount percent
            is strictly greater than this value.
    """
    from datetime import datetime

    timestamp = datetime.now().isoformat()
    for app_id in watchlist:
        details = collector.get_app_details(app_id)
        if details:
            # `or {}` also covers a price_overview that is present but None.
            price_data = details.get('price_overview') or {}
            record = {
                'timestamp': timestamp,
                'app_id': app_id,
                'name': details.get('name', ''),
                'price_cents': price_data.get('final', 0),
                'discount': price_data.get('discount_percent', 0),
            }
            # Append per record so earlier rows survive if a later lookup fails.
            with open(history_file, 'a', encoding='utf-8') as f:
                f.write(json.dumps(record) + '\n')
            if record['discount'] > deal_threshold:
                print(f"🔥 DEAL: {record['name']} is {record['discount']}% off!")
        # Pause after failed lookups too: they still consumed a request.
        if delay > 0:
            time.sleep(delay)
# Example run: log the current price of each app ID on a small wishlist.
wishlist_ids = [730, 570, 578080, 1172470]
track_prices(collector, wishlist_ids)
Review Sentiment Analysis
from collections import Counter
def analyze_reviews(collector, app_id, count=100):
    """Analyze review sentiment and common themes.

    Args:
        collector: Object exposing ``get_app_reviews(app_id, count)`` that
            returns a list of review dicts.
        app_id: Steam app ID whose reviews are analyzed.
        count: Number of reviews to request from the collector.

    Returns:
        A dict with ``positive`` and ``negative`` counts (by each review's
        ``voted_up`` flag) and the raw ``reviews`` list.
    """
    reviews = collector.get_app_reviews(app_id, count)
    positive = sum(1 for r in reviews if r.get('voted_up'))
    negative = len(reviews) - positive

    # Word frequencies across negative reviews; a review without text
    # contributes nothing instead of raising KeyError.
    negative_texts = [r.get('review', '') for r in reviews if not r.get('voted_up')]
    word_counts = Counter(' '.join(negative_texts).lower().split())
    common_complaints = word_counts.most_common(10)

    print(f"Positive: {positive}, Negative: {negative}")
    if reviews:
        print(f"Approval rate: {positive/len(reviews)*100:.1f}%")
    else:
        # No reviews means no meaningful rate (and would divide by zero).
        print("Approval rate: n/a (no reviews returned)")
    print(f"Common complaint words: {common_complaints}")
    return {'positive': positive, 'negative': negative, 'reviews': reviews}
Scaling with Cloud Solutions
For comprehensive Steam data across thousands of games, the Steam Scraper on Apify automates the entire collection process — handling rate limits, pagination, and data structuring.
When collecting data at scale, reliable proxies are key. ThorData provides rotating residential proxies that prevent IP blocks during large-scale collection runs.
Conclusion
Steam's data is remarkably accessible through its API and web store. Whether you're building a deal tracker, a game database, or a review analyzer, start with the API for structured data, then supplement with web scraping for additional details. The combination of Steam's public endpoints and Python's data analysis tools makes it straightforward to build powerful gaming data applications.
Top comments (0)