DEV Community

agenthustler
agenthustler

Posted on

Building a Sports Data Scraper: Stats, Odds, and Live Scores

Sports data is big business. From fantasy sports to betting analytics, there's massive demand for real-time statistics, historical data, and odds comparison.

Scraping Sports Statistics

import requests
from bs4 import BeautifulSoup

class SportsStats:
    """Scrapes NBA conference standings from basketball-reference.com."""

    def __init__(self):
        # One shared Session so the User-Agent header and TCP connection
        # pooling persist across requests.
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': 'Mozilla/5.0'})

    def scrape_standings(self, conference='E', timeout=10):
        """Return standings rows for one conference.

        Args:
            conference: 'E' (default, matches the original behavior) or 'W';
                selects which conference standings table to parse.
            timeout: seconds to wait for the HTTP response.

        Returns:
            list of dicts with 'team', 'wins', 'losses' keys; values are the
            raw strings scraped from the page.

        Raises:
            requests.HTTPError: on a non-2xx response.
        """
        url = 'https://www.basketball-reference.com/leagues/NBA_2026_standings.html'
        # Original had no timeout (could hang forever) and no status check
        # (would silently parse an error page and return []).
        resp = self.session.get(url, timeout=timeout)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, 'html.parser')
        standings = []
        table = soup.select_one(f'#confs_standings_{conference}')
        if table:
            # 'tr.full_table' skips the division-header rows of the table.
            for row in table.select('tbody tr.full_table'):
                cols = row.select('td')
                team = row.select_one('th a')
                if team and len(cols) >= 3:
                    standings.append({
                        'team': team.get_text(strip=True),
                        'wins': cols[0].get_text(strip=True),
                        'losses': cols[1].get_text(strip=True)
                    })
        return standings
Enter fullscreen mode Exit fullscreen mode

Live Score Tracking

from datetime import datetime
import time

class LiveScores:
    """Polls TheSportsDB free API for today's NBA games and scores."""

    def get_scores(self, timeout=10):
        """Fetch today's NBA events as simplified score dicts.

        Args:
            timeout: seconds to wait for the HTTP response.

        Returns:
            list of dicts with home/away team names, scores, and status.
            Score fields may be None for games that have not started.

        Raises:
            requests.HTTPError: on a non-2xx response.
        """
        url = 'https://www.thesportsdb.com/api/v1/json/3/eventsday.php'
        # Original had no timeout (could hang forever) and no status check.
        resp = requests.get(url, params={
            'd': datetime.now().strftime('%Y-%m-%d'), 's': 'NBA'
        }, timeout=timeout)
        resp.raise_for_status()
        # The API returns {"events": null} on days with no games; the
        # trailing `or []` normalizes that to an empty list.
        return [{
            'home': e.get('strHomeTeam'),
            'away': e.get('strAwayTeam'),
            'home_score': e.get('intHomeScore'),
            'away_score': e.get('intAwayScore'),
            'status': e.get('strStatus')
        } for e in resp.json().get('events', []) or []]

    def monitor(self, interval=30):
        """Print current scores every `interval` seconds; loops until interrupted."""
        while True:
            for g in self.get_scores():
                print(f"{g['home']} {g['home_score']} - {g['away_score']} {g['away']}")
            time.sleep(interval)
Enter fullscreen mode Exit fullscreen mode

Odds Comparison

class OddsScraper:
    """Fetches head-to-head odds per bookmaker from the-odds-api.com."""

    def get_odds(self, api_key, sport='basketball_nba', timeout=10):
        """Return per-game moneyline (h2h) odds across bookmakers.

        Args:
            api_key: the-odds-api.com API key.
            sport: sport key understood by the API (default NBA).
            timeout: seconds to wait for the HTTP response.

        Returns:
            list of dicts: {'home', 'away', 'odds'} where 'odds' maps
            '<bookmaker>_<team>' to an American-format price.

        Raises:
            requests.HTTPError: on a non-2xx response (e.g. bad API key).
        """
        url = f'https://api.the-odds-api.com/v4/sports/{sport}/odds'
        # Original had no timeout and no status check; a bad key would have
        # crashed later with a confusing error while iterating the JSON.
        resp = requests.get(url, params={
            'apiKey': api_key, 'regions': 'us',
            'markets': 'h2h,spreads', 'oddsFormat': 'american'
        }, timeout=timeout)
        resp.raise_for_status()
        games = []
        for g in resp.json():
            odds = {}
            # Use .get(..., []) throughout: bookmakers occasionally return
            # partial payloads and the original raised KeyError on them.
            for bk in g.get('bookmakers', []):
                for mkt in bk.get('markets', []):
                    if mkt.get('key') == 'h2h':
                        for o in mkt.get('outcomes', []):
                            odds[f"{bk['title']}_{o['name']}"] = o['price']
            games.append({'home': g['home_team'], 'away': g['away_team'], 'odds': odds})
        return games
Enter fullscreen mode Exit fullscreen mode

Data Storage

import sqlite3

class SportsDB:
    """Thin sqlite3 persistence layer for game results.

    Can be used as a context manager so the connection is always released:

        with SportsDB('sports.db') as db:
            db.save(game)
    """

    def __init__(self, path='sports.db'):
        """Open (or create) the database at `path` and ensure the games table exists."""
        self.conn = sqlite3.connect(path)
        self.conn.execute('''CREATE TABLE IF NOT EXISTS games (
            id INTEGER PRIMARY KEY, sport TEXT, home TEXT, away TEXT,
            home_score INTEGER, away_score INTEGER, game_date DATE)''')

    def save(self, game):
        """Insert one game row and commit.

        `game` must be a mapping with keys: sport, home, away, home_score,
        away_score, date. Raises KeyError if any is missing.
        """
        self.conn.execute('INSERT INTO games (sport, home, away, home_score, away_score, game_date) VALUES (?,?,?,?,?,?)',
            (game['sport'], game['home'], game['away'], game['home_score'], game['away_score'], game['date']))
        self.conn.commit()

    def close(self):
        """Release the sqlite connection (the original leaked it)."""
        self.conn.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()

Scaling

ScraperAPI handles proxy rotation for stats sites. ThorData provides fast residential proxies, and ScrapeOps monitors pipeline health.

Conclusion

Build a solid database layer, prefer official APIs where they exist, and fill the gaps with scraping for data that isn't available any other way.

Top comments (0)