The Hook: Why Sports Data Matters More Than Ever
You're sitting in your living room watching a soccer match, and the commentator mentions that a player's expected goals (xG) is 0.45. But what does that actually mean? How is it calculated? And more importantly, how can you extract actionable insights from this data yourself?
Sports data analysis has evolved from a niche hobby into a multi-billion-dollar industry. Teams like Liverpool FC famously use advanced analytics to identify undervalued talent, while betting syndicates use data pipelines to spot market inefficiencies. The barrier to entry? It's never been lower. Today, you can build a professional-grade sports analytics pipeline using open data and Python in an afternoon.
In this tutorial, we'll build a complete data pipeline that fetches soccer match data from StatsBomb's open-data repository on GitHub, processes it with pandas, and extracts meaningful insights using visualization and statistical analysis. By the end, you'll have a reusable framework for analyzing thousands of matches.
Part 1: Understanding Your Data Sources
Before writing a single line of code, let's discuss where sports data lives.
Public Sports Data Sources
StatsBomb Open Data
StatsBomb provides free, detailed event-level data for hundreds of soccer matches. This includes shot locations, passes, tackles, and more. It's the gold standard for free soccer analytics and what we'll use today.
Other Notable Sources:
- Understat.com: Expected goals (xG) and defensive metrics (requires web scraping)
- FiveThirtyEight: Historical rating data and match predictions
- Kaggle: Pre-packaged datasets from various sports
- Wyscout: Professional video analysis platform (API available for institutions)
- World Football Database: Historical match results and team statistics
For this tutorial, StatsBomb's open data is ideal because it is:
- Completely free and legal
- Well-documented
- Granular, down to individual match events
- Actively maintained
Part 2: Setting Up Your Environment
Installing Required Libraries
pip install pandas numpy matplotlib seaborn requests statsbomb
Let's understand what each library does:
import pandas as pd # Data manipulation and analysis
import numpy as np # Numerical computing
import matplotlib.pyplot as plt # Data visualization
import seaborn as sns # Statistical visualization
import requests # HTTP requests for APIs
from datetime import datetime # Time handling
import json # JSON parsing
Project Structure
Create a well-organized project:
sports-analytics/
├── main.py
├── config.py
├── data_fetcher.py
├── data_processor.py
├── pipeline.py
├── analysis.py
├── data/
│ ├── raw/
│ └── processed/
├── notebooks/
├── visualizations/
└── README.md
Configuration File
# config.py
# Shared settings imported by the fetcher, processor, and pipeline modules.

# Root URL under which the StatsBomb JSON files live; the fetcher appends
# paths such as /competitions.json, /matches/... and /events/... to it.
STATSBOMB_API_BASE = "https://raw.githubusercontent.com/statsbomb/open-data/master/data"
# NOTE(review): presumably the folder for raw downloads; it is imported by the
# fetcher module but not used in the code shown here.
DATA_DIR = "data/raw"
# Folder the pipeline writes matches.csv and events.csv into.
PROCESSED_DIR = "data/processed"
# Analysis parameters
# NOTE(review): presumably the minimum number of shots needed before a
# team/player is included in an analysis; confirm where it is consumed.
MIN_SHOTS_FOR_ANALYSIS = 5
MATCH_LIMIT = 100 # Start small for testing
Part 3: Building the Data Pipeline
Step 1: Fetching Match Data
# data_fetcher.py
import requests
import pandas as pd
import json
from config import STATSBOMB_API_BASE, DATA_DIR
class StatsBombFetcher:
    """Download StatsBomb open-data JSON files over HTTP.

    Args:
        base_url: Root URL that the ``competitions.json``, ``matches/`` and
            ``events/`` paths are appended to.
        timeout: Seconds to wait for each HTTP request before giving up.
    """

    def __init__(self, base_url=STATSBOMB_API_BASE, timeout=30):
        self.base_url = base_url
        self.timeout = timeout

    def _get_json(self, url):
        """Return the decoded JSON body of *url*.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
            requests.RequestException: On connection problems or timeout.
        """
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, timeout=self.timeout)
        # Fail loudly on 404 etc. instead of trying to parse an error page.
        response.raise_for_status()
        return response.json()

    def fetch_competitions(self):
        """Fetch available competitions as a DataFrame."""
        return pd.DataFrame(self._get_json(f"{self.base_url}/competitions.json"))

    def fetch_matches(self, competition_id, season_id):
        """Fetch matches for a specific competition and season as a DataFrame."""
        url = f"{self.base_url}/matches/{competition_id}/{season_id}.json"
        return pd.DataFrame(self._get_json(url))

    def fetch_events(self, match_id):
        """Fetch detailed events for a match and return the decoded JSON."""
        return self._get_json(f"{self.base_url}/events/{match_id}.json")
# Usage
fetcher = StatsBombFetcher()
# Find competitions
competitions = fetcher.fetch_competitions()
print(competitions[['competition_id', 'competition_name', 'season_name']].head())
# Get La Liga matches from 2020-21.
# In the StatsBomb open data La Liga is competition_id 11 (12 is Serie A);
# season_id 90 is the 2020/2021 season.
laliga_matches = fetcher.fetch_matches(competition_id=11, season_id=90)
print(f"Fetched {len(laliga_matches)} matches")
Step 2: Processing Raw Data with Pandas
# data_processor.py
import pandas as pd
from config import PROCESSED_DIR
class MatchProcessor:
    """Flatten raw StatsBomb match and event JSON into pandas DataFrames."""

    # Column order of the DataFrame built by load_and_clean_matches.
    _MATCH_COLUMNS = [
        'match_id', 'match_date', 'home_team', 'away_team',
        'home_score', 'away_score', 'competition', 'season',
    ]

    def __init__(self):
        # Result of the most recent load_and_clean_matches call.
        self.matches_df = None
        # Not populated by the methods of this class.
        self.events_df = None

    @staticmethod
    def _pick(mapping, *keys):
        """Return the value of the first of *keys* found in *mapping*.

        Raises:
            KeyError: If none of *keys* is present.
        """
        for key in keys:
            if key in mapping:
                return mapping[key]
        raise KeyError(f"none of {keys!r} found")

    def load_and_clean_matches(self, matches_json):
        """Convert match JSON to a clean DataFrame.

        Args:
            matches_json: Either an iterable of match dicts or a DataFrame
                with one match per row (as returned by the fetcher).

        Returns:
            DataFrame with one row per match; also stored on
            ``self.matches_df``.

        Raises:
            KeyError: If a match lacks one of the required fields.
        """
        # Iterating a DataFrame yields column labels, not rows, so turn it
        # into one dict per match first.
        if isinstance(matches_json, pd.DataFrame):
            matches_json = matches_json.to_dict(orient='records')

        # StatsBomb names the fields "match_id", "home_team_name", ...; the
        # shorter "id"/"name" spellings are accepted as a fallback.
        matches = [
            {
                'match_id': self._pick(match, 'match_id', 'id'),
                'match_date': pd.to_datetime(match['match_date']),
                'home_team': self._pick(match['home_team'], 'home_team_name', 'name'),
                'away_team': self._pick(match['away_team'], 'away_team_name', 'name'),
                'home_score': match['home_score'],
                'away_score': match['away_score'],
                'competition': self._pick(match['competition'], 'competition_name', 'name'),
                'season': match['season']['season_name'],
            }
            for match in matches_json
        ]
        # Passing the columns keeps them present even when there are no matches.
        self.matches_df = pd.DataFrame(matches, columns=self._MATCH_COLUMNS)
        return self.matches_df

    def parse_events(self, events_json, match_id):
        """Parse events JSON into a structured DataFrame.

        Args:
            events_json: Iterable of event dicts for a single match.
            match_id: Identifier copied onto every output row.

        Returns:
            DataFrame with one row per event. Shot rows additionally carry
            ``shot_outcome``, ``shot_xg``, ``x`` and ``y``; pass rows carry
            ``pass_length``, ``pass_angle`` and ``pass_outcome``.
        """
        events = []
        for event in events_json:
            event_type = event['type']['name']
            event_dict = {
                'match_id': match_id,
                'timestamp': event['timestamp'],
                'minute': event['minute'],
                'second': event['second'],
                'type': event_type,
                'team': event['team']['name'],
                'player': event.get('player', {}).get('name', 'Unknown'),
                'position': event.get('position', {}).get('name', 'Unknown'),
            }
            # Handle event-specific data
            if event_type == 'Shot':
                shot = event.get('shot', {})
                # The pitch location sits on the event itself in StatsBomb
                # data; a location nested under "shot" is used as a fallback.
                location = event.get('location') or shot.get('location') or ()
                coords = list(location)[:2]
                coords += [None] * (2 - len(coords))
                event_dict.update({
                    'shot_outcome': shot.get('outcome', {}).get('name'),
                    # "statsbomb_xg" is the StatsBomb field name;
                    # "expected_goals" is kept as a fallback.
                    'shot_xg': shot.get('statsbomb_xg', shot.get('expected_goals')),
                    'x': coords[0],
                    'y': coords[1],
                })
            elif event_type == 'Pass':
                pass_event = event.get('pass', {})
                event_dict.update({
                    'pass_length': pass_event.get('length'),
                    'pass_angle': pass_event.get('angle'),
                    # A pass without an explicit outcome is treated as completed.
                    'pass_outcome': pass_event.get('outcome', {}).get('name', 'Successful'),
                })
            events.append(event_dict)
        return pd.DataFrame(events)
# Usage
processor = MatchProcessor()
# fetch_matches returns a DataFrame, and iterating a DataFrame yields its
# column labels, so hand the processor one dict per match instead.
matches_clean = processor.load_and_clean_matches(laliga_matches.to_dict(orient='records'))
print(matches_clean.head())
Step 3: Complete Data Pipeline Integration
# pipeline.py
import pandas as pd
from data_fetcher import StatsBombFetcher
from data_processor import MatchProcessor
from config import PROCESSED_DIR
class DataPipeline:
    """Fetch matches and their events, flatten them, and save them as CSV."""

    def __init__(self):
        self.fetcher = StatsBombFetcher()
        self.processor = MatchProcessor()
        # Combined events of the most recent run(); None until run() is called.
        self.all_events = None

    def run(self, competition_id, season_id, limit=None):
        """Execute complete pipeline.

        Args:
            competition_id: Competition identifier passed to the fetcher.
            season_id: Season identifier passed to the fetcher.
            limit: Maximum number of matches to process; a falsy value
                (``None`` or ``0``) processes every match.

        Returns:
            Tuple ``(matches_df, events_df)``. Both frames are also written
            to ``matches.csv`` and ``events.csv`` under ``PROCESSED_DIR``.
        """
        # Local import so the module's import block stays untouched.
        from pathlib import Path

        print("Fetching matches...")
        matches_json = self.fetcher.fetch_matches(competition_id, season_id)
        if isinstance(matches_json, pd.DataFrame):
            if limit:
                matches_json = matches_json.iloc[:limit]
            # The processor iterates over match dicts; iterating a DataFrame
            # would yield column labels, so convert to one dict per row.
            matches_json = matches_json.to_dict(orient='records')
        elif limit:
            matches_json = matches_json[:limit]
        matches_df = self.processor.load_and_clean_matches(matches_json)
        print(f"Loaded {len(matches_df)} matches")

        # Fetch and process all events
        all_events = []
        for count, (_, match) in enumerate(matches_df.iterrows(), start=1):
            match_id = match['match_id']
            try:
                events_json = self.fetcher.fetch_events(match_id)
                all_events.append(self.processor.parse_events(events_json, match_id))
            except Exception as e:
                # Best effort: one broken match must not abort the whole run.
                print(f"Error processing match {match_id}: {e}")
                continue
            if count % 10 == 0:
                print(f"Processed {count} matches...")

        # pd.concat raises on an empty list, which happens when there are no
        # matches or every match failed.
        if all_events:
            self.all_events = pd.concat(all_events, ignore_index=True)
        else:
            self.all_events = pd.DataFrame()

        # Save processed data, creating the output folder on first use.
        out_dir = Path(PROCESSED_DIR)
        out_dir.mkdir(parents=True, exist_ok=True)
        matches_df.to_csv(out_dir / "matches.csv", index=False)
        self.all_events.to_csv(out_dir / "events.csv", index=False)
        return matches_df, self.all_events
# Execute pipeline
# La Liga is competition_id 11 in the StatsBomb open data (12 is Serie A);
# season_id 90 is the 2020/2021 season.
pipeline = DataPipeline()
matches, events = pipeline.run(competition_id=11, season_id=90, limit=50)
Part 4: Exploratory Data Analysis
Analyzing Shot Data and Expected Goals
python
# analysis.py
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Read back the CSV files written by the pipeline.
matches = pd.read_csv('data/processed/matches.csv')
events = pd.read_csv('data/processed/events.csv')

# Keep only shot events that carry an xG value.
is_shot = events['type'] == 'Shot'
shots = events[is_shot].copy().dropna(subset=['shot_xg'])

print(f"Total shots analyzed: {len(shots)}")
print(f"\nShot outcome distribution:")
print(shots['shot_outcome'].value_counts())

# Per-team xG totals, averages, shot counts, and goals scored.
team_shots = (
    shots.groupby('team')
    .agg(**{
        'Total xG': ('shot_xg', 'sum'),
        'Average xG': ('shot_xg', 'mean'),
        'Shots': ('shot_xg', 'count'),
        'Goals': ('shot_outcome', lambda outcomes: (outcomes == 'Goal').sum()),
    })
    .round(2)
)

# Share of shots that ended in a goal, as a percentage.
goal_rate = team_shots['Goals'] / team_shots['Shots'] * 100
team_shots['Efficiency %'] = goal_rate.round(1)
team_shots = team_shots.sort_values(by='Total xG', ascending=False)

print("\n=== Team Shooting Performance ===")
print(team_sh
Top comments (0)