Sports analytics is booming, and Python is the go-to language for data enthusiasts. In this tutorial, we'll build a complete analysis pipeline using real World Cup data, pandas, and visualization tools. By the end, you'll know how to wrangle messy sports data and extract actionable insights.
What we'll build: A data analysis script that loads World Cup match statistics, calculates team performance metrics, and identifies trends using pandas and matplotlib.
Prerequisites:
- Python 3.8+
- pandas, matplotlib, requests (install with `pip install pandas matplotlib requests`)
- Basic Python knowledge
Step 1: Set Up Your Environment
First, create a project directory and install dependencies:
mkdir wc_analytics && cd wc_analytics
pip install pandas matplotlib requests numpy
Create a file called wc_analysis.py. This will be our main script.
Step 2: Fetch World Cup Data
We'll use the free StatsBomb open data repository, which contains detailed match statistics. Here's how to download it:
import pandas as pd
import requests
import json
from pathlib import Path
# Create data directory
Path('data').mkdir(exist_ok=True)
# StatsBomb open data URL. The public repository is "statsbomb/open-data";
# match files live under data/matches/<competition_id>/<season_id>.json.
BASE_URL = "https://raw.githubusercontent.com/statsbomb/open-data/master/data"
def fetch_statsbomb_data(competition_id=43, season_id=90):
    """
    Fetch the match list for one competition/season from StatsBomb open data.

    Args:
        competition_id: StatsBomb competition identifier (43 = World Cup).
        season_id: StatsBomb season identifier.
            NOTE(review): the tutorial labels season 90 as the 2022 World Cup,
            but StatsBomb's competitions.json appears to list that tournament
            as season_id 106 -- confirm before relying on the default.

    Returns:
        The decoded JSON payload (a list of match dicts) on success, or None
        if the request failed or the response was not valid JSON.
    """
    matches_url = f"{BASE_URL}/matches/{competition_id}/{season_id}.json"
    try:
        # Without a timeout a stalled connection would block the script forever.
        response = requests.get(matches_url, timeout=30)
        response.raise_for_status()
        matches = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # JSON decode error is not a RequestException subclass.
        print(f"❌ Error fetching data: {e}")
        return None
    print(f"✅ Successfully fetched {len(matches)} matches")
    return matches
# Fetch the data
matches_raw = fetch_statsbomb_data()
if matches_raw is None:
    # fetch_statsbomb_data() returns None on failure; stop here with a clear
    # message instead of failing later with a TypeError while parsing.
    raise SystemExit("Could not download match data; aborting.")
What's happening here?
- We're using StatsBomb's public GitHub repository (no API key needed)
- Competition ID 43 = World Cup, Season 90 = 2022 World Cup
- `requests.get()` downloads the JSON data
- We store it in a Python list for processing
Step 3: Parse and Clean the Data
Raw JSON is messy. Let's convert it to a pandas DataFrame:
def _nested_value(record, *keys):
    """Return the first non-None value found under *keys* in a dict (or None)."""
    record = record or {}
    for key in keys:
        value = record.get(key)
        if value is not None:
            return value
    return None


def parse_matches_to_dataframe(matches):
    """
    Convert raw StatsBomb match records to a pandas DataFrame.

    Args:
        matches: iterable of match dicts as returned by the matches JSON
            endpoint. ``None`` or an empty list yields an empty DataFrame
            that still has every expected column.

    Returns:
        DataFrame with columns match_id, match_date (datetime), kick_off,
        home_team, away_team, home_score, away_score (numeric),
        competition_stage, stadium and country.
    """
    columns = [
        'match_id', 'match_date', 'kick_off', 'home_team', 'away_team',
        'home_score', 'away_score', 'competition_stage', 'stadium', 'country',
    ]
    data = []
    for match in matches or []:
        # `or {}` also covers keys that are present but explicitly null.
        stadium = match.get('stadium') or {}
        data.append({
            'match_id': match.get('match_id'),
            'match_date': match.get('match_date'),
            'kick_off': match.get('kick_off'),
            # StatsBomb stores team names as home_team_name / away_team_name;
            # plain 'name' is kept as a fallback for other payload shapes.
            'home_team': _nested_value(match.get('home_team'), 'home_team_name', 'name'),
            'away_team': _nested_value(match.get('away_team'), 'away_team_name', 'name'),
            'home_score': match.get('home_score'),
            'away_score': match.get('away_score'),
            'competition_stage': _nested_value(match.get('competition_stage'), 'name'),
            'stadium': _nested_value(stadium, 'name'),
            # The host country is nested under the stadium in StatsBomb data;
            # a top-level 'country' object still wins if one is provided.
            'country': (_nested_value(match.get('country'), 'name')
                        or _nested_value(stadium.get('country'), 'name')),
        })
    # Passing explicit columns keeps the schema stable even with zero rows.
    df = pd.DataFrame(data, columns=columns)
    # Data type conversions
    df['match_date'] = pd.to_datetime(df['match_date'])
    df['home_score'] = pd.to_numeric(df['home_score'])
    df['away_score'] = pd.to_numeric(df['away_score'])
    return df
# Build the match table, then show a short preview and its dimensions
matches_df = parse_matches_to_dataframe(matches_raw)
preview = matches_df.head()
print("First 5 matches:", preview, sep="\n")
print(f"\nShape: {matches_df.shape}")
Expected output:
match_id match_date home_team away_team home_score away_score
0 ... 2022-11-21 Netherlands Senegal 2 0
1 ... 2022-11-21 England Iran 6 2
2 ... 2022-11-21 France Australia 4 1
Step 4: Calculate Team Performance Metrics
Now let's derive meaningful statistics:
def calculate_team_stats(df):
    """
    Calculate key performance metrics for each team.

    Every match contributes two rows to an intermediate "appearances" table
    (one from the home side's perspective, one from the away side's), which
    is then aggregated per team in a single groupby.

    Args:
        df: match DataFrame with home_team, away_team, home_score and
            away_score columns.

    Returns:
        DataFrame with columns team, matches, goals_for, goals_against,
        goal_diff and avg_goals_per_match, sorted by goal_diff descending.
        Rows whose team name is missing are ignored. An empty input yields
        an empty DataFrame with the same columns.
    """
    columns = ['team', 'matches', 'goals_for', 'goals_against',
               'goal_diff', 'avg_goals_per_match']

    home = df[['home_team', 'home_score', 'away_score']].rename(columns={
        'home_team': 'team',
        'home_score': 'goals_for',
        'away_score': 'goals_against',
    })
    away = df[['away_team', 'away_score', 'home_score']].rename(columns={
        'away_team': 'team',
        'away_score': 'goals_for',
        'home_score': 'goals_against',
    })
    appearances = pd.concat([home, away], ignore_index=True)
    if appearances.empty:
        return pd.DataFrame(columns=columns)

    # sort=False keeps teams in order of first appearance (home sides first).
    team_stats = (
        appearances
        .groupby('team', sort=False)
        .agg(
            # 'count' skips matches whose score is missing.
            matches=('goals_for', 'count'),
            goals_for=('goals_for', 'sum'),
            goals_against=('goals_against', 'sum'),
        )
        .reset_index()
    )
    for col in ('matches', 'goals_for', 'goals_against'):
        team_stats[col] = team_stats[col].astype(int)

    team_stats['goal_diff'] = team_stats['goals_for'] - team_stats['goals_against']
    played = team_stats['matches']
    # Mask zero-match teams before dividing so they report 0 instead of inf/NaN.
    team_stats['avg_goals_per_match'] = (
        (team_stats['goals_for'] / played.where(played > 0)).round(2).fillna(0)
    )
    return team_stats[columns].sort_values('goal_diff', ascending=False)
# Aggregate per-team metrics and render the ten leading rows as plain text
team_stats = calculate_team_stats(matches_df)
top_ten_table = team_stats.head(10).to_string(index=False)
print("\n🏆 Top 10 Teams by Goal Difference:")
print(top_ten_table)
Output (illustrative — your exact figures will depend on the downloaded data):
team matches goals_for goals_against goal_diff avg_goals_per_match
Argentina 7 15 3 12 2.14
France 7 13 5 8 1.86
Germany 4 7 3 4 1.75
Step 5: Identify Win Rates and Streaks
Let's create a function to calculate win rates by competition stage:
def calculate_stage_performance(df):
    """
    Analyze home-side results by tournament stage (groups, knockouts, etc.).

    Side effect: adds (or overwrites) a 'result_home' column on ``df`` holding
    'Win', 'Draw' or 'Loss' from the home side's perspective. Later parts of
    the script read that column, so the in-place update is intentional.

    Args:
        df: match DataFrame with competition_stage, home_score and
            away_score columns.

    Returns:
        DataFrame indexed by competition stage with one count column per
        result that occurs, plus 'Total' and 'Win_Pct' (home wins as a
        percentage of matches in that stage, rounded to one decimal).
    """
    home_goals = df['home_score']
    away_goals = df['away_score']
    # Vectorized form of a row-by-row if/else: start from 'Loss' and overwrite
    # draws and wins. Rows that are neither (e.g. missing scores) stay 'Loss'.
    outcome = pd.Series('Loss', index=df.index, dtype=object)
    outcome = outcome.mask(home_goals == away_goals, 'Draw')
    outcome = outcome.mask(home_goals > away_goals, 'Win')
    df['result_home'] = outcome
    # Group by stage and result
    stage_analysis = df.groupby(['competition_stage', 'result_home']).size().unstack(fill_value=0)
    # Calculate win percentage
    stage_analysis['Total'] = stage_analysis.sum(axis=1)
    # .get() falls back to 0 when no match in the data was a home win.
    stage_analysis['Win_Pct'] = (stage_analysis.get('Win', 0) / stage_analysis['Total'] * 100).round(1)
    return stage_analysis
# Per-stage result counts; this call also adds 'result_home' to matches_df
stage_perf = calculate_stage_performance(matches_df)
print("\n📊 Results by Tournament Stage:", stage_perf, sep="\n")
Step 6: Visualization
Let's create insightful charts:
import matplotlib.pyplot as plt
# Lay out a 2x2 grid and give each panel a descriptive handle
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(ax_goals, ax_diff), (ax_efficiency, ax_outcomes) = axes

# Panels 1 and 2 share the first ten rows of team_stats
# (team_stats is sorted by goal difference when it is built).
top_teams = team_stats.head(10)
team_labels = top_teams['team']

# 1. Total goals scored
ax_goals.barh(team_labels, top_teams['goals_for'], color='#2E86AB')
ax_goals.set_xlabel('Goals Scored')
ax_goals.set_title('Top 10 Teams - Total Goals', fontweight='bold')

# 2. Goal difference
ax_diff.barh(team_labels, top_teams['goal_diff'], color='#A23B72')
ax_diff.set_xlabel('Goal Difference')
ax_diff.set_title('Top 10 Teams - Goal Difference', fontweight='bold')

# 3. Goals per match across all teams
ax_efficiency.scatter(
    team_stats['matches'],
    team_stats['avg_goals_per_match'],
    s=100,
    alpha=0.6,
    color='#F18F01',
)
ax_efficiency.set_xlabel('Matches Played')
ax_efficiency.set_ylabel('Avg Goals/Match')
ax_efficiency.set_title('Offensive Efficiency', fontweight='bold')

# 4. Share of each home-side result; needs the 'result_home' column
outcomes = matches_df['result_home'].value_counts()
ax_outcomes.pie(
    outcomes.values,
    labels=outcomes.index,
    autopct='%1.1f%%',
    colors=['#06A77D', '#D5622B', '#8E9AAF'],
)
ax_outcomes.set_title('Match Results Distribution', fontweight='bold')

# Write the figure to disk before displaying it
plt.tight_layout()
plt.savefig('wc_analysis.png', dpi=300, bbox_inches='tight')
print("\n📈 Visualizations saved as 'wc_analysis.png'")
plt.show()
Step 7: Export Results
Save your analysis to CSV for reporting:
# Save team stats
team_stats.to_csv('team_statistics.csv', index=False)
print("✅ Team statistics saved to 'team_statistics.csv'")
# Save match data
matches_df.to_csv('matches_data.csv', index=False)
print("✅ Match data saved to 'matches_data.csv'")
# Generate summary report
# Compute each figure once; the ratios are guarded so an empty match table
# reports zeros instead of raising ZeroDivisionError.
total_matches = len(matches_df)
total_goals = matches_df['home_score'].sum() + matches_df['away_score'].sum()
# 'result_home' is added by calculate_stage_performance(), which must run first.
home_wins = (matches_df['result_home'] == 'Win').sum()
avg_goals = total_goals / total_matches if total_matches else 0.0
home_win_rate = home_wins / total_matches * 100 if total_matches else 0.0
summary = f"""
WORLD CUP 2022 ANALYSIS REPORT
{'='*50}
Total Matches: {total_matches}
Total Goals: {total_goals}
Average Goals/Match: {avg_goals:.2f}
Home Teams Win Rate: {home_win_rate:.1f}%
"""
print(summary)
Complete Script
Here's the full working code:
import pandas as pd
import requests
import matplotlib.pyplot as plt
from pathlib import Path
# [Include all functions from Steps 2-6 above]
if __name__ == "__main__":
    # Fetch data
    matches_raw = fetch_statsbomb_data()
    if matches_raw is None:
        # fetch_statsbomb_data() returns None on failure; nothing to analyze.
        raise SystemExit("Could not download match data; aborting.")
    # Parse to DataFrame
    matches_df = parse_matches_to_dataframe(matches_raw)
    # Calculate metrics
    team_stats = calculate_team_stats(matches_df)
    # Stage analysis; this also adds the 'result_home' column that the
    # results pie chart and the summary report read from matches_df.
    stage_perf = calculate_stage_performance(matches_df)
    # Visualize
    # [Run visualization code]
    # Export
    team_stats.to_csv('team_statistics.csv', index=False)
    matches_df.to_csv('matches_data.csv', index=False)
Run it with:
python wc_analysis.py
Key Takeaways
✅ **StatsBomb data is free &
Top comments (0)