Part 4 of 5 | ← Part 3 | Part 5 → | View Series
The Architecture
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
# Application object; title, description and version are passed to FastAPI
# as API metadata.
app = FastAPI(
    title="IPL AI Assistant",
    description="Predictions + Q&A for cricket",
    version="1.0"
)
# Allow frontend to call backend: every origin, method and header is accepted ("*").
# NOTE(review): consider restricting allow_origins to the real frontend origin
# before exposing this publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
Lazy Loading: The Secret Sauce
Instead of loading models at startup:
# ❌ Wrong way: both artifacts are loaded by module-level statements,
# i.e. before the server can accept its first request
model = joblib.load("models/model.joblib")  # ~500ms
qa_index = joblib.load("models/qa_model.joblib")  # ~400ms
# Server ready only after both loads: ~900ms
# ✅ Right way (lazy)
# Module-level caches; each stays None until its loader is called for the first time.
BUNDLE = None
QA_BUNDLE = None


def get_bundle():
    """Return the ML model bundle, reading it from disk on the first call only.

    Later calls return the object cached in the module-level ``BUNDLE``.
    """
    global BUNDLE
    if BUNDLE is not None:
        return BUNDLE

    BUNDLE = joblib.load("models/model.joblib")  # On first use
    print("[INFO] ML model loaded")
    return BUNDLE
def get_qa():
    """Return the Q&A bundle, reading it from disk on the first call only.

    Later calls return the object cached in the module-level ``QA_BUNDLE``.
    """
    global QA_BUNDLE
    if QA_BUNDLE is not None:
        return QA_BUNDLE

    QA_BUNDLE = joblib.load("models/qa_model.joblib")  # On first use
    print("[INFO] Q&A index loaded")
    return QA_BUNDLE
Benefits:
- Faster startup (150ms vs 900ms)
- Serverless-friendly (AWS Lambda, Google Cloud)
- Optional loading (if users never ask, models never load)
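Startup savings: if you deploy 100 servers, that is 100 × 750ms = 75 seconds less startup time in total. Note that the load cost is deferred, not eliminated: each server pays it on the first request that actually needs a model.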
Endpoint 1: Health Check
@app.get("/health")
def health_check():
    """Report liveness; called by load balancers every 10 seconds.

    Returns:
        dict with a constant ``"status"`` of ``"ok"``, the API ``"version"``
        and the current UTC time as an ISO-8601 ``"timestamp"`` carrying an
        explicit ``+00:00`` offset.
    """
    # Local import so this handler does not rely on a module-level import.
    from datetime import datetime, timezone

    return {
        "status": "ok",
        "version": "1.0",
        # datetime.utcnow() is deprecated since Python 3.12 and returns a naive
        # value; a timezone-aware timestamp is unambiguous for consumers.
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }
If this returns 200, traffic keeps flowing to this server. If the request times out, the load balancer reroutes traffic elsewhere, so failures are detected quickly.
Endpoint 2: Metadata
@app.get("/model-info")
def model_info():
    """Describe the ML model and the Q&A engine served by this API.

    The reported figures are static literals; the model bundle is only
    touched to make sure it has been loaded.
    """
    get_bundle()  # called for its side effect: the first call triggers the load

    ml_model = {
        "type": "GradientBoostingClassifier",
        "test_accuracy": 0.618,
        "features": 13,
    }
    qa_engine = {
        "qa_pairs": 42523,
        "vocabulary": 18394,
        "threshold": 0.15,
    }
    return {"ml_model": ml_model, "qa_engine": qa_engine}
Clients call this before predictions: "Are you ready? What accuracy?"
Endpoint 3: Raw Prediction
from typing import Literal

from pydantic import BaseModel, Field


class PredictionRequest(BaseModel):
    """Request body for POST /predict.

    The value ranges that were previously only described in comments are
    enforced here, so out-of-range input is rejected during validation
    instead of being fed to the model.
    """

    batting_team: str
    bowling_team: str
    venue: str
    # All four rates must lie in the closed interval [0.0, 1.0].
    h2h_rate: float = Field(..., ge=0.0, le=1.0)
    overall_rate: float = Field(..., ge=0.0, le=1.0)
    venue_rate: float = Field(..., ge=0.0, le=1.0)
    rolling_rate: float = Field(..., ge=0.0, le=1.0)
    # 0 or 1
    toss_win: int = Field(..., ge=0, le=1)
    # "bat" or "field"
    toss_choice: Literal["bat", "field"]
@app.post("/predict")
def predict_winner(request: PredictionRequest):
    """Raw ML model prediction.

    Builds a one-row feature frame from every field of the request, runs the
    bundled pipeline and decodes the predicted label back to a team name.

    Args:
        request: Validated prediction inputs.

    Returns:
        dict with the predicted ``"winner"``, the highest class probability
        as ``"confidence"`` and the ``"model"`` name.
    """
    bundle = get_bundle()
    pipeline = bundle["pipeline"]

    # One row, one column per request field. The three rate columns below
    # (overall/venue/rolling) were previously left out of the frame.
    # NOTE(review): column names are assumed to match those the pipeline was
    # trained on -- confirm against the training code.
    features = pd.DataFrame([{
        "batting_team": request.batting_team,
        "bowling_team": request.bowling_team,
        "venue": request.venue,
        "h2h_rate": request.h2h_rate,
        "overall_rate": request.overall_rate,
        "venue_rate": request.venue_rate,
        "rolling_rate": request.rolling_rate,
        "toss_win": request.toss_win,
        "toss_choice": request.toss_choice,
    }])

    # Predict
    prediction = pipeline.predict(features)[0]
    probabilities = pipeline.predict_proba(features)[0]
    winner = bundle["label_encoder"].inverse_transform([prediction])[0]
    return {
        "winner": winner,
        "confidence": float(max(probabilities)),
        "model": "GradientBoostingClassifier",
    }
Example:
POST /predict
{
"batting_team": "Mumbai Indians",
"bowling_team": "Chennai Super Kings",
"venue": "Wankhede",
"h2h_rate": 0.54,
"overall_rate": 0.55,
"venue_rate": 0.60,
"rolling_rate": 0.52,
"toss_win": 1,
"toss_choice": "bat"
}
Response:
{
"winner": "Mumbai Indians",
"confidence": 0.62,
"model": "GradientBoostingClassifier"
}
Endpoint 4: Intelligent Chat Router
This is the magic endpoint.
class ChatRequest(BaseModel):
    """Request body for POST /chat."""

    # Free-text user message; the /chat handler lowercases it before routing.
    message: str
@app.post("/chat")
def handle_chat(request: ChatRequest):
    """
    Routes to prediction or Q&A automatically.

    A message goes to the prediction path only when it reads like a
    prediction request AND names exactly two teams; everything else is
    answered by the Q&A engine.
    """
    message = request.message.lower()

    # Step 1: Detect intent
    # Step 2: Extract teams -- only for prediction-like messages, because
    # extract_teams() loads the ML bundle, which pure Q&A traffic should
    # never have to trigger.
    if detect_prediction_intent(message):
        teams = extract_teams(message)
        # Step 3: Route
        if len(teams) == 2:
            return handle_prediction(teams[0], teams[1])

    return handle_qa(message)
Intent Detection
def detect_prediction_intent(message: str) -> bool:
    """Is user asking for a prediction?

    Keywords are matched as whole words, case-insensitively, so a keyword
    hidden inside another word (e.g. "will" in "williamson") does not count.
    "predict" and "beat" also match their inflected forms ("prediction",
    "beats", ...).

    Args:
        message: The user's message, in any letter case.

    Returns:
        True if any prediction keyword occurs in the message, else False.
    """
    import re  # local import keeps this function self-contained

    # "who will" needs no entry of its own: it is covered by "will".
    pattern = r"\b(?:will|would|predict\w*|who\s+wins|vs|against|beat\w*)\b"
    return re.search(pattern, message, flags=re.IGNORECASE) is not None
Examples:
"Will MI beat KKR?" → True (has "will")
"MI vs CSK?" → True (has "vs")
"How many sixes?" → False
"Does toss matter?" → False
Team Extraction
import re
from difflib import get_close_matches
from typing import Optional


def _team_aliases(team_names: list) -> dict:
    """Map each unambiguous lowercase alias (initials or name word) to its team."""
    owners = {}
    for team in team_names:
        tokens = re.findall(r"[a-z0-9]+", team.lower())
        # Distinctive words of the name, e.g. "sunrisers", "mumbai".
        aliases = {token for token in tokens if len(token) >= 4}
        # Initials of multi-word names, e.g. "Mumbai Indians" -> "mi".
        if len(tokens) > 1:
            aliases.add("".join(token[0] for token in tokens))
        for alias in aliases:
            owners.setdefault(alias, set()).add(team)
    # An alias shared by several teams ("super", "kings", ...) identifies none.
    return {
        alias: next(iter(teams))
        for alias, teams in owners.items()
        if len(teams) == 1
    }


def extract_teams(message: str, team_names: Optional[list] = None) -> list:
    """Find team names, even with typos.

    Matching is case-insensitive and runs in two passes:

    1. full team names appearing as whole phrases in the message;
    2. single words matched against per-team aliases -- the initials of the
       name ("mi", "kkr") and words that belong to exactly one team
       ("sunrisers"). Words of five or more letters may additionally match
       an alias approximately, to absorb typos ("chenai" -> "chennai").

    The previous implementation compared each word with the *full* team
    names at a 0.8 similarity cutoff, which abbreviations and partial names
    can never reach.

    Args:
        message: The user's message, in any letter case.
        team_names: Candidate team names. Defaults to the keys of the loaded
            bundle's ``team_index``.

    Returns:
        At most two team names, ordered by where each first appears in the
        message.
    """
    if team_names is None:
        team_names = list(get_bundle()["team_index"].keys())

    text = message.lower()
    first_seen = {}  # team name -> earliest character offset in the message

    def record(team, position):
        if team not in first_seen or position < first_seen[team]:
            first_seen[team] = position

    # Pass 1: exact full-name matches, bounded so names embedded in longer
    # words do not count.
    for team in team_names:
        if not team:
            continue
        hit = re.search(rf"(?<!\w){re.escape(team.lower())}(?!\w)", text)
        if hit:
            record(team, hit.start())

    # Pass 2: alias matches, word by word.
    aliases = _team_aliases(team_names)
    # Short aliases (initials) must match exactly; fuzzy matching on two or
    # three letters would be far too permissive.
    fuzzy_pool = [alias for alias in aliases if len(alias) >= 5]
    for word in re.finditer(r"[a-z0-9]+", text):
        token = word.group()
        team = aliases.get(token)
        if team is None and len(token) >= 5:
            close = get_close_matches(token, fuzzy_pool, n=1, cutoff=0.8)
            team = aliases[close[0]] if close else None
        if team is not None:
            record(team, word.start())

    ordered = sorted(first_seen, key=first_seen.get)
    return ordered[:2]  # At most 2 teams
Examples:
"Will mi beat kkr?"
→ Fuzzy match "mi" → "Mumbai Indians"
→ Fuzzy match "kkr" → "Kolkata Knight Riders"
→ Return ["Mumbai Indians", "Kolkata Knight Riders"]
"who wins, royal challengers bangalore vs sunrisers?"
→ Exact match "royal challengers bangalore"
→ Fuzzy match "sunrisers" → "Sunrisers Hyderabad"
→ Return ["Royal Challengers Bangalore", "Sunrisers Hyderabad"]
Prediction Handling
def handle_prediction(team1: str, team2: str) -> dict:
    """Compute features, run model, format response.

    Rates are derived from the bundle's ``history_df``. A chat message
    carries no venue or toss information, so placeholder values are used:
    venue "TBD" with a neutral 0.5 venue rate, and ``team1`` is treated as
    the batting side that won the toss and chose to bat.

    Args:
        team1: Team used as the batting side.
        team2: Team used as the bowling side.

    Returns:
        dict with ``"type"`` set to ``"prediction"``, a formatted
        ``"message"`` and the model ``"confidence"``.
    """
    history_df = get_bundle()["history_df"]

    # Compute live rates from historical data. (The unused overall rate of
    # team2 is no longer computed: the request has no field for it.)
    pred_req = PredictionRequest(
        batting_team=team1,
        bowling_team=team2,
        venue="TBD",
        h2h_rate=compute_h2h_rate(team1, team2, history_df),
        overall_rate=compute_overall_rate(team1, history_df),
        venue_rate=0.5,
        rolling_rate=compute_rolling_rate(team1, history_df),
        toss_win=1,
        toss_choice="bat",
    )

    # Reuse the /predict handler by calling it as a plain function (no HTTP hop).
    prediction = predict_winner(pred_req)
    confidence = prediction["confidence"]

    # Format as chat response
    return {
        "type": "prediction",
        "message": f"🏆 **{prediction['winner']}** likely wins ({confidence:.0%})",
        "confidence": confidence,
    }
Q&A Handling
def handle_qa(message: str) -> dict:
    """Answer a free-text question with the Q&A retrieval system.

    Returns a dict with ``"type"`` set to ``"qa"``, the answer text (or a
    fallback sentence when no answer is returned) as ``"message"``, and the
    retrieval score as ``"confidence"``.
    """
    qa = get_qa()
    answer, score = answer_question(
        message,
        qa["tfidf"],
        qa["Q_matrix"],
        qa["answers"],
        threshold=0.15,
    )
    # answer_question hands back None as the answer when it has nothing to offer.
    reply = "🤔 I'm not confident about that." if answer is None else answer
    return {"type": "qa", "message": reply, "confidence": score}
Error Handling
@app.post("/chat")
def handle_chat(request: ChatRequest):
    """Route a chat message to prediction or Q&A, never leaking stack traces.

    Returns:
        The prediction or Q&A response dict, or ``{"error": ...}`` with a
        friendly message when handling fails.
    """
    try:
        message = request.message.lower()
        # Routing logic. Previously this was a placeholder and ``result`` was
        # never assigned, so every request ended in the generic handler below.
        if detect_prediction_intent(message):
            teams = extract_teams(message)
            if len(teams) == 2:
                return handle_prediction(teams[0], teams[1])
        return handle_qa(message)
    except ValueError:
        return {"error": "Invalid input"}
    except Exception:
        # Log to monitoring (Sentry, DataDog). logger.exception records the
        # full traceback server-side; the client only sees the generic message.
        logger.exception("Unhandled error while handling /chat")
        return {"error": "Internal server error"}
Never return stack traces. Log errors, return friendly messages.
Performance Characteristics
| Endpoint | Latency | Queries/sec |
|---|---|---|
| /health | <1ms | 10,000+ |
| /model-info | 50ms | 200 |
| /predict | 10ms | 1,000 |
| /chat (ML) | 20ms | 500 |
| /chat (Q&A) | 5ms | 2,000 |
One $5 server handles 10,000 queries/day easily.
Deployment Options
Option 1: Traditional Server
pip install fastapi uvicorn scikit-learn pandas
uvicorn app:app --host 0.0.0.0 --port 8000
API available at http://localhost:8000
Option 2: Docker Container
FROM python:3.11
WORKDIR /app
# Copy the dependency list on its own first, so the install layer below is
# reused from cache until requirements.txt changes.
COPY requirements.txt .
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
docker build -t ipl-api .
docker run -p 8000:8000 ipl-api
Option 3: AWS Lambda (Serverless)
# With Lambda, lazy loading is critical.
# Models load on the first invocation that needs them and then stay in the
# module-level caches for later requests served by the same process.
from mangum import Mangum
# `handler` wraps the FastAPI app with Mangum.
handler = Mangum(app)
Deploy with SAM or Serverless Framework.
Testing the Backend
# Health check
curl http://localhost:8000/health
# {"status": "ok", "version": "1.0", "timestamp": "..."}
# Model info
curl http://localhost:8000/model-info
# {"ml_model": {...}, "qa_engine": {...}}
# Prediction
curl -X POST http://localhost:8000/predict \
-H "Content-Type: application/json" \
-d '{
"batting_team": "Mumbai Indians",
"bowling_team": "Chennai Super Kings",
...
}'
# Chat
curl -X POST http://localhost:8000/chat \
-H "Content-Type: application/json" \
-d '{"message": "Will MI beat CSK?"}'
Scaling Tips
- Use gunicorn for multiple workers:
gunicorn -w 4 -k uvicorn.workers.UvicornWorker app:app
- Enable caching for Q&A (results don't change):
from functools import lru_cache


@lru_cache(maxsize=10000)
def answer_question_cached(question):
    """Memoized Q&A lookup for repeated questions.

    Calls ``answer_question`` with the same index, answers and threshold
    that ``handle_qa`` uses, and caches the result per question string (up
    to 10,000 entries).

    Args:
        question: The question text; it is the cache key, so it must be hashable.

    Returns:
        Whatever ``answer_question`` returns for this question.
    """
    qa = get_qa()
    # Previously a literal ``...`` (Ellipsis) was passed here as a placeholder.
    return answer_question(
        question,
        qa["tfidf"],
        qa["Q_matrix"],
        qa["answers"],
        threshold=0.15,
    )
- Load balance across multiple servers:
- Send requests to different servers
- Each loads models independently
- Scales to 1000s of requests/sec
What's in Part 5 (Frontend + Testing)
Final post: Making it all visible and reliable:
✅ Streamlit frontend (3 tabs)
✅ Session state (conversation persistence)
✅ 22 ground-truth tests
✅ Deployment (Streamlit Cloud, Docker)
✅ Common pitfalls
Sneak preview: Your tests should pull data from CSV, not be hardcoded. Here's why it matters...
This is Part 4 of 5. Subscribe for the finale! 🏏
Top comments (0)