Learning Objectives
By the end of this module, you will:
- Master production-ready Bazel deployments
- Implement comprehensive CI/CD pipelines with Bazel
- Set up remote execution for scalable builds
- Deploy containerized Python applications using Bazel
- Monitor and maintain production Bazel systems
- Implement security best practices for production builds
6.1 Production Build Configuration
Environment-Specific Configurations
Create robust configuration management for different environments:
# .bazelrc.production
# Production build configuration
build:prod --compilation_mode=opt
build:prod --strip=always
build:prod --define=environment=production
build:prod --stamp=true
# Security hardening
build:prod --sandbox_default_allow_network=false
build:prod --incompatible_strict_action_env
# Performance optimization
build:prod --jobs=auto
build:prod --experimental_worker_memory_limit_mb=4096
build:prod --experimental_worker_multiplex
# Remote caching for production
build:prod --remote_cache=https://prod-cache.company.com
build:prod --remote_upload_local_results=true
build:prod --remote_timeout=300
# Production-specific flags
build:prod --define=log_level=ERROR
build:prod --define=enable_debug=false
build:prod --define=enable_metrics=true
# .bazelrc.staging
build:staging --compilation_mode=opt
build:staging --define=environment=staging
build:staging --stamp=true
build:staging --remote_cache=https://staging-cache.company.com
build:staging --define=log_level=INFO
build:staging --define=enable_debug=true
build:staging --define=enable_metrics=true
# .bazelrc.development
build:dev --compilation_mode=fastbuild
build:dev --define=environment=development
build:dev --stamp=false
build:dev --spawn_strategy=local
build:dev --define=log_level=DEBUG
build:dev --define=enable_debug=true
build:dev --define=enable_metrics=false
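Each environment file is pulled into the main .bazelrc with try-import, and a configuration is chosen per invocation:
# .bazelrc (checked in)
try-import %workspace%/.bazelrc.production
try-import %workspace%/.bazelrc.staging
try-import %workspace%/.bazelrc.development
# Then select an environment per invocation, e.g.:
#   bazel build //src:app --config=prod
#   bazel test //... --config=staging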
Build Stamping for Production
Implement proper build stamping for version tracking:
# //tools/version.bzl
load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository")
def _version_info_impl(ctx):
"""Generate version information from build stamp."""
version_file = ctx.outputs.version_file
# Read stamp information
if ctx.attr.stamp:
stamp_file = ctx.info_file
stable_status = ctx.version_file
else:
stamp_file = None
stable_status = None
# Generate version.py
ctx.actions.expand_template(
template = ctx.file.template,
output = version_file,
substitutions = {
"{BUILD_TIMESTAMP}": "$(BUILD_TIMESTAMP)",
"{BUILD_USER}": "$(BUILD_USER)",
"{BUILD_HOST}": "$(BUILD_HOST)",
"{GIT_COMMIT}": "$(BUILD_SCM_REVISION)",
"{GIT_STATUS}": "$(BUILD_SCM_STATUS)",
},
)
return [DefaultInfo(files = depset([version_file]))]
version_info = rule(
implementation = _version_info_impl,
attrs = {
"template": attr.label(
allow_single_file = True,
default = "//tools:version_template.py",
),
"stamp": attr.bool(default = False),
},
outputs = {"version_file": "%{name}.py"},
)
# //tools/version_template.py
"""Auto-generated version information."""
BUILD_TIMESTAMP = "{BUILD_TIMESTAMP}"
BUILD_USER = "{BUILD_USER}"
BUILD_HOST = "{BUILD_HOST}"
GIT_COMMIT = "{GIT_COMMIT}"
GIT_STATUS = "{GIT_STATUS}"
VERSION_INFO = {
"build_timestamp": BUILD_TIMESTAMP,
"build_user": BUILD_USER,
"build_host": BUILD_HOST,
"git_commit": GIT_COMMIT,
"git_status": GIT_STATUS,
}
def get_version():
"""Get formatted version string."""
return f"{GIT_COMMIT[:8]}-{BUILD_TIMESTAMP}"
def get_build_info():
"""Get complete build information."""
return VERSION_INFO
# //src/BUILD
load("//tools:version.bzl", "version_info")
version_info(
name = "version",
    stamp = select({
        # config_setting targets; see the //config/BUILD sketch below
        "//config:prod": True,
        "//config:staging": True,
        "//conditions:default": False,
    }),
)
py_library(
name = "app_version",
srcs = [":version"],
visibility = ["//visibility:public"],
)
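The select() above assumes config_setting targets that match the --define=environment values set in the .bazelrc files; a minimal sketch (the //config package name is an assumption):
# //config/BUILD (assumed)
config_setting(
    name = "prod",
    define_values = {"environment": "production"},
)

config_setting(
    name = "staging",
    define_values = {"environment": "staging"},
)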
6.2 Container-Based Deployments
Docker Integration with Bazel
Create production-ready Docker images using Bazel. These examples use rules_docker; newer projects may prefer rules_oci, whose rule names differ:
# //deploy/BUILD
load("@rules_docker//python3:image.bzl", "py3_image")
load("@rules_docker//container:container.bzl", "container_push", "container_image")
# Base Python application image
py3_image(
    name = "app_base_image",
    srcs = ["//src:main.py"],
    main = "//src:main.py",
    deps = [
        "//src:app_lib",
        "@pypi//fastapi",
        "@pypi//uvicorn",
        "@pypi//pydantic",
    ],
    # Use a slim base image for production
    base = "@python3_base//image",
)
# Production image with optimizations
container_image(
name = "app_prod_image",
base = ":app_base_image",
env = {
"ENVIRONMENT": "production",
"LOG_LEVEL": "INFO",
"PYTHONUNBUFFERED": "1",
"PYTHONDONTWRITEBYTECODE": "1",
},
ports = ["8000"],
workdir = "/app",
# Container start command
cmd = [
"python",
"-m", "uvicorn",
"src.main:app",
"--host", "0.0.0.0",
"--port", "8000",
"--workers", "4",
],
# Add labels for production
labels = {
"maintainer": "devops@company.com",
"version": "{BUILD_SCM_REVISION}",
"environment": "production",
},
)
# Multi-stage production image
container_image(
name = "app_optimized_image",
base = "@distroless_python3//image",
files = [
"//src:app_binary_files",
],
entrypoint = ["/app/main"],
env = {
"PYTHONPATH": "/app",
"ENVIRONMENT": "production",
},
# Security: run as non-root
user = "1000",
)
# Push to registry
container_push(
name = "push_prod",
image = ":app_prod_image",
registry = "gcr.io",
repository = "company-project/api-server",
tag = "{BUILD_SCM_REVISION}",
)
container_push(
name = "push_latest",
image = ":app_prod_image",
registry = "gcr.io",
repository = "company-project/api-server",
tag = "latest",
)
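The {BUILD_SCM_REVISION} tag only resolves when stamping is enabled, so run the push targets with --stamp:
# Build and push the production image
bazel run //deploy:push_prod --config=prod --stamp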
Kubernetes Deployment
Generate Kubernetes manifests with Bazel:
# //k8s/BUILD
load("@io_bazel_rules_k8s//k8s:objects.bzl", "k8s_objects")
load("@io_bazel_rules_k8s//k8s:object.bzl", "k8s_object")
# Production deployment
k8s_object(
name = "api_deployment_prod",
kind = "deployment",
template = ":deployment.yaml",
substitutions = {
"{IMAGE_TAG}": "{BUILD_SCM_REVISION}",
"{ENVIRONMENT}": "production",
"{REPLICAS}": "3",
"{CPU_REQUEST}": "100m",
"{CPU_LIMIT}": "500m",
"{MEMORY_REQUEST}": "128Mi",
"{MEMORY_LIMIT}": "512Mi",
},
)
k8s_object(
name = "api_service_prod",
kind = "service",
template = ":service.yaml",
substitutions = {
"{ENVIRONMENT}": "production",
},
)
k8s_object(
name = "api_configmap_prod",
kind = "configmap",
template = ":configmap.yaml",
substitutions = {
"{ENVIRONMENT}": "production",
"{LOG_LEVEL}": "INFO",
"{DATABASE_URL}": "$(DATABASE_URL_PROD)",
"{REDIS_URL}": "$(REDIS_URL_PROD)",
},
)
# Complete application stack
k8s_objects(
name = "api_stack_prod",
objects = [
":api_deployment_prod",
":api_service_prod",
":api_configmap_prod",
],
)
# Staging environment (the *_staging k8s_object targets are defined analogously)
k8s_objects(
name = "api_stack_staging",
objects = [
":api_deployment_staging",
":api_service_staging",
":api_configmap_staging",
],
)
# //k8s/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: api-server-{ENVIRONMENT}
namespace: {ENVIRONMENT}
labels:
app: api-server
environment: {ENVIRONMENT}
version: {IMAGE_TAG}
spec:
replicas: {REPLICAS}
selector:
matchLabels:
app: api-server
environment: {ENVIRONMENT}
template:
metadata:
labels:
app: api-server
environment: {ENVIRONMENT}
version: {IMAGE_TAG}
spec:
containers:
- name: api-server
image: gcr.io/company-project/api-server:{IMAGE_TAG}
ports:
- containerPort: 8000
name: http
env:
- name: ENVIRONMENT
value: {ENVIRONMENT}
- name: DATABASE_URL
valueFrom:
secretKeyRef:
name: api-secrets
key: database-url
resources:
requests:
cpu: {CPU_REQUEST}
memory: {MEMORY_REQUEST}
limits:
cpu: {CPU_LIMIT}
memory: {MEMORY_LIMIT}
livenessProbe:
httpGet:
path: /health
port: 8000
initialDelaySeconds: 30
periodSeconds: 10
readinessProbe:
httpGet:
path: /ready
port: 8000
initialDelaySeconds: 5
periodSeconds: 5
securityContext:
runAsNonRoot: true
runAsUser: 1000
            # fsGroup is a pod-level field; set it under spec.securityContext
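rules_k8s attaches verbs to each k8s_object and k8s_objects target, so rendering and applying the stack are ordinary Bazel commands:
# Render the manifests without touching the cluster
bazel build //k8s:api_stack_prod
# Apply or tear down the whole stack
bazel run //k8s:api_stack_prod.apply
bazel run //k8s:api_stack_prod.delete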
6.3 CI/CD Pipeline Implementation
GitHub Actions Integration
Complete CI/CD pipeline with Bazel:
# .github/workflows/ci.yml
name: CI/CD Pipeline
on:
push:
branches: [main, develop]
pull_request:
branches: [main]
env:
BAZEL_VERSION: 7.0.0
PYTHON_VERSION: 3.11
jobs:
build-and-test:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0 # Full history for build stamping
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: ${{ env.PYTHON_VERSION }}
- name: Install Bazel
uses: bazel-contrib/setup-bazel@0.8.1
with:
bazelisk-version: 1.x
      - name: Configure Bazel caching
        run: |
          # Prefix flags with build:ci so --config=ci picks them up; the
          # checked-in .bazelrc must contain: try-import %workspace%/.bazelrc.ci
          echo "build:ci --remote_cache=https://storage.googleapis.com/${{ secrets.GCS_CACHE_BUCKET }}" >> .bazelrc.ci
          echo "build:ci --google_default_credentials=true" >> .bazelrc.ci
          echo "build:ci --remote_upload_local_results=true" >> .bazelrc.ci
          echo "build:ci --disk_cache=/tmp/bazel-disk-cache" >> .bazelrc.ci
          echo "build:ci --repository_cache=/tmp/bazel-repo-cache" >> .bazelrc.ci
- name: Cache Bazel artifacts
uses: actions/cache@v3
with:
path: |
/tmp/bazel-disk-cache
/tmp/bazel-repo-cache
key: bazel-${{ runner.os }}-${{ hashFiles('WORKSPACE', 'MODULE.bazel', '**/*.bzl', 'requirements*.txt') }}
restore-keys: |
bazel-${{ runner.os }}-
- name: Authenticate to Google Cloud
uses: google-github-actions/auth@v1
with:
credentials_json: ${{ secrets.GCP_SA_KEY }}
- name: Build all targets
run: bazel build //... --config=ci
- name: Run unit tests
run: bazel test //... --test_tag_filters=-integration --config=ci --test_output=errors
- name: Run integration tests
run: bazel test //... --test_tag_filters=integration --config=ci --test_output=errors
- name: Run security scans
run: |
bazel run //tools/security:vulnerability_scan
bazel run //tools/security:dependency_check
- name: Generate coverage report
run: |
bazel coverage //... --combined_report=lcov --config=ci
bash <(curl -s https://codecov.io/bash) -f bazel-out/_coverage/_coverage_report.dat
- name: Build Docker images
if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/develop'
run: |
bazel build //deploy:app_prod_image --config=ci --stamp
- name: Upload build artifacts
uses: actions/upload-artifact@v3
with:
name: bazel-build-artifacts
path: |
bazel-bin/
bazel-testlogs/
retention-days: 7
security-scan:
runs-on: ubuntu-latest
needs: build-and-test
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Run Trivy vulnerability scanner
uses: aquasecurity/trivy-action@master
with:
scan-type: 'fs'
scan-ref: '.'
format: 'sarif'
output: 'trivy-results.sarif'
- name: Upload Trivy scan results
uses: github/codeql-action/upload-sarif@v2
with:
sarif_file: 'trivy-results.sarif'
deploy-staging:
runs-on: ubuntu-latest
needs: [build-and-test, security-scan]
if: github.ref == 'refs/heads/develop'
environment: staging
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Set up Bazel
uses: bazel-contrib/setup-bazel@0.8.1
with:
bazelisk-version: 1.x
- name: Authenticate to GCP
uses: google-github-actions/auth@v1
with:
credentials_json: ${{ secrets.GCP_SA_KEY }}
- name: Build and push Docker image
run: |
bazel run //deploy:push_staging --config=ci --stamp
- name: Deploy to staging
run: |
bazel run //k8s:api_stack_staging.apply --config=ci
- name: Run smoke tests
run: |
bazel test //tests/smoke:staging_smoke_tests --config=ci
- name: Notify deployment
uses: 8398a7/action-slack@v3
with:
status: ${{ job.status }}
text: "Staging deployment completed: ${{ github.sha }}"
env:
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK }}
deploy-production:
runs-on: ubuntu-latest
needs: [build-and-test, security-scan]
if: github.ref == 'refs/heads/main'
environment: production
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Set up Bazel
uses: bazel-contrib/setup-bazel@0.8.1
with:
bazelisk-version: 1.x
- name: Authenticate to GCP
uses: google-github-actions/auth@v1
with:
credentials_json: ${{ secrets.GCP_SA_KEY }}
- name: Build and push Docker image
run: |
bazel run //deploy:push_prod --config=ci --stamp
bazel run //deploy:push_latest --config=ci --stamp
- name: Deploy to production (blue-green)
run: |
          # Deploy to the green environment first (the green/blue helper targets are assumed to be defined in //k8s)
bazel run //k8s:api_stack_prod_green.apply --config=ci
# Run production smoke tests
bazel test //tests/smoke:production_smoke_tests --config=ci
# Switch traffic to green
bazel run //k8s:switch_to_green --config=ci
# Clean up blue environment
bazel run //k8s:cleanup_blue --config=ci
- name: Create GitHub release
uses: softprops/action-gh-release@v1
with:
tag_name: v${{ github.run_number }}
name: Release v${{ github.run_number }}
body: |
## Changes
${{ github.event.head_commit.message }}
## Build Info
- Commit: ${{ github.sha }}
- Build: ${{ github.run_number }}
- Environment: Production
draft: false
prerelease: false
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: Notify production deployment
uses: 8398a7/action-slack@v3
with:
status: ${{ job.status }}
text: "π Production deployment completed: v${{ github.run_number }}"
env:
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK }}
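The workflow invokes Bazel with --config=ci, which assumes the checked-in .bazelrc defines a base ci config and imports the cache flags written at runtime; a minimal sketch (the GitLab pipeline below makes the same assumption):
# .bazelrc (checked in, fragment)
build:ci --curses=no --color=yes
build:ci --keep_going
build:ci --test_output=errors
try-import %workspace%/.bazelrc.ci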
GitLab CI Integration
Alternative CI/CD setup for GitLab:
# .gitlab-ci.yml
stages:
- build
- test
- security
- deploy-staging
- deploy-production
variables:
BAZEL_VERSION: "7.0.0"
DOCKER_DRIVER: overlay2
DOCKER_TLS_CERTDIR: "/certs"
.bazel_cache: &bazel_cache
cache:
key: bazel-$CI_COMMIT_REF_SLUG
paths:
- /tmp/bazel-cache/
- /tmp/bazel-repo-cache/
policy: pull-push
.bazel_setup: &bazel_setup
before_script:
- curl -LO "https://github.com/bazelbuild/bazelisk/releases/download/v1.19.0/bazelisk-linux-amd64"
- chmod +x bazelisk-linux-amd64
- mv bazelisk-linux-amd64 /usr/local/bin/bazel
- echo "build --disk_cache=/tmp/bazel-cache" >> .bazelrc.ci
- echo "build --repository_cache=/tmp/bazel-repo-cache" >> .bazelrc.ci
- echo "build --remote_cache=$REMOTE_CACHE_URL" >> .bazelrc.ci
- echo "build --remote_upload_local_results=true" >> .bazelrc.ci
build:
stage: build
image: ubuntu:22.04
<<: *bazel_cache
<<: *bazel_setup
script:
- apt-get update && apt-get install -y python3 python3-pip curl
- bazel build //... --config=ci
artifacts:
paths:
- bazel-bin/
expire_in: 1 hour
unit-tests:
stage: test
image: ubuntu:22.04
<<: *bazel_cache
<<: *bazel_setup
script:
- apt-get update && apt-get install -y python3 python3-pip curl
- bazel test //... --test_tag_filters=-integration --config=ci --test_output=errors
artifacts:
reports:
junit: bazel-testlogs/**/test.xml
paths:
- bazel-testlogs/
expire_in: 1 week
integration-tests:
stage: test
image: ubuntu:22.04
<<: *bazel_cache
<<: *bazel_setup
services:
- postgres:13
- redis:6
variables:
POSTGRES_DB: testdb
POSTGRES_USER: testuser
POSTGRES_PASSWORD: testpass
script:
- apt-get update && apt-get install -y python3 python3-pip curl postgresql-client
- bazel test //... --test_tag_filters=integration --config=ci --test_output=errors
artifacts:
reports:
junit: bazel-testlogs/**/test.xml
security-scan:
stage: security
image: aquasec/trivy:latest
script:
- trivy fs --format json --output security-report.json .
  artifacts:
    paths:
      - security-report.json
    expire_in: 1 week
allow_failure: true
deploy-staging:
stage: deploy-staging
  image: google/cloud-sdk:slim  # Debian-based; the glibc bazelisk binary will not run on alpine
<<: *bazel_setup
environment:
name: staging
url: https://api-staging.company.com
only:
- develop
script:
- gcloud auth activate-service-account --key-file $GCP_SERVICE_ACCOUNT_KEY
- bazel run //deploy:push_staging --config=ci --stamp
- bazel run //k8s:api_stack_staging.apply --config=ci
- bazel test //tests/smoke:staging_smoke_tests --config=ci
deploy-production:
stage: deploy-production
  image: google/cloud-sdk:slim  # Debian-based; the glibc bazelisk binary will not run on alpine
<<: *bazel_setup
environment:
name: production
url: https://api.company.com
only:
- main
when: manual
script:
- gcloud auth activate-service-account --key-file $GCP_SERVICE_ACCOUNT_KEY
- bazel run //deploy:push_prod --config=ci --stamp
- bazel run //k8s:api_stack_prod.apply --config=ci
- bazel test //tests/smoke:production_smoke_tests --config=ci
6.4 Remote Execution Setup
Google Cloud Build Integration
Set up remote execution using Google Cloud Build against a Remote Execution API endpoint (Google's hosted RBE service has limited availability; the same flags work with any REAPI-compatible server):
# cloudbuild.yaml
steps:
# Build with remote execution
- name: 'l.gcr.io/google/bazel:latest'
args:
- 'build'
- '//...'
      - '--remote_executor=grpcs://remotebuildexecution.googleapis.com'
      - '--remote_cache=grpcs://remotebuildexecution.googleapis.com'
      - '--bes_backend=grpcs://buildeventservice.googleapis.com'
      - '--bes_results_url=https://source.cloud.google.com/results/invocations/'
      - '--remote_instance_name=projects/$PROJECT_ID/instances/default_instance'
      - '--google_default_credentials=true'
- '--jobs=50'
env:
- 'BAZEL_DO_NOT_DETECT_CPP_TOOLCHAIN=1'
# Run tests with remote execution
- name: 'l.gcr.io/google/bazel:latest'
args:
- 'test'
- '//...'
      - '--remote_executor=grpcs://remotebuildexecution.googleapis.com'
      - '--remote_cache=grpcs://remotebuildexecution.googleapis.com'
      - '--test_output=errors'
      - '--remote_instance_name=projects/$PROJECT_ID/instances/default_instance'
      - '--google_default_credentials=true'
- '--jobs=50'
env:
- 'BAZEL_DO_NOT_DETECT_CPP_TOOLCHAIN=1'
# Build configuration
options:
machineType: 'E2_HIGHCPU_32'
diskSizeGb: 100
Self-Hosted Remote Execution
Set up your own remote execution cluster, for example with Buildbarn. Treat the compose file below as a sketch; image names, entry points, and config layout vary by Buildbarn release:
# remote-execution/docker-compose.yml
version: '3.8'
services:
buildbarn-frontend:
image: buildbarn/bb-remote-execution:latest
command:
- /app/bb_frontend
- /config/frontend.jsonnet
ports:
- "8980:8980"
- "8981:8981"
volumes:
- ./config:/config:ro
depends_on:
- buildbarn-storage
buildbarn-scheduler:
image: buildbarn/bb-remote-execution:latest
command:
- /app/bb_scheduler
- /config/scheduler.jsonnet
ports:
- "8982:8982"
- "8983:8983"
volumes:
- ./config:/config:ro
depends_on:
- buildbarn-storage
buildbarn-worker:
image: buildbarn/bb-remote-execution:latest
command:
- /app/bb_worker
- /config/worker.jsonnet
volumes:
- ./config:/config:ro
- /var/run/docker.sock:/var/run/docker.sock
depends_on:
- buildbarn-scheduler
deploy:
replicas: 4
buildbarn-storage:
image: buildbarn/bb-storage:latest
command:
- /app/bb_storage
- /config/storage.jsonnet
ports:
- "8984:8984"
volumes:
- ./config:/config:ro
- bb-storage-data:/data
volumes:
bb-storage-data:
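Clients opt in through a dedicated config that points at the frontend; a minimal sketch (the hostname is an assumption, port 8980 matches the frontend service above):
# .bazelrc — self-hosted remote execution
build:remote --remote_executor=grpc://buildfarm.internal.company.com:8980
build:remote --remote_cache=grpc://buildfarm.internal.company.com:8980
build:remote --remote_timeout=3600
build:remote --jobs=64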
6.5 Monitoring and Observability
Build Analytics and Monitoring
Implement comprehensive build monitoring:
# //tools/monitoring/BUILD
load("@rules_python//python:defs.bzl", "py_binary", "py_library")

py_binary(
name = "build_analytics",
srcs = ["build_analytics.py"],
deps = [
"@pypi//requests",
"@pypi//prometheus_client",
"@pypi//influxdb_client",
],
)
py_library(
name = "metrics_collector",
srcs = ["metrics_collector.py"],
deps = [
"@pypi//psutil",
"@pypi//prometheus_client",
],
)
# //tools/monitoring/build_analytics.py
"""Build analytics and monitoring."""
import json
import time
import requests
from prometheus_client import CollectorRegistry, Gauge, Counter, push_to_gateway
class BuildMetricsCollector:
def __init__(self, registry=None):
self.registry = registry or CollectorRegistry()
# Build metrics
self.build_duration = Gauge(
'bazel_build_duration_seconds',
'Duration of Bazel builds',
['target', 'config', 'result'],
registry=self.registry
)
self.cache_hit_rate = Gauge(
'bazel_cache_hit_rate',
'Cache hit rate percentage',
['cache_type'],
registry=self.registry
)
self.test_results = Counter(
'bazel_test_results_total',
'Test results by status',
['status', 'target'],
registry=self.registry
)
self.build_size = Gauge(
'bazel_build_size_bytes',
'Size of build outputs',
['target'],
registry=self.registry
)
def collect_build_metrics(self, build_event_file):
"""Collect metrics from Bazel build event stream."""
with open(build_event_file, 'r') as f:
for line in f:
event = json.loads(line)
self._process_build_event(event)
def _process_build_event(self, event):
"""Process individual build event."""
if 'buildFinished' in event:
result = event['buildFinished']
self.build_duration.labels(
target='//...',
config='prod',
result=result.get('exitCode', 'unknown')
).set(result.get('finishTimeMillis', 0) / 1000)
elif 'testResult' in event:
test = event['testResult']
self.test_results.labels(
status=test.get('status', 'unknown'),
target=test.get('label', 'unknown')
).inc()
def push_metrics(self, gateway_url, job_name):
"""Push metrics to Prometheus gateway."""
push_to_gateway(gateway_url, job=job_name, registry=self.registry)
# Usage example
if __name__ == "__main__":
collector = BuildMetricsCollector()
collector.collect_build_metrics('/tmp/build_events.json')
collector.push_metrics('http://prometheus-gateway:9091', 'bazel_builds')
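The collector consumes Bazel's Build Event Protocol JSON stream, which each invocation must emit explicitly:
# Emit the event stream, then analyze it
bazel build //... --build_event_json_file=/tmp/build_events.json
bazel run //tools/monitoring:build_analytics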
Health Monitoring for Deployed Applications
# //src/monitoring/health.py
"""Application health monitoring."""
from fastapi import FastAPI, Response, status
from prometheus_client import Counter, Histogram, generate_latest
import time
import psutil
import logging
app = FastAPI()
# Metrics
request_count = Counter('http_requests_total', 'Total HTTP requests', ['method', 'endpoint', 'status'])
request_duration = Histogram('http_request_duration_seconds', 'HTTP request duration')
@app.middleware("http")
async def metrics_middleware(request, call_next):
start_time = time.time()
response = await call_next(request)
duration = time.time() - start_time
request_count.labels(
method=request.method,
endpoint=request.url.path,
status=response.status_code
).inc()
request_duration.observe(duration)
return response
@app.get("/health")
async def health_check():
"""Basic health check endpoint."""
return {
"status": "healthy",
"timestamp": time.time(),
"version": "1.0.0"
}
@app.get("/ready")
async def readiness_check():
"""Readiness check for Kubernetes."""
# Check database connectivity
try:
# Add your database health check here
# db.execute("SELECT 1")
pass
except Exception as e:
logging.error(f"Database health check failed: {e}")
return Response(status_code=status.HTTP_503_SERVICE_UNAVAILABLE)
# Check system resources
memory_usage = psutil.virtual_memory().percent
if memory_usage > 90:
return Response(status_code=status.HTTP_503_SERVICE_UNAVAILABLE)
return {"status": "ready", "memory_usage": memory_usage}
@app.get("/metrics")
async def metrics():
"""Prometheus metrics endpoint."""
return Response(generate_latest(), media_type="text/plain")
@app.get("/info")
async def build_info():
"""Build and deployment information."""
    # The generated file is version.py, so the module is src.version
    from src.version import get_build_info
return get_build_info()
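You can exercise these endpoints locally before wiring them into the Kubernetes probes (the module path assumes the source layout above):
# Local smoke test
uvicorn src.monitoring.health:app --port 8000 &
curl -s localhost:8000/health
curl -s localhost:8000/ready
curl -s localhost:8000/metrics | head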
Alerting Configuration
# //monitoring/alerts/bazel-alerts.yml
groups:
- name: bazel_build_alerts
rules:
- alert: BuildFailureRate
expr: rate(bazel_build_failures_total[5m]) > 0.1
for: 2m
labels:
severity: warning
team: devops
annotations:
summary: "High build failure rate detected"
description: "Build failure rate is {{ $value }} failures per second"
- alert: CacheHitRateLow
expr: bazel_cache_hit_rate < 0.5
for: 5m
labels:
severity: warning
team: devops
annotations:
summary: "Bazel cache hit rate is low"
description: "Cache hit rate is {{ $value }}%, below 50% threshold"
- alert: BuildDurationHigh
expr: bazel_build_duration_seconds > 1800
for: 1m
labels:
severity: critical
team: devops
annotations:
summary: "Build duration exceeded 30 minutes"
description: "Build took {{ $value }} seconds to complete"
- name: application_alerts
rules:
- alert: ApplicationDown
expr: up{job="api-server"} == 0
for: 1m
labels:
severity: critical
team: sre
annotations:
summary: "Application is down"
description: "API server has been down for more than 1 minute"
- alert: HighErrorRate
expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.05
for: 2m
labels:
severity: warning
team: sre
annotations:
summary: "High error rate detected"
description: "Error rate is {{ $value }} errors per second"
6.6 Security Best Practices
Secure Build Configuration
# .bazelrc.security
# Security-focused build configuration
build:secure --sandbox_default_allow_network=false
build:secure --incompatible_strict_action_env
build:secure --experimental_guard_against_concurrent_changes
# Use hermetic toolchains and a minimal, fixed PATH
build:secure --incompatible_enable_cc_toolchain_resolution
build:secure --action_env=PATH=/usr/bin:/bin
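Configs compose, so the hardening flags stack on top of any environment configuration:
# Combine environment and security configs
bazel build //src:app --config=prod --config=secure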
Dependency Security Scanning
# //tools/security/BUILD
load("@rules_python//python:defs.bzl", "py_binary")

py_binary(
    name = "vulnerability_scan",
    srcs = ["vulnerability_scan.py"],
    # The scanner reads these at runtime (assumes //third_party exports them)
    data = [
        "//third_party:requirements.txt",
        "//third_party:requirements-dev.txt",
        "//third_party:requirements-test.txt",
    ],
    deps = [
        "@pypi//safety",
        "@pypi//requests",
        "@pypi//packaging",
    ],
)
py_binary(
name = "dependency_check",
srcs = ["dependency_check.py"],
deps = [
"@pypi//cyclonedx_bom",
"@pypi//packageurl_python",
],
)
# //tools/security/vulnerability_scan.py
"""Security vulnerability scanning for Python dependencies."""
import json
import subprocess
import sys
from packaging.version import parse
import requests
class VulnerabilityScanner:
def __init__(self):
self.safety_db_url = "https://raw.githubusercontent.com/pyupio/safety-db/master/data"
def scan_requirements(self, requirements_file):
"""Scan requirements file for vulnerabilities."""
try:
result = subprocess.run([
"safety", "check",
"--requirements", requirements_file,
"--output", "json"
], capture_output=True, text=True)
if result.returncode != 0:
                # safety 2.x wraps findings under a "vulnerabilities" key
                report = json.loads(result.stdout)
                vulnerabilities = report.get("vulnerabilities", report)
self._report_vulnerabilities(vulnerabilities)
return False
return True
except Exception as e:
print(f"Error scanning requirements: {e}")
return False
def _report_vulnerabilities(self, vulnerabilities):
"""Report found vulnerabilities."""
print("π¨ SECURITY VULNERABILITIES FOUND:")
print("=" * 50)
for vuln in vulnerabilities:
print(f"Package: {vuln['package_name']}")
print(f"Installed: {vuln['installed_version']}")
print(f"Vulnerability: {vuln['vulnerability_id']}")
print(f"Description: {vuln['advisory']}")
print(f"More info: {vuln['more_info_url']}")
print("-" * 30)
def generate_sbom(self, output_file):
"""Generate Software Bill of Materials."""
try:
            # CLI name and flags vary across cyclonedx-bom releases; check --help
            subprocess.run([
                "cyclonedx-py", "requirements", "requirements.txt",
                "-o", output_file,
            ], check=True)
print(f"SBOM generated: {output_file}")
return True
except subprocess.CalledProcessError:
print("Failed to generate SBOM")
return False
if __name__ == "__main__":
scanner = VulnerabilityScanner()
    # Scan all requirements files (workspace-relative paths, not Bazel labels)
    requirements_files = [
        "third_party/requirements.txt",
        "third_party/requirements-dev.txt",
        "third_party/requirements-test.txt",
    ]
all_secure = True
for req_file in requirements_files:
if not scanner.scan_requirements(req_file):
all_secure = False
# Generate SBOM
scanner.generate_sbom("sbom.json")
if not all_secure:
sys.exit(1)
Secret Management
# //tools/secrets/BUILD
load("@rules_python//python:defs.bzl", "py_library")

py_library(
name = "secret_manager",
srcs = ["secret_manager.py"],
deps = [
"@pypi//google_cloud_secret_manager",
"@pypi//azure_keyvault_secrets",
"@pypi//boto3",
],
)
# //tools/secrets/secret_manager.py
"""Centralized secret management."""
import os
from abc import ABC, abstractmethod
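# All three cloud SDKs must be importable at module load time; move these
# imports into the provider constructors if you only target one cloud.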
from google.cloud import secretmanager
import boto3
from azure.keyvault.secrets import SecretClient
from azure.identity import DefaultAzureCredential
class SecretProvider(ABC):
@abstractmethod
def get_secret(self, secret_name: str) -> str:
pass
class GCPSecretProvider(SecretProvider):
def __init__(self, project_id: str):
self.client = secretmanager.SecretManagerServiceClient()
self.project_id = project_id
def get_secret(self, secret_name: str) -> str:
name = f"projects/{self.project_id}/secrets/{secret_name}/versions/latest"
response = self.client.access_secret_version(request={"name": name})
return response.payload.data.decode("UTF-8")
class AWSSecretProvider(SecretProvider):
def __init__(self, region: str):
self.client = boto3.client('secretsmanager', region_name=region)
def get_secret(self, secret_name: str) -> str:
response = self.client.get_secret_value(SecretId=secret_name)
return response['SecretString']
class AzureSecretProvider(SecretProvider):
def __init__(self, vault_url: str):
credential = DefaultAzureCredential()
self.client = SecretClient(vault_url=vault_url, credential=credential)
def get_secret(self, secret_name: str) -> str:
secret = self.client.get_secret(secret_name)
return secret.value
class SecretManager:
def __init__(self):
self.provider = self._get_provider()
def _get_provider(self) -> SecretProvider:
"""Select secret provider based on environment."""
if os.getenv('GCP_PROJECT'):
return GCPSecretProvider(os.getenv('GCP_PROJECT'))
elif os.getenv('AWS_REGION'):
return AWSSecretProvider(os.getenv('AWS_REGION'))
elif os.getenv('AZURE_VAULT_URL'):
return AzureSecretProvider(os.getenv('AZURE_VAULT_URL'))
else:
raise ValueError("No secret provider configured")
def get_secret(self, secret_name: str) -> str:
return self.provider.get_secret(secret_name)
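Application code stays provider-agnostic; the provider is selected from the environment at startup. Example usage (the import path assumes the workspace root is on PYTHONPATH):
# Example usage
from tools.secrets.secret_manager import SecretManager

secrets = SecretManager()
database_url = secrets.get_secret("database-url")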
6.7 Production Troubleshooting
Debugging Production Issues
# //tools/debug/BUILD
load("@rules_python//python:defs.bzl", "py_binary")

py_binary(
name = "production_debugger",
srcs = ["production_debugger.py"],
deps = [
"@pypi//kubernetes",
"@pypi//requests",
"@pypi//click",
],
)
# //tools/debug/production_debugger.py
"""Production debugging utilities."""
import click
import subprocess
import json
from kubernetes import client, config
class ProductionDebugger:
def __init__(self):
        try:
            config.load_incluster_config()
        except config.ConfigException:
            # Not in a cluster; fall back to the local kubeconfig
            config.load_kube_config()
self.k8s = client.CoreV1Api()
def check_build_artifacts(self, commit_sha: str):
"""Check if build artifacts exist for commit."""
print(f"Checking build artifacts for {commit_sha}...")
# Check if Docker image exists
image_tag = f"gcr.io/company-project/api-server:{commit_sha}"
try:
result = subprocess.run([
"docker", "manifest", "inspect", image_tag
], capture_output=True, text=True)
            if result.returncode == 0:
                print(f"✅ Docker image exists: {image_tag}")
                manifest = json.loads(result.stdout)
                print(f"   Schema version: {manifest.get('schemaVersion', 'unknown')}")
            else:
                print(f"❌ Docker image not found: {image_tag}")
except Exception as e:
print(f"Error checking Docker image: {e}")
def check_deployment_status(self, namespace: str, deployment: str):
"""Check Kubernetes deployment status."""
try:
deployment_obj = client.AppsV1Api().read_namespaced_deployment(
name=deployment, namespace=namespace
)
print(f"Deployment: {deployment}")
print(f"Replicas: {deployment_obj.status.replicas}")
print(f"Ready: {deployment_obj.status.ready_replicas}")
print(f"Available: {deployment_obj.status.available_replicas}")
# Check pod status
pods = self.k8s.list_namespaced_pod(
namespace=namespace,
label_selector=f"app={deployment}"
)
for pod in pods.items:
print(f"Pod: {pod.metadata.name}")
print(f" Status: {pod.status.phase}")
print(f" Ready: {sum(1 for c in pod.status.conditions if c.type == 'Ready' and c.status == 'True')}")
except Exception as e:
print(f"Error checking deployment: {e}")
def get_recent_logs(self, namespace: str, deployment: str, lines: int = 100):
"""Get recent application logs."""
try:
pods = self.k8s.list_namespaced_pod(
namespace=namespace,
label_selector=f"app={deployment}"
)
for pod in pods.items:
print(f"\n=== Logs for {pod.metadata.name} ===")
logs = self.k8s.read_namespaced_pod_log(
name=pod.metadata.name,
namespace=namespace,
tail_lines=lines
)
print(logs)
except Exception as e:
print(f"Error getting logs: {e}")
@click.group()
def cli():
"""Production debugging CLI."""
pass
@cli.command()
@click.argument('commit_sha')
def check_build(commit_sha):
"""Check build artifacts for commit."""
debugger = ProductionDebugger()
debugger.check_build_artifacts(commit_sha)
@cli.command()
@click.argument('namespace')
@click.argument('deployment')
def check_deployment(namespace, deployment):
"""Check deployment status."""
debugger = ProductionDebugger()
debugger.check_deployment_status(namespace, deployment)
@cli.command()
@click.argument('namespace')
@click.argument('deployment')
@click.option('--lines', default=100, help='Number of log lines to retrieve')
def logs(namespace, deployment, lines):
"""Get application logs."""
debugger = ProductionDebugger()
debugger.get_recent_logs(namespace, deployment, lines)
if __name__ == "__main__":
cli()
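Click derives kebab-case subcommand names from the function names, so typical invocations look like:
# Example invocations
bazel run //tools/debug:production_debugger -- check-build abc1234
bazel run //tools/debug:production_debugger -- check-deployment production api-server
bazel run //tools/debug:production_debugger -- logs production api-server --lines 50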
Performance Debugging
#!/bin/bash
# //scripts/performance_debug.sh
echo "π Production Performance Debug Report"
echo "======================================"
# Build graph size
echo "📊 Build Graph:"
echo "Total targets: $(bazel query '//...' | wc -l)"
# Cache analysis
echo "πΎ Cache Performance:"
echo "Local cache size: $(du -sh ~/.cache/bazel 2>/dev/null || echo 'Unknown')"
# Resource usage
echo "π₯οΈ System Resources:"
echo "CPU usage: $(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | cut -d'%' -f1)%"
echo "Memory usage: $(free | grep Mem | awk '{printf "%.1f%%", $3/$2 * 100.0}')"
echo "Disk usage: $(df -h | grep '/dev/' | head -1 | awk '{print $5}')"
# Application metrics
if command -v kubectl &> /dev/null; then
echo "π Kubernetes Resources:"
kubectl top nodes 2>/dev/null || echo "kubectl not configured"
kubectl get pods -A --field-selector=status.phase=Running | wc -l | xargs echo "Running pods:"
fi
echo "β
Debug report complete"
6.8 Rollback and Recovery Procedures
Automated Rollback System
# //tools/rollback/BUILD
load("@rules_python//python:defs.bzl", "py_binary")

py_binary(
    name = "rollback_manager",
    srcs = ["rollback_manager.py"],
    deps = [
        "@pypi//kubernetes",
        "@pypi//requests",
        "@pypi//click",
    ],
)
# //tools/rollback/rollback_manager.py
"""Automated rollback and recovery system."""
import click
import time
import logging
from kubernetes import client, config
class RollbackManager:
def __init__(self):
config.load_kube_config()
self.k8s_apps = client.AppsV1Api()
self.k8s_core = client.CoreV1Api()
    def rollback_deployment(self, namespace: str, deployment: str, revision: int = None):
        """Rollback a deployment by re-applying the pod template of an older
        ReplicaSet; apps/v1 has no server-side rollback API, so this mirrors
        what `kubectl rollout undo` does."""
        try:
            replica_sets = self.k8s_apps.list_namespaced_replica_set(
                namespace=namespace, label_selector=f"app={deployment}"
            )
            def rev(rs):
                return int(rs.metadata.annotations.get(
                    "deployment.kubernetes.io/revision", "0"))
            candidates = sorted(replica_sets.items, key=rev)
            if revision:
                target = next((rs for rs in candidates if rev(rs) == revision), None)
            else:
                target = candidates[-2] if len(candidates) >= 2 else None
            if target is None:
                raise ValueError("No ReplicaSet found for the requested revision")
            # Strip the hash label that kubectl also removes on undo
            template = target.spec.template
            if template.metadata and template.metadata.labels:
                template.metadata.labels.pop("pod-template-hash", None)
            self.k8s_apps.patch_namespaced_deployment(
                name=deployment,
                namespace=namespace,
                body={"spec": {"template": template}},
            )
print(f"Initiated rollback for {deployment} in {namespace}")
# Wait for rollback to complete
self._wait_for_rollback_completion(namespace, deployment)
except Exception as e:
print(f"Rollback failed: {e}")
raise
def _wait_for_rollback_completion(self, namespace: str, deployment: str, timeout: int = 300):
"""Wait for rollback to complete."""
start_time = time.time()
while time.time() - start_time < timeout:
deployment_obj = self.k8s_apps.read_namespaced_deployment(
name=deployment, namespace=namespace
)
if (deployment_obj.status.ready_replicas == deployment_obj.spec.replicas and
deployment_obj.status.updated_replicas == deployment_obj.spec.replicas):
print(f"β
Rollback completed for {deployment}")
return True
print(f"β³ Waiting for rollback... ({int(time.time() - start_time)}s)")
time.sleep(10)
raise TimeoutError(f"Rollback timeout after {timeout}s")
def health_check_after_rollback(self, namespace: str, deployment: str):
"""Perform health checks after rollback."""
# Get service endpoint
try:
service = self.k8s_core.read_namespaced_service(
name=deployment, namespace=namespace
)
            # Basic health check; a ClusterIP is only reachable from inside
            # the cluster (or via kubectl port-forward)
            import requests
            port = service.spec.ports[0].port
            health_url = f"http://{service.spec.cluster_ip}:{port}/health"
for attempt in range(5):
try:
response = requests.get(health_url, timeout=10)
if response.status_code == 200:
print("β
Health check passed")
return True
except Exception as e:
print(f"Health check attempt {attempt + 1} failed: {e}")
time.sleep(5)
print("β Health check failed")
return False
except Exception as e:
print(f"Error during health check: {e}")
return False
@click.group()
def cli():
"""Rollback management CLI."""
pass
@cli.command()
@click.argument('namespace')
@click.argument('deployment')
@click.option('--revision', type=int, help='Specific revision to rollback to')
@click.option('--confirm', is_flag=True, help='Skip confirmation prompt')
def rollback(namespace, deployment, revision, confirm):
"""Rollback deployment to previous version."""
if not confirm:
click.confirm(f'Are you sure you want to rollback {deployment} in {namespace}?', abort=True)
manager = RollbackManager()
manager.rollback_deployment(namespace, deployment, revision)
manager.health_check_after_rollback(namespace, deployment)
@cli.command()
@click.argument('namespace')
@click.argument('deployment')
def history(namespace, deployment):
"""Show deployment rollout history."""
try:
config.load_kube_config()
k8s = client.AppsV1Api()
# Get replica sets for deployment
replica_sets = k8s.list_namespaced_replica_set(
namespace=namespace,
label_selector=f"app={deployment}"
)
print(f"Rollout history for {deployment}:")
for rs in replica_sets.items:
revision = rs.metadata.annotations.get('deployment.kubernetes.io/revision', 'unknown')
created = rs.metadata.creation_timestamp
print(f" Revision {revision}: {created}")
except Exception as e:
print(f"Error getting history: {e}")
if __name__ == "__main__":
cli()
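Typical invocations:
# Roll back to the previous revision (with confirmation prompt)
bazel run //tools/rollback:rollback_manager -- rollback production api-server
# Roll back to a specific revision, non-interactively
bazel run //tools/rollback:rollback_manager -- rollback production api-server --revision 7 --confirm
# Inspect rollout history first
bazel run //tools/rollback:rollback_manager -- history production api-server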
6.9 Best Practices Summary
Production Deployment Checklist
# Production Deployment Checklist
## Pre-Deployment
- [ ] All tests passing (unit, integration, e2e)
- [ ] Security scans completed with no critical issues
- [ ] Performance benchmarks within acceptable range
- [ ] Database migrations tested and ready
- [ ] Rollback plan documented and tested
- [ ] Monitoring and alerting configured
- [ ] Load testing completed
## During Deployment
- [ ] Blue-green deployment strategy used
- [ ] Health checks passing before traffic switch
- [ ] Monitoring dashboards active
- [ ] Team notified of deployment start
- [ ] Gradual traffic ramp-up if applicable
## Post-Deployment
- [ ] All health checks passing
- [ ] Error rates within normal bounds
- [ ] Performance metrics stable
- [ ] User acceptance testing completed
- [ ] Rollback plan ready if needed
- [ ] Documentation updated
- [ ] Team notified of successful deployment
Performance Optimization Guidelines
- Build Performance
  - Use remote caching and execution
  - Optimize target granularity
  - Monitor cache hit rates
  - Use incremental builds effectively
- Application Performance
  - Implement proper monitoring
  - Use performance profiling
  - Optimize Docker images
  - Configure resource limits appropriately
- Infrastructure Performance
  - Use appropriate machine types
  - Configure autoscaling
  - Monitor resource utilization
  - Implement proper load balancing
Module 6 Exercises
Exercise 1: Complete CI/CD Pipeline
Set up a complete CI/CD pipeline for a Python web application including automated testing, security scanning, and deployment to staging and production environments.
Exercise 2: Monitoring Implementation
Implement comprehensive monitoring for your Bazel builds and deployed applications, including custom metrics, alerting rules, and dashboards.
Exercise 3: Disaster Recovery Testing
Create and test a complete disaster recovery procedure including automated rollbacks, data backup verification, and service restoration.
Exercise 4: Security Hardening
Implement security best practices including vulnerability scanning, secret management, and secure build configurations.
Course Completion Project
Final Project: Production-Ready Microservices Platform
Build a complete microservices platform demonstrating all concepts learned:
- Architecture: Multiple Python microservices with shared libraries
- Build System: Advanced Bazel configuration with custom rules
- Dependencies: Complex dependency management with version pinning
- Caching: Multi-tier caching strategy with remote execution
- CI/CD: Complete pipeline with automated testing and deployment
- Monitoring: Comprehensive observability and alerting
- Security: Vulnerability scanning and secret management
- Production: Blue-green deployments with automated rollback
This project should demonstrate expert-level, production-ready Bazel usage from build configuration through deployment and operations.
Key Takeaways
- Production deployments require comprehensive automation and monitoring
- Security must be built into every stage of the pipeline
- Observability is crucial for maintaining production systems
- Rollback capabilities are as important as deployment capabilities
- Performance optimization is an ongoing process requiring measurement
- Documentation and runbooks are essential for team success
- Testing in production-like environments prevents deployment issues
Congratulations! You've completed the Modern Bazel with Python tutorial.