0) Repo layout
profile-service/
├─ terraform/
│ ├─ providers.tf
│ ├─ variables.tf
│ ├─ vpc.tf
│ ├─ eks.tf
│ ├─ iam-irsa.tf
│ ├─ dynamodb.tf
│ ├─ ecr.tf
│ ├─ outputs.tf
├─ k8s/
│ ├─ namespace.yaml
│ ├─ deployment.yaml
│ ├─ service.yaml
│ ├─ ingress.yaml
├─ app/
│ ├─ app.py
│ ├─ requirements.txt
│ ├─ Dockerfile
├─ cicd/
│ └─ github-actions.yaml
├─ scripts/
│ └─ validate_config.py
├─ observability/
│ ├─ kube-prometheus-stack-values.yaml
│ └─ slo-alerts.yaml
└─ README.md
1) Terraform (IaC)
terraform/providers.tf
terraform {
required_version = ">= 1.6.0"
required_providers {
aws = { source = "hashicorp/aws", version = "~> 5.0" }
kubernetes = { source = "hashicorp/kubernetes", version = "~> 2.29" }
helm = { source = "hashicorp/helm", version = "~> 2.12" }
}
}
provider "aws" {
region = "us-east-2"
}
# These two providers will be configured after EKS is created (via outputs)
provider "kubernetes" {
host = data.aws_eks_cluster.cluster.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.cluster.token
}
provider "helm" {
kubernetes {
host = data.aws_eks_cluster.cluster.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.cluster.token
}
}
data "aws_caller_identity" "current" {}
terraform/variables.tf
variable "project" { default = "profile-service" }
variable "vpc_cidr" { default = "10.30.0.0/16" }
variable "private_subnets" { default = ["10.30.1.0/24","10.30.2.0/24"] }
variable "public_subnets" { default = ["10.30.10.0/24","10.30.11.0/24"] }
terraform/vpc.tf
resource "aws_vpc" "main" {
cidr_block = var.vpc_cidr
enable_dns_hostnames = true
enable_dns_support = true
tags = { Name = "${var.project}-vpc" }
}
resource "aws_internet_gateway" "igw" {
vpc_id = aws_vpc.main.id
}
resource "aws_subnet" "public" {
for_each = toset(var.public_subnets)
vpc_id = aws_vpc.main.id
cidr_block = each.value
map_public_ip_on_launch = true
tags = { Name = "${var.project}-public-${each.key}", "kubernetes.io/role/elb" = "1" }
}
resource "aws_subnet" "private" {
for_each = toset(var.private_subnets)
vpc_id = aws_vpc.main.id
cidr_block = each.value
tags = { Name = "${var.project}-private-${each.key}", "kubernetes.io/role/internal-elb" = "1" }
}
resource "aws_route_table" "public" {
vpc_id = aws_vpc.main.id
}
resource "aws_route" "default_inet" {
route_table_id = aws_route_table.public.id
destination_cidr_block = "0.0.0.0/0"
gateway_id = aws_internet_gateway.igw.id
}
resource "aws_route_table_association" "public_assoc" {
for_each = aws_subnet.public
route_table_id = aws_route_table.public.id
subnet_id = each.value.id
}
terraform/eks.tf
resource "aws_eks_cluster" "this" {
name = "${var.project}-eks"
role_arn = aws_iam_role.eks_cluster.arn
vpc_config {
subnet_ids = concat([for s in aws_subnet.private : s.id], [for s in aws_subnet.public : s.id])
}
depends_on = [aws_iam_role_policy_attachment.eks_cluster_AmazonEKSClusterPolicy]
}
resource "aws_iam_role" "eks_cluster" {
name = "${var.project}-eks-cluster-role"
assume_role_policy = data.aws_iam_policy_document.eks_trust.json
}
data "aws_iam_policy_document" "eks_trust" {
statement {
actions = ["sts:AssumeRole"]
principals {
  type        = "Service"
  identifiers = ["eks.amazonaws.com"]
}
}
}
resource "aws_iam_role_policy_attachment" "eks_cluster_AmazonEKSClusterPolicy" {
role = aws_iam_role.eks_cluster.name
policy_arn = "arn:aws:iam::aws:policy/AmazonEKSClusterPolicy"
}
resource "aws_eks_node_group" "ng" {
cluster_name = aws_eks_cluster.this.name
node_group_name = "${var.project}-ng"
node_role_arn = aws_iam_role.eks_node.arn
subnet_ids = [for s in aws_subnet.private : s.id]
scaling_config {
  desired_size = 2
  min_size     = 2
  max_size     = 4
}
instance_types = ["t3.large"]
depends_on = [aws_iam_role_policy_attachment.eks_worker_AmazonEKSWorkerNodePolicy]
}
resource "aws_iam_role" "eks_node" {
name = "${var.project}-eks-node-role"
assume_role_policy = data.aws_iam_policy_document.eks_nodes_trust.json
}
data "aws_iam_policy_document" "eks_nodes_trust" {
statement {
actions = ["sts:AssumeRole"]
principals {
  type        = "Service"
  identifiers = ["ec2.amazonaws.com"]
}
}
}
resource "aws_iam_role_policy_attachment" "eks_worker_AmazonEKSWorkerNodePolicy" {
role = aws_iam_role.eks_node.name
policy_arn = "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy"
}
resource "aws_iam_role_policy_attachment" "eks_worker_AmazonEC2ContainerRegistryReadOnly" {
role = aws_iam_role.eks_node.name
policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly"
}
resource "aws_iam_role_policy_attachment" "eks_worker_AmazonEKS_CNI_Policy" {
role = aws_iam_role.eks_node.name
policy_arn = "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy"
}
data "aws_eks_cluster" "cluster" { name = aws_eks_cluster.this.name }
data "aws_eks_cluster_auth" "cluster" { name = aws_eks_cluster.this.name }
terraform/iam-irsa.tf (IRSA for DynamoDB access)
resource "aws_iam_role" "irsa_role" {
name = "${var.project}-irsa-dynamodb"
assume_role_policy = jsonencode({
Version = "2012-10-17",
Statement = [{
Effect = "Allow",
Principal = { Federated = aws_iam_openid_connect_provider.eks.arn },
Action = "sts:AssumeRoleWithWebIdentity",
Condition = {
StringEquals = {
"${replace(aws_iam_openid_connect_provider.eks.url, "https://", "")}:sub" = "system:serviceaccount:profile:app-sa"
}
}
}]
})
}
resource "aws_iam_policy" "dynamo_rw" {
name = "${var.project}-dynamo-rw"
policy = jsonencode({
Version = "2012-10-17",
Statement = [{
Effect = "Allow",
Action = ["dynamodb:*"],
Resource = [aws_dynamodb_table.profiles.arn]
}]
})
}
resource "aws_iam_role_policy_attachment" "attach_rw" {
role = aws_iam_role.irsa_role.name
policy_arn = aws_iam_policy.dynamo_rw.arn
}
resource "aws_iam_openid_connect_provider" "eks" {
client_id_list = ["sts.amazonaws.com"]
thumbprint_list = ["9e99a48a9960b14926bb7f3b02e22da0afd10df6"] # (AWS published for OIDC, OK for demo)
url = data.aws_eks_cluster.cluster.identity[0].oidc[0].issuer
}
terraform/dynamodb.tf
resource "aws_dynamodb_table" "profiles" {
name = "${var.project}-profiles"
billing_mode = "PAY_PER_REQUEST"
hash_key = "userId"
attribute {
  name = "userId"
  type = "S"
}
server_side_encryption { enabled = true } # KMS-managed
point_in_time_recovery { enabled = true } # backups/DR
tags = { Environment = "prod", Project = var.project }
}
terraform/ecr.tf
resource "aws_ecr_repository" "app" {
name = "${var.project}-api"
image_scanning_configuration { scan_on_push = true }
encryption_configuration { encryption_type = "AES256" }
}
terraform/outputs.tf
output "cluster_name" { value = aws_eks_cluster.this.name }
output "ecr_repo_url" { value = aws_ecr_repository.app.repository_url }
output "dynamodb_table" { value = aws_dynamodb_table.profiles.name }
Apply steps
cd terraform
terraform init
terraform apply -auto-approve
aws eks update-kubeconfig --region us-east-2 --name profile-service-eks
Install AWS Load Balancer Controller via Helm (creates ALBs), after terraform apply:
helm repo add eks https://aws.github.io/eks-charts
kubectl create namespace kube-system --dry-run=client -o yaml | kubectl apply -f -
helm upgrade --install aws-load-balancer-controller eks/aws-load-balancer-controller \
  -n kube-system \
  --set clusterName=profile-service-eks \
  --set serviceAccount.create=true \
  --set region=us-east-2 \
  --set vpcId=$(aws eks describe-cluster --name profile-service-eks --region us-east-2 --query "cluster.resourcesVpcConfig.vpcId" --output text)
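Optionally, you can confirm the cluster and node group are ready before deploying anything. This is a small boto3 sketch (not required; it reports the same status the console and CLI show):

import boto3

eks = boto3.client("eks", region_name="us-east-2")
cluster = eks.describe_cluster(name="profile-service-eks")["cluster"]
print("cluster status:", cluster["status"])      # expect ACTIVE
print("endpoint      :", cluster["endpoint"])

for ng in eks.list_nodegroups(clusterName="profile-service-eks")["nodegroups"]:
    status = eks.describe_nodegroup(
        clusterName="profile-service-eks", nodegroupName=ng
    )["nodegroup"]["status"]
    print(f"nodegroup {ng}: {status}")            # expect ACTIVE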
2) App (Flask API)
app/app.py
from flask import Flask, request, jsonify
import boto3, os
app = Flask(__name__)
TABLE = os.getenv("TABLE_NAME")
REGION = os.getenv("AWS_REGION", "us-east-2")
dynamo = boto3.resource("dynamodb", region_name=REGION)
table = dynamo.Table(TABLE)
@app.get("/health")
def health():
return {"status": "ok"}
@app.post("/profile")
def upsert_profile():
body = request.get_json()
if not body or "userId" not in body:
return {"error": "userId required"}, 400
table.put_item(Item=body)
return {"ok": True, "userId": body["userId"]}
@app.get("/profile/<user_id>")
def get_profile(user_id):
resp = table.get_item(Key={"userId": user_id})
return jsonify(resp.get("Item") or {}), 200
if __name__ == "__main__":
app.run("0.0.0.0", 8080)
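A quick way to exercise this API contract locally without touching AWS is Flask's built-in test client. The sketch below (run from the app/ directory with the requirements installed) swaps the DynamoDB table for a hypothetical in-memory FakeTable, so no credentials are needed:

import os
os.environ.setdefault("TABLE_NAME", "profile-service-profiles")  # must be set before importing app

import app as profile_app


class FakeTable:
    """In-memory stand-in for the DynamoDB Table resource used by app.py."""
    def __init__(self):
        self.items = {}

    def put_item(self, Item):
        self.items[Item["userId"]] = Item

    def get_item(self, Key):
        item = self.items.get(Key["userId"])
        return {"Item": item} if item else {}


profile_app.table = FakeTable()          # swap the real table for the fake
client = profile_app.app.test_client()   # Flask's built-in test client

assert client.get("/health").get_json() == {"status": "ok"}
assert client.post("/profile", json={"userId": "123", "name": "Aisalkyn"}).status_code == 200
assert client.get("/profile/123").get_json()["name"] == "Aisalkyn"
print("Local API contract checks passed.")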
app/requirements.txt
flask==3.0.0
boto3==1.34.0
gunicorn==21.2.0
app/Dockerfile
FROM python:3.11-slim
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY app.py .
EXPOSE 8080
CMD ["gunicorn", "--bind", "0.0.0.0:8080", "app:app"]
3) Kubernetes Manifests (IRSA + ALB Ingress)
k8s/namespace.yaml
apiVersion: v1
kind: Namespace
metadata:
name: profile
k8s/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: profile-api
namespace: profile
spec:
replicas: 2
selector: { matchLabels: { app: profile-api } }
template:
metadata: { labels: { app: profile-api } }
spec:
serviceAccountName: app-sa
containers:
- name: api
image: REPLACE_WITH_ECR_URL:latest
ports: [{containerPort: 8080}]
env:
- name: TABLE_NAME
value: "profile-service-profiles"
- name: AWS_REGION
value: "us-east-2"
resources:
requests: { cpu: "200m", memory: "256Mi" }
limits: { cpu: "500m", memory: "512Mi" }
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: app-sa
namespace: profile
annotations:
eks.amazonaws.com/role-arn: arn:aws:iam::REPLACE_ACCOUNT_ID:role/profile-service-irsa-dynamodb
Replace REPLACE_WITH_ECR_URL and REPLACE_ACCOUNT_ID with your values (from terraform output).
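If you prefer not to edit the manifest by hand, here is an optional helper sketch (assumes it is run from the repo root after terraform apply): it reads `terraform output -json` and fills in both placeholders.

import json
import pathlib
import subprocess

outputs = json.loads(
    subprocess.check_output(["terraform", "output", "-json"], cwd="terraform")
)
ecr_url = outputs["ecr_repo_url"]["value"]
account_id = ecr_url.split(".")[0]  # ECR URLs start with <account_id>.dkr.ecr...

manifest = pathlib.Path("k8s/deployment.yaml")
text = manifest.read_text()
text = text.replace("REPLACE_WITH_ECR_URL", ecr_url)
text = text.replace("REPLACE_ACCOUNT_ID", account_id)
manifest.write_text(text)
print(f"Patched image {ecr_url}:latest and role ARN account {account_id}")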
k8s/service.yaml
apiVersion: v1
kind: Service
metadata:
name: profile-api-svc
namespace: profile
spec:
type: ClusterIP
selector: { app: profile-api }
ports:
- port: 80
targetPort: 8080
k8s/ingress.yaml (public HTTPS via ALB)
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: profile-api-ing
namespace: profile
annotations:
kubernetes.io/ingress.class: alb
alb.ingress.kubernetes.io/scheme: internet-facing
alb.ingress.kubernetes.io/target-type: ip
# For simple demo, HTTP. For real prod, attach ACM cert & force HTTPS.
spec:
rules:
- http:
paths:
- path: /health
pathType: Prefix
backend:
service:
name: profile-api-svc
port:
number: 80
- path: /profile
pathType: Prefix
backend:
service:
name: profile-api-svc
port:
number: 80
Deploy:
kubectl apply -f k8s/namespace.yaml
kubectl apply -f k8s/deployment.yaml
kubectl apply -f k8s/service.yaml
kubectl apply -f k8s/ingress.yaml
Find the ALB DNS:
kubectl get ingress -n profile
4) CI/CD (GitHub Actions)
cicd/github-actions.yaml
name: ci-cd
on:
push:
branches: [ "main" ]
env:
AWS_REGION: us-east-2
ECR_REPO: ${{ secrets.ECR_REPO }} # set in repo secrets
CLUSTER: profile-service-eks
jobs:
build-deploy:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Validate Config (Python)
  run: python scripts/validate_config.py
- name: Configure AWS
uses: aws-actions/configure-aws-credentials@v4
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ env.AWS_REGION }}
- name: Login to ECR
id: ecr
uses: aws-actions/amazon-ecr-login@v2
- name: Build & Push Image
run: |
IMAGE_URI=${{ steps.ecr.outputs.registry }}/${{ env.ECR_REPO }}:latest
docker build -t $IMAGE_URI ./app
docker push $IMAGE_URI
echo "IMAGE_URI=$IMAGE_URI" >> $GITHUB_ENV
- name: Update K8s image
run: |
aws eks update-kubeconfig --name $CLUSTER --region $AWS_REGION
sed -i "s|REPLACE_WITH_ECR_URL:latest|${IMAGE_URI}|g" k8s/deployment.yaml
kubectl apply -f k8s/deployment.yaml
scripts/validate_config.py
import os, sys
# simple check (expand as needed)
required = ["AWS_REGION"]
missing = [k for k in required if not os.getenv(k)]
if missing:
print(f"Missing env vars: {missing}")
sys.exit(1) # Fail pipeline
print("Config looks good.")
Pipeline behavior if script fails: exits non-zero → job fails → deployment stops.
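A possible extension of validate_config.py (a sketch, not part of the original pipeline): besides env vars, it checks that the region string looks sane and that the manifests the later steps rely on actually exist, still failing the job with a non-zero exit.

import os
import pathlib
import re
import sys

required_env = ["AWS_REGION", "ECR_REPO"]          # both are set at workflow level
missing = [k for k in required_env if not os.getenv(k)]

region = os.getenv("AWS_REGION", "")
bad_region = bool(region) and not re.fullmatch(r"[a-z]{2}-[a-z]+-\d", region)

required_files = ["k8s/deployment.yaml", "k8s/service.yaml", "k8s/ingress.yaml", "app/Dockerfile"]
absent = [f for f in required_files if not pathlib.Path(f).exists()]

if missing or bad_region or absent:
    print(f"Missing env: {missing}; bad region format: {bad_region}; missing files: {absent}")
    sys.exit(1)  # non-zero exit fails the job and stops the deploy
print("Config looks good.")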
5) Observability (Prometheus/Grafana + SLO Alerts)
Install kube-prometheus-stack (once)
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm upgrade --install kube-stack prometheus-community/kube-prometheus-stack \
-n monitoring --create-namespace \
-f observability/kube-prometheus-stack-values.yaml
observability/kube-prometheus-stack-values.yaml (minimal)
grafana:
adminPassword: "admin"
service:
type: LoadBalancer
prometheus:
prometheusSpec:
retention: 7d
observability/slo-alerts.yaml (example SLO alert; the expression below uses CPU as a stand-in because the demo app does not yet export HTTP error metrics)
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: slo-alerts
namespace: monitoring
spec:
groups:
- name: api-availability
rules:
- alert: HighErrorRate
expr: sum(rate(container_cpu_usage_seconds_total{pod=~"profile-api.*"}[5m])) > 1
for: 10m
labels: { severity: warning }
annotations:
summary: "High error or CPU indicating potential SLI breach"
description: "Investigate app logs / throttling / DB."
Interview line:
“SLI = success/latency, SLO = 99.9% success, SLA = 99.5% public. Alerts watch error/latency against SLO.”
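To make those targets concrete, here is the arithmetic behind them as a quick plain-Python sketch:

def allowed_downtime_minutes(target: float, days: int = 30) -> float:
    """Minutes of unavailability permitted over `days` at the given availability target."""
    total_minutes = days * 24 * 60
    return total_minutes * (1 - target)

for target in (0.999, 0.995):   # SLO 99.9%, SLA 99.5%
    print(f"{target:.3%}: {allowed_downtime_minutes(target):.1f} min per 30 days")
# 99.900%: 43.2 min per 30 days
# 99.500%: 216.0 min per 30 days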
6) Encryption
- In transit: terminate TLS at the ALB (attach an ACM cert to the Ingress/ALB; the demo used plain HTTP for speed).
- At rest: DynamoDB KMS enabled (already in Terraform).
- Secrets: Prefer AWS Secrets Manager/SSM Parameter Store with IRSA.
Interview line: “TLS to ALB/mTLS if zero-trust; KMS at-rest; no plaintext secrets.”
7) DR (optional stretch)
- Turn the DynamoDB table into a Global Table (add a replica in us-east-1) and put Route 53 latency routing in front of two regional ALBs.
- Result: active-active. If us-east-2 fails, traffic flows to us-east-1.
8) Quick test
After CI/CD deploys:
ING=$(kubectl get ing -n profile -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}')
curl http://$ING/health
curl -X POST http://$ING/profile -H "Content-Type: application/json" -d '{"userId":"123","name":"Aisalkyn"}'
curl http://$ING/profile/123
9) Troubleshooting runbook (read this fast in interviews)
- Can’t reach DB: nslookup, nc -zv <host> <port>, check the IRSA role, VPC Flow Logs.
- Pods CrashLoop: kubectl logs -n profile deploy/profile-api, check env vars.
- Ingress not coming up: check ALB controller logs, subnet tags, security groups.
- Throttling: DynamoDB metrics; adjust access patterns or add a GSI.
10) Mini Q&A (mapped to this project)
- DevOps vs SRE? DevOps = delivery & automation; SRE = reliability (SLI/SLO/SLA, incident response, error budgets).
- DR for Netflix-scale? Active-active multi-region, Global Tables, Route 53 / AWS Global Accelerator, automated failover.
- DynamoDB active-active? Yes, Global Tables (multi-region read/write).
- Encryption? TLS/mTLS in transit, KMS at rest, Secrets Manager, IRSA.
- If pipeline script fails? Non-zero exit → pipeline fails → no deploy → logs/alerts → fix → rerun.
- External service to EKS app? ALB/Ingress; VPC routing or TGW/peering; SG-to-SG rules; Route 53 DNS.
What to say in your summary (closing line)
“I built a small but production-style stack in us-east-2 with Terraform: EKS, DynamoDB, IRSA, ALB Ingress, ECR, CI/CD with Python validation, Prometheus/Grafana observability, encryption in transit/at rest, and an optional multi-region DR extension. I can demo deploys, run tests, and walk through troubleshooting and SRE practices end-to-end.”
Part 2:
1) HTTPS with ACM on the ALB Ingress
1.1 Request an ACM cert (in us-east-2)
aws acm request-certificate \
--domain-name api.example.com \
--validation-method DNS \
--region us-east-2
Get the validation CNAME from:
aws acm list-certificates --region us-east-2
aws acm describe-certificate --certificate-arn <CERT_ARN> --region us-east-2
Create that CNAME in Route 53 (see 2.2). When ACM shows ISSUED, proceed.
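Instead of re-running the CLI by hand, you can poll ACM until the certificate is issued. A small boto3 sketch (assumes credentials for the same account and that CERT_ARN is the ARN returned by request-certificate; the ARN shown is a placeholder):

import time
import boto3

CERT_ARN = "arn:aws:acm:us-east-2:123456789012:certificate/REPLACE_ME"  # placeholder
acm = boto3.client("acm", region_name="us-east-2")

while True:
    cert = acm.describe_certificate(CertificateArn=CERT_ARN)["Certificate"]
    status = cert["Status"]
    print("ACM status:", status)
    if status != "PENDING_VALIDATION":
        break
    # The DNS validation CNAME you must create in Route 53:
    for opt in cert.get("DomainValidationOptions", []):
        rr = opt.get("ResourceRecord")
        if rr:
            print(f"  create CNAME {rr['Name']} -> {rr['Value']}")
    time.sleep(30)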
1.2 Update Ingress for HTTPS + redirect
Replace <CERT_ARN> and keep your namespace/service names:
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: profile-api-ing
namespace: profile
annotations:
kubernetes.io/ingress.class: alb
alb.ingress.kubernetes.io/scheme: internet-facing
alb.ingress.kubernetes.io/target-type: ip
alb.ingress.kubernetes.io/listen-ports: '[{"HTTP":80,"HTTPS":443}]'
alb.ingress.kubernetes.io/certificate-arn: <CERT_ARN>
alb.ingress.kubernetes.io/ssl-redirect: '443'
spec:
rules:
- host: api.example.com
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: profile-api-svc
port:
number: 80
Apply:
kubectl apply -f k8s/ingress.yaml
2) Route 53 DNS → ALB
2.1 Get ALB hostname created by the Ingress
kubectl get ingress profile-api-ing -n profile -o jsonpath='{.status.loadBalancer.ingress[0].hostname}'; echo
# example: k8s-profile-...us-east-2.elb.amazonaws.com
2.2 Create Route 53 record
Replace <HOSTED_ZONE_ID> and domain:
cat > r53.json <<'JSON'
{
"Comment": "api.example.com → ALB",
"Changes": [{
"Action": "UPSERT",
"ResourceRecordSet": {
"Name": "api.example.com",
"Type": "CNAME",
"TTL": 60,
"ResourceRecords": [{ "Value": "ALB_HOSTNAME_HERE" }]
}
}]
}
JSON
aws route53 change-resource-record-sets \
--hosted-zone-id <HOSTED_ZONE_ID> \
--change-batch file://r53.json
When ACM validation CNAME is also present (from 1.1), HTTPS will be valid for https://api.example.com.
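A quick way to confirm the record is live (once DNS has propagated) is to resolve both names and compare. A small stdlib sketch; DOMAIN and ALB are your values, and ALB IPs rotate, so compare against a fresh lookup:

import socket

DOMAIN = "api.example.com"          # your record name
ALB = "ALB_HOSTNAME_HERE"           # value from `kubectl get ingress ...`

domain_ips = sorted({ai[4][0] for ai in socket.getaddrinfo(DOMAIN, 443)})
alb_ips = sorted({ai[4][0] for ai in socket.getaddrinfo(ALB, 443)})
print("domain resolves to:", domain_ips)
print("ALB resolves to:   ", alb_ips)
print("match:", bool(set(domain_ips) & set(alb_ips)))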
3) DynamoDB Global Tables (Active–Active)
Add us-east-1 as a replica of your table provisioned in us-east-2.
3.1 Terraform (preferred)
Update terraform/dynamodb.tf:
resource "aws_dynamodb_table" "profiles" {
name = "${var.project}-profiles"
billing_mode = "PAY_PER_REQUEST"
hash_key = "userId"
attribute {
  name = "userId"
  type = "S"
}
server_side_encryption { enabled = true }
point_in_time_recovery { enabled = true }
# Global Table replicas (v2 style)
replica {
region_name = "us-east-1"
}
tags = { Environment = "prod", Project = var.project }
}
# Optional: provider alias for us-east-1 (useful if you add region-specific resources there)
provider "aws" {
alias = "use1"
region = "us-east-1"
}
Note: for the aws_dynamodb_table resource itself, the replica block is enough; Terraform creates the us-east-1 replica through the primary region's provider, so no explicit provider link is required. Keep the alias only if you plan to manage other us-east-1 resources (for example a second cluster or a replica KMS key).
Re-apply:
terraform apply -auto-approve
Terraform will convert your table to a Global Table (v2) with replicas in us-east-1.
3.2 App notes (nothing to change usually)
- Your app keeps using the regional endpoint via the standard AWS SDK.
- If you deploy the app in both regions, set AWS_REGION accordingly in each Deployment.
- Conflict resolution: DynamoDB Global Tables use last-writer-wins; design idempotent writes for safety (see the sketch below).
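A minimal sketch of that idempotent-write idea, assuming you add a numeric version attribute to each item (a hypothetical attribute, not in the original schema). Note the condition is only enforced in the region that receives the write; true cross-region conflicts are still resolved last-writer-wins.

import boto3
from botocore.exceptions import ClientError

table = boto3.resource("dynamodb", region_name="us-east-2").Table("profile-service-profiles")

def update_profile(item: dict) -> bool:
    """Write only if the incoming version is newer than what is stored."""
    try:
        table.put_item(
            Item=item,
            ConditionExpression="attribute_not_exists(userId) OR #ver < :v",
            ExpressionAttributeNames={"#ver": "version"},
            ExpressionAttributeValues={":v": item["version"]},
        )
        return True
    except ClientError as err:
        if err.response["Error"]["Code"] == "ConditionalCheckFailedException":
            return False  # a newer version already exists; drop or retry with fresh data
        raise

print(update_profile({"userId": "123", "name": "Aisalkyn", "version": 2}))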
4) Optional: Route 53 latency routing for multi-region ALBs
If you deploy the same app in us-east-2 and us-east-1 (two EKS clusters and two Ingress/ALBs), create two records and a Latency policy:
{
"Comment": "Latency-based routing for api.example.com",
"Changes": [
{
"Action": "UPSERT",
"ResourceRecordSet": {
"Name": "api.example.com",
"Type": "A",
"SetIdentifier": "use2",
"Region": "us-east-2",
"AliasTarget": {
"HostedZoneId": "Z3AADJGX6KTTL2",
"DNSName": "ALB_USE2_HOSTNAME",
"EvaluateTargetHealth": true
}
}
},
{
"Action": "UPSERT",
"ResourceRecordSet": {
"Name": "api.example.com",
"Type": "A",
"SetIdentifier": "use1",
"Region": "us-east-1",
"AliasTarget": {
"HostedZoneId": "Z35SXDOTRQ7X7K",
"DNSName": "ALB_USE1_HOSTNAME",
"EvaluateTargetHealth": true
}
}
}
]
}
Note: the HostedZoneId values above are example ALB alias hosted zone IDs and vary by region. Check the AWS docs for the correct alias hosted zone ID for each region, then substitute.
This gives you active–active traffic steering and fast failover.
5) CI/CD tweaks for HTTPS & image rollout
In your GitHub Actions:
- Add HOSTNAME=api.example.com as an env var or secret.
- After applying ingress.yaml, you can verify:
curl -I https://api.example.com/health
If you want blue/green or canary rollouts, add a second Deployment and route by path/header using the ALB Ingress Controller annotations, or switch to a Service Mesh later.
Quick “interview lines” you can read
- HTTPS: “We use ACM for certs, ALB terminates TLS, and we force redirect from 80→443 at the Ingress.”
- DNS: “Route 53 CNAME/Alias maps api.example.com to the ALB; health checks enable failover.”
- Global Tables: “DynamoDB Global Tables give us multi-region active-active with low RPO/RTO.”
- End-to-end: “Users hit https://api.example.com, Route 53 resolves to the closest ALB, ALB → EKS pods, pods use IRSA to call DynamoDB over HTTPS; data is encrypted at rest with KMS; observability via Prometheus/Grafana.”
Part 3:
This Runbook explains:
- How the system works
- What to check during failures
- Commands to run
- How to respond as SRE.
This becomes your production playbook.
✅ RUNBOOK — Profile Service (EKS + ALB + DynamoDB + IRSA)
Service Summary
User → Route53 → ALB (HTTPS) → EKS Pods → DynamoDB (Global Table)
Core Components
| Component | Purpose |
|---|---|
| ALB (Ingress) | Handles public HTTPS traffic |
| EKS Deployment | Runs the Python API |
| Service Account (IRSA) | Grants pod access to DynamoDB (no secrets) |
| DynamoDB Global Table | Stores profile data (multi-region) |
| Prometheus + Grafana | Metrics + Dashboards |
| CloudWatch Logs | Application logs |
1) Health Check
Check the ALB endpoint or custom domain:
curl -I https://api.example.com/health
Expected: 200 OK & JSON {"status": "ok"}
If down → Go to Section 2 (Ingress / ALB).
2) ALB / Ingress Debugging
Get Ingress status:
kubectl get ingress profile-api-ing -n profile
Get ALB hostname:
kubectl get ingress profile-api-ing -n profile -o jsonpath='{.status.loadBalancer.ingress[0].hostname}'; echo
Check targets:
aws elbv2 describe-target-health \
--target-group-arn <TARGET_GROUP_ARN> \
--region us-east-2
If targets are unhealthy:
Go to Section 3 (Pod / App issues).
3) Pod / Application Debugging
Check Deployment state:
kubectl get deploy -n profile
Check running pods:
kubectl get pods -n profile -o wide
Check logs:
kubectl logs -n profile deploy/profile-api --tail=100
Restart / redeploy:
kubectl rollout restart deploy/profile-api -n profile
4) Database Connectivity Check
Exec into pod:
kubectl exec -it -n profile $(kubectl get pod -n profile -o jsonpath='{.items[0].metadata.name}') -- sh
DNS check:
nslookup dynamodb.us-east-2.amazonaws.com   # the regional DynamoDB endpoint (the table name is not a DNS hostname)
Network test:
apt-get update && apt-get install -y dnsutils curl   # python:3.11-slim is Debian-based, so apt-get rather than apk
curl -I https://dynamodb.us-east-2.amazonaws.com      # should respond quickly, not time out
Alternatively, skip installing packages and use the Python check shown at the end of this section.
If DNS or network fails:
- Check VPC routing, subnets, NACL, SG-to-SG rules
If authentication fails:
- Go to Section 5 (IRSA).
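Since the image already ships Python, a connectivity check needs no extra tools. A sketch you can paste into python3 inside the pod (adjust the region if you deploy elsewhere):

import socket

ENDPOINT = "dynamodb.us-east-2.amazonaws.com"

try:
    ips = sorted({ai[4][0] for ai in socket.getaddrinfo(ENDPOINT, 443)})
    print("DNS OK:", ips)
except socket.gaierror as err:
    raise SystemExit(f"DNS failure: {err}")   # check VPC DNS / CoreDNS

with socket.create_connection((ENDPOINT, 443), timeout=5):
    print("TCP 443 reachable")                # else check routes, NACLs, SGs, NAT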
5) IRSA Role Debugging (Pod → DynamoDB Access)
Confirm pod has correct IAM role:
kubectl describe sa app-sa -n profile
Look for:
eks.amazonaws.com/role-arn: arn:aws:iam::<ACCOUNT_ID>:role/profile-service-irsa-dynamodb
Check AWS permissions:
aws iam get-role --role-name profile-service-irsa-dynamodb
aws iam list-attached-role-policies --role-name profile-service-irsa-dynamodb
If denied DynamoDB:
Check CloudWatch Logs → “AccessDeniedException”
Fix by attaching correct IAM policy:
aws iam attach-role-policy \
--role-name profile-service-irsa-dynamodb \
--policy-arn arn:aws:iam::aws:policy/AmazonDynamoDBFullAccess
(For production, use least privilege – the original custom policy.)
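To confirm the pod really assumes the IRSA role and can reach the table, you can run this in-pod sketch (boto3 is already in the image; env var names are the ones the EKS pod identity webhook injects):

import os
import boto3

print("AWS_ROLE_ARN:", os.getenv("AWS_ROLE_ARN"))              # injected by the IRSA webhook
print("token file:  ", os.getenv("AWS_WEB_IDENTITY_TOKEN_FILE"))

region = os.getenv("AWS_REGION", "us-east-2")
sts = boto3.client("sts", region_name=region)
print("caller identity:", sts.get_caller_identity()["Arn"])    # should show the IRSA role

table = boto3.resource("dynamodb", region_name=region).Table(
    os.getenv("TABLE_NAME", "profile-service-profiles")
)
print("table status:", table.table_status)                     # AccessDenied here => IAM policy issue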
6) DynamoDB Data Debugging
Check if record exists:
aws dynamodb get-item \
--table-name profile-service-profiles \
--key '{"userId": {"S": "123"}}' \
--region us-east-2
Write test:
aws dynamodb put-item \
--table-name profile-service-profiles \
--item '{"userId": {"S": "test"}, "name": {"S": "Debug"}}' \
--region us-east-2
If table works → issue is app config / IRSA.
7) Global Table (DR / Multi-Region)
Check replication:
aws dynamodb describe-table \
--table-name profile-service-profiles \
--region us-east-2 \
--query "Table.Replicas"
If us-east-2 fails:
Change DNS routing to us-east-1:
aws route53 change-resource-record-sets \
--hosted-zone-id <ZONE_ID> \
--change-batch file://failover-us-east-1.json
Failover takes effect as soon as clients’ DNS caches expire (the record TTL, 60 seconds in the example above).
8) SLO / Metrics / Observability Checks
App success rate (SLI):
View Grafana dashboard:
Grafana → Explore → metric: request_success_rate
Error budget check:
If success rate < SLO: 99.9% → trigger alert → stop feature releases, start reliability improvement.
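The error-budget math behind that policy, as a small sketch (assumed inputs: request counters for the last 30 days, e.g. pulled from Prometheus):

SLO = 0.999

def error_budget_report(total_requests: int, failed_requests: int) -> None:
    success_rate = 1 - failed_requests / total_requests
    budget = total_requests * (1 - SLO)          # failures we are allowed this window
    consumed = failed_requests / budget          # > 1.0 means the budget is blown
    print(f"success rate : {success_rate:.4%}")
    print(f"budget used  : {consumed:.0%} of allowed failures")
    if consumed >= 1.0:
        print("-> freeze feature releases, prioritize reliability work")

error_budget_report(total_requests=10_000_000, failed_requests=12_000)
# success rate : 99.8800%, budget used: 120% -> freeze releases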
✅ 15-Second Interview Summary Line (Read This)
“I built a fully automated EKS + DynamoDB system in us-east-2 with Terraform, IRSA security, ALB HTTPS, Route 53 DNS, Prometheus/Grafana observability, and DynamoDB Global Tables for active-active DR. I have a full Runbook to troubleshoot networking, app, IAM, and database issues in a structured, SRE-driven way.”
A) VISUAL DIAGRAM SLIDES (Explain Like You’re Presenting)
Use these as speaking slides or to draw on whiteboard.
Slide 1 — High-Level Architecture
+---------------------+
User → Internet →| Route 53 (DNS) |
+----------+----------+
|
v
+--------+--------+
| AWS ALB (HTTPS) |
+--------+--------+
|
v
+----------+-----------+
| EKS Cluster |
| (profile namespace) |
+----------+-----------+
|
+------------------+-------------------+
| |
+-------v-------+ +-------v--------+
| Profile API | (Python/Flask Pods) | ServiceAccount |
| Deployment |---------------------->| IRSA IAM Role |
+-------+-------+ +-------+--------+
| |
v v
(boto3 HTTPS) IAM Allow DynamoDB Read/Write
|
v
+-------+--------------------+
| DynamoDB Global Table |
| (us-east-2 <-> us-east-1) |
+----------------------------+
How to speak it:
“User hits a friendly domain in Route 53. Route 53 points to an ALB that terminates HTTPS. ALB forwards to EKS pods running the Profile API. The pods authenticate to DynamoDB using IRSA, which means no stored credentials. DynamoDB Global Tables replicate data across regions for DR.”
Slide 2 — Observability & Logs
EKS Pods → stdout/stderr → CloudWatch Logs
EKS Metrics → Prometheus → Grafana Dashboards
SLO Alerts → Prometheus AlertManager → Slack / Email
Speak it:
“Logs go to CloudWatch. Metrics go to Prometheus and are visualized in Grafana. Alerts are based on SLOs like request success rate and latency.”
Slide 3 — Disaster Recovery
Region A (us-east-2) — Active
Region B (us-east-1) — Active (Replica)
DynamoDB Global Tables keep data synced in real time.
Route 53 Latency Routing sends users to nearest healthy region.
If region fails, Route 53 automatically fails over.
Speak it:
“We don’t restore from backup; we shift traffic. It’s active-active high availability.”
B) MOCK INTERVIEW Q&A (Based on This Project)
✅ 1. Tell me about your project.
Answer:
I built a Profile Service on AWS using EKS, DynamoDB, and Terraform. The application is a Python Flask API that stores user profiles in DynamoDB. Everything is deployed using CI/CD. The pods authenticate to DynamoDB using IRSA, which avoids storing secrets. The system is fronted by an ALB with HTTPS managed through ACM and is exposed via Route 53. I also enabled Prometheus and Grafana for alerts and dashboards and configured DynamoDB Global Tables for multi-region resilience.
✅ 2. How do you secure communication?
Answer:
- In Transit: HTTPS via ALB + TLS certificates from ACM.
- In Cluster: All pod-to-pod traffic goes through Kubernetes networking (can add service mesh if needed).
- To DynamoDB: boto3 → DynamoDB over TLS (HTTPS).
- At Rest: DynamoDB uses KMS-managed encryption.
✅ 3. How does the application access DynamoDB without storing credentials?
Answer:
I used IRSA — IAM Role for Service Accounts. The pod’s Kubernetes Service Account is linked to an IAM role that has DynamoDB permissions. So the pod automatically receives short-lived AWS credentials securely, with zero secrets stored.
✅ 4. What would you do if the app cannot reach DynamoDB?
Step-by-step answer:
- kubectl logs → check app errors
- nslookup <dynamodb endpoint> → DNS
- aws iam get-role → check the IRSA role is bound correctly
- VPC Flow Logs → verify network
- put-item test → verify the DB layer
✅ 5. Can DynamoDB be active-active?
Answer:
Yes. Using DynamoDB Global Tables. They replicate in near real-time between multiple AWS regions, enabling active-active applications and fast failover.
✅ 6. DevOps vs SRE?
Answer:
- DevOps focuses on CI/CD, automation, delivery speed.
- SRE focuses on reliability, SLOs, error budgets, incident response.
✅ 7. What metrics are your SLIs/SLOs?
Answer:
- SLI: Request success rate & latency.
- SLO: 99.9% success.
- SLA: 99.5% uptime commitment.
- Alert: If success < SLO → trigger investigation & stop releases.
✅ 8. What is your DR strategy?
Answer:
DynamoDB Global Tables + Route 53 Latency Routing allows multi-region active-active failover.
Part 4:
This lab takes students from zero → running app on EKS → DynamoDB → CI/CD → Observability.
HANDS-ON LAB: Deploy Profile Service on AWS (EKS + DynamoDB + IRSA + ALB)
Prerequisites
Students need:
- AWS Account
- IAM user with Admin access (or appropriate roles)
- AWS CLI installed
- Kubectl installed
- Terraform installed
- Docker installed
STEP 1 — Clone Project Template
git clone https://github.com/your-org/profile-service.git
cd profile-service
If you don’t have a repo yet, create the structure shown in Section 0 (Repo layout) and push it to your own GitHub repository.
STEP 2 — Configure AWS CLI
aws configure
Enter:
- AWS Access Key
- Secret
- Region → us-east-2
STEP 3 — Create the Infrastructure (Terraform)
cd terraform
terraform init
terraform apply -auto-approve
After it completes:
aws eks update-kubeconfig --name profile-service-eks --region us-east-2
Check connection:
kubectl get nodes
✔ If nodes appear → EKS cluster is ready.
STEP 4 — Deploy AWS Load Balancer Controller
helm repo add eks https://aws.github.io/eks-charts
helm upgrade --install aws-load-balancer-controller eks/aws-load-balancer-controller \
-n kube-system --create-namespace \
--set clusterName=profile-service-eks \
--set region=us-east-2 \
--set serviceAccount.create=true \
--set vpcId=$(aws eks describe-cluster --name profile-service-eks --region us-east-2 --query "cluster.resourcesVpcConfig.vpcId" --output text)
Verify:
kubectl get pod -n kube-system | grep aws-load-balancer
STEP 5 — Build & Push Application Image
Replace <ACCOUNT_ID>:
aws ecr get-login-password --region us-east-2 | docker login --username AWS --password-stdin <ACCOUNT_ID>.dkr.ecr.us-east-2.amazonaws.com
docker build -t profile-service-api ./app
docker tag profile-service-api:latest <ACCOUNT_ID>.dkr.ecr.us-east-2.amazonaws.com/profile-service-api:latest
docker push <ACCOUNT_ID>.dkr.ecr.us-east-2.amazonaws.com/profile-service-api:latest
STEP 6 — Deploy to Kubernetes
Replace ECR URL in Deployment:
Open:
k8s/deployment.yaml
Set:
image: <ACCOUNT_ID>.dkr.ecr.us-east-2.amazonaws.com/profile-service-api:latest
Apply all manifests:
kubectl apply -f k8s/namespace.yaml
kubectl apply -f k8s/deployment.yaml
kubectl apply -f k8s/service.yaml
kubectl apply -f k8s/ingress.yaml
STEP 7 — Get the Application URL
kubectl get ingress -n profile
Copy the ALB hostname and open:
http://<ALB_HOSTNAME>/health
Expected output:
{"status":"ok"}
STEP 8 — Test DynamoDB Integration
Create Profile:
curl -X POST http://<ALB_HOSTNAME>/profile \
-H "Content-Type: application/json" \
-d '{"userId": "student1", "name": "John"}'
Retrieve Profile:
curl http://<ALB_HOSTNAME>/profile/student1
✔ If you see JSON → App + DB + Network + IAM are working.
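You can also wrap Steps 7 and 8 in one smoke-test sketch (assumes `pip install requests`; replace ALB_HOSTNAME with the hostname from Step 7 or your custom domain):

import sys
import requests

BASE = "http://ALB_HOSTNAME"  # or https://api.example.com once HTTPS is configured

assert requests.get(f"{BASE}/health", timeout=5).json() == {"status": "ok"}

profile = {"userId": "student1", "name": "John"}
assert requests.post(f"{BASE}/profile", json=profile, timeout=5).status_code == 200

fetched = requests.get(f"{BASE}/profile/student1", timeout=5).json()
if fetched.get("name") != "John":
    sys.exit("Round trip failed: check the IRSA role and TABLE_NAME env var")
print("Smoke test passed: ALB -> pod -> DynamoDB round trip works.")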
STEP 9 — Add CI/CD Pipeline (GitHub Actions)
- Push project to GitHub
- Go to GitHub → Repo → Settings → Secrets → Add:
AWS_ACCESS_KEY_ID
AWS_SECRET_ACCESS_KEY
ECR_REPO
- Create
.github/workflows/ci-cd.yaml
(You already have the file in cicd/github-actions.yaml)
Pipeline will:
- Validate config (Python)
- Build Docker image
- Push to ECR
- Deploy to EKS
STEP 10 — Observability Setup
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm upgrade --install kube-stack prometheus-community/kube-prometheus-stack \
-n monitoring --create-namespace
Then:
kubectl port-forward svc/kube-stack-grafana -n monitoring 3000:80
Open browser:
http://localhost:3000
Login:
username: admin
password: admin
Add dashboard:
- Kubernetes / Compute Resources / Workload
✅ LAB COMPLETED — Students Now Understand:
| Skill | Verified |
|---|---|
| Terraform IaC | ✅ |
| EKS Deployment | ✅ |
| ALB Ingress | ✅ |
| IAM IRSA Security | ✅ |
| DynamoDB Integration | ✅ |
| CI/CD Automation | ✅ |
| Observability (Grafana/Prometheus) | ✅ |
| Troubleshooting & SRE Practices | ✅ |