<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:dc="http://purl.org/dc/elements/1.1/">
  <channel>
    <title>DEV Community: Aisalkyn Aidarova</title>
    <description>The latest articles on DEV Community by Aisalkyn Aidarova (@jumptotech).</description>
    <link>https://dev.to/jumptotech</link>
    <image>
      <url>https://media2.dev.to/dynamic/image/width=90,height=90,fit=cover,gravity=auto,format=auto/https:%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Fuser%2Fprofile_image%2F3549986%2F7a403ca9-0459-412a-b618-71699480d58d.png</url>
      <title>DEV Community: Aisalkyn Aidarova</title>
      <link>https://dev.to/jumptotech</link>
    </image>
    <atom:link rel="self" type="application/rss+xml" href="https://dev.to/feed/jumptotech"/>
    <language>en</language>
    <item>
      <title>Full Lab: ECS Fargate Observability with EC2 Alloy</title>
      <dc:creator>Aisalkyn Aidarova</dc:creator>
      <pubDate>Wed, 27 May 2026 03:13:52 +0000</pubDate>
      <link>https://dev.to/jumptotech/full-lab-ecs-fargate-observability-with-ec2-alloy-2045</link>
      <guid>https://dev.to/jumptotech/full-lab-ecs-fargate-observability-with-ec2-alloy-2045</guid>
      <description>&lt;h2&gt;
  
  
  Final Goal
&lt;/h2&gt;

&lt;p&gt;Build an observability system where:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;EC2 Alloy
   ↓ sends logs
ALB for Loki
   ↓ forwards traffic
Loki running in ECS/Fargate
   ↓ stores logs
Grafana running in ECS/Fargate
   ↓ visualizes logs
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Final architecture:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;ECS/Fargate:
- prod-app-service
- prometheus-service
- grafana-service
- loki-service

EC2:
- Alloy agent
- Node Exporter
- system logs

Networking:
- ALB in front of Loki
- Target group using IP target type
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h1&gt;
  
  
  Part 1: ECS Cluster
&lt;/h1&gt;

&lt;h2&gt;
  
  
  1. Created ECS Cluster
&lt;/h2&gt;

&lt;p&gt;Cluster name:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;prod-observability
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Launch type:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;AWS Fargate
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Region:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;us-east-2
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Purpose:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Run observability services as serverless containers.
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h1&gt;
  
  
  Part 2: IAM Roles
&lt;/h1&gt;

&lt;h2&gt;
  
  
  2. Task Execution Role
&lt;/h2&gt;

&lt;p&gt;Used:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;ecsTaskExecutionRole
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Purpose:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Allows ECS/Fargate to pull container images and send logs to CloudWatch.
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Attached policy:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;AmazonECSTaskExecutionRolePolicy
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h2&gt;
  
  
  3. Task Role
&lt;/h2&gt;

&lt;p&gt;Used:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;ecsAppTaskRole
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Purpose:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Allows application containers to call AWS services if needed.
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;For this lab, it was not heavily used.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 3: Demo Application Service
&lt;/h1&gt;

&lt;h2&gt;
  
  
  4. Created demo app task definition
&lt;/h2&gt;

&lt;p&gt;Task family:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;prod-observability
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Container name:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;demo-app
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Final working image:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;nginx:latest
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Port:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;80
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Why we changed the image:&lt;/p&gt;

&lt;p&gt;Earlier image failed:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;ghcr.io/brancz/prometheus-example-app:v0.5.0
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Error:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;CannotPullContainerError 403 Forbidden
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Reason:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;ECS could not pull from GitHub Container Registry anonymously.
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Fix:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Use public nginx image.
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h2&gt;
  
  
  5. Created demo app ECS service
&lt;/h2&gt;

&lt;p&gt;Service name:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;prod-app-service
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Desired tasks:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;1
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Result:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Running
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h1&gt;
  
  
  Part 4: Prometheus Service
&lt;/h1&gt;

&lt;h2&gt;
  
  
  6. Created Prometheus task definition
&lt;/h2&gt;

&lt;p&gt;Task family:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;prometheus
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Container name:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;prometheus
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Image URI:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;prom/prometheus:latest
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Port:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;9090
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;CloudWatch logs:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;/ecs/prometheus-service
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Stream prefix:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;prometheus
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h2&gt;
  
  
  7. Created Prometheus service
&lt;/h2&gt;

&lt;p&gt;Service name:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;prometheus-service-2
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Desired tasks:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;1
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Result:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Running
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Purpose:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Prometheus stores and queries metrics.
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Important concept:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Prometheus = metrics database
Loki = logs database
Grafana = visualization
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h1&gt;
  
  
  Part 5: Grafana Service
&lt;/h1&gt;

&lt;h2&gt;
  
  
  8. Created Grafana task definition
&lt;/h2&gt;

&lt;p&gt;Task family:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;grafana
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Container name:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;grafana
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Image URI:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;grafana/grafana:latest
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Port:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;3000
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;CloudWatch logs:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;/ecs/grafana
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Stream prefix:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;grafana
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h2&gt;
  
  
  9. Created Grafana service
&lt;/h2&gt;

&lt;p&gt;Service name:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;grafana-service
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Desired tasks:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;1
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Result:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Running
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Opened Grafana:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;http://GRAFANA_PUBLIC_IP:3000
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Default login:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;admin
admin
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Purpose:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Grafana visualizes metrics and logs.
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h1&gt;
  
  
  Part 6: Loki Service
&lt;/h1&gt;

&lt;h2&gt;
  
  
  10. Created Loki task definition
&lt;/h2&gt;

&lt;p&gt;Task family:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;loki
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Container name:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;loki
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Image URI:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;grafana/loki:latest
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Port:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;3100
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;CloudWatch logs:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;/ecs/loki
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Stream prefix:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;loki
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h2&gt;
  
  
  11. Created Loki ECS service
&lt;/h2&gt;

&lt;p&gt;Service name:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;loki-service
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Desired tasks:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;1
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Result:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Running
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Purpose:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Loki receives and stores logs.
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h1&gt;
  
  
  Part 7: Why Alloy Failed in Fargate
&lt;/h1&gt;

&lt;h2&gt;
  
  
  12. Tried Alloy in ECS/Fargate
&lt;/h2&gt;

&lt;p&gt;Image used:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;grafana/alloy:latest
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Port:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;12345
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;It kept failing with:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Rollback failed
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Reason:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Alloy is not like nginx, Grafana, or Loki.
Alloy is an agent.
It needs a config file.
Without config, Alloy exits.
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;We tried:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;run
run,/etc/alloy/fargate.alloy
empty command
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;But it failed because:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;/etc/alloy/fargate.alloy did not exist in the Fargate container.
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Conclusion:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Alloy is easier on EC2 because EC2 has a normal Linux filesystem for its config files.
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h1&gt;
  
  
  Part 8: Decision — Use EC2 for Alloy
&lt;/h1&gt;

&lt;h2&gt;
  
  
  13. Final architecture decision
&lt;/h2&gt;

&lt;p&gt;We kept ECS because the lab teaches:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;ECS
Fargate
task definitions
services
networking
service communication
load balancing
observability
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;But we moved Alloy to EC2 because:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Alloy needs config files.
EC2 is easier for agents.
EC2 gives filesystem access.
EC2 is better for troubleshooting.
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Final decision:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;ECS/Fargate:
- demo app
- Prometheus
- Grafana
- Loki

EC2:
- Alloy
- Node Exporter
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h1&gt;
  
  
  Part 9: Existing EC2 Alloy Machine
&lt;/h1&gt;

&lt;h2&gt;
  
  
  14. Checked Alloy on EC2
&lt;/h2&gt;

&lt;p&gt;Command:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;alloy &lt;span class="nt"&gt;--version&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Output showed:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;alloy version v1.16.1
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;This confirmed:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Alloy is installed directly on Linux, not in Docker.
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h2&gt;
  
  
  15. Checked Alloy config
&lt;/h2&gt;

&lt;p&gt;Config file:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;/etc/alloy/config.alloy
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;It had:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight hcl"&gt;&lt;code&gt;&lt;span class="nx"&gt;local&lt;/span&gt;&lt;span class="err"&gt;.&lt;/span&gt;&lt;span class="nx"&gt;file_match&lt;/span&gt; &lt;span class="s2"&gt;"system_logs"&lt;/span&gt; &lt;span class="p"&gt;{&lt;/span&gt;
  &lt;span class="nx"&gt;path_targets&lt;/span&gt; &lt;span class="p"&gt;=&lt;/span&gt; &lt;span class="p"&gt;[&lt;/span&gt;
    &lt;span class="p"&gt;{&lt;/span&gt;
      &lt;span class="nx"&gt;__path__&lt;/span&gt; &lt;span class="p"&gt;=&lt;/span&gt; &lt;span class="s2"&gt;"/var/log/syslog"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
      &lt;span class="nx"&gt;job&lt;/span&gt;      &lt;span class="p"&gt;=&lt;/span&gt; &lt;span class="s2"&gt;"syslog"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="p"&gt;},&lt;/span&gt;
    &lt;span class="p"&gt;{&lt;/span&gt;
      &lt;span class="nx"&gt;__path__&lt;/span&gt; &lt;span class="p"&gt;=&lt;/span&gt; &lt;span class="s2"&gt;"/var/log/auth.log"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
      &lt;span class="nx"&gt;job&lt;/span&gt;      &lt;span class="p"&gt;=&lt;/span&gt; &lt;span class="s2"&gt;"auth"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="p"&gt;},&lt;/span&gt;
    &lt;span class="p"&gt;{&lt;/span&gt;
      &lt;span class="nx"&gt;__path__&lt;/span&gt; &lt;span class="p"&gt;=&lt;/span&gt; &lt;span class="s2"&gt;"/var/log/nginx/access.log"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
      &lt;span class="nx"&gt;job&lt;/span&gt;      &lt;span class="p"&gt;=&lt;/span&gt; &lt;span class="s2"&gt;"nginx_access"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="p"&gt;},&lt;/span&gt;
    &lt;span class="p"&gt;{&lt;/span&gt;
      &lt;span class="nx"&gt;__path__&lt;/span&gt; &lt;span class="p"&gt;=&lt;/span&gt; &lt;span class="s2"&gt;"/var/log/nginx/error.log"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
      &lt;span class="nx"&gt;job&lt;/span&gt;      &lt;span class="p"&gt;=&lt;/span&gt; &lt;span class="s2"&gt;"nginx_error"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="p"&gt;},&lt;/span&gt;
  &lt;span class="p"&gt;]&lt;/span&gt;
&lt;span class="p"&gt;}&lt;/span&gt;

&lt;span class="nx"&gt;loki&lt;/span&gt;&lt;span class="err"&gt;.&lt;/span&gt;&lt;span class="nx"&gt;source&lt;/span&gt;&lt;span class="err"&gt;.&lt;/span&gt;&lt;span class="nx"&gt;file&lt;/span&gt; &lt;span class="s2"&gt;"log_scrape"&lt;/span&gt; &lt;span class="p"&gt;{&lt;/span&gt;
  &lt;span class="nx"&gt;targets&lt;/span&gt;    &lt;span class="p"&gt;=&lt;/span&gt; &lt;span class="nx"&gt;local&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nx"&gt;file_match&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nx"&gt;system_logs&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nx"&gt;targets&lt;/span&gt;
  &lt;span class="nx"&gt;forward_to&lt;/span&gt; &lt;span class="p"&gt;=&lt;/span&gt; &lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="nx"&gt;loki&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nx"&gt;write&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nx"&gt;local&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nx"&gt;receiver&lt;/span&gt;&lt;span class="p"&gt;]&lt;/span&gt;
&lt;span class="p"&gt;}&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;This means Alloy collects:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;/var/log/syslog
/var/log/auth.log
/var/log/nginx/access.log
/var/log/nginx/error.log
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h1&gt;
  
  
  Part 10: Created ALB for Loki
&lt;/h1&gt;

&lt;h2&gt;
  
  
  16. Why ALB was needed
&lt;/h2&gt;

&lt;p&gt;Alloy on EC2 cannot use an ECS internal service name like:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;loki-service:3100
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Because that name works only inside ECS networking.&lt;/p&gt;

&lt;p&gt;So we created ALB in front of Loki.&lt;/p&gt;

&lt;p&gt;Flow:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;EC2 Alloy
   ↓
Loki ALB
   ↓
Loki ECS task
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h2&gt;
  
  
  17. Created Application Load Balancer
&lt;/h2&gt;

&lt;p&gt;ALB name:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;loki-alb
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Scheme:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Internet-facing
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Type:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Application Load Balancer
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;VPC:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;vpc-02703ab5833607268
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Listener:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;HTTP:3100
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;ALB DNS:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;loki-alb-838622355.us-east-2.elb.amazonaws.com
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h1&gt;
  
  
  Part 11: Created Target Group
&lt;/h1&gt;

&lt;h2&gt;
  
  
  18. Important mistake we fixed
&lt;/h2&gt;

&lt;p&gt;First target group was created as:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Target type: Instance
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;That was wrong for Fargate.&lt;/p&gt;

&lt;p&gt;Fargate requires:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Target type: IP
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Because Fargate tasks use ENIs and private IPs.&lt;/p&gt;




&lt;h2&gt;
  
  
  19. Correct Target Group
&lt;/h2&gt;

&lt;p&gt;Target group name:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Loki-target-gr
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Target type:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;IP
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Protocol:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;HTTP
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Port:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;3100
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Health check path:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;/ready
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;VPC:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;vpc-02703ab5833607268
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h1&gt;
  
  
  Part 12: Registered Loki Task IP
&lt;/h1&gt;

&lt;h2&gt;
  
  
  20. Found Loki task private IP
&lt;/h2&gt;

&lt;p&gt;Went to:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;ECS → prod-observability → Tasks → Loki task → Networking
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Found:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Private IP: 172.31.12.112
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h2&gt;
  
  
  21. Registered target
&lt;/h2&gt;

&lt;p&gt;Went to:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;EC2 → Target Groups → Loki-target-gr → Register targets
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Added:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;IP: 172.31.12.112
Port: 3100
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Result:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Healthy = 1
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;This confirmed:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;ALB can reach Loki ECS task.
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h1&gt;
  
  
  Part 13: Fixed 503 Error
&lt;/h1&gt;

&lt;h2&gt;
  
  
  22. Browser showed:
&lt;/h2&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;503 Service Temporarily Unavailable
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Meaning:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;ALB works, but no healthy target was registered.
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;After registering:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;172.31.12.112:3100
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Target became:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Healthy
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Then ALB worked.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 14: Updated Alloy Config
&lt;/h1&gt;

&lt;h2&gt;
  
  
  23. Old Alloy Loki URL
&lt;/h2&gt;

&lt;p&gt;Old config pointed to local Loki:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight hcl"&gt;&lt;code&gt;&lt;span class="nx"&gt;url&lt;/span&gt; &lt;span class="err"&gt;=&lt;/span&gt; &lt;span class="s2"&gt;"http://localhost:3100/loki/api/v1/push"&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;That meant:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Send logs to Loki running on the same EC2 instance.
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;But now Loki is in ECS.&lt;/p&gt;




&lt;h2&gt;
  
  
  24. New Alloy Loki URL
&lt;/h2&gt;

&lt;p&gt;Changed to:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight hcl"&gt;&lt;code&gt;&lt;span class="nx"&gt;url&lt;/span&gt; &lt;span class="err"&gt;=&lt;/span&gt; &lt;span class="s2"&gt;"http://loki-alb-838622355.us-east-2.elb.amazonaws.com:3100/loki/api/v1/push"&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Full section:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight hcl"&gt;&lt;code&gt;&lt;span class="nx"&gt;loki&lt;/span&gt;&lt;span class="err"&gt;.&lt;/span&gt;&lt;span class="nx"&gt;write&lt;/span&gt; &lt;span class="s2"&gt;"local"&lt;/span&gt; &lt;span class="p"&gt;{&lt;/span&gt;
  &lt;span class="nx"&gt;endpoint&lt;/span&gt; &lt;span class="p"&gt;{&lt;/span&gt;
    &lt;span class="nx"&gt;url&lt;/span&gt; &lt;span class="p"&gt;=&lt;/span&gt; &lt;span class="s2"&gt;"http://loki-alb-838622355.us-east-2.elb.amazonaws.com:3100/loki/api/v1/push"&lt;/span&gt;
  &lt;span class="p"&gt;}&lt;/span&gt;
&lt;span class="p"&gt;}&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Saved file:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;CTRL + O
ENTER
CTRL + X
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Restarted Alloy:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;&lt;span class="nb"&gt;sudo &lt;/span&gt;systemctl restart alloy
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Checked status:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;&lt;span class="nb"&gt;sudo &lt;/span&gt;systemctl status alloy
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Expected:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;active (running)
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h1&gt;
  
  
  Part 15: Grafana Loki Datasource
&lt;/h1&gt;

&lt;h2&gt;
  
  
  25. In Grafana
&lt;/h2&gt;

&lt;p&gt;Went to:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Connections → Data sources
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Selected:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Loki
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;URL:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;http://loki-alb-838622355.us-east-2.elb.amazonaws.com:3100
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Then:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Save &amp;amp; Test
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h1&gt;
  
  
  Part 16: Query Logs in Grafana
&lt;/h1&gt;

&lt;h2&gt;
  
  
  26. First mistake
&lt;/h2&gt;

&lt;p&gt;We first ran this query in Grafana with the Prometheus datasource selected:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;{job="syslog"}
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;That showed:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;No data
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Reason:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Prometheus is for metrics.
Loki is for logs.
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h2&gt;
  
  
  27. Correct query
&lt;/h2&gt;

&lt;p&gt;Changed datasource from:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Prometheus
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;to:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Loki
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Then ran:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;{job="syslog"}
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Result:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;84 lines displayed
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Logs appeared successfully.&lt;/p&gt;




&lt;h1&gt;
  
  
  Final Working Queries
&lt;/h1&gt;

&lt;p&gt;Use these in Grafana Explore with Loki datasource:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;{job="syslog"}
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;





&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;{job="auth"}
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;





&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;{job="nginx_access"}
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;





&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;{job="nginx_error"}
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h1&gt;
  
  
  Final Working Architecture
&lt;/h1&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;EC2 Machine
  ├── Alloy
  ├── Node Exporter
  └── Linux logs
        ↓
Application Load Balancer
        ↓
ECS Fargate Loki Service
        ↓
Grafana Loki Datasource
        ↓
Grafana Explore
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Full observability stack:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;ECS/Fargate:
  ├── prod-app-service
  ├── prometheus-service-2
  ├── grafana-service
  └── loki-service

EC2:
  └── Alloy agent
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h1&gt;
  
  
  What Each Tool Does
&lt;/h1&gt;

&lt;h2&gt;
  
  
  ECS
&lt;/h2&gt;

&lt;p&gt;Runs containers as services.&lt;/p&gt;

&lt;h2&gt;
  
  
  Fargate
&lt;/h2&gt;

&lt;p&gt;Serverless compute for containers.&lt;/p&gt;

&lt;h2&gt;
  
  
  Demo App
&lt;/h2&gt;

&lt;p&gt;Application container.&lt;/p&gt;

&lt;h2&gt;
  
  
  Prometheus
&lt;/h2&gt;

&lt;p&gt;Stores metrics.&lt;/p&gt;

&lt;h2&gt;
  
  
  Loki
&lt;/h2&gt;

&lt;p&gt;Stores logs.&lt;/p&gt;

&lt;h2&gt;
  
  
  Grafana
&lt;/h2&gt;

&lt;p&gt;Visualizes logs and metrics.&lt;/p&gt;

&lt;h2&gt;
  
  
  Alloy
&lt;/h2&gt;

&lt;p&gt;Collects logs and sends them to Loki.&lt;/p&gt;

&lt;h2&gt;
  
  
  ALB
&lt;/h2&gt;

&lt;p&gt;Exposes Loki from ECS so EC2 Alloy can send logs to it.&lt;/p&gt;

&lt;h2&gt;
  
  
  Target Group
&lt;/h2&gt;

&lt;p&gt;Connects the ALB to the private IP of the Loki ECS task.&lt;/p&gt;




&lt;h1&gt;
  
  
  Most Important Lessons
&lt;/h1&gt;

&lt;h2&gt;
  
  
  1. Fargate hides the server
&lt;/h2&gt;

&lt;p&gt;That is why agents like Alloy are harder to run on Fargate.&lt;/p&gt;

&lt;h2&gt;
  
  
  2. Alloy needs config
&lt;/h2&gt;

&lt;p&gt;It cannot run empty.&lt;/p&gt;

&lt;h2&gt;
  
  
  3. Fargate target groups must use IP
&lt;/h2&gt;

&lt;p&gt;Not Instance.&lt;/p&gt;

&lt;h2&gt;
  
  
  4. 503 from ALB means no healthy target
&lt;/h2&gt;

&lt;p&gt;The ALB was working, but the target group was empty/unhealthy.&lt;/p&gt;

&lt;h2&gt;
  
  
  5. Prometheus is not for logs
&lt;/h2&gt;

&lt;p&gt;Prometheus = metrics.&lt;/p&gt;

&lt;h2&gt;
  
  
  6. Loki is for logs
&lt;/h2&gt;

&lt;p&gt;Loki + Grafana Explore shows log lines.&lt;/p&gt;

&lt;h2&gt;
  
  
  7. Hybrid architecture is realistic
&lt;/h2&gt;

&lt;p&gt;EC2 agent → ALB → ECS Loki → Grafana is a real SRE-style pattern.&lt;/p&gt;




&lt;h1&gt;
  
  
  Final Success Proof
&lt;/h1&gt;

&lt;p&gt;Grafana showed:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;84 lines displayed
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;For:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;{job="syslog"}
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;That means the full pipeline works:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;EC2 logs
→ Alloy
→ Loki ALB
→ ECS Loki
→ Grafana
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;This lab is complete.&lt;/p&gt;

</description>
    </item>
    <item>
      <title>project #4: I Built a Full-Stack School Website in 1 Day with Claude AI — Here's the Complete Guide</title>
      <dc:creator>Aisalkyn Aidarova</dc:creator>
      <pubDate>Tue, 26 May 2026 13:56:16 +0000</pubDate>
      <link>https://dev.to/jumptotech/i-built-a-full-stack-school-website-in-1-day-with-claude-ai-heres-the-complete-guide-39pn</link>
      <guid>https://dev.to/jumptotech/i-built-a-full-stack-school-website-in-1-day-with-claude-ai-heres-the-complete-guide-39pn</guid>
      <description>&lt;p&gt;&lt;strong&gt;Tags:&lt;/strong&gt;&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;devops, webdev, tutorial, beginners
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;&lt;strong&gt;Cover image:&lt;/strong&gt; Take a screenshot of your jump2techdevops.com homepage&lt;/p&gt;




&lt;p&gt;&lt;strong&gt;Article body — paste this:&lt;/strong&gt;&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight markdown"&gt;&lt;code&gt;&lt;span class="gh"&gt;# I Built a Full-Stack School Website in 1 Day with Claude AI&lt;/span&gt;

No coding experience. No developer. $9.77 total cost.

Here is exactly what I built and how you can do the same for your business.

&lt;span class="gu"&gt;## What I Built&lt;/span&gt;

Live at: &lt;span class="gs"&gt;**jump2techdevops.com**&lt;/span&gt;
&lt;span class="p"&gt;
-&lt;/span&gt; Homepage with countdown timer, registration form, FAQ in English and Russian
&lt;span class="p"&gt;-&lt;/span&gt; Student login and registration with a real database
&lt;span class="p"&gt;-&lt;/span&gt; 13 course modules with Udemy-style lecture pages
&lt;span class="p"&gt;-&lt;/span&gt; Admin dashboard to manage students and add lectures without code
&lt;span class="p"&gt;-&lt;/span&gt; Automated tests with Playwright
&lt;span class="p"&gt;-&lt;/span&gt; Deployed on Cloudflare Pages with custom domain

&lt;span class="gu"&gt;## The Tech Stack&lt;/span&gt;

| Tool | Purpose | Cost |
|------|---------|------|
| Next.js 15 | Website framework | Free |
| TypeScript | Type-safe JavaScript | Free |
| Tailwind CSS | Styling | Free |
| Supabase | Database + Auth | Free |
| Cloudflare Pages | Hosting | Free |
| Cloudflare Registrar | Domain | $9.77/yr |
| Claude AI + Code | Wrote all the code | Free |
| Playwright | Automated testing | Free |

&lt;span class="gs"&gt;**Total cost: $9.77**&lt;/span&gt;
&lt;span class="p"&gt;
---
&lt;/span&gt;
&lt;span class="gu"&gt;## Phase 1 — Create the Project&lt;/span&gt;

&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;

&lt;p&gt;&lt;br&gt;
bash&lt;br&gt;
cd ~&lt;br&gt;
mkdir projects &amp;amp;&amp;amp; cd projects&lt;br&gt;
npx create-next-app@latest my-school-website&lt;br&gt;
cd my-school-website&lt;br&gt;
npm run dev&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;
Open http://localhost:3000 — you see the default Next.js page.
This is running ONLY on your Mac. No one else can see it yet.

Then open Claude Code:

&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;

&lt;p&gt;&lt;br&gt;
bash&lt;br&gt;
claude&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;
Claude Code is your AI engineer. It reads your files and writes 
code directly on your computer.

---

## Phase 2 — Build the Homepage with AI

In the Claude Code prompt, describe what you want:

&amp;gt; Build a professional DevOps school homepage with:
&amp;gt; hero section, countdown timer to June 1 2026, 
&amp;gt; stats bar, 13 module cards, registration form,
&amp;gt; student testimonials, FAQ in English and Russian,
&amp;gt; sticky bottom bar with $700/month price.
&amp;gt; Primary color: #185FA5

Press Enter. Claude writes all the files.

**This is prompt engineering** — describing exactly what you want.
The more detail you give, the better the result.

---

## Phase 3 — Set Up the Database

Create a free account at supabase.com.

Create a new project, then run this SQL to create your tables:

&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;

&lt;p&gt;&lt;br&gt;
sql&lt;br&gt;
-- Store student registrations&lt;br&gt;
CREATE TABLE IF NOT EXISTS profiles (&lt;br&gt;
  id UUID PRIMARY KEY,&lt;br&gt;
  full_name TEXT,&lt;br&gt;
  email TEXT,&lt;br&gt;
  phone TEXT,&lt;br&gt;
  experience_level TEXT,&lt;br&gt;
  created_at TIMESTAMPTZ DEFAULT NOW()&lt;br&gt;
);&lt;br&gt;
ALTER TABLE profiles ENABLE ROW LEVEL SECURITY;&lt;br&gt;
CREATE POLICY "insert" ON profiles FOR INSERT WITH CHECK (true);&lt;br&gt;
CREATE POLICY "select" ON profiles FOR SELECT USING (true);&lt;/p&gt;

&lt;p&gt;-- Store lectures added through admin panel&lt;br&gt;&lt;br&gt;
CREATE TABLE IF NOT EXISTS lectures (&lt;br&gt;
  id UUID DEFAULT gen_random_uuid() PRIMARY KEY,&lt;br&gt;
  module_slug TEXT NOT NULL,&lt;br&gt;
  title TEXT NOT NULL,&lt;br&gt;
  content TEXT,&lt;br&gt;
  type TEXT DEFAULT 'reading',&lt;br&gt;
  order_index INTEGER DEFAULT 0,&lt;br&gt;
  created_at TIMESTAMPTZ DEFAULT NOW()&lt;br&gt;
);&lt;br&gt;
ALTER TABLE lectures ENABLE ROW LEVEL SECURITY;&lt;br&gt;
CREATE POLICY "read" ON lectures FOR SELECT USING (true);&lt;br&gt;
CREATE POLICY "admin" ON lectures FOR ALL USING (true);&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;
Connect to your website:

&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;

&lt;p&gt;&lt;br&gt;
bash&lt;br&gt;
npm install @supabase/supabase-js&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;
Create `src/lib/supabase.ts`:

&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;

&lt;p&gt;&lt;br&gt;
typescript&lt;br&gt;
import { createClient } from '@supabase/supabase-js'&lt;/p&gt;

&lt;p&gt;export const supabase = createClient(&lt;br&gt;
  process.env.NEXT_PUBLIC_SUPABASE_URL!,&lt;br&gt;
  process.env.NEXT_PUBLIC_SUPABASE_ANON_KEY!&lt;br&gt;
)&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;
Create `.env.local`:

&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;NEXT_PUBLIC_SUPABASE_URL=&lt;a href="https://your-project.supabase.co" rel="noopener noreferrer"&gt;https://your-project.supabase.co&lt;/a&gt;&lt;br&gt;
NEXT_PUBLIC_SUPABASE_ANON_KEY=eyJhbGci...&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;
&amp;gt; ⚠️ `.env.local` is in `.gitignore` — it never goes to GitHub.
&amp;gt; Never hardcode secrets in your code.

---

## Phase 4 — Student Login &amp;amp; Registration

Tell Claude Code:

&amp;gt; Create /register page with: Full Name, Email, Password, 
&amp;gt; Phone, Experience Level dropdown. Call supabase.auth.signUp()
&amp;gt; then insert into profiles table.
&amp;gt;
&amp;gt; Create /login page with email and password.
&amp;gt; Call supabase.auth.signInWithPassword().
&amp;gt; On success redirect to /modules.
&amp;gt;
&amp;gt; Protect /modules — redirect to /login if no session.

---

## Phase 5 — Admin Dashboard

Tell Claude Code:

&amp;gt; Create /admin page with password protection.
&amp;gt; After login show two tabs:
&amp;gt; 1. Students: table from profiles table
&amp;gt; 2. Lectures: add/edit/delete lectures per module

Now you can see every registered student and manage 
all course content from your browser — no code needed.

---

## Phase 6 — Test with Playwright

&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;

&lt;p&gt;&lt;br&gt;
bash&lt;br&gt;
npm install --save-dev @playwright/test&lt;br&gt;
npx playwright install chromium&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;
Create `tests/site.spec.ts`:

&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;

&lt;p&gt;&lt;br&gt;
typescript&lt;br&gt;
import { test, expect } from '@playwright/test';&lt;/p&gt;

&lt;p&gt;test('homepage loads', async ({ page }) =&amp;gt; {&lt;br&gt;
  await page.goto('&lt;a href="http://localhost:3000'" rel="noopener noreferrer"&gt;http://localhost:3000'&lt;/a&gt;);&lt;br&gt;
  await expect(page.locator('h1')).toBeVisible();&lt;br&gt;
});&lt;/p&gt;

&lt;p&gt;test('modules redirect to login', async ({ page }) =&amp;gt; {&lt;br&gt;
  await page.goto('&lt;a href="http://localhost:3000/modules'" rel="noopener noreferrer"&gt;http://localhost:3000/modules'&lt;/a&gt;);&lt;br&gt;
  await expect(page).toHaveURL(/login/);&lt;br&gt;
});&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;

&lt;p&gt;&lt;br&gt;
bash&lt;br&gt;
npx playwright test&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;
All green = safe to deploy.

---

## Phase 7 — Deploy to Cloudflare Pages

Add static export to `next.config.ts`:

&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;

&lt;p&gt;&lt;br&gt;
typescript&lt;br&gt;
const nextConfig = {&lt;br&gt;
  output: 'export',&lt;br&gt;
  trailingSlash: true,&lt;br&gt;
  images: { unoptimized: true }&lt;br&gt;
};&lt;br&gt;
export default nextConfig;&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;
Build and deploy:

&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;

&lt;p&gt;&lt;br&gt;
bash&lt;br&gt;
npm run build&lt;br&gt;
npm install -g wrangler&lt;br&gt;
npx wrangler login&lt;br&gt;
npx wrangler pages deploy out --project-name my-school&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;
Your site is live at `https://my-school.pages.dev`

Add Supabase keys to Cloudflare:

&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;

&lt;p&gt;&lt;br&gt;
bash&lt;br&gt;
npx wrangler pages secret put NEXT_PUBLIC_SUPABASE_URL&lt;br&gt;
npx wrangler pages secret put NEXT_PUBLIC_SUPABASE_ANON_KEY&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;
Redeploy:

&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;

&lt;p&gt;&lt;br&gt;
bash&lt;br&gt;
npx wrangler pages deploy out --project-name my-school&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;
---

## Phase 8 — Connect Custom Domain

Buy your domain at cloudflare.com/registrar (~$9.77/year).

Then: Workers &amp;amp; Pages → your project → Custom domains
→ Set up custom domain → type your domain → Activate.

Since domain and hosting are both on Cloudflare — 
connection is instant. No DNS propagation wait.

---

## The Daily Workflow

Every time you change something:

&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;# 1. Test locally
npm run dev

# 2. Run tests
npx playwright test

# 3. Build
npm run build

# 4. Deploy
npx wrangler pages deploy out --project-name my-school

# 5. Save to GitHub
git add .
git commit -m "describe what changed"
git push
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;
---

## What This Taught Me

Building this website taught me more DevOps than any course:

- **Next.js** — how modern web frameworks work
- **Supabase** — database design, SQL, Row Level Security
- **Environment variables** — keeping secrets safe
- **Static export** — how websites are built for production
- **Cloudflare Pages** — CDN, edge hosting, custom domains
- **Playwright** — automated testing before every deploy
- **Git workflow** — committing, pushing, version control
- **Wrangler CLI** — command-line deployment

Every tool here is used by real companies.
Netflix, Airbnb, and Google engineers use these same tools.

---

## Resources

- Live site: https://jump2techdevops.com
- GitHub: https://github.com/jumptotechschooldevops/jumptotech-website
- Next.js docs: https://nextjs.org/docs
- Supabase docs: https://supabase.com/docs
- Cloudflare Pages: https://pages.cloudflare.com

**Batch 4 starts June 1, 2026 — Mon–Fri 6PM–9PM — $700/month**

If you want to learn DevOps properly — Docker, Kubernetes, 
AWS, Terraform, CI/CD, and more — come join us.

*Questions? Drop them in the comments below.*
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



</description>
    </item>
    <item>
      <title>lecture: ECS Fargate Prometheus Grafana Loki Alloy Node Exporter</title>
      <dc:creator>Aisalkyn Aidarova</dc:creator>
      <pubDate>Tue, 26 May 2026 13:55:30 +0000</pubDate>
      <link>https://dev.to/jumptotech/lecture-ecs-fargate-prometheus-grafana-loki-alloy-node-exporter-57c2</link>
      <guid>https://dev.to/jumptotech/lecture-ecs-fargate-prometheus-grafana-loki-alloy-node-exporter-57c2</guid>
      <description>&lt;h1&gt;
  
  
  Part 1 — Introduction
&lt;/h1&gt;

&lt;p&gt;This lecture explains:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;What cloud infrastructure really is&lt;/li&gt;
&lt;li&gt;Difference between EC2, ECS, Fargate, Kubernetes, Load Balancer&lt;/li&gt;
&lt;li&gt;Why companies use cloud&lt;/li&gt;
&lt;li&gt;Why SRE and DevOps engineers exist&lt;/li&gt;
&lt;li&gt;What observability means&lt;/li&gt;
&lt;li&gt;Difference between metrics and logs&lt;/li&gt;
&lt;li&gt;Why we use Prometheus, Grafana, Loki, Alloy, and Node Exporter&lt;/li&gt;
&lt;li&gt;Real production architecture&lt;/li&gt;
&lt;li&gt;Real troubleshooting scenarios&lt;/li&gt;
&lt;li&gt;ECS/Fargate deployment flow&lt;/li&gt;
&lt;li&gt;Why sidecars exist&lt;/li&gt;
&lt;li&gt;Why modern systems use centralized observability&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;This lecture is based on real troubleshooting and deployment scenarios.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 2 — What Is Really Happening In The Cloud?
&lt;/h1&gt;

&lt;p&gt;Many beginners think:&lt;/p&gt;

&lt;p&gt;"AWS is just hosting my website."&lt;/p&gt;

&lt;p&gt;But in reality AWS provides:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Physical data centers&lt;/li&gt;
&lt;li&gt;Servers&lt;/li&gt;
&lt;li&gt;Networking&lt;/li&gt;
&lt;li&gt;Internet routing&lt;/li&gt;
&lt;li&gt;Storage&lt;/li&gt;
&lt;li&gt;Virtualization&lt;/li&gt;
&lt;li&gt;Hypervisors&lt;/li&gt;
&lt;li&gt;Security&lt;/li&gt;
&lt;li&gt;Scaling infrastructure&lt;/li&gt;
&lt;li&gt;High availability&lt;/li&gt;
&lt;li&gt;Global infrastructure&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;When you deploy an application to AWS:&lt;/p&gt;

&lt;p&gt;You are renting compute resources from AWS.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 3 — Server vs Virtual Machine
&lt;/h1&gt;

&lt;h2&gt;
  
  
  Physical Server
&lt;/h2&gt;

&lt;p&gt;A physical server is:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Real hardware&lt;/li&gt;
&lt;li&gt;CPU&lt;/li&gt;
&lt;li&gt;RAM&lt;/li&gt;
&lt;li&gt;Storage&lt;/li&gt;
&lt;li&gt;Network cards&lt;/li&gt;
&lt;li&gt;Power supply&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Inside AWS data centers there are thousands of physical servers.&lt;/p&gt;




&lt;h2&gt;
  
  
  Virtual Machine (VM)
&lt;/h2&gt;

&lt;p&gt;A virtual machine is:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Software-defined computer&lt;/li&gt;
&lt;li&gt;Runs on top of physical server&lt;/li&gt;
&lt;li&gt;Has virtual CPU&lt;/li&gt;
&lt;li&gt;Virtual RAM&lt;/li&gt;
&lt;li&gt;Virtual storage&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;One physical server can run many virtual machines.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 4 — Why Cloud Exists
&lt;/h1&gt;

&lt;p&gt;Without cloud:&lt;/p&gt;

&lt;p&gt;You would need:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Your own server room&lt;/li&gt;
&lt;li&gt;Cooling&lt;/li&gt;
&lt;li&gt;Electricity&lt;/li&gt;
&lt;li&gt;Networking&lt;/li&gt;
&lt;li&gt;Internet provider&lt;/li&gt;
&lt;li&gt;Firewalls&lt;/li&gt;
&lt;li&gt;Routers&lt;/li&gt;
&lt;li&gt;Hardware replacement&lt;/li&gt;
&lt;li&gt;OS patching&lt;/li&gt;
&lt;li&gt;Security&lt;/li&gt;
&lt;li&gt;Scaling&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Cloud providers solve this.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 5 — Docker
&lt;/h1&gt;

&lt;p&gt;Docker solves:&lt;/p&gt;

&lt;p&gt;"How do we package applications consistently?"&lt;/p&gt;

&lt;p&gt;Docker container includes:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Application&lt;/li&gt;
&lt;li&gt;Libraries&lt;/li&gt;
&lt;li&gt;Dependencies&lt;/li&gt;
&lt;li&gt;Runtime&lt;/li&gt;
&lt;li&gt;Configuration&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;A container runs consistently on:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;laptop&lt;/li&gt;
&lt;li&gt;EC2&lt;/li&gt;
&lt;li&gt;ECS&lt;/li&gt;
&lt;li&gt;Kubernetes&lt;/li&gt;
&lt;li&gt;cloud&lt;/li&gt;
&lt;/ul&gt;




&lt;h1&gt;
  
  
  Part 6 — Why Docker Alone Is Not Enough
&lt;/h1&gt;

&lt;p&gt;If you run only one container:&lt;/p&gt;

&lt;p&gt;Docker alone may be enough.&lt;/p&gt;

&lt;p&gt;But large systems need:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;scaling&lt;/li&gt;
&lt;li&gt;failover&lt;/li&gt;
&lt;li&gt;networking&lt;/li&gt;
&lt;li&gt;deployment automation&lt;/li&gt;
&lt;li&gt;service discovery&lt;/li&gt;
&lt;li&gt;self-healing&lt;/li&gt;
&lt;li&gt;orchestration&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;This is why Kubernetes and ECS exist.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 7 — ECS vs Kubernetes
&lt;/h1&gt;

&lt;h2&gt;
  
  
  ECS
&lt;/h2&gt;

&lt;p&gt;AWS-native container orchestrator.&lt;/p&gt;

&lt;p&gt;Simpler.&lt;/p&gt;

&lt;p&gt;Good integration with AWS.&lt;/p&gt;




&lt;h2&gt;
  
  
  Kubernetes
&lt;/h2&gt;

&lt;p&gt;Industry-standard orchestration platform.&lt;/p&gt;

&lt;p&gt;More powerful.&lt;/p&gt;

&lt;p&gt;More complex.&lt;/p&gt;

&lt;p&gt;Used heavily by large enterprises.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 8 — What Is ECS?
&lt;/h1&gt;

&lt;p&gt;ECS = Elastic Container Service.&lt;/p&gt;

&lt;p&gt;ECS does:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Runs containers&lt;/li&gt;
&lt;li&gt;Restarts failed containers&lt;/li&gt;
&lt;li&gt;Deploys applications&lt;/li&gt;
&lt;li&gt;Scales containers&lt;/li&gt;
&lt;li&gt;Handles networking&lt;/li&gt;
&lt;li&gt;Manages task lifecycle&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;ECS is NOT:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Load balancer&lt;/li&gt;
&lt;li&gt;Database&lt;/li&gt;
&lt;li&gt;Monitoring system&lt;/li&gt;
&lt;/ul&gt;




&lt;h1&gt;
  
  
  Part 9 — What Is Fargate?
&lt;/h1&gt;

&lt;p&gt;Fargate is serverless container infrastructure.&lt;/p&gt;

&lt;p&gt;AWS manages:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;servers&lt;/li&gt;
&lt;li&gt;patching&lt;/li&gt;
&lt;li&gt;hypervisor&lt;/li&gt;
&lt;li&gt;scaling infrastructure&lt;/li&gt;
&lt;li&gt;hardware&lt;/li&gt;
&lt;li&gt;OS maintenance&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;You manage only:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;containers&lt;/li&gt;
&lt;li&gt;task definitions&lt;/li&gt;
&lt;li&gt;services&lt;/li&gt;
&lt;/ul&gt;




&lt;h1&gt;
  
  
  Part 10 — ECS Cluster Does NOT Mean One Machine
&lt;/h1&gt;

&lt;p&gt;Important concept.&lt;/p&gt;

&lt;p&gt;ECS cluster is:&lt;/p&gt;

&lt;p&gt;Logical grouping for tasks/services.&lt;/p&gt;

&lt;p&gt;One ECS cluster can run:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;many tasks&lt;/li&gt;
&lt;li&gt;many services&lt;/li&gt;
&lt;li&gt;many applications&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Example:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;app-service&lt;/li&gt;
&lt;li&gt;grafana-service&lt;/li&gt;
&lt;li&gt;prometheus-service&lt;/li&gt;
&lt;li&gt;loki-service&lt;/li&gt;
&lt;li&gt;alloy-service&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;all inside ONE ECS cluster.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 11 — ECS Task Definition
&lt;/h1&gt;

&lt;p&gt;Task definition describes:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;container image&lt;/li&gt;
&lt;li&gt;CPU&lt;/li&gt;
&lt;li&gt;memory&lt;/li&gt;
&lt;li&gt;ports&lt;/li&gt;
&lt;li&gt;environment variables&lt;/li&gt;
&lt;li&gt;IAM roles&lt;/li&gt;
&lt;li&gt;commands&lt;/li&gt;
&lt;li&gt;networking&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Task definition is like:&lt;/p&gt;

&lt;p&gt;Blueprint/template for containers.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 12 — ECS Service
&lt;/h1&gt;

&lt;p&gt;Service keeps tasks alive.&lt;/p&gt;

&lt;p&gt;If container crashes:&lt;/p&gt;

&lt;p&gt;ECS service recreates it automatically.&lt;/p&gt;

&lt;p&gt;This is real production behavior.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 13 — Load Balancer vs ECS
&lt;/h1&gt;

&lt;p&gt;Many beginners confuse this.&lt;/p&gt;

&lt;h2&gt;
  
  
  Load Balancer (ALB)
&lt;/h2&gt;

&lt;p&gt;ALB ONLY distributes traffic.&lt;/p&gt;

&lt;p&gt;ALB does NOT:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;run applications&lt;/li&gt;
&lt;li&gt;restart containers&lt;/li&gt;
&lt;li&gt;deploy apps&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;ALB routes traffic.&lt;/p&gt;




&lt;h2&gt;
  
  
  ECS
&lt;/h2&gt;

&lt;p&gt;ECS actually:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;launches containers&lt;/li&gt;
&lt;li&gt;restarts containers&lt;/li&gt;
&lt;li&gt;scales tasks&lt;/li&gt;
&lt;li&gt;deploys revisions&lt;/li&gt;
&lt;/ul&gt;




&lt;h1&gt;
  
  
  Part 14 — Real Production Architecture
&lt;/h1&gt;

&lt;h2&gt;
  
  
  Simple Architecture
&lt;/h2&gt;

&lt;p&gt;Users&lt;br&gt;
↓&lt;br&gt;
Public IP&lt;br&gt;
↓&lt;br&gt;
EC2&lt;br&gt;
↓&lt;br&gt;
Docker container&lt;/p&gt;

&lt;p&gt;Good for small applications.&lt;/p&gt;




&lt;h2&gt;
  
  
  Better Production Architecture
&lt;/h2&gt;

&lt;p&gt;Users&lt;br&gt;
↓&lt;br&gt;
CloudFront CDN&lt;br&gt;
↓&lt;br&gt;
ALB&lt;br&gt;
↓&lt;br&gt;
ECS Fargate&lt;br&gt;
↓&lt;br&gt;
Containers&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 15 — Why CloudFront Exists
&lt;/h1&gt;

&lt;p&gt;CloudFront is a CDN (content delivery network).&lt;/p&gt;

&lt;p&gt;It distributes content globally.&lt;/p&gt;

&lt;p&gt;Without CDN:&lt;/p&gt;

&lt;p&gt;All users hit one region.&lt;/p&gt;

&lt;p&gt;Example:&lt;/p&gt;

&lt;p&gt;Only us-east-1.&lt;/p&gt;

&lt;p&gt;Users in Asia experience latency.&lt;/p&gt;

&lt;p&gt;CloudFront caches closer to users.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 16 — What Is SRE?
&lt;/h1&gt;

&lt;p&gt;SRE = Site Reliability Engineering.&lt;/p&gt;

&lt;p&gt;SRE focuses on:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;uptime&lt;/li&gt;
&lt;li&gt;monitoring&lt;/li&gt;
&lt;li&gt;reliability&lt;/li&gt;
&lt;li&gt;scaling&lt;/li&gt;
&lt;li&gt;observability&lt;/li&gt;
&lt;li&gt;alerting&lt;/li&gt;
&lt;li&gt;automation&lt;/li&gt;
&lt;li&gt;troubleshooting&lt;/li&gt;
&lt;/ul&gt;




&lt;h1&gt;
  
  
  Part 17 — What Is Observability?
&lt;/h1&gt;

&lt;p&gt;Observability means:&lt;/p&gt;

&lt;p&gt;Understanding system behavior.&lt;/p&gt;

&lt;p&gt;Three major pillars:&lt;/p&gt;

&lt;ol&gt;
&lt;li&gt;Metrics&lt;/li&gt;
&lt;li&gt;Logs&lt;/li&gt;
&lt;li&gt;Traces&lt;/li&gt;
&lt;/ol&gt;




&lt;h1&gt;
  
  
  Part 18 — Metrics vs Logs
&lt;/h1&gt;

&lt;h2&gt;
  
  
  Metrics
&lt;/h2&gt;

&lt;p&gt;Metrics answer:&lt;/p&gt;

&lt;p&gt;"WHAT is wrong?"&lt;/p&gt;

&lt;p&gt;Examples:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;CPU 90%&lt;/li&gt;
&lt;li&gt;Memory 85%&lt;/li&gt;
&lt;li&gt;Request latency&lt;/li&gt;
&lt;li&gt;Error rate&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Metrics are numerical time-series data.&lt;/p&gt;




&lt;h2&gt;
  
  
  Logs
&lt;/h2&gt;

&lt;p&gt;Logs answer:&lt;/p&gt;

&lt;p&gt;"WHY is it wrong?"&lt;/p&gt;

&lt;p&gt;Examples:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;database timeout&lt;/li&gt;
&lt;li&gt;authentication failure&lt;/li&gt;
&lt;li&gt;stack trace&lt;/li&gt;
&lt;li&gt;nginx 500 error&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Logs are text.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 19 — Prometheus
&lt;/h1&gt;

&lt;p&gt;Prometheus stores metrics.&lt;/p&gt;

&lt;p&gt;Prometheus is:&lt;/p&gt;

&lt;p&gt;Time-series database.&lt;/p&gt;

&lt;p&gt;Prometheus stores:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;CPU history&lt;/li&gt;
&lt;li&gt;Memory history&lt;/li&gt;
&lt;li&gt;Request history&lt;/li&gt;
&lt;li&gt;Error history&lt;/li&gt;
&lt;li&gt;Latency history&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Prometheus uses:&lt;/p&gt;

&lt;p&gt;PromQL.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 20 — Grafana
&lt;/h1&gt;

&lt;p&gt;Grafana visualizes telemetry.&lt;/p&gt;

&lt;p&gt;Grafana itself does NOT store:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;metrics&lt;/li&gt;
&lt;li&gt;logs&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Grafana reads from:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Prometheus&lt;/li&gt;
&lt;li&gt;Loki&lt;/li&gt;
&lt;li&gt;Tempo&lt;/li&gt;
&lt;li&gt;CloudWatch&lt;/li&gt;
&lt;li&gt;Elasticsearch&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Grafana creates:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;dashboards&lt;/li&gt;
&lt;li&gt;alerts&lt;/li&gt;
&lt;li&gt;graphs&lt;/li&gt;
&lt;li&gt;log search&lt;/li&gt;
&lt;/ul&gt;




&lt;h1&gt;
  
  
  Part 21 — Loki
&lt;/h1&gt;

&lt;p&gt;Loki stores logs centrally.&lt;/p&gt;

&lt;p&gt;Instead of:&lt;/p&gt;

&lt;p&gt;logging into every machine,&lt;/p&gt;

&lt;p&gt;Loki centralizes logs.&lt;/p&gt;

&lt;p&gt;All systems send logs into Loki.&lt;/p&gt;

&lt;p&gt;Grafana can search them.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 22 — Alloy
&lt;/h1&gt;

&lt;p&gt;Alloy is a telemetry pipeline agent.&lt;/p&gt;

&lt;p&gt;Alloy can:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;collect metrics&lt;/li&gt;
&lt;li&gt;collect logs&lt;/li&gt;
&lt;li&gt;collect traces&lt;/li&gt;
&lt;li&gt;forward telemetry&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Alloy sends data to:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Prometheus&lt;/li&gt;
&lt;li&gt;Loki&lt;/li&gt;
&lt;li&gt;Tempo&lt;/li&gt;
&lt;li&gt;Grafana Cloud&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Important:&lt;/p&gt;

&lt;p&gt;Alloy is NOT main storage.&lt;/p&gt;

&lt;p&gt;Alloy transports telemetry.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 23 — Node Exporter
&lt;/h1&gt;

&lt;p&gt;Node Exporter exposes Linux host metrics.&lt;/p&gt;

&lt;p&gt;Examples:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;CPU usage&lt;/li&gt;
&lt;li&gt;RAM usage&lt;/li&gt;
&lt;li&gt;Disk usage&lt;/li&gt;
&lt;li&gt;Filesystem metrics&lt;/li&gt;
&lt;li&gt;Network metrics&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Node Exporter exposes a metrics endpoint:&lt;/p&gt;

&lt;p&gt;/metrics&lt;/p&gt;

&lt;p&gt;Prometheus scrapes it.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 24 — Why Node Exporter Is Different In Fargate
&lt;/h1&gt;

&lt;p&gt;In EC2:&lt;/p&gt;

&lt;p&gt;You control Linux host.&lt;/p&gt;

&lt;p&gt;You can install Node Exporter.&lt;/p&gt;




&lt;p&gt;In Fargate:&lt;/p&gt;

&lt;p&gt;AWS hides:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;host OS&lt;/li&gt;
&lt;li&gt;kernel&lt;/li&gt;
&lt;li&gt;hardware&lt;/li&gt;
&lt;li&gt;hypervisor&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;So you cannot install Node Exporter on the Fargate host.&lt;/p&gt;

&lt;p&gt;Instead we use:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;ECS telemetry&lt;/li&gt;
&lt;li&gt;Alloy&lt;/li&gt;
&lt;li&gt;OpenTelemetry&lt;/li&gt;
&lt;li&gt;CloudWatch metrics&lt;/li&gt;
&lt;/ul&gt;




&lt;h1&gt;
  
  
  Part 25 — Sidecar Containers
&lt;/h1&gt;

&lt;p&gt;Sidecar means:&lt;/p&gt;

&lt;p&gt;A second container inside the same task/pod.&lt;/p&gt;

&lt;p&gt;Example:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;application container&lt;/li&gt;
&lt;li&gt;alloy sidecar&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Why?&lt;/p&gt;

&lt;p&gt;Sidecar can:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;collect local logs&lt;/li&gt;
&lt;li&gt;collect metrics&lt;/li&gt;
&lt;li&gt;forward telemetry&lt;/li&gt;
&lt;/ul&gt;




&lt;h1&gt;
  
  
  Part 26 — Why We Separate Services
&lt;/h1&gt;

&lt;p&gt;Real production architecture separates:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;application&lt;/li&gt;
&lt;li&gt;Prometheus&lt;/li&gt;
&lt;li&gt;Grafana&lt;/li&gt;
&lt;li&gt;Loki&lt;/li&gt;
&lt;li&gt;Alloy&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Why?&lt;/p&gt;

&lt;p&gt;Different scaling requirements.&lt;/p&gt;

&lt;p&gt;Different CPU usage.&lt;/p&gt;

&lt;p&gt;Different memory usage.&lt;/p&gt;

&lt;p&gt;Avoid single point of failure.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 27 — Final Production Architecture
&lt;/h1&gt;

&lt;p&gt;Internet&lt;br&gt;
↓&lt;br&gt;
CloudFront&lt;br&gt;
↓&lt;br&gt;
ALB&lt;br&gt;
↓&lt;br&gt;
ECS Fargate Application&lt;br&gt;
↓&lt;br&gt;
Metrics → Prometheus&lt;br&gt;
↓&lt;br&gt;
Logs → Alloy → Loki&lt;br&gt;
↓&lt;br&gt;
Grafana dashboards&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 28 — IAM Roles In ECS
&lt;/h1&gt;

&lt;p&gt;Two important roles.&lt;/p&gt;

&lt;h2&gt;
  
  
  Task Execution Role
&lt;/h2&gt;

&lt;p&gt;Used by ECS infrastructure.&lt;/p&gt;

&lt;p&gt;Allows:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;pulling images&lt;/li&gt;
&lt;li&gt;CloudWatch logs&lt;/li&gt;
&lt;li&gt;ECS startup actions&lt;/li&gt;
&lt;/ul&gt;




&lt;h2&gt;
  
  
  Task Role
&lt;/h2&gt;

&lt;p&gt;Used by application container.&lt;/p&gt;

&lt;p&gt;Allows application access to:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;S3&lt;/li&gt;
&lt;li&gt;DynamoDB&lt;/li&gt;
&lt;li&gt;Secrets Manager&lt;/li&gt;
&lt;li&gt;AWS APIs&lt;/li&gt;
&lt;/ul&gt;




&lt;h1&gt;
  
  
  Part 29 — Security Groups
&lt;/h1&gt;

&lt;p&gt;Security Groups are virtual firewalls.&lt;/p&gt;

&lt;p&gt;They control:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;inbound traffic&lt;/li&gt;
&lt;li&gt;outbound traffic&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Example:&lt;/p&gt;

&lt;p&gt;Allow:&lt;/p&gt;

&lt;p&gt;HTTP 80&lt;br&gt;
HTTPS 443&lt;br&gt;
Grafana 3000&lt;br&gt;
Prometheus 9090&lt;br&gt;
Loki 3100&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 30 — ECS Troubleshooting Learned In Lab
&lt;/h1&gt;

&lt;p&gt;Real troubleshooting scenarios encountered:&lt;/p&gt;

&lt;h2&gt;
  
  
  Image Pull Failure
&lt;/h2&gt;

&lt;p&gt;CannotPullContainerError&lt;br&gt;
403 Forbidden&lt;/p&gt;

&lt;p&gt;Cause:&lt;/p&gt;

&lt;p&gt;Private registry permissions.&lt;/p&gt;

&lt;p&gt;Fix:&lt;/p&gt;

&lt;p&gt;Use a public container image, or grant the task execution role permission to pull from the private registry.&lt;/p&gt;




&lt;h2&gt;
  
  
  Deployment Rollback
&lt;/h2&gt;

&lt;p&gt;ECS deployment rollback failed.&lt;/p&gt;

&lt;p&gt;Cause:&lt;/p&gt;

&lt;p&gt;Containers failing during deployment.&lt;/p&gt;




&lt;h2&gt;
  
  
  Alloy Command Parsing Issue
&lt;/h2&gt;

&lt;p&gt;Wrong command:&lt;/p&gt;

&lt;p&gt;run /etc/alloy/fargate.alloy&lt;/p&gt;

&lt;p&gt;Correct ECS array syntax (comma-separated arguments):&lt;/p&gt;

&lt;p&gt;run,/etc/alloy/fargate.alloy&lt;/p&gt;




&lt;h2&gt;
  
  
  Missing Config File
&lt;/h2&gt;

&lt;p&gt;Alloy failed because:&lt;/p&gt;

&lt;p&gt;/etc/alloy/fargate.alloy&lt;/p&gt;

&lt;p&gt;was not mounted.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 31 — Real SRE Workflow
&lt;/h1&gt;

&lt;p&gt;Real workflow:&lt;/p&gt;

&lt;p&gt;Deploy&lt;br&gt;
↓&lt;br&gt;
Observe failure&lt;br&gt;
↓&lt;br&gt;
Read logs&lt;br&gt;
↓&lt;br&gt;
Find root cause&lt;br&gt;
↓&lt;br&gt;
Fix configuration&lt;br&gt;
↓&lt;br&gt;
Redeploy&lt;br&gt;
↓&lt;br&gt;
Validate telemetry&lt;/p&gt;

&lt;p&gt;This is real production engineering.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 32 — Why Metrics And Logs Together Matter
&lt;/h1&gt;

&lt;p&gt;Metrics tell:&lt;/p&gt;

&lt;p&gt;WHAT is wrong.&lt;/p&gt;

&lt;p&gt;Logs tell:&lt;/p&gt;

&lt;p&gt;WHY it is wrong.&lt;/p&gt;

&lt;p&gt;Example:&lt;/p&gt;

&lt;p&gt;Metrics:&lt;/p&gt;

&lt;p&gt;CPU 95%&lt;/p&gt;

&lt;p&gt;Logs:&lt;/p&gt;

&lt;p&gt;Database timeout causing retries.&lt;/p&gt;

&lt;p&gt;Together they explain outages.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 33 — Why Modern Systems Need Observability
&lt;/h1&gt;

&lt;p&gt;Modern systems are distributed.&lt;/p&gt;

&lt;p&gt;Many:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;containers&lt;/li&gt;
&lt;li&gt;services&lt;/li&gt;
&lt;li&gt;APIs&lt;/li&gt;
&lt;li&gt;databases&lt;/li&gt;
&lt;li&gt;networks&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Without observability:&lt;/p&gt;

&lt;p&gt;troubleshooting becomes impossible.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 34 — What Students Learned In This Lab
&lt;/h1&gt;

&lt;p&gt;Students learned:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;ECS&lt;/li&gt;
&lt;li&gt;Fargate&lt;/li&gt;
&lt;li&gt;Task Definitions&lt;/li&gt;
&lt;li&gt;Services&lt;/li&gt;
&lt;li&gt;IAM Roles&lt;/li&gt;
&lt;li&gt;Security Groups&lt;/li&gt;
&lt;li&gt;CloudWatch Logs&lt;/li&gt;
&lt;li&gt;Deployment failures&lt;/li&gt;
&lt;li&gt;Rollbacks&lt;/li&gt;
&lt;li&gt;Container troubleshooting&lt;/li&gt;
&lt;li&gt;Sidecars&lt;/li&gt;
&lt;li&gt;Metrics vs logs&lt;/li&gt;
&lt;li&gt;Observability&lt;/li&gt;
&lt;li&gt;Prometheus&lt;/li&gt;
&lt;li&gt;Grafana&lt;/li&gt;
&lt;li&gt;Loki&lt;/li&gt;
&lt;li&gt;Alloy&lt;/li&gt;
&lt;li&gt;Distributed systems thinking&lt;/li&gt;
&lt;/ul&gt;




&lt;h1&gt;
  
  
  Part 35 — Final Important Concepts
&lt;/h1&gt;

&lt;h2&gt;
  
  
  Docker
&lt;/h2&gt;

&lt;p&gt;Packages application.&lt;/p&gt;




&lt;h2&gt;
  
  
  ECS/Kubernetes
&lt;/h2&gt;

&lt;p&gt;Runs applications reliably.&lt;/p&gt;




&lt;h2&gt;
  
  
  ALB
&lt;/h2&gt;

&lt;p&gt;Routes traffic.&lt;/p&gt;




&lt;h2&gt;
  
  
  CloudFront
&lt;/h2&gt;

&lt;p&gt;Distributes globally.&lt;/p&gt;




&lt;h2&gt;
  
  
  Prometheus
&lt;/h2&gt;

&lt;p&gt;Stores metrics.&lt;/p&gt;




&lt;h2&gt;
  
  
  Loki
&lt;/h2&gt;

&lt;p&gt;Stores logs.&lt;/p&gt;




&lt;h2&gt;
  
  
  Grafana
&lt;/h2&gt;

&lt;p&gt;Visualizes telemetry.&lt;/p&gt;




&lt;h2&gt;
  
  
  Alloy
&lt;/h2&gt;

&lt;p&gt;Collects/transports telemetry.&lt;/p&gt;




&lt;h2&gt;
  
  
  Node Exporter
&lt;/h2&gt;

&lt;p&gt;Produces Linux host metrics.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 36 — Enterprise SRE Mindset
&lt;/h1&gt;

&lt;p&gt;Modern SRE engineers think about:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;scalability&lt;/li&gt;
&lt;li&gt;observability&lt;/li&gt;
&lt;li&gt;automation&lt;/li&gt;
&lt;li&gt;reliability&lt;/li&gt;
&lt;li&gt;distributed systems&lt;/li&gt;
&lt;li&gt;telemetry&lt;/li&gt;
&lt;li&gt;infrastructure&lt;/li&gt;
&lt;li&gt;deployment safety&lt;/li&gt;
&lt;li&gt;failure recovery&lt;/li&gt;
&lt;li&gt;centralized monitoring&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;This is the foundation of modern cloud-native engineering.&lt;/p&gt;

</description>
    </item>
    <item>
      <title>Lecture: ECS Fargate Prometheus Grafana Loki Alloy Node Exporter</title>
      <dc:creator>Aisalkyn Aidarova</dc:creator>
      <pubDate>Mon, 25 May 2026 22:39:02 +0000</pubDate>
      <link>https://dev.to/jumptotech/lecture-ecs-fargate-prometheus-grafana-loki-alloy-node-exporter-4o40</link>
      <guid>https://dev.to/jumptotech/lecture-ecs-fargate-prometheus-grafana-loki-alloy-node-exporter-4o40</guid>
      <description>&lt;h1&gt;
  
  
  Part 1 — Introduction
&lt;/h1&gt;

&lt;p&gt;This lecture explains:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;What cloud infrastructure really is&lt;/li&gt;
&lt;li&gt;Difference between EC2, ECS, Fargate, Kubernetes, Load Balancer&lt;/li&gt;
&lt;li&gt;Why companies use cloud&lt;/li&gt;
&lt;li&gt;Why SRE and DevOps engineers exist&lt;/li&gt;
&lt;li&gt;What observability means&lt;/li&gt;
&lt;li&gt;Difference between metrics and logs&lt;/li&gt;
&lt;li&gt;Why we use Prometheus, Grafana, Loki, Alloy, and Node Exporter&lt;/li&gt;
&lt;li&gt;Real production architecture&lt;/li&gt;
&lt;li&gt;Real troubleshooting scenarios&lt;/li&gt;
&lt;li&gt;ECS/Fargate deployment flow&lt;/li&gt;
&lt;li&gt;Why sidecars exist&lt;/li&gt;
&lt;li&gt;Why modern systems use centralized observability&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;This lecture is based on real troubleshooting and deployment scenarios.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 2 — What Is Really Happening In The Cloud?
&lt;/h1&gt;

&lt;p&gt;Many beginners think:&lt;/p&gt;

&lt;p&gt;"AWS is just hosting my website."&lt;/p&gt;

&lt;p&gt;But in reality AWS provides:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Physical data centers&lt;/li&gt;
&lt;li&gt;Servers&lt;/li&gt;
&lt;li&gt;Networking&lt;/li&gt;
&lt;li&gt;Internet routing&lt;/li&gt;
&lt;li&gt;Storage&lt;/li&gt;
&lt;li&gt;Virtualization&lt;/li&gt;
&lt;li&gt;Hypervisors&lt;/li&gt;
&lt;li&gt;Security&lt;/li&gt;
&lt;li&gt;Scaling infrastructure&lt;/li&gt;
&lt;li&gt;High availability&lt;/li&gt;
&lt;li&gt;Global infrastructure&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;When you deploy an application to AWS:&lt;/p&gt;

&lt;p&gt;You are renting compute resources from AWS.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 3 — Server vs Virtual Machine
&lt;/h1&gt;

&lt;h2&gt;
  
  
  Physical Server
&lt;/h2&gt;

&lt;p&gt;A physical server is:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Real hardware&lt;/li&gt;
&lt;li&gt;CPU&lt;/li&gt;
&lt;li&gt;RAM&lt;/li&gt;
&lt;li&gt;Storage&lt;/li&gt;
&lt;li&gt;Network cards&lt;/li&gt;
&lt;li&gt;Power supply&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Inside AWS data centers there are thousands of physical servers.&lt;/p&gt;




&lt;h2&gt;
  
  
  Virtual Machine (VM)
&lt;/h2&gt;

&lt;p&gt;A virtual machine is:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Software-defined computer&lt;/li&gt;
&lt;li&gt;Runs on top of physical server&lt;/li&gt;
&lt;li&gt;Has virtual CPU&lt;/li&gt;
&lt;li&gt;Virtual RAM&lt;/li&gt;
&lt;li&gt;Virtual storage&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;One physical server can run many virtual machines.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 4 — Why Cloud Exists
&lt;/h1&gt;

&lt;p&gt;Without cloud:&lt;/p&gt;

&lt;p&gt;You would need:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Your own server room&lt;/li&gt;
&lt;li&gt;Cooling&lt;/li&gt;
&lt;li&gt;Electricity&lt;/li&gt;
&lt;li&gt;Networking&lt;/li&gt;
&lt;li&gt;Internet provider&lt;/li&gt;
&lt;li&gt;Firewalls&lt;/li&gt;
&lt;li&gt;Routers&lt;/li&gt;
&lt;li&gt;Hardware replacement&lt;/li&gt;
&lt;li&gt;OS patching&lt;/li&gt;
&lt;li&gt;Security&lt;/li&gt;
&lt;li&gt;Scaling&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Cloud providers solve this.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 5 — Docker
&lt;/h1&gt;

&lt;p&gt;Docker solves:&lt;/p&gt;

&lt;p&gt;"How do we package applications consistently?"&lt;/p&gt;

&lt;p&gt;Docker container includes:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Application&lt;/li&gt;
&lt;li&gt;Libraries&lt;/li&gt;
&lt;li&gt;Dependencies&lt;/li&gt;
&lt;li&gt;Runtime&lt;/li&gt;
&lt;li&gt;Configuration&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Container can run consistently:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;laptop&lt;/li&gt;
&lt;li&gt;EC2&lt;/li&gt;
&lt;li&gt;ECS&lt;/li&gt;
&lt;li&gt;Kubernetes&lt;/li&gt;
&lt;li&gt;cloud&lt;/li&gt;
&lt;/ul&gt;




&lt;h1&gt;
  
  
  Part 6 — Why Docker Alone Is Not Enough
&lt;/h1&gt;

&lt;p&gt;If you run only one container:&lt;/p&gt;

&lt;p&gt;Docker alone may be enough.&lt;/p&gt;

&lt;p&gt;But large systems need:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;scaling&lt;/li&gt;
&lt;li&gt;failover&lt;/li&gt;
&lt;li&gt;networking&lt;/li&gt;
&lt;li&gt;deployment automation&lt;/li&gt;
&lt;li&gt;service discovery&lt;/li&gt;
&lt;li&gt;self-healing&lt;/li&gt;
&lt;li&gt;orchestration&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;This is why Kubernetes and ECS exist.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 7 — ECS vs Kubernetes
&lt;/h1&gt;

&lt;h2&gt;
  
  
  ECS
&lt;/h2&gt;

&lt;p&gt;AWS-native container orchestrator.&lt;/p&gt;

&lt;p&gt;Simpler.&lt;/p&gt;

&lt;p&gt;Good integration with AWS.&lt;/p&gt;




&lt;h2&gt;
  
  
  Kubernetes
&lt;/h2&gt;

&lt;p&gt;Industry-standard orchestration platform.&lt;/p&gt;

&lt;p&gt;More powerful.&lt;/p&gt;

&lt;p&gt;More complex.&lt;/p&gt;

&lt;p&gt;Used heavily by large enterprises.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 8 — What Is ECS?
&lt;/h1&gt;

&lt;p&gt;ECS = Elastic Container Service.&lt;/p&gt;

&lt;p&gt;ECS does:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Runs containers&lt;/li&gt;
&lt;li&gt;Restarts failed containers&lt;/li&gt;
&lt;li&gt;Deploys applications&lt;/li&gt;
&lt;li&gt;Scales containers&lt;/li&gt;
&lt;li&gt;Handles networking&lt;/li&gt;
&lt;li&gt;Manages task lifecycle&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;ECS is NOT:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Load balancer&lt;/li&gt;
&lt;li&gt;Database&lt;/li&gt;
&lt;li&gt;Monitoring system&lt;/li&gt;
&lt;/ul&gt;




&lt;h1&gt;
  
  
  Part 9 — What Is Fargate?
&lt;/h1&gt;

&lt;p&gt;Fargate is serverless container infrastructure.&lt;/p&gt;

&lt;p&gt;AWS manages:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;servers&lt;/li&gt;
&lt;li&gt;patching&lt;/li&gt;
&lt;li&gt;hypervisor&lt;/li&gt;
&lt;li&gt;scaling infrastructure&lt;/li&gt;
&lt;li&gt;hardware&lt;/li&gt;
&lt;li&gt;OS maintenance&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;You manage only:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;containers&lt;/li&gt;
&lt;li&gt;task definitions&lt;/li&gt;
&lt;li&gt;services&lt;/li&gt;
&lt;/ul&gt;




&lt;h1&gt;
  
  
  Part 10 — ECS Cluster Does NOT Mean One Machine
&lt;/h1&gt;

&lt;p&gt;Important concept.&lt;/p&gt;

&lt;p&gt;ECS cluster is:&lt;/p&gt;

&lt;p&gt;Logical grouping for tasks/services.&lt;/p&gt;

&lt;p&gt;One ECS cluster can run:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;many tasks&lt;/li&gt;
&lt;li&gt;many services&lt;/li&gt;
&lt;li&gt;many applications&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Example:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;app-service&lt;/li&gt;
&lt;li&gt;grafana-service&lt;/li&gt;
&lt;li&gt;prometheus-service&lt;/li&gt;
&lt;li&gt;loki-service&lt;/li&gt;
&lt;li&gt;alloy-service&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;all inside ONE ECS cluster.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 11 — ECS Task Definition
&lt;/h1&gt;

&lt;p&gt;Task definition describes:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;container image&lt;/li&gt;
&lt;li&gt;CPU&lt;/li&gt;
&lt;li&gt;memory&lt;/li&gt;
&lt;li&gt;ports&lt;/li&gt;
&lt;li&gt;environment variables&lt;/li&gt;
&lt;li&gt;IAM roles&lt;/li&gt;
&lt;li&gt;commands&lt;/li&gt;
&lt;li&gt;networking&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Task definition is like:&lt;/p&gt;

&lt;p&gt;Blueprint/template for containers.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 12 — ECS Service
&lt;/h1&gt;

&lt;p&gt;Service keeps tasks alive.&lt;/p&gt;

&lt;p&gt;If container crashes:&lt;/p&gt;

&lt;p&gt;ECS service recreates it automatically.&lt;/p&gt;

&lt;p&gt;This is real production behavior.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 13 — Load Balancer vs ECS
&lt;/h1&gt;

&lt;p&gt;Many beginners confuse this.&lt;/p&gt;

&lt;h2&gt;
  
  
  Load Balancer (ALB)
&lt;/h2&gt;

&lt;p&gt;ALB ONLY distributes traffic.&lt;/p&gt;

&lt;p&gt;ALB does NOT:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;run applications&lt;/li&gt;
&lt;li&gt;restart containers&lt;/li&gt;
&lt;li&gt;deploy apps&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;ALB routes traffic.&lt;/p&gt;




&lt;h2&gt;
  
  
  ECS
&lt;/h2&gt;

&lt;p&gt;ECS actually:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;launches containers&lt;/li&gt;
&lt;li&gt;restarts containers&lt;/li&gt;
&lt;li&gt;scales tasks&lt;/li&gt;
&lt;li&gt;deploys revisions&lt;/li&gt;
&lt;/ul&gt;




&lt;h1&gt;
  
  
  Part 14 — Real Production Architecture
&lt;/h1&gt;

&lt;h2&gt;
  
  
  Simple Architecture
&lt;/h2&gt;

&lt;p&gt;Users&lt;br&gt;
↓&lt;br&gt;
Public IP&lt;br&gt;
↓&lt;br&gt;
EC2&lt;br&gt;
↓&lt;br&gt;
Docker container&lt;/p&gt;

&lt;p&gt;Good for small applications.&lt;/p&gt;




&lt;h2&gt;
  
  
  Better Production Architecture
&lt;/h2&gt;

&lt;p&gt;Users&lt;br&gt;
↓&lt;br&gt;
CloudFront CDN&lt;br&gt;
↓&lt;br&gt;
ALB&lt;br&gt;
↓&lt;br&gt;
ECS Fargate&lt;br&gt;
↓&lt;br&gt;
Containers&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 15 — Why CloudFront Exists
&lt;/h1&gt;

&lt;p&gt;CloudFront is a CDN.&lt;/p&gt;

&lt;p&gt;It distributes content globally.&lt;/p&gt;

&lt;p&gt;Without CDN:&lt;/p&gt;

&lt;p&gt;All users hit one region.&lt;/p&gt;

&lt;p&gt;Example:&lt;/p&gt;

&lt;p&gt;Only us-east-1.&lt;/p&gt;

&lt;p&gt;Users in Asia experience latency.&lt;/p&gt;

&lt;p&gt;CloudFront caches closer to users.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 16 — What Is SRE?
&lt;/h1&gt;

&lt;p&gt;SRE = Site Reliability Engineering.&lt;/p&gt;

&lt;p&gt;SRE focuses on:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;uptime&lt;/li&gt;
&lt;li&gt;monitoring&lt;/li&gt;
&lt;li&gt;reliability&lt;/li&gt;
&lt;li&gt;scaling&lt;/li&gt;
&lt;li&gt;observability&lt;/li&gt;
&lt;li&gt;alerting&lt;/li&gt;
&lt;li&gt;automation&lt;/li&gt;
&lt;li&gt;troubleshooting&lt;/li&gt;
&lt;/ul&gt;




&lt;h1&gt;
  
  
  Part 17 — What Is Observability?
&lt;/h1&gt;

&lt;p&gt;Observability means:&lt;/p&gt;

&lt;p&gt;Understanding system behavior.&lt;/p&gt;

&lt;p&gt;Three major pillars:&lt;/p&gt;

&lt;ol&gt;
&lt;li&gt;Metrics&lt;/li&gt;
&lt;li&gt;Logs&lt;/li&gt;
&lt;li&gt;Traces&lt;/li&gt;
&lt;/ol&gt;




&lt;h1&gt;
  
  
  Part 18 — Metrics vs Logs
&lt;/h1&gt;

&lt;h2&gt;
  
  
  Metrics
&lt;/h2&gt;

&lt;p&gt;Metrics answer:&lt;/p&gt;

&lt;p&gt;"WHAT is wrong?"&lt;/p&gt;

&lt;p&gt;Examples:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;CPU 90%&lt;/li&gt;
&lt;li&gt;Memory 85%&lt;/li&gt;
&lt;li&gt;Request latency&lt;/li&gt;
&lt;li&gt;Error rate&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Metrics are numerical time-series data.&lt;/p&gt;




&lt;h2&gt;
  
  
  Logs
&lt;/h2&gt;

&lt;p&gt;Logs answer:&lt;/p&gt;

&lt;p&gt;"WHY is it wrong?"&lt;/p&gt;

&lt;p&gt;Examples:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;database timeout&lt;/li&gt;
&lt;li&gt;authentication failure&lt;/li&gt;
&lt;li&gt;stack trace&lt;/li&gt;
&lt;li&gt;nginx 500 error&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Logs are text.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 19 — Prometheus
&lt;/h1&gt;

&lt;p&gt;Prometheus stores metrics.&lt;/p&gt;

&lt;p&gt;Prometheus is:&lt;/p&gt;

&lt;p&gt;A time-series database.&lt;/p&gt;

&lt;p&gt;Prometheus stores:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;CPU history&lt;/li&gt;
&lt;li&gt;Memory history&lt;/li&gt;
&lt;li&gt;Request history&lt;/li&gt;
&lt;li&gt;Error history&lt;/li&gt;
&lt;li&gt;Latency history&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Prometheus uses:&lt;/p&gt;

&lt;p&gt;PromQL.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 20 — Grafana
&lt;/h1&gt;

&lt;p&gt;Grafana visualizes telemetry.&lt;/p&gt;

&lt;p&gt;Grafana itself does NOT store:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;metrics&lt;/li&gt;
&lt;li&gt;logs&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Grafana reads from:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Prometheus&lt;/li&gt;
&lt;li&gt;Loki&lt;/li&gt;
&lt;li&gt;Tempo&lt;/li&gt;
&lt;li&gt;CloudWatch&lt;/li&gt;
&lt;li&gt;Elasticsearch&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Grafana creates:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;dashboards&lt;/li&gt;
&lt;li&gt;alerts&lt;/li&gt;
&lt;li&gt;graphs&lt;/li&gt;
&lt;li&gt;log search&lt;/li&gt;
&lt;/ul&gt;




&lt;h1&gt;
  
  
  Part 21 — Loki
&lt;/h1&gt;

&lt;p&gt;Loki stores logs centrally.&lt;/p&gt;

&lt;p&gt;Instead of:&lt;/p&gt;

&lt;p&gt;logging into every machine,&lt;/p&gt;

&lt;p&gt;Loki centralizes logs.&lt;/p&gt;

&lt;p&gt;All systems send logs into Loki.&lt;/p&gt;

&lt;p&gt;Grafana can search them.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 22 — Alloy
&lt;/h1&gt;

&lt;p&gt;Alloy is a telemetry pipeline agent.&lt;/p&gt;

&lt;p&gt;Alloy can:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;collect metrics&lt;/li&gt;
&lt;li&gt;collect logs&lt;/li&gt;
&lt;li&gt;collect traces&lt;/li&gt;
&lt;li&gt;forward telemetry&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Alloy sends data to:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Prometheus&lt;/li&gt;
&lt;li&gt;Loki&lt;/li&gt;
&lt;li&gt;Tempo&lt;/li&gt;
&lt;li&gt;Grafana Cloud&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Important:&lt;/p&gt;

&lt;p&gt;Alloy is NOT main storage.&lt;/p&gt;

&lt;p&gt;Alloy transports telemetry.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 23 — Node Exporter
&lt;/h1&gt;

&lt;p&gt;Node Exporter exposes Linux host metrics.&lt;/p&gt;

&lt;p&gt;Examples:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;CPU usage&lt;/li&gt;
&lt;li&gt;RAM usage&lt;/li&gt;
&lt;li&gt;Disk usage&lt;/li&gt;
&lt;li&gt;Filesystem metrics&lt;/li&gt;
&lt;li&gt;Network metrics&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Node Exporter exposes a metrics endpoint:&lt;/p&gt;

&lt;p&gt;/metrics&lt;/p&gt;

&lt;p&gt;Prometheus scrapes it.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 24 — Why Node Exporter Is Different In Fargate
&lt;/h1&gt;

&lt;p&gt;In EC2:&lt;/p&gt;

&lt;p&gt;You control the Linux host.&lt;/p&gt;

&lt;p&gt;You can install Node Exporter.&lt;/p&gt;




&lt;p&gt;In Fargate:&lt;/p&gt;

&lt;p&gt;AWS hides:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;host OS&lt;/li&gt;
&lt;li&gt;kernel&lt;/li&gt;
&lt;li&gt;hardware&lt;/li&gt;
&lt;li&gt;hypervisor&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;So you cannot install Node Exporter on the Fargate host.&lt;/p&gt;

&lt;p&gt;Instead we use:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;ECS telemetry&lt;/li&gt;
&lt;li&gt;Alloy&lt;/li&gt;
&lt;li&gt;OpenTelemetry&lt;/li&gt;
&lt;li&gt;CloudWatch metrics&lt;/li&gt;
&lt;/ul&gt;




&lt;h1&gt;
  
  
  Part 25 — Sidecar Containers
&lt;/h1&gt;

&lt;p&gt;Sidecar means:&lt;/p&gt;

&lt;p&gt;A second container inside the same task/pod.&lt;/p&gt;

&lt;p&gt;Example:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;application container&lt;/li&gt;
&lt;li&gt;alloy sidecar&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Why?&lt;/p&gt;

&lt;p&gt;Sidecar can:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;collect local logs&lt;/li&gt;
&lt;li&gt;collect metrics&lt;/li&gt;
&lt;li&gt;forward telemetry&lt;/li&gt;
&lt;/ul&gt;




&lt;h1&gt;
  
  
  Part 26 — Why We Separate Services
&lt;/h1&gt;

&lt;p&gt;Real production architecture separates:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;application&lt;/li&gt;
&lt;li&gt;Prometheus&lt;/li&gt;
&lt;li&gt;Grafana&lt;/li&gt;
&lt;li&gt;Loki&lt;/li&gt;
&lt;li&gt;Alloy&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Why?&lt;/p&gt;

&lt;p&gt;Different scaling requirements.&lt;/p&gt;

&lt;p&gt;Different CPU usage.&lt;/p&gt;

&lt;p&gt;Different memory usage.&lt;/p&gt;

&lt;p&gt;Avoid single point of failure.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 27 — Final Production Architecture
&lt;/h1&gt;

&lt;p&gt;Internet&lt;br&gt;
↓&lt;br&gt;
CloudFront&lt;br&gt;
↓&lt;br&gt;
ALB&lt;br&gt;
↓&lt;br&gt;
ECS Fargate Application&lt;br&gt;
↓&lt;br&gt;
Metrics → Prometheus&lt;br&gt;
↓&lt;br&gt;
Logs → Alloy → Loki&lt;br&gt;
↓&lt;br&gt;
Grafana dashboards&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 28 — IAM Roles In ECS
&lt;/h1&gt;

&lt;p&gt;Two important roles.&lt;/p&gt;

&lt;h2&gt;
  
  
  Task Execution Role
&lt;/h2&gt;

&lt;p&gt;Used by ECS infrastructure.&lt;/p&gt;

&lt;p&gt;Allows:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;pulling images&lt;/li&gt;
&lt;li&gt;CloudWatch logs&lt;/li&gt;
&lt;li&gt;ECS startup actions&lt;/li&gt;
&lt;/ul&gt;




&lt;h2&gt;
  
  
  Task Role
&lt;/h2&gt;

&lt;p&gt;Used by application container.&lt;/p&gt;

&lt;p&gt;Allows application access to:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;S3&lt;/li&gt;
&lt;li&gt;DynamoDB&lt;/li&gt;
&lt;li&gt;Secrets Manager&lt;/li&gt;
&lt;li&gt;AWS APIs&lt;/li&gt;
&lt;/ul&gt;




&lt;h1&gt;
  
  
  Part 29 — Security Groups
&lt;/h1&gt;

&lt;p&gt;Security Groups are virtual firewalls.&lt;/p&gt;

&lt;p&gt;They control:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;inbound traffic&lt;/li&gt;
&lt;li&gt;outbound traffic&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Example:&lt;/p&gt;

&lt;p&gt;Allow:&lt;/p&gt;

&lt;p&gt;HTTP 80&lt;br&gt;
HTTPS 443&lt;br&gt;
Grafana 3000&lt;br&gt;
Prometheus 9090&lt;br&gt;
Loki 3100&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 30 — ECS Troubleshooting Learned In Lab
&lt;/h1&gt;

&lt;p&gt;Real troubleshooting scenarios encountered:&lt;/p&gt;

&lt;h2&gt;
  
  
  Image Pull Failure
&lt;/h2&gt;

&lt;p&gt;CannotPullContainerError&lt;br&gt;
403 Forbidden&lt;/p&gt;

&lt;p&gt;Cause:&lt;/p&gt;

&lt;p&gt;Private registry permissions.&lt;/p&gt;

&lt;p&gt;Fix:&lt;/p&gt;

&lt;p&gt;Use a public container image, or grant the task execution role permission to pull from the private registry.&lt;/p&gt;




&lt;h2&gt;
  
  
  Deployment Rollback
&lt;/h2&gt;

&lt;p&gt;ECS deployment rollback failed.&lt;/p&gt;

&lt;p&gt;Cause:&lt;/p&gt;

&lt;p&gt;Containers failing during deployment.&lt;/p&gt;




&lt;h2&gt;
  
  
  Alloy Command Parsing Issue
&lt;/h2&gt;

&lt;p&gt;Wrong command:&lt;/p&gt;

&lt;p&gt;run /etc/alloy/fargate.alloy&lt;/p&gt;

&lt;p&gt;Correct ECS array syntax (comma-separated arguments):&lt;/p&gt;

&lt;p&gt;run,/etc/alloy/fargate.alloy&lt;/p&gt;




&lt;h2&gt;
  
  
  Missing Config File
&lt;/h2&gt;

&lt;p&gt;Alloy failed because:&lt;/p&gt;

&lt;p&gt;/etc/alloy/fargate.alloy&lt;/p&gt;

&lt;p&gt;was not mounted.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 31 — Real SRE Workflow
&lt;/h1&gt;

&lt;p&gt;Real workflow:&lt;/p&gt;

&lt;p&gt;Deploy&lt;br&gt;
↓&lt;br&gt;
Observe failure&lt;br&gt;
↓&lt;br&gt;
Read logs&lt;br&gt;
↓&lt;br&gt;
Find root cause&lt;br&gt;
↓&lt;br&gt;
Fix configuration&lt;br&gt;
↓&lt;br&gt;
Redeploy&lt;br&gt;
↓&lt;br&gt;
Validate telemetry&lt;/p&gt;

&lt;p&gt;This is real production engineering.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 32 — Why Metrics And Logs Together Matter
&lt;/h1&gt;

&lt;p&gt;Metrics tell:&lt;/p&gt;

&lt;p&gt;WHAT is wrong.&lt;/p&gt;

&lt;p&gt;Logs tell:&lt;/p&gt;

&lt;p&gt;WHY it is wrong.&lt;/p&gt;

&lt;p&gt;Example:&lt;/p&gt;

&lt;p&gt;Metrics:&lt;/p&gt;

&lt;p&gt;CPU 95%&lt;/p&gt;

&lt;p&gt;Logs:&lt;/p&gt;

&lt;p&gt;Database timeout causing retries.&lt;/p&gt;

&lt;p&gt;Together they explain outages.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 33 — Why Modern Systems Need Observability
&lt;/h1&gt;

&lt;p&gt;Modern systems are distributed.&lt;/p&gt;

&lt;p&gt;Many:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;containers&lt;/li&gt;
&lt;li&gt;services&lt;/li&gt;
&lt;li&gt;APIs&lt;/li&gt;
&lt;li&gt;databases&lt;/li&gt;
&lt;li&gt;networks&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Without observability:&lt;/p&gt;

&lt;p&gt;troubleshooting becomes impossible.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 34 — What Students Learned In This Lab
&lt;/h1&gt;

&lt;p&gt;Students learned:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;ECS&lt;/li&gt;
&lt;li&gt;Fargate&lt;/li&gt;
&lt;li&gt;Task Definitions&lt;/li&gt;
&lt;li&gt;Services&lt;/li&gt;
&lt;li&gt;IAM Roles&lt;/li&gt;
&lt;li&gt;Security Groups&lt;/li&gt;
&lt;li&gt;CloudWatch Logs&lt;/li&gt;
&lt;li&gt;Deployment failures&lt;/li&gt;
&lt;li&gt;Rollbacks&lt;/li&gt;
&lt;li&gt;Container troubleshooting&lt;/li&gt;
&lt;li&gt;Sidecars&lt;/li&gt;
&lt;li&gt;Metrics vs logs&lt;/li&gt;
&lt;li&gt;Observability&lt;/li&gt;
&lt;li&gt;Prometheus&lt;/li&gt;
&lt;li&gt;Grafana&lt;/li&gt;
&lt;li&gt;Loki&lt;/li&gt;
&lt;li&gt;Alloy&lt;/li&gt;
&lt;li&gt;Distributed systems thinking&lt;/li&gt;
&lt;/ul&gt;




&lt;h1&gt;
  
  
  Part 35 — Final Important Concepts
&lt;/h1&gt;

&lt;h2&gt;
  
  
  Docker
&lt;/h2&gt;

&lt;p&gt;Packages application.&lt;/p&gt;




&lt;h2&gt;
  
  
  ECS/Kubernetes
&lt;/h2&gt;

&lt;p&gt;Runs applications reliably.&lt;/p&gt;




&lt;h2&gt;
  
  
  ALB
&lt;/h2&gt;

&lt;p&gt;Routes traffic.&lt;/p&gt;




&lt;h2&gt;
  
  
  CloudFront
&lt;/h2&gt;

&lt;p&gt;Distributes globally.&lt;/p&gt;




&lt;h2&gt;
  
  
  Prometheus
&lt;/h2&gt;

&lt;p&gt;Stores metrics.&lt;/p&gt;




&lt;h2&gt;
  
  
  Loki
&lt;/h2&gt;

&lt;p&gt;Stores logs.&lt;/p&gt;




&lt;h2&gt;
  
  
  Grafana
&lt;/h2&gt;

&lt;p&gt;Visualizes telemetry.&lt;/p&gt;




&lt;h2&gt;
  
  
  Alloy
&lt;/h2&gt;

&lt;p&gt;Collects/transports telemetry.&lt;/p&gt;




&lt;h2&gt;
  
  
  Node Exporter
&lt;/h2&gt;

&lt;p&gt;Produces Linux host metrics.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 36 — Enterprise SRE Mindset
&lt;/h1&gt;

&lt;p&gt;Modern SRE engineers think about:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;scalability&lt;/li&gt;
&lt;li&gt;observability&lt;/li&gt;
&lt;li&gt;automation&lt;/li&gt;
&lt;li&gt;reliability&lt;/li&gt;
&lt;li&gt;distributed systems&lt;/li&gt;
&lt;li&gt;telemetry&lt;/li&gt;
&lt;li&gt;infrastructure&lt;/li&gt;
&lt;li&gt;deployment safety&lt;/li&gt;
&lt;li&gt;failure recovery&lt;/li&gt;
&lt;li&gt;centralized monitoring&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;This is the foundation of modern cloud-native engineering.&lt;/p&gt;

</description>
    </item>
    <item>
      <title>Production Lab: ECS Fargate + Prometheus + Grafana + Loki + Alloy + Node Exporter</title>
      <dc:creator>Aisalkyn Aidarova</dc:creator>
      <pubDate>Mon, 25 May 2026 05:17:12 +0000</pubDate>
      <link>https://dev.to/jumptotech/production-lab-ecs-fargate-prometheus-grafana-loki-alloy-node-exporter-10a</link>
      <guid>https://dev.to/jumptotech/production-lab-ecs-fargate-prometheus-grafana-loki-alloy-node-exporter-10a</guid>
      <description>&lt;h2&gt;
  
  
  Goal
&lt;/h2&gt;

&lt;p&gt;You will build this architecture:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;ECS Fargate Application
   |
   | metrics/logs
   v
Alloy sidecar
   |
   | remote_write metrics
   | push logs
   v
EC2 Monitoring Server
   - Prometheus :9090
   - Grafana    :3000
   - Loki       :3100
   - Alloy
   - Node Exporter
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;According to the official documentation, ECS Fargate tasks use &lt;strong&gt;task execution roles&lt;/strong&gt; for ECS actions like pulling images/logging, and &lt;strong&gt;task roles&lt;/strong&gt; for application AWS permissions. (&lt;a href="https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task_execution_IAM_role.html" rel="noopener noreferrer"&gt;AWS Documentation&lt;/a&gt;) Alloy supports ECS/Fargate container metrics using the ECS Task Metadata Endpoint v4 and should run as a sidecar inside the task. (&lt;a href="https://grafana.com/docs/alloy/latest/reference/components/otelcol/otelcol.receiver.awsecscontainermetrics/" rel="noopener noreferrer"&gt;Grafana Labs&lt;/a&gt;)&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 1: What Each Tool Does
&lt;/h1&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Tool&lt;/th&gt;
&lt;th&gt;What it does&lt;/th&gt;
&lt;th&gt;Why DevOps/SRE uses it&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;ECS&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Runs containers on AWS&lt;/td&gt;
&lt;td&gt;Deploy microservices&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Fargate&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Serverless container runtime&lt;/td&gt;
&lt;td&gt;No EC2 patching/management&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;IAM Role&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Gives permission securely&lt;/td&gt;
&lt;td&gt;No hardcoded AWS keys&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Prometheus&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Stores metrics&lt;/td&gt;
&lt;td&gt;CPU, memory, request rate, errors&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Grafana&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Visual dashboard&lt;/td&gt;
&lt;td&gt;See health visually&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Loki&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Stores logs&lt;/td&gt;
&lt;td&gt;Troubleshoot errors&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Alloy&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Collects metrics/logs/traces&lt;/td&gt;
&lt;td&gt;Modern agent replacing many old agents&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Node Exporter&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Exposes EC2 Linux metrics&lt;/td&gt;
&lt;td&gt;Monitor EC2 server health&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;




&lt;h1&gt;
  
  
  Part 2: EC2 Monitoring Server Check
&lt;/h1&gt;

&lt;p&gt;Your EC2 already has:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Prometheus
Grafana
Node Exporter
Loki
Alloy
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h2&gt;
  
  
  Step 1: Check all services
&lt;/h2&gt;

&lt;p&gt;Run on EC2:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;&lt;span class="nb"&gt;sudo &lt;/span&gt;systemctl status prometheus
&lt;span class="nb"&gt;sudo &lt;/span&gt;systemctl status grafana-server
&lt;span class="nb"&gt;sudo &lt;/span&gt;systemctl status loki
&lt;span class="nb"&gt;sudo &lt;/span&gt;systemctl status alloy
&lt;span class="nb"&gt;sudo &lt;/span&gt;systemctl status node_exporter
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Expected:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;active (running)
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h2&gt;
  
  
  Why we check this
&lt;/h2&gt;

&lt;p&gt;Before we connect ECS, the central monitoring server must be healthy.&lt;/p&gt;

&lt;h2&gt;
  
  
  SRE/DevOps checks
&lt;/h2&gt;

&lt;p&gt;DevOps checks:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;&lt;span class="nb"&gt;sudo &lt;/span&gt;ss &lt;span class="nt"&gt;-tulnp&lt;/span&gt; | &lt;span class="nb"&gt;grep&lt;/span&gt; &lt;span class="nt"&gt;-E&lt;/span&gt; &lt;span class="s1"&gt;'3000|9090|9100|3100'&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Expected ports:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;3000 Grafana
9090 Prometheus
9100 Node Exporter
3100 Loki
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;SRE checks:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;curl http://localhost:9090/-/ready
curl http://localhost:3100/ready
curl http://localhost:9100/metrics
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Expected:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Prometheus ready
Loki ready
Node metrics visible
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h1&gt;
  
  
  Part 3: Fix Prometheus for Remote Write
&lt;/h1&gt;

&lt;p&gt;Fargate tasks are dynamic, and their private IPs change whenever a task is replaced. So instead of Prometheus scraping every task IP, Alloy inside Fargate will &lt;strong&gt;push metrics&lt;/strong&gt; to Prometheus.&lt;/p&gt;

&lt;h2&gt;
  
  
  Step 2: Enable Prometheus remote write receiver
&lt;/h2&gt;

&lt;p&gt;Open a systemd drop-in override for the Prometheus service:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;&lt;span class="nb"&gt;sudo &lt;/span&gt;systemctl edit prometheus
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



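&lt;p&gt;Add the following (the empty &lt;code&gt;ExecStart=&lt;/code&gt; line clears the original command so the next line can redefine it; adjust the paths if your installation differs):&lt;br&gt;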
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight ini"&gt;&lt;code&gt;&lt;span class="nn"&gt;[Service]&lt;/span&gt;
&lt;span class="py"&gt;ExecStart&lt;/span&gt;&lt;span class="p"&gt;=&lt;/span&gt;
&lt;span class="py"&gt;ExecStart&lt;/span&gt;&lt;span class="p"&gt;=&lt;/span&gt;&lt;span class="s"&gt;/usr/local/bin/prometheus &lt;/span&gt;&lt;span class="se"&gt;\
&lt;/span&gt;  &lt;span class="s"&gt;--config.file=/etc/prometheus/prometheus.yml &lt;/span&gt;&lt;span class="se"&gt;\
&lt;/span&gt;  &lt;span class="s"&gt;--storage.tsdb.path=/var/lib/prometheus &lt;/span&gt;&lt;span class="se"&gt;\
&lt;/span&gt;  &lt;span class="s"&gt;--web.listen-address=:9090 &lt;/span&gt;&lt;span class="se"&gt;\
&lt;/span&gt;  &lt;span class="s"&gt;--web.enable-lifecycle &lt;/span&gt;&lt;span class="se"&gt;\
&lt;/span&gt;  &lt;span class="s"&gt;--web.enable-remote-write-receiver&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Restart:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;&lt;span class="nb"&gt;sudo &lt;/span&gt;systemctl daemon-reload
&lt;span class="nb"&gt;sudo &lt;/span&gt;systemctl restart prometheus
&lt;span class="nb"&gt;sudo &lt;/span&gt;systemctl status prometheus
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Test:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;curl http://localhost:9090/-/ready
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h2&gt;
  
  
  Why we do this
&lt;/h2&gt;

&lt;p&gt;Fargate containers cannot easily be scraped by fixed IP because tasks start/stop dynamically. Remote write lets Alloy push metrics to Prometheus.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 4: EC2 Security Group
&lt;/h1&gt;

&lt;p&gt;In AWS Console:&lt;/p&gt;

&lt;p&gt;Go to:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;EC2 → Instances → Select monitoring EC2 → Security → Security Group
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Add inbound rules:&lt;/p&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Port&lt;/th&gt;
&lt;th&gt;Source&lt;/th&gt;
&lt;th&gt;Purpose&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;3000&lt;/td&gt;
&lt;td&gt;Your IP only&lt;/td&gt;
&lt;td&gt;Grafana UI&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;9090&lt;/td&gt;
&lt;td&gt;VPC CIDR only&lt;/td&gt;
&lt;td&gt;Prometheus remote write&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;3100&lt;/td&gt;
&lt;td&gt;VPC CIDR only&lt;/td&gt;
&lt;td&gt;Loki logs&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;9100&lt;/td&gt;
&lt;td&gt;Your IP or VPC only&lt;/td&gt;
&lt;td&gt;Node Exporter test only&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;p&gt;Example VPC CIDR:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;10.0.0.0/16
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Do not open 9090, 3100, 9100 to &lt;code&gt;0.0.0.0/0&lt;/code&gt;.&lt;/p&gt;

&lt;h2&gt;
  
  
  Why we do this
&lt;/h2&gt;

&lt;p&gt;Prometheus and Loki ship without authentication enabled by default, so anyone who can reach these ports can read or write data. Keep them private.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 5: Configure Alloy on EC2
&lt;/h1&gt;

&lt;p&gt;Open:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;&lt;span class="nb"&gt;sudo &lt;/span&gt;nano /etc/alloy/config.alloy
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Use this:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;prometheus.exporter.unix "local_host" {
  set_collectors = ["cpu", "meminfo", "diskstats", "filesystem", "netdev", "loadavg"]
}

prometheus.scrape "local_host" {
  targets    = prometheus.exporter.unix.local_host.targets
  forward_to = [prometheus.remote_write.local_prom.receiver]
}

prometheus.remote_write "local_prom" {
  endpoint {
    url = "http://127.0.0.1:9090/api/v1/write"
  }
}

loki.source.file "system_logs" {
  targets = [
    {__path__ = "/var/log/syslog", job = "syslog"},
    {__path__ = "/var/log/auth.log", job = "auth"},
    {__path__ = "/var/log/nginx/access.log", job = "nginx_access"},
    {__path__ = "/var/log/nginx/error.log", job = "nginx_error"},
  ]
  forward_to = [loki.write.local_loki.receiver]
}

loki.write "local_loki" {
  endpoint {
    url = "http://127.0.0.1:3100/loki/api/v1/push"
  }
}
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Restart:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;&lt;span class="nb"&gt;sudo &lt;/span&gt;alloy &lt;span class="nb"&gt;fmt&lt;/span&gt; &lt;span class="nt"&gt;--write&lt;/span&gt; /etc/alloy/config.alloy
&lt;span class="nb"&gt;sudo &lt;/span&gt;systemctl restart alloy
&lt;span class="nb"&gt;sudo &lt;/span&gt;systemctl status alloy
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h2&gt;
  
  
  Important correction
&lt;/h2&gt;

&lt;p&gt;Use:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;127.0.0.1
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Not:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;123.0.0.1
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h1&gt;
  
  
  Part 6: Create ECS IAM Roles
&lt;/h1&gt;

&lt;h2&gt;
  
  
  Role 1: ECS Task Execution Role
&lt;/h2&gt;

&lt;p&gt;AWS Console:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;IAM → Roles → Create role → AWS service → Elastic Container Service → ECS Task
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



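&lt;p&gt;Attach the managed policy below. It does not include &lt;code&gt;logs:CreateLogGroup&lt;/code&gt;, which the &lt;code&gt;awslogs-create-group&lt;/code&gt; option used in Part 9 requires, so also add that permission or create the log group in advance:&lt;br&gt;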
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;AmazonECSTaskExecutionRolePolicy
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Name:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;ecsTaskExecutionRole
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h2&gt;
  
  
  Why
&lt;/h2&gt;

&lt;p&gt;This allows ECS/Fargate to:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Pull image from ECR
Send logs to CloudWatch
Read Secrets Manager if needed
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h2&gt;
  
  
  Role 2: ECS Task Role
&lt;/h2&gt;

&lt;p&gt;Create another role:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;IAM → Roles → Create role → ECS Task
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Name:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;ecsAppTaskRole
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;For this lab, start with no extra permissions.&lt;/p&gt;

&lt;p&gt;If the app needs S3 later, add only the exact S3 permissions it requires.&lt;/p&gt;

&lt;h2&gt;
  
  
  Why
&lt;/h2&gt;

&lt;p&gt;The task role is for your application container, not for ECS itself.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 7: Create ECS Cluster
&lt;/h1&gt;

&lt;p&gt;AWS Console:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;ECS → Clusters → Create cluster
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Choose:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;AWS Fargate
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Name:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;prod-observability-cluster
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Click:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Create
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h2&gt;
  
  
  Why
&lt;/h2&gt;

&lt;p&gt;A cluster is the logical grouping in which ECS services and tasks run.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 8: Create Simple Application Container
&lt;/h1&gt;

&lt;p&gt;For the easiest lab, use a demo app that exposes Prometheus metrics on port &lt;code&gt;8080&lt;/code&gt;.&lt;/p&gt;

&lt;p&gt;Example image:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;ghcr.io/brancz/prometheus-example-app:v0.5.0
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;It exposes:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;/metrics
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Port:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;8080
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h1&gt;
  
  
  Part 9: Create Fargate Task Definition
&lt;/h1&gt;

&lt;p&gt;Go to:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;ECS → Task Definitions → Create new task definition → Create new task definition with JSON
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Use this template:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight json"&gt;&lt;code&gt;&lt;span class="p"&gt;{&lt;/span&gt;&lt;span class="w"&gt;
  &lt;/span&gt;&lt;span class="nl"&gt;"family"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"fargate-observability-lab"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt;
  &lt;/span&gt;&lt;span class="nl"&gt;"networkMode"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"awsvpc"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt;
  &lt;/span&gt;&lt;span class="nl"&gt;"requiresCompatibilities"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="s2"&gt;"FARGATE"&lt;/span&gt;&lt;span class="p"&gt;],&lt;/span&gt;&lt;span class="w"&gt;
  &lt;/span&gt;&lt;span class="nl"&gt;"cpu"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"512"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt;
  &lt;/span&gt;&lt;span class="nl"&gt;"memory"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"1024"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt;
  &lt;/span&gt;&lt;span class="nl"&gt;"executionRoleArn"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"arn:aws:iam::&amp;lt;ACCOUNT_ID&amp;gt;:role/ecsTaskExecutionRole"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt;
  &lt;/span&gt;&lt;span class="nl"&gt;"taskRoleArn"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"arn:aws:iam::&amp;lt;ACCOUNT_ID&amp;gt;:role/ecsAppTaskRole"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt;
  &lt;/span&gt;&lt;span class="nl"&gt;"containerDefinitions"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="w"&gt;
    &lt;/span&gt;&lt;span class="p"&gt;{&lt;/span&gt;&lt;span class="w"&gt;
      &lt;/span&gt;&lt;span class="nl"&gt;"name"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"demo-app"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt;
      &lt;/span&gt;&lt;span class="nl"&gt;"image"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"ghcr.io/brancz/prometheus-example-app:v0.5.0"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt;
      &lt;/span&gt;&lt;span class="nl"&gt;"essential"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="kc"&gt;true&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt;
      &lt;/span&gt;&lt;span class="nl"&gt;"portMappings"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="w"&gt;
        &lt;/span&gt;&lt;span class="p"&gt;{&lt;/span&gt;&lt;span class="w"&gt;
          &lt;/span&gt;&lt;span class="nl"&gt;"containerPort"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="mi"&gt;8080&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt;
          &lt;/span&gt;&lt;span class="nl"&gt;"protocol"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"tcp"&lt;/span&gt;&lt;span class="w"&gt;
        &lt;/span&gt;&lt;span class="p"&gt;}&lt;/span&gt;&lt;span class="w"&gt;
      &lt;/span&gt;&lt;span class="p"&gt;],&lt;/span&gt;&lt;span class="w"&gt;
      &lt;/span&gt;&lt;span class="nl"&gt;"logConfiguration"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="p"&gt;{&lt;/span&gt;&lt;span class="w"&gt;
        &lt;/span&gt;&lt;span class="nl"&gt;"logDriver"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"awslogs"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt;
        &lt;/span&gt;&lt;span class="nl"&gt;"options"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="p"&gt;{&lt;/span&gt;&lt;span class="w"&gt;
          &lt;/span&gt;&lt;span class="nl"&gt;"awslogs-group"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"/ecs/fargate-observability-lab"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt;
          &lt;/span&gt;&lt;span class="nl"&gt;"awslogs-region"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"us-east-2"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt;
          &lt;/span&gt;&lt;span class="nl"&gt;"awslogs-stream-prefix"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"demo-app"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt;
          &lt;/span&gt;&lt;span class="nl"&gt;"awslogs-create-group"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"true"&lt;/span&gt;&lt;span class="w"&gt;
        &lt;/span&gt;&lt;span class="p"&gt;}&lt;/span&gt;&lt;span class="w"&gt;
      &lt;/span&gt;&lt;span class="p"&gt;}&lt;/span&gt;&lt;span class="w"&gt;
    &lt;/span&gt;&lt;span class="p"&gt;},&lt;/span&gt;&lt;span class="w"&gt;
    &lt;/span&gt;&lt;span class="p"&gt;{&lt;/span&gt;&lt;span class="w"&gt;
      &lt;/span&gt;&lt;span class="nl"&gt;"name"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"alloy-sidecar"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt;
      &lt;/span&gt;&lt;span class="nl"&gt;"image"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"grafana/alloy:latest"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt;
      &lt;/span&gt;&lt;span class="nl"&gt;"essential"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="kc"&gt;false&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt;
      &lt;/span&gt;&lt;span class="nl"&gt;"command"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="w"&gt;
        &lt;/span&gt;&lt;span class="s2"&gt;"run"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt;
        &lt;/span&gt;&lt;span class="s2"&gt;"--server.http.listen-addr=0.0.0.0:12345"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt;
        &lt;/span&gt;&lt;span class="s2"&gt;"/etc/alloy/fargate.alloy"&lt;/span&gt;&lt;span class="w"&gt;
      &lt;/span&gt;&lt;span class="p"&gt;],&lt;/span&gt;&lt;span class="w"&gt;
      &lt;/span&gt;&lt;span class="nl"&gt;"environment"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="w"&gt;
        &lt;/span&gt;&lt;span class="p"&gt;{&lt;/span&gt;&lt;span class="w"&gt;
          &lt;/span&gt;&lt;span class="nl"&gt;"name"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"ALLOY_STABILITY_LEVEL"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt;
          &lt;/span&gt;&lt;span class="nl"&gt;"value"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"experimental"&lt;/span&gt;&lt;span class="w"&gt;
        &lt;/span&gt;&lt;span class="p"&gt;},&lt;/span&gt;&lt;span class="w"&gt;
        &lt;/span&gt;&lt;span class="p"&gt;{&lt;/span&gt;&lt;span class="w"&gt;
          &lt;/span&gt;&lt;span class="nl"&gt;"name"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"EC2_PROMETHEUS_URL"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt;
          &lt;/span&gt;&lt;span class="nl"&gt;"value"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"http://&amp;lt;EC2_PRIVATE_IP&amp;gt;:9090/api/v1/write"&lt;/span&gt;&lt;span class="w"&gt;
        &lt;/span&gt;&lt;span class="p"&gt;},&lt;/span&gt;&lt;span class="w"&gt;
        &lt;/span&gt;&lt;span class="p"&gt;{&lt;/span&gt;&lt;span class="w"&gt;
          &lt;/span&gt;&lt;span class="nl"&gt;"name"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"EC2_LOKI_URL"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt;
          &lt;/span&gt;&lt;span class="nl"&gt;"value"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"http://&amp;lt;EC2_PRIVATE_IP&amp;gt;:3100/loki/api/v1/push"&lt;/span&gt;&lt;span class="w"&gt;
        &lt;/span&gt;&lt;span class="p"&gt;}&lt;/span&gt;&lt;span class="w"&gt;
      &lt;/span&gt;&lt;span class="p"&gt;],&lt;/span&gt;&lt;span class="w"&gt;
      &lt;/span&gt;&lt;span class="nl"&gt;"portMappings"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="w"&gt;
        &lt;/span&gt;&lt;span class="p"&gt;{&lt;/span&gt;&lt;span class="w"&gt;
          &lt;/span&gt;&lt;span class="nl"&gt;"containerPort"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="mi"&gt;12345&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt;
          &lt;/span&gt;&lt;span class="nl"&gt;"protocol"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"tcp"&lt;/span&gt;&lt;span class="w"&gt;
        &lt;/span&gt;&lt;span class="p"&gt;}&lt;/span&gt;&lt;span class="w"&gt;
      &lt;/span&gt;&lt;span class="p"&gt;],&lt;/span&gt;&lt;span class="w"&gt;
      &lt;/span&gt;&lt;span class="nl"&gt;"logConfiguration"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="p"&gt;{&lt;/span&gt;&lt;span class="w"&gt;
        &lt;/span&gt;&lt;span class="nl"&gt;"logDriver"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"awslogs"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt;
        &lt;/span&gt;&lt;span class="nl"&gt;"options"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="p"&gt;{&lt;/span&gt;&lt;span class="w"&gt;
          &lt;/span&gt;&lt;span class="nl"&gt;"awslogs-group"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"/ecs/fargate-observability-lab"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt;
          &lt;/span&gt;&lt;span class="nl"&gt;"awslogs-region"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"us-east-2"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt;
          &lt;/span&gt;&lt;span class="nl"&gt;"awslogs-stream-prefix"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"alloy"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt;
          &lt;/span&gt;&lt;span class="nl"&gt;"awslogs-create-group"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"true"&lt;/span&gt;&lt;span class="w"&gt;
        &lt;/span&gt;&lt;span class="p"&gt;}&lt;/span&gt;&lt;span class="w"&gt;
      &lt;/span&gt;&lt;span class="p"&gt;}&lt;/span&gt;&lt;span class="w"&gt;
    &lt;/span&gt;&lt;span class="p"&gt;}&lt;/span&gt;&lt;span class="w"&gt;
  &lt;/span&gt;&lt;span class="p"&gt;]&lt;/span&gt;&lt;span class="w"&gt;
&lt;/span&gt;&lt;span class="p"&gt;}&lt;/span&gt;&lt;span class="w"&gt;
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Replace:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;&amp;lt;ACCOUNT_ID&amp;gt;
&amp;lt;EC2_PRIVATE_IP&amp;gt;
us-east-2 if your region is different
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h2&gt;
  
  
  Important note
&lt;/h2&gt;

&lt;p&gt;For a real production setup, store Alloy config in:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;EFS
S3 pulled at startup
custom Alloy image
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



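&lt;p&gt;For a class or demo, a custom Alloy image is the easiest option: the stock &lt;code&gt;grafana/alloy&lt;/code&gt; image does not contain &lt;code&gt;/etc/alloy/fargate.alloy&lt;/code&gt;, so the sidecar cannot start until that file is baked into the image.&lt;/p&gt;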




&lt;h1&gt;
  
  
  Part 10: Alloy Fargate Config
&lt;/h1&gt;

&lt;p&gt;Create file:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;fargate.alloy
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Content:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;prometheus.scrape "app_metrics" {
  targets = [
    {"__address__" = "127.0.0.1:8080", "job" = "demo-app"}
  ]

  forward_to = [prometheus.remote_write.ec2_prometheus.receiver]
}

otelcol.receiver.awsecscontainermetrics "fargate_metrics" {
  collection_interval = "30s"

  output {
    metrics = [otelcol.exporter.prometheus.fargate_to_prom.receiver]
  }
}

otelcol.exporter.prometheus "fargate_to_prom" {
  forward_to = [prometheus.remote_write.ec2_prometheus.receiver]
}

prometheus.remote_write "ec2_prometheus" {
  endpoint {
    url = env("EC2_PROMETHEUS_URL")
  }
}
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h2&gt;
  
  
  Why
&lt;/h2&gt;

&lt;p&gt;This collects:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Application /metrics
Fargate task CPU
Fargate task memory
Container-level metrics
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h1&gt;
  
  
  Part 11: Run ECS Service
&lt;/h1&gt;

&lt;p&gt;Go to:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;ECS → Clusters → prod-observability-cluster → Services → Create
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Choose:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Launch type: Fargate
Task definition: fargate-observability-lab
Service name: demo-app-service
Desired tasks: 1
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Networking:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;VPC: same VPC as EC2 monitoring server
Subnets: private subnets preferred
Security group: allow outbound to EC2 private IP ports 9090 and 3100
Public IP: disabled if private subnet has NAT
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Click:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Create
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h2&gt;
  
  
  What to check
&lt;/h2&gt;

&lt;p&gt;Go to:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;ECS → Cluster → Service → Tasks
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Expected:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Task status: Running
Containers: demo-app running, alloy-sidecar running
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h1&gt;
  
  
  Part 12: Verify in Prometheus
&lt;/h1&gt;

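&lt;p&gt;Open Prometheus in a browser (Part 4 limits port 9090 to the VPC CIDR, so connect through a VPN or SSH tunnel, or temporarily allow your own IP in the security group):&lt;br&gt;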
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;http://&amp;lt;EC2_PUBLIC_IP&amp;gt;:9090
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Go to:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Status → TSDB Status
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Then search in Graph:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;up
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Check Alloy internal metrics:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;alloy_component_controller_running_components
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Check EC2 CPU:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;rate(node_cpu_seconds_total[5m])
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Check EC2 memory:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;node_memory_MemAvailable_bytes
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Check app request metrics:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;http_requests_total
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Check Fargate container metrics:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;ecs_task_memory_utilized
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;or:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;container_memory_usage_bytes
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Metric names may vary depending on Alloy/OpenTelemetry conversion.&lt;/p&gt;




&lt;h1&gt;
  
  
  Part 13: Verify in Grafana
&lt;/h1&gt;

&lt;p&gt;Open:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;http://&amp;lt;EC2_PUBLIC_IP&amp;gt;:3000
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Go to:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Connections → Data sources
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Add Prometheus:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;URL: http://localhost:9090
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Add Loki:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;URL: http://localhost:3100
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Click:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Save &amp;amp; test
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Expected:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Data source is working
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h1&gt;
  
  
  Part 14: Grafana Explore Queries
&lt;/h1&gt;

&lt;p&gt;Go to:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Grafana → Explore → Prometheus
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Use:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;up
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;





&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;rate(node_network_receive_bytes_total[1m])
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;





&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;100 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100)
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;





&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;rate(http_requests_total[5m])
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Go to:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Grafana → Explore → Loki
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Use:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;{job="syslog"}
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;





&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;{job="auth"}
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;





&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;{job="nginx_access"}
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;For ECS logs, first check CloudWatch logs:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;CloudWatch → Log groups → /ecs/fargate-observability-lab
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h1&gt;
  
  
  Part 15: What SRE Must Monitor
&lt;/h1&gt;

&lt;h2&gt;
  
  
  1. EC2 monitoring server health
&lt;/h2&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;100 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100)
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Alert if:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Memory &amp;gt; 85%
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Why:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;If the monitoring server dies, you lose visibility.
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h2&gt;
  
  
  2. Disk usage
&lt;/h2&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;100 - ((node_filesystem_avail_bytes{mountpoint="/"} * 100) / node_filesystem_size_bytes{mountpoint="/"})
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Alert if:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Disk &amp;gt; 80%
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Why:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Prometheus and Loki can fill disk quickly.
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h2&gt;
  
  
  3. Fargate task memory
&lt;/h2&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;ecs_task_memory_utilized / ecs_task_memory_reserved * 100
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Alert if:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;&amp;gt; 85% for 3 minutes
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Why:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Fargate kills containers when memory limit is reached.
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h2&gt;
  
  
  4. Application request rate
&lt;/h2&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;sum(rate(http_requests_total[5m]))
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Why:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;If traffic drops to zero, app or routing may be broken.
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h2&gt;
  
  
  5. Error rate
&lt;/h2&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;sum(rate(http_requests_total{code=~"5.."}[5m]))
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Why:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;5xx errors show application or dependency failure.
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h1&gt;
  
  
  Part 16: What DevOps Must Check
&lt;/h1&gt;

&lt;p&gt;DevOps engineer checks:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;1. IAM roles are correct
2. ECS task is running
3. Security groups allow only needed ports
4. Fargate can reach EC2 private IP
5. Prometheus remote write is enabled
6. Loki is receiving logs
7. Grafana data sources work
8. No public access to Prometheus/Loki/Node Exporter
9. ECS service has desired count = running count
10. CloudWatch logs exist for both containers
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h1&gt;
  
  
  Part 17: Troubleshooting
&lt;/h1&gt;

&lt;h2&gt;
  
  
  Problem: ECS task running but no metrics
&lt;/h2&gt;

&lt;p&gt;Check Alloy logs:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;ECS → Task → alloy-sidecar → Logs
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Look for:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;connection refused
timeout
remote write failed
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Common causes:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;EC2 security group blocks port 9090
Wrong EC2 private IP
Prometheus remote write receiver not enabled
Alloy config error
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h2&gt;
  
  
  Problem: Grafana shows no Loki logs
&lt;/h2&gt;

&lt;p&gt;Check:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;curl http://localhost:3100/ready
&lt;span class="nb"&gt;sudo &lt;/span&gt;journalctl &lt;span class="nt"&gt;-u&lt;/span&gt; alloy &lt;span class="nt"&gt;-f&lt;/span&gt;
&lt;span class="nb"&gt;sudo &lt;/span&gt;journalctl &lt;span class="nt"&gt;-u&lt;/span&gt; loki &lt;span class="nt"&gt;-f&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Common causes:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Loki not running
Wrong Loki URL
Alloy cannot read log files
No permissions on /var/log/*
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h2&gt;
  
  
  Problem: Node Exporter works but Fargate metrics missing
&lt;/h2&gt;

&lt;p&gt;Cause:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Node Exporter monitors EC2 only.
It cannot monitor Fargate hosts.
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Correct approach:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Use Alloy sidecar with ECS container metrics receiver.
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h1&gt;
  
  
  Final Teaching Summary
&lt;/h1&gt;

&lt;p&gt;This lab demonstrates a real DevOps/SRE production pattern:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;ECS Fargate runs application containers.
IAM secures container permissions.
Alloy collects telemetry.
Prometheus stores metrics.
Loki stores logs.
Grafana visualizes everything.
Node Exporter monitors the EC2 monitoring server.
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;The most important SRE mindset:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Metrics tell you what is happening.
Logs tell you why it happened.
Grafana helps you see the story.
IAM and security groups control who can access what.
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



</description>
      <category>aws</category>
      <category>devops</category>
      <category>monitoring</category>
      <category>tutorial</category>
    </item>
    <item>
      <title>Full Lecture — Grafana Loki + Grafana Alloy for DevOps &amp; SRE Engineers</title>
      <dc:creator>Aisalkyn Aidarova</dc:creator>
      <pubDate>Fri, 22 May 2026 14:17:49 +0000</pubDate>
      <link>https://dev.to/jumptotech/full-lecture-grafana-loki-grafana-alloy-for-devops-sre-engineers-2f53</link>
      <guid>https://dev.to/jumptotech/full-lecture-grafana-loki-grafana-alloy-for-devops-sre-engineers-2f53</guid>
      <description>&lt;h2&gt;
  
  
  What You Already Built
&lt;/h2&gt;

&lt;p&gt;You already built a REAL observability platform:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Nginx
   ↓
Access/Error Logs
   ↓
Grafana Alloy
   ↓
Grafana Loki
   ↓
Grafana
   ↓
SRE Engineer
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;This is extremely close to what companies use in production.&lt;/p&gt;

&lt;p&gt;You now have:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;centralized logs&lt;/li&gt;
&lt;li&gt;log querying&lt;/li&gt;
&lt;li&gt;observability&lt;/li&gt;
&lt;li&gt;troubleshooting platform&lt;/li&gt;
&lt;li&gt;real SRE workflow&lt;/li&gt;
&lt;/ul&gt;


&lt;h1&gt;
  
  
  What Is Observability?
&lt;/h1&gt;

&lt;p&gt;Observability means:&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Understanding what is happening inside systems
by using:
- metrics
- logs
- traces
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Three pillars:&lt;/p&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Pillar&lt;/th&gt;
&lt;th&gt;Tool&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;Metrics&lt;/td&gt;
&lt;td&gt;Prometheus&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Logs&lt;/td&gt;
&lt;td&gt;Loki&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Traces&lt;/td&gt;
&lt;td&gt;Tempo/OpenTelemetry&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;p&gt;Without observability:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;engineers guess&lt;/li&gt;
&lt;li&gt;outages take hours&lt;/li&gt;
&lt;li&gt;root cause unknown&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;With observability:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;engineers detect incidents fast&lt;/li&gt;
&lt;li&gt;correlate failures&lt;/li&gt;
&lt;li&gt;reduce downtime&lt;/li&gt;
&lt;/ul&gt;


&lt;h1&gt;
  
  
  What Is Loki?
&lt;/h1&gt;

&lt;p&gt;Grafana Loki is a centralized log storage system.&lt;/p&gt;

&lt;p&gt;It stores:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;application logs&lt;/li&gt;
&lt;li&gt;nginx logs&lt;/li&gt;
&lt;li&gt;Kubernetes logs&lt;/li&gt;
&lt;li&gt;Docker logs&lt;/li&gt;
&lt;li&gt;Linux logs&lt;/li&gt;
&lt;li&gt;authentication logs&lt;/li&gt;
&lt;li&gt;API logs&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Instead of SSHing into 100 servers:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;cat /var/log/nginx/access.log
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;You centralize everything into Loki.&lt;/p&gt;


&lt;h1&gt;
  
  
  Why Companies Use Loki
&lt;/h1&gt;

&lt;p&gt;Before centralized logging:&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Server1 logs
Server2 logs
Server3 logs
Kubernetes pod logs
Docker logs
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Impossible to troubleshoot quickly.&lt;/p&gt;

&lt;p&gt;With Loki:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;All logs centralized
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Engineers search:&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;{job="nginx"} |= "500"
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Meaning:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;show nginx errors&lt;/li&gt;
&lt;li&gt;across all infrastructure&lt;/li&gt;
&lt;/ul&gt;


&lt;h1&gt;
  
  
  What Is Alloy?
&lt;/h1&gt;

&lt;p&gt;Grafana Alloy is the collector.&lt;/p&gt;

&lt;p&gt;VERY IMPORTANT:&lt;br&gt;
&lt;/p&gt;

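&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Alloy DOES NOT STORE LOGS
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;It:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;reads&lt;/li&gt;
&lt;li&gt;collects&lt;/li&gt;
&lt;li&gt;forwards&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Think of Alloy like:&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Log shipping agent
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;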


&lt;p&gt;Alloy:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;reads files&lt;/li&gt;
&lt;li&gt;watches logs&lt;/li&gt;
&lt;li&gt;sends logs to Loki&lt;/li&gt;
&lt;/ul&gt;


&lt;h1&gt;
  
  
  Real Production Pipeline
&lt;/h1&gt;



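&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Application
   ↓
Log file created
   ↓
Alloy watches file
   ↓
Alloy ships logs
   ↓
Loki stores logs
   ↓
Grafana visualizes logs
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;




&lt;h1&gt;
  
  
  Why SRE Engineers Need Loki + Alloy
&lt;/h1&gt;

&lt;p&gt;As SRE engineer, your job is:&lt;/p&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Responsibility&lt;/th&gt;
&lt;th&gt;Why Logs Matter&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;Incident response&lt;/td&gt;
&lt;td&gt;identify failures&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Root cause analysis&lt;/td&gt;
&lt;td&gt;determine WHY&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Security investigations&lt;/td&gt;
&lt;td&gt;detect attacks&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Performance debugging&lt;/td&gt;
&lt;td&gt;trace slow systems&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Compliance&lt;/td&gt;
&lt;td&gt;audit logs&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Kubernetes debugging&lt;/td&gt;
&lt;td&gt;inspect pod failures&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;API debugging&lt;/td&gt;
&lt;td&gt;analyze requests&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Authentication issues&lt;/td&gt;
&lt;td&gt;detect login failures&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;


&lt;h1&gt;
  
  
  Difference Between Metrics And Logs
&lt;/h1&gt;

&lt;h2&gt;
  
  
  Metrics
&lt;/h2&gt;

&lt;p&gt;Metrics answer:&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;WHAT is wrong?
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;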


&lt;p&gt;Example:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;CPU = 95%
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h2&gt;
  
  
  Logs
&lt;/h2&gt;

&lt;p&gt;Logs answer:&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;WHY is it wrong?
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Example:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;database connection timeout
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;




&lt;h1&gt;
  
  
  Example Real Incident
&lt;/h1&gt;

&lt;p&gt;Users say:&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Website slow
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;

&lt;h2&gt;
  
  
  Prometheus shows:
&lt;/h2&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;CPU high
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h2&gt;
  
  
  Loki logs show:
&lt;/h2&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;database timeout
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Root cause found.&lt;/p&gt;

&lt;p&gt;THIS is real SRE workflow.&lt;/p&gt;


&lt;h1&gt;
  
  
  Why Loki Became Popular
&lt;/h1&gt;

&lt;p&gt;Compared to ELK stack:&lt;/p&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;ELK&lt;/th&gt;
&lt;th&gt;Loki&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;Heavy&lt;/td&gt;
&lt;td&gt;Lightweight&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Expensive indexing&lt;/td&gt;
&lt;td&gt;Label-based&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;High RAM&lt;/td&gt;
&lt;td&gt;Lower RAM&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Complex&lt;/td&gt;
&lt;td&gt;Simpler&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Expensive storage&lt;/td&gt;
&lt;td&gt;Cheaper&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;p&gt;Loki indexes labels only.&lt;/p&gt;

&lt;p&gt;VERY important.&lt;/p&gt;


&lt;h1&gt;
  
  
  MOST IMPORTANT LOKI CONCEPT — Labels
&lt;/h1&gt;

&lt;p&gt;Labels organize logs.&lt;/p&gt;

&lt;p&gt;Example:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;job="nginx"
env="prod"
instance="server1"
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Labels make logs searchable.&lt;/p&gt;

&lt;p&gt;Example query:&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;{job="nginx"}
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  VERY IMPORTANT SRE KNOWLEDGE
&lt;/h1&gt;

&lt;p&gt;BAD labels destroy Loki performance.&lt;/p&gt;

&lt;p&gt;NEVER use dynamic labels like:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;request_id&lt;/li&gt;
&lt;li&gt;session_id&lt;/li&gt;
&lt;li&gt;timestamp&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Why?&lt;/p&gt;

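&lt;p&gt;Because every unique combination of label values creates a separate log stream, high-cardinality labels produce millions of tiny streams and a massive index.&lt;/p&gt;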

&lt;p&gt;This is a VERY common interview question.&lt;/p&gt;


&lt;h1&gt;
  
  
  Understanding Your Current Environment
&lt;/h1&gt;

&lt;p&gt;Your Alloy config:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Reads nginx access.log
Reads nginx error.log
Reads syslog
Reads auth.log
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Then forwards to Loki.&lt;/p&gt;

&lt;p&gt;Your Grafana query:&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;{job="nginx_access"}
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;returns nginx traffic logs.&lt;/p&gt;

&lt;p&gt;That means:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;pipeline works&lt;/li&gt;
&lt;li&gt;observability works&lt;/li&gt;
&lt;/ul&gt;


&lt;h1&gt;
  
  
  Understanding Nginx Logs
&lt;/h1&gt;

&lt;p&gt;Example:&lt;br&gt;
&lt;/p&gt;

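&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;"GET / HTTP/1.1" 200
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Part&lt;/th&gt;
&lt;th&gt;Meaning&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;GET&lt;/td&gt;
&lt;td&gt;HTTP method&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;/&lt;/td&gt;
&lt;td&gt;requested page&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;HTTP/1.1&lt;/td&gt;
&lt;td&gt;protocol&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;200&lt;/td&gt;
&lt;td&gt;success&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;curl/8.18.0&lt;/td&gt;
&lt;td&gt;client&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;


&lt;h1&gt;
  
  
  HTTP Status Codes SRE Must Know
&lt;/h1&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Code&lt;/th&gt;
&lt;th&gt;Meaning&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;200&lt;/td&gt;
&lt;td&gt;success&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;301&lt;/td&gt;
&lt;td&gt;redirect&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;403&lt;/td&gt;
&lt;td&gt;forbidden&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;404&lt;/td&gt;
&lt;td&gt;not found&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;500&lt;/td&gt;
&lt;td&gt;internal server error&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;502&lt;/td&gt;
&lt;td&gt;bad gateway&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;503&lt;/td&gt;
&lt;td&gt;service unavailable&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;


&lt;h1&gt;
  
  
  What SRE Engineers Watch In Logs
&lt;/h1&gt;

&lt;h2&gt;
  
  
  1. 500 Errors
&lt;/h2&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;{job="nginx_access"} |~ "5[0-9][0-9]"
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;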


&lt;p&gt;Detect:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;backend crashes&lt;/li&gt;
&lt;li&gt;application failures&lt;/li&gt;
&lt;/ul&gt;


&lt;h2&gt;
  
  
  2. 404 Errors
&lt;/h2&gt;



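&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;{job="nginx_access"} |= "404"
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Detect:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;broken pages&lt;/li&gt;
&lt;li&gt;scanners&lt;/li&gt;
&lt;li&gt;attacks&lt;/li&gt;
&lt;/ul&gt;


&lt;h2&gt;
  
  
  3. Authentication Failures
&lt;/h2&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;{job="auth"} |= "Failed password"
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;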


&lt;p&gt;Detect:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;brute force attacks&lt;/li&gt;
&lt;li&gt;credential failures&lt;/li&gt;
&lt;/ul&gt;


&lt;h2&gt;
  
  
  4. Kubernetes Pod Crashes
&lt;/h2&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;{kubernetes_namespace="prod"} |= "CrashLoopBackOff"
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;




&lt;h2&gt;
  
  
  5. Database Failures
&lt;/h2&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;{job="backend"} |= "connection timeout"
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  What Alloy Can Collect
&lt;/h1&gt;

&lt;p&gt;Alloy can collect:&lt;/p&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Source&lt;/th&gt;
&lt;th&gt;Example&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;Linux logs&lt;/td&gt;
&lt;td&gt;syslog&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Auth logs&lt;/td&gt;
&lt;td&gt;auth.log&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Nginx logs&lt;/td&gt;
&lt;td&gt;access.log&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Docker logs&lt;/td&gt;
&lt;td&gt;containers&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Kubernetes logs&lt;/td&gt;
&lt;td&gt;pods&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Journald&lt;/td&gt;
&lt;td&gt;systemd&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;CloudWatch&lt;/td&gt;
&lt;td&gt;AWS&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;APIs&lt;/td&gt;
&lt;td&gt;telemetry&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;OpenTelemetry&lt;/td&gt;
&lt;td&gt;traces&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;


&lt;h1&gt;
  
  
  Why Alloy Is Important
&lt;/h1&gt;

&lt;p&gt;Before Alloy:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Promtail for logs&lt;/li&gt;
&lt;li&gt;Grafana Agent for metrics&lt;/li&gt;
&lt;li&gt;OpenTelemetry Collector for traces&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Now:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Alloy combines all&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Very important modern observability concept.&lt;/p&gt;


&lt;h1&gt;
  
  
  Difference Between Loki And Prometheus
&lt;/h1&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Loki&lt;/th&gt;
&lt;th&gt;Prometheus&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;logs&lt;/td&gt;
&lt;td&gt;metrics&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;text&lt;/td&gt;
&lt;td&gt;numeric&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;debugging&lt;/td&gt;
&lt;td&gt;monitoring&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;WHY&lt;/td&gt;
&lt;td&gt;WHAT&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;


&lt;h1&gt;
  
  
  Understanding Your Real Pipeline
&lt;/h1&gt;

&lt;p&gt;You successfully built:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;curl request
   ↓
Nginx access.log
   ↓
Alloy reads file
   ↓
Alloy forwards logs
   ↓
Loki stores logs
   ↓
Grafana queries logs
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;This is REAL observability engineering.&lt;/p&gt;


&lt;h1&gt;
  
  
  Real Production SRE Workflow
&lt;/h1&gt;

&lt;h2&gt;
  
  
  Incident Starts
&lt;/h2&gt;

&lt;p&gt;Users complain:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;website slow&lt;/li&gt;
&lt;li&gt;login failing&lt;/li&gt;
&lt;li&gt;API timeout&lt;/li&gt;
&lt;/ul&gt;


&lt;h2&gt;
  
  
  SRE Workflow
&lt;/h2&gt;

&lt;h3&gt;
  
  
  Step 1 — Metrics
&lt;/h3&gt;

&lt;p&gt;Check:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;CPU&lt;/li&gt;
&lt;li&gt;memory&lt;/li&gt;
&lt;li&gt;disk&lt;/li&gt;
&lt;li&gt;latency&lt;/li&gt;
&lt;/ul&gt;


&lt;h3&gt;
  
  
  Step 2 — Logs
&lt;/h3&gt;

&lt;p&gt;Search:&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;{job="nginx_access"} |= "500"
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  Step 3 — Correlate
&lt;/h3&gt;

&lt;p&gt;Search backend:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;{job="backend"} |= "database timeout"
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;




&lt;h3&gt;
  
  
  Step 4 — Root Cause
&lt;/h3&gt;

&lt;p&gt;Database overloaded.&lt;/p&gt;


&lt;h1&gt;
  
  
  MOST IMPORTANT SRE SKILL
&lt;/h1&gt;

&lt;p&gt;Correlation.&lt;/p&gt;

&lt;p&gt;Example:&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;High CPU
+
Timeout logs
+
Restart events
=
Root cause
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  FULL HANDS-ON LABS
&lt;/h1&gt;
&lt;h1&gt;
  
  
  LAB 1 — Generate Traffic
&lt;/h1&gt;

&lt;p&gt;Run:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;for i in {1..100}
do
curl localhost
done
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Observe:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;access logs&lt;/li&gt;
&lt;li&gt;Loki queries&lt;/li&gt;
&lt;li&gt;Grafana live logs&lt;/li&gt;
&lt;/ul&gt;


&lt;h1&gt;
  
  
  LAB 2 — Find Traffic
&lt;/h1&gt;

&lt;p&gt;Query:&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;{job="nginx_access"}
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Understand:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;requests&lt;/li&gt;
&lt;li&gt;clients&lt;/li&gt;
&lt;li&gt;timestamps&lt;/li&gt;
&lt;/ul&gt;


&lt;h1&gt;
  
  
  LAB 3 — Generate 404 Errors
&lt;/h1&gt;

&lt;p&gt;Run:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;curl localhost/fakepage
curl localhost/admin
curl localhost/test
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Now query:&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;{job="nginx_access"} |= "404"
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Understand:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;broken URLs&lt;/li&gt;
&lt;li&gt;scanners&lt;/li&gt;
&lt;li&gt;attack attempts&lt;/li&gt;
&lt;/ul&gt;


&lt;h1&gt;
  
  
  LAB 4 — Simulate Attack Traffic
&lt;/h1&gt;

&lt;p&gt;Run:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;for i in {1..500}
do
curl localhost/login
done
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Query:&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;{job="nginx_access"} |= "/login"
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Understand:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;brute force detection&lt;/li&gt;
&lt;li&gt;traffic spikes&lt;/li&gt;
&lt;/ul&gt;


&lt;h1&gt;
  
  
  LAB 5 — Live Log Streaming
&lt;/h1&gt;

&lt;p&gt;In Grafana:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;click LIVE&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Run:&lt;br&gt;
&lt;/p&gt;

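&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;curl localhost
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Watch logs appear live.&lt;/p&gt;

&lt;p&gt;Production use:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;deployment monitoring&lt;/li&gt;
&lt;li&gt;live incident debugging&lt;/li&gt;
&lt;/ul&gt;


&lt;h1&gt;
  
  
  LAB 6 — Break Nginx
&lt;/h1&gt;

&lt;p&gt;Stop nginx:&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;sudo systemctl stop nginx
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;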


&lt;p&gt;Now:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;website unavailable&lt;/li&gt;
&lt;li&gt;curl fails&lt;/li&gt;
&lt;li&gt;logs stop&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Observe:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;metrics&lt;/li&gt;
&lt;li&gt;logs&lt;/li&gt;
&lt;li&gt;alerts&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Restart:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;sudo systemctl start nginx
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;




&lt;h1&gt;
  
  
  LAB 7 — Watch Authentication Logs
&lt;/h1&gt;

&lt;p&gt;Attempt SSH login failures.&lt;/p&gt;

&lt;p&gt;Then query:&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;{job="auth"}
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Search:&lt;br&gt;
&lt;/p&gt;

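&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;{job="auth"} |= "Failed password"
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Understand:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;security monitoring&lt;/li&gt;
&lt;li&gt;intrusion attempts&lt;/li&gt;
&lt;/ul&gt;


&lt;h1&gt;
  
  
  LAB 8 — Monitor System Logs
&lt;/h1&gt;

&lt;p&gt;Query:&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;{job="syslog"}
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;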


&lt;p&gt;Observe:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;services&lt;/li&gt;
&lt;li&gt;system events&lt;/li&gt;
&lt;li&gt;daemon activity&lt;/li&gt;
&lt;/ul&gt;


&lt;h1&gt;
  
  
  LAB 9 — Create Dashboard
&lt;/h1&gt;

&lt;p&gt;Create panels for:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;request count&lt;/li&gt;
&lt;li&gt;404 count&lt;/li&gt;
&lt;li&gt;login failures&lt;/li&gt;
&lt;li&gt;live logs&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;This is real observability dashboarding.&lt;/p&gt;


&lt;h1&gt;
  
  
  LAB 10 — Correlate Metrics + Logs
&lt;/h1&gt;

&lt;p&gt;Run stress:&lt;br&gt;
&lt;/p&gt;

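&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;stress --cpu 2 --timeout 300
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Check:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Prometheus CPU metrics&lt;/li&gt;
&lt;li&gt;Loki logs&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Understand:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;metric/log correlation&lt;/li&gt;
&lt;/ul&gt;


&lt;h1&gt;
  
  
  Common Production Problems
&lt;/h1&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Problem&lt;/th&gt;
&lt;th&gt;Cause&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;No logs&lt;/td&gt;
&lt;td&gt;Alloy stopped&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Missing labels&lt;/td&gt;
&lt;td&gt;bad config&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Empty Grafana&lt;/td&gt;
&lt;td&gt;wrong query&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;High Loki storage&lt;/td&gt;
&lt;td&gt;too many logs&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Slow queries&lt;/td&gt;
&lt;td&gt;bad labels&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Missing nginx logs&lt;/td&gt;
&lt;td&gt;wrong file path&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Duplicate logs&lt;/td&gt;
&lt;td&gt;multiple collectors&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;


&lt;h1&gt;
  
  
  What You Must Know For Interviews
&lt;/h1&gt;

&lt;h2&gt;
  
  
  Loki
&lt;/h2&gt;

&lt;ul&gt;
&lt;li&gt;labels&lt;/li&gt;
&lt;li&gt;LogQL&lt;/li&gt;
&lt;li&gt;centralized logging&lt;/li&gt;
&lt;li&gt;storage&lt;/li&gt;
&lt;li&gt;retention&lt;/li&gt;
&lt;li&gt;troubleshooting&lt;/li&gt;
&lt;/ul&gt;

&lt;h2&gt;
  
  
  Alloy
&lt;/h2&gt;

&lt;ul&gt;
&lt;li&gt;collectors&lt;/li&gt;
&lt;li&gt;pipelines&lt;/li&gt;
&lt;li&gt;log shipping&lt;/li&gt;
&lt;li&gt;OpenTelemetry&lt;/li&gt;
&lt;li&gt;forwarding&lt;/li&gt;
&lt;/ul&gt;

&lt;h2&gt;
  
  
  SRE Concepts
&lt;/h2&gt;

&lt;ul&gt;
&lt;li&gt;observability&lt;/li&gt;
&lt;li&gt;root cause analysis&lt;/li&gt;
&lt;li&gt;metrics vs logs&lt;/li&gt;
&lt;li&gt;incident response&lt;/li&gt;
&lt;li&gt;correlation&lt;/li&gt;
&lt;/ul&gt;


&lt;h1&gt;
  
  
  VERY IMPORTANT INTERVIEW QUESTIONS
&lt;/h1&gt;

&lt;h2&gt;
  
  
  Why Loki instead of ELK?
&lt;/h2&gt;

&lt;p&gt;Answer:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;cheaper&lt;/li&gt;
&lt;li&gt;lightweight&lt;/li&gt;
&lt;li&gt;label indexing&lt;/li&gt;
&lt;li&gt;easier scaling&lt;/li&gt;
&lt;/ul&gt;


&lt;h2&gt;
  
  
  What does Alloy do?
&lt;/h2&gt;

&lt;p&gt;Answer:&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Collects and forwards telemetry:
- logs
- metrics
- traces
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;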






&lt;h2&gt;
  
  
  Difference between Alloy and Loki?
&lt;/h2&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Alloy&lt;/th&gt;
&lt;th&gt;Loki&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;collector&lt;/td&gt;
&lt;td&gt;storage&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;ships logs&lt;/td&gt;
&lt;td&gt;stores logs&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;reads files&lt;/td&gt;
&lt;td&gt;indexes logs&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;




&lt;h1&gt;
  
  
  FINAL UNDERSTANDING
&lt;/h1&gt;

&lt;p&gt;You successfully built a REAL observability system used by:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;DevOps engineers&lt;/li&gt;
&lt;li&gt;SRE engineers&lt;/li&gt;
&lt;li&gt;platform engineers&lt;/li&gt;
&lt;li&gt;cloud engineers&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;This is NOT beginner work anymore.&lt;/p&gt;

&lt;p&gt;You are now doing:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;centralized logging&lt;/li&gt;
&lt;li&gt;observability engineering&lt;/li&gt;
&lt;li&gt;incident investigation&lt;/li&gt;
&lt;li&gt;production troubleshooting&lt;/li&gt;
&lt;li&gt;log analytics&lt;/li&gt;
&lt;li&gt;root cause analysis&lt;/li&gt;
&lt;/ul&gt;

</description>
    </item>
    <item>
      <title>Production-Level SRE Lab — Prometheus + Grafana Incident Simulation</title>
      <dc:creator>Aisalkyn Aidarova</dc:creator>
      <pubDate>Wed, 20 May 2026 13:48:30 +0000</pubDate>
      <link>https://dev.to/jumptotech/production-level-sre-lab-prometheus-grafana-incident-simulation-3eo4</link>
      <guid>https://dev.to/jumptotech/production-level-sre-lab-prometheus-grafana-incident-simulation-3eo4</guid>
      <description>&lt;h2&gt;
  
  
  Scenario
&lt;/h2&gt;

&lt;p&gt;You are Senior SRE on-call.&lt;/p&gt;

&lt;p&gt;Production users report:&lt;br&gt;
&lt;/p&gt;

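&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Application is slow and timing out
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Your mission:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;detect issue&lt;/li&gt;
&lt;li&gt;investigate metrics&lt;/li&gt;
&lt;li&gt;identify bottleneck&lt;/li&gt;
&lt;li&gt;create dashboards&lt;/li&gt;
&lt;li&gt;analyze telemetry&lt;/li&gt;
&lt;li&gt;verify recovery&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;This lab simulates REAL production troubleshooting.&lt;/p&gt;


&lt;h1&gt;
  
  
  Architecture
&lt;/h1&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Linux EC2
   ↓
Node Exporter
   ↓
Prometheus
   ↓
Grafana
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;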



&lt;h1&gt;
  
  
  Skills Practiced
&lt;/h1&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Skill&lt;/th&gt;
&lt;th&gt;Production Relevance&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;PromQL&lt;/td&gt;
&lt;td&gt;real incident debugging&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Grafana dashboards&lt;/td&gt;
&lt;td&gt;observability&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Linux telemetry&lt;/td&gt;
&lt;td&gt;bottleneck analysis&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;CPU saturation&lt;/td&gt;
&lt;td&gt;scaling decisions&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;memory pressure&lt;/td&gt;
&lt;td&gt;OOM prevention&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;disk saturation&lt;/td&gt;
&lt;td&gt;outage prevention&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;alerting&lt;/td&gt;
&lt;td&gt;incident response&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;exporter failures&lt;/td&gt;
&lt;td&gt;monitoring reliability&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;


&lt;h1&gt;
  
  
  Phase 1 — Validate Monitoring Stack
&lt;/h1&gt;
&lt;h2&gt;
  
  
  Step 1 — Check Targets
&lt;/h2&gt;

&lt;p&gt;Open:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```text id="jlwmy2"&lt;br&gt;
&lt;a href="http://YOUR_IP:9090/targets" rel="noopener noreferrer"&gt;http://YOUR_IP:9090/targets&lt;/a&gt;&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


Expected:

* prometheus = UP
* node_exporter = UP

Production importance:

* monitoring itself must be healthy

---

# Step 2 — Verify Exporter Metrics

Run:



```bash id="jlwmg0"
curl localhost:9100/metrics | grep node_cpu
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Production importance:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;validates exporter telemetry&lt;/li&gt;
&lt;li&gt;validates scrape endpoint&lt;/li&gt;
&lt;/ul&gt;


&lt;h1&gt;
  
  
  Phase 2 — Build Production Dashboard Mentality
&lt;/h1&gt;

&lt;p&gt;Open Grafana:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```text id="jlwmdw"&lt;br&gt;
&lt;a href="http://YOUR_IP:3000" rel="noopener noreferrer"&gt;http://YOUR_IP:3000&lt;/a&gt;&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


Open:



```text id="jlwmt6"
Node Exporter Full
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Focus ONLY on:&lt;/p&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Panel&lt;/th&gt;
&lt;th&gt;Why Important&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;CPU Busy&lt;/td&gt;
&lt;td&gt;saturation&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Sys Load&lt;/td&gt;
&lt;td&gt;scheduler contention&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;CPU Pressure&lt;/td&gt;
&lt;td&gt;waiting tasks&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;RAM Used&lt;/td&gt;
&lt;td&gt;memory exhaustion&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Swap Used&lt;/td&gt;
&lt;td&gt;memory pressure&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Root FS Used&lt;/td&gt;
&lt;td&gt;disk exhaustion&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;IOwait&lt;/td&gt;
&lt;td&gt;storage bottlenecks&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Network Traffic&lt;/td&gt;
&lt;td&gt;traffic spikes&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;


&lt;h1&gt;
  
  
  Phase 3 — Production CPU Incident
&lt;/h1&gt;
&lt;h2&gt;
  
  
  Scenario
&lt;/h2&gt;

&lt;p&gt;Users report:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```text id="jlwmdn"&lt;br&gt;
API latency extremely high&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


---

# Step 1 — Simulate CPU Saturation

Run:



```bash id="jlwmu6"
stress --cpu 2 --timeout 300
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;This fully loads your:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;2 vCPUs&lt;/li&gt;
&lt;/ul&gt;


&lt;h1&gt;
  
  
  Step 2 — Observe Grafana
&lt;/h1&gt;

&lt;p&gt;Watch:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;CPU Busy&lt;/li&gt;
&lt;li&gt;Sys Load&lt;/li&gt;
&lt;li&gt;CPU Pressure&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Expected:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;CPU Busy near 100%&lt;/li&gt;
&lt;li&gt;load increases&lt;/li&gt;
&lt;li&gt;pressure rises&lt;/li&gt;
&lt;/ul&gt;


&lt;h1&gt;
  
  
  Step 3 — Investigate with PromQL
&lt;/h1&gt;

&lt;p&gt;Open Prometheus:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```text id="jlwmgj"&lt;br&gt;
&lt;a href="http://YOUR_IP:9090" rel="noopener noreferrer"&gt;http://YOUR_IP:9090&lt;/a&gt;&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


Run:



```text id="jlwmt9"
100 - (avg by(instance)(rate(node_cpu_seconds_total{mode="idle"}[1m])) * 100)
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Observe:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;CPU approaching 100%&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Production importance:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;validates saturation&lt;/li&gt;
&lt;li&gt;confirms resource bottleneck&lt;/li&gt;
&lt;/ul&gt;


&lt;h1&gt;
  
  
  Step 4 — Linux Investigation
&lt;/h1&gt;

&lt;p&gt;Run:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```bash id="jlwmev"&lt;br&gt;
top&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


Questions:

* Which process consumes CPU?
* Is load average high?
* How much idle remains?

Exit:



```text id="jlwms5"
q
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  Step 5 — Advanced Analysis
&lt;/h1&gt;

&lt;p&gt;Run:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```bash id="jlwmyq"&lt;br&gt;
uptime&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


Example:



```text id="jlwmy1"
load average: 4.5, 3.9, 2.1
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Your instance:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;2 vCPUs&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Interpretation:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;load &amp;gt; CPU count&lt;/li&gt;
&lt;li&gt;tasks waiting&lt;/li&gt;
&lt;li&gt;system saturated&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Senior SRE concept:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```text id="jlwmsd"&lt;br&gt;
load average measures runnable/waiting tasks&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


---

# Phase 4 — Memory Leak Incident

## Scenario

Application becomes slow after deployment.

Possible memory leak.

---

# Step 1 — Simulate Memory Pressure

Run:



```bash id="jlwmy0"
stress --vm 1 --vm-bytes 300M --timeout 300
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  Step 2 — Observe Grafana
&lt;/h1&gt;

&lt;p&gt;Watch:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;RAM Used&lt;/li&gt;
&lt;li&gt;Memory Pressure&lt;/li&gt;
&lt;li&gt;Swap Used&lt;/li&gt;
&lt;/ul&gt;


&lt;h1&gt;
  
  
  Step 3 — Check OOM Events
&lt;/h1&gt;

&lt;p&gt;Run:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```bash id="jlwmi7"&lt;br&gt;
dmesg | grep -i oom&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


Production importance:

* detects memory exhaustion
* confirms kernel intervention

---

# Step 4 — Investigate Memory Consumers

Run:



```bash id="jlwmyf"
htop
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Observe:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;memory-heavy processes&lt;/li&gt;
&lt;li&gt;swap activity&lt;/li&gt;
&lt;li&gt;CPU behavior&lt;/li&gt;
&lt;/ul&gt;


&lt;h1&gt;
  
  
  Phase 5 — Disk Saturation Incident
&lt;/h1&gt;
&lt;h2&gt;
  
  
  Scenario
&lt;/h2&gt;

&lt;p&gt;Monitoring suddenly stops storing data.&lt;/p&gt;


&lt;h1&gt;
  
  
  Step 1 — Simulate Disk Consumption
&lt;/h1&gt;

&lt;p&gt;Run:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```bash id="jlwmu3"&lt;br&gt;
fallocate -l 2G incidentfile&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


---

# Step 2 — Observe Grafana

Watch:

* Root FS Used
* filesystem graphs

---

# Step 3 — Investigate Disk

Run:



```bash id="jlwmsu"
df -h
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Then:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```bash id="jlwmu4"&lt;br&gt;
sudo du -sh /var/lib/prometheus&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


Production importance:

* Prometheus itself consumes disk
* metrics retention matters

---

# Step 4 — Cleanup

Run:



```bash id="jlwmbm"
rm -f incidentfile
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Observe recovery.&lt;/p&gt;


&lt;h1&gt;
  
  
  Phase 6 — Monitoring Failure Incident
&lt;/h1&gt;
&lt;h2&gt;
  
  
  Scenario
&lt;/h2&gt;

&lt;p&gt;Dashboards suddenly show:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```text id="jlwmd8"&lt;br&gt;
No Data&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


---

# Step 1 — Simulate Exporter Failure

Run:



```bash id="jlwml9"
sudo systemctl stop node_exporter
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  Step 2 — Observe
&lt;/h1&gt;

&lt;p&gt;Prometheus Targets:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```text id="jlwmy9"&lt;br&gt;
DOWN&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


Grafana:

* panels fail
* gaps appear

Production importance:

* monitoring outages
* exporter failures
* telemetry gaps

---

# Step 3 — Recovery

Run:



```bash id="jlwmgq"
sudo systemctl start node_exporter
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Observe:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;recovery&lt;/li&gt;
&lt;li&gt;targets UP&lt;/li&gt;
&lt;/ul&gt;


&lt;h1&gt;
  
  
  Phase 7 — Production Alerting
&lt;/h1&gt;
&lt;h2&gt;
  
  
  Create CPU Alert
&lt;/h2&gt;

&lt;p&gt;Create:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```bash id="jlwmu0"&lt;br&gt;
sudo nano /etc/prometheus/alert.rules.yml&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


Add:



```yaml id="jlwms7"
groups:
- name: sre-alerts
  rules:
  - alert: HighCPUUsage
    expr: 100 - (avg by(instance)(rate(node_cpu_seconds_total{mode="idle"}[1m])) * 100) &amp;gt; 70
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: High CPU Usage
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  Step 2 — Add Rule to Prometheus
&lt;/h1&gt;

&lt;p&gt;Edit:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```bash id="jlwmt2"&lt;br&gt;
sudo nano /etc/prometheus/prometheus.yml&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


Add:



```yaml id="jlwmu7"
rule_files:
  - "alert.rules.yml"
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h1&gt;
  
  
  Step 3 — Restart Prometheus
&lt;/h1&gt;



&lt;p&gt;```bash id="jlwmbt"&lt;br&gt;
sudo systemctl restart prometheus&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


---

# Step 4 — Trigger Alert

Run CPU stress again.

Observe:

* alert fires

Production importance:

* incident detection
* automated monitoring

---

# Phase 8 — Senior SRE RCA

Now perform Root Cause Analysis.

Questions:

1. What caused CPU saturation?
2. Did load average confirm contention?
3. Did pressure metrics increase?
4. Was swap used?
5. Was disk healthy?
6. Did Prometheus capture telemetry correctly?
7. Did Grafana visualize incident correctly?
8. Which process caused issue?
9. How would you scale production?
10. Would horizontal or vertical scaling help?

---

# Phase 9 — Real Production Thinking

## What Senior SRE Engineers Actually Analyze

Not just:



```text id="jlwmu2"
CPU %
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;But:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;load&lt;/li&gt;
&lt;li&gt;pressure&lt;/li&gt;
&lt;li&gt;iowait&lt;/li&gt;
&lt;li&gt;steal time&lt;/li&gt;
&lt;li&gt;saturation&lt;/li&gt;
&lt;li&gt;latency&lt;/li&gt;
&lt;li&gt;telemetry gaps&lt;/li&gt;
&lt;li&gt;retention&lt;/li&gt;
&lt;li&gt;cardinality&lt;/li&gt;
&lt;li&gt;alert fatigue&lt;/li&gt;
&lt;/ul&gt;




&lt;h1&gt;
  
  
  Phase 10 — Final Architecture Understanding
&lt;/h1&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Linux Kernel
   ↓
Node Exporter
   ↓
Prometheus Scraping
   ↓
Prometheus TSDB
   ↓
Grafana Visualization
   ↓
Alerting
   ↓
Incident Response
   ↓
SRE Investigation
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;This is extremely close to real production observability engineering used by Senior SRE teams.&lt;/p&gt;

</description>
    </item>
    <item>
      <title>Lab 2 — Your First Grafana Dashboard (Built from Scratch)</title>
      <dc:creator>Aisalkyn Aidarova</dc:creator>
      <pubDate>Tue, 19 May 2026 01:34:44 +0000</pubDate>
      <link>https://dev.to/jumptotech/lab-2-your-first-grafana-dashboard-built-from-scratch-a3b</link>
      <guid>https://dev.to/jumptotech/lab-2-your-first-grafana-dashboard-built-from-scratch-a3b</guid>
      <description>&lt;h2&gt;
  
  
  What you will build
&lt;/h2&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Panel&lt;/th&gt;
&lt;th&gt;Type&lt;/th&gt;
&lt;th&gt;What it measures&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;CPU usage %&lt;/td&gt;
&lt;td&gt;Time series&lt;/td&gt;
&lt;td&gt;How hard your cores are working&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;System load (1m/5m/15m)&lt;/td&gt;
&lt;td&gt;Time series&lt;/td&gt;
&lt;td&gt;Load average trend&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Memory used vs available&lt;/td&gt;
&lt;td&gt;Time series&lt;/td&gt;
&lt;td&gt;RAM consumption&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Disk usage %&lt;/td&gt;
&lt;td&gt;Gauge&lt;/td&gt;
&lt;td&gt;How full your root partition is&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Network traffic (in/out)&lt;/td&gt;
&lt;td&gt;Time series&lt;/td&gt;
&lt;td&gt;Bytes per second on eth0&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Key metrics snapshot&lt;/td&gt;
&lt;td&gt;Table&lt;/td&gt;
&lt;td&gt;Current values side by side&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;




&lt;h2&gt;
  
  
  Part 1 — Connect Grafana to Prometheus
&lt;/h2&gt;

&lt;ol&gt;
&lt;li&gt;Open Grafana: &lt;code&gt;http://&amp;lt;EC2-PUBLIC-IP&amp;gt;:3000&lt;/code&gt; (default login: &lt;code&gt;admin / admin&lt;/code&gt;)&lt;/li&gt;
&lt;li&gt;Go to &lt;strong&gt;Connections → Data sources → Add data source&lt;/strong&gt;
&lt;/li&gt;
&lt;li&gt;Select &lt;strong&gt;Prometheus&lt;/strong&gt;
&lt;/li&gt;
&lt;li&gt;Set URL to &lt;code&gt;http://localhost:9090&lt;/code&gt;
&lt;/li&gt;
&lt;li&gt;Click &lt;strong&gt;Save &amp;amp; test&lt;/strong&gt; — you should see a green success message&lt;/li&gt;
&lt;/ol&gt;

&lt;blockquote&gt;
&lt;p&gt;Why &lt;code&gt;localhost&lt;/code&gt;? Grafana and Prometheus live on the same machine. Using localhost avoids exposing Prometheus to the internet.&lt;/p&gt;
&lt;/blockquote&gt;




&lt;h2&gt;
  
  
  Part 2 — Panel 1: CPU Usage %
&lt;/h2&gt;

&lt;p&gt;&lt;strong&gt;Visualization:&lt;/strong&gt; Time series&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;&lt;strong&gt;Settings:&lt;/strong&gt; Unit → &lt;code&gt;Percent (0-100)&lt;/code&gt; · Min &lt;code&gt;0&lt;/code&gt; · Max &lt;code&gt;100&lt;/code&gt; · Threshold warning at &lt;code&gt;70&lt;/code&gt;, critical at &lt;code&gt;90&lt;/code&gt;&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Verify:&lt;/strong&gt; Run &lt;code&gt;stress-ng --cpu 2 --timeout 60s&lt;/code&gt; and watch the panel spike.&lt;/p&gt;




&lt;h2&gt;
  
  
  Part 3 — Panel 2: System Load Average
&lt;/h2&gt;

&lt;p&gt;&lt;strong&gt;Visualization:&lt;/strong&gt; Time series — add 3 queries:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;node_load1    # legend: 1m load
node_load5    # legend: 5m load
node_load15   # legend: 15m load
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;&lt;strong&gt;Settings:&lt;/strong&gt; Unit → &lt;code&gt;Short&lt;/code&gt; · Add a threshold line at your core count (e.g. 2 for t3.micro)&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;Load above core count = system is struggling to keep up.&lt;/p&gt;
&lt;/blockquote&gt;




&lt;h2&gt;
  
  
  Part 4 — Panel 3: Memory Usage
&lt;/h2&gt;

&lt;p&gt;&lt;strong&gt;Visualization:&lt;/strong&gt; Time series — add 2 queries:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes   # legend: Used
node_memory_MemAvailable_bytes                                 # legend: Available
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;&lt;strong&gt;Settings:&lt;/strong&gt; Unit → &lt;code&gt;bytes(IEC)&lt;/code&gt; · Enable &lt;strong&gt;Stack series&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Verify:&lt;/strong&gt; &lt;code&gt;stress-ng --vm 1 --vm-bytes 400M --timeout 60s&lt;/code&gt;&lt;/p&gt;




&lt;h2&gt;
  
  
  Part 5 — Panel 4: Disk Usage (Gauge)
&lt;/h2&gt;

&lt;p&gt;&lt;strong&gt;Visualization:&lt;/strong&gt; Gauge&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;100 - ((node_filesystem_avail_bytes{mountpoint="/",fstype!="tmpfs"} 
      / node_filesystem_size_bytes{mountpoint="/",fstype!="tmpfs"}) * 100)
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;&lt;strong&gt;Settings:&lt;/strong&gt; Unit → &lt;code&gt;Percent (0-100)&lt;/code&gt; · Thresholds: &lt;code&gt;0&lt;/code&gt; green → &lt;code&gt;70&lt;/code&gt; yellow → &lt;code&gt;85&lt;/code&gt; red&lt;/p&gt;




&lt;h2&gt;
  
  
  Part 6 — Panel 5: Network Traffic
&lt;/h2&gt;

&lt;p&gt;&lt;strong&gt;Visualization:&lt;/strong&gt; Time series — add 2 queries:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;rate(node_network_receive_bytes_total{device="eth0"}[5m])    # legend: In
rate(node_network_transmit_bytes_total{device="eth0"}[5m])   # legend: Out
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;&lt;strong&gt;Settings:&lt;/strong&gt; Unit → &lt;code&gt;bytes/sec(IEC)&lt;/code&gt;&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;Some instances name the network interface &lt;code&gt;ens5&lt;/code&gt; instead of &lt;code&gt;eth0&lt;/code&gt;. To find yours, run &lt;code&gt;node_network_receive_bytes_total&lt;/code&gt; in the Prometheus UI, look at the &lt;code&gt;device&lt;/code&gt; label, and use that value in both queries above.&lt;/p&gt;
&lt;/blockquote&gt;




&lt;h2&gt;
  
  
  Part 7 — Panel 6: Key Metrics Table
&lt;/h2&gt;

&lt;p&gt;&lt;strong&gt;Visualization:&lt;/strong&gt; Table — set each query to &lt;strong&gt;Instant&lt;/strong&gt;, add Reduce transformation → Last&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)          # CPU %
node_load1                                                                   # Load 1m
(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) 
  / node_memory_MemTotal_bytes * 100                                        # Memory %
100 - ((node_filesystem_avail_bytes{mountpoint="/"} 
      / node_filesystem_size_bytes{mountpoint="/"}) * 100)                 # Disk %
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h2&gt;
  
  
  Part 8 — Save and stress test
&lt;/h2&gt;

&lt;p&gt;Save the dashboard as &lt;strong&gt;EC2 System Health&lt;/strong&gt;. Set auto-refresh to &lt;code&gt;30s&lt;/code&gt;.&lt;/p&gt;

&lt;p&gt;Then run all three stressors simultaneously:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;stress-ng &lt;span class="nt"&gt;--cpu&lt;/span&gt; 2 &lt;span class="nt"&gt;--timeout&lt;/span&gt; 120s &amp;amp;
stress-ng &lt;span class="nt"&gt;--vm&lt;/span&gt; 1 &lt;span class="nt"&gt;--vm-bytes&lt;/span&gt; 500M &lt;span class="nt"&gt;--timeout&lt;/span&gt; 120s &amp;amp;
stress-ng &lt;span class="nt"&gt;--io&lt;/span&gt; 2 &lt;span class="nt"&gt;--timeout&lt;/span&gt; 120s
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Watch all panels respond in real time. After stress ends, verify metrics return to baseline.&lt;/p&gt;




&lt;h2&gt;
  
  
  Checkpoint questions
&lt;/h2&gt;

&lt;p&gt;Answer before moving to Lab 3:&lt;/p&gt;

&lt;ol&gt;
&lt;li&gt;Why use &lt;code&gt;rate()&lt;/code&gt; on &lt;code&gt;node_cpu_seconds_total&lt;/code&gt; instead of the raw value?&lt;/li&gt;
&lt;li&gt;What does load average &lt;code&gt;4.0&lt;/code&gt; mean on a 2-core machine?&lt;/li&gt;
&lt;li&gt;Why does the disk query use &lt;code&gt;avail_bytes&lt;/code&gt; not &lt;code&gt;free_bytes&lt;/code&gt;?&lt;/li&gt;
&lt;li&gt;What changes if you switch &lt;code&gt;[5m]&lt;/code&gt; to &lt;code&gt;[1m]&lt;/code&gt; in the CPU query?&lt;/li&gt;
&lt;li&gt;Why is &lt;code&gt;avg by(instance)&lt;/code&gt; important in a multi-host setup?&lt;/li&gt;
&lt;/ol&gt;

</description>
    </item>
    <item>
      <title>Advanced SRE Lab — Prometheus + Grafana + Node Exporter (6 Years Experience Level)</title>
      <dc:creator>Aisalkyn Aidarova</dc:creator>
      <pubDate>Mon, 18 May 2026 06:15:25 +0000</pubDate>
      <link>https://dev.to/jumptotech/advanced-sre-lab-prometheus-grafana-node-exporter-6-years-experience-level-42lc</link>
      <guid>https://dev.to/jumptotech/advanced-sre-lab-prometheus-grafana-node-exporter-6-years-experience-level-42lc</guid>
      <description>&lt;p&gt;Goal:&lt;br&gt;
Learn monitoring and observability like a real Senior SRE.&lt;/p&gt;

&lt;p&gt;You will simulate:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;production incidents&lt;/li&gt;
&lt;li&gt;CPU bottlenecks&lt;/li&gt;
&lt;li&gt;memory pressure&lt;/li&gt;
&lt;li&gt;disk saturation&lt;/li&gt;
&lt;li&gt;exporter failures&lt;/li&gt;
&lt;li&gt;alerting&lt;/li&gt;
&lt;li&gt;troubleshooting&lt;/li&gt;
&lt;li&gt;PromQL analysis&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Architecture:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Linux EC2
   ↓
Node Exporter
   ↓
Prometheus
   ↓
Grafana
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  Scenario
&lt;/h1&gt;

&lt;p&gt;You are Senior SRE on-call.&lt;/p&gt;

&lt;p&gt;Production application is slow.&lt;/p&gt;

&lt;p&gt;Your job:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;detect issue&lt;/li&gt;
&lt;li&gt;analyze metrics&lt;/li&gt;
&lt;li&gt;identify bottleneck&lt;/li&gt;
&lt;li&gt;troubleshoot&lt;/li&gt;
&lt;li&gt;recover service&lt;/li&gt;
&lt;/ul&gt;


&lt;h1&gt;
  
  
  Phase 1 — Verify Monitoring Stack
&lt;/h1&gt;
&lt;h2&gt;
  
  
  Task 1 — Verify Node Exporter
&lt;/h2&gt;

&lt;p&gt;Run:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;curl localhost:9100/metrics | head
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Questions:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;What does exporter expose?&lt;/li&gt;
&lt;li&gt;Is it storing data?&lt;/li&gt;
&lt;li&gt;Why plaintext metrics?&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Expected understanding:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;exporter only exposes metrics endpoint&lt;/li&gt;
&lt;li&gt;stateless component&lt;/li&gt;
&lt;li&gt;lightweight collector&lt;/li&gt;
&lt;/ul&gt;


&lt;h1&gt;
  
  
  Task 2 — Verify Prometheus Scraping
&lt;/h1&gt;

&lt;p&gt;Open:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```text id="jlwm47"&lt;br&gt;
&lt;a href="http://YOUR_IP:9090/targets" rel="noopener noreferrer"&gt;http://YOUR_IP:9090/targets&lt;/a&gt;&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


Check:

* prometheus → UP
* node_exporter → UP

Questions:

* Why pull model instead of push?
* What happens if exporter dies?
* What happens if Prometheus dies?

---

# Task 3 — Prometheus Queries

Open:



```text id="jlwm8v"
http://YOUR_IP:9090
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Run:&lt;/p&gt;
&lt;h2&gt;
  
  
  CPU Utilization
&lt;/h2&gt;



&lt;p&gt;```text id="jlwm4t"&lt;br&gt;
100 - (avg by(instance)(rate(node_cpu_seconds_total{mode="idle"}[1m])) * 100)&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


## Memory Available



```text id="jlwmx7"
node_memory_MemAvailable_bytes
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;

&lt;h2&gt;
  
  
  Filesystem Usage
&lt;/h2&gt;



&lt;p&gt;```text id="jlwm4j"&lt;br&gt;
100 - ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes)&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


Questions:

* Why rate() used?
* Why idle mode subtracted?
* Why time-series important?

---

# Phase 2 — Production Incident Simulation

# Incident 1 — CPU Saturation

## Task

Run:



```bash id="jlwm0m"
stress --cpu 2 --timeout 180
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Observe in Grafana:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;CPU Busy&lt;/li&gt;
&lt;li&gt;Sys Load&lt;/li&gt;
&lt;li&gt;CPU Pressure&lt;/li&gt;
&lt;li&gt;Idle CPU&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Questions:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Difference between CPU Busy vs Load?&lt;/li&gt;
&lt;li&gt;Why pressure matters?&lt;/li&gt;
&lt;li&gt;When does scaling become necessary?&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Expected Senior SRE Analysis:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;sustained high CPU&lt;/li&gt;
&lt;li&gt;rising load average&lt;/li&gt;
&lt;li&gt;scheduler contention&lt;/li&gt;
&lt;li&gt;resource saturation&lt;/li&gt;
&lt;/ul&gt;


&lt;h1&gt;
  
  
  Incident 2 — Memory Pressure
&lt;/h1&gt;

&lt;p&gt;Run:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```bash id="jlwmt9"&lt;br&gt;
stress --vm 1 --vm-bytes 300M --timeout 180&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


Observe:

* RAM Used
* Memory Pressure
* Swap Used

Then:



```bash id="jlwme5"
dmesg | grep -i oom
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Questions:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;What triggers OOM killer?&lt;/li&gt;
&lt;li&gt;Difference between cache vs real memory exhaustion?&lt;/li&gt;
&lt;li&gt;Why swap dangerous for latency-sensitive apps?&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Expected Senior SRE Analysis:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;memory pressure impacts performance before OOM&lt;/li&gt;
&lt;li&gt;swap causes latency spikes&lt;/li&gt;
&lt;li&gt;memory leaks vs burst usage&lt;/li&gt;
&lt;/ul&gt;


&lt;h1&gt;
  
  
  Incident 3 — Disk Saturation
&lt;/h1&gt;

&lt;p&gt;Create large files:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```bash id="jlwmyk"&lt;br&gt;
fallocate -l 2G incidentfile&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


Observe:

* Root FS Used
* Disk IO
* Filesystem metrics

Questions:

* What happens if Prometheus disk fills?
* Why monitoring systems themselves need monitoring?
* How retention policies work?

---

# Phase 3 — Exporter Failure

## Task

Stop exporter:



```bash id="jlwm2y"
sudo systemctl stop node_exporter
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Observe:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Prometheus Targets → DOWN&lt;/li&gt;
&lt;li&gt;Grafana panels fail&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Questions:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Why dashboards fail?&lt;/li&gt;
&lt;li&gt;Difference between exporter outage vs server outage?&lt;/li&gt;
&lt;li&gt;How to alert on missing metrics?&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Recover:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```bash id="jlwmx8"&lt;br&gt;
sudo systemctl start node_exporter&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


---

# Phase 4 — Prometheus Storage Internals

## Task

Inspect TSDB:



```bash id="jlwm6k"
sudo du -sh /var/lib/prometheus
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Inspect contents:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```bash id="jlwm3o"&lt;br&gt;
ls /var/lib/prometheus&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


Questions:

* What is WAL?
* Why chunk storage?
* Why Prometheus disk grows over time?

Expected understanding:
Write-ahead logging

---

# Phase 5 — Alerting

## Create CPU Alert

Create:



```bash id="jlwmlo"
sudo nano /etc/prometheus/alert.rules.yml
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Add:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight yaml"&gt;&lt;code&gt;groups:
- name: sre-alerts
  rules:
  - alert: HighCPUUsage
    expr: 100 - (avg by(instance)(rate(node_cpu_seconds_total{mode="idle"}[1m])) * 100) &amp;gt; 70
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: High CPU Usage
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;

&lt;p&gt;Add to prometheus.yml:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight yaml"&gt;&lt;code&gt;rule_files:
  - "alert.rules.yml"
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;

&lt;p&gt;Restart:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```bash id="jlwmcl"&lt;br&gt;
sudo systemctl restart prometheus&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


Trigger stress again.

Questions:

* Why “for: 1m” important?
* Why avoid noisy alerts?
* Difference between symptom vs cause alerts?

---

# Phase 6 — Real SRE Troubleshooting

## Scenario

Dashboard shows:

* CPU normal
* Load high
* IOwait high

Questions:

* What does that indicate?
* Why app still slow?

Expected answer:

* storage bottleneck
* tasks blocked waiting for IO
* CPU not actual issue

---

# Phase 7 — Capacity Planning

Questions:

* When scale vertically?
* When scale horizontally?
* Why t3.micro unsuitable for production monitoring?
* How Prometheus retention affects storage sizing?

---

# Senior SRE Concepts You Must Understand

| Concept          | Expected Understanding  |
| ---------------- | ----------------------- |
| Pull model       | Prometheus scraping     |
| Time-series DB   | metric history          |
| Pressure metrics | resource contention     |
| Load average     | runnable/waiting tasks  |
| OOM killer       | memory protection       |
| WAL              | crash-safe storage      |
| Alert fatigue    | noisy alert problems    |
| Cardinality      | metric explosion risk   |
| Retention        | storage lifecycle       |
| Observability    | metrics + logs + traces |

---

# Final Senior-Level Goal

You should now explain:



```text id="jlwm8z"
How Linux metrics flow from kernel → exporter → Prometheus → Grafana → alerting → SRE response
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;That is real production observability engineering.&lt;/p&gt;

</description>
    </item>
    <item>
      <title>project #2: The Apex Platform</title>
      <dc:creator>Aisalkyn Aidarova</dc:creator>
      <pubDate>Thu, 14 May 2026 20:57:54 +0000</pubDate>
      <link>https://dev.to/jumptotech/project-2-the-apex-platform-4j15</link>
      <guid>https://dev.to/jumptotech/project-2-the-apex-platform-4j15</guid>
      <description>&lt;p&gt;In a senior role at a major institution like &lt;strong&gt;Nexus Bank&lt;/strong&gt;, you don't just "work in a team"—you operate within a &lt;strong&gt;Squad&lt;/strong&gt; that is part of a larger &lt;strong&gt;Chapter&lt;/strong&gt; or &lt;strong&gt;Platform Group&lt;/strong&gt;.&lt;/p&gt;

&lt;p&gt;Understanding this hierarchy is vital for an interview because it proves you’ve worked in an enterprise "at scale," not just a small startup.&lt;/p&gt;




&lt;h2&gt;
  
  
  &lt;strong&gt;1. Your Core Team: The "Platform Squad"&lt;/strong&gt;
&lt;/h2&gt;

&lt;p&gt;In a bank, the ideal team size follows the &lt;strong&gt;"Two-Pizza Rule"&lt;/strong&gt; (usually 6 to 8 people). This ensures the team is small enough to be agile but large enough to handle high-stakes infrastructure.&lt;/p&gt;

&lt;h3&gt;
  
  
  &lt;strong&gt;The Squad Composition&lt;/strong&gt;
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;1 Product Owner (PO):&lt;/strong&gt; They don't write code. They manage the "Backlog" and decide which business features (like "Multi-Region DR") are the highest priority.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;1 Scrum Master:&lt;/strong&gt; They facilitate the Daily Stand-ups and remove "blockers" (e.g., if the Security team hasn't approved your PR yet).&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;1 Tech Lead / Architect:&lt;/strong&gt; The most senior person who makes the final decision on high-level architecture.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;3–4 Senior Cloud Platform Engineers (You):&lt;/strong&gt; You are the engine. You write the Terraform, design the ECS tasks, and handle the "on-call" rotations.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;1–2 Junior/Mid-level Engineers:&lt;/strong&gt; You often mentor them, performing their code reviews and helping them understand Linux networking.&lt;/li&gt;
&lt;/ul&gt;




&lt;h2&gt;
  
  
  &lt;strong&gt;2. The Wider Organization: The Central Platform Group (CPG)&lt;/strong&gt;
&lt;/h2&gt;

&lt;p&gt;While your squad is 8 people, you are part of a &lt;strong&gt;Central Platform Group&lt;/strong&gt; of about &lt;strong&gt;30 to 50 engineers&lt;/strong&gt;. At Nexus Bank, this group is usually split into specialized squads:&lt;/p&gt;

&lt;ol&gt;
&lt;li&gt;
&lt;strong&gt;Compute Squad (Your Squad):&lt;/strong&gt; Focused on ECS, EC2, and Scaling.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Data Platform Squad:&lt;/strong&gt; Focused on RDS, DynamoDB, and Data Encryption.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Security &amp;amp; Identity Squad:&lt;/strong&gt; Focused on IAM, Vault, and Compliance.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Observability Squad:&lt;/strong&gt; Focused on the Prometheus/Grafana stack and logging.&lt;/li&gt;
&lt;/ol&gt;




&lt;h2&gt;
  
  
  &lt;strong&gt;3. How You "Manage" These Relationships&lt;/strong&gt;
&lt;/h2&gt;

&lt;p&gt;As a &lt;strong&gt;Senior Engineer&lt;/strong&gt;, you don't "manage" people in terms of hiring/firing (that’s the Manager’s job), but you &lt;strong&gt;manage technical outcomes and stakeholders.&lt;/strong&gt;&lt;/p&gt;

&lt;h3&gt;
  
  
  &lt;strong&gt;Managing Your Squad Peers&lt;/strong&gt;
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;Code Reviews:&lt;/strong&gt; You spend 1–2 hours a day reviewing Terraform code. You ensure no one is hardcoding secrets and that every resource has the correct banking tags (e.g., &lt;code&gt;CostCenter&lt;/code&gt;, &lt;code&gt;Environment&lt;/code&gt;).&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Knowledge Sharing:&lt;/strong&gt; You lead "Brown Bag" sessions where you teach the team a new trick in Linux or a better way to structure Terraform modules.&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  &lt;strong&gt;Managing Cross-Team Collaboration&lt;/strong&gt;
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;The Network Team (The Gatekeepers):&lt;/strong&gt; You don't "manage" them; you &lt;strong&gt;negotiate&lt;/strong&gt; with them. When you need a new Transit Gateway attachment, you provide them with the technical CIDR ranges and architectural justification so they approve your request quickly.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;The App-Dev Teams (The Customers):&lt;/strong&gt; You treat the 500+ developers as your "customers." You manage this by creating &lt;strong&gt;Self-Service Templates&lt;/strong&gt;.&lt;/li&gt;
&lt;li&gt;
&lt;em&gt;Example:&lt;/em&gt; Instead of manually building an ECS service for every developer, you provide a "Golden Terraform Module" they can use. This reduces your workload and keeps the bank secure.&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;This is a classic "Senior Engineer" interview question: &lt;strong&gt;"Tell me about a time you had a conflict with another team."&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;In a bank, this conflict almost always involves the &lt;strong&gt;Security Team&lt;/strong&gt;. They are paid to be paranoid, and you are paid to build a functional platform. At the 6-year level, you don't "fight" them—you &lt;strong&gt;negotiate with architecture.&lt;/strong&gt;&lt;/p&gt;




&lt;h2&gt;
  
  
  &lt;strong&gt;The Conflict Scenario: "The Egress Deadlock"&lt;/strong&gt;
&lt;/h2&gt;

&lt;h3&gt;
  
  
  &lt;strong&gt;The Situation&lt;/strong&gt;
&lt;/h3&gt;

&lt;p&gt;Your team is building the &lt;strong&gt;Apex Payment Gateway&lt;/strong&gt; on ECS. To finish the sprint, the developers need the containers to reach out to a third-party Credit Scoring API and pull updated container images from a public registry.&lt;/p&gt;

&lt;h3&gt;
  
  
  &lt;strong&gt;The Conflict&lt;/strong&gt;
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;Your Team’s Position:&lt;/strong&gt; "We want to put a NAT Gateway in the VPC so our ECS tasks can reach the internet. It’s fast, standard, and we can finish the project by Friday."&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;The Security Team’s Position:&lt;/strong&gt; "Absolutely not. Under &lt;strong&gt;PCI-DSS Compliance&lt;/strong&gt;, no production resource handling credit card data can have a direct route to the internet. NAT Gateways are too 'open.' Request denied."&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  &lt;strong&gt;The Deadlock&lt;/strong&gt;
&lt;/h3&gt;

&lt;p&gt;The project stops. The developers are frustrated because they can’t test their code, and your manager is worried about the deadline.&lt;/p&gt;




&lt;h2&gt;
  
  
  &lt;strong&gt;How a Senior Engineer Manages the Resolution&lt;/strong&gt;
&lt;/h2&gt;

&lt;p&gt;A 6-year veteran doesn't just complain. They host a &lt;strong&gt;Technical Alignment Meeting&lt;/strong&gt; and propose a "Middle Ground" architecture that satisfies both speed and safety.&lt;/p&gt;

&lt;h3&gt;
  
  
  &lt;strong&gt;1. The "Senior" Negotiation (The Meeting)&lt;/strong&gt;
&lt;/h3&gt;

&lt;p&gt;You invite the &lt;strong&gt;Security Architect&lt;/strong&gt; to a 30-minute deep dive. Instead of asking for a NAT Gateway again, you say:&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;&lt;em&gt;"I understand the risk of data exfiltration via a NAT Gateway. My goal is to ensure the Apex Gateway remains isolated while still allowing our services to function. What if we eliminate the NAT Gateway entirely and use a &lt;strong&gt;'Deny-by-Default'&lt;/strong&gt; egress architecture?"&lt;/em&gt;&lt;/p&gt;
&lt;/blockquote&gt;

&lt;h3&gt;
  
  
  &lt;strong&gt;2. The Technical Solution (The "Win-Win")&lt;/strong&gt;
&lt;/h3&gt;

&lt;p&gt;You propose and implement three layers of "Invisible" security using &lt;strong&gt;Terraform&lt;/strong&gt;:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;Layer 1: VPC Endpoints (PrivateLink):&lt;/strong&gt; Instead of going over the internet to talk to AWS services (like S3 or Secrets Manager), you provision Interface Endpoints. Traffic never leaves the bank's network.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Layer 2: Centralized Egress Proxy:&lt;/strong&gt; You suggest routing all remaining external traffic through a &lt;strong&gt;Squid Proxy&lt;/strong&gt; or an &lt;strong&gt;AWS Network Firewall&lt;/strong&gt; sitting in a separate "Inspection VPC."&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Layer 3: FQDN Whitelisting:&lt;/strong&gt; You tell Security: "We will only whitelist the specific domain of the Credit Scoring API. Everything else—literally the rest of the internet—will be hard-blocked at the firewall level."&lt;/li&gt;
&lt;/ul&gt;




&lt;h2&gt;
  
  
  &lt;strong&gt;3. The Narrative for the Interview&lt;/strong&gt;
&lt;/h2&gt;

&lt;p&gt;When the interviewer asks how you handle conflict, you use this story. It proves you understand &lt;strong&gt;Security, Networking, and Stakeholder Management.&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;The Response:&lt;/strong&gt;&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;&lt;em&gt;"During Project Apex, we hit a roadblock where the Security team blocked our deployment because our ECS tasks required internet access for third-party API calls. My team wanted a quick fix with a NAT Gateway, but Security flagged it as a PCI-DSS violation.&lt;/em&gt;&lt;br&gt;
&lt;em&gt;I took the lead on the negotiation. I realized that Security wasn't trying to be difficult—they were protecting the bank's license. I researched and proposed a &lt;strong&gt;Zero-Trust Egress Architecture&lt;/strong&gt;. I refactored our Terraform modules to use &lt;strong&gt;AWS PrivateLink&lt;/strong&gt; for all internal AWS traffic and set up a centralized &lt;strong&gt;Network Firewall&lt;/strong&gt; for whitelisted egress.&lt;/em&gt;&lt;br&gt;
&lt;em&gt;By presenting a solution that addressed their specific fear (unauthorized data exfiltration) while still meeting our delivery date, I gained their approval. We actually ended up using this 'Locked-down VPC' as the &lt;strong&gt;Golden Template&lt;/strong&gt; for all future projects at the bank."&lt;/em&gt;&lt;/p&gt;
&lt;/blockquote&gt;




&lt;h2&gt;
  
  
  &lt;strong&gt;Why this "Invisible" part of the job is vital:&lt;/strong&gt;
&lt;/h2&gt;

&lt;ol&gt;
&lt;li&gt;
&lt;strong&gt;Empathy:&lt;/strong&gt; You showed you understood the Security Team's job.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Authority:&lt;/strong&gt; You used advanced AWS networking concepts (PrivateLink, FQDN filtering).&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Leadership:&lt;/strong&gt; You turned a "No" into a "Yes" without compromising the bank's safety.&lt;/li&gt;
&lt;/ol&gt;

&lt;h3&gt;
  
  
  &lt;strong&gt;Final Pro-Tip:&lt;/strong&gt;
&lt;/h3&gt;

&lt;p&gt;In an interview, never say "The Security team was wrong." Always say &lt;strong&gt;"We had different priorities, and my role was to find the architectural bridge between them."&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;As a &lt;strong&gt;Senior Cloud Platform Engineer&lt;/strong&gt;, your value isn't just in the code you write, but in the mistakes you prevent others from making. When you review a Pull Request (PR) for &lt;strong&gt;Project Apex&lt;/strong&gt;, you are looking for "smells"—small signs that the infrastructure might be insecure, expensive, or hard to recover.&lt;/p&gt;

&lt;p&gt;Here is a &lt;strong&gt;Senior-Level Code Review Checklist&lt;/strong&gt; specifically for a banking ECS/Terraform environment.&lt;/p&gt;




&lt;h2&gt;
  
  
  &lt;strong&gt;1. The Security &amp;amp; Identity Pillar&lt;/strong&gt;
&lt;/h2&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;[ ] The "Wildcard" Hunt:&lt;/strong&gt; Does any IAM policy contain &lt;code&gt;Resource: "*"&lt;/code&gt; or &lt;code&gt;Action: "s3:*"&lt;/code&gt;?&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;em&gt;Senior Move:&lt;/em&gt; Reject it. Every role must be scoped to the specific bucket or API action needed.&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;[ ] Secret Leaks:&lt;/strong&gt; Are there any hardcoded passwords, API keys, or database URIs?&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;em&gt;Senior Move:&lt;/em&gt; Ensure they are using &lt;code&gt;data "aws_secretsmanager_secret"&lt;/code&gt; or injecting them as environment variables via ECS Secret injection.&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;[ ] Encryption job-zero:&lt;/strong&gt; Is &lt;code&gt;encryption_at_rest&lt;/code&gt; enabled for every S3 bucket and RDS instance? Is a custom &lt;strong&gt;KMS Key&lt;/strong&gt; used instead of the default AWS-managed key? (Banks require custom keys for better audit trails).&lt;/p&gt;&lt;/li&gt;
&lt;/ul&gt;




&lt;h2&gt;
  
  
  &lt;strong&gt;2. The Networking Pillar&lt;/strong&gt;
&lt;/h2&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;[ ] Public IP Check:&lt;/strong&gt; Are any ECS tasks or RDS instances assigned a public IP?&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;em&gt;Senior Move:&lt;/em&gt; In a bank, nothing in a private subnet should have &lt;code&gt;associate_public_ip_address = true&lt;/code&gt;.&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;[ ] Security Group "Laziness":&lt;/strong&gt; Is there a rule allowing &lt;code&gt;0.0.0.0/0&lt;/code&gt; on port 22 or 5432?&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;em&gt;Senior Move:&lt;/em&gt; Only allow specific Security Group IDs (chaining). For example, the Database SG should only accept traffic from the ECS Task SG.&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;[ ] Missing VPC Endpoints:&lt;/strong&gt; Is the code trying to reach S3 or ECR over the internet?&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;em&gt;Senior Move:&lt;/em&gt; Ask the dev to add &lt;strong&gt;VPC Gateway Endpoints&lt;/strong&gt; for S3 and &lt;strong&gt;VPC Interface Endpoints&lt;/strong&gt; for ECR to keep traffic inside the AWS network.&lt;/p&gt;&lt;/li&gt;
&lt;/ul&gt;




&lt;h2&gt;
  
  
  &lt;strong&gt;3. The Reliability &amp;amp; Operations Pillar&lt;/strong&gt;
&lt;/h2&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;[ ] The "Noisy Neighbor" Prevention:&lt;/strong&gt; Are &lt;code&gt;cpu&lt;/code&gt; and &lt;code&gt;memory&lt;/code&gt; limits defined in the ECS Task Definition?&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;em&gt;Senior Move:&lt;/em&gt; Without limits, one buggy container can starve the entire EC2 host.&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;[ ] Multi-AZ Deployment:&lt;/strong&gt; Is the &lt;code&gt;subnets&lt;/code&gt; variable for the ECS Service pulling from at least two Availability Zones?&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;em&gt;Senior Move:&lt;/em&gt; Single-AZ deployments are a "fail" for Project Apex's 99.99% uptime goal.&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;[ ] Health Check Configuration:&lt;/strong&gt; Is there a proper &lt;code&gt;health_check&lt;/code&gt; defined for the Load Balancer? (e.g., &lt;code&gt;/health&lt;/code&gt; instead of just checking if port 80 is open).&lt;/p&gt;&lt;/li&gt;
&lt;/ul&gt;




&lt;h2&gt;
  
  
  &lt;strong&gt;4. The "Invisible" Maintainability Pillar&lt;/strong&gt;
&lt;/h2&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;[ ] Standardized Tagging:&lt;/strong&gt; Are the mandatory bank tags present? (&lt;code&gt;Owner&lt;/code&gt;, &lt;code&gt;CostCenter&lt;/code&gt;, &lt;code&gt;Environment&lt;/code&gt;, &lt;code&gt;Project: Apex&lt;/code&gt;).&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;em&gt;Senior Move:&lt;/em&gt; If it's not tagged, we can't track the $5,000 monthly bill.&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;[ ] Variable Validation:&lt;/strong&gt; Does the code use Terraform &lt;code&gt;validation&lt;/code&gt; blocks?&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;em&gt;Example:&lt;/em&gt; If a variable is for &lt;code&gt;environment&lt;/code&gt;, does it check that the input is only &lt;code&gt;dev&lt;/code&gt;, &lt;code&gt;uat&lt;/code&gt;, or &lt;code&gt;prod&lt;/code&gt;?&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;[ ] The "Delete" Test:&lt;/strong&gt; If I run &lt;code&gt;terraform destroy&lt;/code&gt; on this module, will it accidentally delete the production database?&lt;/p&gt;&lt;/li&gt;
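&lt;li&gt;&lt;p&gt;&lt;em&gt;Senior Move:&lt;/em&gt; Ensure &lt;code&gt;deletion_protection = true&lt;/code&gt; is set for all production RDS instances, and that production S3 buckets keep &lt;code&gt;force_destroy = false&lt;/code&gt; (S3 has no &lt;code&gt;deletion_protection&lt;/code&gt; argument).&lt;/p&gt;&lt;/li&gt;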
&lt;/ul&gt;




&lt;h2&gt;
  
  
  &lt;strong&gt;How to use this in an Interview&lt;/strong&gt;
&lt;/h2&gt;

&lt;p&gt;If an interviewer asks, &lt;strong&gt;"How do you ensure code quality in your team?"&lt;/strong&gt;, don't just say "we do code reviews." Say this:&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;&lt;em&gt;"In Project Apex, I established a &lt;strong&gt;Rigid Review Protocol&lt;/strong&gt;. I personally focused on catching 'Architectural Drift.' For example, I'd look for any IAM roles that violated the &lt;strong&gt;Principle of Least Privilege&lt;/strong&gt; or networking rules that lacked &lt;strong&gt;VPC Endpoints&lt;/strong&gt;. I see code review as a mentorship opportunity; if I see a junior engineer hardcoding a subnet ID, I don't just fix it—I explain how using &lt;code&gt;data&lt;/code&gt; sources makes our infrastructure region-agnostic and disaster-recovery ready."&lt;/em&gt;&lt;/p&gt;
&lt;/blockquote&gt;

&lt;h3&gt;
  
  
  &lt;strong&gt;The "Invisible" Result:&lt;/strong&gt;
&lt;/h3&gt;

&lt;p&gt;By using this checklist, you aren't just a "Terraform guy." You are a &lt;strong&gt;Guardian of the Bank's Infrastructure.&lt;/strong&gt; You prove that you think about cost, security, and the long-term health of the platform.&lt;/p&gt;




&lt;h2&gt;
  
  
  &lt;strong&gt;4. The "Invisible" Senior Skill: Managing Up&lt;/strong&gt;
&lt;/h2&gt;

&lt;p&gt;At 6 years of experience, you also manage your &lt;strong&gt;Product Owner’s expectations&lt;/strong&gt;.&lt;/p&gt;

&lt;p&gt;If a PO asks for a feature that is technically "dirty" or insecure to save time, a Senior Engineer says:&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;&lt;em&gt;"I understand the deadline for the Payment API is Friday. However, skipping the VPC Endpoint setup creates a security risk that violates our PCI-DSS compliance. I suggest we deploy a 'Minimum Viable Environment' now and automate the full hardening by next sprint."&lt;/em&gt;&lt;/p&gt;
&lt;/blockquote&gt;




&lt;h2&gt;
  
  
  &lt;strong&gt;5. The Interview Narrative: Talking About Your Team&lt;/strong&gt;
&lt;/h2&gt;

&lt;p&gt;&lt;strong&gt;Interviewer:&lt;/strong&gt; &lt;em&gt;"Tell me about your team structure at the bank."&lt;/em&gt;&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;The Senior Response:&lt;/strong&gt;&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;&lt;em&gt;"I was part of a high-performing &lt;strong&gt;Platform Squad of 7 people&lt;/strong&gt; within the Central Platform Group at Nexus Bank. My squad consisted of a Product Owner, a Scrum Master, and five engineers of varying seniority. We operated on 2-week sprints. While our squad's focus was the &lt;strong&gt;Apex ECS Platform&lt;/strong&gt;, I frequently collaborated with the &lt;strong&gt;Security Chapter&lt;/strong&gt; to ensure our IAM roles met the bank’s zero-trust standards. A big part of my daily role was also 'managing' our relationship with the &lt;strong&gt;Application Teams&lt;/strong&gt;; I acted as a consultant to help them optimize their containerized workloads for our platform."&lt;/em&gt;&lt;/p&gt;
&lt;/blockquote&gt;

&lt;h3&gt;
  
  
  &lt;strong&gt;Why this wins the interview:&lt;/strong&gt;
&lt;/h3&gt;

&lt;ol&gt;
&lt;li&gt;
&lt;strong&gt;Terminology:&lt;/strong&gt; Using words like "Squad," "Chapter," and "Zero-Trust" shows you know the enterprise language.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Scale:&lt;/strong&gt; It shows you understand how your small team fits into a 50-person group and a 500-person developer org.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Leadership:&lt;/strong&gt; It highlights that you consult and negotiate, rather than just taking orders.&lt;/li&gt;
&lt;/ol&gt;

&lt;p&gt;To move from a "strong candidate" to a "must-hire senior engineer," you need to master the &lt;strong&gt;Governance and Cultural&lt;/strong&gt; layers. In a bank, the "invisible" work isn't just technical—it's about &lt;strong&gt;compliance, risk mitigation, and technical debt.&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;To make this project truly "interview-proof," treat the following "invisible" pillars as a curriculum.&lt;/p&gt;




&lt;h2&gt;
  
  
  &lt;strong&gt;1. The "Invisible" Governance: Change Management&lt;/strong&gt;
&lt;/h2&gt;

&lt;p&gt;In a bank, you never just run &lt;code&gt;terraform apply&lt;/code&gt; from your laptop to Production. That is a firing offense.&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;The Workflow:&lt;/strong&gt; You must describe a &lt;strong&gt;GitOps pipeline&lt;/strong&gt;.&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;The Process:&lt;/strong&gt; Code is pushed to a Feature Branch -&amp;gt; &lt;strong&gt;Pull Request (PR)&lt;/strong&gt; is opened -&amp;gt; &lt;strong&gt;Automated Linting &amp;amp; Security Scanning&lt;/strong&gt; (like &lt;code&gt;tfsec&lt;/code&gt; or &lt;code&gt;checkov&lt;/code&gt;) runs -&amp;gt; &lt;strong&gt;Peer Review&lt;/strong&gt; by another Senior Engineer -&amp;gt; Approval -&amp;gt; Merge to Main -&amp;gt; &lt;strong&gt;Jenkins/GitHub Actions&lt;/strong&gt; deploys to Staging -&amp;gt; &lt;strong&gt;CAB (Change Advisory Board)&lt;/strong&gt; approval -&amp;gt; Deployment to Production.&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;Interview Tip:&lt;/strong&gt; When asked about deployments, you should say: &lt;em&gt;"We treat our infrastructure as code with a strict 4-eye principle. No change reaches the Apex environment without a peer-reviewed PR and a successful automated security scan."&lt;/em&gt;&lt;/p&gt;&lt;/li&gt;
&lt;/ul&gt;




&lt;h2&gt;
  
  
  &lt;strong&gt;2. The "Invisible" Safety Net: Disaster Recovery (DR)&lt;/strong&gt;
&lt;/h2&gt;

&lt;p&gt;A 6-year veteran knows that things &lt;strong&gt;will&lt;/strong&gt; fail. They don't just build for uptime; they build for recovery.&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;The Technical Task:&lt;/strong&gt; Add a "Cross-Region Backup" component to the project.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;RDS Snapshots:&lt;/strong&gt; Use Terraform to automate encrypted RDS snapshots being copied to a different AWS Region (e.g., from &lt;code&gt;us-east-1&lt;/code&gt; to &lt;code&gt;us-west-2&lt;/code&gt;).&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;Route 53 Failover:&lt;/strong&gt; Describe how you would use a &lt;strong&gt;Health Check&lt;/strong&gt; to flip traffic to a "Maintenance Page" or a DR cluster if the main ECS service fails.&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;Interview Tip:&lt;/strong&gt; If asked about reliability, say: &lt;em&gt;"My priority for Project Apex was the &lt;strong&gt;RTO (Recovery Time Objective)&lt;/strong&gt;. I architected the Terraform modules to be region-agnostic, allowing us to spin up the entire payment stack in a secondary region within 30 minutes if a provider outage occurs."&lt;/em&gt;&lt;/p&gt;&lt;/li&gt;
&lt;/ul&gt;




&lt;h2&gt;
  
  
  &lt;strong&gt;3. The "Invisible" Conflict: Technical Negotiation&lt;/strong&gt;
&lt;/h2&gt;

&lt;p&gt;Senior engineers spend a lot of time saying "No" in a way that helps the business.&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;The Scenario:&lt;/strong&gt; A Developer wants "Admin" access to the ECS cluster to "debug faster."&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;The Senior Solution:&lt;/strong&gt; Instead of giving Admin access, you provide them with &lt;strong&gt;CloudWatch Logs Insights&lt;/strong&gt; and a &lt;strong&gt;Read-Only&lt;/strong&gt; role.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Interview Tip:&lt;/strong&gt; This demonstrates leadership. &lt;em&gt;"I had a situation where the Dev team was frustrated by restrictive IAM roles. Instead of compromising the bank's security, I built a custom Grafana dashboard that gave them 100% visibility into their container logs and performance metrics, removing their need for direct cluster access."&lt;/em&gt;
&lt;/li&gt;
&lt;/ul&gt;




&lt;h2&gt;
  
  
  &lt;strong&gt;The "Final Boss" Interview Scenario&lt;/strong&gt;
&lt;/h2&gt;

&lt;p&gt;&lt;strong&gt;The Question:&lt;/strong&gt; &lt;em&gt;"We noticed your Apex Project uses ECS on EC2. Why didn't you just use Fargate to reduce operational overhead?"&lt;/em&gt;&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;The "6-Year Veteran" Answer:&lt;/strong&gt;&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;&lt;em&gt;"That's a great question. While Fargate reduces the 'server management' burden, for a &lt;strong&gt;Banking Payment Gateway&lt;/strong&gt;, we chose &lt;strong&gt;ECS on EC2&lt;/strong&gt; for three strategic reasons:&lt;/em&gt;&lt;/p&gt;

&lt;ol&gt;
&lt;li&gt;
&lt;strong&gt;Compliance:&lt;/strong&gt; We needed deep visibility into the OS level for our security agents and &lt;code&gt;auditd&lt;/code&gt; logging to meet PCI-DSS requirements.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Performance:&lt;/strong&gt; We required specific &lt;strong&gt;Linux kernel tuning&lt;/strong&gt; (&lt;code&gt;sysctl&lt;/code&gt;) to handle high-concurrency socket connections during peak transaction hours, which Fargate doesn't allow.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Cost at Scale:&lt;/strong&gt; Given the predictable 24/7 load of a payment gateway, using &lt;strong&gt;Reserved Instances&lt;/strong&gt; on EC2 saved the bank approximately 30% in cloud spend compared to Fargate's pricing model."&lt;/li&gt;
&lt;/ol&gt;
&lt;/blockquote&gt;




&lt;h2&gt;
  
  
  &lt;strong&gt;Updated Final Checklist&lt;/strong&gt;
&lt;/h2&gt;

&lt;p&gt;To ensure you are ready to present yourself as a "Cloud Platform Engineer," you should be able to produce:&lt;/p&gt;

&lt;ol&gt;
&lt;li&gt;[ ] &lt;strong&gt;A "Peer Review" Checklist:&lt;/strong&gt; What do you look for when reviewing a teammate's Terraform code? (e.g., Are variables hardcoded? Is the state locked?).&lt;/li&gt;
&lt;li&gt;[ ] &lt;strong&gt;A Security Posture Document:&lt;/strong&gt; A 1-page PDF explaining how "Project Apex" protects customer data (Encryption at rest/transit, IAM roles, WAF).&lt;/li&gt;
&lt;li&gt;[ ] &lt;strong&gt;A "Post-Mortem" Template:&lt;/strong&gt; A document you filled out after your "Simulated Incident."&lt;/li&gt;
&lt;/ol&gt;

&lt;p&gt;To truly give you the confidence of a &lt;strong&gt;6-year veteran&lt;/strong&gt;, you need to understand the "invisible" parts of the job: the complex environment and the constant communication required in a high-stakes banking setting.&lt;/p&gt;




&lt;h2&gt;
  
  
  &lt;strong&gt;Part 1: The Deep Environment (The "Nexus Bank" Ecosystem)&lt;/strong&gt;
&lt;/h2&gt;

&lt;p&gt;In a bank, you never work in a single AWS account. You work in a &lt;strong&gt;Multi-Account Landing Zone&lt;/strong&gt;.&lt;/p&gt;

&lt;h3&gt;
  
  
  &lt;strong&gt;1. The Multi-Account Strategy&lt;/strong&gt;
&lt;/h3&gt;

&lt;p&gt;The project exists across four distinct AWS Accounts to ensure a small "blast radius":&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;Security Account:&lt;/strong&gt; Centralized IAM, CloudTrail logs, and GuardDuty alerts.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Shared Services Account:&lt;/strong&gt; Where the &lt;strong&gt;Prometheus/Grafana&lt;/strong&gt; stack lives, along with the &lt;strong&gt;ECR&lt;/strong&gt; (Elastic Container Registry) and CI/CD tools.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Non-Prod Account:&lt;/strong&gt; For Dev and UAT (User Acceptance Testing) environments.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Production Account:&lt;/strong&gt; The "Holy Grail." Access is extremely restricted (No manual changes allowed).&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  &lt;strong&gt;2. The Network Topology&lt;/strong&gt;
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;Hybrid Cloud:&lt;/strong&gt; The bank has "on-prem" data centers. You manage the &lt;strong&gt;AWS Direct Connect&lt;/strong&gt; or &lt;strong&gt;Site-to-Site VPN&lt;/strong&gt; that connects the ECS tasks to the bank's legacy mainframe databases.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Traffic Flow:&lt;/strong&gt; External traffic hits an &lt;strong&gt;AWS WAF&lt;/strong&gt; (Web Application Firewall), then a &lt;strong&gt;Public Application Load Balancer&lt;/strong&gt;, which routes traffic through a &lt;strong&gt;Transit Gateway&lt;/strong&gt; to the &lt;strong&gt;ECS Services&lt;/strong&gt; sitting in private subnets.&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  &lt;strong&gt;3. Compliance &amp;amp; Governance&lt;/strong&gt;
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;PCI-DSS:&lt;/strong&gt; Every line of Terraform must be written with the Payment Card Industry Data Security Standard in mind (e.g., rotating secrets every 90 days, encrypting all S3 buckets).&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Drift Detection:&lt;/strong&gt; You use &lt;strong&gt;AWS Config&lt;/strong&gt; to ensure that if someone manually changes a security group, you get an alert immediately.&lt;/li&gt;
&lt;/ul&gt;




&lt;h2&gt;
  
  
  &lt;strong&gt;Part 2: The Daily Collaboration (Who you meet and why)&lt;/strong&gt;
&lt;/h2&gt;

&lt;p&gt;A Senior Cloud Platform Engineer spends 40% of their time coding and 60% of their time &lt;strong&gt;aligning with other humans.&lt;/strong&gt;&lt;/p&gt;

&lt;h3&gt;
  
  
  &lt;strong&gt;The Daily Rhythm&lt;/strong&gt;
&lt;/h3&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Time&lt;/th&gt;
&lt;th&gt;Meeting / Activity&lt;/th&gt;
&lt;th&gt;Who is there?&lt;/th&gt;
&lt;th&gt;The Goal&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;09:30 AM&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;&lt;strong&gt;The Daily Stand-up&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Platform Team, Scrum Master&lt;/td&gt;
&lt;td&gt;Update on your Terraform PRs (Pull Requests). "I'm blocked on the firewall rules for the new RDS cluster."&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;11:00 AM&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;&lt;strong&gt;Security Review&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Security Architect, Compliance Officer&lt;/td&gt;
&lt;td&gt;Presenting your infrastructure design. You must prove that your ECS tasks are isolated and that logs are being shipped to the Security Account.&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;01:30 PM&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;&lt;strong&gt;App-Dev "Office Hours"&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Software Engineers (Java/Node.js)&lt;/td&gt;
&lt;td&gt;Helping developers who are struggling to get their containers to run in ECS. You explain how to optimize their &lt;code&gt;Dockerfile&lt;/code&gt;.&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;03:00 PM&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;&lt;strong&gt;Sprint Refinement&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Product Owner, SREs&lt;/td&gt;
&lt;td&gt;Looking at the backlog. Deciding if the next priority is "Automated Scaling" or "Disaster Recovery Testing."&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Ad-hoc&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;&lt;strong&gt;Incident War Room&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Network Engineers, DBAs&lt;/td&gt;
&lt;td&gt;(Only if something breaks) Troubleshooting why the latency between ECS and the on-prem database spiked.&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;




&lt;h3&gt;
  
  
  &lt;strong&gt;The Key Stakeholders (The "Partners")&lt;/strong&gt;
&lt;/h3&gt;

&lt;p&gt;To sound like an expert, you must use these titles and understand these relationships:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;1. The Security Engineer:&lt;/strong&gt; Your "Best Friend/Worst Enemy." They will audit every IAM role you create. You collaborate with them to ensure the &lt;strong&gt;ECS Task Execution Role&lt;/strong&gt; has the minimum permissions possible.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;2. The Network Engineer:&lt;/strong&gt; They manage the "pipes" between AWS and the Bank. You meet with them to request &lt;strong&gt;Route 53&lt;/strong&gt; changes or to open ports in the corporate firewall.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;3. The Software Architect:&lt;/strong&gt; They care about performance. You meet with them to discuss how &lt;strong&gt;Prometheus metrics&lt;/strong&gt; can help them find bottlenecks in their Java code.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;4. The Database Administrator (DBA):&lt;/strong&gt; Since you are managing the &lt;strong&gt;RDS instances&lt;/strong&gt; via Terraform, you collaborate with them on "Maintenance Windows" and "Storage Auto-scaling."&lt;/li&gt;
&lt;/ul&gt;




&lt;h2&gt;
  
  
  &lt;strong&gt;Part 3: "A Day in the Life" (The Interview Narrative)&lt;/strong&gt;
&lt;/h2&gt;

&lt;p&gt;If you are asked, &lt;em&gt;"What did you do yesterday?"&lt;/em&gt;, a 6-year senior response sounds like this:&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;&lt;em&gt;"Yesterday morning, I started by reviewing the &lt;strong&gt;Grafana&lt;/strong&gt; dashboards for our Apex Payment Gateway; we noticed a slight increase in 5xx errors after the last deployment. I met with the &lt;strong&gt;Network Team&lt;/strong&gt; to verify our Transit Gateway limits. After the Daily Stand-up, I spent the afternoon refactoring our &lt;strong&gt;Terraform modules&lt;/strong&gt; to implement 'Blue/Green' deployments for our ECS services. This was a request from the &lt;strong&gt;Security Team&lt;/strong&gt; to ensure we can roll back instantly if a compliance check fails. I finished the day by mentor-reviewing a junior engineer's Dockerfile to improve layer caching."&lt;/em&gt;&lt;/p&gt;
&lt;/blockquote&gt;

&lt;h3&gt;
  
  
  &lt;strong&gt;Why this works:&lt;/strong&gt;
&lt;/h3&gt;

&lt;ol&gt;
&lt;li&gt;
&lt;strong&gt;It mentions the tools&lt;/strong&gt; (Grafana, Transit Gateway, Terraform, ECS, Docker).&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;It mentions the people&lt;/strong&gt; (Network Team, Security Team, Junior Engineer).&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;It shows the "Senior" mindset&lt;/strong&gt; (Refactoring, Compliance, Mentoring, Monitoring).&lt;/li&gt;
&lt;/ol&gt;

&lt;p&gt;This project is designed to simulate a high-level environment at a major financial institution. To represent six years of experience, the focus isn't just on "how to use a tool," but on &lt;strong&gt;architecture, security, governance, and cross-team collaboration.&lt;/strong&gt;&lt;/p&gt;




&lt;h2&gt;
  
  
  &lt;strong&gt;The Persona: Senior Cloud Platform Engineer&lt;/strong&gt;
&lt;/h2&gt;

&lt;p&gt;In this project, you are a &lt;strong&gt;Senior Cloud Platform Engineer&lt;/strong&gt; at &lt;strong&gt;Nexus Bank&lt;/strong&gt;, a global retail and investment bank. You aren't just "doing DevOps"—you are building the foundation that allows 500+ developers to ship code safely.&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;Who you are:&lt;/strong&gt; A technical leader who bridges the gap between software development and infrastructure. You treat infrastructure as a product.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Your Mission:&lt;/strong&gt; To automate the lifecycle of the bank’s "Core Payment Gateway" to ensure 99.99% availability and strict regulatory compliance.&lt;/li&gt;
&lt;/ul&gt;




&lt;h2&gt;
  
  
  &lt;strong&gt;Company Profile: Nexus Bank&lt;/strong&gt;
&lt;/h2&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;Sector:&lt;/strong&gt; FinTech / Global Banking.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Infrastructure:&lt;/strong&gt; 100% AWS (Multi-region for Disaster Recovery).&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Engineering Scale:&lt;/strong&gt; 25+ Product Teams (e.g., Credit Cards, Mortgages, Mobile App).&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Platform Team:&lt;/strong&gt; You are part of the &lt;strong&gt;Central Platform Group (CPG)&lt;/strong&gt;, consisting of 12 engineers divided into three sub-squads: &lt;em&gt;Compute, Security/Identity, and Observability.&lt;/em&gt;
&lt;/li&gt;
&lt;/ul&gt;




&lt;h2&gt;
  
  
  &lt;strong&gt;The Core Project: "Project Sentinel"&lt;/strong&gt;
&lt;/h2&gt;

&lt;p&gt;&lt;strong&gt;Objective:&lt;/strong&gt; Architect and deploy a PCI-DSS compliant, multi-region payment processing platform using Infrastructure as Code (IaC).&lt;/p&gt;

&lt;h3&gt;
  
  
  &lt;strong&gt;1. The Technical Stack&lt;/strong&gt;
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;Infrastructure:&lt;/strong&gt; AWS (EKS, RDS, VPC, Transit Gateway, IAM, S3).&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Provisioning:&lt;/strong&gt; &lt;strong&gt;Terraform&lt;/strong&gt; (using Terragrunt for multi-environment DRY code).&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Operating System:&lt;/strong&gt; &lt;strong&gt;Linux (Amazon Linux 2 / Ubuntu)&lt;/strong&gt;—hardened for financial security.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Networking:&lt;/strong&gt; Deep VPC peering, Private Link for third-party payment APIs, and AWS WAF.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Observability:&lt;/strong&gt; &lt;strong&gt;Prometheus&lt;/strong&gt; for metric collection and &lt;strong&gt;Grafana&lt;/strong&gt; for executive and technical dashboards.&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  &lt;strong&gt;2. Your Responsibilities&lt;/strong&gt;
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;Architecting Guardrails:&lt;/strong&gt; Writing Terraform modules that prevent developers from creating "public" databases or unencrypted buckets.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Performance Tuning:&lt;/strong&gt; Deep-diving into &lt;strong&gt;Linux kernel parameters&lt;/strong&gt; and networking throughput to reduce payment latency.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Cost Optimization:&lt;/strong&gt; Implementing "Scale-to-Zero" for non-prod environments to save cloud spend.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Security:&lt;/strong&gt; Managing secrets via AWS Secrets Manager and ensuring encryption at rest/transit.&lt;/li&gt;
&lt;/ul&gt;




&lt;h2&gt;
  
  
  &lt;strong&gt;Day-to-Day Activities &amp;amp; Collaboration&lt;/strong&gt;
&lt;/h2&gt;

&lt;h3&gt;
  
  
  &lt;strong&gt;A Typical Schedule&lt;/strong&gt;
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;09:00 – 09:30:&lt;/strong&gt; &lt;strong&gt;The Triage.&lt;/strong&gt; Check Slack and PagerDuty. Review &lt;strong&gt;Grafana&lt;/strong&gt; "Golden Signals" (Latency, Errors, Traffic, Saturation) for the Payment Gateway.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;09:30 – 10:00:&lt;/strong&gt; &lt;strong&gt;Daily Stand-up.&lt;/strong&gt; Synchronize with your Platform squad. "Yesterday I finished the Terraform module for the new RDS cluster; today I'm troubleshooting a networking latency issue between the VPC and the On-prem data center."&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;10:00 – 12:30:&lt;/strong&gt; &lt;strong&gt;Deep Work (The "Engineering").&lt;/strong&gt; Writing Terraform to deploy a new EKS cluster across two regions. Debugging a &lt;strong&gt;Linux&lt;/strong&gt; networking issue where packets are dropping at the NAT Gateway.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;13:30 – 14:30:&lt;/strong&gt; &lt;strong&gt;Collaboration Meeting.&lt;/strong&gt; Meet with the &lt;strong&gt;Security Team&lt;/strong&gt; to review IAM policies and the &lt;strong&gt;App Dev Team&lt;/strong&gt; to help them containerize a new Java-based microservice.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;15:00 – 16:30:&lt;/strong&gt; &lt;strong&gt;Observability Sprints.&lt;/strong&gt; Configuring &lt;strong&gt;Prometheus&lt;/strong&gt; Alertmanager to send high-priority alerts to the SRE team if payment success rates drop below 98%.&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  &lt;strong&gt;How You Solve Issues&lt;/strong&gt;
&lt;/h3&gt;

&lt;p&gt;When a "Severity 1" (Production Down) issue arises:&lt;/p&gt;

&lt;ol&gt;
&lt;li&gt;
&lt;strong&gt;Detection:&lt;/strong&gt; Grafana triggers an alert.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Isolation:&lt;/strong&gt; You use &lt;strong&gt;Linux command-line tools&lt;/strong&gt; (&lt;code&gt;tcpdump&lt;/code&gt;, &lt;code&gt;netstat&lt;/code&gt;, &lt;code&gt;top&lt;/code&gt;) and &lt;strong&gt;AWS CloudWatch&lt;/strong&gt; to see if it’s a network bottleneck or a code bug.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Collaboration:&lt;/strong&gt; You open a "War Room" (Zoom/Teams) with the Database Admins and Network Engineers.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Resolution:&lt;/strong&gt; You apply a fix via Terraform (never manually in the console!) to ensure the fix is permanent and documented.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Post-Mortem:&lt;/strong&gt; You lead a meeting to discuss &lt;em&gt;why&lt;/em&gt; it happened and how to automate the prevention of it.&lt;/li&gt;
&lt;/ol&gt;




&lt;h2&gt;
  
  
  &lt;strong&gt;Interview Simulation: Key Questions &amp;amp; Responses&lt;/strong&gt;
&lt;/h2&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Question&lt;/th&gt;
&lt;th&gt;The "6-Year Experience" Answer&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;"How do you manage state in Terraform?"&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;"At Nexus Bank, we use S3 backends with DynamoDB for state locking. I've implemented a modular structure where 'State' is separated by environment and region to minimize the blast radius of any single change."&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;"Explain a complex Linux issue you solved."&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;"I once diagnosed a high 'iowait' issue on a production database node. By using &lt;code&gt;iostat&lt;/code&gt; and &lt;code&gt;strace&lt;/code&gt;, I found that a legacy logging script was saturating the disk I/O. I moved the logs to a separate EBS volume and adjusted the &lt;code&gt;sysctl&lt;/code&gt; swappiness parameters."&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;"How do you handle security in AWS?"&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;"I follow the Principle of Least Privilege. I use IAM Roles for Service Accounts (IRSA) in EKS, ensuring that pods only have access to the specific S3 buckets or RDS instances they need, rather than using broad node-level permissions."&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;




&lt;h2&gt;
  
  
  &lt;strong&gt;Project Implementation Steps&lt;/strong&gt;
&lt;/h2&gt;

&lt;ol&gt;
&lt;li&gt;
&lt;strong&gt;Network Setup:&lt;/strong&gt; Use Terraform to create a multi-AZ VPC with private and public subnets.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Security Hardening:&lt;/strong&gt; Provision an EC2 instance, log in via SSH, and harden the Linux OS (disable root login, setup fail2ban, optimize networking stack).&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Infrastructure:&lt;/strong&gt; Deploy an AWS EKS cluster using Terraform modules.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Monitoring:&lt;/strong&gt; Install Prometheus and Grafana using Helm. Create a dashboard that visualizes CPU/Memory and custom application metrics.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;The "Fix":&lt;/strong&gt; Simulate a failure (e.g., shut down a subnet) and document how you would use your tools to find and fix it.&lt;/li&gt;
&lt;/ol&gt;

&lt;blockquote&gt;
&lt;p&gt;&lt;strong&gt;Expert Tip:&lt;/strong&gt; In an interview, don't just talk about the tools. Talk about &lt;strong&gt;why&lt;/strong&gt; you chose them. For a bank, the answer is always &lt;strong&gt;Security, Scalability, and Auditability.&lt;/strong&gt;&lt;/p&gt;
&lt;/blockquote&gt;

&lt;h2&gt;
  
  
  &lt;strong&gt;Project Persona: Senior Cloud Platform Engineer (ECS Stack)&lt;/strong&gt;
&lt;/h2&gt;

&lt;h3&gt;
  
  
  &lt;strong&gt;The Architecture: "Project Apex"&lt;/strong&gt;
&lt;/h3&gt;

&lt;p&gt;You are building a high-availability Payment Processing API. This isn't just a single container; it’s a distributed system with a focus on &lt;strong&gt;Security, Observability, and Network Isolation.&lt;/strong&gt;&lt;/p&gt;

&lt;h3&gt;
  
  
  &lt;strong&gt;The Technical Stack&lt;/strong&gt;
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;Orchestration:&lt;/strong&gt; AWS ECS (using EC2 Launch Type for deep OS control).&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Infrastructure:&lt;/strong&gt; Terraform (Modularized for Multi-environment).&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;OS:&lt;/strong&gt; Amazon Linux 2 (Heavily hardened).&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Networking:&lt;/strong&gt; VPC with Private Subnets, NAT Gateways, and Application Load Balancers (ALB).&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Observability:&lt;/strong&gt; Prometheus &amp;amp; Grafana (running as ECS Services).&lt;/li&gt;
&lt;/ul&gt;




&lt;h2&gt;
  
  
  &lt;strong&gt;Step-by-Step Implementation Plan&lt;/strong&gt;
&lt;/h2&gt;

&lt;h3&gt;
  
  
  &lt;strong&gt;Phase 1: The Network Backbone (Terraform &amp;amp; Networking)&lt;/strong&gt;
&lt;/h3&gt;

&lt;p&gt;In a bank, you never put a database or an application server in a public subnet.&lt;/p&gt;

&lt;ol&gt;
&lt;li&gt;
&lt;strong&gt;VPC Design:&lt;/strong&gt; Use Terraform to create a VPC with 6 subnets (2 Public for ALBs, 2 Private for ECS Tasks, 2 Isolated for RDS).&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Connectivity:&lt;/strong&gt; Set up &lt;strong&gt;VPC Endpoints&lt;/strong&gt; (Interface Endpoints) for ECS, ECR, and S3. This ensures your traffic stays within the AWS network and never touches the public internet—a major banking requirement.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Security Groups:&lt;/strong&gt; Define strict "Chain of Trust" rules. &lt;em&gt;Example:&lt;/em&gt; The Database SG only allows traffic from the ECS Task SG on port 5432.&lt;/li&gt;
&lt;/ol&gt;




&lt;h3&gt;
  
  
  &lt;strong&gt;Phase 2: The "Hardened" Host (Linux &amp;amp; Security)&lt;/strong&gt;
&lt;/h3&gt;

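&lt;p&gt;Since you are using the EC2 launch type, you are responsible for "Security in the Cloud" (the customer side of the AWS Shared Responsibility Model), which includes patching and hardening the host OS.&lt;/p&gt;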

&lt;ol&gt;
&lt;li&gt;
&lt;strong&gt;Auto Scaling Group (ASG):&lt;/strong&gt; Use Terraform to create an ASG that registers instances into your ECS Cluster.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Golden Image:&lt;/strong&gt; Use &lt;strong&gt;User Data scripts&lt;/strong&gt; in Terraform to harden the Linux OS on boot (e.g., updating packages, installing the CloudWatch agent, and setting &lt;code&gt;sysctl&lt;/code&gt; parameters for networking optimization).&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;IAM Roles:&lt;/strong&gt; Create an &lt;strong&gt;ECS Instance Role&lt;/strong&gt; and a separate &lt;strong&gt;ECS Task Execution Role&lt;/strong&gt;. This demonstrates knowledge of granular security.&lt;/li&gt;
&lt;/ol&gt;




&lt;h3&gt;
  
  
  &lt;strong&gt;Phase 3: Containerizing the Bank (ECS &amp;amp; Docker)&lt;/strong&gt;
&lt;/h3&gt;

&lt;ol&gt;
&lt;li&gt;
&lt;strong&gt;Task Definitions:&lt;/strong&gt; Write JSON or Terraform-based Task Definitions.
&lt;ul&gt;
&lt;li&gt;Include &lt;strong&gt;Log Configuration&lt;/strong&gt; (sending logs to CloudWatch).&lt;/li&gt;
&lt;li&gt;Set &lt;strong&gt;Resource Limits&lt;/strong&gt; (CPU/Memory) to prevent one container from crashing the whole EC2 host.&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;Service Discovery:&lt;/strong&gt; Use &lt;strong&gt;AWS Cloud Map&lt;/strong&gt; so your microservices can find each other via internal DNS (e.g., &lt;code&gt;payment.service.local&lt;/code&gt;).&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;Secrets Management:&lt;/strong&gt; Inject API keys and DB passwords directly from &lt;strong&gt;AWS Secrets Manager&lt;/strong&gt; into environment variables securely.&lt;/p&gt;&lt;/li&gt;
&lt;/ol&gt;




&lt;h3&gt;
  
  
  &lt;strong&gt;Phase 4: The "Eyes" of the System (Prometheus &amp;amp; Grafana)&lt;/strong&gt;
&lt;/h3&gt;

&lt;ol&gt;
&lt;li&gt;
&lt;strong&gt;Sidecar Pattern:&lt;/strong&gt; Deploy the &lt;strong&gt;AWS Distro for OpenTelemetry (ADOT)&lt;/strong&gt; as a sidecar container in your ECS tasks to scrape metrics.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Prometheus Service:&lt;/strong&gt; Run Prometheus as an ECS service with an EBS volume attached for data persistence.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Grafana Dashboards:&lt;/strong&gt; Connect Grafana to Prometheus. Build a dashboard that monitors:
&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;ECS Cluster Health:&lt;/strong&gt; CPU/Memory Reservation vs. Utilization.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Networking:&lt;/strong&gt; Active connections on the ALB.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Linux Stats:&lt;/strong&gt; Disk I/O and Network TX/RX on the EC2 hosts.&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;/ol&gt;




&lt;h3&gt;
  
  
  &lt;strong&gt;Phase 5: Day-to-Day Operations &amp;amp; Collaboration&lt;/strong&gt;
&lt;/h3&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Activity&lt;/th&gt;
&lt;th&gt;Senior Level Execution&lt;/th&gt;
&lt;th&gt;Collaboration Partner&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Capacity Planning&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Analyzing Grafana trends to decide if the ASG needs to scale out before a big sale.&lt;/td&gt;
&lt;td&gt;Finance/Product Team&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Troubleshooting&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Investigating "502 Bad Gateway" errors by checking ALB logs and using &lt;code&gt;tcpdump&lt;/code&gt; on the EC2 host.&lt;/td&gt;
&lt;td&gt;App Developers&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Security Audit&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Reviewing IAM policies to ensure no "wildcard" permissions exist.&lt;/td&gt;
&lt;td&gt;Security/Compliance&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Deployment&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Updating the Terraform code to roll out a new version of the app using &lt;strong&gt;Blue/Green Deployment&lt;/strong&gt;.&lt;/td&gt;
&lt;td&gt;DevOps/Release Team&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;




&lt;h2&gt;
  
  
  &lt;strong&gt;The "Interview Ready" Scenario: The Incident&lt;/strong&gt;
&lt;/h2&gt;

&lt;p&gt;&lt;strong&gt;The Problem:&lt;/strong&gt; The "Mortgage Processing" service is running slow. Customers are complaining.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;How the Senior Engineer Solves it:&lt;/strong&gt;&lt;/p&gt;

&lt;ol&gt;
&lt;li&gt;
&lt;strong&gt;Check Grafana:&lt;/strong&gt; You notice "CPU Steal" is high on the EC2 hosts.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Linux Deep Dive:&lt;/strong&gt; You SSH into the Bastion, then to the EC2 host. You run &lt;code&gt;top&lt;/code&gt; and &lt;code&gt;htop&lt;/code&gt; to see which process is hogging resources.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Discovery:&lt;/strong&gt; You find a "noisy neighbor"—a non-critical logging container is consuming more CPU than allowed.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;The Fix:&lt;/strong&gt; You don't just kill the container. You go to &lt;strong&gt;Terraform&lt;/strong&gt;, update the Task Definition to include strict &lt;code&gt;cpu_shares&lt;/code&gt;, and run &lt;code&gt;terraform apply&lt;/code&gt;.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;The Report:&lt;/strong&gt; You document how the resource limits prevented a total outage and suggest moving non-critical tasks to a separate ECS cluster.&lt;/li&gt;
&lt;/ol&gt;




&lt;h2&gt;
  
  
  &lt;strong&gt;Final Deliverable Checklist for Students&lt;/strong&gt;
&lt;/h2&gt;

&lt;ul&gt;
&lt;li&gt;[ ] &lt;strong&gt;Terraform Code:&lt;/strong&gt; Organized by &lt;code&gt;modules/&lt;/code&gt;, &lt;code&gt;environments/&lt;/code&gt;, and &lt;code&gt;main.tf&lt;/code&gt;.&lt;/li&gt;
&lt;li&gt;[ ] &lt;strong&gt;Network Diagram:&lt;/strong&gt; Showing the flow from User -&amp;gt; ALB -&amp;gt; ECS Task -&amp;gt; RDS.&lt;/li&gt;
&lt;li&gt;[ ] &lt;strong&gt;Monitoring Snapshot:&lt;/strong&gt; A screenshot of their Grafana dashboard during a "Load Test."&lt;/li&gt;
&lt;li&gt;[ ] &lt;strong&gt;CLI Competency:&lt;/strong&gt; Ability to explain the difference between &lt;code&gt;docker top&lt;/code&gt; and Linux &lt;code&gt;top&lt;/code&gt; in the context of ECS.&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;This project covers every base: &lt;strong&gt;Infrastructure as Code (Terraform)&lt;/strong&gt;, &lt;strong&gt;Cloud Architecture (AWS)&lt;/strong&gt;, &lt;strong&gt;Deep Systems Knowledge (Linux/Networking)&lt;/strong&gt;, and &lt;strong&gt;Operational Excellence (Prometheus/Grafana)&lt;/strong&gt;.&lt;/p&gt;

</description>
    </item>
    <item>
      <title>SRE Monitoring Lab 2</title>
      <dc:creator>Aisalkyn Aidarova</dc:creator>
      <pubDate>Wed, 13 May 2026 02:50:37 +0000</pubDate>
      <link>https://dev.to/jumptotech/sre-monitoring-lab-2-536</link>
      <guid>https://dev.to/jumptotech/sre-monitoring-lab-2-536</guid>
      <description>&lt;h1&gt;
  
  
  Prometheus Alerting + AlertManager + Grafana Alerts
&lt;/h1&gt;

&lt;p&gt;Now you will build a REAL SRE alerting system.&lt;/p&gt;

&lt;p&gt;In production, monitoring without alerts is useless.&lt;/p&gt;

&lt;p&gt;This lab teaches you:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;What alerts are&lt;/li&gt;
&lt;li&gt;How SRE engineers detect failures&lt;/li&gt;
&lt;li&gt;How Prometheus rules work&lt;/li&gt;
&lt;li&gt;How AlertManager works&lt;/li&gt;
&lt;li&gt;How alert routing works&lt;/li&gt;
&lt;li&gt;How to troubleshoot alerts&lt;/li&gt;
&lt;li&gt;How incidents are detected in production&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;You already have:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Prometheus&lt;/li&gt;
&lt;li&gt;Grafana&lt;/li&gt;
&lt;li&gt;Node Exporter&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;running on your EC2 Ubuntu machine.&lt;/p&gt;




&lt;h1&gt;
  
  
  WHAT SRE ENGINEERS MUST UNDERSTAND FIRST
&lt;/h1&gt;

&lt;h1&gt;
  
  
  Why Monitoring Alone Is Not Enough
&lt;/h1&gt;

&lt;p&gt;Dashboards are passive.&lt;/p&gt;

&lt;p&gt;SRE engineers cannot stare at dashboards 24/7.&lt;/p&gt;

&lt;p&gt;Instead:&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;System detects problems automatically
→ sends alerts
→ engineers respond
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  REAL PRODUCTION FLOW
&lt;/h1&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Node Exporter
      ↓
Prometheus scrapes metrics
      ↓
Alert Rules evaluate metrics
      ↓
AlertManager receives alerts
      ↓
Email / Slack / PagerDuty
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  WHAT IS ALERTMANAGER?
&lt;/h1&gt;

&lt;p&gt;Alertmanager handles alerts from Prometheus.&lt;/p&gt;

&lt;p&gt;It:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;groups alerts&lt;/li&gt;
&lt;li&gt;deduplicates alerts&lt;/li&gt;
&lt;li&gt;routes alerts&lt;/li&gt;
&lt;li&gt;sends notifications&lt;/li&gt;
&lt;/ul&gt;


&lt;h1&gt;
  
  
  WHAT SRE ENGINEERS MUST KNOW
&lt;/h1&gt;

&lt;p&gt;Prometheus:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;stores metrics&lt;/li&gt;
&lt;li&gt;evaluates rules&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;AlertManager:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;handles notifications&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Grafana:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;visualization&lt;/li&gt;
&lt;/ul&gt;


&lt;h1&gt;
  
  
  LAB GOAL
&lt;/h1&gt;

&lt;p&gt;You will create:&lt;/p&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Alert&lt;/th&gt;
&lt;th&gt;Trigger&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;High CPU&lt;/td&gt;
&lt;td&gt;CPU &amp;gt; 70%&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Node Down&lt;/td&gt;
&lt;td&gt;Exporter unavailable&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;High Memory&lt;/td&gt;
&lt;td&gt;Memory usage &amp;gt; 80%&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;p&gt;Then intentionally break the system and watch the alerts fire.&lt;/p&gt;


&lt;h1&gt;
  
  
  ARCHITECTURE
&lt;/h1&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Node Exporter
     ↓
Prometheus
     ↓
AlertManager
     ↓
Alert UI
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  STEP 1 — DOWNLOAD ALERTMANAGER
&lt;/h1&gt;

&lt;p&gt;SSH into EC2.&lt;/p&gt;

&lt;p&gt;Go to:&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;cd /tmp
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Download:&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;wget https://github.com/prometheus/alertmanager/releases/download/v0.28.1/alertmanager-0.28.1.linux-amd64.tar.gz
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  STEP 2 — EXTRACT FILE
&lt;/h1&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;tar -xvf alertmanager-0.28.1.linux-amd64.tar.gz
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  STEP 3 — MOVE BINARIES
&lt;/h1&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;sudo mv alertmanager-0.28.1.linux-amd64/alertmanager /usr/local/bin/

sudo mv alertmanager-0.28.1.linux-amd64/amtool /usr/local/bin/
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  WHAT IS AMTOOL?
&lt;/h1&gt;

&lt;p&gt;CLI tool for testing AlertManager.&lt;/p&gt;



&lt;h1&gt;
  
  
  STEP 4 — CREATE USER
&lt;/h1&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;sudo useradd -rs /bin/false alertmanager
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  STEP 5 — CREATE DIRECTORIES
&lt;/h1&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;sudo mkdir /etc/alertmanager

sudo mkdir /var/lib/alertmanager
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  STEP 6 — COPY CONFIG
&lt;/h1&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;sudo cp /tmp/alertmanager-0.28.1.linux-amd64/alertmanager.yml /etc/alertmanager/
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  STEP 7 — SET OWNERSHIP
&lt;/h1&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;sudo chown -R alertmanager:alertmanager /etc/alertmanager

sudo chown -R alertmanager:alertmanager /var/lib/alertmanager
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  STEP 8 — CREATE SYSTEMD SERVICE
&lt;/h1&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;sudo nano /etc/systemd/system/alertmanager.service
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Paste:&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;[Unit]
Description=AlertManager
After=network.target

[Service]
User=alertmanager
Group=alertmanager
Type=simple
ExecStart=/usr/local/bin/alertmanager \
  --config.file=/etc/alertmanager/alertmanager.yml \
  --storage.path=/var/lib/alertmanager

[Install]
WantedBy=multi-user.target
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Save file.&lt;/p&gt;



&lt;h1&gt;
  
  
  STEP 9 — START ALERTMANAGER
&lt;/h1&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;sudo systemctl daemon-reload

sudo systemctl enable alertmanager

sudo systemctl start alertmanager
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  STEP 10 — VERIFY SERVICE
&lt;/h1&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;sudo systemctl status alertmanager
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;You should see:&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;active (running)
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  STEP 11 — OPEN SECURITY GROUP
&lt;/h1&gt;

&lt;p&gt;Go to AWS Console.&lt;/p&gt;


&lt;h2&gt;
  
  
  EC2 → Security Groups
&lt;/h2&gt;

&lt;p&gt;Add:&lt;/p&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Port&lt;/th&gt;
&lt;th&gt;Purpose&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;9093&lt;/td&gt;
&lt;td&gt;AlertManager&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;p&gt;Source:&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;0.0.0.0/0
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  STEP 12 — VERIFY ALERTMANAGER UI
&lt;/h1&gt;

&lt;p&gt;Browser:&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;http://YOUR_PUBLIC_IP:9093
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;You should see AlertManager UI.&lt;/p&gt;


&lt;h1&gt;
  
  
  WHAT SRE ENGINEERS MUST KNOW
&lt;/h1&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Port&lt;/th&gt;
&lt;th&gt;Service&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;9090&lt;/td&gt;
&lt;td&gt;Prometheus&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;9093&lt;/td&gt;
&lt;td&gt;AlertManager&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;9100&lt;/td&gt;
&lt;td&gt;Node Exporter&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;3000&lt;/td&gt;
&lt;td&gt;Grafana&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;


&lt;h1&gt;
  
  
  STEP 13 — CONNECT PROMETHEUS TO ALERTMANAGER
&lt;/h1&gt;

&lt;p&gt;Edit Prometheus config:&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;sudo nano /etc/prometheus/prometheus.yml
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Add:&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;alerting:
  alertmanagers:
    - static_configs:
        - targets:
          - localhost:9093
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  STEP 14 — ADD RULE FILE
&lt;/h1&gt;

&lt;p&gt;In the same file, add:&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;rule_files:
  - "alert.rules.yml"
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;h1&gt;
  
  
  FULL STRUCTURE
&lt;/h1&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;global:
  scrape_interval: 15s

rule_files:
  - "alert.rules.yml"

alerting:
  alertmanagers:
    - static_configs:
        - targets:
          - localhost:9093
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  STEP 15 — CREATE ALERT RULE FILE
&lt;/h1&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;sudo nano /etc/prometheus/alert.rules.yml
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Paste:&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;groups:
  - name: node_alerts
    rules:
      - alert: NodeDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Node Exporter Down"

      - alert: HighCPUUsage
        expr: 100 - (avg by(instance)(rate(node_cpu_seconds_total{mode="idle"}[1m])) * 100) &amp;gt; 70
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "High CPU Usage"

      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 &amp;gt; 80
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "High Memory Usage"
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;

&lt;p&gt;Save file.&lt;/p&gt;


&lt;h1&gt;
  
  
  WHAT SRE ENGINEERS MUST UNDERSTAND
&lt;/h1&gt;
&lt;h1&gt;
  
  
  ALERT RULE STRUCTURE
&lt;/h1&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Field&lt;/th&gt;
&lt;th&gt;Meaning&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;alert&lt;/td&gt;
&lt;td&gt;Alert name&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;expr&lt;/td&gt;
&lt;td&gt;PromQL expression&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;for&lt;/td&gt;
&lt;td&gt;Duration&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;labels&lt;/td&gt;
&lt;td&gt;Severity&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;annotations&lt;/td&gt;
&lt;td&gt;Human-readable info&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;


&lt;h1&gt;
  
  
  WHAT IS PROMQL?
&lt;/h1&gt;

&lt;p&gt;Prometheus Query Language.&lt;/p&gt;

&lt;p&gt;Used for:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;dashboards&lt;/li&gt;
&lt;li&gt;alerts&lt;/li&gt;
&lt;li&gt;troubleshooting&lt;/li&gt;
&lt;/ul&gt;


&lt;h1&gt;
  
  
  CPU ALERT EXPLANATION
&lt;/h1&gt;

&lt;p&gt;This calculates:&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Non-idle CPU percentage
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Meaning:&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Actual CPU usage
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  STEP 16 — VERIFY CONFIGURATION
&lt;/h1&gt;

&lt;p&gt;Run:&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;promtool check config /etc/prometheus/prometheus.yml
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;You should see:&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;SUCCESS
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  STEP 17 — RESTART PROMETHEUS
&lt;/h1&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;sudo systemctl restart prometheus
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  STEP 18 — VERIFY ALERTS PAGE
&lt;/h1&gt;

&lt;p&gt;Browser:&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;http://YOUR_PUBLIC_IP:9090/alerts
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;You should see alert rules.&lt;/p&gt;

&lt;p&gt;State:&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Inactive
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  STEP 19 — TRIGGER CPU ALERT
&lt;/h1&gt;

&lt;p&gt;Install stress tool if not installed:&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;sudo apt install stress -y
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Generate load:&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;stress --cpu 4 --timeout 180
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  WHAT HAPPENS?
&lt;/h1&gt;

&lt;p&gt;CPU rises.&lt;/p&gt;

&lt;p&gt;Prometheus evaluates:&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;CPU &amp;gt; 70%
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;After 1 minute:&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Alert fires
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  STEP 20 — WATCH ALERT
&lt;/h1&gt;

&lt;p&gt;Go to:&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;http://YOUR_PUBLIC_IP:9090/alerts
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;You should see:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;FIRING
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  STEP 21 — CHECK ALERTMANAGER
&lt;/h1&gt;

&lt;p&gt;Open:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;http://YOUR_PUBLIC_IP:9093
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;You should see alert there.&lt;/p&gt;


&lt;h1&gt;
  
  
  WHAT SRE ENGINEERS MUST KNOW
&lt;/h1&gt;

&lt;p&gt;Prometheus:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;evaluates rules&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;AlertManager:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;receives active alerts&lt;/li&gt;
&lt;/ul&gt;


&lt;h1&gt;
  
  
  STEP 22 — TEST NODE DOWN ALERT
&lt;/h1&gt;

&lt;p&gt;Stop exporter:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```bash id="djlwmq"&lt;br&gt;
sudo systemctl stop node_exporter&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


Wait 1 minute.

Refresh:



```text id="k1kvsn"
/alerts
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;You should see:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```text id="fdjlwm"&lt;br&gt;
NodeDown FIRING&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


---

# THIS IS REAL SRE INCIDENT DETECTION

Production example:



```text id="jlwmx8"
Kubernetes node dies
→ exporter unreachable
→ NodeDown fires
→ PagerDuty alerts SRE
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  STEP 23 — TROUBLESHOOT AS SRE
&lt;/h1&gt;


&lt;h1&gt;
  
  
  CHECK SERVICE
&lt;/h1&gt;



&lt;p&gt;```bash id="jlwm0c"&lt;br&gt;
sudo systemctl status node_exporter&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


---

# CHECK PORT



```bash id="3y5r5v"
ss -tulnp | grep 9100
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  CHECK ENDPOINT
&lt;/h1&gt;



&lt;p&gt;```bash id="jlwm4y"&lt;br&gt;
curl localhost:9100/metrics&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


---

# CHECK PROMETHEUS TARGETS



```text id="xjlwm9"
http://YOUR_PUBLIC_IP:9090/targets
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  WHAT SRE ENGINEERS MUST MEMORIZE
&lt;/h1&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Command&lt;/th&gt;
&lt;th&gt;Purpose&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;systemctl&lt;/td&gt;
&lt;td&gt;services&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;journalctl&lt;/td&gt;
&lt;td&gt;logs&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;curl&lt;/td&gt;
&lt;td&gt;endpoint test&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;ss&lt;/td&gt;
&lt;td&gt;ports&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;top&lt;/td&gt;
&lt;td&gt;CPU&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;free -m&lt;/td&gt;
&lt;td&gt;RAM&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;df -h&lt;/td&gt;
&lt;td&gt;disk&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;


&lt;h1&gt;
  
  
  IMPORTANT SRE CONCEPTS
&lt;/h1&gt;
&lt;h1&gt;
  
  
  Alert Fatigue
&lt;/h1&gt;

&lt;p&gt;Too many alerts = engineers ignore alerts.&lt;/p&gt;

&lt;p&gt;SRE engineers carefully tune:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;thresholds&lt;/li&gt;
&lt;li&gt;durations&lt;/li&gt;
&lt;li&gt;severities&lt;/li&gt;
&lt;/ul&gt;


&lt;h1&gt;
  
  
  Severity Levels
&lt;/h1&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Severity&lt;/th&gt;
&lt;th&gt;Meaning&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;critical&lt;/td&gt;
&lt;td&gt;immediate action&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;warning&lt;/td&gt;
&lt;td&gt;monitor&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;info&lt;/td&gt;
&lt;td&gt;informational&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;


&lt;h1&gt;
  
  
  WHY "for: 1m"?
&lt;/h1&gt;

&lt;p&gt;Prevents temporary spikes causing alerts.&lt;/p&gt;

&lt;p&gt;Without it:&lt;br&gt;
&lt;/p&gt;

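&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Tiny CPU spike
→ false alert
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  WHAT REAL COMPANIES USE
&lt;/h1&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Tool&lt;/th&gt;
&lt;th&gt;Purpose&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;AlertManager&lt;/td&gt;
&lt;td&gt;routing&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;PagerDuty&lt;/td&gt;
&lt;td&gt;on-call&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Opsgenie&lt;/td&gt;
&lt;td&gt;escalation&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Slack&lt;/td&gt;
&lt;td&gt;notifications&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Email&lt;/td&gt;
&lt;td&gt;alerts&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;


&lt;h1&gt;
  
  
  REAL INTERVIEW QUESTIONS
&lt;/h1&gt;


&lt;h1&gt;
  
  
  What is AlertManager?
&lt;/h1&gt;

&lt;p&gt;Handles alerts from Prometheus.&lt;/p&gt;


&lt;h1&gt;
  
  
  Difference Between Prometheus and AlertManager?
&lt;/h1&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Prometheus&lt;/th&gt;
&lt;th&gt;AlertManager&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;stores metrics&lt;/td&gt;
&lt;td&gt;handles alerts&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;evaluates rules&lt;/td&gt;
&lt;td&gt;routes notifications&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;


&lt;h1&gt;
  
  
  What is PromQL?
&lt;/h1&gt;

&lt;p&gt;Prometheus Query Language.&lt;/p&gt;


&lt;h1&gt;
  
  
  Why use "for" in alerts?
&lt;/h1&gt;

&lt;p&gt;Avoid false positives.&lt;/p&gt;


&lt;h1&gt;
  
  
  What happens if Node Exporter stops?
&lt;/h1&gt;

&lt;p&gt;Prometheus target becomes DOWN and alert fires.&lt;/p&gt;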

</description>
    </item>
    <item>
      <title>SRE Monitoring Lab 1</title>
      <dc:creator>Aisalkyn Aidarova</dc:creator>
      <pubDate>Wed, 13 May 2026 02:49:24 +0000</pubDate>
      <link>https://dev.to/jumptotech/sre-monitoring-lab-1-2db1</link>
      <guid>https://dev.to/jumptotech/sre-monitoring-lab-1-2db1</guid>
      <description>&lt;h1&gt;
  
  
  Install Node Exporter + Connect Prometheus + Build Grafana Dashboard on AWS EC2
&lt;/h1&gt;

&lt;p&gt;This is your FIRST real observability/SRE production-style lab.&lt;/p&gt;

&lt;p&gt;You already have:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Prometheus&lt;/li&gt;
&lt;li&gt;Grafana&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;installed on one Ubuntu EC2 machine.&lt;/p&gt;

&lt;p&gt;Now you will:&lt;/p&gt;

&lt;ol&gt;
&lt;li&gt;Install Node Exporter&lt;/li&gt;
&lt;li&gt;Configure Prometheus scraping&lt;/li&gt;
&lt;li&gt;Connect Grafana&lt;/li&gt;
&lt;li&gt;Import production dashboard&lt;/li&gt;
&lt;li&gt;Generate load&lt;/li&gt;
&lt;li&gt;Analyze metrics&lt;/li&gt;
&lt;li&gt;Troubleshoot failures like real SRE engineers&lt;/li&gt;
&lt;/ol&gt;




&lt;h1&gt;
  
  
  WHAT SRE ENGINEERS MUST UNDERSTAND FIRST
&lt;/h1&gt;




&lt;h1&gt;
  
  
  What Is Monitoring?
&lt;/h1&gt;

&lt;p&gt;Monitoring means:&lt;br&gt;
&lt;/p&gt;

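&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Watching infrastructure and applications continuously.
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Example:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Is CPU high?&lt;/li&gt;
&lt;li&gt;Is memory full?&lt;/li&gt;
&lt;li&gt;Is disk almost full?&lt;/li&gt;
&lt;li&gt;Is network overloaded?&lt;/li&gt;
&lt;li&gt;Is server healthy?&lt;/li&gt;
&lt;li&gt;Is service down?&lt;/li&gt;
&lt;/ul&gt;


&lt;h1&gt;
  
  
  What Is Observability?
&lt;/h1&gt;

&lt;p&gt;Observability means:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Understanding WHY something failed.
&lt;/code&gt;&lt;/pre&gt;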

&lt;/div&gt;


&lt;p&gt;SRE engineers use:&lt;/p&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Tool&lt;/th&gt;
&lt;th&gt;Purpose&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;Metrics&lt;/td&gt;
&lt;td&gt;Numbers over time&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Logs&lt;/td&gt;
&lt;td&gt;Events/messages&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Traces&lt;/td&gt;
&lt;td&gt;Request flow&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Dashboards&lt;/td&gt;
&lt;td&gt;Visualization&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Alerts&lt;/td&gt;
&lt;td&gt;Notifications&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;


&lt;h1&gt;
  
  
  HOW THIS LAB WORKS
&lt;/h1&gt;



&lt;p&gt;```text id="2h8l1u"&lt;br&gt;
Node Exporter&lt;br&gt;
    ↓&lt;br&gt;
Prometheus scrapes metrics&lt;br&gt;
    ↓&lt;br&gt;
Prometheus stores metrics&lt;br&gt;
    ↓&lt;br&gt;
Grafana visualizes metrics&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


---

# WHAT IS NODE EXPORTER?

Node Exporter is a Linux metrics collector.

It exposes metrics on:



```text id="w97dhf"
PORT 9100
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Example metrics:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```text id="09f9ii"&lt;br&gt;
CPU usage&lt;br&gt;
Memory usage&lt;br&gt;
Disk usage&lt;br&gt;
Filesystem&lt;br&gt;
Processes&lt;br&gt;
Load average&lt;br&gt;
Network traffic&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


---

# WHAT SRE ENGINEERS MUST KNOW

## Prometheus is PULL based

Meaning:



```text id="ic4qxu"
Prometheus goes and asks for metrics.
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;NOT:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```text id="u0wyoh"&lt;br&gt;
Server pushes metrics.&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


Prometheus periodically scrapes:



```text id="vv4ybh"
http://target:9100/metrics
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  LAB ARCHITECTURE
&lt;/h1&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;EC2 Ubuntu Instance
│
├── Prometheus :9090
├── Grafana :3000
└── Node Exporter :9100
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  STEP 1 — LOGIN TO AWS
&lt;/h1&gt;

&lt;p&gt;Go to:&lt;/p&gt;

&lt;p&gt;&lt;a href="https://console.aws.amazon.com" rel="noopener noreferrer"&gt;AWS Console&lt;/a&gt;&lt;/p&gt;


&lt;h1&gt;
  
  
  STEP 2 — OPEN EC2
&lt;/h1&gt;

&lt;p&gt;Click:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Services
→ EC2
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  STEP 3 — FIND YOUR INSTANCE
&lt;/h1&gt;

&lt;p&gt;Click:&lt;br&gt;
&lt;/p&gt;

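&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Instances
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Find your Ubuntu server.&lt;/p&gt;

&lt;p&gt;You should see:&lt;/p&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Column&lt;/th&gt;
&lt;th&gt;Example&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;Instance State&lt;/td&gt;
&lt;td&gt;Running&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Public IPv4&lt;/td&gt;
&lt;td&gt;3.xx.xx.xx&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Name&lt;/td&gt;
&lt;td&gt;monitoring-server&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;


&lt;h1&gt;
  
  
  STEP 4 — CONNECT TO EC2
&lt;/h1&gt;

&lt;p&gt;Select instance.&lt;/p&gt;

&lt;p&gt;Click:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Connect
&lt;/code&gt;&lt;/pre&gt;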

&lt;/div&gt;


&lt;p&gt;Choose:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```text id="gzwu0d"&lt;br&gt;
EC2 Instance Connect&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


Click:



```text id="pw3kpa"
Connect
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Terminal opens.&lt;/p&gt;


&lt;h1&gt;
  
  
  STEP 5 — VERIFY PROMETHEUS
&lt;/h1&gt;

&lt;p&gt;Run:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```bash id="f1jlsm"&lt;br&gt;
sudo systemctl status prometheus&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


You should see:



```text id="q0ovv5"
active (running)
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  STEP 6 — VERIFY GRAFANA
&lt;/h1&gt;

&lt;p&gt;Run:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```bash id="0v8cwe"&lt;br&gt;
sudo systemctl status grafana-server&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


You should see:



```text id="z1r6cc"
active (running)
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  STEP 7 — INSTALL NODE EXPORTER
&lt;/h1&gt;


&lt;h2&gt;
  
  
  Go to /tmp
&lt;/h2&gt;



&lt;p&gt;```bash id="jupzwy"&lt;br&gt;
cd /tmp&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


---

## Download Node Exporter



```bash id="u1t3n6"
wget https://github.com/prometheus/node_exporter/releases/download/v1.9.1/node_exporter-1.9.1.linux-amd64.tar.gz
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;SRE understanding:&lt;/p&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Part&lt;/th&gt;
&lt;th&gt;Meaning&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;wget&lt;/td&gt;
&lt;td&gt;download file&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;tar.gz&lt;/td&gt;
&lt;td&gt;compressed archive&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;linux-amd64&lt;/td&gt;
&lt;td&gt;Linux 64-bit version&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;


&lt;h1&gt;
  
  
  STEP 8 — EXTRACT FILE
&lt;/h1&gt;



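&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;tar -xvf node_exporter-1.9.1.linux-amd64.tar.gz
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Meaning:&lt;/p&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Flag&lt;/th&gt;
&lt;th&gt;Meaning&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;x&lt;/td&gt;
&lt;td&gt;extract&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;v&lt;/td&gt;
&lt;td&gt;verbose&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;f&lt;/td&gt;
&lt;td&gt;file&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;


&lt;h1&gt;
  
  
  STEP 9 — MOVE BINARY
&lt;/h1&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;sudo mv node_exporter-1.9.1.linux-amd64/node_exporter /usr/local/bin/
&lt;/code&gt;&lt;/pre&gt;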

&lt;/div&gt;


&lt;p&gt;SRE understanding:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```text id="11k31t"&lt;br&gt;
/usr/local/bin&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


stores executables.

---

# STEP 10 — CREATE SERVICE USER



```bash id="euhlkj"
sudo useradd -rs /bin/false node_exporter
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  WHY?
&lt;/h1&gt;

&lt;p&gt;SRE engineers NEVER run services as root unless required.&lt;/p&gt;

&lt;p&gt;Security best practice:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```text id="8c69v7"&lt;br&gt;
Least privilege&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


---

# STEP 11 — CREATE SYSTEMD SERVICE

Run:



```bash id="o7pyvf"
sudo nano /etc/systemd/system/node_exporter.service
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Paste:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;[Unit]
Description=Node Exporter
After=network.target

[Service]
User=node_exporter
Group=node_exporter
Type=simple
ExecStart=/usr/local/bin/node_exporter

[Install]
WantedBy=multi-user.target
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  WHAT SRE ENGINEERS MUST KNOW
&lt;/h1&gt;

&lt;h2&gt;
  
  
  systemd
&lt;/h2&gt;

&lt;p&gt;Linux service manager.&lt;/p&gt;

&lt;p&gt;Controls:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;starting services&lt;/li&gt;
&lt;li&gt;stopping services&lt;/li&gt;
&lt;li&gt;restarting services&lt;/li&gt;
&lt;li&gt;logs&lt;/li&gt;
&lt;li&gt;boot startup&lt;/li&gt;
&lt;/ul&gt;


&lt;h1&gt;
  
  
  SAVE FILE
&lt;/h1&gt;

&lt;p&gt;Press:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;CTRL + X
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Then:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```text id="l0t7zv"&lt;br&gt;
Y&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


Then:



```text id="v5zzdn"
ENTER
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  STEP 12 — START NODE EXPORTER
&lt;/h1&gt;

&lt;p&gt;Run:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```bash id="68l7yd"&lt;br&gt;
sudo systemctl daemon-reload&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


Meaning:



```text id="g10qeo"
Reload systemd configs.
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;




&lt;p&gt;Enable startup:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```bash id="5qihkg"&lt;br&gt;
sudo systemctl enable node_exporter&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


Start service:



```bash id="f6h6gw"
sudo systemctl start node_exporter
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  STEP 13 — VERIFY SERVICE
&lt;/h1&gt;



&lt;p&gt;```bash id="31d9g8"&lt;br&gt;
sudo systemctl status node_exporter&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


You should see:



```text id="2ww4vc"
active (running)
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  STEP 14 — CHECK PORT
&lt;/h1&gt;

&lt;p&gt;Run:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```bash id="h44jl6"&lt;br&gt;
ss -tulnp | grep 9100&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


You should see:



```text id="mty7sf"
LISTEN
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  SRE UNDERSTANDING
&lt;/h1&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Command&lt;/th&gt;
&lt;th&gt;Purpose&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;ss&lt;/td&gt;
&lt;td&gt;socket statistics&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;-t&lt;/td&gt;
&lt;td&gt;TCP&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;-u&lt;/td&gt;
&lt;td&gt;UDP&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;-l&lt;/td&gt;
&lt;td&gt;listening&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;-n&lt;/td&gt;
&lt;td&gt;numeric&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;-p&lt;/td&gt;
&lt;td&gt;process&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;


&lt;h1&gt;
  
  
  STEP 15 — OPEN SECURITY GROUP
&lt;/h1&gt;

&lt;p&gt;Go back to AWS.&lt;/p&gt;


&lt;h2&gt;
  
  
  Click:
&lt;/h2&gt;



&lt;p&gt;```text id="x3xv11"&lt;br&gt;
EC2&lt;br&gt;
→ Instances&lt;br&gt;
→ Select instance&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


---

## Bottom Tab

Click:



```text id="61jmtq"
Security
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h2&gt;
  
  
  Click Security Group
&lt;/h2&gt;

&lt;p&gt;Under:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```text id="c4g9nr"&lt;br&gt;
Security groups&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


click the SG.

---

# STEP 16 — EDIT INBOUND RULES

Click:



```text id="m8n6d7"
Edit inbound rules
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Add:&lt;/p&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Type&lt;/th&gt;
&lt;th&gt;Port&lt;/th&gt;
&lt;th&gt;Source&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;Custom TCP&lt;/td&gt;
&lt;td&gt;3000&lt;/td&gt;
&lt;td&gt;0.0.0.0/0&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Custom TCP&lt;/td&gt;
&lt;td&gt;9090&lt;/td&gt;
&lt;td&gt;0.0.0.0/0&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Custom TCP&lt;/td&gt;
&lt;td&gt;9100&lt;/td&gt;
&lt;td&gt;0.0.0.0/0&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;p&gt;Click:&lt;br&gt;
&lt;/p&gt;

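&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Save rules
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  WHAT SRE ENGINEERS MUST KNOW
&lt;/h1&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Port&lt;/th&gt;
&lt;th&gt;Service&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;3000&lt;/td&gt;
&lt;td&gt;Grafana&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;9090&lt;/td&gt;
&lt;td&gt;Prometheus&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;9100&lt;/td&gt;
&lt;td&gt;Node Exporter&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;


&lt;h1&gt;
  
  
  STEP 17 — TEST NODE EXPORTER
&lt;/h1&gt;

&lt;p&gt;Browser:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;http://YOUR_PUBLIC_IP:9100/metrics
&lt;/code&gt;&lt;/pre&gt;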

&lt;/div&gt;


&lt;p&gt;You should see THOUSANDS of metrics.&lt;/p&gt;


&lt;h1&gt;
  
  
  IMPORTANT SRE CONCEPT
&lt;/h1&gt;

&lt;p&gt;Metrics format:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```text id="dcew0u"&lt;br&gt;
metric_name value&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


Example:



```text id="3b1fmb"
node_cpu_seconds_total
node_memory_MemAvailable_bytes
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  STEP 18 — CONFIGURE PROMETHEUS
&lt;/h1&gt;

&lt;p&gt;Terminal:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```bash id="2d3a4m"&lt;br&gt;
sudo nano /etc/prometheus/prometheus.yml&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


Find:



```yaml id="m1cljx"
scrape_configs:
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Add:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;  - job_name: "node_exporter"
    static_configs:
      - targets: ["localhost:9100"]
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;




&lt;h1&gt;
  
  
  WHAT SRE ENGINEERS MUST KNOW
&lt;/h1&gt;

&lt;h2&gt;
  
  
  scrape_configs
&lt;/h2&gt;

&lt;p&gt;Defines WHAT Prometheus monitors.&lt;/p&gt;




&lt;h2&gt;
  
  
  targets
&lt;/h2&gt;

&lt;p&gt;Defines WHERE metrics exist.&lt;/p&gt;




&lt;h1&gt;
  
  
  STEP 19 — RESTART PROMETHEUS
&lt;/h1&gt;



&lt;p&gt;```bash id="zv1rxn"&lt;br&gt;
sudo systemctl restart prometheus&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


Verify:



```bash id="2t6x4x"
sudo systemctl status prometheus
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  STEP 20 — CHECK TARGETS
&lt;/h1&gt;

&lt;p&gt;Browser:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;http://YOUR_PUBLIC_IP:9090/targets
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;You should see:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;node_exporter UP
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  WHAT DOES UP MEAN?
&lt;/h1&gt;

&lt;p&gt;Prometheus successfully scraped metrics.&lt;/p&gt;


&lt;h1&gt;
  
  
  IF DOWN?
&lt;/h1&gt;

&lt;p&gt;Possible reasons:&lt;/p&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Problem&lt;/th&gt;
&lt;th&gt;Meaning&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;Wrong IP&lt;/td&gt;
&lt;td&gt;Incorrect target&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Firewall&lt;/td&gt;
&lt;td&gt;Port blocked&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;SG&lt;/td&gt;
&lt;td&gt;AWS blocked&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Service stopped&lt;/td&gt;
&lt;td&gt;Exporter down&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Wrong port&lt;/td&gt;
&lt;td&gt;Misconfiguration&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;


&lt;h1&gt;
  
  
  STEP 21 — OPEN GRAFANA
&lt;/h1&gt;

&lt;p&gt;Browser:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;http://YOUR_PUBLIC_IP:3000
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Login:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;admin
admin
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Change password.&lt;/p&gt;


&lt;h1&gt;
  
  
  STEP 22 — ADD PROMETHEUS DATASOURCE
&lt;/h1&gt;

&lt;p&gt;Left menu:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```text id="ez94oq"&lt;br&gt;
Connections&lt;br&gt;
→ Data Sources&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


Click:



```text id="3bupw0"
Add data source
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Choose:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```text id="1zj5df"&lt;br&gt;
Prometheus&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


---

# STEP 23 — CONFIGURE DATASOURCE

URL:



```text id="th62k4"
http://localhost:9090
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Scroll down.&lt;/p&gt;

&lt;p&gt;Click:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```text id="u9jx3o"&lt;br&gt;
Save &amp;amp; Test&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


You should see:



```text id="a7n1s8"
Data source is working
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  STEP 24 — IMPORT DASHBOARD
&lt;/h1&gt;

&lt;p&gt;Left menu:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```text id="d8g2gs"&lt;br&gt;
Dashboards&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


Click:



```text id="7vvjlwm"
Import
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Dashboard ID:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```text id="vh7q0h"&lt;br&gt;
1860&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


Click:



```text id="fhp88p"
Load
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Choose datasource.&lt;/p&gt;

&lt;p&gt;Click:&lt;br&gt;
&lt;/p&gt;

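&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Import
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  NOW YOU WILL SEE
&lt;/h1&gt;

&lt;p&gt;Real-time:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;CPU&lt;/li&gt;
&lt;li&gt;Memory&lt;/li&gt;
&lt;li&gt;Disk&lt;/li&gt;
&lt;li&gt;Filesystem&lt;/li&gt;
&lt;li&gt;Network&lt;/li&gt;
&lt;li&gt;Load Average&lt;/li&gt;
&lt;li&gt;Processes&lt;/li&gt;
&lt;/ul&gt;


&lt;h1&gt;
  
  
  STEP 25 — GENERATE CPU LOAD
&lt;/h1&gt;

&lt;p&gt;Now act like an SRE engineer.&lt;/p&gt;

&lt;p&gt;Install stress tool:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;sudo apt install stress -y
&lt;/code&gt;&lt;/pre&gt;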

&lt;/div&gt;


&lt;p&gt;Generate load:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```bash id="dvv1tb"&lt;br&gt;
stress --cpu 2 --timeout 60&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


---

# WHAT HAPPENS?



```text id="1ux1a8"
CPU usage increases.
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;Go to Grafana dashboard.&lt;/p&gt;

&lt;p&gt;Watch graphs move LIVE.&lt;/p&gt;

&lt;p&gt;THIS is real observability.&lt;/p&gt;


&lt;h1&gt;
  
  
  WHAT SRE ENGINEERS ANALYZE
&lt;/h1&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Metric&lt;/th&gt;
&lt;th&gt;Meaning&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;CPU&lt;/td&gt;
&lt;td&gt;Processing usage&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Memory&lt;/td&gt;
&lt;td&gt;RAM consumption&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Disk IO&lt;/td&gt;
&lt;td&gt;Read/write operations&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Network&lt;/td&gt;
&lt;td&gt;Traffic&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Load Average&lt;/td&gt;
&lt;td&gt;System pressure&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Filesystem&lt;/td&gt;
&lt;td&gt;Disk usage&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;


&lt;h1&gt;
  
  
  STEP 26 — BREAK THINGS
&lt;/h1&gt;

&lt;p&gt;Stop exporter:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```bash id="yu8y1g"&lt;br&gt;
sudo systemctl stop node_exporter&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


Go to:



```text id="3o8l4f"
http://YOUR_PUBLIC_IP:9090/targets
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;You should see:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```text id="n72nci"&lt;br&gt;
DOWN&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


---

# THIS IS REAL SRE TROUBLESHOOTING

SRE engineers always ask:



```text id="o4njzi"
Why is target DOWN?
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  TROUBLESHOOTING FLOW
&lt;/h1&gt;
&lt;h2&gt;
  
  
  Step 1
&lt;/h2&gt;

&lt;p&gt;Check service:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```bash id="xen6ko"&lt;br&gt;
sudo systemctl status node_exporter&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


---

## Step 2

Check listening port:



```bash id="s4rfp0"
ss -tulnp | grep 9100
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h2&gt;
  
  
  Step 3
&lt;/h2&gt;

&lt;p&gt;Check metrics endpoint:&lt;br&gt;
&lt;/p&gt;

&lt;p&gt;```bash id="8bjv4r"&lt;br&gt;
curl localhost:9100/metrics&lt;/p&gt;
&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;


---

## Step 4

Check Prometheus logs:



```bash id="eotnlc"
journalctl -u prometheus -f
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h1&gt;
  
  
  WHAT SRE ENGINEERS MUST MEMORIZE
&lt;/h1&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Tool&lt;/th&gt;
&lt;th&gt;Purpose&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;systemctl&lt;/td&gt;
&lt;td&gt;Service management&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;ss&lt;/td&gt;
&lt;td&gt;Check ports&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;curl&lt;/td&gt;
&lt;td&gt;Test endpoint&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;journalctl&lt;/td&gt;
&lt;td&gt;Logs&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;top&lt;/td&gt;
&lt;td&gt;CPU&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;free -m&lt;/td&gt;
&lt;td&gt;Memory&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;df -h&lt;/td&gt;
&lt;td&gt;Disk&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;


&lt;h1&gt;
  
  
  MOST IMPORTANT INTERVIEW QUESTIONS
&lt;/h1&gt;


&lt;h1&gt;
  
  
  What is Node Exporter?
&lt;/h1&gt;

&lt;p&gt;Exports Linux system metrics for Prometheus.&lt;/p&gt;


&lt;h1&gt;
  
  
  Why use exporters?
&lt;/h1&gt;

&lt;p&gt;Prometheus cannot directly understand Linux metrics.&lt;/p&gt;


&lt;h1&gt;
  
  
  Why is Prometheus pull based?
&lt;/h1&gt;

&lt;p&gt;Prometheus periodically scrapes targets.&lt;/p&gt;

&lt;p&gt;Advantages:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;centralized&lt;/li&gt;
&lt;li&gt;easier troubleshooting&lt;/li&gt;
&lt;li&gt;better reliability&lt;/li&gt;
&lt;li&gt;service discovery support&lt;/li&gt;
&lt;/ul&gt;


&lt;h1&gt;
  
  
  Difference Between Grafana and Prometheus?
&lt;/h1&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Tool&lt;/th&gt;
&lt;th&gt;Purpose&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;Prometheus&lt;/td&gt;
&lt;td&gt;stores metrics&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Grafana&lt;/td&gt;
&lt;td&gt;visualizes metrics&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;


&lt;h1&gt;
  
  
  What is a target?
&lt;/h1&gt;

&lt;p&gt;A monitored endpoint.&lt;/p&gt;

&lt;p&gt;Example:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;localhost:9100
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;

</description>
    </item>
  </channel>
</rss>
