K8s Monitoring

#monitoring #automation #kubernetes #devops

#!/bin/bash
# =============================================================
# Cronjob_monitorpod.sh
# Manually trigger a CronJob, track the newest pod for its Job,
# collect pod logs + periodic CPU/MEM. Additionally, whenever a
# log line contains "/app/ShieldCache", we take an immediate
# CPU/MEM snapshot and append it to the SAME RESOURCE_CSV.
# =============================================================

# Strict mode: stop on unhandled errors, on unset variables, and when any
# stage of a pipeline fails.
set -euo pipefail

# --------- EDIT THESE ----------
NAMESPACE="onprem"          # namespace where the CronJob lives
CRONJOB_NAME="teams-chat"   # CronJob to trigger
JOB_NAME_BASE="teams-chat"  # base Job name; script adds timestamp
# -------------------------------

# Paths: everything this script writes lives next to the script itself.
SCRIPT="$(realpath -s "$0")"
SCRIPTPATH="$(dirname "$SCRIPT")"

# Timestamps: one safe for K8s names (hyphens), one for files (underscores OK).
# Both are derived from a single clock reading so the Job name and the file
# names can never end up one second apart.
TIMESTAMP_K8S="$(date +'%Y%m%d-%H%M%S')"    # safe for k8s object names
TIMESTAMP_FILE="${TIMESTAMP_K8S/-/_}"       # file-friendly

# RFC1123-safe Job name: lower-case, underscores mapped to hyphens.
JOB_NAME="$(echo "${JOB_NAME_BASE}-${TIMESTAMP_K8S}" | tr '[:upper:]_' '[:lower:]-')"

# Output directories: Logs/ holds the per-run files, Lastlog/ the copy of the
# most recent run log.
LOG_DIR="$SCRIPTPATH/Logs"
LASTLOG_DIR="$SCRIPTPATH/Lastlog"

# Lastlog copies are best-effort everywhere in this script, so a failure to
# create that directory is tolerated.
mkdir -p "$LASTLOG_DIR" >/dev/null 2>&1 || true

# The run log is mandatory: fail here with a clear message instead of letting
# the first `tee -a "$LOG_PATH"` abort the script later.
if ! mkdir -p "$LOG_DIR"; then
  echo "[ERROR] Cannot create log directory '$LOG_DIR'." >&2
  exit 1
fi

# Files (only these two will be written)
LOG_PATH="$LOG_DIR/${TIMESTAMP_FILE}-${JOB_NAME_BASE}.log"
RESOURCE_CSV="$LOG_DIR/${TIMESTAMP_FILE}-${JOB_NAME_BASE}_pod_resources.csv"

# Keyword checks (customize)
ERROR_KEYWORDS=("java.io.EOFException" "NoSuchMethodError" "CrashLoopBackOff" "Error")
SUCCESS_KEYWORDS=("TGM Policy Ended" "Job completed" "Completed successfully")

# Print the current local time as "YYYY-MM-DD HH:MM:SS"; used as the prefix
# of every log line.
ts() {
  local stamp_format='%Y-%m-%d %H:%M:%S'
  date "+${stamp_format}"
}

# --- Announce the run ---
# The log location goes to the terminal only; the parameter summary is the
# first line written to the run log.
echo "$(ts) [INFO] Logs: $LOG_PATH"
echo "$(ts) [INFO] Namespace: $NAMESPACE | CronJob: $CRONJOB_NAME | Job: $JOB_NAME" | tee -a "$LOG_PATH"

# --- Validate namespace ---
# `2>&1 >/dev/null` keeps kubectl's stderr and drops its stdout, so the real
# reason for a failure is logged instead of being thrown away.
if ! ns_err="$(kubectl get namespace "$NAMESPACE" 2>&1 >/dev/null)"; then
  echo "$(ts) [ERROR] Namespace '$NAMESPACE' not found." | tee -a "$LOG_PATH"
  if [[ -n "$ns_err" ]]; then
    echo "$(ts) [ERROR] kubectl: $ns_err" | tee -a "$LOG_PATH"
  fi
  # Best-effort copy of the run log to the "last run" location.
  cp "$LOG_PATH" "$LASTLOG_DIR/lastpodlog.txt" 2>/dev/null || true
  exit 1
fi

# --- Create Job from CronJob ---
echo "$(ts) [INFO] Creating Job '$JOB_NAME' from CronJob '$CRONJOB_NAME'..." | tee -a "$LOG_PATH"
if ! create_err="$(kubectl -n "$NAMESPACE" create job --from=cronjob/"$CRONJOB_NAME" "$JOB_NAME" 2>&1 >/dev/null)"; then
  echo "$(ts) [ERROR] Failed to create Job from CronJob (name collision? RBAC?)." | tee -a "$LOG_PATH"
  # Record kubectl's own diagnostic so the cause does not have to be guessed.
  if [[ -n "$create_err" ]]; then
    echo "$(ts) [ERROR] kubectl: $create_err" | tee -a "$LOG_PATH"
  fi
  cp "$LOG_PATH" "$LASTLOG_DIR/lastpodlog.txt" 2>/dev/null || true
  exit 1
fi

# --- Find newest pod for this Job ---
# Print the name of the most recently created pod labelled
# job-name=$JOB_NAME in $NAMESPACE; print nothing when no such pod is listed
# or when the kubectl query fails.
# Globals: NAMESPACE, JOB_NAME (read)
# Returns: always 0. The caller polls in a retry loop under
#          `set -e -o pipefail`; a non-zero status from a failed kubectl call
#          would abort the whole script instead of triggering a retry.
get_newest_pod() {
  kubectl -n "$NAMESPACE" get pods -l "job-name=${JOB_NAME}" \
    --sort-by=.metadata.creationTimestamp \
    -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null \
    | tail -n 1 || true
}

# Poll every 10 seconds until the Job has produced a pod, then record the pod
# name and the node it was scheduled on.
echo "$(ts) [INFO] Waiting for pod for Job '$JOB_NAME'..." | tee -a "$LOG_PATH"
POD_NAME=""
until [[ -n "${POD_NAME}" ]]; do
  POD_NAME="$(get_newest_pod)"
  if [[ -z "$POD_NAME" ]]; then
    # Progress message goes to the terminal only, not to the run log.
    echo "$(ts) [INFO] Not yet created… retry in 10s"
    sleep 10
  fi
done

# The node lookup is informational; an empty result is reported as "unknown".
NODE_NAME="$(kubectl -n "$NAMESPACE" get pod "$POD_NAME" -o jsonpath='{.spec.nodeName}' 2>/dev/null || true)"
echo "$(ts) [INFO] Pod: $POD_NAME | Node: ${NODE_NAME:-unknown}" | tee -a "$LOG_PATH"

# --- Prepare CSV (same file for periodic samples AND ShieldCache hits) ---
echo "timestamp,pod,cpu,mem" > "$RESOURCE_CSV"

# --- Background watcher: on /app/ShieldCache lines, append a usage row to RESOURCE_CSV ---
# Streams the pod log in the background. For every line containing
# "/app/ShieldCache" it takes one CPU/MEM sample and appends it to the CSV in
# the same 4-column layout, plus a note in the run log.
kubectl -n "$NAMESPACE" logs -f "pod/${POD_NAME}" --timestamps=true 2>/dev/null | \
while IFS= read -r line; do
  # Literal substring match; no need to fork grep for every log line.
  [[ "$line" == *"/app/ShieldCache"* ]] || continue
  now="$(ts)"
  # `|| true`: this loop runs in a subshell that inherits errexit and pipefail,
  # so a failing `kubectl top` would otherwise end the watcher on the first
  # unavailable sample. An empty sample is written as ",," / "N/A" instead.
  usage="$(kubectl -n "$NAMESPACE" top pod "$POD_NAME" --no-headers 2>/dev/null | awk '{print $2","$3}' || true)"
  echo "$now,$POD_NAME,${usage:-,}" >> "$RESOURCE_CSV"
  echo "$(ts) [INFO] ShieldCache activity detected; sampled usage: ${usage:-N/A}" >> "$LOG_PATH"
done &
# PID of the last pipeline stage (the read loop); used by the exit handler.
CACHE_MON_PID=$!

# Exit handler: stop the background ShieldCache watcher and keep a copy of
# this run's log as Lastlog/lastpodlog.txt. Both steps are best-effort and
# never change the script's exit status.
cleanup() {
  { kill "$CACHE_MON_PID"; } >/dev/null 2>&1 || true
  cp "$LOG_PATH" "$LASTLOG_DIR/lastpodlog.txt" 2>/dev/null || true
}
trap 'cleanup' EXIT

# --- Main monitoring loop: periodic CPU/MEM and phase ---
# Each pass takes one CPU/MEM sample, then reads the pod phase and either
# sleeps and polls again, leaves the loop (pod finished), or exits (pod failed).
echo "$(ts) [INFO] Monitoring pod resource usage… (to $RESOURCE_CSV)" | tee -a "$LOG_PATH"

while true; do
  # Periodic sample (every loop). `kubectl top` can fail (for example when no
  # metrics sample exists for the pod yet); `|| true` stops errexit/pipefail
  # from aborting the whole script on a missing sample.
  line="$(kubectl -n "$NAMESPACE" top pod "$POD_NAME" --no-headers 2>/dev/null | awk -v t="$(ts)" '{print t","$1","$2","$3}' || true)"
  if [[ -n "$line" ]]; then
    echo "$line" >> "$RESOURCE_CSV"
  fi

  # A failed lookup is reported as "Unknown" and handled by the catch-all arm.
  phase="$(kubectl -n "$NAMESPACE" get pod "$POD_NAME" -o jsonpath='{.status.phase}' 2>/dev/null || echo "Unknown")"
  case "$phase" in
    Running|Pending)
      echo "$(ts) [INFO] Pod $POD_NAME is $phase. Sleeping 30s…" | tee -a "$LOG_PATH"
      sleep 30
      ;;
    Succeeded|Completed)
      echo "$(ts) [INFO] Pod $POD_NAME finished with phase: $phase." | tee -a "$LOG_PATH"
      break
      ;;
    Failed|Error)
      echo "$(ts) [ERROR] Pod $POD_NAME ended with phase: $phase." | tee -a "$LOG_PATH"
      # Capture the pod output before exiting: the final log fetch further down
      # is only reached on success, and a failed run is when the logs matter.
      kubectl -n "$NAMESPACE" logs "pod/${POD_NAME}" --timestamps=true >> "$LOG_PATH" 2>&1 || true
      exit 2
      ;;
    *)
      echo "$(ts) [WARN] Pod $POD_NAME phase '$phase' (waiting 20s)..." | tee -a "$LOG_PATH"
      sleep 20
      ;;
  esac
done

# --- Fetch final logs once ---
# Appends the complete pod log (stdout and stderr of kubectl) to the run log;
# a failed fetch is tolerated.
echo "$(ts) [INFO] Fetching final logs for pod $POD_NAME…" | tee -a "$LOG_PATH"
kubectl -n "$NAMESPACE" logs "pod/${POD_NAME}" --timestamps=true >> "$LOG_PATH" 2>&1 || true

# --- Keyword scans ---
# -F: keywords are literal text, not regular expressions (otherwise the dots
# in "java.io.EOFException" would match any character). -i keeps the match
# case-insensitive. The first error keyword found ends the run with exit 3.
for kw in "${ERROR_KEYWORDS[@]}"; do
  if grep -qiF -- "$kw" "$LOG_PATH"; then
    echo "$(ts) [ERROR] Error keyword found in logs: '$kw'." | tee -a "$LOG_PATH"
    exit 3
  fi
done

# A missing success keyword is only reported as a warning; it does not fail the run.
for kw in "${SUCCESS_KEYWORDS[@]}"; do
  if ! grep -qiF -- "$kw" "$LOG_PATH"; then
    echo "$(ts) [WARN] Success keyword '$kw' not found in logs." | tee -a "$LOG_PATH"
  fi
done

echo "$(ts) [INFO] Done. Files:"
echo "  - $LOG_PATH"
echo "  - $RESOURCE_CSV"
exit 0

Updated version:

#!/bin/bash

# Define variables
# Absolute location of this script; all output is written beside it.
SCRIPT="$(realpath -s "$0")"
SCRIPTPATH="$(dirname "$SCRIPT")"

NAMESPACE="onprem"                      # Namespace where the pods are located
CRONJOB_NAME="teams-chat"               # Name of the CronJob you want to trigger the job from
JOB_NAME="teams-chat"                   # Fixed job name
TIMESTAMP="$(date +'%Y%m%d_%H%M%S')"    # Timestamp for log file
LOG_PATH="$SCRIPTPATH/Logs/${TIMESTAMP}-${JOB_NAME}.log"   # Log file with timestamp
RESOURCE_CSV="$SCRIPTPATH/Logs/${TIMESTAMP}-${JOB_NAME}_pod_resources.csv"

# The run log, the resource CSV and the lastpodlog.txt copy all assume these
# directories exist; create them up front so a fresh checkout does not fail on
# every write. A failure is reported but not fatal, matching the script's
# best-effort handling of the Lastlog copies.
mkdir -p "$SCRIPTPATH/Logs" "$SCRIPTPATH/Lastlog" \
  || echo "[WARN] Could not create $SCRIPTPATH/Logs or $SCRIPTPATH/Lastlog" >&2

# List of keywords to check in the logs (customize as needed)
ERROR_KEYWORDS=("java.io.EOFException" "NoSuchMethodError")
SUCCESS_KEYWORDS=("TGM Policy Ended")

# Weekday name used by the delivery-day check, which compares against the
# English names "Sunday"/"Monday". LC_ALL=C pins the name to English so the
# check cannot be bypassed by the caller's locale.
DAY_OF_WEEK="$(LC_ALL=C date +%A)"

# Clear the terminal, print the usage banner and terminate with status 0.
# NOTE(review): nothing in the visible script calls this function, and the
# text describes a mandatory <pod-name> argument that the visible script never
# reads - confirm whether this help is still wanted.
print_help() {
  local rule="---------------------------------------------------------------"
  clear
  printf '%s\n' \
    "$rule" \
    "Script is used to monitor given POD resources (Memory and CPU)." \
    " " \
    "It must be run with ONE mandatory parameter - name of POD to monitor." \
    "Usage: $0 <pod-name>" \
    "Example:" \
    "$0 smtp-email-1-29002137-fwtqx" \
    'Use "kubectl get pods" command to list available PODs.' \
    "$rule"
  exit 0
}

# Delivery-day check: on Sunday and Monday the import is skipped and the
# script ends successfully; on every other day it carries on.
case "$DAY_OF_WEEK" in
  Sunday|Monday)
    echo "Today is $DAY_OF_WEEK. Import job will be skipped as there is no delivery."
    exit 0
    ;;
  *)
    echo "Today is $DAY_OF_WEEK. Starting import job..."
    ;;
esac

# 1. Check that the target namespace can be queried; otherwise log the error,
#    refresh the last-run log copy (best-effort) and stop with exit code 1.
echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] More detailed logs can be found in $LOG_PATH"
kubectl get namespace "$NAMESPACE" >/dev/null 2>&1 || {
  echo "$(date +'%Y-%m-%d %H:%M:%S') [ERROR] Error: Namespace $NAMESPACE does not exist." | tee -a "$LOG_PATH"
  cp "$LOG_PATH" "$SCRIPTPATH/Lastlog/lastpodlog.txt" 2>/dev/null || true
  exit 1
}

# 2. Trigger a Job from the CronJob by hand; kubectl's own output is left
#    visible on the terminal. A failed creation ends the run with exit code 1.
echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Creating job $JOB_NAME from CronJob $CRONJOB_NAME in namespace $NAMESPACE..." | tee -a "$LOG_PATH"
if ! kubectl create job --from=cronjob/"$CRONJOB_NAME" -n "$NAMESPACE" "$JOB_NAME"; then
  echo "$(date +'%Y-%m-%d %H:%M:%S') [ERROR] Error: Failed to create job from CronJob." | tee -a "$LOG_PATH"
  cp "$LOG_PATH" "$SCRIPTPATH/Lastlog/lastpodlog.txt" 2>/dev/null || true
  exit 1
fi

# 3. Poll every 10 seconds until a pod labelled job-name=$JOB_NAME is listed;
#    the first listed pod is taken and the node it runs on is logged.
echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Waiting for pod associated with job $JOB_NAME to be created..." | tee -a "$LOG_PATH"
POD_NAME=""
until [[ -n "$POD_NAME" ]]; do
  # First column of the first listed row is the pod name.
  POD_NAME="$(kubectl get pods -n "$NAMESPACE" -l job-name="$JOB_NAME" --no-headers | awk 'NR == 1 {print $1; exit}')"
  if [[ -z "$POD_NAME" ]]; then
    echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Pod for job $JOB_NAME not yet created... Retrying in 10 seconds." | tee -a "$LOG_PATH"
    sleep 10
    continue
  fi
  NODE_NAME="$(kubectl get pod "$POD_NAME" -n "$NAMESPACE" -o jsonpath='{.spec.nodeName}')"
  echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Pod $POD_NAME associated with job $JOB_NAME has been created on node $NODE_NAME." | tee -a "$LOG_PATH"
done

# 4. Start the resource-usage CSV for the triggered pod with its header row.
printf '%s\n' "ts,pod,cpu,mem" > "$RESOURCE_CSV"

# ShieldCache correlator (background):
# follows the pod log and, for every line containing "/app/ShieldCache", takes
# a one-off CPU/MEM sample and appends it to the same RESOURCE_CSV in the same
# 4-column format, leaving a note in the run log. An unavailable sample is
# written with empty cpu/mem fields.
kubectl logs -f "$POD_NAME" -n "$NAMESPACE" --timestamps=true 2>/dev/null |
  while IFS= read -r line; do
    case "$line" in
      *"/app/ShieldCache"*)
        ts_now="$(date +'%Y-%m-%d %H:%M:%S')"
        usage_line="$(kubectl top pod "$POD_NAME" -n "$NAMESPACE" --no-headers 2>/dev/null | awk '{print $2","$3}')"
        echo "$ts_now,$POD_NAME,${usage_line:-,}" >> "$RESOURCE_CSV"
        echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] ShieldCache activity detected; sampled usage appended to CSV." >> "$LOG_PATH"
        ;;
    esac
  done &

# 5. Wait for the pod to finish, sampling CPU/MEM on every poll.
#    The decision is based on the STATUS column of `kubectl get pods`:
#      Completed               -> leave the loop and run the final checks
#      Error/CrashLoopBackOff  -> record the failure and exit 2
#      start-up statuses       -> wait 30s and poll again
#      Running (and similar)   -> dump the current logs, wait 30s, poll again
#      anything else           -> fetch the logs once and leave the loop
echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Waiting for pod $POD_NAME to complete..." | tee -a "$LOG_PATH"
while true; do
  # Collect resource usage for the pod every 30 seconds
  USAGE_LINE="$(kubectl top pod "$POD_NAME" -n "$NAMESPACE" --no-headers 2>/dev/null | awk -v ts="$(date +'%Y-%m-%d %H:%M:%S')" '{print ts","$1","$2","$3}')"
  if [[ -n "$USAGE_LINE" ]]; then
    echo "$USAGE_LINE" >> "$RESOURCE_CSV"
  fi

  # Third column of the row whose name matches our pod; empty if it is not listed.
  POD_STATUS="$(kubectl get pods -n "$NAMESPACE" --no-headers | awk -v pod="$POD_NAME" '$1==pod {print $3}')"

  case "$POD_STATUS" in
    Completed)
      echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Pod $POD_NAME status: $POD_STATUS." | tee -a "$LOG_PATH"
      break
      ;;
    Error|CrashLoopBackOff)
      echo "$(date +'%Y-%m-%d %H:%M:%S') [ERROR] Pod $POD_NAME status: $POD_STATUS." | tee -a "$LOG_PATH"
      # /bin/bash "$SCRIPTPATH/send_SMS_alert.sh" | tee -a "$LOG_PATH"
      echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Creating timestamps-true.txt for pod $POD_NAME." | tee -a "$LOG_PATH"
      echo "$(date +'%Y-%m-%d %H:%M:%S') namespace=$NAMESPACE" >> "$SCRIPTPATH/Lastlog/timestamps-true.txt"
      cp "$LOG_PATH" "$SCRIPTPATH/Lastlog/lastpodlog.txt" 2>/dev/null || true
      exit 2
      ;;
    Pending|ContainerCreating|Init:[0-9]*)
      # Start-up statuses seen right after the pod is created (init-container
      # progress is displayed as "Init:N/M"). These used to reach the catch-all
      # arm below, which fetches logs and either exits 3 or ends the wait before
      # the container has even started. Keep polling instead.
      echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Pod $POD_NAME is starting ($POD_STATUS). Waiting 30 seconds." | tee -a "$LOG_PATH"
      sleep 30
      ;;
    Running|NotReady|Init|PodInitializing)
      # NOTE(review): this arm appends the complete pod log to LOG_PATH on every
      # poll, so the same lines accumulate repeatedly in the run log - confirm
      # whether that is intended before changing it.
      echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Something went wrong? pod status=$POD_STATUS..." | tee -a "$LOG_PATH"
      # /bin/bash "$SCRIPTPATH/send_SMS_alert.sh" | tee -a "$LOG_PATH"
      echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Fetching logs for pod $POD_NAME..." | tee -a "$LOG_PATH"
      kubectl logs "$POD_NAME" -n "$NAMESPACE" --timestamps=true | tee -a "$LOG_PATH"
      tail -n 100 "$LOG_PATH"
      echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Pod $POD_NAME is still running ($POD_STATUS). Waiting 30 seconds." | tee -a "$LOG_PATH"
      sleep 30
      ;;
    *)
      # Any other status (including an empty one): grab the logs once and move
      # on to the final checks; a failed fetch ends the run with exit code 3.
      echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Fetching logs for pod $POD_NAME..." | tee -a "$LOG_PATH"
      if ! kubectl logs "$POD_NAME" -n "$NAMESPACE" --timestamps=true >> "$LOG_PATH" 2>&1; then
        echo "$(date +'%Y-%m-%d %H:%M:%S') [ERROR] Failed to fetch logs for pod $POD_NAME." | tee -a "$LOG_PATH"
        cp "$LOG_PATH" "$SCRIPTPATH/Lastlog/lastpodlog.txt" 2>/dev/null || true
        exit 3
      fi
      break
      ;;
  esac
done

# 6. Fetch logs from the pod (final): shown on the terminal and appended to
#    the run log.
echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Fetching logs for pod $POD_NAME..." | tee -a "$LOG_PATH"
kubectl logs "$POD_NAME" -n "$NAMESPACE" --timestamps=true | tee -a "$LOG_PATH"
# `$?` after a pipeline is the status of its last stage (tee) because this
# script does not enable pipefail; PIPESTATUS[0] is kubectl's own exit code.
if [[ ${PIPESTATUS[0]} -ne 0 ]]; then
  echo "$(date +'%Y-%m-%d %H:%M:%S') [ERROR] Failed to fetch logs for pod $POD_NAME." | tee -a "$LOG_PATH"
  cp "$LOG_PATH" "$SCRIPTPATH/Lastlog/lastpodlog.txt" 2>/dev/null || true
  exit 3
fi

# 7. Scan the run log for error keywords; the first hit fails the run with
#    exit code 4 after refreshing the last-run copy and showing the log tail.
echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Checking logs for errors or exceptions..." | tee -a "$LOG_PATH"
for keyword in "${ERROR_KEYWORDS[@]}"; do
  # -F: the keyword is literal text, not a regular expression (the dots in
  # "java.io.EOFException" would otherwise match any character); `--` protects
  # keywords that begin with a dash.
  if grep -iqF -- "$keyword" "$LOG_PATH"; then
    echo "$(date +'%Y-%m-%d %H:%M:%S') [ERROR] Error found in logs: Keyword '$keyword' found. Exiting job as failed." | tee -a "$LOG_PATH"
    # /bin/bash "$SCRIPTPATH/send_SMS_alert.sh" | tee -a "$LOG_PATH"
    cp "$LOG_PATH" "$SCRIPTPATH/Lastlog/lastpodlog.txt" 2>/dev/null || true
    tail -n 500 "$LOG_PATH"
    exit 4
  fi
done

# 8. Every success keyword must be present in the run log; the first missing
#    one fails the run with exit code 5.
echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Checking logs for success-related keywords..." | tee -a "$LOG_PATH"
for keyword in "${SUCCESS_KEYWORDS[@]}"; do
  # -F / --: match the keyword as literal text, even if it starts with a dash.
  if ! grep -iqF -- "$keyword" "$LOG_PATH"; then
    # Logged as ERROR (was INFO): this path ends the run as failed, like the
    # other failure exits in this script.
    echo "$(date +'%Y-%m-%d %H:%M:%S') [ERROR] Failure: Keyword '$keyword' not found in logs. Exiting job as failed." | tee -a "$LOG_PATH"
    # /bin/bash "$SCRIPTPATH/send_SMS_alert.sh" | tee -a "$LOG_PATH"
    cp "$LOG_PATH" "$SCRIPTPATH/Lastlog/lastpodlog.txt" 2>/dev/null || true
    tail -n 500 "$LOG_PATH"
    exit 5
  fi
done

# 9. Show the last 500 lines of the run log on the terminal (execution agent)
#    and refresh the last-run copy (best-effort).
printf '%s [INFO] Displaying the last 500 lines of the log file:\n' \
  "$(date +'%Y-%m-%d %H:%M:%S')" | tee -a "$LOG_PATH"
tail -n 500 "$LOG_PATH"
cp "$LOG_PATH" "$SCRIPTPATH/Lastlog/lastpodlog.txt" 2>/dev/null || true

# 10. Report success; this line is written after the copy above was taken.
printf '%s [INFO] Job %s completed successfully.\n' \
  "$(date +'%Y-%m-%d %H:%M:%S')" "$JOB_NAME" | tee -a "$LOG_PATH"
exit 0

DEV Community

K8 Monitoring

updated

Top comments (0)