DEV Community

ByteLedger
ByteLedger

Posted on

Monitor k8s pods


#!/bin/bash

# ----------------------------------------------------------------------
# Configuration
# ----------------------------------------------------------------------

# Absolute path of this script (-s: symlinks are not expanded) and the
# directory that contains it. All log/output paths are relative to it.
SCRIPT="$(realpath -s "$0")"
SCRIPTPATH="$(dirname "$SCRIPT")"

NAMESPACE="onprem"                      # Namespace where the pods are located
CRONJOB_NAME="teams-chat"               # Name of the CronJob you want to trigger the job from
JOB_NAME="teams-chat"                   # Fixed job name
TIMESTAMP="$(date +'%Y%m%d_%H%M%S')"    # Timestamp for log file

LOG_PATH="$SCRIPTPATH/Logs/${TIMESTAMP}-${JOB_NAME}.log"                 # Log file with timestamp
RESOURCE_CSV="$SCRIPTPATH/Logs/${TIMESTAMP}-${JOB_NAME}_pod_resources.csv"   # Per-run CPU/memory samples

# The script appends to files under Logs/ and copies results into Lastlog/.
# Neither directory is created anywhere else, so create them here; without
# them every "tee -a" / "cp" below fails on a fresh checkout.
# A failure is only warned about so the run itself is not aborted.
mkdir -p -- "$SCRIPTPATH/Logs" "$SCRIPTPATH/Lastlog" \
  || echo "$(date +'%Y-%m-%d %H:%M:%S') [WARN] Could not create $SCRIPTPATH/Logs or $SCRIPTPATH/Lastlog." >&2

# List of keywords to check in the logs (customize as needed)
ERROR_KEYWORDS=("java.io.EOFException" "NoSuchMethodError")   # any match marks the run as failed
SUCCESS_KEYWORDS=("TGM Policy Ended")                         # every entry must be present for success

# Day gating: the import is skipped on Sunday and Monday because there is
# no delivery on those days; the script then exits 0 right here.
# LC_ALL=C pins the weekday name to English. Without it, a non-English
# locale makes `date +%A` print a localized name and the comparison against
# "Sunday"/"Monday" would never match, so the skip would silently not happen.
DAY_OF_WEEK="$(LC_ALL=C date +%A)"

case "$DAY_OF_WEEK" in
  Sunday|Monday)
    echo "Today is $DAY_OF_WEEK. Import job will be skipped as there is no delivery."
    exit 0
    ;;
  *)
    echo "Today is $DAY_OF_WEEK. Starting import job..."
    ;;
esac

# 1. Validate the namespace exists
echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] More detailed logs can be found in $LOG_PATH"
# Only kubectl's stderr is captured (stdout is discarded). The lookup can
# fail for reasons other than a missing namespace (no cluster connection,
# missing permissions, ...), so kubectl's own message is logged next to the
# generic error instead of being thrown away.
if ! ns_check_err="$(kubectl get namespace "$NAMESPACE" 2>&1 >/dev/null)"; then
  echo "$(date +'%Y-%m-%d %H:%M:%S') [ERROR] Error: Namespace $NAMESPACE does not exist." | tee -a "$LOG_PATH"
  if [[ -n "$ns_check_err" ]]; then
    echo "$(date +'%Y-%m-%d %H:%M:%S') [ERROR] kubectl reported: $ns_check_err" | tee -a "$LOG_PATH"
  fi
  # Best-effort copy of this run's log to the "last run" location.
  cp "$LOG_PATH" "$SCRIPTPATH/Lastlog/lastpodlog.txt" 2>/dev/null || true
  exit 1
fi

# 2. Create the job manually from the CronJob
# NOTE(review): JOB_NAME is fixed, so this presumably fails with
# "AlreadyExists" while a Job from an earlier run is still present in the
# namespace - confirm that finished Jobs are cleaned up between runs.
echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Creating job $JOB_NAME from CronJob $CRONJOB_NAME in namespace $NAMESPACE..." | tee -a "$LOG_PATH"
# kubectl's output (stdout and stderr) is passed through tee so that the
# reason for a failure ends up in the log file that is copied to Lastlog.
kubectl create job --from=cronjob/"$CRONJOB_NAME" -n "$NAMESPACE" "$JOB_NAME" 2>&1 | tee -a "$LOG_PATH"
# PIPESTATUS[0] is kubectl's exit status; plain $? would be tee's.
create_rc="${PIPESTATUS[0]}"
if [[ "$create_rc" -ne 0 ]]; then
  echo "$(date +'%Y-%m-%d %H:%M:%S') [ERROR] Error: Failed to create job from CronJob." | tee -a "$LOG_PATH"
  # Best-effort copy of this run's log to the "last run" location.
  cp "$LOG_PATH" "$SCRIPTPATH/Lastlog/lastpodlog.txt" 2>/dev/null || true
  exit 1
fi

# 3. Wait for the pod associated with the job to be created
#
# Polls every 10 seconds for a pod carrying the label job-name=$JOB_NAME and
# stores the first match in POD_NAME (and its node in NODE_NAME).
# The wait is bounded: after POD_CREATE_TIMEOUT seconds (default 600) the
# script logs an error and exits 1 instead of looping forever when the Job
# never produces a pod. Set POD_CREATE_TIMEOUT=0 to wait without limit.
POD_CREATE_TIMEOUT="${POD_CREATE_TIMEOUT:-600}"
create_wait_start=$SECONDS

echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Waiting for pod associated with job $JOB_NAME to be created..." | tee -a "$LOG_PATH"
while true; do
  # First column of the first listed pod = pod name (empty when none exists yet).
  POD_NAME="$(kubectl get pods -n "$NAMESPACE" -l job-name="$JOB_NAME" --no-headers | awk 'NR==1 {print $1}')"
  if [[ -n "$POD_NAME" ]]; then
    # nodeName can still be empty here if the pod has not been scheduled yet.
    NODE_NAME="$(kubectl get pod "$POD_NAME" -n "$NAMESPACE" -o jsonpath='{.spec.nodeName}')"
    echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Pod $POD_NAME associated with job $JOB_NAME has been created on node $NODE_NAME." | tee -a "$LOG_PATH"
    break
  fi

  if (( POD_CREATE_TIMEOUT > 0 && SECONDS - create_wait_start >= POD_CREATE_TIMEOUT )); then
    echo "$(date +'%Y-%m-%d %H:%M:%S') [ERROR] Timed out after ${POD_CREATE_TIMEOUT}s waiting for a pod of job $JOB_NAME to be created." | tee -a "$LOG_PATH"
    # Best-effort copy of this run's log to the "last run" location.
    cp "$LOG_PATH" "$SCRIPTPATH/Lastlog/lastpodlog.txt" 2>/dev/null || true
    exit 1
  fi

  echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Pod for job $JOB_NAME not yet created... Retrying in 10 seconds." | tee -a "$LOG_PATH"
  sleep 10
done

# 4. Prepare resource usage CSV file for the triggered pod
# (truncates/creates the file and writes the header row)
echo "ts,pod,cpu,mem" > "$RESOURCE_CSV"

# ----------------------------------------------------------------------
# ShieldCache correlator (background job)
# Follows the pod log; for every line containing '/app/ShieldCache' it takes
# a CPU/MEM snapshot with `kubectl top` and appends a row to the SAME
# RESOURCE_CSV (ts,pod,cpu,mem), plus a note in LOG_PATH.
# The whole thing runs in a background subshell so the main script can go on
# polling the pod; it ends on its own when the log stream closes.
(
  # `kubectl logs -f` fails immediately while the container has not been
  # started yet, and its error output is discarded below - started too early
  # the correlator would silently never run. So wait (at most 60 x 5 s)
  # until the pod has left the Pending phase before attaching to the log.
  for (( attempt = 0; attempt < 60; attempt++ )); do
    pod_phase="$(kubectl get pod "$POD_NAME" -n "$NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null)"
    if [[ -n "$pod_phase" && "$pod_phase" != "Pending" ]]; then
      break
    fi
    sleep 5
  done

  kubectl logs -f "$POD_NAME" -n "$NAMESPACE" --timestamps=true 2>/dev/null \
    | while IFS= read -r line; do
        # Plain substring match; no need to fork grep for every log line.
        if [[ "$line" == *"/app/ShieldCache"* ]]; then
          ts_now="$(date +'%Y-%m-%d %H:%M:%S')"
          # Columns 2 and 3 of `kubectl top pod` are CPU and memory.
          usage_line="$(kubectl top pod "$POD_NAME" -n "$NAMESPACE" --no-headers 2>/dev/null | awk '{print $2","$3}')"
          # If no usage could be read, still write a row with empty cpu/mem.
          echo "$ts_now,$POD_NAME,${usage_line:-,}" >> "$RESOURCE_CSV"
          echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] ShieldCache activity detected; sampled usage appended to CSV." >> "$LOG_PATH"
        fi
      done
) &
# ----------------------------------------------------------------------

# 5. Wait for the pod to complete (success or failure) and check for "Error" state
#
# Each pass samples the pod's CPU/memory into RESOURCE_CSV, reads the STATUS
# column printed by `kubectl get pod`, and reacts as follows:
#   Completed                        -> leave the loop, continue with step 6
#   Error | CrashLoopBackOff         -> record the failure, exit 2
#   Pending | ContainerCreating |
#   Init:N/M                         -> container not started yet: keep waiting,
#                                       but at most POD_START_TIMEOUT seconds
#                                       (default 600, 0 = no limit), then exit 3
#   Running | NotReady | Init |
#   PodInitializing                  -> dump the logs so far, wait 30 seconds
#   anything else (incl. empty)      -> try to fetch the logs once; exit 3 if
#                                       that fails, otherwise continue
#
# The "not started yet" branch exists because a freshly created pod is
# normally Pending/ContainerCreating on the first pass; without it those
# states fall into the catch-all branch, where the log fetch fails (exit 3)
# or the script moves on to log analysis before the pod has even run.
POD_START_TIMEOUT="${POD_START_TIMEOUT:-600}"
start_wait_began=$SECONDS

echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Waiting for pod $POD_NAME to complete..." | tee -a "$LOG_PATH"
while true; do
  # Collect resource usage for the pod on every pass (row: ts,pod,cpu,mem).
  # Nothing is written when `kubectl top` returns no data.
  USAGE_LINE="$(kubectl top pod "$POD_NAME" -n "$NAMESPACE" --no-headers 2>/dev/null | awk -v ts="$(date +'%Y-%m-%d %H:%M:%S')" '{print ts","$1","$2","$3}')"
  if [[ -n "$USAGE_LINE" ]]; then
    echo "$USAGE_LINE" >> "$RESOURCE_CSV"
  fi

  # Third column of the pod's row is its STATUS.
  POD_STATUS="$(kubectl get pod "$POD_NAME" -n "$NAMESPACE" --no-headers | awk '{print $3}')"

  case "$POD_STATUS" in
    Completed)
      echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Pod $POD_NAME status: $POD_STATUS." | tee -a "$LOG_PATH"
      break
      ;;
    Error|CrashLoopBackOff)
      echo "$(date +'%Y-%m-%d %H:%M:%S') [ERROR] Pod $POD_NAME status: $POD_STATUS." | tee -a "$LOG_PATH"
      # /bin/bash "$SCRIPTPATH/send_SMS_alert.sh" | tee -a "$LOG_PATH"
      echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Creating timestamps-true.txt for pod $POD_NAME." | tee -a "$LOG_PATH"
      echo "$(date +'%Y-%m-%d %H:%M:%S') namespace=$NAMESPACE" >> "$SCRIPTPATH/Lastlog/timestamps-true.txt"
      # Best-effort copy of this run's log to the "last run" location.
      cp "$LOG_PATH" "$SCRIPTPATH/Lastlog/lastpodlog.txt" 2>/dev/null || true
      exit 2
      ;;
    Pending|ContainerCreating|Init:[0-9]*)
      # Init:[0-9]* matches progress states such as "Init:0/1" only; init
      # failures like "Init:Error" still go to the catch-all branch.
      if (( POD_START_TIMEOUT > 0 && SECONDS - start_wait_began >= POD_START_TIMEOUT )); then
        echo "$(date +'%Y-%m-%d %H:%M:%S') [ERROR] Pod $POD_NAME did not start within ${POD_START_TIMEOUT}s (status: $POD_STATUS)." | tee -a "$LOG_PATH"
        cp "$LOG_PATH" "$SCRIPTPATH/Lastlog/lastpodlog.txt" 2>/dev/null || true
        # Same exit code a never-starting pod produced before (log fetch failure).
        exit 3
      fi
      echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Pod $POD_NAME has not started yet ($POD_STATUS). Waiting 10 seconds." | tee -a "$LOG_PATH"
      sleep 10
      ;;
    Running|NotReady|Init|PodInitializing)
      # NOTE(review): this message is written on every normal poll of a running
      # pod; its wording suggests it was meant for an abnormal state - confirm.
      echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Something went wrong? pod status=$POD_STATUS..." | tee -a "$LOG_PATH"
      # /bin/bash "$SCRIPTPATH/send_SMS_alert.sh" | tee -a "$LOG_PATH"
      echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Fetching logs for pod $POD_NAME..." | tee -a "$LOG_PATH"
      # NOTE(review): the complete pod log is appended on every pass, so
      # LOG_PATH ends up holding repeated copies of it. The keyword checks
      # later in the script only test for presence, so they are unaffected.
      kubectl logs "$POD_NAME" -n "$NAMESPACE" --timestamps=true | tee -a "$LOG_PATH"
      tail -n 100 "$LOG_PATH"
      echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Pod $POD_NAME is still running ($POD_STATUS). Waiting 30 seconds." | tee -a "$LOG_PATH"
      sleep 30
      ;;
    *)
      # Unrecognised (or empty) status: stop polling. If the logs cannot even
      # be fetched the run is failed here; otherwise the log checks decide.
      echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Fetching logs for pod $POD_NAME..." | tee -a "$LOG_PATH"
      if ! kubectl logs "$POD_NAME" -n "$NAMESPACE" --timestamps=true >> "$LOG_PATH" 2>&1; then
        echo "$(date +'%Y-%m-%d %H:%M:%S') [ERROR] Failed to fetch logs for pod $POD_NAME." | tee -a "$LOG_PATH"
        cp "$LOG_PATH" "$SCRIPTPATH/Lastlog/lastpodlog.txt" 2>/dev/null || true
        exit 3
      fi
      break
      ;;
  esac
done

# 6. Fetch logs from the pod
# The pod log is shown on the terminal and appended to LOG_PATH; the keyword
# checks that follow read it from there.
echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Fetching logs for pod $POD_NAME..." | tee -a "$LOG_PATH"
kubectl logs "$POD_NAME" -n "$NAMESPACE" --timestamps=true | tee -a "$LOG_PATH"
# After a pipeline, $? is the exit status of the LAST command (tee), which
# practically always succeeds. PIPESTATUS[0] is kubectl's own status, so a
# failed log fetch is actually detected.
logs_rc="${PIPESTATUS[0]}"
if [[ "$logs_rc" -ne 0 ]]; then
  echo "$(date +'%Y-%m-%d %H:%M:%S') [ERROR] Failed to fetch logs for pod $POD_NAME." | tee -a "$LOG_PATH"
  # Best-effort copy of this run's log to the "last run" location.
  cp "$LOG_PATH" "$SCRIPTPATH/Lastlog/lastpodlog.txt" 2>/dev/null || true
  exit 3
fi

# 7. Grep the logs for any error-related keywords
# The first keyword from ERROR_KEYWORDS found in LOG_PATH fails the run with
# exit code 4 (after copying the log to Lastlog and printing its tail).
echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Checking logs for errors or exceptions..." | tee -a "$LOG_PATH"
for keyword in "${ERROR_KEYWORDS[@]}"; do
  # -F: treat the keyword as a literal string. As a regex, the dots in e.g.
  #     "java.io.EOFException" would match ANY character.
  # -i: case-insensitive, -q: quiet (exit status only).
  # --: a keyword starting with '-' is not mistaken for an option.
  if grep -Fiq -- "$keyword" "$LOG_PATH"; then
    echo "$(date +'%Y-%m-%d %H:%M:%S') [ERROR] Error found in logs: Keyword '$keyword' found. Exiting job as failed." | tee -a "$LOG_PATH"
    # /bin/bash "$SCRIPTPATH/send_SMS_alert.sh" | tee -a "$LOG_PATH"
    # Best-effort copy of this run's log to the "last run" location.
    cp "$LOG_PATH" "$SCRIPTPATH/Lastlog/lastpodlog.txt" 2>/dev/null || true
    tail -n 500 "$LOG_PATH"
    exit 4
  fi
done

# 8. Grep the logs for success-related keywords and ensure none are missing
# Every keyword in SUCCESS_KEYWORDS must occur in LOG_PATH; the first one
# that is missing fails the run with exit code 5.
echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Checking logs for success-related keywords..." | tee -a "$LOG_PATH"
for keyword in "${SUCCESS_KEYWORDS[@]}"; do
  # -F: literal match (keywords are plain text, not regular expressions),
  # -i: case-insensitive, -q: quiet, --: end of options.
  if ! grep -Fiq -- "$keyword" "$LOG_PATH"; then
    # Tagged [ERROR] (not [INFO]): this message reports a failed run.
    echo "$(date +'%Y-%m-%d %H:%M:%S') [ERROR] Failure: Keyword '$keyword' not found in logs. Exiting job as failed." | tee -a "$LOG_PATH"
    # /bin/bash "$SCRIPTPATH/send_SMS_alert.sh" | tee -a "$LOG_PATH"
    # Best-effort copy of this run's log to the "last run" location.
    cp "$LOG_PATH" "$SCRIPTPATH/Lastlog/lastpodlog.txt" 2>/dev/null || true
    tail -n 500 "$LOG_PATH"
    exit 5
  fi
done

# 9. Output the last 500 lines of the log file to the terminal (execution agent)
echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Displaying the last 500 lines of the log file:" | tee -a "$LOG_PATH"
tail -n 500 "$LOG_PATH"

# 10. Job completed successfully
# The success line is written BEFORE the log is copied to Lastlog, so that
# lastpodlog.txt also records the final outcome of the run. The order of
# what appears on the terminal is unchanged.
echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Job $JOB_NAME completed successfully." | tee -a "$LOG_PATH"
# Best-effort copy of this run's log to the "last run" location.
cp "$LOG_PATH" "$SCRIPTPATH/Lastlog/lastpodlog.txt" 2>/dev/null || true
exit 0

Enter fullscreen mode Exit fullscreen mode

Top comments (0)