Monitor k8s pods

#monitoring #automation #kubernetes #devops


# 3a. Prepare resource usage CSV file for the triggered pod
# Excerpt: SCRIPTPATH and JOB_NAME are expected to be set before this point.
# Both CSV files are written under $SCRIPTPATH/output and are named after
# $JOB_NAME; `>` truncates each file so it starts with just its header row.
OUTPUT_DIR="$SCRIPTPATH/output"
mkdir -p "$OUTPUT_DIR"
RESOURCE_CSV="$OUTPUT_DIR/${JOB_NAME}_pod_resources.csv"
echo "podname,cpu_usage,memory_usage,timestamp" > "$RESOURCE_CSV"

# >>> BEGIN add ShieldCache df CSV (minimal)
# Second CSV: its header names the timestamp, the pod and six filesystem
# fields (filesystem, size, used, avail, use_pct, mountpoint).
SC_DF_CSV="$OUTPUT_DIR/${JOB_NAME}_shieldcache_df.csv"
echo "timestamp,pod,filesystem,size,used,avail,use_pct,mountpoint" > "$SC_DF_CSV"
# <<< END add ShieldCache df CSV (minimal)


# Excerpt of the polling loop: only the two sampling steps are shown here; the
# closing `done` is not part of this excerpt.
while true; do
  # Sample CPU/memory for the pod. awk turns the first three columns of
  # `kubectl top` into a CSV row and appends the current timestamp. Errors
  # from kubectl are discarded, so a failed sample yields an empty string
  # and no row is written.
  USAGE_LINE=$(kubectl top pod "$POD_NAME" -n "$NAMESPACE" --no-headers 2>/dev/null | \
               awk -v ts="$(date +'%Y-%m-%d %H:%M:%S')" '{print $1","$2","$3","ts}')
  if [ -n "$USAGE_LINE" ]; then
    echo "$USAGE_LINE" >> "$RESOURCE_CSV"
  fi

  # >>> BEGIN add ShieldCache df sample (minimal)
  # Run `df -hP /app/ShieldCache` inside the pod; `tail -n +2` drops the df
  # header line. awk prefixes the timestamp and pod name and emits the first
  # six df fields as a CSV row. A failed exec yields no row.
  DF_LINE=$(kubectl -n "$NAMESPACE" exec "$POD_NAME" -- sh -c 'df -hP /app/ShieldCache | tail -n +2' 2>/dev/null | \
            awk -v ts="$(date +"%Y-%m-%d %H:%M:%S")" -v pod="$POD_NAME" '{printf "%s,%s,%s,%s,%s,%s,%s,%s\n", ts,pod,$1,$2,$3,$4,$5,$6}')
  if [ -n "$DF_LINE" ]; then
    echo "$DF_LINE" >> "$SC_DF_CSV"
  fi
  # <<< END add ShieldCache df sample (minimal)

Updated script:


#!/bin/bash
#
# Creates a one-off Job from a Kubernetes CronJob, waits for its pod to finish
# while sampling resource usage into CSV files, then scans the pod log for
# error and success keywords.

# Define variables
SCRIPT=$(realpath -s "$0")
SCRIPTPATH=$(dirname "$SCRIPT")
NAMESPACE="onprem"                        # Namespace where the pods are located
CRONJOB_NAME="teams-chat"                 # Name of the CronJob you want to trigger the job from
JOB_NAME="${CRONJOB_NAME}-$(date +'%Y%m%d-%H%M%S')"  # Job name: CronJob name plus a launch timestamp
TIMESTAMP=$(date +'%Y-%m-%d_%H%M%S')      # Timestamp for log file
LOG_PATH="$SCRIPTPATH/Logs/${TIMESTAMP}-${JOB_NAME}.log" # Log file with timestamp

# The log file and the "last log" copy are written with `tee -a` and `cp`,
# neither of which creates missing parent directories, so create them up front.
# A failure is reported but not fatal, so the job itself is still attempted.
if ! mkdir -p "$SCRIPTPATH/Logs" "$SCRIPTPATH/Lastlog"; then
  echo "[$(date +'%Y-%m-%d %H:%M:%S')] [ERROR] Could not create log directories under $SCRIPTPATH." >&2
fi

# List of keywords to check in the logs (customize as needed)
ERROR_KEYWORDS=("java.io.EOFException" "NoSuchMethodError")
SUCCESS_KEYWORDS=("TGM Policy Ended")

# Force the C locale so the weekday is always an English name; the later
# comparison against "Sunday"/"Monday" would never match in another locale.
DAY_OF_WEEK=$(LC_ALL=C date +%A)

# Skip the run (with a successful exit) on Sunday and Monday; on every other
# day announce the start and fall through to the rest of the script.
case "$DAY_OF_WEEK" in
  Sunday | Monday)
    echo "Today is $DAY_OF_WEEK. Import job will be skipped as there is no delivery."
    exit 0
    ;;
  *)
    echo "Today is $DAY_OF_WEEK. Starting import job..."
    ;;
esac

# 1. Make sure the target namespace can be queried before doing anything else.
#    On failure the log is copied to Lastlog/lastpodlog.txt and the script
#    exits with status 1.
echo "[$(date +'%Y-%m-%d %H:%M:%S')] [INFO] More detailed logs can be found in $LOG_PATH" | tee -a "$LOG_PATH"
kubectl get namespace "$NAMESPACE" >/dev/null 2>&1 || {
  echo "[$(date +'%Y-%m-%d %H:%M:%S')] [ERROR] Error: Namespace $NAMESPACE does not exist." | tee -a "$LOG_PATH"
  cp "$LOG_PATH" "$SCRIPTPATH/Lastlog/lastpodlog.txt"
  exit 1
}

# 2. Create the job manually from the CronJob
#    The kubectl exit status is tested directly, and the cronjob reference is
#    quoted so the name cannot be word-split or glob-expanded. On failure the
#    log is copied to Lastlog/lastpodlog.txt and the script exits with status 1.
echo "[$(date +'%Y-%m-%d %H:%M:%S')] [INFO] Creating job $JOB_NAME from CronJob $CRONJOB_NAME in namespace $NAMESPACE..." | tee -a "$LOG_PATH"
if ! kubectl create job --from="cronjob/$CRONJOB_NAME" -n "$NAMESPACE" "$JOB_NAME"; then
  echo "[$(date +'%Y-%m-%d %H:%M:%S')] [ERROR] Error: Failed to create job from CronJob." | tee -a "$LOG_PATH"
  cp "$LOG_PATH" "$SCRIPTPATH/Lastlog/lastpodlog.txt"
  exit 1
fi

# 3. Poll every 10 seconds until a pod labelled job-name=$JOB_NAME shows up.
#    The first pod listed is taken as the job's pod; once it is found, the
#    node it was assigned to is looked up and logged.
echo "[$(date +'%Y-%m-%d %H:%M:%S')] [INFO] Waiting for pod associated with job $JOB_NAME to be created..." | tee -a "$LOG_PATH"
until
  POD_NAME=$(kubectl get pods -n "$NAMESPACE" -l job-name="$JOB_NAME" --no-headers | awk 'NR == 1 {print $1}')
  [ -n "$POD_NAME" ]
do
  echo "[$(date +'%Y-%m-%d %H:%M:%S')] [INFO] Pod for job $JOB_NAME not yet created... Retrying in 10 seconds." | tee -a "$LOG_PATH"
  sleep 10
done
NODE_NAME=$(kubectl get pod "$POD_NAME" -n "$NAMESPACE" -o jsonpath='{.spec.nodeName}')
echo "[$(date +'%Y-%m-%d %H:%M:%S')] [INFO] Pod $POD_NAME associated with job $JOB_NAME has been created on node $NODE_NAME" | tee -a "$LOG_PATH"

# 3a. Set up the two per-job CSV files under $SCRIPTPATH/output.
#     Each file is (re)created containing only its header row:
#       - <job>_pod_resources.csv  : CPU / memory samples for the pod
#       - <job>_shieldcache_df.csv : filesystem snapshots for /app/ShieldCache
OUTPUT_DIR="${SCRIPTPATH}/output"
RESOURCE_CSV="${OUTPUT_DIR}/${JOB_NAME}_pod_resources.csv"
SC_DF_CSV="${OUTPUT_DIR}/${JOB_NAME}_shieldcache_df.csv"

mkdir -p "$OUTPUT_DIR"
printf '%s\n' "podname,cpu_usage,memory_usage,timestamp" > "$RESOURCE_CSV"
printf '%s\n' "timestamp,pod,filesystem,size,used,avail,use_pct,mountpoint" > "$SC_DF_CSV"

# 4. Wait for the pod to complete (success or failure) and check for "Error" state
#
# Every pass through the loop:
#   1. appends one `kubectl top` sample to $RESOURCE_CSV (skipped if empty),
#   2. appends one `df -hP /app/ShieldCache` sample to $SC_DF_CSV (skipped if
#      the exec fails, e.g. while the container is not running yet),
#   3. reads the pod's STATUS column from `kubectl get pods` and acts on it:
#        Completed                     -> leave the loop
#        Error / CrashLoopBackOff      -> send SMS alert, save pod logs, exit 2
#        Running / NotReady / start-up -> wait 30 seconds and poll again
#        anything else (or no status)  -> log the status, save pod logs, exit 3
#
# Start-up statuses (Pending, ContainerCreating, Init:N/M) count as "still in
# progress": the pod was only just created, so it is normally in one of them
# on the first pass, and treating them as unknown aborted the run with exit 3.
# NOTE(review): a pod that stays Pending forever (e.g. unschedulable) will keep
# this loop waiting; add an overall deadline if that matters in your cluster.
echo "[$(date +'%Y-%m-%d %H:%M:%S')] [INFO] Waiting for pod $POD_NAME to complete..." | tee -a "$LOG_PATH"
while true; do
  # Collect resource usage for the pod every 30 seconds
  USAGE_LINE=$(kubectl top pod "$POD_NAME" -n "$NAMESPACE" --no-headers 2>/dev/null | awk -v ts="$(date +'%Y-%m-%d %H:%M:%S')" '{print $1","$2","$3","ts}')
  if [ -n "$USAGE_LINE" ]; then
    echo "$USAGE_LINE" >> "$RESOURCE_CSV"
  fi

  # >>> NEW (ShieldCache df): sample df -hP /app/ShieldCache into the second CSV
  # `tail -n +2` drops the df header; awk prefixes timestamp and pod name.
  DF_LINE=$(kubectl -n "$NAMESPACE" exec "$POD_NAME" -- sh -c "df -hP /app/ShieldCache | tail -n +2" 2>/dev/null | \
            awk -v ts="$(date +'%Y-%m-%d %H:%M:%S')" -v pod="$POD_NAME" '{printf "%s,%s,%s,%s,%s,%s,%s,%s\n", ts,pod,$1,$2,$3,$4,$5,$6}')
  if [ -n "$DF_LINE" ]; then
    echo "$DF_LINE" >> "$SC_DF_CSV"
  fi
  # <<< NEW (ShieldCache df)

  # Third column of `kubectl get pods` is the human-readable STATUS.
  POD_STATUS=$(kubectl get pods -n "$NAMESPACE" --no-headers | awk -v pod="$POD_NAME" '$1==pod {print $3}')
  case "$POD_STATUS" in
    Completed)
      echo "[$(date +'%Y-%m-%d %H:%M:%S')] [INFO] Pod $POD_NAME is in $POD_STATUS status." | tee -a "$LOG_PATH"
      break
      ;;
    Error | CrashLoopBackOff)
      echo "[$(date +'%Y-%m-%d %H:%M:%S')] [ERROR] Pod $POD_NAME is in status $POD_STATUS." | tee -a "$LOG_PATH"
      echo "[$(date +'%Y-%m-%d %H:%M:%S')] [INFO] Calling external script to send a notification." | tee -a "$LOG_PATH"
      /bin/bash "$SCRIPTPATH/send_SMS_alert.sh" | tee -a "$LOG_PATH"
      echo "[$(date +'%Y-%m-%d %H:%M:%S')] [INFO] Fetching logs for pod $POD_NAME..." | tee -a "$LOG_PATH"
      kubectl logs "$POD_NAME" --timestamps=true -n "$NAMESPACE" >> "$LOG_PATH"
      cp "$LOG_PATH" "$SCRIPTPATH/Lastlog/lastpodlog.txt"
      exit 2
      ;;
    # Init:[0-9]* matches progress values such as "Init:0/1" only, so
    # "Init:Error" and similar still fall through to the catch-all below.
    Running | NotReady | Init | Init:[0-9]* | PodInitializing | Pending | ContainerCreating)
      echo "[$(date +'%Y-%m-%d %H:%M:%S')] [INFO] Pod $POD_NAME is still running ($POD_STATUS)... Waiting 30 seconds." | tee -a "$LOG_PATH"
      sleep 30
      ;;
    *)
      # Unknown or empty status: record what was seen before bailing out.
      echo "[$(date +'%Y-%m-%d %H:%M:%S')] [ERROR] Pod $POD_NAME is in unexpected status '${POD_STATUS:-unknown}'." | tee -a "$LOG_PATH"
      echo "[$(date +'%Y-%m-%d %H:%M:%S')] [INFO] Fetching logs for pod $POD_NAME..." | tee -a "$LOG_PATH"
      kubectl logs "$POD_NAME" --timestamps=true -n "$NAMESPACE" >> "$LOG_PATH"
      cp "$LOG_PATH" "$SCRIPTPATH/Lastlog/lastpodlog.txt"
      exit 3
      ;;
  esac
done

# 5. Append the finished pod's logs (with timestamps) to the log file.
#    If kubectl fails, the log is copied to Lastlog/lastpodlog.txt and the
#    script exits with status 3.
echo "[$(date +'%Y-%m-%d %H:%M:%S')] [INFO] Fetching logs for pod $POD_NAME..." | tee -a "$LOG_PATH"
kubectl logs "$POD_NAME" --timestamps=true -n "$NAMESPACE" >> "$LOG_PATH" || {
  echo "[$(date +'%Y-%m-%d %H:%M:%S')] [ERROR] Failed to fetch logs for pod $POD_NAME." | tee -a "$LOG_PATH"
  cp "$LOG_PATH" "$SCRIPTPATH/Lastlog/lastpodlog.txt"
  exit 3
}

#######################################
# Tests whether a file contains a keyword.
# The keyword is matched case-insensitively as a literal string (grep -F), so
# characters such as '.' in "java.io.EOFException" are not regex wildcards.
# `--` keeps a keyword that starts with '-' from being read as an option.
# Arguments: $1 - keyword to look for
#            $2 - path of the file to search
# Returns:   0 if the keyword occurs in the file, non-zero otherwise
#######################################
log_contains_keyword() {
  grep -iqF -- "$1" "$2"
}

# 6. Grep the logs for any error-related keywords
#    The first keyword found triggers the SMS alert, saves the log, prints its
#    last 500 lines and exits with status 4.
echo "[$(date +'%Y-%m-%d %H:%M:%S')] [INFO] Checking logs for errors or exceptions..." | tee -a "$LOG_PATH"
for keyword in "${ERROR_KEYWORDS[@]}"; do
  if log_contains_keyword "$keyword" "$LOG_PATH"; then
    echo "[$(date +'%Y-%m-%d %H:%M:%S')] [ERROR] Error found in logs: Keyword '$keyword' found. Exiting job as failed." | tee -a "$LOG_PATH"
    /bin/bash "$SCRIPTPATH/send_SMS_alert.sh" | tee -a "$LOG_PATH"
    cp "$LOG_PATH" "$SCRIPTPATH/Lastlog/lastpodlog.txt"
    tail -n 500 "$LOG_PATH"
    exit 4
  fi
done

# 7. Grep the logs for success-related keywords and ensure none are missing
#    The first keyword that is absent triggers the SMS alert, saves the log,
#    prints its last 500 lines and exits with status 5.
echo "[$(date +'%Y-%m-%d %H:%M:%S')] [INFO] Checking logs for success-related keywords..." | tee -a "$LOG_PATH"
for keyword in "${SUCCESS_KEYWORDS[@]}"; do
  if ! log_contains_keyword "$keyword" "$LOG_PATH"; then
    echo "[$(date +'%Y-%m-%d %H:%M:%S')] [INFO] Failure: Keyword '$keyword' not found in logs. Exiting job as failed." | tee -a "$LOG_PATH"
    /bin/bash "$SCRIPTPATH/send_SMS_alert.sh" | tee -a "$LOG_PATH"
    cp "$LOG_PATH" "$SCRIPTPATH/Lastlog/lastpodlog.txt"
    tail -n 500 "$LOG_PATH"
    exit 5
  fi
done

# 8. Show the tail of the log on the terminal (for the execution agent) and
#    refresh the Lastlog copy.
printf '[%s] [INFO] Displaying the last 500 lines of the log file:\n' "$(date +'%Y-%m-%d %H:%M:%S')" | tee -a "$LOG_PATH"
tail -n 500 "$LOG_PATH"
cp "$LOG_PATH" "$SCRIPTPATH/Lastlog/lastpodlog.txt"

# 9. Record success in the log and finish with status 0.
printf '[%s] [INFO] Job %s completed successfully.\n' "$(date +'%Y-%m-%d %H:%M:%S')" "$JOB_NAME" | tee -a "$LOG_PATH"
exit 0

DEV Community

Monitor k8 pods

Top comments (0)