DEV Community

ByteLedger
ByteLedger

Posted on • Edited on

Monitor pods in K8s

#!/usr/bin/env bash
# monitor_pod_or_cronjob.sh
# Two modes:
#  A) CronJob mode (give -c/--cronjob): create a Job from a CronJob and follow newest pod.
#  B) Pod-pattern mode (give one of --pod-exact/--pod-prefix/--pod-regex): wait for a pod that matches and follow THAT pod precisely.
#
# Output: <script_dir>/output/<YYYYmmdd-HHMMSS>_<name>/{pod.log,resources.csv}
# Requirements: kubectl (metrics-server optional for CPU/mem numbers).

set -o errexit -o nounset -o pipefail

# ==========================
# Defaults (the flag parser further down overrides most of these)
# ==========================

# --- what to monitor ---
NAMESPACE=''
CRONJOB_NAME=''       # non-empty selects CronJob mode
JOB_PREFIX=''         # left empty => the CronJob name is used as prefix
POD_EXACT=''          # Pod-pattern mode: pod name must equal this
POD_PREFIX=''         # Pod-pattern mode: pod name must start with this
POD_REGEX=''          # Pod-pattern mode: pod name must match this (grep -E)
CONTAINER_NAME=''     # container whose logs are streamed

# --- how to monitor ---
SAMPLE_INTERVAL=5     # seconds between two resource samples
WAIT_TIMEOUT=900      # seconds to wait for the target pod to show up
TUE_SAT_ONLY='false'  # "true" => do nothing on Sunday and Monday
# NOTE: LOCK_ONCE is not read anywhere else in the visible script; whether the
# monitor moves on to a newer pod is decided by --follow-newer (FOLLOW_NEWER).
LOCK_ONCE='true'

# kubectl binary to call; a non-empty KUBECTL from the environment wins.
: "${KUBECTL:=kubectl}"

# --------------------------
# Internals
# --------------------------
# Absolute directory containing this script; output is written beneath it,
# independent of the caller's working directory.
SCRIPT_DIR="$(
  cd "$(dirname "${BASH_SOURCE[0]}")" && pwd
)"
OUTPUT_ROOT="${SCRIPT_DIR}/output"
# Taken once per run; reused for the run folder name and the generated Job name.
TIMESTAMP="$(date +'%Y%m%d-%H%M%S')"

# ts: local wall-clock time, used as the prefix of log messages.
ts() {
  date +'%Y-%m-%d %H:%M:%S'
}

# tsz: UTC time in ISO-8601 form, used for the rows of resources.csv.
tsz() {
  date -u +'%Y-%m-%dT%H:%M:%SZ'
}

#######################################
# Print the help text and terminate the script.
# Arguments:
#   $1 - exit status to terminate with (optional; default 1, which is what
#        callers that pass no argument get)
# Outputs:
#   The help text on stdout when the status is 0 (help was asked for),
#   on stderr otherwise (help shown because of a bad invocation).
# Returns:
#   Never returns; exits with the given status.
#######################################
usage() {
  local status="${1:-1}"
  # Anything that is not a plain number falls back to the historical status 1.
  [[ "$status" =~ ^[0-9]+$ ]] || status=1
  local fd=1
  if (( status != 0 )); then fd=2; fi
  cat >&"$fd" <<EOF
Usage:

  # CronJob mode (create Job and monitor)
  ${0##*/} -n <ns> -c <cronjob> [--job-prefix PFX] [--container NAME] [--interval SECS] [--wait SECS] [--follow-newer=true|false]

  # Pod-pattern mode (do NOT create anything; monitor a pod by name pattern)
  ${0##*/} -n <ns> [--pod-exact NAME | --pod-prefix PREFIX | --pod-regex REGEX] [--container NAME] [--interval SECS] [--wait SECS] [--follow-newer=false]

Options:
  -n, --namespace NS         Namespace (required in both modes)
  -c, --cronjob NAME         CronJob name (CronJob mode)
  -j, --job-prefix PFX       Prefix for created Job (default: cronjob name)
      --pod-exact NAME       Exact pod name (Pod-pattern mode)
      --pod-prefix PREFIX    Pod name starts with PREFIX (Pod-pattern mode)
      --pod-regex REGEX      Pod name matches REGEX (Pod-pattern mode; grep -E)
      --container NAME       Container to stream logs (default: first / all)
      --interval SECS        Metrics sample interval (default: 5)
      --wait SECS            Timeout waiting for pod (default: 900)
      --follow-newer=BOOL    If true (default in CronJob mode), switch to newer pods when they appear.
                             In Pod-pattern mode default is true for prefix/regex, false for exact.
      --tue-sat-only         Skip on Sun/Mon
  -h, --help

Examples:
  # Monitor a specific pod by exact name:
  ${0##*/} -n data --pod-exact importer-20251012-81kdb --interval 2

  # Monitor the newest pod starting with a prefix:
  ${0##*/} -n data --pod-prefix importer- --interval 5 --follow-newer=false

  # Monitor by regex:
  ${0##*/} -n data --pod-regex '^importer-[0-9]{8}-[0-9]{6}-[a-z0-9]{5}$'

  # Old behavior: spawn from CronJob and monitor newest pod:
  ${0##*/} -n data -c daily-import --interval 5
EOF
  exit "$status"
}

# --------------------------
# Parse args
# --------------------------
# Sentinel value: lets the mode-dependent default be applied later, and only
# when --follow-newer was not given on the command line.
FOLLOW_NEWER="__unset__"

#######################################
# Abort when an option that needs a value is the last word on the command line.
# Without this check "$2" would be unset and `set -u` would stop the script
# with an "unbound variable" message that does not name the offending option.
# Arguments:
#   $1 - the option being parsed
#   $2 - number of command-line words left, the option itself included
# Outputs:
#   An error message on stderr when the value is missing.
# Returns:
#   0 when a value is present; exits the script with status 2 otherwise.
#######################################
require_value() {
  if (( $2 < 2 )); then
    echo "Missing value for option: $1" >&2
    exit 2
  fi
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    -n|--namespace)   require_value "$1" "$#"; NAMESPACE="$2"; shift 2;;
    -c|--cronjob)     require_value "$1" "$#"; CRONJOB_NAME="$2"; shift 2;;
    -j|--job-prefix)  require_value "$1" "$#"; JOB_PREFIX="$2"; shift 2;;
    --pod-exact)      require_value "$1" "$#"; POD_EXACT="$2"; shift 2;;
    --pod-prefix)     require_value "$1" "$#"; POD_PREFIX="$2"; shift 2;;
    --pod-regex)      require_value "$1" "$#"; POD_REGEX="$2"; shift 2;;
    --container)      require_value "$1" "$#"; CONTAINER_NAME="$2"; shift 2;;
    --interval)       require_value "$1" "$#"; SAMPLE_INTERVAL="$2"; shift 2;;
    --wait)           require_value "$1" "$#"; WAIT_TIMEOUT="$2"; shift 2;;
    --follow-newer=*)
      FOLLOW_NEWER="${1#*=}"
      # Later code compares against the literal string "true"; any other
      # spelling would silently mean "false", so reject it up front.
      case "$FOLLOW_NEWER" in
        true|false) ;;
        *) echo "Invalid value for --follow-newer (expected true or false): $FOLLOW_NEWER" >&2; exit 2;;
      esac
      shift 1;;
    --tue-sat-only)   TUE_SAT_ONLY="true"; shift;;
    # Requested help is not an error: show the text (in a subshell, because
    # usage terminates its caller) and leave with status 0.
    -h|--help)        ( usage 0 ) || true; exit 0;;
    *) echo "Unknown arg: $1" >&2; usage;;
  esac
done

# --------------------------
# Validations & mode detect
# --------------------------
# Reject unusable input here, before anything is created on disk or in the
# cluster. Diagnostics go to stderr.
if [[ -z "$NAMESPACE" ]]; then
  echo "[ERROR $(ts)] --namespace is required." >&2
  exit 2
fi

# WAIT_TIMEOUT is later used inside $(( ... )). A non-numeric value there is
# evaluated as a variable name and stops the script with an obscure error
# only after the output folder (and, in CronJob mode, the Job) already exist.
if [[ ! "$WAIT_TIMEOUT" =~ ^[0-9]+$ ]]; then
  echo "[ERROR $(ts)] --wait must be a whole number of seconds (got '$WAIT_TIMEOUT')." >&2
  exit 2
fi
# Force base 10 so that a value such as 0900 is not parsed as octal later on.
WAIT_TIMEOUT=$(( 10#$WAIT_TIMEOUT ))

# SAMPLE_INTERVAL is handed to sleep as-is.
if [[ ! "$SAMPLE_INTERVAL" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
  echo "[ERROR $(ts)] --interval must be a number of seconds (got '$SAMPLE_INTERVAL')." >&2
  exit 2
fi

# A CronJob name selects CronJob mode; otherwise any pod selector selects
# Pod-pattern mode.
MODE="unknown"
if [[ -n "$CRONJOB_NAME" ]]; then
  MODE="cronjob"
elif [[ -n "$POD_EXACT" || -n "$POD_PREFIX" || -n "$POD_REGEX" ]]; then
  MODE="podpattern"
else
  echo "[ERROR $(ts)] Specify either -c/--cronjob OR one of --pod-exact/--pod-prefix/--pod-regex." >&2
  exit 2
fi

# Fill in --follow-newer when the flag was not given: following newer pods is
# the default everywhere except when an exact pod name was requested outside
# CronJob mode.
if [[ "$FOLLOW_NEWER" == "__unset__" ]]; then
  FOLLOW_NEWER="true"
  if [[ "$MODE" != "cronjob" && -n "$POD_EXACT" ]]; then
    FOLLOW_NEWER="false"
  fi
fi

# Optional weekday gate: with --tue-sat-only the script does nothing on
# Monday (1) and Sunday (7), as numbered by `date +%u`.
if [[ "$TUE_SAT_ONLY" == "true" ]]; then
  dow=$(date +%u)
  case "$dow" in
    1|7)
      echo "[INFO  $(ts)] Skipping (Sun/Mon) due to --tue-sat-only."
      exit 0
      ;;
  esac
fi

# Pre-flight: the API server must answer within 5 seconds (exit 3) and the
# namespace must be readable (exit 4). kubectl's own output is discarded.
if ! "$KUBECTL" version --request-timeout=5s >/dev/null 2>&1; then
  echo "[ERROR $(ts)] kubectl cannot reach cluster."
  exit 3
fi
if ! "$KUBECTL" get ns "$NAMESPACE" >/dev/null 2>&1; then
  echo "[ERROR $(ts)] Namespace '$NAMESPACE' not found or no access."
  exit 4
fi

# --------------------------
# Utility functions
# --------------------------
#######################################
# Convert a memory quantity with a unit suffix into a plain byte count.
# Binary suffixes (Ki Mi Gi Ti Pi Ei) are powers of 1024, decimal suffixes
# (k/K M G T P E) are powers of 1000.
# Arguments:
#   $1 - quantity, e.g. 128974848, 512Ki, 123Mi, 2G
# Outputs:
#   The byte count on stdout. Input that is not "<digits><known suffix>"
#   (plain numbers, empty strings, placeholders) is echoed back unchanged.
# Returns:
#   0 always.
#######################################
mem_to_bytes() {
  local v="${1-}" digits factor
  case "$v" in
    *Ki)    digits="${v%Ki}";   factor=$(( 1024 ));;
    *Mi)    digits="${v%Mi}";   factor=$(( 1024 ** 2 ));;
    *Gi)    digits="${v%Gi}";   factor=$(( 1024 ** 3 ));;
    *Ti)    digits="${v%Ti}";   factor=$(( 1024 ** 4 ));;
    *Pi)    digits="${v%Pi}";   factor=$(( 1024 ** 5 ));;
    *Ei)    digits="${v%Ei}";   factor=$(( 1024 ** 6 ));;
    *k|*K)  digits="${v%[kK]}"; factor=$(( 1000 ));;
    *M)     digits="${v%M}";    factor=$(( 1000 ** 2 ));;
    *G)     digits="${v%G}";    factor=$(( 1000 ** 3 ));;
    *T)     digits="${v%T}";    factor=$(( 1000 ** 4 ));;
    *P)     digits="${v%P}";    factor=$(( 1000 ** 5 ));;
    *E)     digits="${v%E}";    factor=$(( 1000 ** 6 ));;
    *)      echo "$v"; return 0;;
  esac

  # Only pure digit strings may enter arithmetic: anything else would be
  # evaluated as a variable name and abort under `set -u`. 10# keeps a
  # leading zero from being read as octal.
  if [[ "$digits" =~ ^[0-9]+$ ]]; then
    echo $(( 10#$digits * factor ))
  else
    echo "$v"
  fi
}

#######################################
# Print the name of the most recently created pod that belongs to the Job.
# Globals:
#   KUBECTL, NAMESPACE, JOB_NAME (read)
# Outputs:
#   The pod name on stdout; nothing when there is no such pod yet or the
#   query failed.
# Returns:
#   0 always. Callers assign the output with a bare `var="$(...)"`, which
#   under `set -e` + `pipefail` would terminate the script on any non-zero
#   status, so "nothing found" is reported through empty output instead.
#######################################
get_newest_pod_by_job() {
  local names
  # Pods are listed oldest first; kubectl's own errors are discarded.
  names="$("$KUBECTL" -n "$NAMESPACE" get pods -l "job-name=${JOB_NAME}" \
    --sort-by=.metadata.creationTimestamp \
    -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null || true)"
  if [[ -n "$names" ]]; then
    # Keep only the last line, i.e. the newest pod.
    printf '%s\n' "${names##*$'\n'}"
  fi
  return 0
}

#######################################
# Print the newest pod in the namespace whose name satisfies the active
# selector: POD_EXACT (equality), else POD_PREFIX (starts-with), else
# POD_REGEX (grep -E).
# Globals:
#   KUBECTL, NAMESPACE, POD_EXACT, POD_PREFIX, POD_REGEX (read)
# Outputs:
#   The pod name on stdout; nothing when no pod matches, the namespace has
#   no pods, or the query failed.
# Returns:
#   0 always. Callers assign the output with a bare `var="$(...)"`; a
#   non-zero status (an empty namespace, or grep finding no match under
#   `pipefail`) would make `set -e` kill the script instead of letting the
#   caller poll again.
#######################################
get_pod_match() {
  local list matches=""
  # Pods are listed oldest first, so the last matching line is the newest.
  list="$("$KUBECTL" -n "$NAMESPACE" get pods \
          --sort-by=.metadata.creationTimestamp \
          -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null || true)"
  [[ -n "$list" ]] || return 0

  if [[ -n "$POD_EXACT" ]]; then
    matches="$(awk -v n="$POD_EXACT" '$0==n' <<<"$list")"
  elif [[ -n "$POD_PREFIX" ]]; then
    matches="$(awk -v p="$POD_PREFIX" 'index($0,p)==1' <<<"$list")"
  else
    # grep exits 1 when nothing matches; that is an expected outcome here.
    matches="$(grep -E -- "$POD_REGEX" <<<"$list" || true)"
  fi

  if [[ -n "$matches" ]]; then
    printf '%s\n' "${matches##*$'\n'}"
  fi
  return 0
}

# pod_waiting_reason POD
# Prints the waiting reason reported for the first entry of the pod's
# containerStatuses; prints nothing when the field is absent or the lookup
# fails. Always succeeds.
pod_waiting_reason() {
  local target="$1"
  local -a query=(
    -n "$NAMESPACE" get pod "$target"
    -o jsonpath='{.status.containerStatuses[0].state.waiting.reason}'
  )
  "$KUBECTL" "${query[@]}" 2>/dev/null || true
}

# --------------------------
# Output folder & filenames
# --------------------------
# Label for the run folder: the CronJob name in CronJob mode, otherwise the
# first non-empty pod selector. A regex selector has every character outside
# [A-Za-z0-9_-] replaced by "_".
if [[ "$MODE" == "cronjob" ]]; then
  NAME_FOR_FOLDER="${CRONJOB_NAME}"
elif [[ -n "$POD_EXACT" ]]; then
  NAME_FOR_FOLDER="$POD_EXACT"
elif [[ -n "$POD_PREFIX" ]]; then
  NAME_FOR_FOLDER="$POD_PREFIX"
else
  NAME_FOR_FOLDER="${POD_REGEX//[^A-Za-z0-9_-]/_}"
fi

# One folder per run, holding the log and the resource samples.
RUN_DIR="${OUTPUT_ROOT}/${TIMESTAMP}_${NAME_FOR_FOLDER}"
POD_LOG="${RUN_DIR}/pod.log"
RES_CSV="${RUN_DIR}/resources.csv"

mkdir -p "$RUN_DIR"
echo "[INFO  $(ts)] Output folder: $RUN_DIR"
# Start the CSV with its header row (truncates any existing file).
printf '%s\n' "ts_utc,container,cpu_m,mem_bytes,node_cpu_pct,node_mem_bytes" > "$RES_CSV"

# --------------------------
# CronJob mode: create Job
# --------------------------
# State describing the monitored target; filled in further down.
JOB_NAME=""
POD_NAME=""
NODE_NAME=""

if [[ "$MODE" == "cronjob" ]]; then
  # The CronJob must exist before a Job can be derived from it (exit 5).
  if ! "$KUBECTL" -n "$NAMESPACE" get cronjob "$CRONJOB_NAME" >/dev/null 2>&1; then
    echo "[ERROR $(ts)] CronJob '$CRONJOB_NAME' not found."
    exit 5
  fi

  # Job name = <prefix>-<run timestamp>; the prefix defaults to the CronJob name.
  : "${JOB_PREFIX:=$CRONJOB_NAME}"
  JOB_NAME="${JOB_PREFIX}-${TIMESTAMP}"

  echo "[INFO  $(ts)] Creating Job '$JOB_NAME' from CronJob '$CRONJOB_NAME' in ns '$NAMESPACE'..." | tee -a "$POD_LOG"
  if ! "$KUBECTL" -n "$NAMESPACE" create job --from=cronjob/"$CRONJOB_NAME" "$JOB_NAME" >/dev/null; then
    echo "[ERROR $(ts)] Failed to create Job."
    exit 6
  fi
fi

# --------------------------
# Wait for the target pod
# --------------------------
echo "[INFO  $(ts)] Waiting for target pod..." | tee -a "$POD_LOG"
# Poll once per second until a pod name is found or WAIT_TIMEOUT seconds pass.
DEADLINE=$(( $(date +%s) + WAIT_TIMEOUT ))
while [[ -z "$POD_NAME" && $(date +%s) -lt $DEADLINE ]]; do
  # `|| true`: a lookup that returns non-zero (nothing matching yet, or a
  # failed API call) must mean "try again", not terminate the script via
  # `set -e` on the assignment.
  if [[ "$MODE" == "cronjob" ]]; then
    POD_NAME="$(get_newest_pod_by_job || true)"
  else
    POD_NAME="$(get_pod_match || true)"
  fi
  [[ -n "$POD_NAME" ]] || sleep 1
done
if [[ -z "$POD_NAME" ]]; then
  echo "[ERROR $(ts)] Timed out waiting for pod." | tee -a "$POD_LOG"
  exit 7
fi

# Node hosting the pod; left empty when the lookup fails or no node is set yet.
NODE_NAME="$("$KUBECTL" -n "$NAMESPACE" get pod "$POD_NAME" -o jsonpath='{.spec.nodeName}' 2>/dev/null || true)"
echo "[INFO  $(ts)] Monitoring pod: $POD_NAME | Node: ${NODE_NAME:-unknown}" | tee -a "$POD_LOG"

# pick container hint: without --container, use the first container in the pod spec
if [[ -z "$CONTAINER_NAME" ]]; then
  CONTAINER_NAME="$("$KUBECTL" -n "$NAMESPACE" get pod "$POD_NAME" -o jsonpath='{.spec.containers[0].name}' 2>/dev/null || true)"
fi
echo "[INFO  $(ts)] Container (hint): ${CONTAINER_NAME:-<all>}" | tee -a "$POD_LOG"

# Metrics probe
# Ask the API server whether the metrics.k8s.io API answers for this
# namespace, rather than relying only on `top pod <target>`: a pod that has
# just been created usually has no samples yet, so probing the pod alone
# would switch sampling off for the whole run even though metrics exist.
# The per-pod probe is kept as a fallback, so every case that was detected
# as "available" before still is.
METRICS_AVAILABLE=true
if ! "$KUBECTL" get --raw "/apis/metrics.k8s.io/v1beta1/namespaces/${NAMESPACE}/pods" >/dev/null 2>&1 \
   && ! "$KUBECTL" -n "$NAMESPACE" top pod "$POD_NAME" >/dev/null 2>&1; then
  echo "[WARN  $(ts)] Metrics API not available; resources.csv will have blanks." | tee -a "$POD_LOG"
  METRICS_AVAILABLE=false
fi

# --------------------------
# Start log streaming
# --------------------------
#######################################
# Follow the target pod's logs on stdout.
# `kubectl logs -f` fails immediately while the container is still waiting to
# start, which is the normal state of a pod that was only just created. The
# call is therefore repeated every 2 seconds for as long as it fails AND the
# pod still reports phase Pending; any other failure ends the stream.
# Globals:
#   KUBECTL, NAMESPACE, POD_NAME, CONTAINER_NAME (read)
# Outputs:
#   Timestamped log lines on stdout; kubectl diagnostics on stderr.
# Returns:
#   0 always.
#######################################
stream_pod_logs() {
  local -a log_args=(-n "$NAMESPACE" logs -f "pod/${POD_NAME}")
  if [[ -n "$CONTAINER_NAME" ]]; then
    log_args+=(-c "$CONTAINER_NAME")
  else
    log_args+=(--all-containers)
  fi
  log_args+=(--timestamps)

  local phase
  until "$KUBECTL" "${log_args[@]}"; do
    phase="$("$KUBECTL" -n "$NAMESPACE" get pod "$POD_NAME" -o jsonpath='{.status.phase}' 2>/dev/null || true)"
    [[ "$phase" == "Pending" ]] || break
    sleep 2
  done
  return 0
}

# Runs in the background; everything it prints (stderr included) is appended
# to the pod log between the BEGIN/END markers.
{
  echo "[INFO  $(ts)] --- BEGIN POD LOG STREAM ---"
  stream_pod_logs
  echo "[INFO  $(ts)] --- END POD LOG STREAM ---"
} >> "$POD_LOG" 2>&1 &
LOG_PID=$!

# --------------------------
# Sampling loop
# --------------------------
# Loop flag for sample_loop. NOTE: the script starts sample_loop as a
# background job, i.e. in a separate process, so changing this variable in the
# main script afterwards is not seen by the running loop; the loop is ended by
# killing the job.
stop_sampling="false"

#######################################
# Append resource samples to RES_CSV every SAMPLE_INTERVAL seconds until
# stop_sampling is "true" (or the process is killed).
# Each round writes one row per container reported by
# `kubectl top pod --containers`, with the node's CPU% and memory repeated on
# every row, or a single blank row when metrics are unavailable.
# With FOLLOW_NEWER=true the loop first re-resolves the newest matching pod
# and samples that one; this switch only affects the process the loop runs in.
# Globals:
#   KUBECTL, NAMESPACE, MODE, POD_EXACT, FOLLOW_NEWER, METRICS_AVAILABLE,
#   SAMPLE_INTERVAL, RES_CSV, POD_LOG, stop_sampling (read)
#   POD_NAME, NODE_NAME (read and updated)
# Returns:
#   0 when the loop ends through stop_sampling.
#######################################
sample_loop() {
  local latest sample_ts node_cpu node_mem_b cpu_pct mem_raw ctnr cpu_raw cpu_m mem_b
  while [[ "$stop_sampling" != "true" ]]; do
    # optionally follow newer pod if requested (for cronjob / prefix / regex cases)
    if [[ "$FOLLOW_NEWER" == "true" ]]; then
      # exact => stay on the current pod; cronjob / prefix / regex => may move.
      # `|| true`: a failed or empty lookup must leave the sampler running on
      # the current pod instead of ending it through `set -e`.
      latest="$POD_NAME"
      if [[ "$MODE" == "cronjob" ]]; then
        latest="$(get_newest_pod_by_job || true)"
      elif [[ -z "$POD_EXACT" ]]; then
        latest="$(get_pod_match || true)"
      fi
      if [[ -n "$latest" && "$latest" != "$POD_NAME" ]]; then
        POD_NAME="$latest"
        NODE_NAME="$("$KUBECTL" -n "$NAMESPACE" get pod "$POD_NAME" -o jsonpath='{.spec.nodeName}' 2>/dev/null || true)"
        echo "[INFO  $(ts)] Switched to pod: $POD_NAME | Node: ${NODE_NAME:-unknown}" >> "$POD_LOG"
      fi
    fi

    sample_ts="$(tsz)"
    node_cpu=""
    node_mem_b=""

    # Node figures: 3rd and 4th column of the first data row of `top node`.
    if [[ "$METRICS_AVAILABLE" == true && -n "$NODE_NAME" ]]; then
      if read -r _ _ cpu_pct mem_raw _ < <("$KUBECTL" top node "$NODE_NAME" 2>/dev/null | awk 'NR==2{print $1,$2,$3,$4,$5}'); then
        node_cpu="${cpu_pct%%%}"   # drop the trailing percent sign
        node_mem_b="$(mem_to_bytes "$mem_raw" || true)"
      fi
    fi

    if [[ "$METRICS_AVAILABLE" == true && -n "$POD_NAME" ]]; then
      # Columns read per data row: pod, container, cpu, memory.
      while read -r _ ctnr cpu_raw mem_raw _; do
        [[ -z "$ctnr" ]] && continue
        # CPU is stored in millicores: strip an "m" suffix, otherwise treat
        # the value as whole cores and multiply by 1000.
        if [[ "$cpu_raw" == *m ]]; then
          cpu_m="${cpu_raw%m}"
        else
          cpu_m=$(awk -v v="$cpu_raw" 'BEGIN{printf "%.0f", v*1000}')
        fi
        mem_b="$(mem_to_bytes "$mem_raw" || true)"
        echo "$sample_ts,$ctnr,$cpu_m,$mem_b,${node_cpu},${node_mem_b}" >> "$RES_CSV"
      done < <("$KUBECTL" -n "$NAMESPACE" top pod "$POD_NAME" --containers 2>/dev/null | awk 'NR>1{print $1,$2,$3,$4,$5}')
    else
      echo "$sample_ts,,,,," >> "$RES_CSV"
    fi

    sleep "$SAMPLE_INTERVAL"
  done
}
# Run the sampler as a background job and remember its PID for shutdown.
sample_loop &
SAMP_PID=$!

# --------------------------
# Watch for end
# --------------------------
# Poll the pod every 3 seconds. Stop once it reaches a terminal phase, can no
# longer be queried, or is stuck in a waiting reason that will not resolve on
# its own.
PHASE=""
while :; do
  # POD_NAME as known to this (main) process.
  TARGET="$POD_NAME"
  PHASE="$("$KUBECTL" -n "$NAMESPACE" get pod "$TARGET" -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound")"
  case "$PHASE" in
    Succeeded|Failed|NotFound) break;;
  esac

  # Give up early on hard waiting reasons.
  REASON="$(pod_waiting_reason "$TARGET" || true)"
  if [[ "$REASON" =~ ^(ImagePullBackOff|ErrImagePull|CreateContainerConfigError|CrashLoopBackOff|CreateContainerError)$ ]]; then
    echo "[ERROR $(ts)] Pod waiting reason: $REASON" | tee -a "$POD_LOG"
    break
  fi

  sleep 3
done

# --------------------------
# Stop background tasks
# --------------------------
# Signal both background jobs (sampler, log stream), then reap them. Failures
# are ignored: either job may already have finished on its own.
stop_sampling="true"
for bg_pid in "$SAMP_PID" "$LOG_PID"; do
  kill "$bg_pid" >/dev/null 2>&1 || true
done
wait "$SAMP_PID" "$LOG_PID" 2>/dev/null || true

# --------------------------
# Closing summary (console and pod log)
# --------------------------
{
  echo "[INFO  $(ts)] Pod final phase: ${PHASE:-unknown}"
  echo "[INFO  $(ts)] Output saved to: $RUN_DIR"
} | tee -a "$POD_LOG"

Enter fullscreen mode Exit fullscreen mode

Top comments (0)