DEV Community

ByteLedger
ByteLedger

Posted on

Monitor pods in K8s

#!/usr/bin/env bash
set -o errexit -o nounset -o pipefail

# ==========================================================
# SETTINGS — edit the defaults below, or override per run with flags
# ----------------------------------------------------------
# NAMESPACE        : namespace that contains the CronJob.
# CRONJOB_NAME     : CronJob to run once, on demand.
# JOB_PREFIX       : name prefix of the created Job (empty = CRONJOB_NAME).
# SAMPLE_INTERVAL  : seconds between two CPU/Memory samples.
# WAIT_POD_TIMEOUT : seconds to wait for the Job's Pod to show up.
# CONTAINER_NAME   : container to follow when the Pod has several (optional).
#
# Matching flags:
#   -n/--namespace <ns>    -c/--cronjob <name>   -j/--job-prefix <prefix>
#   --interval <secs>      --wait <secs>         --container <name>
#   --tue-sat-only         (skip Sun/Mon)
#
# Results land in:   <script_dir>/output/<YYYYmmdd-HHMMSS>_<jobPrefix>/
# Files written:     pod.log   and   resources.csv
# ==========================================================

# ---- Script location; the output root is a sibling "output" directory ----
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
OUTPUT_ROOT="${SCRIPT_DIR}/output"

# ---- Tooling: honour $KUBECTL from the environment, else use kubectl from PATH ----
KUBECTL="${KUBECTL:-kubectl}"

# ---- What to run (placeholders must be replaced or overridden with flags) ----
NAMESPACE="your-namespace"         # e.g., "prod", "staging", "data"
CRONJOB_NAME="your-cronjob-name"   # e.g., "daily-import", "billing-job"
JOB_PREFIX=""                      # empty means: fall back to CRONJOB_NAME
CONTAINER_NAME=""                  # empty means: first container of the Pod

# ---- Timing ----
SAMPLE_INTERVAL=10                 # seconds between metric samples
WAIT_POD_TIMEOUT=900               # seconds to wait for pod creation

# ---- Scheduling guard; switched on by --tue-sat-only ----
TUE_SAT_ONLY="false"

#######################################
# Print the help text and terminate the script.
# Globals:   NAMESPACE, CRONJOB_NAME, SAMPLE_INTERVAL, WAIT_POD_TIMEOUT
#            (read; shown as the current defaults)
# Arguments: $1 - optional exit status. Defaults to 1, which is what every
#                 caller that passes no argument gets. Pass 0 when help was
#                 explicitly requested. Non-numeric values fall back to 1.
# Outputs:   help text on stdout
# Returns:   never returns; exits with the chosen status
#######################################
usage() {
  local status="${1:-1}"
  # Guard `exit` against garbage so a bad argument cannot mask the intent.
  [[ "$status" =~ ^[0-9]+$ ]] || status=1

  cat <<EOF
Usage: ${0##*/} [-n NS] [-c CRON] [-j JOBPREFIX] [--container NAME]
                [--interval SECS] [--wait SECS] [--tue-sat-only]

Examples:
  ${0##*/} -n data -c daily-import --interval 5
  ${0##*/} -n prod -c billing-job -j billing --container worker

Flags:
  -n, --namespace     Kubernetes namespace (default: "$NAMESPACE")
  -c, --cronjob       CronJob name (default: "$CRONJOB_NAME")
  -j, --job-prefix    Job name prefix (default: CRONJOB_NAME)
      --container     Specific container to log (default: first container)
      --interval      Metrics sample interval seconds (default: $SAMPLE_INTERVAL)
      --wait          Timeout waiting for pod seconds (default: $WAIT_POD_TIMEOUT)
      --tue-sat-only  Only run Tue–Sat (skip Sun/Mon)
  -h, --help
EOF
  exit "$status"
}

# Local wall-clock timestamp used as the prefix of console/log messages,
# formatted as "YYYY-MM-DD HH:MM:SS".
ts() {
  date '+%Y-%m-%d %H:%M:%S'
}

# UTC timestamp in "YYYY-MM-DDTHH:MM:SSZ" form, used for CSV sample rows.
tsz() {
  date -u '+%Y-%m-%dT%H:%M:%SZ'
}

# ---- Parse command-line flags ----
#######################################
# Copy command-line flags into the global settings.
# Globals:   NAMESPACE, CRONJOB_NAME, JOB_PREFIX, CONTAINER_NAME,
#            SAMPLE_INTERVAL, WAIT_POD_TIMEOUT, TUE_SAT_ONLY (written)
# Arguments: the script's command-line arguments
# Returns:   0 when everything was consumed; otherwise terminates through
#            usage() (missing value, unknown flag, or -h/--help)
#######################################
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      -n|--namespace|-c|--cronjob|-j|--job-prefix|--container|--interval|--wait)
        # Check the count before touching $2: under `set -u` a missing value
        # would otherwise abort with a bare "unbound variable" message.
        if (( $# < 2 )); then
          echo "Missing value for: $1" >&2
          usage
        fi
        case "$1" in
          -n|--namespace)   NAMESPACE="$2" ;;
          -c|--cronjob)     CRONJOB_NAME="$2" ;;
          -j|--job-prefix)  JOB_PREFIX="$2" ;;
          --container)      CONTAINER_NAME="$2" ;;
          --interval)       SAMPLE_INTERVAL="$2" ;;
          --wait)           WAIT_POD_TIMEOUT="$2" ;;
        esac
        shift 2
        ;;
      --tue-sat-only)
        TUE_SAT_ONLY="true"
        shift
        ;;
      -h|--help)
        # Asks for exit status 0 because help was requested on purpose; the
        # final status is whatever usage() decides to exit with.
        usage 0
        ;;
      *)
        echo "Unknown arg: $1" >&2
        usage
        ;;
    esac
  done
}
parse_args "$@"

# ---- Basic validation ----
# The placeholder defaults are rejected so the script never runs against a
# namespace or CronJob the operator did not choose explicitly.
if [[ -z "$NAMESPACE" || "$NAMESPACE" == "your-namespace" ]]; then
  echo "[ERROR $(ts)] Please set a real namespace via -n/--namespace." >&2
  exit 2
fi
if [[ -z "$CRONJOB_NAME" || "$CRONJOB_NAME" == "your-cronjob-name" ]]; then
  echo "[ERROR $(ts)] Please set a real CronJob name via -c/--cronjob." >&2
  exit 2
fi

# SAMPLE_INTERVAL is handed to `sleep`; a non-numeric value would make the
# sampler fail after its first row instead of failing here with a message.
if [[ ! "$SAMPLE_INTERVAL" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
  echo "[ERROR $(ts)] --interval must be a number of seconds (got '$SAMPLE_INTERVAL')." >&2
  exit 2
fi

# WAIT_POD_TIMEOUT is used inside \$(( )) arithmetic, where arbitrary text is
# evaluated as an expression and a leading zero is read as octal; accept only
# a plain decimal integer.
if [[ ! "$WAIT_POD_TIMEOUT" =~ ^(0|[1-9][0-9]*)$ ]]; then
  echo "[ERROR $(ts)] --wait must be a whole number of seconds (got '$WAIT_POD_TIMEOUT')." >&2
  exit 2
fi

# ---- Cluster reachability & object checks ----
# Every probe is silenced; on failure only the script's own diagnostic is
# printed to stderr, and each kind of failure has its own exit code (3, 4, 5).
"$KUBECTL" version --request-timeout=5s >/dev/null 2>&1 || {
  echo "[ERROR $(ts)] kubectl cannot connect to the cluster." >&2
  exit 3
}

"$KUBECTL" get ns "$NAMESPACE" >/dev/null 2>&1 || {
  echo "[ERROR $(ts)] Namespace '$NAMESPACE' not found or not permitted." >&2
  exit 4
}

"$KUBECTL" -n "$NAMESPACE" get cronjob "$CRONJOB_NAME" >/dev/null 2>&1 || {
  echo "[ERROR $(ts)] CronJob '$CRONJOB_NAME' not found in namespace '$NAMESPACE'." >&2
  exit 5
}

# ---- Optional day gating (Tue–Sat) ----
# With --tue-sat-only the script ends successfully, without creating a Job,
# when today is Sunday or Monday.
if [[ "$TUE_SAT_ONLY" == "true" ]]; then
  dow=$(date +%u) # ISO weekday number: 1=Mon..7=Sun
  case "$dow" in
    1|7)
      echo "[INFO  $(ts)] Skipping run (Sun/Mon) due to --tue-sat-only."
      exit 0
      ;;
  esac
fi

# ---- Prepare run folder & filenames ----
# One directory per run, named <timestamp>_<job prefix>, below OUTPUT_ROOT.
TIMESTAMP="$(date '+%Y%m%d-%H%M%S')"
if [[ -z "$JOB_PREFIX" ]]; then
  # No explicit prefix given: reuse the CronJob name.
  JOB_PREFIX="$CRONJOB_NAME"
fi
RUN_DIR="${OUTPUT_ROOT}/${TIMESTAMP}_${JOB_PREFIX}"
POD_LOG="${RUN_DIR}/pod.log"
RES_CSV="${RUN_DIR}/resources.csv"

mkdir -p "$RUN_DIR"
echo "[INFO  $(ts)] Output folder: $RUN_DIR"

# Start the CSV with its header row (truncates any existing file).
printf '%s\n' "ts_utc,container,cpu_m,mem_bytes,node_cpu_pct,node_mem_bytes" > "$RES_CSV"

# ---- Create Job from CronJob ----
# The Job is named <prefix>-<timestamp> and instantiated from the CronJob's
# template. kubectl's stdout is discarded; its stderr stays visible.
JOB_NAME="${JOB_PREFIX}-${TIMESTAMP}"
echo "[INFO  $(ts)] Creating Job from CronJob '$CRONJOB_NAME' in namespace '$NAMESPACE'..." | tee -a "$POD_LOG"
"$KUBECTL" -n "$NAMESPACE" create job "--from=cronjob/${CRONJOB_NAME}" "$JOB_NAME" >/dev/null || {
  echo "[ERROR $(ts)] Failed to create Job (name collision or RBAC?)." | tee -a "$POD_LOG"
  exit 6
}

# ---- Wait for the Job's Pod to appear ----
# Polls once per second for a Pod labelled job-name=<JOB_NAME> until one is
# found or WAIT_POD_TIMEOUT seconds have passed; exits 7 on timeout.
echo "[INFO  $(ts)] Waiting for Pod (label job-name=$JOB_NAME) ..." | tee -a "$POD_LOG"
POD_NAME=""
DEADLINE=$(( $(date +%s) + WAIT_POD_TIMEOUT ))
while [[ -z "$POD_NAME" && $(date +%s) -lt $DEADLINE ]]; do
  # While no Pod exists yet the jsonpath lookup fails; that error is silenced
  # and treated as "not there yet" so the loop simply tries again.
  POD_NAME="$("$KUBECTL" -n "$NAMESPACE" get pods -l "job-name=${JOB_NAME}" \
             -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)"
  if [[ -z "$POD_NAME" ]]; then
    sleep 1
  fi
done
if [[ -z "$POD_NAME" ]]; then
  echo "[ERROR $(ts)] Timed out waiting for Pod." | tee -a "$POD_LOG"
  exit 7
fi

# ---- Look up the node the Pod was placed on (left empty if the lookup fails) ----
NODE_NAME="$(
  "$KUBECTL" -n "$NAMESPACE" get pod "$POD_NAME" \
    -o jsonpath='{.spec.nodeName}' 2>/dev/null || true
)"
echo "[INFO  $(ts)] Pod: $POD_NAME | Node: ${NODE_NAME:-unknown}" | tee -a "$POD_LOG"

# ---- Decide which container to log ----
# Without --container, use the first container listed in the Pod spec.
if [[ -z "$CONTAINER_NAME" ]]; then
  CONTAINER_NAME="$(
    "$KUBECTL" -n "$NAMESPACE" get pod "$POD_NAME" \
      -o jsonpath='{.spec.containers[0].name}' 2>/dev/null || true
  )"
fi
echo "[INFO  $(ts)] Monitoring container: ${CONTAINER_NAME:-<first-container>}" | tee -a "$POD_LOG"

# ---- Check Metrics API availability (best-effort) ----
# A single `kubectl top pod` call decides whether metrics are sampled at all.
# NOTE(review): this probe runs right after the Pod is discovered; if metrics
# for a brand-new Pod are not published yet it would fail and disable sampling
# for the whole run — confirm against the target cluster.
if "$KUBECTL" -n "$NAMESPACE" top pod "$POD_NAME" >/dev/null 2>&1; then
  METRICS_OK="true"
else
  METRICS_OK="true"
  echo "[WARN  $(ts)] Metrics API not available; resources.csv will be minimal." | tee -a "$POD_LOG"
  METRICS_OK="false"
fi

# ---- Background 1: stream pod logs into pod.log ----
# Assemble the `kubectl logs` command as an array; "-c <container>" is added
# only when a container name is known.
log_cmd=("$KUBECTL" -n "$NAMESPACE" logs -f "$POD_NAME")
if [[ -n "$CONTAINER_NAME" ]]; then
  log_cmd+=(-c "$CONTAINER_NAME")
fi
log_cmd+=(--timestamps)

# The follower runs in the background; stdout and stderr are both appended to
# pod.log, framed by BEGIN/END marker lines.
{
  echo "[INFO  $(ts)] --- BEGIN POD LOG STREAM ---"
  "${log_cmd[@]}"
  echo "[INFO  $(ts)] --- END POD LOG STREAM ---"
} >> "$POD_LOG" 2>&1 &
LOG_PID=$!

# ---- Background 2: sample consolidated resource usage into resources.csv ----
stop_sampling="false"

#######################################
# Convert a quantity such as "256Mi" or "2G" into plain bytes.
# Arguments: $1 - quantity as printed by `kubectl top`
# Outputs:   byte count on stdout; the input is echoed unchanged when it has
#            no known suffix or its numeric part is not a plain integer
#            (so an unexpected value never aborts the caller)
#######################################
_mem_to_bytes() {
  local raw="${1:-}"
  local digits factor
  case "$raw" in
    *Ki)    digits="${raw%Ki}"; factor=$(( 1024 )) ;;
    *Mi)    digits="${raw%Mi}"; factor=$(( 1024 * 1024 )) ;;
    *Gi)    digits="${raw%Gi}"; factor=$(( 1024 * 1024 * 1024 )) ;;
    *Ti)    digits="${raw%Ti}"; factor=$(( 1024 * 1024 * 1024 * 1024 )) ;;
    *K|*k)  digits="${raw%[Kk]}"; factor=$(( 1000 )) ;;
    *M)     digits="${raw%M}"; factor=$(( 1000 * 1000 )) ;;
    *G)     digits="${raw%G}"; factor=$(( 1000 * 1000 * 1000 )) ;;
    *T)     digits="${raw%T}"; factor=$(( 1000 * 1000 * 1000 * 1000 )) ;;
    *)      printf '%s\n' "$raw"; return 0 ;;
  esac
  if [[ "$digits" =~ ^[0-9]+$ ]]; then
    # 10# forces base 10 so a leading zero is not read as octal.
    printf '%s\n' "$(( 10#$digits * factor ))"
  else
    printf '%s\n' "$raw"
  fi
}

#######################################
# Append one CSV row per SAMPLE_INTERVAL until stop_sampling is "true"
# (in practice the caller ends a backgrounded copy with `kill`).
# Row layout: ts_utc,container,cpu_m,mem_bytes,node_cpu_pct,node_mem_bytes
# Globals:   KUBECTL, NAMESPACE, POD_NAME, CONTAINER_NAME, METRICS_OK,
#            SAMPLE_INTERVAL, RES_CSV, stop_sampling (read);
#            NODE_NAME (read; filled in here if it is still empty)
# Outputs:   rows appended to $RES_CSV; fields that could not be obtained
#            are left empty
#######################################
sample_loop() {
  local sample_ts container cpu_m mem_b node_cpu node_mem_b
  local row cpu_raw
  local -a cols=()

  while [[ "$stop_sampling" != "true" ]]; do
    sample_ts="$(tsz)"
    container=""; cpu_m=""; mem_b=""; node_cpu=""; node_mem_b=""

    if [[ "$METRICS_OK" == "true" ]]; then
      # `kubectl top pod --containers` columns:
      #   POD  NAME(container)  CPU(cores)  MEMORY(bytes)
      # Take the row of the monitored container (first data row when no
      # container name is known). `|| true` keeps one failed call from
      # terminating the sampler under `set -e` + `pipefail`.
      row="$("$KUBECTL" -n "$NAMESPACE" top pod "$POD_NAME" --containers 2>/dev/null \
        | awk -v want="${CONTAINER_NAME:-}" \
            'NR > 1 && !seen && (want == "" || $2 == want) { print; seen = 1 }' \
        || true)"
      if [[ -n "$row" ]]; then
        # read -a splits on whitespace without glob expansion.
        read -r -a cols <<< "$row"
        if (( ${#cols[@]} >= 4 )); then
          container="${cols[1]}"
          cpu_raw="${cols[2]}"   # e.g., "250m" or "0" or "1"

          # CPU → millicores
          if [[ "$cpu_raw" == *m ]]; then
            cpu_m="${cpu_raw%m}"
          else
            # whole cores, e.g. "1" -> 1000
            cpu_m="$(awk -v v="$cpu_raw" 'BEGIN { printf "%.0f", v * 1000 }')"
          fi

          # Memory → bytes
          mem_b="$(_mem_to_bytes "${cols[3]}")"
        fi
      fi

      # The node name may still have been empty when the Pod was first seen;
      # look it up again until it is known.
      if [[ -z "${NODE_NAME:-}" ]]; then
        NODE_NAME="$("$KUBECTL" -n "$NAMESPACE" get pod "$POD_NAME" \
          -o jsonpath='{.spec.nodeName}' 2>/dev/null || true)"
      fi

      # Node metrics: `kubectl top node <node>` columns:
      #   NAME  CPU(cores)  CPU%  MEMORY(bytes)  MEMORY%
      if [[ -n "${NODE_NAME:-}" ]]; then
        row="$("$KUBECTL" top node "$NODE_NAME" 2>/dev/null | awk 'NR == 2' || true)"
        if [[ -n "$row" ]]; then
          read -r -a cols <<< "$row"
          if (( ${#cols[@]} >= 4 )); then
            node_cpu="${cols[2]%\%}"                     # "5%" -> "5"
            node_mem_b="$(_mem_to_bytes "${cols[3]}")"   # 4th column = MEMORY(bytes)
          fi
        fi
      fi
    fi

    printf '%s,%s,%s,%s,%s,%s\n' \
      "$sample_ts" "$container" "$cpu_m" "$mem_b" "$node_cpu" "$node_mem_b" >> "$RES_CSV"
    sleep "$SAMPLE_INTERVAL"
  done
}
# Start the sampler in the background and remember its PID.
sample_loop &
SAMP_PID=$!

# ---- Wait until pod completes (Succeeded/Failed) ----
# Poll the Pod phase every 3 seconds. A failed lookup is recorded as
# "NotFound" and also ends the wait.
PHASE=""
while :; do
  PHASE="$("$KUBECTL" -n "$NAMESPACE" get pod "$POD_NAME" -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound")"
  case "$PHASE" in
    Succeeded|Failed|NotFound) break ;;
  esac
  sleep 3
done

# ---- Stop background tasks ----
# The flag is set in this shell only; the background jobs are ended by the
# kill below. Failures are ignored because a job may already have finished.
stop_sampling="true"
for pid in "$SAMP_PID" "$LOG_PID"; do
  kill "$pid" >/dev/null 2>&1 || true
done
wait "$SAMP_PID" "$LOG_PID" 2>/dev/null || true

# ---- Final status lines in pod.log & completion ----
# Exit status: 0 = Pod Succeeded, 1 = Pod Failed, 8 = result unknown.
# Any phase other than Succeeded/Failed (e.g. "NotFound") means the outcome
# could not be established, so it must not be reported as a success.
echo "[INFO  $(ts)] Pod finished with phase: $PHASE" | tee -a "$POD_LOG"
exit_code=0
case "$PHASE" in
  Succeeded)
    echo "[OK    $(ts)] Job completed successfully." | tee -a "$POD_LOG"
    ;;
  Failed)
    echo "[ERROR $(ts)] Job ended with Failed state." | tee -a "$POD_LOG"
    exit_code=1
    ;;
  *)
    echo "[ERROR $(ts)] Pod phase is '${PHASE:-unknown}'; job result could not be determined." | tee -a "$POD_LOG"
    exit_code=8
    ;;
esac

# Always report where the artefacts are, including after a failure.
echo "[INFO  $(ts)] Output saved to: $RUN_DIR" | tee -a "$POD_LOG"
exit "$exit_code"

Enter fullscreen mode Exit fullscreen mode

Top comments (0)