#!/usr/bin/env bash
# monitor_pod_or_cronjob.sh
# Two modes:
# A) CronJob mode (give -c/--cronjob): create a Job from a CronJob and follow newest pod.
# B) Pod-pattern mode (give one of --pod-exact/--pod-prefix/--pod-regex): wait for a pod that matches and follow THAT pod precisely.
#
# Output: <script_dir>/output/<YYYYmmdd-HHMMSS>_<name>/{pod.log,resources.csv}
# Requirements: kubectl (metrics-server optional for CPU/mem numbers).
set -euo pipefail
# --------------------------
# Defaults (override by flags)
# --------------------------
# --- Target selection ------------------------------------------------------
NAMESPACE=          # namespace to operate in (required)
CRONJOB_NAME=       # non-empty selects CronJob mode
JOB_PREFIX=         # prefix of the created Job; falls back to CRONJOB_NAME
POD_EXACT=          # non-empty selects Pod-pattern mode (exact name)
POD_PREFIX=         # non-empty selects Pod-pattern mode (name prefix)
POD_REGEX=          # non-empty selects Pod-pattern mode (grep -E regex)
CONTAINER_NAME=     # container whose logs are streamed

# --- Behaviour -------------------------------------------------------------
# NOTE(review): LOCK_ONCE is not read anywhere in the visible script; pod
# switching is governed by --follow-newer.
LOCK_ONCE=true
SAMPLE_INTERVAL=5   # seconds between resource samples
WAIT_TIMEOUT=900    # seconds to wait for the pod to appear
TUE_SAT_ONLY=false  # set by --tue-sat-only

# --- Tooling ---------------------------------------------------------------
# kubectl binary; an already-set KUBECTL environment variable wins.
: "${KUBECTL:=kubectl}"
# --------------------------
# Internals
# --------------------------
# Absolute directory containing this script; run output is written below it.
SCRIPT_DIR="$(
  cd "$(dirname "${BASH_SOURCE[0]}")" && pwd
)"
OUTPUT_ROOT="${SCRIPT_DIR}/output"

# Start-of-run stamp (local time), reused for the run folder and Job name.
TIMESTAMP="$(date "+%Y%m%d-%H%M%S")"

# ts: print the current local time as "YYYY-mm-dd HH:MM:SS" (log prefixes).
ts() {
  date "+%Y-%m-%d %H:%M:%S"
}

# tsz: print the current UTC time as "YYYY-mm-ddTHH:MM:SSZ" (CSV rows).
tsz() {
  date -u "+%Y-%m-%dT%H:%M:%SZ"
}
# usage: print the command-line help to stdout and terminate the script.
# Arguments: none
# Outputs:   help text on stdout (script name taken from $0, path stripped)
# Returns:   never returns; always exits with status 1
usage() {
  local prog="${0##*/}"
  cat <<EOF
Usage:
# CronJob mode (create Job and monitor)
${prog} -n <ns> -c <cronjob> [--job-prefix PFX] [--container NAME] [--interval SECS] [--wait SECS] [--follow-newer=true|false]
# Pod-pattern mode (do NOT create anything; monitor a pod by name pattern)
${prog} -n <ns> [--pod-exact NAME | --pod-prefix PREFIX | --pod-regex REGEX] [--container NAME] [--interval SECS] [--wait SECS] [--follow-newer=false]
Options:
-n, --namespace NS Namespace (required in both modes)
-c, --cronjob NAME CronJob name (CronJob mode)
-j, --job-prefix PFX Prefix for created Job (default: cronjob name)
--pod-exact NAME Exact pod name (Pod-pattern mode)
--pod-prefix PREFIX Pod name starts with PREFIX (Pod-pattern mode)
--pod-regex REGEX Pod name matches REGEX (Pod-pattern mode; grep -E)
--container NAME Container to stream logs (default: first / all)
--interval SECS Metrics sample interval (default: 5)
--wait SECS Timeout waiting for pod (default: 900)
--follow-newer=BOOL If true (default in CronJob mode), switch to newer pods when they appear.
In Pod-pattern mode default is true for prefix/regex, false for exact.
--tue-sat-only Skip on Sun/Mon
-h, --help
Examples:
# Monitor a specific pod by exact name:
${prog} -n data --pod-exact importer-20251012-81kdb --interval 2
# Monitor the newest pod starting with a prefix:
${prog} -n data --pod-prefix importer- --interval 5 --follow-newer=false
# Monitor by regex:
${prog} -n data --pod-regex '^importer-[0-9]{8}-[0-9]{6}-[a-z0-9]{5}$'
# Old behavior: spawn from CronJob and monitor newest pod:
${prog} -n data -c daily-import --interval 5
EOF
  exit 1
}
# --------------------------
# Parse args
# --------------------------
# Sentinel meaning "--follow-newer was not given"; the per-mode default is
# chosen later, once the mode is known.
FOLLOW_NEWER="__unset__"

# _require_value: abort when a value-taking option is the last argument.
# Arguments: the remaining command line, starting with the option itself
# Outputs:   an error message on stderr when the value is missing
# Returns:   0 when a value follows; otherwise exits the script with status 2
_require_value() {
  if [[ $# -lt 2 ]]; then
    echo "Option '$1' requires a value." >&2
    exit 2
  fi
}

# parse_args: translate command-line flags into the script's global settings.
# Globals:   writes NAMESPACE, CRONJOB_NAME, JOB_PREFIX, POD_EXACT, POD_PREFIX,
#            POD_REGEX, CONTAINER_NAME, SAMPLE_INTERVAL, WAIT_TIMEOUT,
#            FOLLOW_NEWER, TUE_SAT_ONLY
# Arguments: the script's command line ("$@")
# Returns:   0 on success; exits 2 on a missing or malformed option value;
#            unknown flags and -h/--help hand over to usage
parse_args() {
  # --wait is later used in shell arithmetic, so only plain digits are allowed.
  local int_re='^[0-9]+$'
  # --interval is handed to sleep: a number, optionally with a sleep-style unit.
  local num_re='^([0-9]+([.][0-9]*)?|[.][0-9]+)[smhd]?$'

  while [[ $# -gt 0 ]]; do
    case "$1" in
      -n|--namespace)  _require_value "$@"; NAMESPACE="$2"; shift 2 ;;
      -c|--cronjob)    _require_value "$@"; CRONJOB_NAME="$2"; shift 2 ;;
      -j|--job-prefix) _require_value "$@"; JOB_PREFIX="$2"; shift 2 ;;
      --pod-exact)     _require_value "$@"; POD_EXACT="$2"; shift 2 ;;
      --pod-prefix)    _require_value "$@"; POD_PREFIX="$2"; shift 2 ;;
      --pod-regex)     _require_value "$@"; POD_REGEX="$2"; shift 2 ;;
      --container)     _require_value "$@"; CONTAINER_NAME="$2"; shift 2 ;;
      --interval)
        _require_value "$@"
        if [[ ! "$2" =~ $num_re ]]; then
          echo "--interval expects a number of seconds, got '$2'." >&2
          exit 2
        fi
        SAMPLE_INTERVAL="$2"; shift 2 ;;
      --wait)
        _require_value "$@"
        if [[ ! "$2" =~ $int_re ]]; then
          echo "--wait expects a whole number of seconds, got '$2'." >&2
          exit 2
        fi
        WAIT_TIMEOUT="$2"; shift 2 ;;
      --follow-newer=*)
        # Anything other than the literal "true" used to silently mean false.
        case "${1#*=}" in
          true|false) FOLLOW_NEWER="${1#*=}" ;;
          *)
            echo "--follow-newer expects true or false, got '${1#*=}'." >&2
            exit 2 ;;
        esac
        shift ;;
      --tue-sat-only) TUE_SAT_ONLY="true"; shift ;;
      -h|--help) usage ;;
      *) echo "Unknown arg: $1" >&2; usage ;;
    esac
  done
}

parse_args "$@"
# --------------------------
# Validations & mode detect
# --------------------------
# The namespace is mandatory in both modes.
if [[ -z "$NAMESPACE" ]]; then
  echo "[ERROR $(ts)] --namespace is required."
  exit 2
fi

# Mode detection: a CronJob name takes precedence; otherwise any one of the
# three pod-name patterns selects pod-pattern mode.
MODE="unknown"
if [[ -n "$CRONJOB_NAME" ]]; then
  MODE="cronjob"
elif [[ -n "${POD_EXACT}${POD_PREFIX}${POD_REGEX}" ]]; then
  MODE="podpattern"
else
  echo "[ERROR $(ts)] Specify either -c/--cronjob OR one of --pod-exact/--pod-prefix/--pod-regex."
  exit 2
fi

# Default for --follow-newer when the flag was not given: follow newer pods,
# except in pod-pattern mode with an exact pod name.
if [[ "$FOLLOW_NEWER" == "__unset__" ]]; then
  FOLLOW_NEWER="true"
  if [[ "$MODE" != "cronjob" && -n "$POD_EXACT" ]]; then
    FOLLOW_NEWER="false"
  fi
fi

# Optional day gating: `date +%u` numbers Monday as 1 and Sunday as 7.
if [[ "$TUE_SAT_ONLY" == "true" ]]; then
  dow=$(date +%u)
  case "$dow" in
    1|7)
      echo "[INFO $(ts)] Skipping (Sun/Mon) due to --tue-sat-only."
      exit 0
      ;;
  esac
fi

# Cluster pre-flight: the API server must answer and the namespace be readable.
if ! "$KUBECTL" version --request-timeout=5s >/dev/null 2>&1; then
  echo "[ERROR $(ts)] kubectl cannot reach cluster."
  exit 3
fi
if ! "$KUBECTL" get ns "$NAMESPACE" >/dev/null 2>&1; then
  echo "[ERROR $(ts)] Namespace '$NAMESPACE' not found or no access."
  exit 4
fi
# --------------------------
# Utility functions
# --------------------------
# mem_to_bytes: convert a Kubernetes-style memory quantity to plain bytes.
# Arguments: $1 - quantity such as "512Mi", "2G", "1.5Gi" or "1048576"
# Outputs:   byte count on stdout; input without a recognised suffix, or with
#            a non-numeric amount, is echoed back unchanged
# Returns:   0
mem_to_bytes() {
  local raw="${1:-}" amount factor
  local int_re='^[0-9]+$'
  local dec_re='^[0-9]*[.][0-9]+$'

  case "$raw" in
    # Binary (power-of-two) suffixes.
    *Ki) amount="${raw%Ki}"; factor=$(( 1024 )) ;;
    *Mi) amount="${raw%Mi}"; factor=$(( 1024 * 1024 )) ;;
    *Gi) amount="${raw%Gi}"; factor=$(( 1024 * 1024 * 1024 )) ;;
    *Ti) amount="${raw%Ti}"; factor=$(( 1024 * 1024 * 1024 * 1024 )) ;;
    *Pi) amount="${raw%Pi}"; factor=$(( 1024 * 1024 * 1024 * 1024 * 1024 )) ;;
    # Decimal suffixes; kilo is accepted in either letter case.
    *k|*K) amount="${raw%[kK]}"; factor=$(( 1000 )) ;;
    *M) amount="${raw%M}"; factor=$(( 1000 * 1000 )) ;;
    *G) amount="${raw%G}"; factor=$(( 1000 * 1000 * 1000 )) ;;
    *T) amount="${raw%T}"; factor=$(( 1000 * 1000 * 1000 * 1000 )) ;;
    *P) amount="${raw%P}"; factor=$(( 1000 * 1000 * 1000 * 1000 * 1000 )) ;;
    *) printf '%s\n' "$raw"; return 0 ;;
  esac

  if [[ "$amount" =~ $int_re ]]; then
    # 10# forces base 10 so a leading zero is not parsed as octal.
    printf '%s\n' "$(( 10#$amount * factor ))"
  elif [[ "$amount" =~ $dec_re ]]; then
    # Shell arithmetic is integer-only; let awk handle fractional amounts.
    awk -v n="$amount" -v f="$factor" 'BEGIN { printf "%.0f\n", n * f }'
  else
    # Not a number: pass it through rather than failing in arithmetic.
    printf '%s\n' "$raw"
  fi
}
# get_newest_pod_by_job: print the most recently created pod of the Job.
# Globals:   KUBECTL, NAMESPACE, JOB_NAME (read)
# Outputs:   the pod name on stdout, or nothing when no pod exists yet or the
#            kubectl query fails
# Returns:   0 always, so callers running under `set -e` can simply poll
get_newest_pod_by_job() {
  local names
  # Pods are listed oldest-first; a failed query is treated as "no pods yet".
  names="$("$KUBECTL" -n "$NAMESPACE" get pods -l "job-name=${JOB_NAME}" \
    --sort-by=.metadata.creationTimestamp \
    -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null)" || names=""
  [[ -n "$names" ]] || return 0
  printf '%s\n' "$names" | tail -n 1
}
# get_pod_match: print the newest pod whose name matches the active pattern.
# Globals:   KUBECTL, NAMESPACE, POD_EXACT, POD_PREFIX, POD_REGEX (read)
# Outputs:   the matching pod name on stdout; nothing when the namespace has
#            no pods, the query fails, or no name matches
# Returns:   0 always. "No match" is reported as empty output so that callers
#            assigning "$(get_pod_match)" under `set -e` keep polling instead
#            of aborting.
get_pod_match() {
  local names
  # Pods are listed oldest-first, so the last matching line is the newest.
  names="$("$KUBECTL" -n "$NAMESPACE" get pods \
    --sort-by=.metadata.creationTimestamp \
    -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null)" || names=""
  [[ -n "$names" ]] || return 0

  # Precedence when several patterns are set: exact, then prefix, then regex.
  # grep exits 1 on "no match"; `|| true` keeps pipefail from propagating it.
  if [[ -n "$POD_EXACT" ]]; then
    { printf '%s\n' "$names" | grep -Fx -- "$POD_EXACT" || true; } | tail -n 1
  elif [[ -n "$POD_PREFIX" ]]; then
    printf '%s\n' "$names" | awk -v p="$POD_PREFIX" 'index($0,p)==1' | tail -n 1
  else
    { printf '%s\n' "$names" | grep -E -- "$POD_REGEX" || true; } | tail -n 1
  fi
}
# pod_waiting_reason: print why the pod's first container is in a waiting state.
# Globals:   KUBECTL, NAMESPACE (read)
# Arguments: $1 - pod name
# Outputs:   whatever kubectl prints for the waiting reason of
#            containerStatuses[0]; kubectl's stderr is discarded
# Returns:   0 even when the kubectl query fails
pod_waiting_reason() {
  local target="$1"
  local query='{.status.containerStatuses[0].state.waiting.reason}'
  "$KUBECTL" -n "$NAMESPACE" get pod "$target" -o jsonpath="$query" 2>/dev/null || true
}
# --------------------------
# Output folder & filenames
# --------------------------
# Label for the run folder: the CronJob name, or the first non-empty pod
# pattern (exact, then prefix, then regex). Only the regex is sanitised, with
# every character outside [A-Za-z0-9_-] replaced by "_".
if [[ "$MODE" == "cronjob" ]]; then
  NAME_FOR_FOLDER="$CRONJOB_NAME"
elif [[ -n "$POD_EXACT" ]]; then
  NAME_FOR_FOLDER="$POD_EXACT"
elif [[ -n "$POD_PREFIX" ]]; then
  NAME_FOR_FOLDER="$POD_PREFIX"
else
  NAME_FOR_FOLDER="${POD_REGEX//[^A-Za-z0-9_-]/_}"
fi

# One folder per run, holding the log transcript and the metrics CSV.
RUN_DIR="${OUTPUT_ROOT}/${TIMESTAMP}_${NAME_FOR_FOLDER}"
POD_LOG="${RUN_DIR}/pod.log"
RES_CSV="${RUN_DIR}/resources.csv"
mkdir -p "$RUN_DIR"
echo "[INFO $(ts)] Output folder: $RUN_DIR"

# Start the CSV with its header row (truncates any existing file).
printf '%s\n' "ts_utc,container,cpu_m,mem_bytes,node_cpu_pct,node_mem_bytes" > "$RES_CSV"
# --------------------------
# CronJob mode: create Job
# --------------------------
POD_NAME=""
NODE_NAME=""
JOB_NAME=""

# build_job_name: compose "<prefix>-<stamp>" so it fits in 63 characters.
# A Job name must be no longer than 63 characters, so an over-long prefix is
# shortened rather than letting `kubectl create job` reject the name.
# Arguments: $1 - job name prefix, $2 - timestamp suffix
# Outputs:   the job name on stdout
# Returns:   0
build_job_name() {
  local prefix="$1" stamp="$2"
  local -r max_len=63
  local room=$(( max_len - ${#stamp} - 1 ))
  if (( ${#prefix} > room )); then
    prefix="${prefix:0:room}"
    # Do not leave a dangling separator where the prefix was cut.
    while [[ "$prefix" == *[-.] ]]; do
      prefix="${prefix%?}"
    done
  fi
  printf '%s-%s\n' "$prefix" "$stamp"
}

# CronJob mode only: verify the CronJob exists, then spawn a one-off Job from it.
if [[ "$MODE" == "cronjob" ]]; then
  "$KUBECTL" -n "$NAMESPACE" get cronjob "$CRONJOB_NAME" >/dev/null 2>&1 \
    || { echo "[ERROR $(ts)] CronJob '$CRONJOB_NAME' not found."; exit 5; }
  JOB_PREFIX="${JOB_PREFIX:-$CRONJOB_NAME}"
  JOB_NAME="$(build_job_name "$JOB_PREFIX" "$TIMESTAMP")"
  echo "[INFO $(ts)] Creating Job '$JOB_NAME' from CronJob '$CRONJOB_NAME' in ns '$NAMESPACE'..." | tee -a "$POD_LOG"
  "$KUBECTL" -n "$NAMESPACE" create job --from=cronjob/"$CRONJOB_NAME" "$JOB_NAME" >/dev/null \
    || { echo "[ERROR $(ts)] Failed to create Job."; exit 6; }
fi
# --------------------------
# Wait for the target pod
# --------------------------
echo "[INFO $(ts)] Waiting for target pod..." | tee -a "$POD_LOG"
DEADLINE=$(( $(date +%s) + WAIT_TIMEOUT ))
# Poll once per second until a pod shows up or the deadline passes.
while [[ -z "$POD_NAME" && $(date +%s) -lt $DEADLINE ]]; do
  # A lookup that finds nothing may return non-zero. Under `set -e` a bare
  # assignment from it would terminate the script instead of waiting, so the
  # status is deliberately ignored here.
  if [[ "$MODE" == "cronjob" ]]; then
    POD_NAME="$(get_newest_pod_by_job)" || true
  else
    POD_NAME="$(get_pod_match)" || true
  fi
  if [[ -z "$POD_NAME" ]]; then
    sleep 1
  fi
done
if [[ -z "$POD_NAME" ]]; then
  echo "[ERROR $(ts)] Timed out waiting for pod." | tee -a "$POD_LOG"
  exit 7
fi
# The node may still be unassigned for a freshly created pod; empty is fine.
NODE_NAME="$("$KUBECTL" -n "$NAMESPACE" get pod "$POD_NAME" -o jsonpath='{.spec.nodeName}' 2>/dev/null || true)"
echo "[INFO $(ts)] Monitoring pod: $POD_NAME | Node: ${NODE_NAME:-unknown}" | tee -a "$POD_LOG"
# Container hint: when --container was not given, fall back to the first
# container declared in the pod spec (left empty if that lookup fails).
[[ -n "$CONTAINER_NAME" ]] || CONTAINER_NAME="$("$KUBECTL" -n "$NAMESPACE" get pod "$POD_NAME" \
  -o jsonpath='{.spec.containers[0].name}' 2>/dev/null || true)"
echo "[INFO $(ts)] Container (hint): ${CONTAINER_NAME:-<all>}" | tee -a "$POD_LOG"
# Metrics probe.
# `kubectl top pod` on a pod that has only just been created can fail simply
# because no sample has been collected for it yet. Metrics are therefore
# declared unavailable only when the per-pod query fails AND the
# metrics.k8s.io API group itself cannot be reached.
METRICS_AVAILABLE=true
if ! "$KUBECTL" -n "$NAMESPACE" top pod "$POD_NAME" >/dev/null 2>&1 \
  && ! "$KUBECTL" get --raw /apis/metrics.k8s.io/v1beta1 >/dev/null 2>&1; then
  echo "[WARN $(ts)] Metrics API not available; resources.csv will have blanks." | tee -a "$POD_LOG"
  METRICS_AVAILABLE=false
fi
# --------------------------
# Start log streaming
# --------------------------
# Background log streamer: everything it prints is appended to pod.log.
{
  echo "[INFO $(ts)] --- BEGIN POD LOG STREAM ---"
  # `kubectl logs -f` fails immediately while the containers have not started,
  # so hold off until the pod has left the Pending phase. Any other answer
  # (Running, a terminal phase, or a failed query) ends the wait.
  while [[ "$("$KUBECTL" -n "$NAMESPACE" get pod "$POD_NAME" \
    -o jsonpath='{.status.phase}' 2>/dev/null || true)" == "Pending" ]]; do
    sleep 1
  done
  log_args=(-n "$NAMESPACE" logs -f "pod/${POD_NAME}" --timestamps)
  if [[ -n "$CONTAINER_NAME" ]]; then
    log_args+=(-c "$CONTAINER_NAME")
  else
    log_args+=(--all-containers)
  fi
  # Tolerate a failing kubectl so the END marker below is always written.
  "$KUBECTL" "${log_args[@]}" \
    || echo "[WARN $(ts)] kubectl logs exited with an error."
  echo "[INFO $(ts)] --- END POD LOG STREAM ---"
} >> "$POD_LOG" 2>&1 &
LOG_PID=$!
# --------------------------
# Sampling loop
# --------------------------
stop_sampling="false"

# sample_loop: append one set of resource rows to resources.csv per interval.
# Globals:   reads KUBECTL, NAMESPACE, MODE, FOLLOW_NEWER, POD_EXACT,
#            METRICS_AVAILABLE, SAMPLE_INTERVAL, RES_CSV, POD_LOG, stop_sampling;
#            updates POD_NAME and NODE_NAME when it switches to a newer pod
# Outputs:   CSV rows "ts_utc,container,cpu_m,mem_bytes,node_cpu_pct,node_mem_bytes"
#            appended to RES_CSV; pod-switch notices appended to POD_LOG
# Returns:   when stop_sampling is "true" at the top of an iteration
# Every lookup and conversion is guarded because a single non-zero status
# would otherwise end the sampler when errexit is active.
sample_loop() {
  local latest sample_ts node_cpu node_mem_b cpu_pct mem_raw
  local ctnr cpu_raw cpu_m mem_b

  while [[ "$stop_sampling" != "true" ]]; do
    # Optionally move on to a newer pod (cronjob / prefix / regex cases).
    if [[ "$FOLLOW_NEWER" == "true" ]]; then
      if [[ "$MODE" == "cronjob" ]]; then
        latest="$(get_newest_pod_by_job)" || latest=""
      elif [[ -n "$POD_EXACT" ]]; then
        # An exact name never moves.
        latest="$POD_NAME"
      else
        latest="$(get_pod_match)" || latest=""
      fi
      if [[ -n "$latest" && "$latest" != "$POD_NAME" ]]; then
        POD_NAME="$latest"
        NODE_NAME="$("$KUBECTL" -n "$NAMESPACE" get pod "$POD_NAME" -o jsonpath='{.spec.nodeName}' 2>/dev/null || true)"
        echo "[INFO $(ts)] Switched to pod: $POD_NAME | Node: ${NODE_NAME:-unknown}" >> "$POD_LOG"
      fi
    fi

    sample_ts="$(tsz)"
    node_cpu=""
    node_mem_b=""

    # Node-level figures: columns 3 (CPU%) and 4 (memory) of the first data
    # row printed by `kubectl top node`.
    if [[ "$METRICS_AVAILABLE" == true && -n "$NODE_NAME" ]]; then
      if read -r _ _ cpu_pct mem_raw _ < <("$KUBECTL" top node "$NODE_NAME" 2>/dev/null | awk 'NR==2{print $1,$2,$3,$4,$5}'); then
        node_cpu="${cpu_pct%\%}"
        node_mem_b="$(mem_to_bytes "$mem_raw" 2>/dev/null)" || node_mem_b=""
      fi
    fi

    # One CSV row per container of the pod.
    if [[ "$METRICS_AVAILABLE" == true && -n "$POD_NAME" ]]; then
      while read -r _ ctnr cpu_raw mem_raw _; do
        [[ -z "$ctnr" ]] && continue
        # CPU is recorded in millicores; a value without the "m" suffix is
        # taken as whole cores.
        if [[ "$cpu_raw" == *m ]]; then
          cpu_m="${cpu_raw%m}"
        else
          cpu_m="$(awk -v v="$cpu_raw" 'BEGIN{printf "%.0f", v*1000}')" || cpu_m=""
        fi
        mem_b="$(mem_to_bytes "$mem_raw" 2>/dev/null)" || mem_b=""
        echo "$sample_ts,$ctnr,$cpu_m,$mem_b,${node_cpu},${node_mem_b}" >> "$RES_CSV"
      done < <("$KUBECTL" -n "$NAMESPACE" top pod "$POD_NAME" --containers 2>/dev/null | awk 'NR>1{print $1,$2,$3,$4,$5}')
    else
      # No metrics: keep the time axis with an otherwise blank row.
      echo "$sample_ts,,,,," >> "$RES_CSV"
    fi

    sleep "$SAMPLE_INTERVAL"
  done
}
# Launch the sampler as a background process and remember its PID.
sample_loop &
SAMP_PID=$!
# --------------------------
# Watch for end
# --------------------------
# Poll the pod every 3 seconds until it reaches a terminal phase, disappears,
# or is stuck in a waiting state that will not resolve on its own.
# The pod watched is the POD_NAME held by this (main) shell; the sampler runs
# as a separate background process, so pod switches made there are not seen here.
PHASE=""
while :; do
  TARGET="$POD_NAME"
  # A failed query is reported as the pseudo-phase "NotFound".
  PHASE="$("$KUBECTL" -n "$NAMESPACE" get pod "$TARGET" -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound")"
  case "$PHASE" in
    Succeeded|Failed|NotFound) break ;;
  esac

  # Stop early on waiting reasons treated as hard failures.
  REASON="$(pod_waiting_reason "$TARGET" || true)"
  if [[ "$REASON" == "ImagePullBackOff" || "$REASON" == "ErrImagePull" \
    || "$REASON" == "CreateContainerConfigError" || "$REASON" == "CrashLoopBackOff" \
    || "$REASON" == "CreateContainerError" ]]; then
    echo "[ERROR $(ts)] Pod waiting reason: $REASON" | tee -a "$POD_LOG"
    break
  fi

  sleep 3
done
# --------------------------
# Stop background tasks
# --------------------------
stop_sampling="true"
# Terminate both background helpers, then reap them; a helper that has
# already exited is not an error.
for bg_pid in "$SAMP_PID" "$LOG_PID"; do
  kill "$bg_pid" >/dev/null 2>&1 || true
done
wait "$SAMP_PID" "$LOG_PID" 2>/dev/null || true
# --------------------------
# Final messages
# --------------------------
# Summary goes to the terminal and is appended to pod.log.
{
  echo "[INFO $(ts)] Pod final phase: ${PHASE:-unknown}"
  echo "[INFO $(ts)] Output saved to: $RUN_DIR"
} | tee -a "$POD_LOG"
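# NOTE: the two lines below are web-page footer text that was captured along
# with the script, not shell code. They are commented out so the script does
# not try to run them as commands and exit with an error.
# For further actions, you may consider blocking this person and/or reporting abuse
# Top comments (0)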