Files
cds-ai/atvm/scripts/run-atvm-setup-and-collect-log.sh
anthony.wen 274b920b40 Reorganize ATVM workspace into scripts, docs, inventory, and archive
Restructure the ATVM folder to separate executable scripts from workflow documentation and long-form environment reference material.

Move setup and automation scripts into scripts/, move setup and automation guides into docs/, add top-level README and workflow conventions, and organize durable environment details into inventory/ while preserving the original long-form ATVM notes under archive/imported-notes/.

Update internal documentation paths to match the new layout and remove the archived Zone.Identifier metadata file.
2026-03-21 20:39:23 -04:00

229 lines
7.4 KiB
Bash
Executable File

#!/usr/bin/env bash
#
# Copy the ATVM setup script to a client VM, run it there, wait for the
# completion marker in the remote log, then download and verify that log.
#
# Usage:
#   EXPECTED_IP_ARG=<ip> EXPECTED_HOSTNAME_ARG=<name> <script>
#   <script> --collect-after-complete
#
# Every setting below can be overridden from the environment; the value shown
# is only used when the variable is unset or empty.
set -euo pipefail

# Client addresses that are probed over SSH, primary first.
: "${REMOTE_IP_PRIMARY:=192.168.0.121}"
: "${REMOTE_IP_SECONDARY:=192.168.3.191}"

# Account used for every ssh/scp connection.
: "${REMOTE_USER:=root}"

# Local layout: base directory, where collected logs are written, and the
# setup script that gets uploaded to the client.
: "${PROJECT_DIR:=/home/aw/code/atvm}"
: "${LOCAL_LOG_DIR:=$PROJECT_DIR/log}"
: "${LOCAL_SETUP_SCRIPT:=$PROJECT_DIR/atvm_setup_script.sh}"

# Remote layout: upload destination of the setup script and the log file that
# is polled for the completion marker and later downloaded.
: "${REMOTE_SETUP_SCRIPT:=/root/atvm_setup_script.sh}"
: "${REMOTE_LOG_FILE:=/root/atvm_setup_script.log}"

# Upper bound, in seconds, for each polling loop in this script.
: "${WAIT_TIMEOUT_SECONDS:=600}"

# First positional argument selects the mode; with no argument the script runs
# the setup and then collects the log.
MODE="${1:-run-and-collect}"

# Forwarded to the remote setup script as --expected-ip / --expected-hostname.
# Left empty here; run-and-collect mode rejects empty values later on.
: "${EXPECTED_IP_ARG:=}"
: "${EXPECTED_HOSTNAME_ARG:=}"

# Options shared by every ssh and scp call: host-key verification is disabled,
# no known_hosts entry is stored, and connection attempts give up after 5s.
# NOTE(review): presumably host keys change between provisioning runs - confirm.
SSH_OPTS=(
  -o StrictHostKeyChecking=no
  -o UserKnownHostsFile=/dev/null
  -o ConnectTimeout=5
)
# --- Preflight checks and transport selection --------------------------------

# The setup script is only uploaded when the setup is actually run. In
# collect-only mode it is never read, so a missing file must not block log
# collection. Any other MODE value keeps the original check.
if [[ "$MODE" != "--collect-after-complete" && ! -f "$LOCAL_SETUP_SCRIPT" ]]; then
  echo "ERROR: Local setup script not found: $LOCAL_SETUP_SCRIPT" >&2
  exit 1
fi

mkdir -p "$LOCAL_LOG_DIR"

if ! command -v ssh >/dev/null 2>&1 || ! command -v scp >/dev/null 2>&1; then
  echo "ERROR: ssh/scp is required." >&2
  exit 1
fi

# The downloaded log is verified with sha256sum at the end of the run; fail
# now rather than after the whole remote setup has already been executed.
if ! command -v sha256sum >/dev/null 2>&1; then
  echo "ERROR: sha256sum is required." >&2
  exit 1
fi

# Command prefixes used by the ssh/scp wrapper functions. Plain ssh/scp by
# default (key-based auth or interactive password prompt).
SSH_CMD=(ssh "${SSH_OPTS[@]}")
SCP_CMD=(scp "${SSH_OPTS[@]}")

if [[ -n "${ATVM_PASSWORD:-}" ]]; then
  if command -v sshpass >/dev/null 2>&1; then
    # Hand the password to sshpass through the SSHPASS environment variable
    # (-e) instead of -p, so it never appears in the process argument list
    # where other local users could read it via ps.
    export SSHPASS="$ATVM_PASSWORD"
    SSH_CMD=(sshpass -e ssh "${SSH_OPTS[@]}")
    SCP_CMD=(sshpass -e scp "${SSH_OPTS[@]}")
  else
    echo "WARNING: ATVM_PASSWORD is set, but sshpass is not installed. Falling back to interactive password prompts." >&2
  fi
fi
#######################################
# Run a command on a remote host over SSH as REMOTE_USER.
# Globals:   SSH_CMD (read), REMOTE_USER (read)
# Arguments: $1  - host name or address
#            $2+ - command words handed to ssh unchanged
# Returns:   exit status of the underlying ssh invocation
#######################################
run_ssh() {
  local target="${REMOTE_USER}@${1}"
  shift
  "${SSH_CMD[@]}" "$target" "$@"
}
#######################################
# Upload one local file to a remote host with scp as REMOTE_USER.
# Globals:   SCP_CMD (read), REMOTE_USER (read)
# Arguments: $1 - local source path
#            $2 - host name or address
#            $3 - destination path on the remote host
# Returns:   exit status of the underlying scp invocation
#######################################
run_scp_to_remote() {
  local local_path="$1" remote_host="$2" remote_path="$3"
  "${SCP_CMD[@]}" "$local_path" "${REMOTE_USER}@${remote_host}:${remote_path}"
}
#######################################
# Download one file from a remote host with scp as REMOTE_USER.
# Globals:   SCP_CMD (read), REMOTE_USER (read)
# Arguments: $1 - host name or address
#            $2 - source path on the remote host
#            $3 - local destination path
# Returns:   exit status of the underlying scp invocation
#######################################
run_scp_from_remote() {
  local remote_host="$1" remote_path="$2" local_path="$3"
  "${SCP_CMD[@]}" "${REMOTE_USER}@${remote_host}:${remote_path}" "$local_path"
}
#######################################
# Poll both client addresses until one of them accepts an SSH command.
# Globals:   REMOTE_IP_PRIMARY, REMOTE_IP_SECONDARY, WAIT_TIMEOUT_SECONDS (read)
# Arguments: none
# Outputs:   the first address that answered, on stdout
# Returns:   0 once a host answered; 1 if none answered before
#            WAIT_TIMEOUT_SECONDS elapsed
#######################################
wait_for_reachable_host() {
  # 'host' is declared local so the loop does not overwrite a variable of the
  # same name in the caller when the function runs outside a subshell.
  local host start_ts current_ts elapsed
  start_ts="$(date +%s)"
  while true; do
    # The primary address is always tried before the secondary one.
    for host in "$REMOTE_IP_PRIMARY" "$REMOTE_IP_SECONDARY"; do
      if run_ssh "$host" "echo ready" >/dev/null 2>&1; then
        printf '%s\n' "$host"
        return 0
      fi
    done
    # The timeout is evaluated only after a full round of probes, so at least
    # one attempt per address is always made.
    current_ts="$(date +%s)"
    elapsed=$((current_ts - start_ts))
    if (( elapsed >= WAIT_TIMEOUT_SECONDS )); then
      return 1
    fi
    sleep 5
  done
}
#######################################
# Probe both client addresses once and report the first that accepts SSH.
# Unlike the polling helpers, this makes a single pass and never sleeps.
# Globals:   REMOTE_IP_PRIMARY, REMOTE_IP_SECONDARY (read)
# Arguments: none
# Outputs:   the first address that answered, on stdout
# Returns:   0 if a host answered; 1 if neither did
#######################################
pick_initial_host() {
  # Keep the loop variable local so a caller's 'host' is never overwritten.
  local host
  for host in "$REMOTE_IP_PRIMARY" "$REMOTE_IP_SECONDARY"; do
    if run_ssh "$host" "echo ready" >/dev/null 2>&1; then
      printf '%s\n' "$host"
      return 0
    fi
  done
  return 1
}
#######################################
# Poll both client addresses until the remote log contains the setup
# completion marker.
# Globals:   REMOTE_IP_PRIMARY, REMOTE_IP_SECONDARY, REMOTE_LOG_FILE,
#            WAIT_TIMEOUT_SECONDS (read)
# Arguments: none
# Outputs:   the address on which the marker was found, on stdout
# Returns:   0 once the marker was seen; 1 if it was not seen before
#            WAIT_TIMEOUT_SECONDS elapsed
#######################################
wait_for_completed_task() {
  # 'host' is declared local so the loop does not overwrite a variable of the
  # same name in the caller when the function runs outside a subshell.
  local host start_ts current_ts elapsed
  local marker='SUCCESS: ATVM VM Setup Complete!'
  # The check runs entirely on the client: the log must exist and contain the
  # marker line. An unreachable host simply counts as "not done yet".
  local remote_check="test -f '$REMOTE_LOG_FILE' && grep -q '$marker' '$REMOTE_LOG_FILE'"

  start_ts="$(date +%s)"
  while true; do
    for host in "$REMOTE_IP_PRIMARY" "$REMOTE_IP_SECONDARY"; do
      if run_ssh "$host" "$remote_check" >/dev/null 2>&1; then
        printf '%s\n' "$host"
        return 0
      fi
    done
    # The timeout is evaluated only after a full round of probes, so at least
    # one attempt per address is always made.
    current_ts="$(date +%s)"
    elapsed=$((current_ts - start_ts))
    if (( elapsed >= WAIT_TIMEOUT_SECONDS )); then
      return 1
    fi
    sleep 5
  done
}
#######################################
# Block until a host stops answering SSH.
# Globals:   WAIT_TIMEOUT_SECONDS (read)
# Arguments: $1 - host name or address to watch
# Returns:   0 as soon as one SSH probe fails; 1 if the host was still
#            answering when WAIT_TIMEOUT_SECONDS had elapsed
#######################################
wait_for_host_offline() {
  local target="$1"
  local deadline
  deadline=$(( $(date +%s) + WAIT_TIMEOUT_SECONDS ))

  # Each successful probe means the host is still up; give up once the
  # deadline has passed, otherwise pause and probe again.
  while run_ssh "$target" "echo still-up" >/dev/null 2>&1; do
    if (( $(date +%s) >= deadline )); then
      return 1
    fi
    sleep 5
  done
  return 0
}
# --- Main flow ----------------------------------------------------------------
# 1. Validate the mode argument.
# 2. run-and-collect only: upload the setup script and start it on the client.
# 3. Wait until the client log carries the completion marker.
# 4. Download the log and verify it against the remote SHA-256.
# 5. Power the client off, unless the log contains [ERROR] records.

if [[ "$MODE" != "run-and-collect" && "$MODE" != "--collect-after-complete" ]]; then
  # Usage is printed on an error path (exit 1), so it goes to stderr.
  {
    echo "Usage:"
    echo " $0 # run setup on client, then collect log"
    echo " $0 --collect-after-complete # wait for completed client task, then collect log only"
  } >&2
  exit 1
fi

if [[ "$MODE" == "run-and-collect" ]]; then
  if [[ -z "$EXPECTED_IP_ARG" || -z "$EXPECTED_HOSTNAME_ARG" ]]; then
    echo "ERROR: run-and-collect requires EXPECTED_IP_ARG and EXPECTED_HOSTNAME_ARG." >&2
    echo "Example:" >&2
    echo " EXPECTED_IP_ARG=192.168.0.121 EXPECTED_HOSTNAME_ARG=atvm-codextest-vm $0" >&2
    exit 1
  fi

  INITIAL_HOST="$(pick_initial_host)" || {
    echo "ERROR: Could not reach ${REMOTE_IP_PRIMARY} or ${REMOTE_IP_SECONDARY} for initial setup." >&2
    exit 1
  }

  echo "Copying setup script to ${REMOTE_USER}@${INITIAL_HOST}:${REMOTE_SETUP_SCRIPT}"
  run_scp_to_remote "$LOCAL_SETUP_SCRIPT" "$INITIAL_HOST" "$REMOTE_SETUP_SCRIPT"

  echo "Running remote setup script on ${INITIAL_HOST} (disconnect is expected during IP/reboot steps)"
  # A non-zero status is tolerated here: the SSH session may be cut while the
  # client reconfigures its network or reboots. Completion is judged by the
  # log marker below, not by this exit status.
  run_status=0
  run_ssh "$INITIAL_HOST" "chmod +x '$REMOTE_SETUP_SCRIPT' && bash '$REMOTE_SETUP_SCRIPT' --expected-ip '$EXPECTED_IP_ARG' --expected-hostname '$EXPECTED_HOSTNAME_ARG'" || run_status=$?
  if (( run_status != 0 )); then
    echo "INFO: Remote run returned non-zero (${run_status}). Continuing because network reconfiguration/reboot can interrupt SSH."
  fi
fi

# Both modes wait for the completion marker before collecting anything.
echo "Waiting for completed client task marker in ${REMOTE_LOG_FILE} (timeout: ${WAIT_TIMEOUT_SECONDS}s)"
ACTIVE_HOST="$(wait_for_completed_task)" || {
  echo "ERROR: Could not detect completed task marker in remote log within timeout." >&2
  exit 1
}

echo "Host reachable at: ${ACTIVE_HOST}"

# The remote hostname becomes part of the local file name. Replace anything
# that is not a plain hostname character (e.g. '/') and fall back to a fixed
# label if the remote returned nothing.
REMOTE_HOSTNAME="$(run_ssh "$ACTIVE_HOST" "hostname" | tr -d '\r' | tail -n1)"
REMOTE_HOSTNAME="${REMOTE_HOSTNAME//[^A-Za-z0-9._-]/_}"
REMOTE_HOSTNAME="${REMOTE_HOSTNAME:-unknown-host}"

RUN_TS="$(date +%Y%m%d_%H%M%S)"
LOCAL_LOG_FILE="${LOCAL_LOG_DIR}/atvm_configuration_${REMOTE_HOSTNAME}_${RUN_TS}.log"

echo "Collecting remote log: ${REMOTE_LOG_FILE}"
run_scp_from_remote "$ACTIVE_HOST" "$REMOTE_LOG_FILE" "$LOCAL_LOG_FILE"

# Confirm the download is byte-identical to the file on the client.
REMOTE_HASH="$(run_ssh "$ACTIVE_HOST" "sha256sum '$REMOTE_LOG_FILE' | awk '{print \$1}'" | tr -d '\r' | tail -n1)"
LOCAL_HASH="$(sha256sum "$LOCAL_LOG_FILE" | awk '{print $1}')"
if [[ "$REMOTE_HASH" != "$LOCAL_HASH" ]]; then
  echo "ERROR: Hash mismatch after log copy." >&2
  echo "Remote: $REMOTE_HASH" >&2
  echo "Local: $LOCAL_HASH" >&2
  exit 1
fi

# Match only real error log records (lines that START with "[ERROR]"), not
# instructional text that merely mentions "[ERROR]".
# The scan runs on the local copy, which was just verified to be identical to
# the remote file. Doing it over SSH would treat a dropped connection (ssh
# exit 255) or a grep failure (exit 2) the same as "no match" and power off a
# VM whose log was never actually inspected.
HAS_ERRORS_IN_LOG=false
scan_status=0
grep -Eq '^\[ERROR\]' -- "$LOCAL_LOG_FILE" || scan_status=$?
case "$scan_status" in
  0) HAS_ERRORS_IN_LOG=true ;;
  1) ;; # no error records found
  *)
    echo "ERROR: Could not scan ${LOCAL_LOG_FILE} for [ERROR] entries (grep exit ${scan_status}). VM will remain powered on." >&2
    exit 1
    ;;
esac

if [[ "$HAS_ERRORS_IN_LOG" == true ]]; then
  echo "WARNING: [ERROR] entries detected in remote log. VM will remain powered on for manual inspection." >&2
else
  echo "Log indicates success with no [ERROR] entries. Powering off ${ACTIVE_HOST}."
  # The SSH session is usually torn down by the shutdown itself, so a
  # non-zero status is reported but not treated as a failure.
  shutdown_status=0
  run_ssh "$ACTIVE_HOST" "shutdown -h now" || shutdown_status=$?
  if (( shutdown_status != 0 )); then
    echo "INFO: Shutdown command returned non-zero (${shutdown_status}); this can occur if SSH disconnects during shutdown."
  fi

  echo "Waiting for ${ACTIVE_HOST} to go offline (timeout: ${WAIT_TIMEOUT_SECONDS}s)"
  if wait_for_host_offline "$ACTIVE_HOST"; then
    echo "Power-off confirmed: ${ACTIVE_HOST} is offline."
  else
    echo "WARNING: Could not confirm ${ACTIVE_HOST} offline within timeout." >&2
  fi
fi

echo "Success"
echo "Active host: ${ACTIVE_HOST}"
echo "Local log: ${LOCAL_LOG_FILE}"
echo "SHA256: ${LOCAL_HASH}"