#!/usr/bin/env bash
#
# Controller-side driver for the ATVM setup script.
#
# Modes (first positional argument):
#   (none) / run-and-collect    Copy the local setup script to the first
#                               reachable candidate host, run it there, wait
#                               for the success marker in the remote log, then
#                               collect the log.
#   --collect-after-complete    Skip the copy/run step; only wait for the
#                               success marker and collect the log.
#
# After collection the local copy is verified against the remote file by
# SHA-256. If the log contains no line starting with "[ERROR]" the host is
# shut down and the script waits for it to stop answering SSH; otherwise the
# host is left running.
#
# Environment (all optional unless noted; defaults are set below):
#   WORKSPACE_ROOT, ENV_CREDENTIALS_FILE   optional file sourced for defaults
#   REMOTE_IP_PRIMARY, REMOTE_IP_SECONDARY candidate addresses, tried in order
#   REMOTE_USER (or ATVM_TARGET_USER)      SSH user
#   PROJECT_DIR, LOCAL_LOG_DIR, LOCAL_SETUP_SCRIPT
#   REMOTE_SETUP_SCRIPT, REMOTE_LOG_FILE
#   WAIT_TIMEOUT_SECONDS                   per-wait timeout, non-negative integer
#   EXPECTED_IP_ARG, EXPECTED_HOSTNAME_ARG required in run-and-collect mode
#   ATVM_PASSWORD (or ATVM_TARGET_PASSWORD) SSH password, used via sshpass
set -euo pipefail

WORKSPACE_ROOT="${WORKSPACE_ROOT:-/home/aw/code/cds}"
ENV_CREDENTIALS_FILE="${ENV_CREDENTIALS_FILE:-$WORKSPACE_ROOT/.env.credentials.local}"
if [[ -f "$ENV_CREDENTIALS_FILE" ]]; then
  # Load local-only credential defaults for controller-side SSH and remote setup.
  # Sourced at top level (not inside a function) so any `declare` in that file
  # still creates global variables.
  # shellcheck disable=SC1090
  source "$ENV_CREDENTIALS_FILE"
fi

REMOTE_IP_PRIMARY="${REMOTE_IP_PRIMARY:-192.168.0.121}"
REMOTE_IP_SECONDARY="${REMOTE_IP_SECONDARY:-192.168.3.191}"
REMOTE_USER="${REMOTE_USER:-${ATVM_TARGET_USER:-root}}"
PROJECT_DIR="${PROJECT_DIR:-/home/aw/code/atvm}"
LOCAL_LOG_DIR="${LOCAL_LOG_DIR:-$PROJECT_DIR/log}"
LOCAL_SETUP_SCRIPT="${LOCAL_SETUP_SCRIPT:-$PROJECT_DIR/atvm_setup_script.sh}"
REMOTE_SETUP_SCRIPT="${REMOTE_SETUP_SCRIPT:-/root/atvm_setup_script.sh}"
REMOTE_LOG_FILE="${REMOTE_LOG_FILE:-/root/atvm_setup_script.log}"
WAIT_TIMEOUT_SECONDS="${WAIT_TIMEOUT_SECONDS:-600}"
MODE="${1:-run-and-collect}"
EXPECTED_IP_ARG="${EXPECTED_IP_ARG:-}"
EXPECTED_HOSTNAME_ARG="${EXPECTED_HOSTNAME_ARG:-}"
ATVM_PASSWORD="${ATVM_PASSWORD:-${ATVM_TARGET_PASSWORD:-}}"

SSH_OPTS=(-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=5)

# Reject an unknown mode before touching the filesystem or the network.
if [[ "$MODE" != "run-and-collect" && "$MODE" != "--collect-after-complete" ]]; then
  {
    echo "Usage:"
    echo " $0 # run setup on client, then collect log"
    echo " $0 --collect-after-complete # wait for completed client task, then collect log only"
  } >&2
  exit 1
fi

# The timeout is evaluated inside (( )), so insist on plain digits and force
# base 10 so that values such as "08" are not parsed as invalid octal.
if [[ ! "$WAIT_TIMEOUT_SECONDS" =~ ^[0-9]+$ ]]; then
  echo "ERROR: WAIT_TIMEOUT_SECONDS must be a non-negative integer: $WAIT_TIMEOUT_SECONDS" >&2
  exit 1
fi
WAIT_TIMEOUT_SECONDS=$((10#$WAIT_TIMEOUT_SECONDS))

mkdir -p "$LOCAL_LOG_DIR"

if ! command -v ssh >/dev/null 2>&1 || ! command -v scp >/dev/null 2>&1; then
  echo "ERROR: ssh/scp is required." >&2
  exit 1
fi

SSH_CMD=(ssh "${SSH_OPTS[@]}")
SCP_CMD=(scp "${SSH_OPTS[@]}")
if [[ -n "$ATVM_PASSWORD" ]]; then
  if command -v sshpass >/dev/null 2>&1; then
    # Hand the password to sshpass through the environment (-e reads SSHPASS)
    # instead of -p, which would expose it in the local process list.
    export SSHPASS="$ATVM_PASSWORD"
    SSH_CMD=(sshpass -e ssh "${SSH_OPTS[@]}")
    SCP_CMD=(sshpass -e scp "${SSH_OPTS[@]}")
  else
    echo "WARNING: ATVM_PASSWORD is set, but sshpass is not installed. Falling back to interactive password prompts." >&2
  fi
fi

#######################################
# Wrap a string in single quotes so a POSIX shell reads it back as one literal
# word; embedded single quotes are emitted as '\''.
# Arguments: $1 - string to quote
# Outputs:   quoted string on stdout (no trailing newline)
#######################################
shell_quote() {
  local sq="'" escaped="'\\''"
  printf "'%s'" "${1//$sq/$escaped}"
}

#######################################
# Run a command on a remote host as REMOTE_USER.
# Globals:   SSH_CMD, REMOTE_USER (read)
# Arguments: $1 - host; remaining arguments are passed to ssh as the command
# Returns:   exit status of ssh
#######################################
run_ssh() {
  local host="$1"
  shift
  "${SSH_CMD[@]}" "${REMOTE_USER}@${host}" "$@"
}

#######################################
# Copy a local file to a remote host.
# Arguments: $1 - local source, $2 - host, $3 - remote destination
#######################################
run_scp_to_remote() {
  local src="$1"
  local host="$2"
  local dst="$3"
  "${SCP_CMD[@]}" "$src" "${REMOTE_USER}@${host}:${dst}"
}

#######################################
# Copy a remote file to the local machine.
# Arguments: $1 - host, $2 - remote source, $3 - local destination
#######################################
run_scp_from_remote() {
  local host="$1"
  local src="$2"
  local dst="$3"
  "${SCP_CMD[@]}" "${REMOTE_USER}@${host}:${src}" "$dst"
}

#######################################
# Try a remote command on the primary, then the secondary address.
# Globals:   REMOTE_IP_PRIMARY, REMOTE_IP_SECONDARY (read)
# Arguments: $1 - remote command string
# Outputs:   the first host on which the command succeeded
# Returns:   0 if some host succeeded, 1 otherwise
#######################################
first_host_where() {
  local remote_cmd="$1"
  local host
  for host in "$REMOTE_IP_PRIMARY" "$REMOTE_IP_SECONDARY"; do
    if run_ssh "$host" "$remote_cmd" >/dev/null 2>&1; then
      printf '%s\n' "$host"
      return 0
    fi
  done
  return 1
}

#######################################
# Re-run a command every 5 seconds until it succeeds or the timeout elapses.
# The timeout is checked after each failed attempt, so the command always runs
# at least once.
# Globals:   WAIT_TIMEOUT_SECONDS (read)
# Arguments: the command and its arguments
# Outputs:   whatever the command writes
# Returns:   0 once the command succeeds, 1 on timeout
#######################################
poll_until() {
  local start_ts now_ts
  start_ts="$(date +%s)"
  while true; do
    if "$@"; then
      return 0
    fi
    now_ts="$(date +%s)"
    if (( now_ts - start_ts >= WAIT_TIMEOUT_SECONDS )); then
      return 1
    fi
    sleep 5
  done
}

# Succeeds when an SSH probe of host $1 fails.
host_is_unreachable() {
  ! run_ssh "$1" "echo still-up" >/dev/null 2>&1
}

# Wait until one of the candidate hosts accepts SSH; prints that host.
wait_for_reachable_host() {
  poll_until first_host_where "echo ready"
}

# Single, non-waiting SSH probe of the candidate hosts; prints the first that answers.
pick_initial_host() {
  first_host_where "echo ready"
}

# Wait until REMOTE_LOG_FILE on one of the candidate hosts contains the
# success marker; prints that host.
wait_for_completed_task() {
  local quoted_log
  quoted_log="$(shell_quote "$REMOTE_LOG_FILE")"
  poll_until first_host_where \
    "test -f ${quoted_log} && grep -q 'SUCCESS: ATVM VM Setup Complete!' ${quoted_log}"
}

# Wait until host $1 stops answering SSH.
wait_for_host_offline() {
  local host="$1"
  poll_until host_is_unreachable "$host"
}

if [[ "$MODE" == "run-and-collect" ]]; then
  # The local script is only needed when it is actually going to be copied.
  if [[ ! -f "$LOCAL_SETUP_SCRIPT" ]]; then
    echo "ERROR: Local setup script not found: $LOCAL_SETUP_SCRIPT" >&2
    exit 1
  fi

  if [[ -z "$EXPECTED_IP_ARG" || -z "$EXPECTED_HOSTNAME_ARG" ]]; then
    echo "ERROR: run-and-collect requires EXPECTED_IP_ARG and EXPECTED_HOSTNAME_ARG." >&2
    echo "Example:" >&2
    echo " EXPECTED_IP_ARG=192.168.0.121 EXPECTED_HOSTNAME_ARG=atvm-codextest-vm $0" >&2
    exit 1
  fi

  INITIAL_HOST="$(pick_initial_host)" || {
    echo "ERROR: Could not reach ${REMOTE_IP_PRIMARY} or ${REMOTE_IP_SECONDARY} for initial setup." >&2
    exit 1
  }

  echo "Copying setup script to ${REMOTE_USER}@${INITIAL_HOST}:${REMOTE_SETUP_SCRIPT}"
  run_scp_to_remote "$LOCAL_SETUP_SCRIPT" "$INITIAL_HOST" "$REMOTE_SETUP_SCRIPT"

  echo "Running remote setup script on ${INITIAL_HOST} (disconnect is expected during IP/reboot steps)"
  # Every interpolated value is shell-quoted so that quotes or spaces in it
  # cannot break or alter the remote command line.
  # NOTE(review): the password still travels inside the remote command string;
  # passing it over stdin would need support in the remote setup script.
  quoted_script="$(shell_quote "$REMOTE_SETUP_SCRIPT")"
  remote_setup_cmd="chmod +x ${quoted_script}"
  remote_setup_cmd+=" && ATVM_TARGET_PASSWORD=$(shell_quote "${ATVM_TARGET_PASSWORD:-}")"
  remote_setup_cmd+=" bash ${quoted_script}"
  remote_setup_cmd+=" --expected-ip $(shell_quote "$EXPECTED_IP_ARG")"
  remote_setup_cmd+=" --expected-hostname $(shell_quote "$EXPECTED_HOSTNAME_ARG")"

  # A non-zero status is tolerated here: the setup script may change the IP or
  # reboot, which drops the SSH session.
  run_status=0
  run_ssh "$INITIAL_HOST" "$remote_setup_cmd" || run_status=$?
  if (( run_status != 0 )); then
    echo "INFO: Remote run returned non-zero (${run_status}). Continuing because network reconfiguration/reboot can interrupt SSH."
  fi
fi

# Both modes wait for the success marker before collecting the log.
echo "Waiting for completed client task marker in ${REMOTE_LOG_FILE} (timeout: ${WAIT_TIMEOUT_SECONDS}s)"
ACTIVE_HOST="$(wait_for_completed_task)" || {
  echo "ERROR: Could not detect completed task marker in remote log within timeout." >&2
  exit 1
}

echo "Host reachable at: ${ACTIVE_HOST}"

REMOTE_HOSTNAME="$(run_ssh "$ACTIVE_HOST" "hostname" | tr -d '\r' | tail -n1)"
RUN_TS="$(date +%Y%m%d_%H%M%S)"
LOCAL_LOG_FILE="${LOCAL_LOG_DIR}/atvm_configuration_${REMOTE_HOSTNAME}_${RUN_TS}.log"

echo "Collecting remote log: ${REMOTE_LOG_FILE}"
run_scp_from_remote "$ACTIVE_HOST" "$REMOTE_LOG_FILE" "$LOCAL_LOG_FILE"

REMOTE_HASH="$(run_ssh "$ACTIVE_HOST" "sha256sum $(shell_quote "$REMOTE_LOG_FILE") | awk '{print \$1}'" | tr -d '\r' | tail -n1)"
LOCAL_HASH="$(sha256sum "$LOCAL_LOG_FILE" | awk '{print $1}')"

if [[ "$REMOTE_HASH" != "$LOCAL_HASH" ]]; then
  echo "ERROR: Hash mismatch after log copy." >&2
  echo "Remote: $REMOTE_HASH" >&2
  echo "Local: $LOCAL_HASH" >&2
  exit 1
fi

# Match only real error log records. Do not match instructional text that mentions "[ERROR]".
# The scan runs on the local copy, which the hash comparison above has shown
# to be identical to the remote file. This keeps an SSH failure from being
# mistaken for "no errors found" (which would power the VM off).
HAS_ERRORS_IN_LOG=false
grep_status=0
grep -Eq '^\[ERROR\]' -- "$LOCAL_LOG_FILE" || grep_status=$?
case "$grep_status" in
  0) HAS_ERRORS_IN_LOG=true ;;
  1) ;; # no matching line
  *)
    echo "ERROR: Could not scan collected log for [ERROR] entries: $LOCAL_LOG_FILE" >&2
    exit 1
    ;;
esac

if [[ "$HAS_ERRORS_IN_LOG" == true ]]; then
  echo "WARNING: [ERROR] entries detected in remote log. VM will remain powered on for manual inspection." >&2
else
  echo "Log indicates success with no [ERROR] entries. Powering off ${ACTIVE_HOST}."
  # The SSH session can be cut by the shutdown itself, so a non-zero status is
  # reported but not treated as fatal.
  shutdown_status=0
  run_ssh "$ACTIVE_HOST" "shutdown -h now" || shutdown_status=$?
  if (( shutdown_status != 0 )); then
    echo "INFO: Shutdown command returned non-zero (${shutdown_status}); this can occur if SSH disconnects during shutdown."
  fi

  echo "Waiting for ${ACTIVE_HOST} to go offline (timeout: ${WAIT_TIMEOUT_SECONDS}s)"
  if wait_for_host_offline "$ACTIVE_HOST"; then
    echo "Power-off confirmed: ${ACTIVE_HOST} is offline."
  else
    echo "WARNING: Could not confirm ${ACTIVE_HOST} offline within timeout." >&2
  fi
fi

echo "Success"
echo "Active host: ${ACTIVE_HOST}"
echo "Local log: ${LOCAL_LOG_FILE}"
echo "SHA256: ${LOCAL_HASH}"