Prevent ATVM watcher and runner log race

This commit is contained in:
2026-04-14 11:13:54 -04:00
parent 7cdcbf8cf1
commit 1c7ed11809
5 changed files with 172 additions and 0 deletions

View File

@@ -47,6 +47,7 @@ From the local workspace:
- `/home/aw/code/cds/atvm/watcher-service/atvm-runner@.service`
- `/home/aw/code/cds/atvm/watcher-service/start-atvm-runner.sh`
- `/home/aw/code/cds/atvm/watcher-service/cancel-atvm-runner.sh`
- `/home/aw/code/cds/atvm/watcher-service/start-atvm-run.sh`
- `/home/aw/code/cds/atvm/watcher-service/atvm-run-watcher@.service`
- `/home/aw/code/cds/atvm/watcher-service/start-atvm-run-watcher.sh`
- `/home/aw/code/cds/atvm/watcher-service/cancel-atvm-run-watcher.sh`
@@ -103,6 +104,7 @@ Recommended permissions:
- `run-atvm-runner.sh`
- `start-atvm-runner.sh`
- `cancel-atvm-runner.sh`
- `start-atvm-run.sh`
- `atvm_run_watcher.py`
- `start-atvm-run-watcher.sh`
- `cancel-atvm-run-watcher.sh`
@@ -149,6 +151,7 @@ mkdir -p /opt/atvm-watcher-service /var/lib/atvm-run-watcher
chmod 755 /opt/atvm-watcher-service/run-atvm-runner.sh
chmod 755 /opt/atvm-watcher-service/start-atvm-runner.sh
chmod 755 /opt/atvm-watcher-service/cancel-atvm-runner.sh
chmod 755 /opt/atvm-watcher-service/start-atvm-run.sh
chmod 755 /opt/atvm-watcher-service/atvm_run_watcher.py
chmod 755 /opt/atvm-watcher-service/start-atvm-run-watcher.sh
chmod 755 /opt/atvm-watcher-service/cancel-atvm-run-watcher.sh
@@ -164,6 +167,10 @@ systemctl cat atvm-run-watcher@.service
python3 /opt/atvm-watcher-service/atvm_run_watcher.py --help
```
```bash
/opt/atvm-watcher-service/start-atvm-run.sh --help
```
```bash
/opt/atvm-watcher-service/start-atvm-runner.sh --help
```
@@ -201,6 +208,21 @@ Example:
--runner-command "python3 ./run-sorry-cypress.py --config_file cypress.atvm-config-gold.ts --build_name e2e-redhat9.6-ubuntu24.04-w2k25-fc --categorize"
```
Preferred combined start:
```bash
/opt/atvm-watcher-service/start-atvm-run.sh \
--build-name e2e-redhat9.6-ubuntu24.04-w2k25-fc \
--template cmc-e2e \
--template-command "python3 ./cmc-templates.py --template_name cmc-e2e --config_file cypress.atvm-config-gold.ts" \
--runner-command "python3 ./run-sorry-cypress.py --config_file cypress.atvm-config-gold.ts --build_name e2e-redhat9.6-ubuntu24.04-w2k25-fc --categorize" \
--config-family gold \
--config-file cypress.atvm-config-gold.ts \
--migration-style "ATVM end-to-end migration validation" \
--integration-plugin "pure with fc" \
--categorize
```
Cancel example:
```bash
@@ -226,6 +248,7 @@ The cancel helper should:
- One runner instance is started per ATVM run.
- One watcher instance is started per ATVM run.
- Prefer the `atvm-runner@...` service over detached SSH background launch patterns for `run-sorry-cypress.py`.
- Prefer `start-atvm-run.sh` over launching watcher and runner separately when both are needed, because it enforces the safe watcher-first order.
- Categorized execution is treated as one watcher instance tracking sequential grouped ATVM sub-runs.
- In categorized execution, the watcher must remain alive until the parent request has actually gone inactive past the grace window, even if one grouped sub-run already completed.
- The watcher exits after the run reaches a terminal state.

View File

@@ -31,6 +31,8 @@ The watcher does not run indefinitely. It is designed for one run per service in
- helper to write per-run runner environment data and start a runner instance
- `cancel-atvm-runner.sh`
- helper to stop a runner instance
- `start-atvm-run.sh`
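- wrapper that starts the watcher first, waits up to about 15 seconds for it to become active, and only then starts the runner (it exits non-zero without starting the runner if the watcher never becomes active)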
- `start-atvm-run-watcher.sh`
- helper to write per-run environment data and start a watcher instance
- `cancel-atvm-run-watcher.sh`
@@ -127,6 +129,21 @@ These helpers write per-run environment files and start the matching instances:
--runner-command "python3 ./run-sorry-cypress.py --config_file cypress.atvm-config-gold.ts --build_name e2e-redhat9.6-ubuntu24.04-w2k25-fc --categorize"
```
Preferred one-shot wrapper:
```bash
./start-atvm-run.sh \
--build-name e2e-redhat9.6-ubuntu24.04-w2k25-fc \
--template cmc-e2e \
--template-command "python3 ./cmc-templates.py --template_name cmc-e2e --config_file cypress.atvm-config-gold.ts" \
--runner-command "python3 ./run-sorry-cypress.py --config_file cypress.atvm-config-gold.ts --build_name e2e-redhat9.6-ubuntu24.04-w2k25-fc --categorize" \
--config-family gold \
--config-file cypress.atvm-config-gold.ts \
--migration-style "ATVM end-to-end migration validation" \
--integration-plugin "pure with fc" \
--categorize
```
That results in:
- state dir:
@@ -159,6 +176,7 @@ Runner cancel example:
- The watcher uses the same ATVM status layout documented in `atvm/docs/automation/status-template.md`.
- Prefer the controller-local `atvm-runner@...` service over ad hoc `nohup` or detached SSH launch patterns for `run-sorry-cypress.py`.
- Prefer `start-atvm-run.sh` when launching both services together because it prevents the watcher/runner log-path race by enforcing watcher-first ordering.
- Kernel values are resolved from `atvm/inventory/vm-inventory.md`.
- Categorized execution is treated as sequential grouped ATVM sub-runs, not as one parent run with internal phases.
- In categorized mode, the watcher writes per-subrun state under `subruns/` and posts each completed grouped run separately.

View File

@@ -0,0 +1,129 @@
#!/usr/bin/env bash
set -euo pipefail
# Print the command-line help text for start-atvm-run.sh to stdout.
# Callers redirect to stderr themselves when reporting an error.
usage() {
  printf '%s\n' \
    'Usage:' \
    'start-atvm-run.sh --build-name <name> --runner-command <text> [options]' \
    'Options:' \
    '--build-name <name>' \
    '--template <name>' \
    '--template-command <text>' \
    '--runner-command <text>' \
    '--config-family <name>' \
    '--config-file <path>' \
    '--migration-style <text>' \
    '--integration-plugin <text>' \
    '--extra-option <text> Repeatable' \
    '--scope-description <text>' \
    '--categorize' \
    '--workdir <path> Default: /root/cdc-e2e-cyp-12.17.4' \
    '--log-path <path> Default: /tmp/<build-name>.log' \
    '--state-root <path> Default: /var/lib/atvm-run-watcher'
}
# Option defaults. Empty strings mean "not supplied"; --build-name and
# --runner-command are validated after parsing, and the log path default
# is derived from the build name once it is known.
BUILD_NAME=""
TEMPLATE=""
TEMPLATE_COMMAND=""
RUNNER_COMMAND=""
CONFIG_FAMILY=""
CONFIG_FILE=""
MIGRATION_STYLE=""
INTEGRATION_PLUGIN=""
EXTRA_OPTIONS=()
SCOPE_DESCRIPTION=""
WATCHER_CATEGORIZED="false"
RUNNER_WORKDIR="/root/cdc-e2e-cyp-12.17.4"
RUNNER_LOG=""
STATE_ROOT="/var/lib/atvm-run-watcher"

# Abort with a clear message when a value-taking option is the last argument.
# Without this guard `shift 2` fails and `set -e` ends the script with no
# diagnostic at all.
# Arguments: $1 - option name, $2 - number of arguments left (option included)
require_value() {
  if (( $2 < 2 )); then
    echo "Missing value for $1" >&2
    usage >&2
    exit 1
  fi
}

# Consume the command line one option at a time.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --build-name) require_value "$1" "$#"; BUILD_NAME="$2"; shift 2 ;;
    --template) require_value "$1" "$#"; TEMPLATE="$2"; shift 2 ;;
    --template-command) require_value "$1" "$#"; TEMPLATE_COMMAND="$2"; shift 2 ;;
    --runner-command) require_value "$1" "$#"; RUNNER_COMMAND="$2"; shift 2 ;;
    --config-family) require_value "$1" "$#"; CONFIG_FAMILY="$2"; shift 2 ;;
    --config-file) require_value "$1" "$#"; CONFIG_FILE="$2"; shift 2 ;;
    --migration-style) require_value "$1" "$#"; MIGRATION_STYLE="$2"; shift 2 ;;
    --integration-plugin) require_value "$1" "$#"; INTEGRATION_PLUGIN="$2"; shift 2 ;;
    --extra-option) require_value "$1" "$#"; EXTRA_OPTIONS+=("$2"); shift 2 ;;
    --scope-description) require_value "$1" "$#"; SCOPE_DESCRIPTION="$2"; shift 2 ;;
    --categorize) WATCHER_CATEGORIZED="true"; shift ;;
    --workdir) require_value "$1" "$#"; RUNNER_WORKDIR="$2"; shift 2 ;;
    --log-path) require_value "$1" "$#"; RUNNER_LOG="$2"; shift 2 ;;
    --state-root) require_value "$1" "$#"; STATE_ROOT="$2"; shift 2 ;;
    -h|--help) usage; exit 0 ;;
    *) echo "Unknown argument: $1" >&2; usage >&2; exit 1 ;;
  esac
done
# Both the build name and the runner command are mandatory; report whichever
# is missing first, show the usage text on stderr, and stop.
[[ -n "$BUILD_NAME" ]] || {
  echo "--build-name is required" >&2
  usage >&2
  exit 1
}
[[ -n "$RUNNER_COMMAND" ]] || {
  echo "--runner-command is required" >&2
  usage >&2
  exit 1
}

# When no --log-path was given, fall back to a log file named after the build.
RUNNER_LOG="${RUNNER_LOG:-/tmp/${BUILD_NAME}.log}"
# Directory containing this script; the watcher and runner helpers are
# resolved relative to it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Argument vector for the watcher helper. It is built as an array so values
# containing spaces (commands, descriptions) stay intact as single arguments.
watcher_cmd=(
  "${SCRIPT_DIR}/start-atvm-run-watcher.sh"
  --build-name "${BUILD_NAME}"
  --template "${TEMPLATE}"
  --template-command "${TEMPLATE_COMMAND}"
  --runner-command "${RUNNER_COMMAND}"
  --config-family "${CONFIG_FAMILY}"
  --config-file "${CONFIG_FILE}"
  --migration-style "${MIGRATION_STYLE}"
  --integration-plugin "${INTEGRATION_PLUGIN}"
  --state-root "${STATE_ROOT}"
)

# Forward every repeatable --extra-option. The ${arr[@]+...} form expands to
# nothing for an empty array; a plain "${EXTRA_OPTIONS[@]}" trips `set -u`
# ("unbound variable") on Bash releases older than 4.4.
for option in ${EXTRA_OPTIONS[@]+"${EXTRA_OPTIONS[@]}"}; do
  watcher_cmd+=(--extra-option "${option}")
done

# Optional watcher arguments are appended only when they were supplied.
if [[ -n "${SCOPE_DESCRIPTION}" ]]; then
  watcher_cmd+=(--scope-description "${SCOPE_DESCRIPTION}")
fi
if [[ "${WATCHER_CATEGORIZED}" == "true" ]]; then
  watcher_cmd+=(--categorize)
fi
# Argument vector for the runner helper, assembled one option pair at a time.
runner_cmd=("${SCRIPT_DIR}/start-atvm-runner.sh")
runner_cmd+=(--build-name "${BUILD_NAME}")
runner_cmd+=(--runner-command "${RUNNER_COMMAND}")
runner_cmd+=(--workdir "${RUNNER_WORKDIR}")
runner_cmd+=(--log-path "${RUNNER_LOG}")
runner_cmd+=(--state-root "${STATE_ROOT}")

# Start the watcher first so that it is already running when the runner
# is launched.
"${watcher_cmd[@]}"

# Probe the watcher unit up to 15 times, one second apart, stopping early
# as soon as systemd reports it active.
watcher_unit="atvm-run-watcher@${BUILD_NAME}.service"
attempt=0
while (( attempt < 15 )); do
  systemctl is-active --quiet "${watcher_unit}" && break
  sleep 1
  attempt=$(( attempt + 1 ))
done

# Final check: refuse to start the runner when the watcher is not active.
systemctl is-active --quiet "${watcher_unit}" || {
  echo "Watcher service did not become active for ${BUILD_NAME}" >&2
  exit 1
}

# The watcher is active; now launch the runner.
"${runner_cmd[@]}"