atvm: fail runs explicitly on hang-kill and runner non-zero exit

This commit is contained in:
2026-05-07 13:34:37 -04:00
parent e3497111dd
commit 65330ee9f8
3 changed files with 65 additions and 0 deletions

View File

@@ -63,6 +63,8 @@ Run ATVM CMC automation tests on the designated automation VM without unintended
- If monitoring is requested, allow long runtime windows (15-30+ minutes) and continue until completion unless operator instructs otherwise.
- Report command errors immediately.
- `sshpass` may be used where password-based SSH automation is required.
- Treat runner hang-kill events (`Sending SIGKILL ... due to no change` / `Max hang retries reached`) as explicit `FAILED` outcomes, not `RUNNING` or ambiguous termination.
- For manual `run-sorry-cypress.py` execution, treat `ATVM_HANG_FAIL ...` log markers and `/tmp/atvm-runner-state-<build>.json` terminal state files as the source of truth for hang-failure terminal status.
## Core Scripts
- Template prep: `/root/cdc-e2e-cyp-12.17.4/cmc-templates.py`

View File

@@ -33,6 +33,20 @@ This file stores run-specific examples only when a run produced a new learning r
- For Windows-involved ATVM automation runs, add `--hang_retries 0` to `run-sorry-cypress.py` by default unless the operator explicitly requests a different value.
- Keep this as an operator-default behavior even though the underlying runner option is generic and not Windows-only in code.
## Run Learning: 2026-05-07 (Treat hang-kill as explicit failure)
- Observed failure mode:
- A run can stall long enough for `run-sorry-cypress.py` to force-kill Cypress (`Sending SIGKILL ... due to no change`) and still be reported as an ambiguous terminated state.
- Action for future runs:
- When run logs contain hang-kill markers (`Sending SIGKILL ... due to no change` and `Max hang retries reached.`), classify the run as `FAILED`.
- When the runner service exits non-zero, classify the run as `FAILED` instead of generic terminated.
## Run Learning: 2026-05-07 (Manual runner emits explicit hang-fail markers and terminal state)
- Observed failure mode:
- Manual `run-sorry-cypress.py` execution can appear "still running" after hang-kill handling because failure state was not emitted in a machine-readable terminal marker.
- Action for future runs:
- `run-sorry-cypress.py` now emits `ATVM_HANG_FAIL ...` on hang-kill paths and writes terminal state JSON under `/tmp/atvm-runner-state-<build>.json`.
- Max hang-retry exhaustion now writes terminal failure state before exiting non-zero, in both categorized and non-categorized flows.
## Run Learning: 2026-05-02 (Do not reuse the previous controller status check for a new ATVM request)
- Observed failure mode:
- A later ATVM run request was blocked because the assistant reused the immediately previous controller status result instead of performing a fresh live running-state check at request time.

View File

@@ -196,6 +196,41 @@ def run_ps() -> str:
return proc.stdout return proc.stdout
def read_runner_service_status(build_name: str) -> Tuple[Optional[str], Optional[int], Optional[str]]:
    """Ask systemd for the status of the runner unit belonging to *build_name*.

    Runs ``systemctl show`` on ``atvm-runner@<build_name>.service`` and
    returns ``(result, exec_main_status, active_state)`` taken from the
    ``Result``, ``ExecMainStatus`` and ``ActiveState`` properties.

    This is a best-effort probe: if ``systemctl`` is missing, exits non-zero,
    times out (5 seconds), or its output cannot be decoded, every element is
    ``None``. An individual element is also ``None`` when its property is
    absent, empty, or (for ``ExecMainStatus``) not an integer.
    """
    unit = f"atvm-runner@{build_name}.service"
    try:
        proc = subprocess.run(
            # No ``--value``: keep the ``Key=Value`` form so each field is
            # matched by name. systemctl does not promise to print properties
            # in the order they were requested, so positional parsing could
            # silently swap fields.
            ["systemctl", "show", unit, "--property=Result,ExecMainStatus,ActiveState"],
            capture_output=True,
            text=True,
            check=True,
            timeout=5,
        )
    except (OSError, ValueError, subprocess.SubprocessError):
        # OSError: systemctl not installed / not executable.
        # SubprocessError: non-zero exit (check=True) or timeout.
        # ValueError: undecodable output (UnicodeDecodeError).
        return None, None, None

    properties = {}
    for line in proc.stdout.splitlines():
        key, sep, value = line.partition("=")
        if sep:
            properties[key.strip()] = value.strip()

    result = properties.get("Result") or None
    active_state = properties.get("ActiveState") or None

    exec_main_status: Optional[int] = None
    raw_status = properties.get("ExecMainStatus")
    if raw_status:
        try:
            exec_main_status = int(raw_status)
        except ValueError:
            exec_main_status = None

    return result, exec_main_status, active_state
def detect_hang_kill(log_text: str) -> bool:
    """Return True when *log_text* contains either runner hang-kill marker.

    The markers are the literal SIGKILL notice for the cy2 command process
    group and the "Max hang retries reached." line; one match is enough.
    """
    for needle in (
        "Sending SIGKILL to cy2 command process group due to no change",
        "Max hang retries reached.",
    ):
        if needle in log_text:
            return True
    return False
def normalize_logged_command(raw: str, command_name: str) -> Optional[str]: def normalize_logged_command(raw: str, command_name: str) -> Optional[str]:
patterns = { patterns = {
"cmc-templates.py": r"((?:python3?\s+)?(?:\./)?cmc-templates\.py\b.*)", "cmc-templates.py": r"((?:python3?\s+)?(?:\./)?cmc-templates\.py\b.*)",
@@ -2081,6 +2116,8 @@ def determine_state(
start_ts = min(parent_start_candidates) if parent_start_candidates else started_at start_ts = min(parent_start_candidates) if parent_start_candidates else started_at
end_ts = max(parent_end_candidates) if parent_end_candidates else find_check_xml_end(reporter_root, started_at) end_ts = max(parent_end_candidates) if parent_end_candidates else find_check_xml_end(reporter_root, started_at)
currents_url = extract_currents_url(log_text) or latest_currents_url(build_dir) currents_url = extract_currents_url(log_text) or latest_currents_url(build_dir)
hang_kill_detected = detect_hang_kill(log_text)
runner_result, runner_exit_code, runner_active_state = read_runner_service_status(build_name)
if cancelled: if cancelled:
notes.append("Cancellation marker detected.") notes.append("Cancellation marker detected.")
@@ -2093,6 +2130,11 @@ def determine_state(
return "HUNG", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes return "HUNG", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes
return "RUNNING", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes return "RUNNING", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes
if hang_kill_detected:
notes.append("Runner hang monitor terminated the run (`SIGKILL`) after inactivity.")
notes.append("Treating this as explicit failure.")
return "FAILED", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes
if metadata.get("categorized") and process_gone_since and (now_utc() - process_gone_since).total_seconds() < process_exit_grace_seconds: if metadata.get("categorized") and process_gone_since and (now_utc() - process_gone_since).total_seconds() < process_exit_grace_seconds:
notes.append("Categorized parent runner has not been gone long enough to treat the request as finished.") notes.append("Categorized parent runner has not been gone long enough to treat the request as finished.")
return "RUNNING", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes return "RUNNING", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes
@@ -2103,6 +2145,13 @@ def determine_state(
return state, subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes return state, subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes
if process_gone_since and (now_utc() - process_gone_since).total_seconds() >= process_exit_grace_seconds: if process_gone_since and (now_utc() - process_gone_since).total_seconds() >= process_exit_grace_seconds:
if runner_result == "exit-code" or (runner_exit_code is not None and runner_exit_code != 0):
detail = f"Runner service ended with non-zero exit ({runner_exit_code})"
if runner_active_state:
detail += f"; state={runner_active_state}"
notes.append(detail + ".")
notes.append("Treating this as explicit failure.")
return "FAILED", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes
notes.append("Run process exited without a clean completion signal.") notes.append("Run process exited without a clean completion signal.")
return "TERMINATED", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes return "TERMINATED", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes