atvm: fail runs explicitly on hang-kill and runner non-zero exit
This commit is contained in:
@@ -63,6 +63,8 @@ Run ATVM CMC automation tests on the designated automation VM without unintended
|
||||
- If monitoring is requested, allow long runtime windows (15-30+ minutes) and continue until completion unless operator instructs otherwise.
|
||||
- Report command errors immediately.
|
||||
- `sshpass` may be used where password-based SSH automation is required.
|
||||
- Treat runner hang-kill events (`Sending SIGKILL ... due to no change` / `Max hang retries reached`) as explicit `FAILED` outcomes, not `RUNNING` or ambiguous termination.
|
||||
- For manual `run-sorry-cypress.py` execution, treat `ATVM_HANG_FAIL ...` log markers and `/tmp/atvm-runner-state-<build>.json` terminal state files as the source of truth for hang-failure terminal status.
|
||||
|
||||
## Core Scripts
|
||||
- Template prep: `/root/cdc-e2e-cyp-12.17.4/cmc-templates.py`
|
||||
|
||||
@@ -33,6 +33,20 @@ This file stores run-specific examples only when a run produced a new learning r
|
||||
- For Windows-involved ATVM automation runs, add `--hang_retries 0` to `run-sorry-cypress.py` by default unless the operator explicitly requests a different value.
|
||||
- Keep this as an operator-default behavior even though the underlying runner option is generic and not Windows-only in code.
|
||||
|
||||
## Run Learning: 2026-05-07 (Treat hang-kill as explicit failure)
|
||||
- Observed failure mode:
|
||||
- A run can stall long enough for `run-sorry-cypress.py` to force-kill Cypress (`Sending SIGKILL ... due to no change`) and still be reported as an ambiguous terminated state.
|
||||
- Action for future runs:
|
||||
- When run logs contain hang-kill markers (`Sending SIGKILL ... due to no change` and `Max hang retries reached.`), classify the run as `FAILED`.
|
||||
- When the runner service exits non-zero, classify the run as `FAILED` instead of generic terminated.
|
||||
|
||||
## Run Learning: 2026-05-07 (Manual runner emits explicit hang-fail markers and terminal state)
|
||||
- Observed failure mode:
|
||||
- Manual `run-sorry-cypress.py` execution can appear "still running" after hang-kill handling because failure state was not emitted in a machine-readable terminal marker.
|
||||
- Action for future runs:
|
||||
- `run-sorry-cypress.py` now emits `ATVM_HANG_FAIL ...` on hang-kill paths and writes terminal state JSON under `/tmp/atvm-runner-state-<build>.json`.
|
||||
- Max hang-retry exhaustion now writes terminal failure state before exiting non-zero, including categorized and non-categorized flows.
|
||||
|
||||
## Run Learning: 2026-05-02 (Do not reuse the previous controller status check for a new ATVM request)
|
||||
- Observed failure mode:
|
||||
- A later ATVM run request was blocked because the assistant reused the immediately previous controller status result instead of performing a fresh live running-state check at request time.
|
||||
|
||||
@@ -196,6 +196,41 @@ def run_ps() -> str:
|
||||
return proc.stdout
|
||||
|
||||
|
||||
def read_runner_service_status(build_name: str) -> Tuple[Optional[str], Optional[int], Optional[str]]:
|
||||
unit = f"atvm-runner@{build_name}.service"
|
||||
try:
|
||||
proc = subprocess.run(
|
||||
["systemctl", "show", unit, "--property=Result,ExecMainStatus,ActiveState", "--value"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
check=True,
|
||||
timeout=5,
|
||||
)
|
||||
except Exception:
|
||||
return None, None, None
|
||||
|
||||
lines = [line.strip() for line in proc.stdout.splitlines()]
|
||||
if len(lines) < 3:
|
||||
return None, None, None
|
||||
result = lines[0] or None
|
||||
exec_main_status: Optional[int] = None
|
||||
if lines[1]:
|
||||
try:
|
||||
exec_main_status = int(lines[1])
|
||||
except ValueError:
|
||||
exec_main_status = None
|
||||
active_state = lines[2] or None
|
||||
return result, exec_main_status, active_state
|
||||
|
||||
|
||||
def detect_hang_kill(log_text: str) -> bool:
|
||||
markers = (
|
||||
"Sending SIGKILL to cy2 command process group due to no change",
|
||||
"Max hang retries reached.",
|
||||
)
|
||||
return any(marker in log_text for marker in markers)
|
||||
|
||||
|
||||
def normalize_logged_command(raw: str, command_name: str) -> Optional[str]:
|
||||
patterns = {
|
||||
"cmc-templates.py": r"((?:python3?\s+)?(?:\./)?cmc-templates\.py\b.*)",
|
||||
@@ -2081,6 +2116,8 @@ def determine_state(
|
||||
start_ts = min(parent_start_candidates) if parent_start_candidates else started_at
|
||||
end_ts = max(parent_end_candidates) if parent_end_candidates else find_check_xml_end(reporter_root, started_at)
|
||||
currents_url = extract_currents_url(log_text) or latest_currents_url(build_dir)
|
||||
hang_kill_detected = detect_hang_kill(log_text)
|
||||
runner_result, runner_exit_code, runner_active_state = read_runner_service_status(build_name)
|
||||
|
||||
if cancelled:
|
||||
notes.append("Cancellation marker detected.")
|
||||
@@ -2093,6 +2130,11 @@ def determine_state(
|
||||
return "HUNG", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes
|
||||
return "RUNNING", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes
|
||||
|
||||
if hang_kill_detected:
|
||||
notes.append("Runner hang monitor terminated the run (`SIGKILL`) after inactivity.")
|
||||
notes.append("Treating this as explicit failure.")
|
||||
return "FAILED", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes
|
||||
|
||||
if metadata.get("categorized") and process_gone_since and (now_utc() - process_gone_since).total_seconds() < process_exit_grace_seconds:
|
||||
notes.append("Categorized parent runner has not been gone long enough to treat the request as finished.")
|
||||
return "RUNNING", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes
|
||||
@@ -2103,6 +2145,13 @@ def determine_state(
|
||||
return state, subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes
|
||||
|
||||
if process_gone_since and (now_utc() - process_gone_since).total_seconds() >= process_exit_grace_seconds:
|
||||
if runner_result == "exit-code" or (runner_exit_code is not None and runner_exit_code != 0):
|
||||
detail = f"Runner service ended with non-zero exit ({runner_exit_code})"
|
||||
if runner_active_state:
|
||||
detail += f"; state={runner_active_state}"
|
||||
notes.append(detail + ".")
|
||||
notes.append("Treating this as explicit failure.")
|
||||
return "FAILED", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes
|
||||
notes.append("Run process exited without a clean completion signal.")
|
||||
return "TERMINATED", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes
|
||||
|
||||
|
||||
Reference in New Issue
Block a user