diff --git a/atvm/docs/automation/guide.md b/atvm/docs/automation/guide.md index 3f2848d..978539b 100644 --- a/atvm/docs/automation/guide.md +++ b/atvm/docs/automation/guide.md @@ -47,6 +47,7 @@ Run ATVM CMC automation tests on the designated automation VM without unintended - it must post one final Mattermost status per completed categorized group/sub-run - it must stay active between grouped sub-runs while the parent categorized request is still running - it must not stop after the first grouped run simply because one grouped run completed + - if the child build id label does not match the actual host/spec being executed, report the grouped run using the inferred host-based group instead of the raw child build id label - it must not wait and replace those with one single parent-only post - After execution, report immediate success/failure only. - Do not actively monitor completion unless explicitly requested. diff --git a/atvm/docs/automation/mattermost-watcher-design.md b/atvm/docs/automation/mattermost-watcher-design.md index cceb4f4..54203e1 100644 --- a/atvm/docs/automation/mattermost-watcher-design.md +++ b/atvm/docs/automation/mattermost-watcher-design.md @@ -44,6 +44,7 @@ A categorized run must be treated differently: - then continue watching for the next grouped sub-run - the watcher must remain alive while the parent categorized request or related child Cypress process is still active - one completed grouped sub-run must not be treated as proof that the parent categorized request is finished +- if the child build id label does not match the actual host/spec being executed, the watcher must infer the real group from host execution and use that inferred group for reporting - the watcher must not wait until the very end to send one single parent-only post Evidence sources: diff --git a/atvm/watcher-service/README.md b/atvm/watcher-service/README.md index 90a4a25..41d81e1 100644 --- a/atvm/watcher-service/README.md +++ b/atvm/watcher-service/README.md @@ 
# Prefixes of the shortened host name mapped to the group label that appears
# in categorized child build ids. Only "w2k" maps to a different label
# ("windows"); every other prefix is its own label.
_HOST_PREFIX_GROUPS = (
    ("w2k", "windows"),
    ("amazonlinux", "amazonlinux"),
    ("centos", "centos"),
    ("ubuntu", "ubuntu"),
    ("rocky", "rocky"),
    ("redhat", "redhat"),
    ("oracle", "oracle"),
    ("fedora", "fedora"),
    ("debian", "debian"),
    ("suse", "suse"),
)

# Matches the "-<group>-batch" portion of a categorized child build id.
_BATCH_GROUP_LABEL_RE = re.compile(
    r"-(amazonlinux|centos|ubuntu|rocky|redhat|oracle|fedora|debian|suse|windows|w2k)-batch"
)


def infer_group_from_host(host: str) -> str:
    """Return the group label implied by a host name.

    Everything up to and including the first hyphen is dropped (a host with
    no hyphen is used whole), the remainder is lower-cased, and the result is
    matched against the known prefixes.

    Args:
        host: Host name as reported for the run.

    Returns:
        The matching group label; otherwise the shortened, lower-cased host
        name itself, or ``"group"`` when that is empty.
    """
    short = host.split("-", 1)[-1].lower()
    for prefix, group in _HOST_PREFIX_GROUPS:
        if short.startswith(prefix):
            return group
    return short or "group"


def corrected_categorized_display_name(raw_display_name: str, hosts: List[str]) -> str:
    """Rewrite the group label of a child build id to match the executed host.

    Args:
        raw_display_name: Child build id as reported, e.g. one containing
            ``-centos-batch``.
        hosts: Hosts observed for the sub-run; only the first one is used.

    Returns:
        ``raw_display_name`` unchanged when ``hosts`` is empty or no known
        ``-<group>-batch`` label is present; otherwise the name with every
        such label replaced by the group inferred from ``hosts[0]``.
    """
    if not hosts:
        return raw_display_name
    group = infer_group_from_host(hosts[0])
    replacement = f"-{group}-batch"
    # Use a callable so the host-derived text is inserted literally. A plain
    # replacement string would be treated as a template, where a backslash in
    # the fallback group name becomes an escape or a group reference.
    return _BATCH_GROUP_LABEL_RE.sub(lambda _match: replacement, raw_display_name)
corrected_categorized_display_name(current_subrun_build, [current_host] if current_host else []) + notes = ["Active categorized sub-run inferred from live `--ci-build-id` process state."] + if display_name != current_subrun_build: + notes.append(f"Child build id is `{current_subrun_build}`, but the actual grouped run was inferred from host execution as `{display_name}`.") + if cancelled: + notes.append("Cancellation marker detected.") subrun_states.append( { "key": sanitize_key(current_subrun_build), - "display_name": current_subrun_build, + "display_name": display_name, "state": "CANCELLED" if cancelled else "RUNNING", "host_results": host_results, "start_ts": started_at, "end_ts": None, "currents_url": None, - "notes": ["Active categorized sub-run inferred from live `--ci-build-id` process state."] + (["Cancellation marker detected."] if cancelled else []), + "notes": notes, } )