Fix categorized watcher subrun host recovery

This commit is contained in:
2026-03-27 15:59:04 -04:00
parent 650adb085e
commit 20c9ba7178
2 changed files with 74 additions and 2 deletions

View File

@@ -275,6 +275,14 @@ This file stores run-specific examples only when a run produced a new learning r
- Reserve `NOTES:` for meaningful operator-facing content such as the Currents run URL, real anomalies, failure context, and important fallback behavior. - Reserve `NOTES:` for meaningful operator-facing content such as the Currents run URL, real anomalies, failure context, and important fallback behavior.
- Do not include generic artifact-detection confirmations in the posted `NOTES:` section. - Do not include generic artifact-detection confirmations in the posted `NOTES:` section.
## Run Learning: 2026-03-27 (Categorized grouped XML may need host recovery from the subrun's per-host artifact)
- Observed failure mode:
- A categorized subrun can finish and write its grouped `test-result-<build>.xml`, but that XML may only contain `check-xml-files.ts`.
- In that case the watcher may know the grouped batch completed and even know its Currents URL, but still miss the host result unless it recovers the host from the matching per-host reporter artifact.
- Action for future runs:
- For categorized runs, when grouped XML only shows `check-xml-files.ts`, infer the subrun host from the categorized build id and recover the result from the latest matching per-host reporter artifact within the grouped completion window.
- Do not keep a completed grouped subrun in `RUNNING` just because the grouped XML lacked a host testcase entry.
## Run Learning: 2026-03-27 (Default ATVM approval should include the watcher) ## Run Learning: 2026-03-27 (Default ATVM approval should include the watcher)
- Observed requirement: - Observed requirement:
- The operator wants `approve` to mean run with watcher by default. - The operator wants `approve` to mean run with watcher by default.

View File

@@ -514,6 +514,56 @@ def collect_latest_host_reporter_artifact(
return latest return latest
def collect_latest_group_host_reporter_artifact(
reporter_root: Path,
group_label: Optional[str],
kernels: Dict[str, str],
run_started_at: datetime,
run_ended_at: Optional[datetime] = None,
) -> Optional[Tuple[str, HostResult]]:
if not group_label:
return None
logs_dir = reporter_root / "logs"
if not logs_dir.exists():
return None
latest: Optional[Tuple[str, HostResult]] = None
for host_dir in sorted(logs_dir.iterdir()):
if not host_dir.is_dir():
continue
host = host_dir.name
if not host.startswith("atvm"):
continue
if infer_group_from_host(host) != group_label:
continue
for artifact_path in sorted(host_dir.iterdir()):
if artifact_path.suffix not in {".txt", ".json"}:
continue
artifact_mtime = datetime.fromtimestamp(artifact_path.stat().st_mtime, tz=timezone.utc)
if artifact_mtime < run_started_at:
continue
if run_ended_at and artifact_mtime >= run_ended_at:
continue
result = HostResult(
host=host,
kernel=kernels.get(host, "unknown"),
status="PASS",
detail="completed",
timestamp=artifact_mtime,
)
candidate = (host, result)
if latest is None:
latest = candidate
continue
latest_ts = latest[1].timestamp or datetime.fromtimestamp(0, tz=timezone.utc)
if artifact_mtime >= latest_ts:
latest = candidate
return latest
def find_check_xml_end( def find_check_xml_end(
reporter_root: Path, reporter_root: Path,
started_at: datetime, started_at: datetime,
@@ -1003,17 +1053,29 @@ def discover_categorized_subruns(
completed_hosts.append(host) completed_hosts.append(host)
check_ts = extract_check_xml_timestamp_from_file(xml_path) check_ts = extract_check_xml_timestamp_from_file(xml_path)
summary = completed_summaries[current_summary_index] if current_summary_index < len(completed_summaries) else None summary = completed_summaries[current_summary_index] if current_summary_index < len(completed_summaries) else None
inferred_host = infer_host_from_subrun_build(raw_display_name, expected_hosts, completed_hosts)
if summary and (not host_results or all(result.host == "check-xml-files" for result in host_results.values())): if summary and (not host_results or all(result.host == "check-xml-files" for result in host_results.values())):
host_results = summary["host_results"] host_results = summary["host_results"]
completed_hosts.extend([host for host in host_results if host not in completed_hosts]) completed_hosts.extend([host for host in host_results if host not in completed_hosts])
if not host_results and check_ts: if not host_results and check_ts:
latest_host = collect_latest_host_result( scoped_expected_hosts = [inferred_host] if inferred_host else expected_hosts
latest_host = collect_latest_host_reporter_artifact(
reporter_root=reporter_root, reporter_root=reporter_root,
expected_hosts=expected_hosts, expected_hosts=scoped_expected_hosts,
kernels=inventory, kernels=inventory,
run_started_at=started_at, run_started_at=started_at,
run_ended_at=check_ts + timedelta(seconds=5), run_ended_at=check_ts + timedelta(seconds=5),
) )
if not latest_host:
display_group_match = re.search(r"-(amazonlinux|centos|ubuntu|rocky|redhat|oracle|fedora|debian|suse|windows)-batch", raw_display_name)
display_group = display_group_match.group(1) if display_group_match else None
latest_host = collect_latest_group_host_reporter_artifact(
reporter_root=reporter_root,
group_label=display_group,
kernels=inventory,
run_started_at=started_at,
run_ended_at=check_ts + timedelta(seconds=5),
)
if latest_host: if latest_host:
host, result = latest_host host, result = latest_host
host_results = {host: result} host_results = {host: result}
@@ -1042,6 +1104,8 @@ def discover_categorized_subruns(
notes.append("Host result details were derived from the parent categorized run log summary.") notes.append("Host result details were derived from the parent categorized run log summary.")
elif host_results and check_ts: elif host_results and check_ts:
notes.append("Host result details were derived from the latest matching host reporter artifact written before grouped finalization.") notes.append("Host result details were derived from the latest matching host reporter artifact written before grouped finalization.")
if inferred_host:
notes.append(f"Grouped sub-run host scope was inferred as `{inferred_host}` from the categorized build id.")
elif check_ts and not host_results and parent_active: elif check_ts and not host_results and parent_active:
notes.append("Grouped reporter XML arrived before the parent run log exposed the final host summary; waiting to post until host details are available.") notes.append("Grouped reporter XML arrived before the parent run log exposed the final host summary; waiting to post until host details are available.")
if display_name != raw_display_name: if display_name != raw_display_name: