From b71418aa97c601c4049439b978e8de38eb41c34f Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson
Date: Fri, 13 Mar 2026 20:26:10 -0400
Subject: [PATCH 1/2] ci: clean stale .out files after checkout to prevent misleading logs

When a multi-step CI job (like case-optimization) fails at an early
step, the 'Print Logs' step (if: always) would cat output files from a
previous successful run, making it appear the current run succeeded.

Delete stale .out files after checkout so logs only show output from
the current workflow run.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 .github/workflows/test.yml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index a52a5967d1..53066b1fba 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -234,6 +234,9 @@ jobs:
           # submit-slurm-job.sh can detect and cancel stale SLURM jobs on retry.
           clean: false
 
+      - name: Clean stale output files
+        run: rm -f *.out
+
       - name: Build (login node)
         if: matrix.cluster != 'phoenix'
         timeout-minutes: 60
@@ -317,6 +320,9 @@ jobs:
         with:
           clean: false
 
+      - name: Clean stale output files
+        run: rm -f *.out
+
       - name: Pre-Build (SLURM)
         if: matrix.cluster == 'phoenix'
         run: bash .github/scripts/submit-slurm-job.sh .github/scripts/prebuild-case-optimization.sh cpu ${{ matrix.interface }} ${{ matrix.cluster }}

From 3d37ae29f67981188e04957897084c62f6bc5ee8 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson
Date: Fri, 13 Mar 2026 21:13:40 -0400
Subject: [PATCH 2/2] ci: run benchmark monitors sequentially to avoid login node OOM kills

Phoenix login nodes have a 4 GB per-user cgroup memory limit shared
across all runner processes. Running two benchmark monitors in parallel
(each with tail -f, bash loops, and pipe subshells) on top of 7
concurrent runner processes exceeds this limit, triggering OOM kills.

Submit both PR and master SLURM jobs up front so they run concurrently
on compute nodes (preserving benchmark fairness), but monitor them one
at a time on the login node. Also add SUBMIT_ONLY mode to
submit-slurm-job.sh to support decoupling submission from monitoring.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 .github/scripts/run_parallel_benchmarks.sh | 128 ++++++++++++---------
 .github/scripts/submit-slurm-job.sh        |   8 +-
 2 files changed, 80 insertions(+), 56 deletions(-)

diff --git a/.github/scripts/run_parallel_benchmarks.sh b/.github/scripts/run_parallel_benchmarks.sh
index b6a6034c3c..8fcf5f56a1 100755
--- a/.github/scripts/run_parallel_benchmarks.sh
+++ b/.github/scripts/run_parallel_benchmarks.sh
@@ -1,5 +1,9 @@
 #!/bin/bash
-# Run PR and master benchmarks in parallel and verify outputs
+# Run PR and master benchmarks and verify outputs.
+# Both SLURM jobs are submitted up front so they run concurrently on
+# compute nodes (fair comparison under the same cluster load), but
+# monitoring happens sequentially to stay within the per-user cgroup
+# memory limit on login nodes (4 GB on Phoenix shared by 7 runners).
 # Usage: run_parallel_benchmarks.sh
 set -euo pipefail
 
@@ -17,88 +21,104 @@ cluster="$3"
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 
 echo "=========================================="
-echo "Starting parallel benchmark jobs..."
+echo "Starting benchmark jobs..."
 echo "=========================================="
 
-# For Phoenix GPU benchmarks, select a consistent GPU partition before launching
-# both parallel jobs so PR and master always land on the same GPU type.
+# For Phoenix GPU benchmarks, select a consistent GPU partition so PR and
+# master always land on the same GPU type.
 if [ "$device" = "gpu" ] && [ "$cluster" = "phoenix" ]; then
     echo "Selecting Phoenix GPU partition for benchmark consistency..."
-    # Require 2 nodes so both PR and master jobs can run concurrently.
+  # Require 2 nodes so both jobs can run concurrently on compute.
     GPU_PARTITION_MIN_NODES=2
     source "${SCRIPT_DIR}/select-gpu-partition.sh"
    BENCH_GPU_PARTITION="$SELECTED_GPU_PARTITION"
     export BENCH_GPU_PARTITION
 fi
 
-# Run both jobs with monitoring using dedicated script from PR
-# Use stdbuf for line-buffered output and prefix each line for clarity
-(set -o pipefail; stdbuf -oL -eL bash "${SCRIPT_DIR}/submit_and_monitor_bench.sh" pr "$device" "$interface" "$cluster" 2>&1 | while IFS= read -r line; do echo "[PR] $line"; done) &
-pr_pid=$!
-echo "PR job started in background (PID: $pr_pid)"
-
-(set -o pipefail; stdbuf -oL -eL bash "${SCRIPT_DIR}/submit_and_monitor_bench.sh" master "$device" "$interface" "$cluster" 2>&1 | while IFS= read -r line; do echo "[MASTER] $line"; done) &
-master_pid=$!
-echo "Master job started in background (PID: $master_pid)"
-
-echo "Waiting for both jobs to complete..."
-
-# Wait and capture exit codes reliably.
-# Use `wait ... || exit=$?` to avoid set -e aborting on the first failure
-# (which would orphan the second job).
+# The bench script must come from the PR tree (master may not have it).
+PR_BENCH_SCRIPT="$(cd "${SCRIPT_DIR}/../workflows/common" && pwd)/bench.sh"
+# Must match the slug computed by submit-slurm-job.sh:
+# basename("bench.sh") → "bench" → "bench-${device}-${interface}"
+job_slug="bench-${device}-${interface}"
+
+# --- Phase 1: Submit both SLURM jobs (no monitoring yet) ---
+echo "Submitting PR benchmark..."
+(cd pr && SUBMIT_ONLY=1 bash "${SCRIPT_DIR}/submit-slurm-job.sh" "$PR_BENCH_SCRIPT" "$device" "$interface" "$cluster")
+pr_job_id=$(cat "pr/${job_slug}.slurm_job_id")
+echo "PR job submitted: $pr_job_id"
+
+echo "Submitting master benchmark..."
+(cd master && SUBMIT_ONLY=1 bash "${SCRIPT_DIR}/submit-slurm-job.sh" "$PR_BENCH_SCRIPT" "$device" "$interface" "$cluster")
+master_job_id=$(cat "master/${job_slug}.slurm_job_id")
+echo "Master job submitted: $master_job_id"
+
+echo "Both SLURM jobs submitted — running concurrently on compute nodes."
+echo "Monitoring sequentially to conserve login node memory."
+
+# --- Phase 2: Monitor sequentially (one at a time on login node) ---
+echo ""
+echo "=== Monitoring PR job $pr_job_id ==="
 pr_exit=0
-master_exit=0
-
-wait "$pr_pid" || pr_exit=$?
+bash "${SCRIPT_DIR}/run_monitored_slurm_job.sh" "$pr_job_id" "pr/${job_slug}.out" || pr_exit=$?
 if [ "$pr_exit" -ne 0 ]; then
-    echo "PR job exited with code: $pr_exit"
-    echo "Last 50 lines of PR job log:"
-    tail -n 50 "pr/bench-${device}-${interface}.out" 2>/dev/null || echo " Could not read PR log"
+  echo "PR job exited with code: $pr_exit"
+  tail -n 50 "pr/${job_slug}.out" 2>/dev/null || echo " Could not read PR log"
 else
-    echo "PR job completed successfully"
+  echo "PR job completed successfully"
 fi
 
-wait "$master_pid" || master_exit=$?
+echo ""
+echo "=== Monitoring master job $master_job_id ==="
+master_exit=0
+bash "${SCRIPT_DIR}/run_monitored_slurm_job.sh" "$master_job_id" "master/${job_slug}.out" || master_exit=$?
 if [ "$master_exit" -ne 0 ]; then
-    echo "Master job exited with code: $master_exit"
-    echo "Last 50 lines of master job log:"
-    tail -n 50 "master/bench-${device}-${interface}.out" 2>/dev/null || echo " Could not read master log"
+  echo "Master job exited with code: $master_exit"
+  tail -n 50 "master/${job_slug}.out" 2>/dev/null || echo " Could not read master log"
 else
-    echo "Master job completed successfully"
+  echo "Master job completed successfully"
 fi
 
-# Warn if either job failed (partial results may still be usable)
+# --- Phase 3: Verify outputs ---
 if [ "${pr_exit}" -ne 0 ] || [ "${master_exit}" -ne 0 ]; then
-    echo "WARNING: Benchmark jobs had failures: pr=${pr_exit}, master=${master_exit}"
-    echo "Checking for partial results..."
+  echo "WARNING: Benchmark jobs had failures: pr=${pr_exit}, master=${master_exit}"
+  echo "Checking for partial results..."
 else
-    echo "=========================================="
-    echo "Both benchmark jobs completed successfully!"
-    echo "=========================================="
+  echo "=========================================="
+  echo "Both benchmark jobs completed successfully!"
+  echo "=========================================="
 fi
 
-# Final verification that output files exist before proceeding
-pr_yaml="pr/bench-${device}-${interface}.yaml"
-master_yaml="master/bench-${device}-${interface}.yaml"
+pr_yaml="pr/${job_slug}.yaml"
+master_yaml="master/${job_slug}.yaml"
+
+# Wait briefly for YAML files to appear on NFS. When monitoring starts
+# after a job has already completed (common for the second job), the
+# recovery path in run_monitored_slurm_job.sh sleeps 30s, but NFS
+# propagation can take longer under load.
+for yaml in "$pr_yaml" "$master_yaml"; do
+  attempts=0
+  while [ ! -f "$yaml" ] && [ $attempts -lt 6 ]; do
+    echo "Waiting for $yaml to appear (NFS propagation)..."
+    sleep 5
+    attempts=$((attempts + 1))
+  done
+done
 
 if [ ! -f "$pr_yaml" ]; then
-    echo "ERROR: PR benchmark output not found: $pr_yaml"
-    ls -la pr/ || true
-    echo ""
-    echo "Last 100 lines of PR log:"
-    tail -n 100 "pr/bench-${device}-${interface}.out" 2>/dev/null || echo " Could not read PR log"
-    exit 1
+  echo "ERROR: PR benchmark output not found: $pr_yaml"
+  ls -la pr/ || true
+  echo ""
+  tail -n 100 "pr/${job_slug}.out" 2>/dev/null || echo " Could not read PR log"
+  exit 1
 fi
 if [ ! -f "$master_yaml" ]; then
-    echo "ERROR: Master benchmark output not found: $master_yaml"
-    ls -la master/ || true
-    echo ""
-    echo "Last 100 lines of master log:"
-    tail -n 100 "master/bench-${device}-${interface}.out" 2>/dev/null || echo " Could not read master log"
-    exit 1
+  echo "ERROR: Master benchmark output not found: $master_yaml"
+  ls -la master/ || true
+  echo ""
+  tail -n 100 "master/${job_slug}.out" 2>/dev/null || echo " Could not read master log"
+  exit 1
 fi
 
 echo "Verified both YAML files exist:"
 echo " - $pr_yaml"
 echo " - $master_yaml"
-
diff --git a/.github/scripts/submit-slurm-job.sh b/.github/scripts/submit-slurm-job.sh
index 78dd1ee9a2..231205556d 100755
--- a/.github/scripts/submit-slurm-job.sh
+++ b/.github/scripts/submit-slurm-job.sh
@@ -200,5 +200,9 @@ echo "Submitted batch job $job_id"
 echo "$job_id" > "$id_file"
 echo "Job ID written to $id_file"
 
-# --- Monitor ---
-bash "$SCRIPT_DIR/run_monitored_slurm_job.sh" "$job_id" "$output_file"
+# --- Monitor (skip if SUBMIT_ONLY=1, e.g. for parallel submission) ---
+if [ "${SUBMIT_ONLY:-0}" = "1" ]; then
+  echo "SUBMIT_ONLY mode: skipping monitor (job_id=$job_id output=$output_file)"
+else
+  bash "$SCRIPT_DIR/run_monitored_slurm_job.sh" "$job_id" "$output_file"
+fi