From 8743ad122ce58dd8cb7b8bd7958385edee0f5cec Mon Sep 17 00:00:00 2001
From: Eric Shi
Date: Sat, 10 Jan 2026 18:46:40 -0800
Subject: [PATCH 1/2] Add visible re-run instructions when GPU jobs fail

When tests or benchmarks fail, display warnings explaining that
"Re-run failed jobs" won't work due to ephemeral EC2 runners.

- Error annotations appear at top of job view
- Job summary table in the Summary tab
- ASCII banner in log output

Signed-off-by: Eric Shi
---
 .github/workflows/aws_gpu_benchmarks.yml | 38 ++++++++++++++++++++++++
 .github/workflows/aws_gpu_tests.yml      | 38 ++++++++++++++++++++++++
 2 files changed, 76 insertions(+)

diff --git a/.github/workflows/aws_gpu_benchmarks.yml b/.github/workflows/aws_gpu_benchmarks.yml
index c95b44a7f..0cfdec5dd 100644
--- a/.github/workflows/aws_gpu_benchmarks.yml
+++ b/.github/workflows/aws_gpu_benchmarks.yml
@@ -131,6 +131,44 @@ jobs:
           uvx asv compare --split ${{ inputs.base_ref }} ${{ inputs.ref }}
           exit 2
 
+      - name: Re-run instructions
+        if: failure()
+        run: |
+          # Create error annotations (appear at top of job summary)
+          echo "::error::DO NOT use 'Re-run failed jobs' - the EC2 runner no longer exists and your job will be queued forever."
+          echo "::error::USE 'Re-run all jobs' instead to start a fresh EC2 runner."
+
+          # Write to job summary (appears in Summary tab)
+          cat >> "$GITHUB_STEP_SUMMARY" << 'EOF'
+          ## ⚠️ How to Re-run This Workflow
+
+          This workflow uses **ephemeral EC2 runners** that are terminated after each run.
+
+          | | Option | Result |
+          |---|--------|--------|
+          | ❌ | **Re-run failed jobs** | Runner no longer exists → job queued forever |
+          | ✅ | **Re-run all jobs** | Starts new EC2 runner → benchmarks re-run |
+          EOF
+
+          # Also print to log for completeness
+          cat << 'EOF'
+
+          ================================================================================
+          ⚠️ IMPORTANT: HOW TO RE-RUN THIS WORKFLOW
+          ================================================================================
+
+          This workflow uses ephemeral EC2 runners that are terminated after each run.
+
+          ❌ DO NOT select "Re-run failed jobs"
+          → The runner no longer exists and your job will be queued forever.
+
+          ✅ DO select "Re-run all jobs"
+          → This will start a new EC2 runner and re-run the benchmarks.
+
+          ================================================================================
+
+          EOF
+
   stop-runner:
     name: Stop self-hosted EC2 runner
     runs-on: ubuntu-latest
diff --git a/.github/workflows/aws_gpu_tests.yml b/.github/workflows/aws_gpu_tests.yml
index a7c554ca6..34becdf1d 100644
--- a/.github/workflows/aws_gpu_tests.yml
+++ b/.github/workflows/aws_gpu_tests.yml
@@ -118,6 +118,44 @@ jobs:
           flags: unittests
           token: ${{ secrets.CODECOV_TOKEN }}
 
+      - name: Re-run instructions
+        if: failure()
+        run: |
+          # Create error annotations (appear at top of job summary)
+          echo "::error::DO NOT use 'Re-run failed jobs' - the EC2 runner no longer exists and your job will be queued forever."
+          echo "::error::USE 'Re-run all jobs' instead to start a fresh EC2 runner."
+
+          # Write to job summary (appears in Summary tab)
+          cat >> "$GITHUB_STEP_SUMMARY" << 'EOF'
+          ## ⚠️ How to Re-run This Workflow
+
+          This workflow uses **ephemeral EC2 runners** that are terminated after each run.
+
+          | | Option | Result |
+          |---|--------|--------|
+          | ❌ | **Re-run failed jobs** | Runner no longer exists → job queued forever |
+          | ✅ | **Re-run all jobs** | Starts new EC2 runner → tests re-run |
+          EOF
+
+          # Also print to log for completeness
+          cat << 'EOF'
+
+          ================================================================================
+          ⚠️ IMPORTANT: HOW TO RE-RUN THIS WORKFLOW
+          ================================================================================
+
+          This workflow uses ephemeral EC2 runners that are terminated after each run.
+
+          ❌ DO NOT select "Re-run failed jobs"
+          → The runner no longer exists and your job will be queued forever.
+
+          ✅ DO select "Re-run all jobs"
+          → This will start a new EC2 runner and re-run the tests.
+
+          ================================================================================
+
+          EOF
+
   stop-runner:
     name: Stop self-hosted EC2 runner
     runs-on: ubuntu-latest

From d18a324165f913e47e4002e16788b36dea5ea387 Mon Sep 17 00:00:00 2001
From: Eric Shi
Date: Sat, 10 Jan 2026 18:49:32 -0800
Subject: [PATCH 2/2] Prevent Push - AWS GPU workflow from running in forks

Signed-off-by: Eric Shi
---
 .github/workflows/push_aws_gpu.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/push_aws_gpu.yml b/.github/workflows/push_aws_gpu.yml
index d9c7f11dd..0a7170619 100644
--- a/.github/workflows/push_aws_gpu.yml
+++ b/.github/workflows/push_aws_gpu.yml
@@ -13,5 +13,6 @@ on:
 
 jobs:
   run-tests:
+    if: github.repository == 'newton-physics/newton'
     uses: ./.github/workflows/aws_gpu_tests.yml
     secrets: inherit