Skip to content

Commit 9da965e

Browse files
Merge branch 'main' into dev/kpietkun/tests_custom_op_correctness
2 parents e9e95fa + c43ec9c commit 9da965e

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

58 files changed

+3087
-402
lines changed

.cd/Dockerfile.rhel.tenc.pytorch.vllm

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -61,7 +61,7 @@ RUN \
6161
git fetch upstream --tags || true && \
6262
git checkout ${VLLM_PROJECT_COMMIT} && \
6363
# Install vllm-project/vllm
64-
bash -c "pip install -r <(sed '/^[torch]/d' requirements/build.txt)" && \
64+
bash -c "pip install -r <(sed '/^torch/d' requirements/build.txt)" && \
6565
VLLM_TARGET_DEVICE=empty pip install --no-build-isolation . && \
6666
# Install vllm-gaudi plugin
6767
cd $VLLM_PATH2 && \

.cd/Dockerfile.ubuntu.pytorch.vllm

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -51,7 +51,7 @@ RUN \
5151
git fetch upstream --tags || true && \
5252
git checkout ${VLLM_PROJECT_COMMIT} && \
5353
# Install vllm-project/vllm
54-
bash -c "pip install -r <(sed '/^[torch]/d' requirements/build.txt)" && \
54+
bash -c "pip install -r <(sed '/^torch/d' requirements/build.txt)" && \
5555
VLLM_TARGET_DEVICE=empty pip install --no-build-isolation . && \
5656
# Install vllm-gaudi plugin
5757
cd $VLLM_PATH2 && \

.cd/Dockerfile.ubuntu.pytorch.vllm.nixl.latest

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,7 @@ RUN \
4545
git remote add upstream https://github.com/vllm-project/vllm.git && \
4646
git fetch upstream --tags || true && \
4747
git checkout ${VLLM_COMMIT_HASH} && \
48-
pip install -r <(sed '/^[torch]/d' requirements/build.txt) && \
48+
pip install -r <(sed '/^torch/d' requirements/build.txt) && \
4949
VLLM_TARGET_DEVICE=empty pip install --no-build-isolation . && \
5050
\
5151
# Install vllm-gaudi

.github/actionlint.yaml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,3 +2,4 @@ self-hosted-runner:
22
labels:
33
- ucb-vllm-cicd-g2
44
- hourly-ci
5+
- pr-ci

.github/workflows/create-release-branch.yaml

Lines changed: 49 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -120,9 +120,27 @@ jobs:
120120
echo "branch_name=pre_${{ github.event.inputs.branch_name }}" >> "$GITHUB_OUTPUT"
121121
fi
122122
123-
setup_and_build:
124-
runs-on: hourly-ci
123+
# --- NEW JOB ---
124+
# This job runs after prep, picks one 'hourly-ci' runner,
125+
# and outputs its name so all other test jobs can target it.
126+
discover_runner:
127+
name: "Discover Self-Hosted Runner"
125128
needs: [prepare-release-branch]
129+
runs-on: hourly-ci
130+
outputs:
131+
runner_name: ${{ steps.get_name.outputs.name }}
132+
steps:
133+
- name: Get runner name
134+
id: get_name
135+
run: |
136+
echo "This workflow will run on: ${{ runner.name }}"
137+
echo "name=${{ runner.name }}" >> "$GITHUB_OUTPUT"
138+
139+
setup_and_build:
140+
# --- UPDATED: Add discover_runner dependency ---
141+
needs: [prepare-release-branch, discover_runner]
142+
# --- UPDATED: Run on the specific node ---
143+
runs-on: ${{ needs.discover_runner.outputs.runner_name }}
126144
steps:
127145
- name: "Checkout the release branch"
128146
uses: actions/checkout@v4
@@ -168,8 +186,10 @@ jobs:
168186
echo "Docker image built successfully."
169187
170188
run_unit_tests:
171-
needs: [prepare-release-branch, setup_and_build]
172-
runs-on: hourly-ci
189+
# --- UPDATED: Add discover_runner dependency ---
190+
needs: [prepare-release-branch, setup_and_build, discover_runner]
191+
# --- UPDATED: Run on the specific node ---
192+
runs-on: ${{ needs.discover_runner.outputs.runner_name }}
173193
steps:
174194
- name: Run pytest in tests/unit_tests
175195
run: |
@@ -191,8 +211,10 @@ jobs:
191211
echo "Test script exited with code: $EXITCODE"
192212
193213
discover_tests:
194-
runs-on: hourly-ci
195-
needs: [prepare-release-branch]
214+
# --- UPDATED: Add discover_runner dependency ---
215+
needs: [prepare-release-branch, discover_runner]
216+
# --- UPDATED: Run on the specific node ---
217+
runs-on: ${{ needs.discover_runner.outputs.runner_name }}
196218
outputs:
197219
matrix: ${{ steps.set-matrix.outputs.matrix }}
198220
steps:
@@ -204,9 +226,9 @@ jobs:
204226
id: set-matrix
205227
run: |
206228
TEST_FUNCTIONS=$( grep '^run_' ./tests/full_tests/ci_gsm8k_tests.sh | \
207-
awk '{print $1}' | \
208-
sed 's/()//' | \
209-
jq -R . | jq -s -c . )
229+
awk '{print $1}' | \
230+
sed 's/()//' | \
231+
jq -R . | jq -s -c . )
210232
211233
echo "Discovered test matrix: $TEST_FUNCTIONS"
212234
if [ "$TEST_FUNCTIONS" = "[]" ]; then
@@ -216,8 +238,10 @@ jobs:
216238
echo "matrix=$TEST_FUNCTIONS" >> "$GITHUB_OUTPUT"
217239
218240
e2e:
219-
needs: [prepare-release-branch, setup_and_build, discover_tests]
220-
runs-on: hourly-ci
241+
# --- UPDATED: Add discover_runner dependency ---
242+
needs: [prepare-release-branch, setup_and_build, discover_tests, discover_runner]
243+
# --- UPDATED: Run on the specific node ---
244+
runs-on: ${{ needs.discover_runner.outputs.runner_name }}
221245
strategy:
222246
fail-fast: false
223247
matrix:
@@ -245,8 +269,10 @@ jobs:
245269
echo "Test script exited with code: $EXITCODE"
246270
247271
run_data_parallel_test:
248-
needs: [prepare-release-branch, setup_and_build]
249-
runs-on: hourly-ci
272+
# --- UPDATED: Add discover_runner dependency ---
273+
needs: [prepare-release-branch, setup_and_build, discover_runner]
274+
# --- UPDATED: Run on the specific node ---
275+
runs-on: ${{ needs.discover_runner.outputs.runner_name }}
250276
steps:
251277
- name: Run Data Parallel test
252278
run: |
@@ -271,8 +297,10 @@ jobs:
271297
echo "Test script exited with code: $EXITCODE"
272298
273299
run_pd_disaggregate_test:
274-
needs: [prepare-release-branch, setup_and_build]
275-
runs-on: hourly-ci
300+
# --- UPDATED: Add discover_runner dependency ---
301+
needs: [prepare-release-branch, setup_and_build, discover_runner]
302+
# --- UPDATED: Run on the specific node ---
303+
runs-on: ${{ needs.discover_runner.outputs.runner_name }}
276304
steps:
277305
- name: Run PD disaggregate test
278306
run: |
@@ -298,8 +326,10 @@ jobs:
298326
echo "Test script exited with code: $EXITCODE"
299327
300328
run_hpu_perf_tests:
301-
needs: [prepare-release-branch, setup_and_build]
302-
runs-on: hourly-ci
329+
# --- UPDATED: Add discover_runner dependency ---
330+
needs: [prepare-release-branch, setup_and_build, discover_runner]
331+
# --- UPDATED: Run on the specific node ---
332+
runs-on: ${{ needs.discover_runner.outputs.runner_name }}
303333
steps:
304334
- name: Run Sharegpt performance tests with warmup
305335
run: |
@@ -324,6 +354,8 @@ jobs:
324354
summarize_and_notify:
325355
name: "Summarize Test Results and Notify"
326356
runs-on: ubuntu-latest
357+
# --- This job runs on ubuntu-latest, so no runner change is needed ---
358+
# It will correctly wait for all the test jobs to finish
327359
if: needs.prepare-release-branch.result == 'success'
328360
needs:
329361
- prepare-release-branch

.github/workflows/hourly-ci.yaml

Lines changed: 52 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -13,13 +13,29 @@ on:
1313
workflow_dispatch: {}
1414

1515
jobs:
16-
# JOB 1: Sets up the environment and builds the Docker image needed for all tests.
16+
# JOB 1: (NEW) Discovers an available runner and locks it for all subsequent jobs.
17+
discover_runner:
18+
runs-on: hourly-ci # Picks any available runner from the 'hourly-ci' pool
19+
outputs:
20+
runner_name: ${{ steps.get_name.outputs.name }}
21+
steps:
22+
- name: Get runner name
23+
id: get_name
24+
# This command gets the unique name of the runner (e.g., "my-runner-123")
25+
# and saves it as an output variable
26+
run: |
27+
echo "This workflow will run on: ${{ runner.name }}"
28+
echo "name=${{ runner.name }}" >> "$GITHUB_OUTPUT"
29+
30+
# JOB 2: (UPDATED) Sets up the environment and builds the Docker image.
1731
setup_and_build:
1832
if: |
1933
github.event_name == 'workflow_dispatch' ||
2034
github.ref == 'refs/heads/main'
21-
runs-on: hourly-ci
22-
needs: discover_tests
35+
# <-- UPDATED: Now needs 'discover_tests' AND 'discover_runner'
36+
needs: [discover_tests, discover_runner]
37+
# <-- UPDATED: Runs on the specific runner from the discover_runner job
38+
runs-on: ${{ needs.discover_runner.outputs.runner_name }}
2339
permissions:
2440
contents: read # Required to checkout code and read history
2541
outputs:
@@ -103,9 +119,12 @@ jobs:
103119
EOF
104120
echo "Docker image built successfully."
105121
122+
# JOB 3: (UPDATED)
106123
run_unit_tests:
107-
needs: setup_and_build
108-
runs-on: hourly-ci
124+
# <-- UPDATED: Now needs 'setup_and_build' AND 'discover_runner'
125+
needs: [setup_and_build, discover_runner]
126+
# <-- UPDATED: Runs on the specific runner
127+
runs-on: ${{ needs.discover_runner.outputs.runner_name }}
109128
steps:
110129
- name: Run pytest in tests/unit_tests
111130
run: |
@@ -126,8 +145,12 @@ jobs:
126145
EXITCODE=$?
127146
echo "Test script exited with code: $EXITCODE"
128147
148+
# JOB 4: (UPDATED)
129149
discover_tests:
130-
runs-on: hourly-ci
150+
# <-- UPDATED: Now needs 'discover_runner'
151+
needs: discover_runner
152+
# <-- UPDATED: Runs on the specific runner
153+
runs-on: ${{ needs.discover_runner.outputs.runner_name }}
131154
outputs:
132155
matrix: ${{ steps.set-matrix.outputs.matrix }}
133156
steps:
@@ -140,9 +163,9 @@ jobs:
140163
# naming convention, excluding the main 'run_all_tests' function itself.
141164
# The final list is formatted into a JSON array required for the matrix strategy.
142165
TEST_FUNCTIONS=$( grep '^run_' ./tests/full_tests/ci_gsm8k_tests.sh | \
143-
awk '{print $1}' | \
144-
sed 's/()//' | \
145-
jq -R . | jq -s -c . )
166+
awk '{print $1}' | \
167+
sed 's/()//' | \
168+
jq -R . | jq -s -c . )
146169
147170
echo "Discovered test matrix: $TEST_FUNCTIONS"
148171
# Fail the job if no tests were found.
@@ -152,9 +175,12 @@ jobs:
152175
fi
153176
echo "matrix=$TEST_FUNCTIONS" >> "$GITHUB_OUTPUT"
154177
178+
# JOB 5: (UPDATED)
155179
e2e:
156-
needs: [setup_and_build, discover_tests]
157-
runs-on: hourly-ci
180+
# <-- UPDATED: Now needs 'setup_and_build', 'discover_tests', AND 'discover_runner'
181+
needs: [setup_and_build, discover_tests, discover_runner]
182+
# <-- UPDATED: Runs on the specific runner
183+
runs-on: ${{ needs.discover_runner.outputs.runner_name }}
158184
strategy:
159185
fail-fast: false
160186
matrix:
@@ -183,9 +209,12 @@ jobs:
183209
EXITCODE=$?
184210
echo "Test script exited with code: $EXITCODE"
185211
212+
# JOB 6: (UPDATED)
186213
run_data_parallel_test:
187-
needs: setup_and_build
188-
runs-on: hourly-ci
214+
# <-- UPDATED: Now needs 'setup_and_build' AND 'discover_runner'
215+
needs: [setup_and_build, discover_runner]
216+
# <-- UPDATED: Runs on the specific runner
217+
runs-on: ${{ needs.discover_runner.outputs.runner_name }}
189218
steps:
190219
- name: Run Data Parallel test
191220
run: |
@@ -209,9 +238,12 @@ jobs:
209238
EXITCODE=$?
210239
echo "Test script exited with code: $EXITCODE"
211240
241+
# JOB 7: (UPDATED)
212242
run_pd_disaggregate_test:
213-
needs: setup_and_build
214-
runs-on: hourly-ci
243+
# <-- UPDATED: Now needs 'setup_and_build' AND 'discover_runner'
244+
needs: [setup_and_build, discover_runner]
245+
# <-- UPDATED: Runs on the specific runner
246+
runs-on: ${{ needs.discover_runner.outputs.runner_name }}
215247
steps:
216248
- name: Run PD disaggregate test
217249
run: |
@@ -236,9 +268,12 @@ jobs:
236268
EXITCODE=$?
237269
echo "Test script exited with code: $EXITCODE"
238270
271+
# JOB 8: (UPDATED)
239272
store_last_stable_vllm_commit:
240-
needs: [setup_and_build, run_unit_tests, e2e, run_data_parallel_test, run_pd_disaggregate_test]
241-
runs-on: hourly-ci
273+
# <-- UPDATED: Now needs all test jobs AND 'discover_runner'
274+
needs: [setup_and_build, run_unit_tests, e2e, run_data_parallel_test, run_pd_disaggregate_test, discover_runner]
275+
# <-- UPDATED: Runs on the specific runner
276+
runs-on: ${{ needs.discover_runner.outputs.runner_name }}
242277
permissions:
243278
contents: write # Permission is required to push a commit
244279
steps:

0 commit comments

Comments
 (0)