Skip to content

GPT OSS Integration Code #4439

GPT OSS Integration Code

GPT OSS Integration Code #4439

Workflow file for this run

name: Basic HPU test suite
on:
  # Runs for pull requests that target main or a releases/** branch.
  pull_request_target:
    types: [opened, synchronize, reopened, ready_for_review, labeled, unlabeled]
    branches:
      - main
      - releases/**
  # Allow manual triggering for testing purposes
  workflow_dispatch: {}
# Default token permissions for every job that does not declare its own.
permissions:
  # Once any permission is listed, every unlisted one is set to "none";
  # jobs that run actions/checkout therefore get read access explicitly.
  contents: read
  pull-requests: write
  # This line allows it to read the status of checks like DCO
  checks: read
concurrency:
  # One group per PR number (or per ref when there is no PR);
  # a newer run in the same group cancels the one in progress.
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true
jobs:
gatekeeper:
  # Decides whether the rest of the CI may run. The job fails, which blocks
  # every job that needs it, when the PR is a draft or is behind its base
  # branch; the reason is also posted as a PR comment.
  runs-on: ubuntu-latest
  permissions:
    # The checkout step below reads the repository contents
    contents: read
    # Required to read the status of checks and PR details
    checks: read
    # Allows posting comments on PRs
    pull-requests: write
  outputs:
    # This output will be used to signal if the main CI should run
    run_ci: ${{ steps.check_conditions.outputs.run_ci }}
    reason: ${{ steps.check_conditions.outputs.reason }}
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4
      with:
        ref: ${{ github.event.pull_request.head.sha }}
        clean: true  # Ensure a clean workspace before checkout
        # Full history, needed by the rev-list comparison in the next step
        fetch-depth: 0
    - name: Check for other blocking conditions
      id: check_conditions
      # Event data reaches the script through environment variables rather
      # than inline expression expansion, so it is never parsed as shell code.
      env:
        BASE_REF: ${{ github.event.pull_request.base.ref }}
        IS_DRAFT: ${{ github.event.pull_request.draft }}
      run: |
        BLOCKING_REASON=""
        if_pr_is_behind="false"
        # --- Check whether the PR is behind the base branch ---
        if [[ -n "$BASE_REF" ]]; then
          echo "Checking if the PR is behind the base branch..."
          # Refresh only the base branch tip. No --depth here: the checkout
          # above fetched full history and a depth-limited fetch could cut
          # the history that rev-list counts.
          git fetch --no-tags origin "+refs/heads/${BASE_REF}:refs/remotes/origin/${BASE_REF}"
          behind_count=$(git rev-list --count "HEAD..origin/${BASE_REF}")
          if [[ $behind_count -gt 0 ]]; then
            echo "PR is behind by ${behind_count} commit(s)."
            if_pr_is_behind="true"
          else
            echo "PR is up-to-date with the base branch."
          fi
        else
          # Manually dispatched runs carry no pull_request payload, so there
          # is no base branch to compare against.
          echo "No pull request context; skipping the base-branch check."
        fi
        # --- End of base-branch check ---
        if [[ "$IS_DRAFT" == "true" ]]; then
          BLOCKING_REASON="This is a **Draft PR**. Please mark it as 'Ready for Review' to trigger the CI."
        elif [[ "$if_pr_is_behind" == "true" ]]; then
          BLOCKING_REASON="Your branch is **behind the base branch**. Please merge or rebase to get the latest changes."
        fi
        if [[ -n "$BLOCKING_REASON" ]]; then
          # The reason is written with the multi-line "name<<DELIMITER" output syntax
          {
            echo "run_ci=false"
            echo "reason<<EOF"
            echo "$BLOCKING_REASON"
            echo "EOF"
          } >> "$GITHUB_OUTPUT"
        else
          echo "run_ci=true" >> "$GITHUB_OUTPUT"
        fi
        echo "Reason (if blocked): $BLOCKING_REASON"
    - name: Post comment if CI is blocked
      # This ensures that even if commenting fails, the next step will run
      continue-on-error: true
      # Only run this step if the check failed
      if: steps.check_conditions.outputs.run_ci == 'false'
      uses: peter-evans/create-or-update-comment@v4
      with:
        issue-number: ${{ github.event.pull_request.number }}
        body: |
          ### 🚧 CI Blocked
          The main CI workflow was not started for the following reason:
          > ${{ steps.check_conditions.outputs.reason }}
    - name: Fail the job to block downstream CI
      if: steps.check_conditions.outputs.run_ci == 'false'
      run: |
        echo "Failing this job to prevent the main CI from running."
        exit 1
# Runner selection: this job lands on one runner from the chosen pool and
# publishes that runner's name so the later jobs can target the same one.
# A PR title containing [FIX_FOR_VLLM_LATEST] selects the hourly-ci pool;
# every other PR uses pr-ci.
discover_runner:
  needs: gatekeeper
  runs-on: ${{ contains(github.event.pull_request.title, '[FIX_FOR_VLLM_LATEST]') && 'hourly-ci' || 'pr-ci' }}
  outputs:
    # Consumed by the other jobs as needs.discover_runner.outputs.runner_name
    runner_name: ${{ steps.get_name.outputs.name }}
  steps:
    - id: get_name
      name: Get runner name
      run: |
        echo "This workflow will run on: ${{ runner.name }}"
        echo "name=${{ runner.name }}" >> "$GITHUB_OUTPUT"
discover_tests:
  # Builds the JSON list of test functions that the e2e job fans out over.
  needs: discover_runner
  # Runs on the runner picked by discover_runner
  runs-on: ${{ needs.discover_runner.outputs.runner_name }}
  outputs:
    matrix: ${{ steps.set-matrix.outputs.matrix }}
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4
      with:
        clean: true  # Ensure a clean workspace before checkout
        ref: ${{ github.event.pull_request.head.sha }}
    - id: set-matrix
      name: Discover test functions
      run: |
        # Collect the first word of every line of the test script that starts
        # with 'run_', strip a trailing '()' from it, and pack the names into
        # a compact JSON array (the format the matrix strategy expects).
        TEST_FUNCTIONS=$( grep '^run_' ./tests/full_tests/ci_gsm8k_tests.sh | \
          awk '{print $1}' | \
          sed 's/()//' | \
          jq -R . | jq -s -c . )
        echo "Discovered test matrix: $TEST_FUNCTIONS"
        # An empty array means nothing matched: stop here instead of
        # handing an empty matrix to the e2e job.
        if [ "$TEST_FUNCTIONS" = "[]" ]; then
          echo "::error::No test functions were discovered. Failing the workflow."
          exit 1
        fi
        echo "matrix=$TEST_FUNCTIONS" >> "$GITHUB_OUTPUT"
pre-commit:
  # Lint gate: starts once gatekeeper has passed and runs on a GitHub-hosted
  # runner; pre_merge_hpu_test_build lists this job in its needs.
  needs: gatekeeper
  runs-on: ubuntu-latest
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4
      with:
        clean: true  # Ensure a clean workspace before checkout
        ref: ${{ github.event.pull_request.head.sha }}
    - name: Setup Python
      uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38  # v5.4.0
      with:
        python-version: '3.12'
    - name: Add problem matchers
      # Registers the matcher definitions stored under .github/workflows/matchers
      run: |
        echo "::add-matcher::.github/workflows/matchers/actionlint.json"
        echo "::add-matcher::.github/workflows/matchers/mypy.json"
    - name: Run pre-commit hooks
      uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd  # v3.0.1
      with:
        extra_args: --all-files --hook-stage manual
pre_merge_hpu_test_build:
  # Resolves the vLLM commit to test against and builds the Docker image that
  # all HPU test jobs run. Skipped when the PR carries the 'skip-gaudi-tests' label.
  if: >
    !contains(github.event.pull_request.labels.*.name, 'skip-gaudi-tests')
  needs: [pre-commit, discover_tests, discover_runner]
  # Runs on the runner picked by discover_runner
  runs-on: ${{ needs.discover_runner.outputs.runner_name }}
  permissions:
    contents: read  # Required to checkout code and read history
  outputs:
    # Resolved commit ID of the vLLM revision the image was built from
    target_commit: ${{ steps.checkout_vllm_upstream.outputs.TEST_VLLM_COMMIT }}
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4
      with:
        ref: ${{ github.event.pull_request.head.sha }}
        clean: true  # Ensure a clean workspace before checkout
    - name: Read Stable Commit from Target Branch
      id: read_stable_commit
      run: |
        TARGET_FILE="VLLM_STABLE_COMMIT"
        COMMIT_HASH=""
        # 1. Primary Check: Look in the current branch
        echo "INFO: Checking for '$TARGET_FILE' in the current branch..."
        if [ -f "$TARGET_FILE" ]; then
          COMMIT_HASH=$(tr -d '[:space:]' < "$TARGET_FILE")
          if [ -n "$COMMIT_HASH" ]; then
            echo "✅ Found stable commit in current branch: $COMMIT_HASH"
          else
            echo "::warning:: File '$TARGET_FILE' is empty in current branch. Proceeding to fallback."
          fi
        else
          echo "INFO: File not found in current branch. Proceeding to fallback."
        fi
        # 2. Fallback Check: Look in the remote branch if the first check failed
        if [ -z "$COMMIT_HASH" ]; then
          TARGET_BRANCH="vllm/last-good-commit-for-vllm-gaudi"
          echo "➡️ Fallback: Attempting to read '$TARGET_FILE' from branch '$TARGET_BRANCH'..."
          # Best effort: a failed fetch is tolerated and handled by the empty-hash check below
          git fetch origin "$TARGET_BRANCH" --no-tags --depth=1 || true
          # Read file from the remote branch; suppress errors if not found
          COMMIT_HASH=$(git show "origin/$TARGET_BRANCH:$TARGET_FILE" 2>/dev/null) || COMMIT_HASH=""
          COMMIT_HASH=$(echo "$COMMIT_HASH" | tr -d '[:space:]')
          if [ -n "$COMMIT_HASH" ]; then
            echo "✅ Found stable commit in fallback branch: $COMMIT_HASH"
          else
            echo "::error:: Fallback failed. Could not find a valid commit hash in either location."
          fi
        fi
        # 3. Export the final result (possibly empty; the next step handles that case)
        echo "VLLM_STABLE_COMMIT=$COMMIT_HASH" >> "$GITHUB_ENV"
    - name: Determine Target Commit Based on PR Title
      id: determine_commit
      # The title is imported through the environment so it is never expanded into the script text
      env:
        PR_TITLE_ENV: ${{ github.event.pull_request.title }}
      run: |
        # Default to the stable commit from the previous step
        FINAL_TARGET="${VLLM_STABLE_COMMIT}"
        PR_TITLE="$PR_TITLE_ENV"
        echo "Pull Request Title: \"${PR_TITLE}\""
        # Check if the title contains the special flag
        if [[ "$PR_TITLE" == *"[FIX_FOR_VLLM_LATEST]"* ]]; then
          echo "✅ Flag '[FIX_FOR_VLLM_LATEST]' found in title."
          echo "Setting target to 'main'."
          FINAL_TARGET="vllm-upstream/main"
        elif [ -z "$FINAL_TARGET" ]; then
          # No stable commit was found by the previous step
          echo "⚠️ Stable commit was not found. Defaulting target to 'main'."
          FINAL_TARGET="vllm-upstream/main"
        else
          echo "Using stable commit: ${FINAL_TARGET}"
        fi
        # Export the result to an environment variable for subsequent steps
        echo "TARGET_COMMIT=${FINAL_TARGET}" >> "$GITHUB_ENV"
        echo "commit_ref=${FINAL_TARGET}" >> "$GITHUB_OUTPUT"
    - name: Add vLLM upstream as a remote and checkout target commit
      id: checkout_vllm_upstream
      run: |
        # TARGET_COMMIT may originate from a file in the PR checkout, so it is
        # read as a shell variable (set via GITHUB_ENV by the previous step)
        # and never expanded into the script text.
        echo "Attempting to remove remote branch if it exists..."
        git remote remove vllm-upstream || true
        echo "add vllm-upstream remote..."
        git remote add vllm-upstream https://github.com/vllm-project/vllm.git
        echo "Checking out code from '${TARGET_COMMIT}'..."
        git fetch vllm-upstream --depth=100
        git checkout "${TARGET_COMMIT}"
        # Resolve whatever was checked out (branch ref or hash) to a commit ID
        COMMIT_ID=$(git rev-parse HEAD)
        echo "TEST_VLLM_COMMIT=$COMMIT_ID" >> "$GITHUB_OUTPUT"
        echo "TEST_VLLM_COMMIT=$COMMIT_ID" >> "$GITHUB_ENV"
        echo "Target commit ID for testing: $COMMIT_ID"
    - name: Checkout repository
      # The previous step switched the workspace to the vLLM commit;
      # check the PR head out again before building.
      uses: actions/checkout@v4
      with:
        ref: ${{ github.event.pull_request.head.sha }}
        clean: true  # Ensure a clean workspace before checkout
    - name: Setup Docker environment and build image
      # Using a multi-line string for the Dockerfile is generally fine,
      # but consider moving it to a separate file for better readability/maintainability
      run: |
        echo "Attempting to build Docker image..."
        # Ensure this image is accessible from GitHub Actions (e.g., public registry or authenticated private registry).
        # The heredoc is unquoted, so '$' must be escaped wherever the
        # Dockerfile itself (not this shell) is meant to expand it.
        docker build \
          --no-cache \
          --build-arg VLLM_COMMIT_ARG="${TEST_VLLM_COMMIT}" \
          -t hpu-plugin-v1-test-env-pre-merge-${{ github.event.pull_request.head.sha }} \
          -f - . <<EOF
        FROM vault.habana.ai/gaudi-docker/1.23.0/ubuntu24.04/habanalabs/pytorch-installer-2.9.0:latest
        ARG VLLM_COMMIT_ARG
        COPY ./ /workspace/vllm-gaudi
        WORKDIR /workspace
        RUN git clone https://github.com/vllm-project/vllm.git vllm
        WORKDIR /workspace/vllm
        RUN git checkout \$VLLM_COMMIT_ARG
        # Pinning versions in requirements might be good practice for CI consistency
        RUN pip install pytest pytest_asyncio pytest-timeout
        RUN pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git
        ENV no_proxy=localhost,127.0.0.1
        ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
        # Install the build requirements except lines starting with "torch".
        # (The pattern is a plain prefix, not the bracket expression [torch],
        # which would also drop every line starting with t, o, r, c or h.)
        RUN bash -c 'pip install -r <(sed "/^torch/d" requirements/build.txt)'
        RUN VLLM_TARGET_DEVICE=empty pip install --no-build-isolation .
        # install development dependencies (for testing)
        RUN python3 -m pip install -e tests/vllm_test_utils
        WORKDIR /workspace/vllm-gaudi
        RUN pip install -e .
        WORKDIR /workspace
        # Symlinks so that the vLLM tests, examples and benchmarks are
        # reachable directly under /workspace
        RUN ln -s /workspace/vllm/tests /workspace/tests \
          && ln -s /workspace/vllm/examples /workspace/examples \
          && ln -s /workspace/vllm/benchmarks /workspace/benchmarks
        EOF
        echo "Docker image built successfully."
hpu_unit_tests:
  # Runs pytest on tests/unit_tests inside the image tagged by pre_merge_hpu_test_build.
  needs: [pre_merge_hpu_test_build, discover_runner]
  # Runs on the runner picked by discover_runner
  runs-on: ${{ needs.discover_runner.outputs.runner_name }}
  steps:
    - name: Run pytest in tests/unit_tests
      env:
        # Forwarded to the container by name (-e HF_TOKEN below), so the
        # secret value is not expanded into the script text.
        HF_TOKEN: ${{ secrets.HF_TOKEN }}
      run: |
        # Stays 1 unless the test command below reports its own status
        EXITCODE=1
        remove_docker_containers() { docker rm -f hpu-plugin-v1-test-unit-tests-${{ github.event.pull_request.head.sha }} || true; }
        # Always remove the container on exit and propagate the recorded status
        trap 'remove_docker_containers; exit $EXITCODE;' EXIT
        remove_docker_containers
        echo "Running HPU plugin v1 unit tests"
        # Run inside 'if' so that a failing test run does not abort the
        # script (errexit) before its exit status has been recorded.
        if docker run --rm --runtime=habana --name=hpu-plugin-v1-test-unit-tests-${{ github.event.pull_request.head.sha }} --network=host \
          -e HABANA_VISIBLE_DEVICES=all \
          -e HF_HOME=/workspace/hf_cache \
          -e HF_TOKEN \
          -v /mnt/hf_cache:/workspace/hf_cache \
          hpu-plugin-v1-test-env-pre-merge-${{ github.event.pull_request.head.sha }} \
          /bin/bash -c "pytest -vvv --timeout=300 --durations=10 --durations-min=1.0 /workspace/vllm-gaudi/tests/unit_tests"; then
          EXITCODE=0
        else
          EXITCODE=$?
        fi
        echo "Test script exited with code: $EXITCODE"
hpu_pd_tests:
  # Runs run_accuracy_test.sh (logged as "nixl pd tests") inside the test image.
  needs: [pre_merge_hpu_test_build, hpu_unit_tests, discover_runner]
  # Runs on the runner picked by discover_runner
  runs-on: ${{ needs.discover_runner.outputs.runner_name }}
  steps:
    - name: Run test scripts
      env:
        # Forwarded to the container by name (-e HF_TOKEN below), so the
        # secret value is not expanded into the script text.
        HF_TOKEN: ${{ secrets.HF_TOKEN }}
      run: |
        # Stays 1 unless the test command below reports its own status
        EXITCODE=1
        remove_docker_containers() { docker rm -f hpu-plugin-v1-test-pd-tests-${{ github.event.pull_request.head.sha }} || true; }
        # Always remove the container on exit and propagate the recorded status
        trap 'remove_docker_containers; exit $EXITCODE;' EXIT
        remove_docker_containers
        echo "Running HPU plugin v1 nixl pd tests"
        # Run inside 'if' so that a failing test run does not abort the
        # script (errexit) before its exit status has been recorded.
        if docker run --rm --runtime=habana --name=hpu-plugin-v1-test-pd-tests-${{ github.event.pull_request.head.sha }} --network=host \
          --privileged \
          -e HABANA_VISIBLE_DEVICES=all \
          -e HF_HOME=/workspace/hf_cache \
          -e HF_TOKEN \
          -v /mnt/hf_cache:/workspace/hf_cache \
          -v /mnt/wheels_cache:/tmp/wheels_cache \
          hpu-plugin-v1-test-env-pre-merge-${{ github.event.pull_request.head.sha }} \
          /bin/bash -c "
            pip install lm-eval[api] &&
            cd /workspace/vllm-gaudi/tests/unit_tests &&
            ./run_accuracy_test.sh
          "; then
          EXITCODE=0
        else
          EXITCODE=$?
        fi
        echo "Test script exited with code: $EXITCODE"
hpu_perf_tests:
  # Runs tests/full_tests/ci_perf_tests.sh inside the test image.
  needs: [pre_merge_hpu_test_build, hpu_unit_tests, discover_runner]
  # Runs on the runner picked by discover_runner
  runs-on: ${{ needs.discover_runner.outputs.runner_name }}
  steps:
    - name: Run test scripts
      env:
        # Forwarded to the container by name (-e HF_TOKEN below), so the
        # secret value is not expanded into the script text.
        HF_TOKEN: ${{ secrets.HF_TOKEN }}
      run: |
        # Stays 1 unless the test command below reports its own status
        EXITCODE=1
        remove_docker_containers() { docker rm -f hpu-plugin-v1-test-perf-tests-${{ github.event.pull_request.head.sha }} || true; }
        # Always remove the container on exit and propagate the recorded status
        trap 'remove_docker_containers; exit $EXITCODE;' EXIT
        remove_docker_containers
        echo "Running HPU plugin v1 perf tests"
        # Run inside 'if' so that a failing test run does not abort the
        # script (errexit) before its exit status has been recorded.
        if docker run --rm --runtime=habana --name=hpu-plugin-v1-test-perf-tests-${{ github.event.pull_request.head.sha }} --network=host \
          --privileged \
          -e HABANA_VISIBLE_DEVICES=all \
          -e HF_TOKEN \
          -e HF_HOME=/workspace/hf_cache \
          -v /mnt/hf_cache:/workspace/hf_cache \
          hpu-plugin-v1-test-env-pre-merge-${{ github.event.pull_request.head.sha }} \
          /bin/bash "/workspace/vllm-gaudi/tests/full_tests/ci_perf_tests.sh"; then
          EXITCODE=0
        else
          EXITCODE=$?
        fi
        echo "Test script exited with code: $EXITCODE"
hpu_dp_tests:
  # Runs examples/data_parallel.py with --dp-size 2 --tp-size 2 inside the test image.
  needs: [pre_merge_hpu_test_build, hpu_unit_tests, discover_runner]
  # Runs on the runner picked by discover_runner
  runs-on: ${{ needs.discover_runner.outputs.runner_name }}
  steps:
    - name: Run test scripts
      env:
        # Forwarded to the container by name (-e HF_TOKEN below), so the
        # secret value is not expanded into the script text.
        HF_TOKEN: ${{ secrets.HF_TOKEN }}
      run: |
        # Stays 1 unless the test command below reports its own status
        EXITCODE=1
        remove_docker_containers() { docker rm -f hpu-plugin-v1-test-dp-tests-${{ github.event.pull_request.head.sha }} || true; }
        # Always remove the container on exit and propagate the recorded status
        trap 'remove_docker_containers; exit $EXITCODE;' EXIT
        remove_docker_containers
        echo "Running HPU plugin v1 dp tests"
        # Run inside 'if' so that a failing test run does not abort the
        # script (errexit) before its exit status has been recorded.
        if docker run --rm --runtime=habana --name=hpu-plugin-v1-test-dp-tests-${{ github.event.pull_request.head.sha }} --network=host \
          --privileged \
          -e HABANA_VISIBLE_DEVICES=all \
          -e HF_HOME=/workspace/hf_cache \
          -e VLLM_SKIP_WARMUP=true \
          -e PT_HPU_LAZY_MODE=1 \
          -e HF_TOKEN \
          -v /mnt/hf_cache:/workspace/hf_cache \
          hpu-plugin-v1-test-env-pre-merge-${{ github.event.pull_request.head.sha }} \
          /bin/bash -c "python -u /workspace/vllm-gaudi/examples/data_parallel.py --dp-size 2 --tp-size 2"; then
          EXITCODE=0
        else
          EXITCODE=$?
        fi
        echo "Test script exited with code: $EXITCODE"
e2e:
  # Runs ci_gsm8k_tests.sh once per discovered test function, each in its own container.
  needs: [pre_merge_hpu_test_build, hpu_unit_tests, discover_tests, discover_runner]
  # Runs on the runner picked by discover_runner
  runs-on: ${{ needs.discover_runner.outputs.runner_name }}
  strategy:
    # One failing test function does not cancel the others
    fail-fast: false
    matrix:
      # The list of test functions is dynamically populated from the output of the 'discover_tests' job.
      test_function: ${{ fromJson(needs.discover_tests.outputs.matrix) }}
  steps:
    - name: Run test suite - ${{ matrix.test_function }}
      env:
        # Forwarded to the container by name (-e HF_TOKEN below), so the
        # secret value is not expanded into the script text.
        HF_TOKEN: ${{ secrets.HF_TOKEN }}
        # The function name is derived from a file in the PR checkout; it is
        # passed as data through the environment rather than expanded into
        # the script, so it cannot be parsed as shell code.
        TEST_FUNCTION: ${{ matrix.test_function }}
      run: |
        # Stays 1 unless the test command below reports its own status
        EXITCODE=1
        CONTAINER_NAME="hpu-plugin-test-e2e-${{ github.event.pull_request.head.sha }}-${TEST_FUNCTION}"
        # Ensure the container is removed upon exit, regardless of success or failure.
        remove_docker_containers() { docker rm -f "$CONTAINER_NAME" || true; }
        trap 'remove_docker_containers; exit $EXITCODE;' EXIT
        remove_docker_containers
        echo "Running HPU plugin test: ${TEST_FUNCTION}"
        # Run inside 'if' so that a failing test run does not abort the
        # script (errexit) before its exit status has been recorded.
        if docker run --rm --runtime=habana --name="$CONTAINER_NAME" --network=host \
          --privileged \
          -e HABANA_VISIBLE_DEVICES=all \
          -e HF_HOME=/workspace/hf_cache \
          -e HF_TOKEN \
          -v /mnt/hf_cache:/workspace/hf_cache \
          hpu-plugin-v1-test-env-pre-merge-${{ github.event.pull_request.head.sha }} \
          /bin/bash "/workspace/vllm-gaudi/tests/full_tests/ci_gsm8k_tests.sh" "$TEST_FUNCTION"; then
          EXITCODE=0
        else
          EXITCODE=$?
        fi
        echo "Test script exited with code: $EXITCODE"
pre_merge_hpu_test:
  # Aggregate status job for pre-merge CI. It does no work of its own and
  # only runs (and passes) when every job in its "needs" list has passed.
  # hpu_pd_tests and hpu_dp_tests are not in that list, so they do not gate this job.
  needs: [hpu_unit_tests, e2e, hpu_perf_tests, discover_runner]
  # Runs on the runner picked by discover_runner
  runs-on: ${{ needs.discover_runner.outputs.runner_name }}
  steps:
    - name: Succeeded if all previous jobs passed
      run: echo "All previous jobs passed."
# Reports a successful run back to the pull request as a comment.
post-comment:
  name: Post PR Comment
  # Waits for the aggregate job and for the build job, whose output carries the tested vLLM commit
  needs: [pre_merge_hpu_test, pre_merge_hpu_test_build]
  runs-on: ubuntu-latest
  permissions:
    # Permissions are required on a per-job basis
    pull-requests: write
  steps:
    - name: Post Comment on Success
      uses: peter-evans/create-or-update-comment@v4
      if: needs.pre_merge_hpu_test.result == 'success'
      with:
        issue-number: ${{ github.event.pull_request.number }}
        body: |
          ### ✅ CI Passed
          All checks passed successfully against the following vllm commit:
          **`${{ needs.pre_merge_hpu_test_build.outputs.target_commit }}`**