MLA PS mode add metadata split reference code #5916
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Aiter CI workflow: triggers, concurrency policy and workflow-wide environment.
name: Aiter Test

on:
  push:
    branches: [main]
  pull_request:
    types: [opened, synchronize, reopened, ready_for_review]
    branches: [main]  # Triggers on PRs targeting `main`
    # Changes limited to these paths do not trigger the workflow for PRs.
    paths-ignore:
      - '**/*.md'
      - 'docs/**'
      - 'LICENSE'
      - '.gitignore'
  workflow_dispatch:

# One run per workflow + ref; a newer run cancels the older one everywhere
# except on `main`, where in-flight runs are left to finish.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

env:
  # Base image, pinned by digest.
  DOCKER_IMAGE: "rocm/pytorch:latest@sha256:683765a52c61341e1674fe730ab3be861a444a45a36c0a8caae7653a08a0e208"
  GPU_ARCH_LIST: "gfx942;gfx950"
  # PR events clone the PR head repository; every other event uses upstream.
  GITHUB_REPO_URL: ${{ github.event.pull_request.head.repo.clone_url || 'https://github.com/ROCm/aiter.git' }}
  # PR head SHA for pull_request events, pushed commit for push events.
  # `github.sha` is the final fallback so that `workflow_dispatch` runs (which
  # carry neither a pull_request nor a head_commit payload) do not end up with
  # an empty SHA and therefore an empty `git checkout` ref / image tag suffix.
  GITHUB_COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.event.head_commit.id || github.sha }}
  # Default test selection; the sharded test job overrides it per shard.
  AITER_TEST: "op_tests"

jobs:
# --- Job: check-signal (entry of the top-level `jobs:` mapping) ---
  check-signal:
    # Runs the repository's signal-check script on a GitHub-hosted runner.
    # Skipped for draft PRs; non-PR events (push, manual dispatch) always run.
    runs-on: ubuntu-latest
    if: ${{ !github.event.pull_request || github.event.pull_request.draft == false }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      # The script receives the workflow token and the triggering commit SHA
      # through its environment.
      - name: Download and check signal artifact
        env:
          GITHUB_SHA: ${{ github.sha }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: ./.github/scripts/check_signal.sh
# --- Job: build_aiter_image (entry of the top-level `jobs:` mapping) ---
  build_aiter_image:
    # Builds an image with Aiter and its prebuilt kernels installed on top of
    # the pinned base image, then pushes it as
    # rocm/aiter-ci:pre-build-<commit sha>.
    #
    # Every step is gated on "not a fork PR" instead of gating the job itself,
    # so for fork PRs the job still completes (doing nothing) and the test jobs
    # fall back to the plain base image.
    if: ${{ !github.event.pull_request || github.event.pull_request.draft == false }}
    runs-on: build-only-aiter
    needs: check-signal
    steps:
      - name: Checkout code
        if: ${{ !github.event.pull_request.head.repo.fork }}
        uses: actions/checkout@v4

      # - name: Prepare docker config
      #   run: |
      #     export DOCKER_CONFIG="$HOME/.docker"
      #     mkdir -p "$DOCKER_CONFIG" || true
      #     cp /docker-config/config.json "$DOCKER_CONFIG/config.json"
      #     echo "DOCKER_CONFIG=$DOCKER_CONFIG" >> "$GITHUB_ENV"

      # The heredoc delimiter is unquoted on purpose: the runner shell expands
      # ${GITHUB_REF:-} while writing Dockerfile.mod, so the main-vs-PR branch
      # decision is baked into the generated file.
      # NOTE(review): the version probes query `amd-aiter` while the uninstall
      # targets `aiter` - confirm which distribution name the base image ships.
      - name: Generate Dockerfile
        if: ${{ !github.event.pull_request.head.repo.fork }}
        run: |
          cat <<EOF > Dockerfile.mod
          FROM ${{ env.DOCKER_IMAGE }}
          RUN echo "=== Aiter version BEFORE uninstall ===" && pip show amd-aiter || true
          RUN pip uninstall -y aiter
          RUN pip install --upgrade pandas zmq einops numpy==1.26.2
          RUN pip install --upgrade "pybind11>=3.0.1"
          RUN pip install --upgrade "ninja>=1.11.1"
          RUN pip install tabulate
          RUN pip list
          RUN rm -rf aiter \
              && git clone ${{ env.GITHUB_REPO_URL }} aiter \
              && cd aiter \
              && git checkout ${{ env.GITHUB_COMMIT_SHA }} \
              && if [ "${GITHUB_REF:-}" = "refs/heads/main" ]; then \
                  echo "It's main branch, syncing latest CK..."; \
                  git submodule set-branch --branch develop 3rdparty/composable_kernel; \
                  git submodule sync && \
                  git submodule update --init --recursive --remote --jobs 4; \
              else \
                  echo "It's a PR branch, syncing specific CK..."; \
                  git submodule sync && \
                  git submodule update --init --recursive --depth 1 --jobs 4; \
              fi \
              && pip install -r requirements.txt \
              && echo "Prebuilding kernels with GPU_ARCHS: ${{ env.GPU_ARCH_LIST }} and PREBUILD_KERNELS: 1" \
              && PREBUILD_KERNELS=1 GPU_ARCHS="${{ env.GPU_ARCH_LIST }}" python setup.py build_ext --inplace \
              && pip install -e . \
              && echo "Prebuilding kernels completed"
          RUN echo "=== Aiter version AFTER installation ===" && pip show amd-aiter || true
          EOF

      - name: Show Dockerfile
        if: ${{ !github.event.pull_request.head.repo.fork }}
        run: cat Dockerfile.mod

      - name: Build Docker image
        if: ${{ !github.event.pull_request.head.repo.fork }}
        run: |
          IMAGE_TAG=rocm/aiter-ci:pre-build-${{ env.GITHUB_COMMIT_SHA }}
          docker build --network=host --no-cache -t $IMAGE_TAG -f Dockerfile.mod .

      # Advisory only: a low .so count emits a workflow warning annotation but
      # does not fail the job.
      - name: Verify prebuilt kernels
        if: ${{ !github.event.pull_request.head.repo.fork }}
        run: |
          IMAGE_TAG=rocm/aiter-ci:pre-build-${{ env.GITHUB_COMMIT_SHA }}
          echo "=== Prebuilt kernel validation ==="
          KERNEL_COUNT=$(docker run --rm $IMAGE_TAG find /aiter/aiter/jit -name "*.so" | wc -l)
          echo "Prebuilt kernel .so files: $KERNEL_COUNT"
          docker run --rm $IMAGE_TAG find /aiter/aiter/jit -name "*.so" | sort
          if [ "$KERNEL_COUNT" -lt 10 ]; then
            echo "::warning::Prebuild may have failed: expected at least 10 kernel .so files, found $KERNEL_COUNT. This can cause JIT compilation and OOM at runtime."
          else
            echo "Prebuild validation passed: $KERNEL_COUNT kernels compiled"
          fi

      # The registry password reaches the script through the step environment
      # and is fed to `docker login` on stdin, so it is neither interpolated
      # into the script text nor visible in the process argument list.
      - name: Push Docker image
        if: ${{ !github.event.pull_request.head.repo.fork }}
        env:
          DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }}
        run: |
          IMAGE_TAG=rocm/aiter-ci:pre-build-${{ env.GITHUB_COMMIT_SHA }}
          printf '%s' "$DOCKER_PASSWORD" | docker login -u rocmshared --password-stdin
          docker push $IMAGE_TAG

      - name: Success message
        if: ${{ !github.event.pull_request.head.repo.fork }}
        run: |
          echo "Successfully prepared image: rocm/aiter-ci:pre-build-${{ env.GITHUB_COMMIT_SHA }}"
# --- Job: split_aiter_tests (entry of the top-level `jobs:` mapping) ---
  split_aiter_tests:
    # Runs the repository's split script for 5 shards and publishes the
    # resulting aiter_shard_*.list files as the `aiter_shards` artifact, which
    # the sharded test job downloads.
    if: ${{ !github.event.pull_request || github.event.pull_request.draft == false }}
    runs-on: ubuntu-latest
    needs: [check-signal, build_aiter_image]
    outputs:
      # Job outputs are strings; quoted so the value is not typed as an integer.
      shard_count: "5"
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Split Aiter Tests (5 shards)
        run: ./.github/scripts/split_tests.sh --shards 5 --test-type aiter

      - name: Upload test shard lists as artifact
        uses: actions/upload-artifact@v4
        with:
          name: aiter_shards
          path: aiter_shard_*.list
          # Fail here if the split produced no list files, rather than letting
          # every test shard fail later on a missing artifact.
          if-no-files-found: error
# --- Job: standard (entry of the top-level `jobs:` mapping) ---
  standard:
    # Sharded single-GPU test run: 5 shards on each of two runner pools.
    # Each matrix entry starts a container named `aiter_test` (the prebuilt
    # image, or the plain base image for fork PRs), runs its shard through
    # aiter_test.sh and uploads latest_test.log when the shard succeeds.
    if: ${{ !github.event.pull_request || github.event.pull_request.draft == false }}
    name: Standard Tests (1 GPU)
    needs: [build_aiter_image, split_aiter_tests]
    strategy:
      fail-fast: false
      matrix:
        include:
          - runner: linux-aiter-mi355-1
            label: MI355
            shard_total: 5
            shard_idx: 0
          - runner: linux-aiter-mi355-1
            label: MI355
            shard_total: 5
            shard_idx: 1
          - runner: linux-aiter-mi355-1
            label: MI355
            shard_total: 5
            shard_idx: 2
          - runner: linux-aiter-mi355-1
            label: MI355
            shard_total: 5
            shard_idx: 3
          - runner: linux-aiter-mi355-1
            label: MI355
            shard_total: 5
            shard_idx: 4
          - runner: aiter-1gpu-runner
            label: MI325
            shard_total: 5
            shard_idx: 0
          - runner: aiter-1gpu-runner
            label: MI325
            shard_total: 5
            shard_idx: 1
          - runner: aiter-1gpu-runner
            label: MI325
            shard_total: 5
            shard_idx: 2
          - runner: aiter-1gpu-runner
            label: MI325
            shard_total: 5
            shard_idx: 3
          - runner: aiter-1gpu-runner
            label: MI325
            shard_total: 5
            shard_idx: 4
    runs-on: ${{ matrix.runner }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event.pull_request.head.sha || github.sha }}

      # Best effort (`|| true`): a failed login does not fail the job.
      # The password is passed via the step environment and stdin so it is not
      # interpolated into the script or exposed on the command line.
      - name: Docker login
        env:
          DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }}
        run: |
          printf '%s' "$DOCKER_PASSWORD" | docker login -u rocmshared --password-stdin || true

      - name: Download test shard lists
        uses: actions/download-artifact@v4
        with:
          name: aiter_shards

      - name: List test shard files
        run: |
          ls -l aiter_shard_*.list

      # Overrides the workflow-level AITER_TEST for the following steps.
      # Values written to GITHUB_ENV only become visible in LATER steps, so the
      # value is echoed from a local variable instead of $AITER_TEST (which, in
      # this step, still holds the workflow default). The delimiter form keeps
      # the write valid even if the shard list spans several lines.
      - name: Export test file list for this shard as env
        id: set_shard_files
        run: |
          shard_tests="$(cat aiter_shard_${{ matrix.shard_idx }}.list)"
          {
            echo "AITER_TEST<<__AITER_TEST_EOF__"
            echo "$shard_tests"
            echo "__AITER_TEST_EOF__"
          } >> "$GITHUB_ENV"
          echo "$shard_tests"

      - name: Run the container
        run: |
          set -ex
          echo "Starting container: aiter_test"
          # Use the device flags provided by the runner pod when present.
          if [ -f "/etc/podinfo/gha-render-devices" ]; then
            DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
          else
            DEVICE_FLAG="--device /dev/dri"
          fi
          # Fork PRs have no prebuilt image; they start from the base image and
          # build Aiter inside the container in the next step.
          if [ "${{ github.event.pull_request.head.repo.fork }}" = "true" ]; then
            IMAGE_TAG=${{ env.DOCKER_IMAGE }}
          else
            IMAGE_TAG=rocm/aiter-ci:pre-build-${{ env.GITHUB_COMMIT_SHA }}
          fi
          # $DEVICE_FLAG is intentionally unquoted: it may hold several flags.
          docker run -dt \
            --device=/dev/kfd $DEVICE_FLAG \
            --shm-size=16G \
            --network=host \
            --group-add $(getent group render | cut -d: -f3) \
            --group-add $(getent group video | cut -d: -f3) \
            -e AITER_TEST="${AITER_TEST}" \
            -v "${{ github.workspace }}:/workspace" \
            -w /workspace \
            --name aiter_test \
            $IMAGE_TAG

      - name: Setup Aiter for fork PR
        if: ${{ github.event.pull_request.head.repo.fork }}
        run: |
          set -ex
          git submodule sync && git submodule update --init --recursive --depth 1 --jobs 4
          echo "Setting up Aiter for fork PR..."
          docker exec \
            -w /workspace \
            aiter_test \
            bash -c "BUILD_TRITON=0 ./.github/scripts/build_aiter_triton.sh"

      - name: Show Aiter version
        run: |
          set -ex
          docker exec \
            -w /workspace \
            aiter_test \
            bash -c "pip show amd-aiter || true"

      # The two branches differ only in MAX_JOBS=20, set for fork PRs.
      - name: Tests
        run: |
          set -ex
          if [ "${{ github.event.pull_request.head.repo.fork }}" = "true" ]; then
            docker exec \
              -w /workspace \
              aiter_test \
              bash -c "MAX_JOBS=20 SHARD_TOTAL=${{ matrix.shard_total }} SHARD_IDX=${{ matrix.shard_idx }} ./.github/scripts/aiter_test.sh"
          else
            docker exec \
              -w /workspace \
              aiter_test \
              bash -c "SHARD_TOTAL=${{ matrix.shard_total }} SHARD_IDX=${{ matrix.shard_idx }} ./.github/scripts/aiter_test.sh"
          fi

      - name: Collect test logs
        if: always()
        run: |
          echo "Collecting test logs..."
          echo "Aiter Operator Tests Summary:" >> $GITHUB_STEP_SUMMARY
          python3 ./.github/scripts/collect_logs.py latest_test.log >> $GITHUB_STEP_SUMMARY

      # Uploaded only on success: the results job treats a missing log
      # artifact as a failed shard.
      - name: Upload test logs
        uses: actions/upload-artifact@v4
        if: success()
        with:
          name: standard-test-log-${{ matrix.runner }}-shard-${{ matrix.shard_idx }}
          path: latest_test.log

      - name: Cleanup container
        if: always()
        run: |
          docker rm -f aiter_test || true
# --- Job: standard-test-finish (entry of the top-level `jobs:` mapping) ---
  standard-test-finish:
    # Aggregates the sharded test results: passes only when a log artifact
    # exists for every (runner, shard) pair.
    #
    # `!cancelled()` replaces the implicit `success()` status check. Without a
    # status function in `if`, this job is skipped whenever any shard fails,
    # which makes the failure branch below unreachable and leaves this check
    # reported as skipped instead of failed.
    if: ${{ !cancelled() && !github.event.pull_request.draft }}
    name: Standard Test Results
    runs-on: ubuntu-latest
    needs: [standard]
    steps:
      # Without merge-multiple, each artifact is extracted into a directory
      # named after the artifact.
      - name: Download all test logs
        uses: actions/download-artifact@v4
        with:
          pattern: standard-test-log-*-shard-*
          path: .

      # Informational listing; tolerate "no match" so the check step below
      # still runs and reports exactly which logs are missing.
      - name: List test logs
        run: |
          ls -l standard-test-log-* || true

      - name: Check Standard Test Results
        run: |
          set -ex
          echo "Checking Standard Test Results..."
          all_passed=true
          # The shard range and runner names must mirror the `standard` matrix.
          for shard in {0..4}; do
            for runner in {linux-aiter-mi355-1,aiter-1gpu-runner}; do
              if [ ! -f "standard-test-log-${runner}-shard-${shard}/latest_test.log" ]; then
                echo "Test report for ${runner} shard ${shard} not found."
                all_passed=false
              fi
            done
          done
          if [ "$all_passed" = true ]; then
            echo "All tests passed."
          else
            echo "Test failures or errors detected."
            exit 1
          fi
# --- Job: multi-gpu (entry of the top-level `jobs:` mapping) ---
  multi-gpu:
    # 8-GPU test run, only for the `main` ref, on two runner pools.
    # Starts a container named `aiter_test`, runs aiter_test.sh with
    # MULTIGPU=TRUE and always uploads latest_test.log.
    name: Multi-GPU Tests (8 GPU)
    if: github.ref == 'refs/heads/main'
    needs: build_aiter_image
    strategy:
      fail-fast: false
      matrix:
        include:
          - runner: linux-aiter-mi355-8
            label: MI355
          - runner: aiter-8gpu-runner
            label: MI325
    runs-on: ${{ matrix.runner }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      # Best effort (`|| true`): a failed login does not fail the job.
      # The password is passed via the step environment and stdin so it is not
      # interpolated into the script or exposed on the command line.
      - name: Docker login
        env:
          DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }}
        run: |
          printf '%s' "$DOCKER_PASSWORD" | docker login -u rocmshared --password-stdin || true

      - name: Run the container
        run: |
          set -ex
          echo "Starting container: aiter_test"
          # Use the device flags provided by the runner pod when present.
          if [ -f "/etc/podinfo/gha-render-devices" ]; then
            DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
          else
            DEVICE_FLAG="--device /dev/dri"
          fi
          if [ "${{ github.event.pull_request.head.repo.fork }}" = "true" ]; then
            IMAGE_TAG=${{ env.DOCKER_IMAGE }}
          else
            IMAGE_TAG=rocm/aiter-ci:pre-build-${{ env.GITHUB_COMMIT_SHA }}
          fi
          # $DEVICE_FLAG is intentionally unquoted: it may hold several flags.
          docker run -dt \
            --device=/dev/kfd $DEVICE_FLAG \
            --shm-size=16G \
            --network=host \
            --group-add $(getent group render | cut -d: -f3) \
            --group-add $(getent group video | cut -d: -f3) \
            -v "${{ github.workspace }}:/workspace" \
            -w /workspace \
            --name aiter_test \
            $IMAGE_TAG

      - name: Setup Aiter for fork PR
        if: ${{ github.event.pull_request.head.repo.fork }}
        run: |
          set -ex
          echo "Setting up Aiter for fork PR..."
          docker exec \
            -w /workspace \
            aiter_test \
            bash -c "BUILD_TRITON=0 ./.github/scripts/build_aiter_triton.sh"

      - name: Show Aiter version
        run: |
          set -ex
          docker exec \
            -w /workspace \
            aiter_test \
            bash -c "pip show amd-aiter || true"

      - name: Tests
        run: |
          set -ex
          docker exec \
            -e MULTIGPU=TRUE \
            -w /workspace \
            aiter_test \
            bash -c "./.github/scripts/aiter_test.sh"

      - name: Upload test logs
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: multigpu-test-${{ matrix.runner }}
          path: latest_test.log

      - name: Cleanup container
        if: always()
        run: |
          docker rm -f aiter_test || true

      - name: Clean up Rocm processes
        if: always()
        run: |
          ./.github/scripts/clean_up_rocm.sh