Skip to content

Fix "BLOCK_SIZE_S3 unrecognized" error caused by config reuse #5913

Fix "BLOCK_SIZE_S3 unrecognized" error caused by config reuse

Fix "BLOCK_SIZE_S3 unrecognized" error caused by config reuse #5913

Workflow file for this run

# Workflow header: triggers, concurrency policy and workflow-wide environment.
name: Aiter Test
on:
  push:
    branches: [main]
  pull_request:
    types: [opened, synchronize, reopened, ready_for_review]
    branches: [main]  # Triggers on PRs targeting `main`
    # Documentation-only and housekeeping changes do not trigger a PR run.
    paths-ignore:
      - '**/*.md'
      - 'docs/**'
      - 'LICENSE'
      - '.gitignore'
  workflow_dispatch:
# One active run per workflow and ref; a newer run cancels the older one
# everywhere except on `main`.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
env:
  # Base image, pinned by digest.
  DOCKER_IMAGE: "rocm/pytorch:latest@sha256:683765a52c61341e1674fe730ab3be861a444a45a36c0a8caae7653a08a0e208"
  # Passed as GPU_ARCHS to the kernel prebuild in build_aiter_image.
  GPU_ARCH_LIST: "gfx942;gfx950"
  # Clone URL of the PR head repository, or the upstream repository otherwise.
  GITHUB_REPO_URL: ${{ github.event.pull_request.head.repo.clone_url || 'https://github.com/ROCm/aiter.git' }}
  # Commit to build and test. `pull_request.head.sha` covers PRs and
  # `head_commit.id` covers pushes; neither exists in a `workflow_dispatch`
  # payload, so fall back to `github.sha` to avoid an empty SHA (and thus an
  # image tag of just `pre-build-`) on manual runs.
  GITHUB_COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.event.head_commit.id || github.sha }}
  # Default test selection; the `standard` job overrides it per shard.
  AITER_TEST: "op_tests"
jobs:
# Entry gate of the pipeline: runs the repository's check_signal.sh script.
# Every other job in this workflow depends on it directly or through
# build_aiter_image. Runs for non-PR events and for PRs that are not drafts.
check-signal:
  runs-on: ubuntu-latest
  if: ${{ !github.event.pull_request || github.event.pull_request.draft == false }}
  steps:
    - uses: actions/checkout@v4
      name: Checkout code
    # The script receives the workflow token and the triggering commit SHA
    # through its environment; what it does with them is defined in
    # .github/scripts/check_signal.sh, not here.
    - name: Download and check signal artifact
      env:
        GITHUB_SHA: ${{ github.sha }}
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      run: ./.github/scripts/check_signal.sh
# Builds, validates and pushes the CI image
# rocm/aiter-ci:pre-build-<GITHUB_COMMIT_SHA> with Aiter installed and its
# kernels prebuilt. Every step is skipped for PRs coming from forks, so for
# those the job succeeds without producing an image.
build_aiter_image:
  if: ${{ !github.event.pull_request || github.event.pull_request.draft == false }}
  runs-on: build-only-aiter
  needs: check-signal
  steps:
    - name: Checkout code
      if: ${{ !github.event.pull_request.head.repo.fork }}
      uses: actions/checkout@v4
    # - name: Prepare docker config
    #   run: |
    #     export DOCKER_CONFIG="$HOME/.docker"
    #     mkdir -p "$DOCKER_CONFIG" || true
    #     cp /docker-config/config.json "$DOCKER_CONFIG/config.json"
    #     echo "DOCKER_CONFIG=$DOCKER_CONFIG" >> "$GITHUB_ENV"
    # The heredoc delimiter is unquoted, so the runner's shell expands
    # ${GITHUB_REF:-} while writing Dockerfile.mod: the main-vs-PR decision is
    # baked into the generated file rather than evaluated inside the build.
    - name: Generate Dockerfile
      if: ${{ !github.event.pull_request.head.repo.fork }}
      run: |
        cat <<EOF > Dockerfile.mod
        FROM ${{ env.DOCKER_IMAGE }}
        RUN echo "=== Aiter version BEFORE uninstall ===" && pip show amd-aiter || true
        RUN pip uninstall -y aiter
        RUN pip install --upgrade pandas zmq einops numpy==1.26.2
        RUN pip install --upgrade "pybind11>=3.0.1"
        RUN pip install --upgrade "ninja>=1.11.1"
        RUN pip install tabulate
        RUN pip list
        RUN rm -rf aiter \
            && git clone ${{ env.GITHUB_REPO_URL }} aiter \
            && cd aiter \
            && git checkout ${{ env.GITHUB_COMMIT_SHA }} \
            && if [ "${GITHUB_REF:-}" = "refs/heads/main" ]; then \
                echo "It's main branch, syncing latest CK..."; \
                git submodule set-branch --branch develop 3rdparty/composable_kernel; \
                git submodule sync && \
                git submodule update --init --recursive --remote --jobs 4; \
            else \
                echo "It's a PR branch, syncing specific CK..."; \
                git submodule sync && \
                git submodule update --init --recursive --depth 1 --jobs 4; \
            fi \
            && pip install -r requirements.txt \
            && echo "Prebuilding kernels with GPU_ARCHS: ${{ env.GPU_ARCH_LIST }} and PREBUILD_KERNELS: 1" \
            && PREBUILD_KERNELS=1 GPU_ARCHS="${{ env.GPU_ARCH_LIST }}" python setup.py build_ext --inplace \
            && pip install -e . \
            && echo "Prebuilding kernels completed"
        RUN echo "=== Aiter version AFTER installation ===" && pip show amd-aiter || true
        EOF
    - name: Show Dockerfile
      if: ${{ !github.event.pull_request.head.repo.fork }}
      run: cat Dockerfile.mod
    - name: Build Docker image
      if: ${{ !github.event.pull_request.head.repo.fork }}
      run: |
        IMAGE_TAG=rocm/aiter-ci:pre-build-${{ env.GITHUB_COMMIT_SHA }}
        docker build --network=host --no-cache -t "$IMAGE_TAG" -f Dockerfile.mod .
    # Advisory check only: a low .so count emits a warning annotation but does
    # not fail the job.
    - name: Verify prebuilt kernels
      if: ${{ !github.event.pull_request.head.repo.fork }}
      run: |
        IMAGE_TAG=rocm/aiter-ci:pre-build-${{ env.GITHUB_COMMIT_SHA }}
        echo "=== Prebuilt kernel validation ==="
        KERNEL_COUNT=$(docker run --rm "$IMAGE_TAG" find /aiter/aiter/jit -name "*.so" | wc -l)
        echo "Prebuilt kernel .so files: $KERNEL_COUNT"
        docker run --rm "$IMAGE_TAG" find /aiter/aiter/jit -name "*.so" | sort
        if [ "$KERNEL_COUNT" -lt 10 ]; then
          echo "::warning::Prebuild may have failed: expected at least 10 kernel .so files, found $KERNEL_COUNT. This can cause JIT compilation and OOM at runtime."
        else
          echo "Prebuild validation passed: $KERNEL_COUNT kernels compiled"
        fi
    # The registry password is handed over through the environment and stdin
    # instead of being interpolated into the command line, so it never shows
    # up in process arguments and cannot be re-interpreted by the shell.
    - name: Push Docker image
      if: ${{ !github.event.pull_request.head.repo.fork }}
      env:
        DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }}
      run: |
        IMAGE_TAG=rocm/aiter-ci:pre-build-${{ env.GITHUB_COMMIT_SHA }}
        printf '%s' "$DOCKER_PASSWORD" | docker login -u rocmshared --password-stdin
        docker push "$IMAGE_TAG"
    - name: Success message
      if: ${{ !github.event.pull_request.head.repo.fork }}
      run: |
        echo "Successfully prepared image: rocm/aiter-ci:pre-build-${{ env.GITHUB_COMMIT_SHA }}"
# Produces the per-shard test lists consumed by the `standard` job and
# publishes them as the `aiter_shards` artifact. The shard count (5) is fixed
# here and must stay in step with the `standard` matrix.
split_aiter_tests:
  needs: [check-signal, build_aiter_image]
  runs-on: ubuntu-latest
  if: ${{ !github.event.pull_request || github.event.pull_request.draft == false }}
  # Constant job output; nothing in this workflow file reads it.
  outputs:
    shard_count: 5
  steps:
    - uses: actions/checkout@v4
      name: Checkout code
    # The splitting logic lives in .github/scripts/split_tests.sh; the upload
    # step below expects it to leave aiter_shard_*.list files in the workspace.
    - name: Split Aiter Tests (5 shards)
      run: ./.github/scripts/split_tests.sh --shards 5 --test-type aiter
    - uses: actions/upload-artifact@v4
      name: Upload test shard lists as artifact
      with:
        path: aiter_shard_*.list
        name: aiter_shards
# Single-GPU test job: 5 shards on each of two runner pools (10 matrix
# entries). Each entry starts a container named `aiter_test`, runs
# .github/scripts/aiter_test.sh inside it, and uploads latest_test.log only
# when the entry succeeded -- `standard-test-finish` treats a missing log as a
# failed shard.
standard:
  if: ${{ !github.event.pull_request || github.event.pull_request.draft == false }}
  name: Standard Tests (1 GPU)
  needs: [build_aiter_image, split_aiter_tests]
  strategy:
    fail-fast: false
    matrix:
      include:
        - runner: linux-aiter-mi355-1
          label: MI355
          shard_total: 5
          shard_idx: 0
        - runner: linux-aiter-mi355-1
          label: MI355
          shard_total: 5
          shard_idx: 1
        - runner: linux-aiter-mi355-1
          label: MI355
          shard_total: 5
          shard_idx: 2
        - runner: linux-aiter-mi355-1
          label: MI355
          shard_total: 5
          shard_idx: 3
        - runner: linux-aiter-mi355-1
          label: MI355
          shard_total: 5
          shard_idx: 4
        - runner: aiter-1gpu-runner
          label: MI325
          shard_total: 5
          shard_idx: 0
        - runner: aiter-1gpu-runner
          label: MI325
          shard_total: 5
          shard_idx: 1
        - runner: aiter-1gpu-runner
          label: MI325
          shard_total: 5
          shard_idx: 2
        - runner: aiter-1gpu-runner
          label: MI325
          shard_total: 5
          shard_idx: 3
        - runner: aiter-1gpu-runner
          label: MI325
          shard_total: 5
          shard_idx: 4
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ github.event.pull_request.head.sha || github.sha }}
    # Best effort (`|| true`): a failed login does not stop the job. The
    # password is fed through stdin rather than placed on the command line.
    - name: Docker login
      env:
        DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }}
      run: |
        printf '%s' "$DOCKER_PASSWORD" | docker login -u rocmshared --password-stdin || true
    - name: Download test shard lists
      uses: actions/download-artifact@v4
      with:
        name: aiter_shards
    - name: List test shard files
      run: |
        ls -l aiter_shard_*.list
    # Publishes this shard's list as AITER_TEST for the following steps. The
    # delimiter form is used so a list spanning several lines is still written
    # to GITHUB_ENV correctly. The list itself is echoed because variables
    # written to GITHUB_ENV are not visible within the same step.
    - name: Export test file list for this shard as env
      id: set_shard_files
      run: |
        SHARD_FILES="$(cat "aiter_shard_${{ matrix.shard_idx }}.list")"
        {
          echo "AITER_TEST<<__AITER_SHARD_EOF__"
          echo "$SHARD_FILES"
          echo "__AITER_SHARD_EOF__"
        } >> "$GITHUB_ENV"
        echo "$SHARD_FILES"
    # Fork PRs have no prebuilt image (build_aiter_image skips its steps for
    # them), so they start from the base image instead.
    - name: Run the container
      run: |
        set -ex
        echo "Starting container: aiter_test"
        if [ -f "/etc/podinfo/gha-render-devices" ]; then
          DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
        else
          DEVICE_FLAG="--device /dev/dri"
        fi
        if [ "${{ github.event.pull_request.head.repo.fork }}" = "true" ]; then
          IMAGE_TAG=${{ env.DOCKER_IMAGE }}
        else
          IMAGE_TAG=rocm/aiter-ci:pre-build-${{ env.GITHUB_COMMIT_SHA }}
        fi
        # DEVICE_FLAG is intentionally unquoted: it holds whole `--device ...`
        # arguments that must be word-split.
        docker run -dt \
          --device=/dev/kfd $DEVICE_FLAG \
          --shm-size=16G \
          --network=host \
          --group-add $(getent group render | cut -d: -f3) \
          --group-add $(getent group video | cut -d: -f3) \
          -e AITER_TEST="${AITER_TEST}" \
          -v "${{ github.workspace }}:/workspace" \
          -w /workspace \
          --name aiter_test \
          $IMAGE_TAG
    - name: Setup Aiter for fork PR
      if: ${{ github.event.pull_request.head.repo.fork }}
      run: |
        set -ex
        git submodule sync && git submodule update --init --recursive --depth 1 --jobs 4
        echo "Setting up Aiter for fork PR..."
        docker exec \
          -w /workspace \
          aiter_test \
          bash -c "BUILD_TRITON=0 ./.github/scripts/build_aiter_triton.sh"
    - name: Show Aiter version
      run: |
        set -ex
        docker exec \
          -w /workspace \
          aiter_test \
          bash -c "pip show amd-aiter || true"
    # The fork path additionally sets MAX_JOBS=20; otherwise both branches run
    # the same script with the shard coordinates from the matrix.
    - name: Tests
      run: |
        set -ex
        if [ "${{ github.event.pull_request.head.repo.fork }}" = "true" ]; then
          docker exec \
            -w /workspace \
            aiter_test \
            bash -c "MAX_JOBS=20 SHARD_TOTAL=${{ matrix.shard_total }} SHARD_IDX=${{ matrix.shard_idx }} ./.github/scripts/aiter_test.sh"
        else
          docker exec \
            -w /workspace \
            aiter_test \
            bash -c "SHARD_TOTAL=${{ matrix.shard_total }} SHARD_IDX=${{ matrix.shard_idx }} ./.github/scripts/aiter_test.sh"
        fi
    - name: Collect test logs
      if: always()
      run: |
        echo "Collecting test logs..."
        echo "Aiter Operator Tests Summary:" >> $GITHUB_STEP_SUMMARY
        python3 ./.github/scripts/collect_logs.py latest_test.log >> $GITHUB_STEP_SUMMARY
    # Uploaded only on success: the presence of this artifact is the pass
    # signal that standard-test-finish looks for.
    - name: Upload test logs
      uses: actions/upload-artifact@v4
      if: success()
      with:
        name: standard-test-log-${{ matrix.runner }}-shard-${{ matrix.shard_idx }}
        path: latest_test.log
    - name: Cleanup container
      if: always()
      run: |
        docker rm -f aiter_test || true
# Aggregates the `standard` matrix into one result. A shard counts as passed
# only if its log artifact exists (the `standard` job uploads it on success
# only).
#
# `!cancelled()` is required: without a status-check function the job is
# skipped as soon as any `standard` entry fails, so the missing-log check
# below could never report a failure.
standard-test-finish:
  if: ${{ !cancelled() && !github.event.pull_request.draft }}
  name: Standard Test Results
  runs-on: ubuntu-latest
  needs: [standard]
  steps:
    # With `pattern`, each artifact is extracted into a directory named after
    # the artifact, which is the layout the check step expects.
    - name: Download all test logs
      uses: actions/download-artifact@v4
      with:
        pattern: standard-test-log-*-shard-*
        path: .
    # Informational only; must not abort before the per-shard report runs.
    - name: List test logs
      run: |
        ls -l standard-test-log-* || echo "No standard test logs were downloaded."
    # The shard range and runner names mirror the `standard` matrix and must
    # be kept in step with it.
    - name: Check Standard Test Results
      run: |
        set -ex
        echo "Checking Standard Test Results..."
        all_passed=true
        for shard in {0..4}; do
          for runner in {linux-aiter-mi355-1,aiter-1gpu-runner}; do
            # No `break` here, so every missing report is listed.
            if [ ! -f standard-test-log-${runner}-shard-${shard}/latest_test.log ]; then
              echo "Test report for ${runner} shard ${shard} not found."
              all_passed=false
            fi
          done
        done
        if [ "$all_passed" = true ]; then
          echo "All tests passed."
        else
          echo "Test failures or errors detected."
          exit 1
        fi
# 8-GPU test job, one entry per runner pool. Runs only when the workflow ref
# is `refs/heads/main`. It starts a container named `aiter_test`, runs
# .github/scripts/aiter_test.sh with MULTIGPU=TRUE, and always uploads the log
# and cleans up afterwards.
multi-gpu:
  name: Multi-GPU Tests (8 GPU)
  if: github.ref == 'refs/heads/main'
  needs: build_aiter_image
  strategy:
    fail-fast: false
    matrix:
      include:
        - runner: linux-aiter-mi355-8
          label: MI355
        - runner: aiter-8gpu-runner
          label: MI325
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
    # Best effort (`|| true`): a failed login does not stop the job. The
    # password is fed through stdin rather than placed on the command line.
    - name: Docker login
      env:
        DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }}
      run: |
        printf '%s' "$DOCKER_PASSWORD" | docker login -u rocmshared --password-stdin || true
    - name: Run the container
      run: |
        set -ex
        echo "Starting container: aiter_test"
        if [ -f "/etc/podinfo/gha-render-devices" ]; then
          DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
        else
          DEVICE_FLAG="--device /dev/dri"
        fi
        if [ "${{ github.event.pull_request.head.repo.fork }}" = "true" ]; then
          IMAGE_TAG=${{ env.DOCKER_IMAGE }}
        else
          IMAGE_TAG=rocm/aiter-ci:pre-build-${{ env.GITHUB_COMMIT_SHA }}
        fi
        # DEVICE_FLAG is intentionally unquoted: it holds whole `--device ...`
        # arguments that must be word-split.
        docker run -dt \
          --device=/dev/kfd $DEVICE_FLAG \
          --shm-size=16G \
          --network=host \
          --group-add $(getent group render | cut -d: -f3) \
          --group-add $(getent group video | cut -d: -f3) \
          -v "${{ github.workspace }}:/workspace" \
          -w /workspace \
          --name aiter_test \
          $IMAGE_TAG
    - name: Setup Aiter for fork PR
      if: ${{ github.event.pull_request.head.repo.fork }}
      run: |
        set -ex
        echo "Setting up Aiter for fork PR..."
        docker exec \
          -w /workspace \
          aiter_test \
          bash -c "BUILD_TRITON=0 ./.github/scripts/build_aiter_triton.sh"
    - name: Show Aiter version
      run: |
        set -ex
        docker exec \
          -w /workspace \
          aiter_test \
          bash -c "pip show amd-aiter || true"
    - name: Tests
      run: |
        set -ex
        docker exec \
          -e MULTIGPU=TRUE \
          -w /workspace \
          aiter_test \
          bash -c "./.github/scripts/aiter_test.sh"
    - name: Upload test logs
      uses: actions/upload-artifact@v4
      if: always()
      with:
        name: multigpu-test-${{ matrix.runner }}
        path: latest_test.log
    - name: Cleanup container
      if: always()
      run: |
        docker rm -f aiter_test || true
    - name: Clean up Rocm processes
      if: always()
      run: |
        ./.github/scripts/clean_up_rocm.sh