Skip to content

[CI] Ensure proper exit code handling in coverage build step #2909

[CI] Ensure proper exit code handling in coverage build step

[CI] Ensure proper exit code handling in coverage build step #2909

Workflow file for this run

name: CI-H
on:
pull_request:
types: [opened, synchronize]
branches: [develop, release/**]
permissions: read-all
concurrency:
group: ${{ github.event.pull_request.number }}-${{ github.workflow }}
cancel-in-progress: true
env:
PR_ID: ${{ github.event.pull_request.number }}
COMMIT_ID: ${{ github.event.pull_request.head.sha }}
TASK: paddle-CI-${{ github.event.pull_request.number }}-coverage
ci_scripts: /paddle/ci
BRANCH: ${{ github.base_ref }}
work_dir: /paddle
PADDLE_ROOT: /paddle
GIT_PR_ID: ${{ github.event.pull_request.number }}
CI_name: h-coverage
CFS_DIR: /home/data/cfs
no_proxy: "bcebos.com,apiin.im.baidu.com,gitee.com,aliyun.com,.baidu.com,.tuna.tsinghua.edu.cn"
defaults:
run:
shell: bash
jobs:
clone:
name: Coverage clone
uses: ./.github/workflows/_Clone-linux.yml
with:
workflow-name: "coverage"
clone_dir: h-ci
build:
name: Coverage build
needs: [clone]
if: needs.clone.outputs.can-skip != 'true'
runs-on:
group: GZ_BD-CPU
outputs:
can-skip: ${{ steps.check-bypass.outputs.can-skip }}
steps:
- name: Check docker image and run container
env:
CACHE_DIR: "/root/.cache/coverage"
CCACHE_DIR: "/home/data/shared/.ccache/l1" # L1 cache on machine shared dir
CCACHE_SECONDARY_STORAGE: "file:///home/data/cfs/.ccache/l2" # L2 cache on cfs
FLAGS_fraction_of_gpu_memory_to_use: 0.15
CTEST_PARALLEL_LEVEL: 2
WITH_GPU: "ON"
CUDA_ARCH_NAME: Hopper
WITH_AVX: "ON"
PADDLE_VERSION: 0.0.0
CUDA_VISIBLE_DEVICES: 0,1
WITH_DISTRIBUTE: "ON"
LITE_GIT_TAG: develop
WITH_UNITY_BUILD: "ON"
WITH_FA_BUILD_WITH_CACHE: "ON"
PY_VERSION: "3.10"
INFERENCE_DEMO_INSTALL_DIR: /root/.cache/coverage
CCACHE_MAXSIZE: 50G
CCACHE_LIMIT_MULTIPLE: 0.8
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
container_name=${TASK}-build-$(date +%Y%m%d-%H%M%S)
echo "container_name=${container_name}" >> ${{ github.env }}
docker_image=ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:cuda129-coverage-test
docker run -d -t --name ${container_name} \
-v "/home/data/cfs:/home/data/cfs" \
-v "/home/data/cfs/.cache:/root/.cache" \
-v "/home/data/shared:/home/data/shared" \
-v "/dev/shm:/dev/shm" \
-v ${{ github.workspace }}/../../..:${{ github.workspace }}/../../.. \
-v ${{ github.workspace }}:/paddle \
-e CI_name \
-e BRANCH \
-e PR_ID \
-e COMMIT_ID \
-e work_dir \
-e PADDLE_ROOT \
-e GIT_PR_ID \
-e CACHE_DIR \
-e CCACHE_DIR \
-e CCACHE_SECONDARY_STORAGE \
-e ci_scripts \
-e FLAGS_fraction_of_gpu_memory_to_use \
-e CTEST_PARALLEL_LEVEL \
-e WITH_GPU \
-e CUDA_ARCH_NAME \
-e WITH_AVX \
-e PADDLE_VERSION \
-e WITH_DISTRIBUTE \
-e LITE_GIT_TAG \
-e WITH_UNITY_BUILD \
-e WITH_FA_BUILD_WITH_CACHE \
-e PY_VERSION \
-e INFERENCE_DEMO_INSTALL_DIR \
-e CCACHE_MAXSIZE \
-e CCACHE_LIMIT_MULTIPLE \
-e GITHUB_TOKEN \
-e GITHUB_API_TOKEN \
-e CFS_DIR \
-e no_proxy \
-w /paddle --network host ${docker_image}
- name: Download paddle.tar.gz and update test branch
run: |
docker exec -t ${{ env.container_name }} /bin/bash -c '
rm -rf * .[^.]*
set -e
echo "Downloading Paddle.tar.gz"
wget -q --tries=5 --no-proxy https://paddle-github-action.bj.bcebos.com/PR/h-ci/${PR_ID}/${COMMIT_ID}/Paddle.tar.gz --no-check-certificate
echo "Extracting Paddle.tar.gz"
tar -xf Paddle.tar.gz --strip-components=1
rm Paddle.tar.gz
git config --global --add safe.directory "*"
git remote -v
set +e
git remote add upstream https://github.com/PaddlePaddle/Paddle.git
set -e
git config pull.rebase false
git checkout test
echo "Pull upstream $BRANCH"
source ${{ github.workspace }}/../../../proxy
bash ci/git_pull.sh $BRANCH
git submodule update
'
- name: Check bypass
id: check-bypass
uses: ./.github/actions/check-bypass
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
workflow-name: h-ci
- name: Build
if: steps.check-bypass.outputs.can-skip != 'true'
run: |
docker exec -t ${{ env.container_name }} /bin/bash -c '
flashattn_version=$(git submodule status|grep flashattn|awk "{print \$1}"|sed "s#-##g")
echo flashattn_version:$flashattn_version
url="https://xly-devops.bj.bcebos.com/gpups/flash-attention/cu90/flashattn_libs_${flashattn_version}.tar"
echo url:$url
url_return=`curl -s -o /dev/null -w "%{http_code}" $url`
if [ "$url_return" != "200" ];then
echo "flashattn cache not found, please contact umiswing"
exit 7
fi
mkdir -p ${CFS_DIR}/.cache/coverage
mkdir -p ${CFS_DIR}/.ccache/coverage
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12.9/compat
source ${{ github.workspace }}/../../../proxy
bash ${ci_scripts}/cmake-predownload.sh
pip install -r python/requirements.txt
mkdir -p build && cd build
ccache -z
cmake .. -DPY_VERSION=3.10 -DWITH_GPU=ON -DWITH_DISTRIBUTE=ON -DWITH_TESTING=ON -DCUDA_ARCH_NAME=Manual -DCUDA_ARCH_BIN="90" -DFA_JOB_POOLS_COMPILE=1 -DWITH_CUDNN_FRONTEND=ON -DON_INFER=OFF -DWITH_NVSHMEM=ON
make -j20
EXIT_CODE=$?
ccache -s
exit $EXIT_CODE
'
- name: Clean up env
if: steps.check-bypass.outputs.can-skip != 'true'
run: |
docker exec -t ${{ env.container_name }} /bin/bash -c '
source ~/.bashrc
source ${ci_scripts}/utils.sh; clean_build_files
rm -rf $(find . -name "*.a")
rm -rf $(find . -name "*.o")
rm -rf lib.linux-x86_64-3.9
find ./ -name "eager_generator" -or -name "kernel_signature_generator" -or -name "eager_legacy_op_function_generator" | xargs rm -rf
rm -rf ./python/build/lib.linux-x86_64-3.9/
cd "${work_dir}/build/third_party" && find $(ls | grep -v "dlpack" | grep -v "install" | grep -v "eigen3" | grep -v "gflags") -type f ! -name "*.so" -a ! -name "libdnnl.so*" -delete
cd /
tar --use-compress-program="pzstd -1" -cf Paddle.tar.gz paddle
'
- name: Upload coverage product
if: steps.check-bypass.outputs.can-skip != 'true'
env:
home_path: ${{ github.workspace }}/..
bos_file: ${{ github.workspace }}/../bos_retry/BosClient.py
paddle_whl: paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
run: |
docker exec -t ${{ env.container_name }} /bin/bash -c '
echo "::group::Install bce-python-sdk"
python -m pip install bce-python-sdk==0.8.74
echo "::endgroup::"
export AK=paddle
export SK=paddle
if [ ! -f "${{ env.bos_file }}" ]; then
wget -q --no-proxy -O ${{ env.home_path }}/bos_retry.tar.gz https://xly-devops.bj.bcebos.com/home/bos_retry.tar.gz --no-check-certificate
mkdir ${{ env.home_path }}/bos_retry
tar xf ${{ env.home_path }}/bos_retry.tar.gz -C ${{ env.home_path }}/bos_retry
fi
cd /paddle
mv /Paddle.tar.gz .
cp ./build/python/dist/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl .
echo "Uploading Paddle.tar.gz"
python ${{ env.bos_file }} Paddle.tar.gz paddle-github-action/PR/h-coverage/${{ env.PR_ID }}/${{ env.COMMIT_ID }}
echo "Uploading coverage wheel"
python ${{ env.bos_file }} ${{ env.paddle_whl }} paddle-github-action/PR/h-coverage/${{ env.PR_ID }}/${{ env.COMMIT_ID }}
echo "End Upload"
'
- name: Terminate and delete the container
if: ${{ steps.check-bypass.outputs.can-skip != 'true' && always() }}
run: |
set +e
docker exec -t ${{ env.container_name }} /bin/bash -c 'rm -rf * .[^.]*'
docker stop ${{ env.container_name }}
docker rm ${{ env.container_name }}
test:
name: Coverage test
needs: [build]
if: needs.build.outputs.can-skip != 'true'
runs-on:
group: H-Coverage
steps:
- name: Determine the runner
run: |
runner_name=`(echo $PWD|awk -F '/' '{print $3}')`
echo $runner_name
wget -q https://xly-devops.bj.bcebos.com/utils.sh
source utils.sh
determine_gpu_runner ${runner_name}
- name: Check docker image and run container
env:
CACHE_DIR: "/root/.cache/coverage"
CCACHE_DIR: "/root/.ccache/coverage"
FLAGS_fraction_of_gpu_memory_to_use: 0.15
CTEST_PARALLEL_LEVEL: 2
WITH_GPU: "ON"
CUDA_ARCH_NAME: Hopper
WITH_AVX: "ON"
COVERALLS_UPLOAD: "ON"
PADDLE_VERSION: 0.0.0
WITH_DISTRIBUTE: "ON"
WITH_UNITY_BUILD: "ON"
PY_VERSION: "3.10"
WITH_SHARED_PHI: "ON"
GPU_DEVICES: ${{ env.GPU_DEVICES }}
WITH_CINN: "ON"
INFERENCE_DEMO_INSTALL_DIR: /root/.cache/coverage
CCACHE_MAXSIZE: 200G
CCACHE_LIMIT_MULTIPLE: 0.8
FLAGS_PIR_OPTEST: "TRUE"
ON_INFER: "ON"
COVERAGE_FILE: ${{ github.workspace }}/build/python-coverage.data
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
container_name=${TASK}-$(date +%Y%m%d-%H%M%S)
echo "container_name=${container_name}" >> ${{ github.env }}
docker_image=ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:cuda129-coverage-test
docker run -d -t --gpus "\"device=${GPU_DEVICES}\"" --name ${container_name} \
-v "/home/data/cfs:/home/data/cfs" \
-v "/home/data/cfs/.cache:/root/.cache" \
-v "/home/data/cfs/.ccache:/root/.ccache" \
-v "/dev/shm:/dev/shm" \
-v ${{ github.workspace }}/../../..:${{ github.workspace }}/../../.. \
-v ${{ github.workspace }}:/paddle \
-e CI_name \
-e BRANCH \
-e PR_ID \
-e COMMIT_ID \
-e work_dir \
-e PADDLE_ROOT \
-e GIT_PR_ID \
-e CACHE_DIR \
-e CCACHE_DIR \
-e ci_scripts \
-e FLAGS_fraction_of_gpu_memory_to_use \
-e CTEST_PARALLEL_LEVEL \
-e WITH_GPU \
-e CUDA_ARCH_NAME \
-e WITH_AVX \
-e WITH_COVERAGE \
-e COVERALLS_UPLOAD \
-e PADDLE_VERSION \
-e WITH_DISTRIBUTE \
-e WITH_UNITY_BUILD \
-e PY_VERSION \
-e WITH_SHARED_PHI \
-e WITH_CINN \
-e INFERENCE_DEMO_INSTALL_DIR \
-e CCACHE_MAXSIZE \
-e CCACHE_LIMIT_MULTIPLE \
-e FLAGS_PIR_OPTEST \
-e ON_INFER \
-e COVERAGE_FILE \
-e GITHUB_TOKEN \
-e GITHUB_API_TOKEN \
-e CFS_DIR \
-e no_proxy \
-w /paddle --network host ${docker_image}
- name: Download paddle.tar.gz and update test branch
run: |
docker exec -t ${{ env.container_name }} /bin/bash -c '
rm -rf * .[^.]*
set -e
echo "Downloading Paddle.tar.gz from cfs"
wget -q --tries=5 --no-proxy https://paddle-github-action.bj.bcebos.com/PR/h-coverage/${PR_ID}/${COMMIT_ID}/Paddle.tar.gz --no-check-certificate
echo "Extracting Paddle.tar.gz"
tar --use-compress-program="pzstd -1" -xf Paddle.tar.gz --strip-components=1
rm Paddle.tar.gz
'
- name: Test
id: unit_test
run: |
docker exec -t ${{ env.container_name }} /bin/bash -c '
source ${{ github.workspace }}/../../../proxy
pip install build//python/dist/*.whl --no-deps
pip install -r python/unittest_py/requirements.txt
bash $ci_scripts/h-test.sh
'
- name: FA Test
if: (success() || failure()) && steps.unit_test.conclusion != 'skipped'
run: |
docker exec -t ${{ env.container_name }} /bin/bash -c '
source ${{ github.workspace }}/../../../proxy
cd test/test_flashmask_ci
bash run.sh
'
- name: Terminate and delete the container
if: always()
run: |
set +e
rm Paddle.tar.gz
docker exec -t ${{ env.container_name }} /bin/bash -c 'rm -rf * .[^.]*'
docker stop ${{ env.container_name }}
docker rm ${{ env.container_name }}
fleet_single_card_test:
name: Fleet Unit test (single card)
needs: [build]
if: needs.build.outputs.can-skip != 'true'
runs-on:
group: Fleet-H-single-card
env:
PIP_CACHE_DIR: /root/.cache/pip
CACHE_DIR: /root/.cache
TASK: paddle-fleet-CI-${{ github.event.pull_request.number }}-single-card-test
steps:
- name: Determine the runner
run: |
gpu_id=$(( $(echo $PWD | awk -F'/' '{print $3}' | awk -F'-' '{print $2}') + 3 ))
echo GPU_DEVICES="$gpu_id" >> $GITHUB_ENV
- name: Check docker image and run container
env:
GPU_DEVICES: ${{ env.GPU_DEVICES }}
docker_image: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:cuda129-coverage-test"
run: |
container_name=${TASK}-$(date +%Y%m%d-%H%M%S)
echo "container_name=${container_name}" >> ${{ github.env }}
docker pull $docker_image
docker run -d -t --name ${container_name} --gpus "\"device=${GPU_DEVICES}\"" --shm-size=32G \
-v "/dev/shm:/dev/shm" \
-v ${{ github.workspace }}/../../..:${{ github.workspace }}/../../.. \
-v ${{ github.workspace }}/../../../proxy:/root/proxy \
-v /ssd1/paddle-1/action_cache:/root/.cache \
-v ${{ github.workspace }}:/paddle \
-e BRANCH \
-e PR_ID \
-e COMMIT_ID \
-e PADDLE_ROOT \
-e ci_scripts \
-e CACHE_DIR \
-e no_proxy \
-e CI_name \
-e PIP_CACHE_DIR \
-e work_dir \
-e GITHUB_SHA="${{ github.event.pull_request.head.sha }}" \
-e GITHUB_HEAD_REF="${{ github.head_ref }}" \
-e GITHUB_BASE_SHA="${{ github.event.pull_request.base.sha }}" \
-e GITHUB_REPO_NAME="${{ github.repository }}" \
-e GITHUB_EVENT_PULL_REQUEST_NUMBER="${{ github.event.pull_request.number }}" \
-e GITHUB_TOKEN="${{ secrets.GITHUB_TOKEN }}" \
-e GITHUB_RUN_ID="${{ github.run_id }}" \
-w /paddle --network host ${docker_image}
- name: Clone PaddleFleet
run: |
docker exec -t ${{ env.container_name }} /bin/bash -ce '
rm -rf * .[^.]*
source /root/proxy
git clone https://github.com/PaddlePaddle/PaddleFleet.git .
git config --global --add safe.directory /paddle
git config user.name "PaddleCI"
git config user.email "paddle_ci@example.com"
git config pull.rebase false
mkdir -p /root/.cache/pip
pip cache dir
echo "Install uv"
pip install uv
echo "uv sync"
git submodule update --init --recursive
uv sync --group ci -v > /dev/null
'
- name: Download paddle.tar.gz and install paddle whl
run: |
docker exec -t ${{ env.container_name }} /bin/bash -c '
set -e
mkdir -p /PaddlePaddle
cd /PaddlePaddle
echo "Downloading Paddle.tar.gz from cfs"
wget -q --tries=5 --no-proxy https://paddle-github-action.bj.bcebos.com/PR/h-coverage/${PR_ID}/${COMMIT_ID}/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --no-check-certificate
source /root/proxy
source /paddle/.venv/bin/activate
export UV_SKIP_WHEEL_FILENAME_CHECK=1 #This environment variable allows installing the latest commit-level whl package of Paddle.
export UV_NO_SYNC=1 # This environment variable prevents uv sync from being executed when running un run.
export UV_HTTP_TIMEOUT=300
uv pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --force-reinstall
'
- name: Single card test
run: |
docker exec -t ${{ env.container_name }} /bin/bash -xce '
pwd
source .venv/bin/activate
export UV_SKIP_WHEEL_FILENAME_CHECK=1 #This environment variable allows installing the latest commit-level whl package of Paddle.
export UV_NO_SYNC=1 # This environment variable prevents uv sync from being executed when running un run.
export UV_HTTP_TIMEOUT=300
python -c "import paddle; print(paddle.version.commit)"
bash ci/single_card_test.sh
single_card_exit_code=$?
if [[ "$single_card_exit_code" != "0" ]]; then
echo -e "::error:: \033[31mSingle card test failed.\033[0m"
exit 1
else
echo -e "\033[32mSingle card test succeeded.\033[0m"
fi
'
- name: Terminate and delete the container
if: ${{ always() }}
run: |
set +e
docker exec -t ${{ env.container_name }} /bin/bash -c 'rm -rf * .[^.]*'
docker rm -f ${{ env.container_name }}
fleet-multi-card_test:
name: Fleet Unit test (multi-card)
needs: [build]
if: needs.build.outputs.can-skip != 'true'
runs-on:
group: Fleet-H-multi-card
env:
PIP_CACHE_DIR: /root/.cache/pip
TASK: paddle-fleet-CI-${{ github.event.pull_request.number }}-multi-card_test
docker_image: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:cuda129-coverage-test"
steps:
- name: Check docker image and run container
run: |
container_name=${TASK}-$(date +%Y%m%d-%H%M%S)
echo "container_name=${container_name}" >> ${{ github.env }}
docker pull $docker_image
docker run -d -t --gpus all --name ${container_name} \
-v "/dev/shm:/dev/shm" \
-v ${{ github.workspace }}/../../..:${{ github.workspace }}/../../.. \
-v ${{ github.workspace }}/../../../proxy:/root/proxy \
-v ${{ github.workspace }}/../../../.cache:/root/.cache \
-v ${{ github.workspace }}:/paddle \
-e BRANCH \
-e PR_ID \
-e COMMIT_ID \
-e PADDLE_ROOT \
-e ci_scripts \
-e CACHE_DIR \
-e no_proxy \
-e CI_name \
-e PIP_CACHE_DIR \
-e work_dir \
-e GITHUB_SHA="${{ github.event.pull_request.head.sha }}" \
-e GITHUB_HEAD_REF="${{ github.head_ref }}" \
-e GITHUB_BASE_SHA="${{ github.event.pull_request.base.sha }}" \
-e GITHUB_REPO_NAME="${{ github.repository }}" \
-e GITHUB_EVENT_NAME="${{ github.event_name }}" \
-e GITHUB_EVENT_PULL_REQUEST_NUMBER="${{ github.event.pull_request.number }}" \
-e GITHUB_TOKEN="${{ secrets.GITHUB_TOKEN }}" \
-e GITHUB_RUN_ID="${{ github.run_id }}" \
-w /paddle --network host ${docker_image}
- name: Clone PaddleFleet
run: |
docker exec -t ${{ env.container_name }} /bin/bash -ce '
rm -rf * .[^.]*
source /root/proxy
git clone https://github.com/PaddlePaddle/PaddleFleet.git .
git config --global --add safe.directory /paddle
git config user.name "PaddleCI"
git config user.email "paddle_ci@example.com"
git config pull.rebase false
mkdir -p /root/.cache/pip
pip cache dir
echo "Install uv"
pip install uv
echo "uv sync"
git submodule update --init --recursive
uv sync --group ci -v > /dev/null
wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /usr/local/bin/yq
chmod +x /usr/local/bin/yq
'
- name: Download paddle.tar.gz and install paddle whl
run: |
docker exec -t ${{ env.container_name }} /bin/bash -c '
set -e
mkdir -p /PaddlePaddle
cd /PaddlePaddle
echo "Downloading Paddle.tar.gz from cfs"
wget -q --tries=5 --no-proxy https://paddle-github-action.bj.bcebos.com/PR/h-coverage/${PR_ID}/${COMMIT_ID}/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --no-check-certificate
source /root/proxy
source /paddle/.venv/bin/activate
export UV_SKIP_WHEEL_FILENAME_CHECK=1 #This environment variable allows installing the latest commit-level whl package of Paddle.
export UV_NO_SYNC=1 # This environment variable prevents uv sync from being executed when running un run.
export UV_HTTP_TIMEOUT=300
uv pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --force-reinstall
'
- name: Multi-card test
run: |
docker exec -t ${{ env.container_name }} /bin/bash -ce '
source /paddle/.venv/bin/activate
export PYTHONPATH=$(pwd)
python -c "import paddle; print(paddle.version.commit)"
export UV_SKIP_WHEEL_FILENAME_CHECK=1 #This environment variable allows installing the latest commit-level whl package of Paddle.
export UV_NO_SYNC=1 # This environment variable prevents uv sync from being executed when running un run.
export UV_HTTP_TIMEOUT=300
bash ci/multi-card_test.sh
multi_card_exit_code=$?
if [[ "$multi_card_exit_code" != "0" ]]; then
echo -e "::error:: \033[31mMulti card test failed.\033[0m"
exit 1
else
echo -e "\033[32mMulti card test succeeded.\033[0m"
fi
'
- name: Terminate and delete the container
if: ${{ always() }}
run: |
set +e
docker exec -t ${{ env.container_name }} /bin/bash -c 'rm -rf * .[^.]*'
docker rm -f ${{ env.container_name }}