diff --git a/.azure-pipelines/docker/Dockerfile_xpu.devel b/.azure-pipelines/docker/Dockerfile_xpu.devel new file mode 100644 index 00000000000..982445501b0 --- /dev/null +++ b/.azure-pipelines/docker/Dockerfile_xpu.devel @@ -0,0 +1,50 @@ +# Copyright (C) 2026 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +ARG UBUNTU_VER=24.04 +FROM ubuntu:${UBUNTU_VER} + +# See http://bugs.python.org/issue19846 +ENV LANG C.UTF-8 + +RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \ + build-essential \ + ca-certificates \ + git \ + libomp-dev \ + numactl \ + time \ + wget \ + bc \ + jq \ + vim + +RUN apt-get install -y software-properties-common \ + && add-apt-repository -y ppa:kobuk-team/intel-graphics \ + && apt-get install -y libze-intel-gpu1 libze1 intel-metrics-discovery intel-opencl-icd clinfo intel-gsc \ + && apt-get install -y intel-media-va-driver-non-free libmfx-gen1 libvpl2 libvpl-tools libva-glx2 va-driver-all vainfo \ + && apt-get install -y libze-dev intel-ocloc \ + && apt-get install -y libze-intel-gpu-raytracing + +COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv +RUN echo '#!/bin/sh\nexec /usr/local/bin/uv pip "$@"' > /usr/local/bin/pip && \ + chmod +x /usr/local/bin/pip +ARG USER_ID=1000 +ARG GROUP_ID=1000 + +RUN groupadd -g ${GROUP_ID} hostgroup && \ + useradd -m -u ${USER_ID} -g ${GROUP_ID} hostuser && \ + groupadd -g 991 render && \ + usermod -aG render hostuser + +USER hostuser + +ENV PATH="/home/hostuser/.venv/bin:$PATH" +ENV VIRTUAL_ENV="/home/hostuser/.venv" +ENV UV_NO_PROGRESS=1 \ + UV_LINK_MODE=copy + +RUN uv venv --python=3.12 /home/hostuser/.venv +RUN which python && python --version + +WORKDIR /home/hostuser diff --git a/.azure-pipelines/scripts/install_nc.sh b/.azure-pipelines/scripts/install_nc.sh index 7ca9ed1707c..54bfa517607 100644 --- a/.azure-pipelines/scripts/install_nc.sh +++ b/.azure-pipelines/scripts/install_nc.sh @@ -1,25 +1,28 @@ #!/bin/bash - +set -x echo -e "##[group]Install Neural Compressor ... " cd /neural-compressor + if [[ $1 = *"3x_pt"* ]]; then - python -m pip install --no-cache-dir -r requirements_pt.txt - if [[ $1 = *"3x_pt_fp8"* ]]; then + pip install --no-cache-dir -r requirements_pt.txt + if [[ $1 = *"hpu"* ]]; then pip uninstall neural_compressor_3x_pt -y || true - python setup.py pt bdist_wheel + elif [[ $1 = *"xpu"* ]]; then + echo -e "\n Install torch XPU ... " + pip install torch==2.9.1 torchvision --index-url https://download.pytorch.org/whl/xpu + pip install auto-round-lib==0.10.2.1 # mapping torch and auto-round version else echo -e "\n Install torch CPU ... " pip install torch==2.9.1 torchvision --index-url https://download.pytorch.org/whl/cpu pip install auto-round-lib==0.10.2.1 # mapping torch and auto-round version - python -m pip install --no-cache-dir -r requirements.txt - python setup.py bdist_wheel fi + python setup.py pt bdist_wheel pip install --no-deps dist/neural_compressor*.whl --force-reinstall elif [[ $1 = *"3x_tf"* ]]; then pip install tensorflow==2.19.0 python -m pip install --no-cache-dir -r requirements.txt python -m pip install --no-cache-dir -r requirements_tf.txt - python setup.py bdist_wheel + python setup.py tf bdist_wheel pip install dist/neural_compressor*.whl --force-reinstall else python -m pip install --no-cache-dir -r requirements.txt @@ -27,6 +30,7 @@ else pip install dist/neural_compressor*.whl --force-reinstall fi -echo -e "\n pip list after install Neural Compressor ... " echo "##[endgroup]" + +echo -e "\n pip list after install Neural Compressor ... " pip list diff --git a/.azure-pipelines/scripts/models/run_pytorch_models_trigger.sh b/.azure-pipelines/scripts/models/run_pytorch_models_trigger.sh index 16e1f2c9189..e5b18f87e40 100644 --- a/.azure-pipelines/scripts/models/run_pytorch_models_trigger.sh +++ b/.azure-pipelines/scripts/models/run_pytorch_models_trigger.sh @@ -52,8 +52,7 @@ else TORCH_VISION_VERSION=${torchvision_version} fi - -/bin/bash run_model_trigger_common.sh \ +numactl --physcpubind="${NUMA_CPUSET:-0-15}" --membind="${NUMA_NODE:-0}" bash run_model_trigger_common.sh \ --yaml=${yaml} \ --framework=${FRAMEWORK} \ --fwk_ver=${FRAMEWORK_VERSION} \ diff --git a/.azure-pipelines/scripts/ut/collect_log.sh b/.azure-pipelines/scripts/ut/collect_log.sh deleted file mode 100644 index 9e34e7dac07..00000000000 --- a/.azure-pipelines/scripts/ut/collect_log.sh +++ /dev/null @@ -1,139 +0,0 @@ -source /neural-compressor/.azure-pipelines/scripts/change_color.sh - -pip install coverage -export COVERAGE_RCFILE=/neural-compressor/.azure-pipelines/scripts/ut/coverage.file -coverage_log="/neural-compressor/log_dir/coverage_log" -coverage_log_base="/neural-compressor/log_dir/coverage_log_base" -coverage_compare="/neural-compressor/log_dir/coverage_compare.html" -cd /neural-compressor/log_dir - -$BOLD_YELLOW && echo "##[group]collect coverage for PR branch" && $RESET -mkdir -p coverage_PR -cp ut_*_coverage/.coverage.* ./coverage_PR/ - -cd coverage_PR -coverage combine --keep --rcfile=${COVERAGE_RCFILE} -cp .coverage /neural-compressor/.coverage -cd /neural-compressor -coverage report -m --rcfile=${COVERAGE_RCFILE} | tee ${coverage_log} -coverage html -d log_dir/coverage_PR/htmlcov --rcfile=${COVERAGE_RCFILE} -coverage xml -o log_dir/coverage_PR/coverage.xml --rcfile=${COVERAGE_RCFILE} -ls -l log_dir/coverage_PR/htmlcov - -cd /neural-compressor -cp -r /neural-compressor/.azure-pipelines .azure-pipelines-pr -git config --global --add safe.directory /neural-compressor -git fetch -git checkout master -rm -rf build dist *egg-info -echo y | pip uninstall neural-compressor -cd /neural-compressor/.azure-pipelines-pr/scripts && bash install_nc.sh -echo "##[endgroup]" - -$BOLD_YELLOW && echo "##[group]collect coverage for baseline" && $RESET -coverage erase -cd /neural-compressor/log_dir -mkdir -p coverage_base -cp ut-base_*_coverage/.coverage.* ./coverage_base/ - -cd coverage_base -coverage combine --keep --rcfile=${COVERAGE_RCFILE} -cp .coverage /neural-compressor/.coverage -cd /neural-compressor -coverage report -m --rcfile=${COVERAGE_RCFILE} | tee ${coverage_log_base} -coverage html -d log_dir/coverage_base/htmlcov --rcfile=${COVERAGE_RCFILE} -coverage xml -o log_dir/coverage_base/coverage.xml --rcfile=${COVERAGE_RCFILE} -ls -l log_dir/coverage_base/htmlcov -echo "##[endgroup]" - -get_coverage_data() { - # Input argument - local coverage_xml="$1" - - # Get coverage data - local coverage_data=$(python3 -c "import xml.etree.ElementTree as ET; root = ET.parse('$coverage_xml').getroot(); print(ET.tostring(root).decode())") - if [[ -z "$coverage_data" ]]; then - echo "Failed to get coverage data from $coverage_xml." - exit 1 - fi - - # Get lines coverage - local lines_covered=$(echo "$coverage_data" | grep -o 'lines-covered="[0-9]*"' | cut -d '"' -f 2) - local lines_valid=$(echo "$coverage_data" | grep -o 'lines-valid="[0-9]*"' | cut -d '"' -f 2) - if [ $lines_valid == 0 ]; then - local lines_coverage=0 - else - local lines_coverage=$(awk "BEGIN {printf \"%.3f\", 100 * $lines_covered / $lines_valid}") - fi - - # Get branches coverage - local branches_covered=$(echo "$coverage_data" | grep -o 'branches-covered="[0-9]*"' | cut -d '"' -f 2) - local branches_valid=$(echo "$coverage_data" | grep -o 'branches-valid="[0-9]*"' | cut -d '"' -f 2) - if [ $branches_valid == 0 ]; then - local branches_coverage=0 - else - local branches_coverage=$(awk "BEGIN {printf \"%.3f\", 100 * $branches_covered/$branches_valid}") - fi - - # Return values - echo "$lines_covered $lines_valid $lines_coverage $branches_covered $branches_valid $branches_coverage" -} - -$BOLD_YELLOW && echo "compare coverage" && $RESET - -coverage_PR_xml="log_dir/coverage_PR/coverage.xml" -coverage_PR_data=$(get_coverage_data $coverage_PR_xml) -read lines_PR_covered lines_PR_valid coverage_PR_lines_rate branches_PR_covered branches_PR_valid coverage_PR_branches_rate <<<"$coverage_PR_data" - -coverage_base_xml="log_dir/coverage_base/coverage.xml" -coverage_base_data=$(get_coverage_data $coverage_base_xml) -read lines_base_covered lines_base_valid coverage_base_lines_rate branches_base_covered branches_base_valid coverage_base_branches_rate <<<"$coverage_base_data" - -$BOLD_BLUE && echo "PR lines coverage: $lines_PR_covered/$lines_PR_valid ($coverage_PR_lines_rate%)" && $RESET -$BOLD_BLUE && echo "PR branches coverage: $branches_PR_covered/$branches_PR_valid ($coverage_PR_branches_rate%)" && $RESET -$BOLD_BLUE && echo "BASE lines coverage: $lines_base_covered/$lines_base_valid ($coverage_base_lines_rate%)" && $RESET -$BOLD_BLUE && echo "BASE branches coverage: $branches_base_covered/$branches_base_valid ($coverage_base_branches_rate%)" && $RESET - -$BOLD_YELLOW && echo "clear upload path" && $RESET -rm -fr log_dir/coverage_PR/.coverage* -rm -fr log_dir/coverage_base/.coverage* -rm -fr log_dir/ut-coverage-* - -# Declare an array to hold failed items -declare -a fail_items=() - -if (( $(bc -l <<< "${coverage_PR_lines_rate}+0.05 < ${coverage_base_lines_rate}") )); then - fail_items+=("lines") -fi -if (( $(bc -l <<< "${coverage_PR_branches_rate}+0.05 < ${coverage_base_branches_rate}") )); then - fail_items+=("branches") -fi - -if [[ ${#fail_items[@]} -ne 0 ]]; then - fail_items_str=$( - IFS=', ' - echo "${fail_items[*]}" - ) - for item in "${fail_items[@]}"; do - case "$item" in - lines) - decrease=$(echo $(printf "%.3f" $(echo "$coverage_PR_lines_rate - $coverage_base_lines_rate" | bc -l))) - ;; - branches) - decrease=$(echo $(printf "%.3f" $(echo "$coverage_PR_branches_rate - $coverage_base_branches_rate" | bc -l))) - ;; - *) - echo "Unknown item: $item" - continue - ;; - esac - $BOLD_RED && echo "Unit Test failed with ${item} coverage decrease ${decrease}%" && $RESET - done - $BOLD_RED && echo "compare coverage to give detail info" && $RESET - bash /neural-compressor/.azure-pipelines-pr/scripts/ut/compare_coverage.sh ${coverage_compare} ${coverage_log} ${coverage_log_base} "FAILED" ${coverage_PR_lines_rate} ${coverage_base_lines_rate} ${coverage_PR_branches_rate} ${coverage_base_branches_rate} - exit 1 -else - $BOLD_GREEN && echo "Unit Test success with coverage lines: ${coverage_PR_lines_rate}%, branches: ${coverage_PR_branches_rate}%" && $RESET - $BOLD_GREEN && echo "compare coverage to give detail info" && $RESET - bash /neural-compressor/.azure-pipelines-pr/scripts/ut/compare_coverage.sh ${coverage_compare} ${coverage_log} ${coverage_log_base} "SUCCESS" ${coverage_PR_lines_rate} ${coverage_base_lines_rate} ${coverage_PR_branches_rate} ${coverage_base_branches_rate} -fi diff --git a/.azure-pipelines/scripts/ut/coverage.3x_pt_fp8 b/.azure-pipelines/scripts/ut/coverage.3x_pt_hpu similarity index 100% rename from .azure-pipelines/scripts/ut/coverage.3x_pt_fp8 rename to .azure-pipelines/scripts/ut/coverage.3x_pt_hpu diff --git a/.azure-pipelines/scripts/ut/coverage.file b/.azure-pipelines/scripts/ut/coverage.file deleted file mode 100644 index b9dea8ecb02..00000000000 --- a/.azure-pipelines/scripts/ut/coverage.file +++ /dev/null @@ -1,30 +0,0 @@ -[run] -branch = True - -[report] -omit = - */**/fake*yaml - */**/fake.py - */neural_compressor/model/nets_factory.py - */neural_compressor/benchmark.py - */neural_compressor/experimental/benchmark.py - */neural_compressor/contrib/strategy/tpe.py - */intel_extension_for_transformers/backends/* - */intel_extension_for_transformers/optimization/utils/get_throughput.py - */neural_compressor/adaptor/tf_utils/graph_rewriter/generic/fuse_decomposed_in.py - */neural_compressor/adaptor/tf_utils/quantize_graph/qdq/fuse_qdq_in.py - */neural_compressor/adaptor/tf_utils/graph_rewriter/int8/freeze_value.py - */neural_compressor/template/* - */neural_compressor/common/* - */neural_compressor/torch/* - */neural_compressor/tensorflow/* -exclude_lines = - pragma: no cover - raise NotImplementedError - raise TypeError - if self.device == "gpu": - if device == "gpu": - except ImportError: - except Exception as e: - onnx_version < ONNX18_VERSION - onnx_version >= ONNX18_VERSION diff --git a/.azure-pipelines/scripts/ut/run_3x_pt.sh b/.azure-pipelines/scripts/ut/run_3x_pt.sh index 6e01a70ea23..9be42675806 100644 --- a/.azure-pipelines/scripts/ut/run_3x_pt.sh +++ b/.azure-pipelines/scripts/ut/run_3x_pt.sh @@ -26,6 +26,8 @@ cd /neural-compressor/test || exit 1 rm -rf torch/algorithms/fp8_quant rm -rf torch/quantization/fp8_quant rm -rf torch/algorithms/fp8_quant_xpu +rm -rf torch/quantization/test_autoround_xpu.py +rm -rf torch/quantization/test_autoround_hpu.py LOG_DIR=/neural-compressor/log_dir mkdir -p ${LOG_DIR} @@ -37,7 +39,9 @@ numactl --physcpubind="${NUMA_CPUSET:-0-15}" --membind="${NUMA_NODE:-0}" bash ru cp report.html ${LOG_DIR}/ -if [ $(grep -c '== FAILURES ==' ${ut_log_name}) != 0 ] || [ $(grep -c '== ERRORS ==' ${ut_log_name}) != 0 ] || [ $(grep -c ' passed' ${ut_log_name}) == 0 ]; then +set -x +if [ $(grep -c '== FAILURES ==' ${ut_log_name}) != 0 ] || [ $(grep -c '== ERRORS ==' ${ut_log_name}) != 0 ] || \ +[ $(grep -c 'Killed' ${ut_log_name}) != 0 ] || [ $(grep -c 'core dumped' ${ut_log_name}) != 0 ] || [ $(grep -c ' passed' ${ut_log_name}) == 0 ]; then echo "Find errors in pytest case, please check the output..." echo "Please search for '== FAILURES ==' or '== ERRORS =='" exit 1 diff --git a/.azure-pipelines/scripts/ut/run_3x_pt_fp8.sh b/.azure-pipelines/scripts/ut/run_3x_pt_hpu.sh similarity index 87% rename from .azure-pipelines/scripts/ut/run_3x_pt_fp8.sh rename to .azure-pipelines/scripts/ut/run_3x_pt_hpu.sh index bb9950e1f9d..53feeebba88 100644 --- a/.azure-pipelines/scripts/ut/run_3x_pt_fp8.sh +++ b/.azure-pipelines/scripts/ut/run_3x_pt_hpu.sh @@ -23,16 +23,16 @@ pip install beautifulsoup4==4.13.5 echo "##[endgroup]" pip list -export COVERAGE_RCFILE=/neural-compressor/.azure-pipelines/scripts/ut/coverage.3x_pt_fp8 +export COVERAGE_RCFILE=/neural-compressor/.azure-pipelines/scripts/ut/coverage.3x_pt_hpu inc_path=$(python -c 'import neural_compressor; print(neural_compressor.__path__[0])') cd /neural-compressor/test || exit 1 LOG_DIR=/neural-compressor/log_dir mkdir -p ${LOG_DIR} -ut_log_name=${LOG_DIR}/ut_3x_pt_fp8.log +ut_log_name=${LOG_DIR}/ut_3x_pt_hpu.log pytest --cov="${inc_path}" -vs --disable-warnings --html=report_1.html --self-contained-html torch/quantization/weight_only/test_load.py 2>&1 | tee -a ${ut_log_name} pytest --cov="${inc_path}" -vs --disable-warnings --html=report_2.html --self-contained-html torch/quantization/weight_only/test_rtn.py 2>&1 | tee -a ${ut_log_name} -pytest --cov="${inc_path}" -vs --disable-warnings --html=report_3.html --self-contained-html torch/quantization/test_autoround.py 2>&1 | tee -a ${ut_log_name} +pytest --cov="${inc_path}" -vs --disable-warnings --html=report_3.html --self-contained-html torch/quantization/test_autoround_hpu.py 2>&1 | tee -a ${ut_log_name} # Below folder contains some special configuration for pytest so we need to enter the path and run it separately cd /neural-compressor/test/torch/algorithms/fp8_quant @@ -50,7 +50,9 @@ mkdir -p report && mv *.html report pytest_html_merger -i ./report -o ./report.html cp report.html ${LOG_DIR}/ -if [ $(grep -c '== FAILURES ==' ${ut_log_name}) != 0 ] || [ $(grep -c '== ERRORS ==' ${ut_log_name}) != 0 ] || [ $(grep -c ' passed' ${ut_log_name}) == 0 ]; then +set -x +if [ $(grep -c '== FAILURES ==' ${ut_log_name}) != 0 ] || [ $(grep -c '== ERRORS ==' ${ut_log_name}) != 0 ] || \ +[ $(grep -c 'Killed' ${ut_log_name}) != 0 ] || [ $(grep -c 'core dumped' ${ut_log_name}) != 0 ] || [ $(grep -c ' passed' ${ut_log_name}) == 0 ]; then echo "Find errors in pytest case, please check the output..." echo "Please search for '== FAILURES ==' or '== ERRORS =='" exit 1 diff --git a/.azure-pipelines/scripts/ut/run_3x_pt_xpu.sh b/.azure-pipelines/scripts/ut/run_3x_pt_xpu.sh new file mode 100644 index 00000000000..6577b9ebd3d --- /dev/null +++ b/.azure-pipelines/scripts/ut/run_3x_pt_xpu.sh @@ -0,0 +1,53 @@ +#!/bin/bash +python -c "import neural_compressor as nc" +test_case="run 3x Torch with XPU" +echo "${test_case}" + +echo "##[section]Run import check" +set -e +python -c "import neural_compressor.torch" +python -c "import neural_compressor.common" +echo "##[section]import check pass" + +echo "##[group]set up UT env..." +export LD_LIBRARY_PATH=${HOME}/.local/lib/:$LD_LIBRARY_PATH +uv pip install -r /neural-compressor/test/torch/requirements_xpu.txt +uv pip install pytest-cov pytest-html +uv pip list +echo "##[endgroup]" + +echo "##[group]check xpu env..." +echo "ZE_AFFINITY_MASK: ${ZE_AFFINITY_MASK}" +python - <<'PY' +import torch +print("torch:", torch.__version__) +print("xpu available:", torch.xpu.is_available()) +print("xpu count:", torch.xpu.device_count()) +PY +echo "##[endgroup]" + +export COVERAGE_RCFILE=/neural-compressor/.azure-pipelines/scripts/ut/coverage.3x_pt +inc_path=$(python -c 'import neural_compressor; print(neural_compressor.__path__[0])') +cd /neural-compressor/test || exit 1 + +LOG_DIR=/neural-compressor/log_dir +mkdir -p ${LOG_DIR} +ut_log_name=${LOG_DIR}/ut_3x_pt_xpu.log + +find ./torch -name "test_autoround_xpu.py" | sed "s,\.\/,python -m pytest --cov=\"${inc_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run_xpu.sh +cat run_xpu.sh +numactl --physcpubind="${NUMA_CPUSET:-0-27}" --membind="${NUMA_NODE:-0}" bash run_xpu.sh 2>&1 | tee ${ut_log_name} + +cp report.html ${LOG_DIR}/ + +set -x +if [ $(grep -c '== FAILURES ==' ${ut_log_name}) != 0 ] || [ $(grep -c '== ERRORS ==' ${ut_log_name}) != 0 ] || \ +[ $(grep -c 'Killed' ${ut_log_name}) != 0 ] || [ $(grep -c 'core dumped' ${ut_log_name}) != 0 ] || [ $(grep -c ' passed' ${ut_log_name}) == 0 ]; then + echo "##[error]Find errors in pytest case, please check the output..." + exit 1 +fi + +# if ut pass, collect the coverage file into artifacts +cp .coverage ${LOG_DIR}/.coverage + +echo "UT finished successfully! " \ No newline at end of file diff --git a/.azure-pipelines/scripts/ut/run_3x_tf.sh b/.azure-pipelines/scripts/ut/run_3x_tf.sh index 79cb71ecd81..c2eb07dd724 100644 --- a/.azure-pipelines/scripts/ut/run_3x_tf.sh +++ b/.azure-pipelines/scripts/ut/run_3x_tf.sh @@ -64,7 +64,9 @@ pytest_html_merger -i ./report -o ./report.html cp report.html ${LOG_DIR}/ -if [ $(grep -c '== FAILURES ==' ${ut_log_name}) != 0 ] || [ $(grep -c '== ERRORS ==' ${ut_log_name}) != 0 ] || [ $(grep -c ' passed' ${ut_log_name}) == 0 ]; then +set -x +if [ $(grep -c '== FAILURES ==' ${ut_log_name}) != 0 ] || [ $(grep -c '== ERRORS ==' ${ut_log_name}) != 0 ] || \ +[ $(grep -c 'Killed' ${ut_log_name}) != 0 ] || [ $(grep -c 'core dumped' ${ut_log_name}) != 0 ] || [ $(grep -c ' passed' ${ut_log_name}) == 0 ]; then echo "Find errors in pytest case, please check the output..." echo "Please search for '== FAILURES ==' or '== ERRORS =='" exit 1 diff --git a/.azure-pipelines/template/docker-template.yml b/.azure-pipelines/template/docker-template.yml index 521cac1eada..e9e805deda7 100644 --- a/.azure-pipelines/template/docker-template.yml +++ b/.azure-pipelines/template/docker-template.yml @@ -21,18 +21,6 @@ parameters: default: "build" steps: - - task: Bash@3 - inputs: - targetType: "inline" - script: | - docker ps -a - if [[ $(docker ps -a | grep -i '${{ parameters.containerName }}'$) ]]; then - docker start $(docker ps -aq --filter "name=${{ parameters.containerName }}") - echo "remove left files through container ..." - docker exec ${{ parameters.containerName }} bash -c "ls -a /neural-compressor && rm -fr /neural-compressor/* && rm -fr /neural-compressor/.* && ls -a /neural-compressor || true" - fi - displayName: "Docker workspace clean up" - - ${{ if eq(parameters.dockerConfigName, 'commonDockerConfig') }}: - script: | rm -fr ${BUILD_SOURCESDIRECTORY} || sudo rm -fr ${BUILD_SOURCESDIRECTORY} || true diff --git a/.azure-pipelines/template/model-template.yml b/.azure-pipelines/template/model-template.yml index 9c9de9ba796..1186e04865e 100644 --- a/.azure-pipelines/template/model-template.yml +++ b/.azure-pipelines/template/model-template.yml @@ -22,7 +22,8 @@ steps: containerName: ${{ parameters.modelContainerName }} - script: | - docker exec ${{ parameters.modelContainerName }} bash -c "cd /neural-compressor/.azure-pipelines/scripts/models \ + docker exec -e NUMA_NODE=${NUMA_NODE} -e NUMA_CPUSET=${NUMA_CPUSET} ${{ parameters.modelContainerName }} \ + bash -c "cd /neural-compressor/.azure-pipelines/scripts/models \ && bash run_${{ parameters.framework }}_models_trigger.sh --model=${{ parameters.modelName }} --mode='env_setup'" displayName: Env setup @@ -41,18 +42,20 @@ steps: displayName: "Download refer logs" - script: | - docker exec ${{ parameters.modelContainerName }} bash -c "cd /neural-compressor/.azure-pipelines/scripts/models \ + docker exec -e NUMA_NODE=${NUMA_NODE} -e NUMA_CPUSET=${NUMA_CPUSET} ${{ parameters.modelContainerName }} \ + bash -c "cd /neural-compressor/.azure-pipelines/scripts/models \ && bash run_${{ parameters.framework }}_models_trigger.sh --model=${{ parameters.modelName }} --mode='tuning'" displayName: Quantization - ${{ if ne(parameters.APIVersion, '3x') }}: - script: | - docker exec ${{ parameters.modelContainerName }} bash -c "cd /neural-compressor/.azure-pipelines/scripts/models \ + docker exec -e NUMA_NODE=${NUMA_NODE} -e NUMA_CPUSET=${NUMA_CPUSET} ${{ parameters.modelContainerName }} \ + bash -c "cd /neural-compressor/.azure-pipelines/scripts/models \ && bash run_${{ parameters.framework }}_models_trigger.sh --model=${{ parameters.modelName }} --mode='int8_benchmark' --USE_TUNE_ACC=$(USE_TUNE_ACC) --PERF_STABLE_CHECK=$(PERF_STABLE_CHECK)" displayName: INT8 Benchmark - script: | - docker exec ${{ parameters.modelContainerName }} bash -c "cd /neural-compressor/.azure-pipelines/scripts/models \ + docker exec -e NUMA_NODE=${NUMA_NODE} -e NUMA_CPUSET=${NUMA_CPUSET} ${{ parameters.modelContainerName }} bash -c "cd /neural-compressor/.azure-pipelines/scripts/models \ && bash run_${{ parameters.framework }}_models_trigger.sh --model=${{ parameters.modelName }} --mode='fp32_benchmark' --USE_TUNE_ACC=$(USE_TUNE_ACC) --PERF_STABLE_CHECK=$(PERF_STABLE_CHECK)" displayName: FP32 Benchmark diff --git a/.azure-pipelines/template/ut-template.yml b/.azure-pipelines/template/ut-template.yml index bb00c7ea3c0..dcdacb0d7fb 100644 --- a/.azure-pipelines/template/ut-template.yml +++ b/.azure-pipelines/template/ut-template.yml @@ -25,7 +25,7 @@ parameters: default: "coverage" - name: utContainerName type: string - default: "utTest" + default: "INCUnitTest" - name: imageSource type: string default: "build" @@ -42,7 +42,7 @@ steps: imageSource: ${{ parameters.imageSource }} - script: | - docker exec -e NUMA_NODE=${NUMA_NODE} -e NUMA_CPUSET=${NUMA_CPUSET} ${{ parameters.utContainerName }} \ + docker exec -e NUMA_NODE=${NUMA_NODE} -e NUMA_CPUSET=${NUMA_CPUSET} -e ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK} ${{ parameters.utContainerName }} \ bash -c "cd /neural-compressor/.azure-pipelines/scripts \ && bash install_nc.sh ${{ parameters.utScriptFileName }} \ && bash ut/${{ parameters.utScriptFileName }}.sh ${{ parameters.utTestMode }}" diff --git a/.azure-pipelines/ut-3x-pt-fp8.yml b/.azure-pipelines/ut-3x-pt-hpu.yml similarity index 90% rename from .azure-pipelines/ut-3x-pt-fp8.yml rename to .azure-pipelines/ut-3x-pt-hpu.yml index 09c3c1d1322..4ac3b709c01 100644 --- a/.azure-pipelines/ut-3x-pt-fp8.yml +++ b/.azure-pipelines/ut-3x-pt-hpu.yml @@ -8,11 +8,11 @@ pr: - master paths: include: - - .azure-pipelines/scripts/ut/run_3x_pt_fp8.sh + - .azure-pipelines/scripts/ut/run_3x_pt_hpu.sh - .azure-pipelines/scripts/install_nc.sh - .azure-pipelines/ut-3x-pt-fp8.yml - .azure-pipelines/template/docker-template.yml - - .azure-pipelines/scripts/ut/coverage.3x_pt_fp8 + - .azure-pipelines/scripts/ut/coverage.3x_pt_hpu - neural_compressor/common - neural_compressor/torch - neural_compressor/transformers @@ -30,7 +30,7 @@ variables: IMAGE_TAG: "py310" UPLOAD_PATH: $(Build.SourcesDirectory)/log_dir DOWNLOAD_PATH: $(Build.SourcesDirectory)/log_dir - ARTIFACT_NAME: "UT_coverage_report_3x_pt_fp8" + ARTIFACT_NAME: "UT_coverage_report_3x_pt_hpu" REPO: $(Build.Repository.Uri) stages: @@ -46,7 +46,7 @@ stages: parameters: imageSource: "pull" dockerConfigName: "commonDockerConfig" - utScriptFileName: "run_3x_pt_fp8" + utScriptFileName: "run_3x_pt_hpu" uploadPath: $(UPLOAD_PATH) utArtifact: "ut_3x" @@ -62,7 +62,7 @@ stages: parameters: imageSource: "pull" dockerConfigName: "gitCloneDockerConfig" - utScriptFileName: "run_3x_pt_fp8" + utScriptFileName: "run_3x_pt_hpu" uploadPath: $(UPLOAD_PATH) utArtifact: "ut_3x_baseline" @@ -90,7 +90,7 @@ stages: pip install -U pip setuptools python setup.py install pt cd ${BUILD_SOURCESDIRECTORY}/.azure-pipelines/scripts - bash ut/collect_log_3x.sh 3x_pt_fp8 + bash ut/collect_log_3x.sh 3x_pt_hpu displayName: "Collect UT Coverage" - task: PublishCodeCoverageResults@2 diff --git a/.azure-pipelines/ut-3x-pt-xpu.yml b/.azure-pipelines/ut-3x-pt-xpu.yml new file mode 100644 index 00000000000..5d7239d421b --- /dev/null +++ b/.azure-pipelines/ut-3x-pt-xpu.yml @@ -0,0 +1,46 @@ +trigger: none + +pr: + autoCancel: true + drafts: false + branches: + include: + - master + paths: + include: + - neural_compressor/common + - neural_compressor/torch + - test/torch/quantization/test_autoround_xpu.py + - setup.py + - requirements_pt.txt + - .azure-pipelines/ut-3x-pt-xpu.yml + - .azure-pipelines/template/docker-template.yml + - .azure-pipelines/scripts/install_nc.sh + - .azure-pipelines/scripts/ut/run_3x_pt_xpu.sh + +pool: B60 + +variables: + IMAGE_NAME: "neural-compressor" + IMAGE_TAG: "py312-xpu" + DOCKERFILE_NAME: "Dockerfile_xpu" + UPLOAD_PATH: $(Build.SourcesDirectory)/log_dir + REPO: $(Build.Repository.Uri) + +stages: + - stage: Torch + displayName: Unit Test 3x Torch + dependsOn: [] + jobs: + - job: + displayName: Unit Test 3x Torch + steps: + - template: template/ut-template.yml + parameters: + dockerConfigName: "commonDockerConfig" + dockerFileName: $(DOCKERFILE_NAME) + repoTag: "$(IMAGE_TAG)" + utScriptFileName: "run_3x_pt_xpu" + uploadPath: $(UPLOAD_PATH) + utArtifact: "ut_3x_xpu" + utContainerName: "INCUnitTest$(NODE_LABEL)" diff --git a/.azure-pipelines/ut-3x-pt.yml b/.azure-pipelines/ut-3x-pt.yml index 67701b53f92..24b53ad633c 100644 --- a/.azure-pipelines/ut-3x-pt.yml +++ b/.azure-pipelines/ut-3x-pt.yml @@ -47,7 +47,7 @@ stages: utScriptFileName: "run_3x_pt" uploadPath: $(UPLOAD_PATH) utArtifact: "ut_3x" - utContainerName: "utTest$(NODE_LABEL)" + utContainerName: "INCUnitTest$(NODE_LABEL)" - stage: Torch_baseline diff --git a/.azure-pipelines/ut-3x-tf.yml b/.azure-pipelines/ut-3x-tf.yml index 13a404bd465..ab13932e707 100644 --- a/.azure-pipelines/ut-3x-tf.yml +++ b/.azure-pipelines/ut-3x-tf.yml @@ -42,6 +42,7 @@ stages: utScriptFileName: "run_3x_tf" uploadPath: $(UPLOAD_PATH) utArtifact: "ut_3x" + utContainerName: "INCUnitTest$(NODE_LABEL)" - stage: TensorFlow_baseline displayName: Unit Test 3x TensorFlow baseline @@ -58,6 +59,7 @@ stages: uploadPath: $(UPLOAD_PATH) utArtifact: "ut_3x_baseline" repo: $(REPO) + utContainerName: "INCUnitTest$(NODE_LABEL)" - stage: Coverage displayName: "Coverage Compare" diff --git a/requirements_pt.txt b/requirements_pt.txt index 5f4518f9332..d908a1f87f1 100644 --- a/requirements_pt.txt +++ b/requirements_pt.txt @@ -1,4 +1,5 @@ numpy +packaging prettytable psutil py-cpuinfo diff --git a/test/torch/quantization/test_autoround.py b/test/torch/quantization/test_autoround_cpu.py similarity index 65% rename from test/torch/quantization/test_autoround.py rename to test/torch/quantization/test_autoround_cpu.py index 85be4b65cc6..3dafc565f39 100644 --- a/test/torch/quantization/test_autoround.py +++ b/test/torch/quantization/test_autoround_cpu.py @@ -6,44 +6,8 @@ import pytest import torch import transformers -from packaging.version import Version, parse -from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaConfig - - -@lru_cache(None) -def is_habana_framework_installed(): - """Check if Habana framework is installed. - - Only check for the habana_frameworks package without importing it to avoid - initializing lazy-mode-related components. - """ - from importlib.util import find_spec - - package_spec = find_spec("habana_frameworks") - return package_spec is not None - - -def set_hpu_torch_compile_envs(): - if not is_habana_framework_installed(): - return None - import torch._dynamo.config as dynamo_config - import torch._inductor.config as inductor_config - - os.environ["PT_HPU_LAZY_MODE"] = "0" - os.environ["PT_ENABLE_INT64_SUPPORT"] = "1" - inductor_config.force_disable_caches = True - dynamo_config.inline_inbuilt_nn_modules = True - - -# The `TestAutoRoundHPU` is expected to be run with `compile` mode, -# so set the HPU environment variables before importing INC. -if is_habana_framework_installed(): - set_hpu_torch_compile_envs() - - -def is_xpu_available(): - return torch.xpu.is_available() - +from packaging.version import Version +from transformers import AutoModelForCausalLM, AutoTokenizer from neural_compressor.torch.quantization import ( AutoRoundConfig, @@ -85,7 +49,6 @@ def run_fn(model, dataloader): model(data) -@pytest.mark.skipif(is_habana_framework_installed(), reason="These tests are not supported on HPU for now.") @pytest.mark.skipif(not auto_round_installed, reason="auto_round module is not installed") class TestAutoRoundCPU: @classmethod @@ -671,306 +634,3 @@ def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype): getattr(attn, "q_scale", None) is not None ), f"Missing q_scale in attention for scheme={scheme}, static_attention_dtype={static_attention_dtype}" shutil.rmtree(output_dir, ignore_errors=True) - - -@pytest.mark.skipif(not is_habana_framework_installed(), reason="Habana framework is not installed") -@pytest.mark.skipif(os.getenv("PT_HPU_LAZY_MODE", "0") == "1", reason="Lazy mode is enabled") -@pytest.mark.skipif(not auto_round_installed, reason="auto_round module is not installed") -class TestAutoRoundHPU: - @classmethod - def setup_class(self): - - model_name = "TheBloke/Llama-2-7B-Chat-GPTQ" - from neural_compressor.torch.algorithms.autoround import get_dataloader - - config = LlamaConfig(num_hidden_layers=2) - with transformers.modeling_utils.no_init_weights(): - self.tiny_llama_model = AutoModelForCausalLM.from_config(config=config) - - tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - self.dataloader = get_dataloader(tokenizer, 32, dataset_name="NeelNanda/pile-10k", seed=42, bs=8, nsamples=10) - self.inp = torch.ones([1, 10], dtype=torch.long) - self.label = self.tiny_llama_model(self.inp)[0] - - @classmethod - def teardown_class(self): - shutil.rmtree("saved_results", ignore_errors=True) - - def setup_method(self, method): - torch.compiler.reset() - logger.info(f"Running TestAutoRound test: {method.__name__}") - - @pytest.mark.skip(reason="Disabled, see JIRA: https://jira.habana-labs.com/browse/SW-227554") - def test_autoround_w4a8(self): - fp32_model = copy.deepcopy(self.tiny_llama_model) - quant_config = AutoRoundConfig( - nsamples=32, - seqlen=10, - iters=2, - scale_dtype="bf16", - dtype="fp8_to_int_sym", - act_bits=8, - act_group_size=-1, - act_dtype="fp8_sym", - act_dynamic=False, - ) - - quant_config.set_local("lm_head", AutoRoundConfig(dtype="fp32")) - logger.info(f"Test AutoRound with config {quant_config}") - - # prepare + convert API - model = prepare(model=fp32_model, quant_config=quant_config) - - run_fn(model, self.dataloader) - q_model = convert(model) - assert q_model is not None, "Quantization failed!" - # We quantize the model with compile mode, if we want to run the model directly, - # we need use the compile mode as well. - # We can use the lazy mode but need to restart the python process. - from neural_compressor.torch.algorithms.weight_only.save_load import load - - model = load( - model_name_or_path="temp_auto_round", - original_model=copy.deepcopy(self.tiny_llama_model), - device="hpu", - format="huggingface", - ) - print(f"loaded model {model}") - from neural_compressor.torch.algorithms.mixed_low_precision.modules import HPUMixedPrecisionLinear - - has_hpu_mixed_precision_module = False - for name, module in model.named_modules(): - if isinstance(module, HPUMixedPrecisionLinear): - has_hpu_mixed_precision_module = True - break - assert has_hpu_mixed_precision_module, "loading compressed model failed." - model.eval() - model = model.to(torch.bfloat16) - model = torch.compile(model, backend="hpu_backend") - out = model(self.inp.to("hpu"))[0] - print(f"out: {out}") - assert out is not None, "Loading compressed model failed." - - def test_quant_lm_head(self): - model = transformers.AutoModelForCausalLM.from_pretrained( - "optimum-intel-internal-testing/tiny-random-Phi3ForCausalLM" - ) - tokenizer = AutoTokenizer.from_pretrained( - "optimum-intel-internal-testing/tiny-random-Phi3ForCausalLM", trust_remote_code=True - ) - - quant_config = AutoRoundConfig( - tokenizer=tokenizer, - nsamples=32, - seqlen=10, - iters=1, - amp=False, - scale_dtype="fp32", - quant_lm_head=True, - group_size=32, - ) - logger.info(f"Test AutoRound with config {quant_config}") - text = "Replace me by any text you'd like." - encoded_input = tokenizer(text, return_tensors="pt") - model = prepare(model=model, quant_config=quant_config) - q_model = convert(model) - output = tokenizer.decode(q_model.generate(**encoded_input, max_new_tokens=10)[0]) - print(output) - assert output is not None - assert q_model.lm_head.__class__.__name__ in tagert_modules, "packing model failed." - - def test_int4_dtype(self): - fp32_model = copy.deepcopy(self.tiny_llama_model) - quant_config = AutoRoundConfig(dtype="int4", nsamples=32, seqlen=10, iters=1, amp=False, scale_dtype="fp32") - logger.info(f"Test AutoRound with config {quant_config}") - - # prepare + convert API - model = prepare(model=fp32_model, quant_config=quant_config) - - run_fn(model, self.dataloader) - q_model = convert(model) - _ = q_model(self.inp) # inference - assert q_model.model.layers[0].self_attn.k_proj.__class__.__name__ in tagert_modules, "packing model failed." - - def test_autoround_with_quantize_API(self): - fp32_model = copy.deepcopy(self.tiny_llama_model) - - quant_config = AutoRoundConfig(scheme="W4A16", seqlen=10, iters=1, use_sym=False, amp=False, scale_dtype="fp32") - logger.info(f"Test AutoRound with config {quant_config}") - - # quantize API - q_model = quantize( - model=fp32_model, - quant_config=quant_config, - run_fn=run_fn, - run_args=(self.dataloader,), - ) - _ = q_model(self.inp) # inference - tagert_modules = ["WQLinear_GEMM"] - assert q_model.model.layers[0].self_attn.k_proj.__class__.__name__ in tagert_modules, "packing model failed." - - -@pytest.mark.skipif(not is_xpu_available(), reason="These tests are not supported on XPU for now.") -@pytest.mark.skipif(not auto_round_installed, reason="auto_round module is not installed") -class TestAutoRoundGPU: - @pytest.mark.parametrize( - "scheme", ["W4A16", "W2A16", "W3A16", "W8A16", "MXFP4", "MXFP8", "NVFP4", "FPW8A16", "FP8_STATIC"] - ) - def test_scheme(self, scheme): - # INC API - from transformers import AutoModelForCausalLM, AutoTokenizer - - fp32_model = AutoModelForCausalLM.from_pretrained( - "facebook/opt-125m", - ) - inp = torch.ones([1, 10], dtype=torch.long) - tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m", trust_remote_code=True) - - output_dir = "./saved_inc" - quant_config = AutoRoundConfig( - tokenizer=tokenizer, - nsamples=32, - seqlen=10, - iters=1, - device_map="xpu", - scheme=scheme, - export_format="auto_round", - output_dir=output_dir, # default is "temp_auto_round" - ) - - # quantizer execute - model = prepare(model=fp32_model, quant_config=quant_config) - inc_model = convert(model) - if scheme in ["FPW8A16"]: # FPW8A16 loading not supported yet - return - inc_model = AutoModelForCausalLM.from_pretrained( - output_dir, - ) - out = inc_model(inp)[0] - - # AutoRound API - from transformers import AutoModelForCausalLM, AutoTokenizer - - fp32_model = transformers.AutoModelForCausalLM.from_pretrained( - "facebook/opt-125m", - ) - inp = torch.ones([1, 10], dtype=torch.long) - tokenizer = transformers.AutoTokenizer.from_pretrained("facebook/opt-125m", trust_remote_code=True) - from auto_round import AutoRound - - ar = AutoRound( - model=fp32_model, - tokenizer=tokenizer, - nsamples=32, - seqlen=10, - iters=1, - device_map="xpu", - scheme=scheme, - ) - quantized_model_path = "./saved_ar" - ar.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round") - model = AutoModelForCausalLM.from_pretrained( - quantized_model_path, - ) - tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) - out_ar = model(inp)[0] - assert torch.all(out_ar.eq(out)) - shutil.rmtree(output_dir, ignore_errors=True) - shutil.rmtree(quantized_model_path, ignore_errors=True) - - @pytest.mark.parametrize("format", ["auto_awq", "auto_gptq", "llm_compressor"]) - def test_format(self, format): - # INC API - scheme = "W4A16" if format != "llm_compressor" else "MXFP4" - from transformers import AutoModelForCausalLM, AutoTokenizer - - fp32_model = AutoModelForCausalLM.from_pretrained( - "facebook/opt-125m", - ) - inp = torch.ones([1, 10], dtype=torch.long) - tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m", trust_remote_code=True) - - output_dir = "./saved_inc" - quant_config = AutoRoundConfig( - tokenizer=tokenizer, - nsamples=32, - seqlen=10, - iters=1, - device_map="xpu", - scheme=scheme, - export_format=format, - output_dir=output_dir, # default is "temp_auto_round" - ) - - # quantizer execute - model = prepare(model=fp32_model, quant_config=quant_config) - inc_model = convert(model) - assert inc_model is not None - shutil.rmtree(output_dir, ignore_errors=True) - - def test_vlm_model(self): - # INC API - scheme = "W4A16" - model_name = "Qwen/Qwen2-VL-2B-Instruct" - from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer, Qwen2VLForConditionalGeneration - - fp32_model = Qwen2VLForConditionalGeneration.from_pretrained( - "Qwen/Qwen2-VL-2B-Instruct", - ) - tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", trust_remote_code=True) - from neural_compressor.torch.algorithms.autoround import get_mllm_dataloader - - processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True) - - output_dir = "./saved_inc" - quant_config = AutoRoundConfig( - tokenizer=tokenizer, - nsamples=1, - iters=1, - seqlen=10, - # quant_nontext_module=True, - processor=processor, - device_map="xpu:0", - scheme=scheme, - export_format="auto_round", - output_dir=output_dir, # default is "temp_auto_round" - ) - - # quantizer execute - model = prepare(model=fp32_model, quant_config=quant_config) - inc_model = convert(model) - inc_model = Qwen2VLForConditionalGeneration.from_pretrained( - output_dir, - ) - assert inc_model is not None - shutil.rmtree(output_dir, ignore_errors=True) - - def test_quant_lm_head(self): - # INC API - scheme = "W4A16" - model_name = "Qwen/Qwen3-8B" - from transformers import AutoModelForCausalLM, AutoTokenizer - - fp32_model = AutoModelForCausalLM.from_pretrained( - model_name, - ) - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - - output_dir = "./saved_inc" - quant_config = AutoRoundConfig( - tokenizer=tokenizer, - nsamples=1, - seqlen=10, - iters=0, # rtn - device_map="xpu", - scheme=scheme, - export_format="auto_round", - output_dir=output_dir, # default is "temp_auto_round" - quant_lm_head=True, - ) - - # quantizer execute - model = prepare(model=fp32_model, quant_config=quant_config) - inc_model = convert(model) - assert inc_model is not None - shutil.rmtree(output_dir, ignore_errors=True) diff --git a/test/torch/quantization/test_autoround_hpu.py b/test/torch/quantization/test_autoround_hpu.py new file mode 100644 index 00000000000..0b2039ee736 --- /dev/null +++ b/test/torch/quantization/test_autoround_hpu.py @@ -0,0 +1,209 @@ +import copy +import os +import shutil +from functools import lru_cache + +import pytest +import torch +import transformers +from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaConfig + + +@lru_cache(None) +def is_habana_framework_installed(): + """Check if Habana framework is installed. + + Only check for the habana_frameworks package without importing it to avoid + initializing lazy-mode-related components. + """ + from importlib.util import find_spec + + package_spec = find_spec("habana_frameworks") + return package_spec is not None + + +def set_hpu_torch_compile_envs(): + if not is_habana_framework_installed(): + return None + import torch._dynamo.config as dynamo_config + import torch._inductor.config as inductor_config + + os.environ["PT_HPU_LAZY_MODE"] = "0" + os.environ["PT_ENABLE_INT64_SUPPORT"] = "1" + inductor_config.force_disable_caches = True + dynamo_config.inline_inbuilt_nn_modules = True + + +# The `TestAutoRoundHPU` is expected to be run with `compile` mode, +# so set the HPU environment variables before importing INC. +if is_habana_framework_installed(): + set_hpu_torch_compile_envs() + + +from neural_compressor.torch.quantization import ( + AutoRoundConfig, + convert, + prepare, + quantize, +) +from neural_compressor.torch.utils import logger + +torch.backends.__allow_nonbracketed_mutation_flag = True + +try: + import auto_round + + auto_round_installed = True +except ImportError: + auto_round_installed = False + + +tagert_modules = ["QuantLinear", "QuantLinearGPTQ", "QuantLinearAWQ"] + + +@torch.no_grad() +def run_fn(model, dataloader): + for data in dataloader: + if isinstance(data, tuple) or isinstance(data, list): + model(*data) + elif isinstance(data, dict): + model(**data) + else: + model(data) + + +@pytest.mark.skipif(not is_habana_framework_installed(), reason="Habana framework is not installed") +@pytest.mark.skipif(os.getenv("PT_HPU_LAZY_MODE", "0") == "1", reason="Lazy mode is enabled") +@pytest.mark.skipif(not auto_round_installed, reason="auto_round module is not installed") +class TestAutoRoundHPU: + @classmethod + def setup_class(cls): + + model_name = "TheBloke/Llama-2-7B-Chat-GPTQ" + from neural_compressor.torch.algorithms.autoround import get_dataloader + + config = LlamaConfig(num_hidden_layers=2) + with transformers.modeling_utils.no_init_weights(): + cls.tiny_llama_model = AutoModelForCausalLM.from_config(config=config) + + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + cls.dataloader = get_dataloader(tokenizer, 32, dataset_name="NeelNanda/pile-10k", seed=42, bs=8, nsamples=10) + cls.inp = torch.ones([1, 10], dtype=torch.long) + cls.label = cls.tiny_llama_model(cls.inp)[0] + + @classmethod + def teardown_class(cls): + shutil.rmtree("saved_results", ignore_errors=True) + + def setup_method(self, method): + torch.compiler.reset() + logger.info(f"Running TestAutoRound test: {method.__name__}") + + @pytest.mark.skip(reason="Disabled, see JIRA: https://jira.habana-labs.com/browse/SW-227554") + def test_autoround_w4a8(self): + fp32_model = copy.deepcopy(self.tiny_llama_model) + quant_config = AutoRoundConfig( + nsamples=32, + seqlen=10, + iters=2, + scale_dtype="bf16", + dtype="fp8_to_int_sym", + act_bits=8, + act_group_size=-1, + act_dtype="fp8_sym", + act_dynamic=False, + ) + + quant_config.set_local("lm_head", AutoRoundConfig(dtype="fp32")) + logger.info(f"Test AutoRound with config {quant_config}") + + # prepare + convert API + model = prepare(model=fp32_model, quant_config=quant_config) + + run_fn(model, self.dataloader) + q_model = convert(model) + assert q_model is not None, "Quantization failed!" + # We quantize the model with compile mode, if we want to run the model directly, + # we need use the compile mode as well. + # We can use the lazy mode but need to restart the python process. + from neural_compressor.torch.algorithms.weight_only.save_load import load + + model = load( + model_name_or_path="temp_auto_round", + original_model=copy.deepcopy(self.tiny_llama_model), + device="hpu", + format="huggingface", + ) + print(f"loaded model {model}") + from neural_compressor.torch.algorithms.mixed_low_precision.modules import HPUMixedPrecisionLinear + + has_hpu_mixed_precision_module = False + for name, module in model.named_modules(): + if isinstance(module, HPUMixedPrecisionLinear): + has_hpu_mixed_precision_module = True + break + assert has_hpu_mixed_precision_module, "loading compressed model failed." + model.eval() + model = model.to(torch.bfloat16) + model = torch.compile(model, backend="hpu_backend") + out = model(self.inp.to("hpu"))[0] + print(f"out: {out}") + assert out is not None, "Loading compressed model failed." + + def test_quant_lm_head(self): + model = transformers.AutoModelForCausalLM.from_pretrained( + "optimum-intel-internal-testing/tiny-random-Phi3ForCausalLM" + ) + tokenizer = AutoTokenizer.from_pretrained( + "optimum-intel-internal-testing/tiny-random-Phi3ForCausalLM", trust_remote_code=True + ) + + quant_config = AutoRoundConfig( + tokenizer=tokenizer, + nsamples=32, + seqlen=10, + iters=1, + amp=False, + scale_dtype="fp32", + quant_lm_head=True, + group_size=32, + ) + logger.info(f"Test AutoRound with config {quant_config}") + text = "Replace me by any text you'd like." + encoded_input = tokenizer(text, return_tensors="pt") + model = prepare(model=model, quant_config=quant_config) + q_model = convert(model) + output = tokenizer.decode(q_model.generate(**encoded_input, max_new_tokens=10)[0]) + print(output) + assert output is not None + assert q_model.lm_head.__class__.__name__ in tagert_modules, "packing model failed." + + def test_int4_dtype(self): + fp32_model = copy.deepcopy(self.tiny_llama_model) + quant_config = AutoRoundConfig(dtype="int4", nsamples=32, seqlen=10, iters=1, amp=False, scale_dtype="fp32") + logger.info(f"Test AutoRound with config {quant_config}") + + # prepare + convert API + model = prepare(model=fp32_model, quant_config=quant_config) + + run_fn(model, self.dataloader) + q_model = convert(model) + _ = q_model(self.inp) # inference + assert q_model.model.layers[0].self_attn.k_proj.__class__.__name__ in tagert_modules, "packing model failed." + + def test_autoround_with_quantize_API(self): + fp32_model = copy.deepcopy(self.tiny_llama_model) + + quant_config = AutoRoundConfig(scheme="W4A16", seqlen=10, iters=1, use_sym=False, amp=False, scale_dtype="fp32") + logger.info(f"Test AutoRound with config {quant_config}") + + # quantize API + q_model = quantize( + model=fp32_model, + quant_config=quant_config, + run_fn=run_fn, + run_args=(self.dataloader,), + ) + _ = q_model(self.inp) # inference + tagert_modules = ["WQLinear_GEMM"] + assert q_model.model.layers[0].self_attn.k_proj.__class__.__name__ in tagert_modules, "packing model failed." diff --git a/test/torch/quantization/test_autoround_xpu.py b/test/torch/quantization/test_autoround_xpu.py new file mode 100644 index 00000000000..babd06a4009 --- /dev/null +++ b/test/torch/quantization/test_autoround_xpu.py @@ -0,0 +1,203 @@ +import shutil + +import pytest +import torch +import transformers + + +def is_xpu_available(): + return torch.xpu.is_available() + + +from neural_compressor.torch.quantization import ( + AutoRoundConfig, + convert, + prepare, +) + +torch.backends.__allow_nonbracketed_mutation_flag = True + +try: + import auto_round + + auto_round_installed = True +except ImportError: + auto_round_installed = False + + +tagert_modules = ["QuantLinear", "QuantLinearGPTQ", "QuantLinearAWQ"] + + +@torch.no_grad() +def run_fn(model, dataloader): + for data in dataloader: + if isinstance(data, tuple) or isinstance(data, list): + model(*data) + elif isinstance(data, dict): + model(**data) + else: + model(data) + + +@pytest.mark.skipif(not is_xpu_available(), reason="XPU is not available") +@pytest.mark.skipif(not auto_round_installed, reason="auto_round module is not installed") +class TestAutoRoundXPU: + @pytest.mark.parametrize( + "scheme", ["W4A16", "W2A16", "W3A16", "W8A16", "MXFP4", "MXFP8", "NVFP4", "FPW8A16", "FP8_STATIC"] + ) + def test_scheme(self, scheme): + # INC API + from transformers import AutoModelForCausalLM, AutoTokenizer + + fp32_model = AutoModelForCausalLM.from_pretrained( + "facebook/opt-125m", + ) + inp = torch.ones([1, 10], dtype=torch.long) + tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m", trust_remote_code=True) + + output_dir = "./saved_inc" + quant_config = AutoRoundConfig( + tokenizer=tokenizer, + nsamples=32, + seqlen=10, + iters=1, + device_map="xpu", + scheme=scheme, + export_format="auto_round", + output_dir=output_dir, # default is "temp_auto_round" + ) + + # quantizer execute + model = prepare(model=fp32_model, quant_config=quant_config) + convert(model) + if scheme in ["FPW8A16"]: # FPW8A16 loading not supported yet + return + inc_model = AutoModelForCausalLM.from_pretrained( + output_dir, + ) + out = inc_model(inp)[0] + + # AutoRound API + from transformers import AutoModelForCausalLM, AutoTokenizer + + fp32_model = transformers.AutoModelForCausalLM.from_pretrained( + "facebook/opt-125m", + ) + inp = torch.ones([1, 10], dtype=torch.long) + tokenizer = transformers.AutoTokenizer.from_pretrained("facebook/opt-125m", trust_remote_code=True) + from auto_round import AutoRound + + ar = AutoRound( + model=fp32_model, + tokenizer=tokenizer, + nsamples=32, + seqlen=10, + iters=1, + device_map="xpu", + scheme=scheme, + ) + quantized_model_path = "./saved_ar" + ar.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round") + model = AutoModelForCausalLM.from_pretrained( + quantized_model_path, + ) + out_ar = model(inp)[0] + assert torch.all(out_ar.eq(out)) + shutil.rmtree(output_dir, ignore_errors=True) + shutil.rmtree(quantized_model_path, ignore_errors=True) + + @pytest.mark.parametrize("format", ["auto_awq", "auto_gptq", "llm_compressor"]) + def test_format(self, format): + # INC API + scheme = "W4A16" if format != "llm_compressor" else "MXFP4" + from transformers import AutoModelForCausalLM, AutoTokenizer + + fp32_model = AutoModelForCausalLM.from_pretrained( + "facebook/opt-125m", + ) + tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m", trust_remote_code=True) + + output_dir = "./saved_inc" + quant_config = AutoRoundConfig( + tokenizer=tokenizer, + nsamples=32, + seqlen=10, + iters=1, + device_map="xpu", + scheme=scheme, + export_format=format, + output_dir=output_dir, # default is "temp_auto_round" + ) + + # quantizer execute + model = prepare(model=fp32_model, quant_config=quant_config) + inc_model = convert(model) + assert inc_model is not None + shutil.rmtree(output_dir, ignore_errors=True) + + def test_vlm_model(self): + # INC API + scheme = "W4A16" + model_name = "Qwen/Qwen2-VL-2B-Instruct" + from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer, Qwen2VLForConditionalGeneration + + fp32_model = Qwen2VLForConditionalGeneration.from_pretrained( + "Qwen/Qwen2-VL-2B-Instruct", + ) + tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", trust_remote_code=True) + from neural_compressor.torch.algorithms.autoround import get_mllm_dataloader + + processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True) + + output_dir = "./saved_inc" + quant_config = AutoRoundConfig( + tokenizer=tokenizer, + nsamples=1, + iters=1, + seqlen=10, + # quant_nontext_module=True, + processor=processor, + device_map="xpu:0", + scheme=scheme, + export_format="auto_round", + output_dir=output_dir, # default is "temp_auto_round" + ) + + # quantizer execute + model = prepare(model=fp32_model, quant_config=quant_config) + convert(model) + inc_model = Qwen2VLForConditionalGeneration.from_pretrained( + output_dir, + ) + assert inc_model is not None + shutil.rmtree(output_dir, ignore_errors=True) + + def test_quant_lm_head(self): + # INC API + scheme = "W4A16" + model_name = "Qwen/Qwen3-8B" + from transformers import AutoModelForCausalLM, AutoTokenizer + + fp32_model = AutoModelForCausalLM.from_pretrained( + model_name, + ) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + + output_dir = "./saved_inc" + quant_config = AutoRoundConfig( + tokenizer=tokenizer, + nsamples=1, + seqlen=10, + iters=0, # rtn + device_map="xpu", + scheme=scheme, + export_format="auto_round", + output_dir=output_dir, # default is "temp_auto_round" + quant_lm_head=True, + ) + + # quantizer execute + model = prepare(model=fp32_model, quant_config=quant_config) + inc_model = convert(model) + assert inc_model is not None + shutil.rmtree(output_dir, ignore_errors=True) diff --git a/test/torch/requirements.txt b/test/torch/requirements.txt index 9c4c989f2b7..d8524dc7efb 100644 --- a/test/torch/requirements.txt +++ b/test/torch/requirements.txt @@ -1,4 +1,4 @@ -auto-round @ git+https://github.com/intel/auto-round.git@v0.10.1rc +auto-round @ git+https://github.com/intel/auto-round.git@main auto-round-lib compressed-tensors datasets diff --git a/test/torch/requirements_xpu.txt b/test/torch/requirements_xpu.txt new file mode 100644 index 00000000000..6a7670de8c5 --- /dev/null +++ b/test/torch/requirements_xpu.txt @@ -0,0 +1,3 @@ +auto-round @ git+https://github.com/intel/auto-round.git@main +auto-round-lib +compressed-tensors