diff --git a/.azure-pipelines/docker/Dockerfile_xpu.devel b/.azure-pipelines/docker/Dockerfile_xpu.devel
new file mode 100644
index 00000000000..982445501b0
--- /dev/null
+++ b/.azure-pipelines/docker/Dockerfile_xpu.devel
@@ -0,0 +1,77 @@
+# Copyright (C) 2026 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Development image for the XPU unit-test pipeline: Ubuntu plus the Intel GPU
+# user-space packages, uv as the Python package manager, and an unprivileged
+# user that owns a Python 3.12 virtual environment.
+
+ARG UBUNTU_VER=24.04
+# Tag of the uv image the binary is copied from; override with a fixed version for reproducible builds.
+ARG UV_VERSION=latest
+
+# The uv image is exposed as a named stage so its tag can be driven by a build arg in FROM.
+FROM ghcr.io/astral-sh/uv:${UV_VERSION} AS uv
+
+FROM ubuntu:${UBUNTU_VER}
+
+# Build-time only: keep apt/debconf from prompting during package installation.
+ARG DEBIAN_FRONTEND=noninteractive
+
+# See http://bugs.python.org/issue19846
+ENV LANG=C.UTF-8
+
+# Base build and benchmarking tools.
+RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
+    bc \
+    build-essential \
+    ca-certificates \
+    git \
+    jq \
+    libomp-dev \
+    numactl \
+    time \
+    vim \
+    wget \
+    && rm -rf /var/lib/apt/lists/*
+
+# Intel GPU user-space packages from the kobuk-team PPA. The package index is
+# refreshed in this same layer so a cached earlier layer cannot leave it stale.
+RUN apt-get update \
+    && apt-get install -y software-properties-common \
+    && add-apt-repository -y ppa:kobuk-team/intel-graphics \
+    && apt-get install -y libze-intel-gpu1 libze1 intel-metrics-discovery intel-opencl-icd clinfo intel-gsc \
+    && apt-get install -y intel-media-va-driver-non-free libmfx-gen1 libvpl2 libvpl-tools libva-glx2 va-driver-all vainfo \
+    && apt-get install -y libze-dev intel-ocloc \
+    && apt-get install -y libze-intel-gpu-raytracing \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY --from=uv /uv /usr/local/bin/uv
+# Shim so that scripts calling plain `pip` are routed to `uv pip`.
+# printf is used because whether echo interprets "\n" depends on the shell.
+RUN printf '#!/bin/sh\nexec /usr/local/bin/uv pip "$@"\n' > /usr/local/bin/pip && \
+    chmod +x /usr/local/bin/pip
+
+ARG USER_ID=1000
+ARG GROUP_ID=1000
+# GID given to the render group that hostuser is added to.
+ARG RENDER_GID=991
+
+# Recent Ubuntu base images (e.g. 24.04) ship a default "ubuntu" account with
+# UID/GID 1000, which collides with the default USER_ID/GROUP_ID; drop it when present.
+RUN if id -u ubuntu >/dev/null 2>&1; then userdel -r ubuntu; fi && \
+    groupadd -g ${GROUP_ID} hostgroup && \
+    useradd -m -u ${USER_ID} -g ${GROUP_ID} hostuser && \
+    groupadd -g ${RENDER_GID} render && \
+    usermod -aG render hostuser
+
+USER hostuser
+
+ENV PATH="/home/hostuser/.venv/bin:$PATH"
+ENV VIRTUAL_ENV="/home/hostuser/.venv"
+ENV UV_NO_PROGRESS=1 \
+    UV_LINK_MODE=copy
+
+RUN uv venv --python=3.12 /home/hostuser/.venv
+RUN which python && python --version
+
+WORKDIR /home/hostuser
diff --git a/.azure-pipelines/scripts/install_nc.sh b/.azure-pipelines/scripts/install_nc.sh
index 7ca9ed1707c..54bfa517607 100644
--- a/.azure-pipelines/scripts/install_nc.sh
+++ b/.azure-pipelines/scripts/install_nc.sh
@@ -1,25 +1,28 @@
 #!/bin/bash
-
+set -x
 echo -e "##[group]Install Neural Compressor ... "
 cd /neural-compressor
+
 if [[ $1 = *"3x_pt"* ]]; then
-    python -m pip install --no-cache-dir -r requirements_pt.txt
-    if [[ $1 = *"3x_pt_fp8"* ]]; then
+    pip install --no-cache-dir -r requirements_pt.txt
+    if [[ $1 = *"hpu"* ]]; then
         pip uninstall neural_compressor_3x_pt -y || true
-        python setup.py pt bdist_wheel
+    elif [[ $1 = *"xpu"* ]]; then
+        echo -e "\n Install torch XPU ... "
+        pip install torch==2.9.1 torchvision --index-url https://download.pytorch.org/whl/xpu
+        pip install auto-round-lib==0.10.2.1 # mapping torch and auto-round version
     else
         echo -e "\n Install torch CPU ... "
         pip install torch==2.9.1 torchvision --index-url https://download.pytorch.org/whl/cpu
         pip install auto-round-lib==0.10.2.1 # mapping torch and auto-round version
-        python -m pip install --no-cache-dir -r requirements.txt
-        python setup.py bdist_wheel
     fi
+    python setup.py pt bdist_wheel
     pip install --no-deps dist/neural_compressor*.whl --force-reinstall
 elif [[ $1 = *"3x_tf"* ]]; then
 	pip install tensorflow==2.19.0
     python -m pip install --no-cache-dir -r requirements.txt
     python -m pip install --no-cache-dir -r requirements_tf.txt
-    python setup.py bdist_wheel
+    python setup.py tf bdist_wheel
     pip install dist/neural_compressor*.whl --force-reinstall
 else
     python -m pip install --no-cache-dir -r requirements.txt
@@ -27,6 +30,7 @@ else
     pip install dist/neural_compressor*.whl --force-reinstall
 fi
 
-echo -e "\n pip list after install Neural Compressor ... "
 echo "##[endgroup]"
+
+echo -e "\n pip list after install Neural Compressor ... "
 pip list
diff --git a/.azure-pipelines/scripts/models/run_pytorch_models_trigger.sh b/.azure-pipelines/scripts/models/run_pytorch_models_trigger.sh
index 16e1f2c9189..e5b18f87e40 100644
--- a/.azure-pipelines/scripts/models/run_pytorch_models_trigger.sh
+++ b/.azure-pipelines/scripts/models/run_pytorch_models_trigger.sh
@@ -52,8 +52,7 @@ else
     TORCH_VISION_VERSION=${torchvision_version}
 fi
 
-
-/bin/bash run_model_trigger_common.sh \
+numactl --physcpubind="${NUMA_CPUSET:-0-15}" --membind="${NUMA_NODE:-0}" bash run_model_trigger_common.sh \
     --yaml=${yaml} \
     --framework=${FRAMEWORK} \
     --fwk_ver=${FRAMEWORK_VERSION} \
diff --git a/.azure-pipelines/scripts/ut/collect_log.sh b/.azure-pipelines/scripts/ut/collect_log.sh
deleted file mode 100644
index 9e34e7dac07..00000000000
--- a/.azure-pipelines/scripts/ut/collect_log.sh
+++ /dev/null
@@ -1,139 +0,0 @@
-source /neural-compressor/.azure-pipelines/scripts/change_color.sh
-
-pip install coverage
-export COVERAGE_RCFILE=/neural-compressor/.azure-pipelines/scripts/ut/coverage.file
-coverage_log="/neural-compressor/log_dir/coverage_log"
-coverage_log_base="/neural-compressor/log_dir/coverage_log_base"
-coverage_compare="/neural-compressor/log_dir/coverage_compare.html"
-cd /neural-compressor/log_dir
-
-$BOLD_YELLOW && echo "##[group]collect coverage for PR branch" && $RESET
-mkdir -p coverage_PR
-cp ut_*_coverage/.coverage.* ./coverage_PR/
-
-cd coverage_PR
-coverage combine --keep --rcfile=${COVERAGE_RCFILE}
-cp .coverage /neural-compressor/.coverage
-cd /neural-compressor
-coverage report -m --rcfile=${COVERAGE_RCFILE} | tee ${coverage_log}
-coverage html -d log_dir/coverage_PR/htmlcov --rcfile=${COVERAGE_RCFILE}
-coverage xml -o log_dir/coverage_PR/coverage.xml --rcfile=${COVERAGE_RCFILE}
-ls -l log_dir/coverage_PR/htmlcov
-
-cd /neural-compressor
-cp -r /neural-compressor/.azure-pipelines .azure-pipelines-pr
-git config --global --add safe.directory /neural-compressor
-git fetch
-git checkout master
-rm -rf build dist *egg-info
-echo y | pip uninstall neural-compressor
-cd /neural-compressor/.azure-pipelines-pr/scripts && bash install_nc.sh
-echo "##[endgroup]"
-
-$BOLD_YELLOW && echo "##[group]collect coverage for baseline" && $RESET
-coverage erase
-cd /neural-compressor/log_dir
-mkdir -p coverage_base
-cp ut-base_*_coverage/.coverage.* ./coverage_base/
-
-cd coverage_base
-coverage combine --keep --rcfile=${COVERAGE_RCFILE}
-cp .coverage /neural-compressor/.coverage
-cd /neural-compressor
-coverage report -m --rcfile=${COVERAGE_RCFILE} | tee ${coverage_log_base}
-coverage html -d log_dir/coverage_base/htmlcov --rcfile=${COVERAGE_RCFILE}
-coverage xml -o log_dir/coverage_base/coverage.xml --rcfile=${COVERAGE_RCFILE}
-ls -l log_dir/coverage_base/htmlcov
-echo "##[endgroup]"
-
-get_coverage_data() {
-    # Input argument
-    local coverage_xml="$1"
-
-    # Get coverage data
-    local coverage_data=$(python3 -c "import xml.etree.ElementTree as ET; root = ET.parse('$coverage_xml').getroot(); print(ET.tostring(root).decode())")
-    if [[ -z "$coverage_data" ]]; then
-        echo "Failed to get coverage data from $coverage_xml."
-        exit 1
-    fi
-
-    # Get lines coverage
-    local lines_covered=$(echo "$coverage_data" | grep -o 'lines-covered="[0-9]*"' | cut -d '"' -f 2)
-    local lines_valid=$(echo "$coverage_data" | grep -o 'lines-valid="[0-9]*"' | cut -d '"' -f 2)
-    if [ $lines_valid == 0 ]; then
-        local lines_coverage=0
-    else
-        local lines_coverage=$(awk "BEGIN {printf \"%.3f\", 100 * $lines_covered / $lines_valid}")
-    fi
-
-    # Get branches coverage
-    local branches_covered=$(echo "$coverage_data" | grep -o 'branches-covered="[0-9]*"' | cut -d '"' -f 2)
-    local branches_valid=$(echo "$coverage_data" | grep -o 'branches-valid="[0-9]*"' | cut -d '"' -f 2)
-    if [ $branches_valid == 0 ]; then
-        local branches_coverage=0
-    else
-        local branches_coverage=$(awk "BEGIN {printf \"%.3f\", 100 * $branches_covered/$branches_valid}")
-    fi
-
-    # Return values
-    echo "$lines_covered $lines_valid $lines_coverage $branches_covered $branches_valid $branches_coverage"
-}
-
-$BOLD_YELLOW && echo "compare coverage" && $RESET
-
-coverage_PR_xml="log_dir/coverage_PR/coverage.xml"
-coverage_PR_data=$(get_coverage_data $coverage_PR_xml)
-read lines_PR_covered lines_PR_valid coverage_PR_lines_rate branches_PR_covered branches_PR_valid coverage_PR_branches_rate <<<"$coverage_PR_data"
-
-coverage_base_xml="log_dir/coverage_base/coverage.xml"
-coverage_base_data=$(get_coverage_data $coverage_base_xml)
-read lines_base_covered lines_base_valid coverage_base_lines_rate branches_base_covered branches_base_valid coverage_base_branches_rate <<<"$coverage_base_data"
-
-$BOLD_BLUE && echo "PR lines coverage: $lines_PR_covered/$lines_PR_valid ($coverage_PR_lines_rate%)" && $RESET
-$BOLD_BLUE && echo "PR branches coverage: $branches_PR_covered/$branches_PR_valid ($coverage_PR_branches_rate%)" && $RESET
-$BOLD_BLUE && echo "BASE lines coverage: $lines_base_covered/$lines_base_valid ($coverage_base_lines_rate%)" && $RESET
-$BOLD_BLUE && echo "BASE branches coverage: $branches_base_covered/$branches_base_valid ($coverage_base_branches_rate%)" && $RESET
-
-$BOLD_YELLOW && echo "clear upload path" && $RESET
-rm -fr log_dir/coverage_PR/.coverage*
-rm -fr log_dir/coverage_base/.coverage*
-rm -fr log_dir/ut-coverage-*
-
-# Declare an array to hold failed items
-declare -a fail_items=()
-
-if (( $(bc -l <<< "${coverage_PR_lines_rate}+0.05 < ${coverage_base_lines_rate}") )); then
-    fail_items+=("lines")
-fi
-if (( $(bc -l <<< "${coverage_PR_branches_rate}+0.05 < ${coverage_base_branches_rate}") )); then
-    fail_items+=("branches")
-fi
-
-if [[ ${#fail_items[@]} -ne 0 ]]; then
-    fail_items_str=$(
-        IFS=', '
-        echo "${fail_items[*]}"
-    )
-    for item in "${fail_items[@]}"; do
-        case "$item" in
-        lines)
-            decrease=$(echo $(printf "%.3f" $(echo "$coverage_PR_lines_rate - $coverage_base_lines_rate" | bc -l)))
-            ;;
-        branches)
-            decrease=$(echo $(printf "%.3f" $(echo "$coverage_PR_branches_rate - $coverage_base_branches_rate" | bc -l)))
-            ;;
-        *)
-            echo "Unknown item: $item"
-            continue
-            ;;
-        esac
-        $BOLD_RED && echo "Unit Test failed with ${item} coverage decrease ${decrease}%" && $RESET
-    done
-    $BOLD_RED && echo "compare coverage to give detail info" && $RESET
-    bash /neural-compressor/.azure-pipelines-pr/scripts/ut/compare_coverage.sh ${coverage_compare} ${coverage_log} ${coverage_log_base} "FAILED" ${coverage_PR_lines_rate} ${coverage_base_lines_rate} ${coverage_PR_branches_rate} ${coverage_base_branches_rate}
-    exit 1
-else
-    $BOLD_GREEN && echo "Unit Test success with coverage lines: ${coverage_PR_lines_rate}%, branches: ${coverage_PR_branches_rate}%" && $RESET
-    $BOLD_GREEN && echo "compare coverage to give detail info" && $RESET
-    bash /neural-compressor/.azure-pipelines-pr/scripts/ut/compare_coverage.sh ${coverage_compare} ${coverage_log} ${coverage_log_base} "SUCCESS" ${coverage_PR_lines_rate} ${coverage_base_lines_rate} ${coverage_PR_branches_rate} ${coverage_base_branches_rate}
-fi
diff --git a/.azure-pipelines/scripts/ut/coverage.3x_pt_fp8 b/.azure-pipelines/scripts/ut/coverage.3x_pt_hpu
similarity index 100%
rename from .azure-pipelines/scripts/ut/coverage.3x_pt_fp8
rename to .azure-pipelines/scripts/ut/coverage.3x_pt_hpu
diff --git a/.azure-pipelines/scripts/ut/coverage.file b/.azure-pipelines/scripts/ut/coverage.file
deleted file mode 100644
index b9dea8ecb02..00000000000
--- a/.azure-pipelines/scripts/ut/coverage.file
+++ /dev/null
@@ -1,30 +0,0 @@
-[run]
-branch = True
-
-[report]
-omit =
- */**/fake*yaml
- */**/fake.py
- */neural_compressor/model/nets_factory.py
- */neural_compressor/benchmark.py
- */neural_compressor/experimental/benchmark.py
- */neural_compressor/contrib/strategy/tpe.py
- */intel_extension_for_transformers/backends/*
- */intel_extension_for_transformers/optimization/utils/get_throughput.py
- */neural_compressor/adaptor/tf_utils/graph_rewriter/generic/fuse_decomposed_in.py
- */neural_compressor/adaptor/tf_utils/quantize_graph/qdq/fuse_qdq_in.py
- */neural_compressor/adaptor/tf_utils/graph_rewriter/int8/freeze_value.py
- */neural_compressor/template/*
- */neural_compressor/common/*
- */neural_compressor/torch/*
- */neural_compressor/tensorflow/*
-exclude_lines =
- pragma: no cover
- raise NotImplementedError
- raise TypeError
- if self.device == "gpu":
- if device == "gpu":
- except ImportError:
- except Exception as e:
- onnx_version < ONNX18_VERSION
- onnx_version >= ONNX18_VERSION
diff --git a/.azure-pipelines/scripts/ut/run_3x_pt.sh b/.azure-pipelines/scripts/ut/run_3x_pt.sh
index 6e01a70ea23..9be42675806 100644
--- a/.azure-pipelines/scripts/ut/run_3x_pt.sh
+++ b/.azure-pipelines/scripts/ut/run_3x_pt.sh
@@ -26,6 +26,8 @@ cd /neural-compressor/test || exit 1
 rm -rf torch/algorithms/fp8_quant
 rm -rf torch/quantization/fp8_quant
 rm -rf torch/algorithms/fp8_quant_xpu
+rm -rf torch/quantization/test_autoround_xpu.py
+rm -rf torch/quantization/test_autoround_hpu.py
 
 LOG_DIR=/neural-compressor/log_dir
 mkdir -p ${LOG_DIR}
@@ -37,7 +39,9 @@ numactl --physcpubind="${NUMA_CPUSET:-0-15}" --membind="${NUMA_NODE:-0}" bash ru
 
 cp report.html ${LOG_DIR}/
 
-if [ $(grep -c '== FAILURES ==' ${ut_log_name}) != 0 ] || [ $(grep -c '== ERRORS ==' ${ut_log_name}) != 0 ] || [ $(grep -c ' passed' ${ut_log_name}) == 0 ]; then
+set -x
+if [ $(grep -c '== FAILURES ==' ${ut_log_name}) != 0 ] || [ $(grep -c '== ERRORS ==' ${ut_log_name}) != 0 ] || \
+[ $(grep -c 'Killed' ${ut_log_name}) != 0 ] || [ $(grep -c 'core dumped' ${ut_log_name}) != 0 ] || [ $(grep -c ' passed' ${ut_log_name}) == 0 ]; then
     echo "Find errors in pytest case, please check the output..."
     echo "Please search for '== FAILURES ==' or '== ERRORS =='"
     exit 1
diff --git a/.azure-pipelines/scripts/ut/run_3x_pt_fp8.sh b/.azure-pipelines/scripts/ut/run_3x_pt_hpu.sh
similarity index 87%
rename from .azure-pipelines/scripts/ut/run_3x_pt_fp8.sh
rename to .azure-pipelines/scripts/ut/run_3x_pt_hpu.sh
index bb9950e1f9d..53feeebba88 100644
--- a/.azure-pipelines/scripts/ut/run_3x_pt_fp8.sh
+++ b/.azure-pipelines/scripts/ut/run_3x_pt_hpu.sh
@@ -23,16 +23,16 @@ pip install beautifulsoup4==4.13.5
 echo "##[endgroup]"
 pip list
 
-export COVERAGE_RCFILE=/neural-compressor/.azure-pipelines/scripts/ut/coverage.3x_pt_fp8
+export COVERAGE_RCFILE=/neural-compressor/.azure-pipelines/scripts/ut/coverage.3x_pt_hpu
 inc_path=$(python -c 'import neural_compressor; print(neural_compressor.__path__[0])')
 cd /neural-compressor/test || exit 1
 
 LOG_DIR=/neural-compressor/log_dir
 mkdir -p ${LOG_DIR}
-ut_log_name=${LOG_DIR}/ut_3x_pt_fp8.log
+ut_log_name=${LOG_DIR}/ut_3x_pt_hpu.log
 pytest --cov="${inc_path}" -vs --disable-warnings --html=report_1.html --self-contained-html torch/quantization/weight_only/test_load.py 2>&1 | tee -a ${ut_log_name}
 pytest --cov="${inc_path}" -vs --disable-warnings --html=report_2.html --self-contained-html torch/quantization/weight_only/test_rtn.py 2>&1 | tee -a ${ut_log_name}
-pytest --cov="${inc_path}" -vs --disable-warnings --html=report_3.html --self-contained-html torch/quantization/test_autoround.py 2>&1 | tee -a ${ut_log_name}
+pytest --cov="${inc_path}" -vs --disable-warnings --html=report_3.html --self-contained-html torch/quantization/test_autoround_hpu.py 2>&1 | tee -a ${ut_log_name}
 
 # Below folder contains some special configuration for pytest so we need to enter the path and run it separately
 cd /neural-compressor/test/torch/algorithms/fp8_quant
@@ -50,7 +50,9 @@ mkdir -p report && mv *.html report
 pytest_html_merger -i ./report -o ./report.html
 cp report.html ${LOG_DIR}/
 
-if [ $(grep -c '== FAILURES ==' ${ut_log_name}) != 0 ] || [ $(grep -c '== ERRORS ==' ${ut_log_name}) != 0 ] || [ $(grep -c ' passed' ${ut_log_name}) == 0 ]; then
+set -x
+if [ $(grep -c '== FAILURES ==' ${ut_log_name}) != 0 ] || [ $(grep -c '== ERRORS ==' ${ut_log_name}) != 0 ] || \
+[ $(grep -c 'Killed' ${ut_log_name}) != 0 ] || [ $(grep -c 'core dumped' ${ut_log_name}) != 0 ] || [ $(grep -c ' passed' ${ut_log_name}) == 0 ]; then
     echo "Find errors in pytest case, please check the output..."
     echo "Please search for '== FAILURES ==' or '== ERRORS =='"
     exit 1
diff --git a/.azure-pipelines/scripts/ut/run_3x_pt_xpu.sh b/.azure-pipelines/scripts/ut/run_3x_pt_xpu.sh
new file mode 100644
index 00000000000..6577b9ebd3d
--- /dev/null
+++ b/.azure-pipelines/scripts/ut/run_3x_pt_xpu.sh
@@ -0,0 +1,59 @@
+#!/bin/bash
+# Runs the 3x PyTorch XPU unit test: checks imports, installs test dependencies
+# with uv, runs test_autoround_xpu.py under numactl, then scans the log for failures.
+python -c "import neural_compressor as nc"
+test_case="run 3x Torch with XPU"
+echo "${test_case}"
+
+echo "##[section]Run import check"
+set -e
+python -c "import neural_compressor.torch"
+python -c "import neural_compressor.common"
+echo "##[section]import check pass"
+
+echo "##[group]set up UT env..."
+export LD_LIBRARY_PATH=${HOME}/.local/lib/:$LD_LIBRARY_PATH
+uv pip install -r /neural-compressor/test/torch/requirements_xpu.txt
+uv pip install pytest-cov pytest-html
+uv pip list
+echo "##[endgroup]"
+
+echo "##[group]check xpu env..."
+echo "ZE_AFFINITY_MASK: ${ZE_AFFINITY_MASK}"
+python - <<'PY'
+import torch
+print("torch:", torch.__version__)
+print("xpu available:", torch.xpu.is_available())
+print("xpu count:", torch.xpu.device_count())
+PY
+echo "##[endgroup]"
+
+export COVERAGE_RCFILE=/neural-compressor/.azure-pipelines/scripts/ut/coverage.3x_pt
+inc_path=$(python -c 'import neural_compressor; print(neural_compressor.__path__[0])')
+cd /neural-compressor/test || exit 1
+
+LOG_DIR=/neural-compressor/log_dir
+mkdir -p ${LOG_DIR}
+ut_log_name=${LOG_DIR}/ut_3x_pt_xpu.log
+
+# Turn every test_autoround_xpu.py found under ./torch into one pytest command line.
+find ./torch -name "test_autoround_xpu.py" | sed "s,\.\/,python -m pytest --cov=\"${inc_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run_xpu.sh
+cat run_xpu.sh
+# The pipeline's exit status is tee's, so test failures are detected from the log below.
+numactl --physcpubind="${NUMA_CPUSET:-0-27}" --membind="${NUMA_NODE:-0}" bash run_xpu.sh 2>&1 | tee ${ut_log_name}
+
+# Best effort: a crashed pytest run leaves no report, and under `set -e` a failing cp
+# would abort the script before the log check below can report the real error.
+cp report.html ${LOG_DIR}/ || echo "##[warning]report.html was not generated"
+
+set -x
+if [ $(grep -c '== FAILURES ==' ${ut_log_name}) != 0 ] || [ $(grep -c '== ERRORS ==' ${ut_log_name}) != 0 ] || \
+[ $(grep -c 'Killed' ${ut_log_name}) != 0 ] || [ $(grep -c 'core dumped' ${ut_log_name}) != 0 ] || [ $(grep -c ' passed' ${ut_log_name}) == 0 ]; then
+    echo "##[error]Find errors in pytest case, please check the output..."
+    exit 1
+fi
+
+# if ut pass, collect the coverage file into artifacts
+cp .coverage ${LOG_DIR}/.coverage
+
+echo "UT finished successfully! "
diff --git a/.azure-pipelines/scripts/ut/run_3x_tf.sh b/.azure-pipelines/scripts/ut/run_3x_tf.sh
index 79cb71ecd81..c2eb07dd724 100644
--- a/.azure-pipelines/scripts/ut/run_3x_tf.sh
+++ b/.azure-pipelines/scripts/ut/run_3x_tf.sh
@@ -64,7 +64,9 @@ pytest_html_merger -i ./report -o ./report.html
 
 cp report.html ${LOG_DIR}/
 
-if [ $(grep -c '== FAILURES ==' ${ut_log_name}) != 0 ] || [ $(grep -c '== ERRORS ==' ${ut_log_name}) != 0 ] || [ $(grep -c ' passed' ${ut_log_name}) == 0 ]; then
+set -x
+if [ $(grep -c '== FAILURES ==' ${ut_log_name}) != 0 ] || [ $(grep -c '== ERRORS ==' ${ut_log_name}) != 0 ] || \
+[ $(grep -c 'Killed' ${ut_log_name}) != 0 ] || [ $(grep -c 'core dumped' ${ut_log_name}) != 0 ] || [ $(grep -c ' passed' ${ut_log_name}) == 0 ]; then
     echo "Find errors in pytest case, please check the output..."
     echo "Please search for '== FAILURES ==' or '== ERRORS =='"
     exit 1
diff --git a/.azure-pipelines/template/docker-template.yml b/.azure-pipelines/template/docker-template.yml
index 521cac1eada..e9e805deda7 100644
--- a/.azure-pipelines/template/docker-template.yml
+++ b/.azure-pipelines/template/docker-template.yml
@@ -21,18 +21,6 @@ parameters:
     default: "build"
 
 steps:
-  - task: Bash@3
-    inputs:
-      targetType: "inline"
-      script: |
-        docker ps -a
-        if [[ $(docker ps -a | grep -i '${{ parameters.containerName }}'$) ]]; then
-            docker start $(docker ps -aq --filter "name=${{ parameters.containerName }}")
-            echo "remove left files through container ..."
-            docker exec ${{ parameters.containerName }} bash -c "ls -a /neural-compressor && rm -fr /neural-compressor/* && rm -fr /neural-compressor/.* && ls -a /neural-compressor  || true"
-        fi
-    displayName: "Docker workspace clean up"
-
   - ${{ if eq(parameters.dockerConfigName, 'commonDockerConfig') }}:
       - script: |
           rm -fr ${BUILD_SOURCESDIRECTORY} || sudo rm -fr ${BUILD_SOURCESDIRECTORY} || true
diff --git a/.azure-pipelines/template/model-template.yml b/.azure-pipelines/template/model-template.yml
index 9c9de9ba796..1186e04865e 100644
--- a/.azure-pipelines/template/model-template.yml
+++ b/.azure-pipelines/template/model-template.yml
@@ -22,7 +22,8 @@ steps:
       containerName: ${{ parameters.modelContainerName }}
 
   - script: |
-      docker exec ${{ parameters.modelContainerName }} bash -c "cd /neural-compressor/.azure-pipelines/scripts/models \
+      docker exec -e NUMA_NODE=${NUMA_NODE} -e NUMA_CPUSET=${NUMA_CPUSET} ${{ parameters.modelContainerName }} \
+      bash -c "cd /neural-compressor/.azure-pipelines/scripts/models \
       && bash run_${{ parameters.framework }}_models_trigger.sh --model=${{ parameters.modelName }} --mode='env_setup'"
     displayName: Env setup
 
@@ -41,18 +42,20 @@ steps:
     displayName: "Download refer logs"
 
   - script: |
-      docker exec ${{ parameters.modelContainerName }} bash -c "cd /neural-compressor/.azure-pipelines/scripts/models \
+      docker exec -e NUMA_NODE=${NUMA_NODE} -e NUMA_CPUSET=${NUMA_CPUSET} ${{ parameters.modelContainerName }} \
+      bash -c "cd /neural-compressor/.azure-pipelines/scripts/models \
       && bash run_${{ parameters.framework }}_models_trigger.sh --model=${{ parameters.modelName }} --mode='tuning'"
     displayName: Quantization
 
   - ${{ if ne(parameters.APIVersion, '3x') }}:
     - script: |
-        docker exec ${{ parameters.modelContainerName }} bash -c "cd /neural-compressor/.azure-pipelines/scripts/models \
+        docker exec -e NUMA_NODE=${NUMA_NODE} -e NUMA_CPUSET=${NUMA_CPUSET} ${{ parameters.modelContainerName }} \
+        bash -c "cd /neural-compressor/.azure-pipelines/scripts/models \
         && bash run_${{ parameters.framework }}_models_trigger.sh --model=${{ parameters.modelName }} --mode='int8_benchmark' --USE_TUNE_ACC=$(USE_TUNE_ACC) --PERF_STABLE_CHECK=$(PERF_STABLE_CHECK)"
       displayName: INT8 Benchmark
 
     - script: |
-        docker exec ${{ parameters.modelContainerName }} bash -c "cd /neural-compressor/.azure-pipelines/scripts/models \
+        docker exec -e NUMA_NODE=${NUMA_NODE} -e NUMA_CPUSET=${NUMA_CPUSET} ${{ parameters.modelContainerName }} bash -c "cd /neural-compressor/.azure-pipelines/scripts/models \
         && bash run_${{ parameters.framework }}_models_trigger.sh --model=${{ parameters.modelName }} --mode='fp32_benchmark' --USE_TUNE_ACC=$(USE_TUNE_ACC) --PERF_STABLE_CHECK=$(PERF_STABLE_CHECK)"
       displayName: FP32 Benchmark
 
diff --git a/.azure-pipelines/template/ut-template.yml b/.azure-pipelines/template/ut-template.yml
index bb00c7ea3c0..dcdacb0d7fb 100644
--- a/.azure-pipelines/template/ut-template.yml
+++ b/.azure-pipelines/template/ut-template.yml
@@ -25,7 +25,7 @@ parameters:
     default: "coverage"
   - name: utContainerName
     type: string
-    default: "utTest"
+    default: "INCUnitTest"
   - name: imageSource
     type: string
     default: "build"
@@ -42,7 +42,7 @@ steps:
       imageSource: ${{ parameters.imageSource }}
 
   - script: |
-      docker exec -e NUMA_NODE=${NUMA_NODE} -e NUMA_CPUSET=${NUMA_CPUSET} ${{ parameters.utContainerName }} \
+      docker exec -e NUMA_NODE=${NUMA_NODE} -e NUMA_CPUSET=${NUMA_CPUSET} -e ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK} ${{ parameters.utContainerName }} \
       bash -c "cd /neural-compressor/.azure-pipelines/scripts \
       && bash install_nc.sh ${{ parameters.utScriptFileName }} \
       && bash ut/${{ parameters.utScriptFileName }}.sh ${{ parameters.utTestMode }}"
diff --git a/.azure-pipelines/ut-3x-pt-fp8.yml b/.azure-pipelines/ut-3x-pt-hpu.yml
similarity index 90%
rename from .azure-pipelines/ut-3x-pt-fp8.yml
rename to .azure-pipelines/ut-3x-pt-hpu.yml
index 09c3c1d1322..4ac3b709c01 100644
--- a/.azure-pipelines/ut-3x-pt-fp8.yml
+++ b/.azure-pipelines/ut-3x-pt-hpu.yml
@@ -8,11 +8,11 @@ pr:
       - master
   paths:
     include:
-      - .azure-pipelines/scripts/ut/run_3x_pt_fp8.sh
+      - .azure-pipelines/scripts/ut/run_3x_pt_hpu.sh
       - .azure-pipelines/scripts/install_nc.sh
-      - .azure-pipelines/ut-3x-pt-fp8.yml
+      - .azure-pipelines/ut-3x-pt-hpu.yml
       - .azure-pipelines/template/docker-template.yml
-      - .azure-pipelines/scripts/ut/coverage.3x_pt_fp8
+      - .azure-pipelines/scripts/ut/coverage.3x_pt_hpu
       - neural_compressor/common
       - neural_compressor/torch
       - neural_compressor/transformers
@@ -30,7 +30,7 @@ variables:
   IMAGE_TAG: "py310"
   UPLOAD_PATH: $(Build.SourcesDirectory)/log_dir
   DOWNLOAD_PATH: $(Build.SourcesDirectory)/log_dir
-  ARTIFACT_NAME: "UT_coverage_report_3x_pt_fp8"
+  ARTIFACT_NAME: "UT_coverage_report_3x_pt_hpu"
   REPO: $(Build.Repository.Uri)
 
 stages:
@@ -46,7 +46,7 @@ stages:
             parameters:
               imageSource: "pull"
               dockerConfigName: "commonDockerConfig"
-              utScriptFileName: "run_3x_pt_fp8"
+              utScriptFileName: "run_3x_pt_hpu"
               uploadPath: $(UPLOAD_PATH)
               utArtifact: "ut_3x"
 
@@ -62,7 +62,7 @@ stages:
             parameters:
               imageSource: "pull"
               dockerConfigName: "gitCloneDockerConfig"
-              utScriptFileName: "run_3x_pt_fp8"
+              utScriptFileName: "run_3x_pt_hpu"
               uploadPath: $(UPLOAD_PATH)
               utArtifact: "ut_3x_baseline"
 
@@ -90,7 +90,7 @@ stages:
               pip install -U pip setuptools
               python setup.py install pt
               cd ${BUILD_SOURCESDIRECTORY}/.azure-pipelines/scripts
-              bash ut/collect_log_3x.sh 3x_pt_fp8
+              bash ut/collect_log_3x.sh 3x_pt_hpu
             displayName: "Collect UT Coverage"
 
           - task: PublishCodeCoverageResults@2
diff --git a/.azure-pipelines/ut-3x-pt-xpu.yml b/.azure-pipelines/ut-3x-pt-xpu.yml
new file mode 100644
index 00000000000..5d7239d421b
--- /dev/null
+++ b/.azure-pipelines/ut-3x-pt-xpu.yml
@@ -0,0 +1,51 @@
+# PR-triggered unit tests for the 3x PyTorch XPU path (runs on the B60 agent pool).
+trigger: none
+
+pr:
+  autoCancel: true
+  drafts: false
+  branches:
+    include:
+      - master
+  paths:
+    include:
+      - neural_compressor/common
+      - neural_compressor/torch
+      - test/torch/quantization/test_autoround_xpu.py
+      - test/torch/requirements_xpu.txt
+      - setup.py
+      - requirements_pt.txt
+      # CI definition, image and scripts consumed by this job
+      - .azure-pipelines/ut-3x-pt-xpu.yml
+      - .azure-pipelines/docker/Dockerfile_xpu.devel
+      - .azure-pipelines/template/docker-template.yml
+      - .azure-pipelines/template/ut-template.yml
+      - .azure-pipelines/scripts/install_nc.sh
+      - .azure-pipelines/scripts/ut/run_3x_pt_xpu.sh
+
+pool: B60
+
+variables:
+  IMAGE_NAME: "neural-compressor"
+  IMAGE_TAG: "py312-xpu"
+  DOCKERFILE_NAME: "Dockerfile_xpu"
+  UPLOAD_PATH: $(Build.SourcesDirectory)/log_dir
+  REPO: $(Build.Repository.Uri)
+
+stages:
+  - stage: Torch
+    displayName: Unit Test 3x Torch
+    dependsOn: []
+    jobs:
+      - job:
+        displayName: Unit Test 3x Torch
+        steps:
+          - template: template/ut-template.yml
+            parameters:
+              dockerConfigName: "commonDockerConfig"
+              dockerFileName: $(DOCKERFILE_NAME)
+              repoTag: "$(IMAGE_TAG)"
+              utScriptFileName: "run_3x_pt_xpu"
+              uploadPath: $(UPLOAD_PATH)
+              utArtifact: "ut_3x_xpu"
+              utContainerName: "INCUnitTest$(NODE_LABEL)"
diff --git a/.azure-pipelines/ut-3x-pt.yml b/.azure-pipelines/ut-3x-pt.yml
index 67701b53f92..24b53ad633c 100644
--- a/.azure-pipelines/ut-3x-pt.yml
+++ b/.azure-pipelines/ut-3x-pt.yml
@@ -47,7 +47,7 @@ stages:
               utScriptFileName: "run_3x_pt"
               uploadPath: $(UPLOAD_PATH)
               utArtifact: "ut_3x"
-              utContainerName: "utTest$(NODE_LABEL)"
+              utContainerName: "INCUnitTest$(NODE_LABEL)"
 
 
   - stage: Torch_baseline
diff --git a/.azure-pipelines/ut-3x-tf.yml b/.azure-pipelines/ut-3x-tf.yml
index 13a404bd465..ab13932e707 100644
--- a/.azure-pipelines/ut-3x-tf.yml
+++ b/.azure-pipelines/ut-3x-tf.yml
@@ -42,6 +42,7 @@ stages:
               utScriptFileName: "run_3x_tf"
               uploadPath: $(UPLOAD_PATH)
               utArtifact: "ut_3x"
+              utContainerName: "INCUnitTest$(NODE_LABEL)"
 
   - stage: TensorFlow_baseline
     displayName: Unit Test 3x TensorFlow baseline
@@ -58,6 +59,7 @@ stages:
               uploadPath: $(UPLOAD_PATH)
               utArtifact: "ut_3x_baseline"
               repo: $(REPO)
+              utContainerName: "INCUnitTest$(NODE_LABEL)"
 
   - stage: Coverage
     displayName: "Coverage Compare"
diff --git a/requirements_pt.txt b/requirements_pt.txt
index 5f4518f9332..d908a1f87f1 100644
--- a/requirements_pt.txt
+++ b/requirements_pt.txt
@@ -1,4 +1,5 @@
 numpy
+packaging
 prettytable
 psutil
 py-cpuinfo
diff --git a/test/torch/quantization/test_autoround.py b/test/torch/quantization/test_autoround_cpu.py
similarity index 65%
rename from test/torch/quantization/test_autoround.py
rename to test/torch/quantization/test_autoround_cpu.py
index 85be4b65cc6..3dafc565f39 100644
--- a/test/torch/quantization/test_autoround.py
+++ b/test/torch/quantization/test_autoround_cpu.py
@@ -6,44 +6,8 @@
 import pytest
 import torch
 import transformers
-from packaging.version import Version, parse
-from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaConfig
-
-
-@lru_cache(None)
-def is_habana_framework_installed():
-    """Check if Habana framework is installed.
-
-    Only check for the habana_frameworks package without importing it to avoid
-    initializing lazy-mode-related components.
-    """
-    from importlib.util import find_spec
-
-    package_spec = find_spec("habana_frameworks")
-    return package_spec is not None
-
-
-def set_hpu_torch_compile_envs():
-    if not is_habana_framework_installed():
-        return None
-    import torch._dynamo.config as dynamo_config
-    import torch._inductor.config as inductor_config
-
-    os.environ["PT_HPU_LAZY_MODE"] = "0"
-    os.environ["PT_ENABLE_INT64_SUPPORT"] = "1"
-    inductor_config.force_disable_caches = True
-    dynamo_config.inline_inbuilt_nn_modules = True
-
-
-# The `TestAutoRoundHPU` is expected to be run with `compile` mode,
-# so set the HPU environment variables before importing INC.
-if is_habana_framework_installed():
-    set_hpu_torch_compile_envs()
-
-
-def is_xpu_available():
-    return torch.xpu.is_available()
-
+from packaging.version import Version
+from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from neural_compressor.torch.quantization import (
     AutoRoundConfig,
@@ -85,7 +49,6 @@ def run_fn(model, dataloader):
             model(data)
 
 
-@pytest.mark.skipif(is_habana_framework_installed(), reason="These tests are not supported on HPU for now.")
 @pytest.mark.skipif(not auto_round_installed, reason="auto_round module is not installed")
 class TestAutoRoundCPU:
     @classmethod
@@ -671,306 +634,3 @@ def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype):
                 getattr(attn, "q_scale", None) is not None
             ), f"Missing q_scale in attention for scheme={scheme}, static_attention_dtype={static_attention_dtype}"
         shutil.rmtree(output_dir, ignore_errors=True)
-
-
-@pytest.mark.skipif(not is_habana_framework_installed(), reason="Habana framework is not installed")
-@pytest.mark.skipif(os.getenv("PT_HPU_LAZY_MODE", "0") == "1", reason="Lazy mode is enabled")
-@pytest.mark.skipif(not auto_round_installed, reason="auto_round module is not installed")
-class TestAutoRoundHPU:
-    @classmethod
-    def setup_class(self):
-
-        model_name = "TheBloke/Llama-2-7B-Chat-GPTQ"
-        from neural_compressor.torch.algorithms.autoround import get_dataloader
-
-        config = LlamaConfig(num_hidden_layers=2)
-        with transformers.modeling_utils.no_init_weights():
-            self.tiny_llama_model = AutoModelForCausalLM.from_config(config=config)
-
-        tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-        self.dataloader = get_dataloader(tokenizer, 32, dataset_name="NeelNanda/pile-10k", seed=42, bs=8, nsamples=10)
-        self.inp = torch.ones([1, 10], dtype=torch.long)
-        self.label = self.tiny_llama_model(self.inp)[0]
-
-    @classmethod
-    def teardown_class(self):
-        shutil.rmtree("saved_results", ignore_errors=True)
-
-    def setup_method(self, method):
-        torch.compiler.reset()
-        logger.info(f"Running TestAutoRound test: {method.__name__}")
-
-    @pytest.mark.skip(reason="Disabled, see JIRA: https://jira.habana-labs.com/browse/SW-227554")
-    def test_autoround_w4a8(self):
-        fp32_model = copy.deepcopy(self.tiny_llama_model)
-        quant_config = AutoRoundConfig(
-            nsamples=32,
-            seqlen=10,
-            iters=2,
-            scale_dtype="bf16",
-            dtype="fp8_to_int_sym",
-            act_bits=8,
-            act_group_size=-1,
-            act_dtype="fp8_sym",
-            act_dynamic=False,
-        )
-
-        quant_config.set_local("lm_head", AutoRoundConfig(dtype="fp32"))
-        logger.info(f"Test AutoRound with config {quant_config}")
-
-        # prepare + convert API
-        model = prepare(model=fp32_model, quant_config=quant_config)
-
-        run_fn(model, self.dataloader)
-        q_model = convert(model)
-        assert q_model is not None, "Quantization failed!"
-        # We quantize the model with compile mode, if we want to run the model directly,
-        # we need use the compile mode as well.
-        # We can use the lazy mode but need to restart the python process.
-        from neural_compressor.torch.algorithms.weight_only.save_load import load
-
-        model = load(
-            model_name_or_path="temp_auto_round",
-            original_model=copy.deepcopy(self.tiny_llama_model),
-            device="hpu",
-            format="huggingface",
-        )
-        print(f"loaded model {model}")
-        from neural_compressor.torch.algorithms.mixed_low_precision.modules import HPUMixedPrecisionLinear
-
-        has_hpu_mixed_precision_module = False
-        for name, module in model.named_modules():
-            if isinstance(module, HPUMixedPrecisionLinear):
-                has_hpu_mixed_precision_module = True
-                break
-        assert has_hpu_mixed_precision_module, "loading compressed model failed."
-        model.eval()
-        model = model.to(torch.bfloat16)
-        model = torch.compile(model, backend="hpu_backend")
-        out = model(self.inp.to("hpu"))[0]
-        print(f"out: {out}")
-        assert out is not None, "Loading compressed model failed."
-
-    def test_quant_lm_head(self):
-        model = transformers.AutoModelForCausalLM.from_pretrained(
-            "optimum-intel-internal-testing/tiny-random-Phi3ForCausalLM"
-        )
-        tokenizer = AutoTokenizer.from_pretrained(
-            "optimum-intel-internal-testing/tiny-random-Phi3ForCausalLM", trust_remote_code=True
-        )
-
-        quant_config = AutoRoundConfig(
-            tokenizer=tokenizer,
-            nsamples=32,
-            seqlen=10,
-            iters=1,
-            amp=False,
-            scale_dtype="fp32",
-            quant_lm_head=True,
-            group_size=32,
-        )
-        logger.info(f"Test AutoRound with config {quant_config}")
-        text = "Replace me by any text you'd like."
-        encoded_input = tokenizer(text, return_tensors="pt")
-        model = prepare(model=model, quant_config=quant_config)
-        q_model = convert(model)
-        output = tokenizer.decode(q_model.generate(**encoded_input, max_new_tokens=10)[0])
-        print(output)
-        assert output is not None
-        assert q_model.lm_head.__class__.__name__ in tagert_modules, "packing model failed."
-
-    def test_int4_dtype(self):
-        fp32_model = copy.deepcopy(self.tiny_llama_model)
-        quant_config = AutoRoundConfig(dtype="int4", nsamples=32, seqlen=10, iters=1, amp=False, scale_dtype="fp32")
-        logger.info(f"Test AutoRound with config {quant_config}")
-
-        # prepare + convert API
-        model = prepare(model=fp32_model, quant_config=quant_config)
-
-        run_fn(model, self.dataloader)
-        q_model = convert(model)
-        _ = q_model(self.inp)  # inference
-        assert q_model.model.layers[0].self_attn.k_proj.__class__.__name__ in tagert_modules, "packing model failed."
-
-    def test_autoround_with_quantize_API(self):
-        fp32_model = copy.deepcopy(self.tiny_llama_model)
-
-        quant_config = AutoRoundConfig(scheme="W4A16", seqlen=10, iters=1, use_sym=False, amp=False, scale_dtype="fp32")
-        logger.info(f"Test AutoRound with config {quant_config}")
-
-        # quantize API
-        q_model = quantize(
-            model=fp32_model,
-            quant_config=quant_config,
-            run_fn=run_fn,
-            run_args=(self.dataloader,),
-        )
-        _ = q_model(self.inp)  # inference
-        tagert_modules = ["WQLinear_GEMM"]
-        assert q_model.model.layers[0].self_attn.k_proj.__class__.__name__ in tagert_modules, "packing model failed."
-
-
-@pytest.mark.skipif(not is_xpu_available(), reason="These tests are not supported on XPU for now.")
-@pytest.mark.skipif(not auto_round_installed, reason="auto_round module is not installed")
-class TestAutoRoundGPU:
-    @pytest.mark.parametrize(
-        "scheme", ["W4A16", "W2A16", "W3A16", "W8A16", "MXFP4", "MXFP8", "NVFP4", "FPW8A16", "FP8_STATIC"]
-    )
-    def test_scheme(self, scheme):
-        # INC API
-        from transformers import AutoModelForCausalLM, AutoTokenizer
-
-        fp32_model = AutoModelForCausalLM.from_pretrained(
-            "facebook/opt-125m",
-        )
-        inp = torch.ones([1, 10], dtype=torch.long)
-        tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m", trust_remote_code=True)
-
-        output_dir = "./saved_inc"
-        quant_config = AutoRoundConfig(
-            tokenizer=tokenizer,
-            nsamples=32,
-            seqlen=10,
-            iters=1,
-            device_map="xpu",
-            scheme=scheme,
-            export_format="auto_round",
-            output_dir=output_dir,  # default is "temp_auto_round"
-        )
-
-        # quantizer execute
-        model = prepare(model=fp32_model, quant_config=quant_config)
-        inc_model = convert(model)
-        if scheme in ["FPW8A16"]:  # FPW8A16 loading not supported yet
-            return
-        inc_model = AutoModelForCausalLM.from_pretrained(
-            output_dir,
-        )
-        out = inc_model(inp)[0]
-
-        # AutoRound API
-        from transformers import AutoModelForCausalLM, AutoTokenizer
-
-        fp32_model = transformers.AutoModelForCausalLM.from_pretrained(
-            "facebook/opt-125m",
-        )
-        inp = torch.ones([1, 10], dtype=torch.long)
-        tokenizer = transformers.AutoTokenizer.from_pretrained("facebook/opt-125m", trust_remote_code=True)
-        from auto_round import AutoRound
-
-        ar = AutoRound(
-            model=fp32_model,
-            tokenizer=tokenizer,
-            nsamples=32,
-            seqlen=10,
-            iters=1,
-            device_map="xpu",
-            scheme=scheme,
-        )
-        quantized_model_path = "./saved_ar"
-        ar.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round")
-        model = AutoModelForCausalLM.from_pretrained(
-            quantized_model_path,
-        )
-        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
-        out_ar = model(inp)[0]
-        assert torch.all(out_ar.eq(out))
-        shutil.rmtree(output_dir, ignore_errors=True)
-        shutil.rmtree(quantized_model_path, ignore_errors=True)
-
-    @pytest.mark.parametrize("format", ["auto_awq", "auto_gptq", "llm_compressor"])
-    def test_format(self, format):
-        # INC API
-        scheme = "W4A16" if format != "llm_compressor" else "MXFP4"
-        from transformers import AutoModelForCausalLM, AutoTokenizer
-
-        fp32_model = AutoModelForCausalLM.from_pretrained(
-            "facebook/opt-125m",
-        )
-        inp = torch.ones([1, 10], dtype=torch.long)
-        tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m", trust_remote_code=True)
-
-        output_dir = "./saved_inc"
-        quant_config = AutoRoundConfig(
-            tokenizer=tokenizer,
-            nsamples=32,
-            seqlen=10,
-            iters=1,
-            device_map="xpu",
-            scheme=scheme,
-            export_format=format,
-            output_dir=output_dir,  # default is "temp_auto_round"
-        )
-
-        # quantizer execute
-        model = prepare(model=fp32_model, quant_config=quant_config)
-        inc_model = convert(model)
-        assert inc_model is not None
-        shutil.rmtree(output_dir, ignore_errors=True)
-
-    def test_vlm_model(self):
-        # INC API
-        scheme = "W4A16"
-        model_name = "Qwen/Qwen2-VL-2B-Instruct"
-        from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer, Qwen2VLForConditionalGeneration
-
-        fp32_model = Qwen2VLForConditionalGeneration.from_pretrained(
-            "Qwen/Qwen2-VL-2B-Instruct",
-        )
-        tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", trust_remote_code=True)
-        from neural_compressor.torch.algorithms.autoround import get_mllm_dataloader
-
-        processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
-
-        output_dir = "./saved_inc"
-        quant_config = AutoRoundConfig(
-            tokenizer=tokenizer,
-            nsamples=1,
-            iters=1,
-            seqlen=10,
-            # quant_nontext_module=True,
-            processor=processor,
-            device_map="xpu:0",
-            scheme=scheme,
-            export_format="auto_round",
-            output_dir=output_dir,  # default is "temp_auto_round"
-        )
-
-        # quantizer execute
-        model = prepare(model=fp32_model, quant_config=quant_config)
-        inc_model = convert(model)
-        inc_model = Qwen2VLForConditionalGeneration.from_pretrained(
-            output_dir,
-        )
-        assert inc_model is not None
-        shutil.rmtree(output_dir, ignore_errors=True)
-
-    def test_quant_lm_head(self):
-        # INC API
-        scheme = "W4A16"
-        model_name = "Qwen/Qwen3-8B"
-        from transformers import AutoModelForCausalLM, AutoTokenizer
-
-        fp32_model = AutoModelForCausalLM.from_pretrained(
-            model_name,
-        )
-        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-
-        output_dir = "./saved_inc"
-        quant_config = AutoRoundConfig(
-            tokenizer=tokenizer,
-            nsamples=1,
-            seqlen=10,
-            iters=0,  # rtn
-            device_map="xpu",
-            scheme=scheme,
-            export_format="auto_round",
-            output_dir=output_dir,  # default is "temp_auto_round"
-            quant_lm_head=True,
-        )
-
-        # quantizer execute
-        model = prepare(model=fp32_model, quant_config=quant_config)
-        inc_model = convert(model)
-        assert inc_model is not None
-        shutil.rmtree(output_dir, ignore_errors=True)
diff --git a/test/torch/quantization/test_autoround_hpu.py b/test/torch/quantization/test_autoround_hpu.py
new file mode 100644
index 00000000000..0b2039ee736
--- /dev/null
+++ b/test/torch/quantization/test_autoround_hpu.py
@@ -0,0 +1,209 @@
+import copy
+import os
+import shutil
+from functools import lru_cache
+
+import pytest
+import torch
+import transformers
+from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaConfig
+
+
+@lru_cache(None)
+def is_habana_framework_installed():
+    """Report whether the ``habana_frameworks`` package can be located.
+
+    The package spec is looked up without importing the package, so that
+    lazy-mode-related components are not initialized by this check.
+    """
+    import importlib.util
+
+    habana_spec = importlib.util.find_spec("habana_frameworks")
+    return habana_spec is not None
+
+
+def set_hpu_torch_compile_envs():
+    # Configure the process for HPU compile (non-lazy) mode; no-op without Habana.
+    if is_habana_framework_installed():
+        import torch._dynamo.config as dynamo_cfg
+        import torch._inductor.config as inductor_cfg
+
+        os.environ.update(PT_HPU_LAZY_MODE="0", PT_ENABLE_INT64_SUPPORT="1")
+        inductor_cfg.force_disable_caches = True
+        dynamo_cfg.inline_inbuilt_nn_modules = True
+    return None
+
+
+# The `TestAutoRoundHPU` is expected to be run with `compile` mode,
+# so set the HPU environment variables before importing INC.
+if is_habana_framework_installed():
+    set_hpu_torch_compile_envs()
+
+
+from neural_compressor.torch.quantization import (
+    AutoRoundConfig,
+    convert,
+    prepare,
+    quantize,
+)
+from neural_compressor.torch.utils import logger
+
+torch.backends.__allow_nonbracketed_mutation_flag = True
+
+try:
+    import auto_round
+
+    auto_round_installed = True
+except ImportError:
+    auto_round_installed = False
+
+
+tagert_modules = ["QuantLinear", "QuantLinearGPTQ", "QuantLinearAWQ"]
+
+
+@torch.no_grad()
+def run_fn(model, dataloader):  # forwards every batch through `model` without gradients
+    for batch in dataloader:
+        if isinstance(batch, dict):
+            model(**batch)
+        elif isinstance(batch, (tuple, list)):
+            model(*batch)
+        else:
+            model(batch)
+
+
+@pytest.mark.skipif(not is_habana_framework_installed(), reason="Habana framework is not installed")
+@pytest.mark.skipif(os.getenv("PT_HPU_LAZY_MODE", "0") == "1", reason="Lazy mode is enabled")
+@pytest.mark.skipif(not auto_round_installed, reason="auto_round module is not installed")
+class TestAutoRoundHPU:  # HPU-only AutoRound tests; see the skipif conditions above
+    @classmethod
+    def setup_class(cls):
+        # Shared fixtures: a 2-layer Llama built from config without weight init, plus a dataloader (nsamples=10).
+        model_name = "TheBloke/Llama-2-7B-Chat-GPTQ"
+        from neural_compressor.torch.algorithms.autoround import get_dataloader
+
+        config = LlamaConfig(num_hidden_layers=2)
+        with transformers.modeling_utils.no_init_weights():
+            cls.tiny_llama_model = AutoModelForCausalLM.from_config(config=config)
+
+        tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+        cls.dataloader = get_dataloader(tokenizer, 32, dataset_name="NeelNanda/pile-10k", seed=42, bs=8, nsamples=10)
+        cls.inp = torch.ones([1, 10], dtype=torch.long)
+        cls.label = cls.tiny_llama_model(cls.inp)[0]  # output of the unquantized model on cls.inp
+
+    @classmethod
+    def teardown_class(cls):
+        shutil.rmtree("saved_results", ignore_errors=True)  # NOTE(review): nothing visible here writes this dir; confirm
+
+    def setup_method(self, method):
+        torch.compiler.reset()  # start each test without compiled state from the previous one
+        logger.info(f"Running TestAutoRound test: {method.__name__}")
+
+    @pytest.mark.skip(reason="Disabled, see JIRA: https://jira.habana-labs.com/browse/SW-227554")
+    def test_autoround_w4a8(self):
+        fp32_model = copy.deepcopy(self.tiny_llama_model)
+        quant_config = AutoRoundConfig(
+            nsamples=32,
+            seqlen=10,
+            iters=2,
+            scale_dtype="bf16",
+            dtype="fp8_to_int_sym",
+            act_bits=8,
+            act_group_size=-1,
+            act_dtype="fp8_sym",
+            act_dynamic=False,
+        )
+
+        quant_config.set_local("lm_head", AutoRoundConfig(dtype="fp32"))
+        logger.info(f"Test AutoRound with config {quant_config}")
+
+        # prepare + convert API
+        model = prepare(model=fp32_model, quant_config=quant_config)
+
+        run_fn(model, self.dataloader)
+        q_model = convert(model)
+        assert q_model is not None, "Quantization failed!"
+        # We quantize the model with compile mode, if we want to run the model directly,
+        # we need use the compile mode as well.
+        # We can use the lazy mode but need to restart the python process.
+        from neural_compressor.torch.algorithms.weight_only.save_load import load
+
+        model = load(
+            model_name_or_path="temp_auto_round",  # presumably AutoRoundConfig's default output_dir; confirm
+            original_model=copy.deepcopy(self.tiny_llama_model),
+            device="hpu",
+            format="huggingface",
+        )
+        print(f"loaded model {model}")
+        from neural_compressor.torch.algorithms.mixed_low_precision.modules import HPUMixedPrecisionLinear
+
+        has_hpu_mixed_precision_module = False
+        for name, module in model.named_modules():
+            if isinstance(module, HPUMixedPrecisionLinear):
+                has_hpu_mixed_precision_module = True
+                break
+        assert has_hpu_mixed_precision_module, "loading compressed model failed."
+        model.eval()
+        model = model.to(torch.bfloat16)
+        model = torch.compile(model, backend="hpu_backend")
+        out = model(self.inp.to("hpu"))[0]
+        print(f"out: {out}")
+        assert out is not None, "Loading compressed model failed."
+
+    def test_quant_lm_head(self):
+        model = transformers.AutoModelForCausalLM.from_pretrained(
+            "optimum-intel-internal-testing/tiny-random-Phi3ForCausalLM"
+        )
+        tokenizer = AutoTokenizer.from_pretrained(
+            "optimum-intel-internal-testing/tiny-random-Phi3ForCausalLM", trust_remote_code=True
+        )
+
+        quant_config = AutoRoundConfig(
+            tokenizer=tokenizer,
+            nsamples=32,
+            seqlen=10,
+            iters=1,
+            amp=False,
+            scale_dtype="fp32",
+            quant_lm_head=True,
+            group_size=32,
+        )
+        logger.info(f"Test AutoRound with config {quant_config}")
+        text = "Replace me by any text you'd like."
+        encoded_input = tokenizer(text, return_tensors="pt")
+        model = prepare(model=model, quant_config=quant_config)
+        q_model = convert(model)
+        output = tokenizer.decode(q_model.generate(**encoded_input, max_new_tokens=10)[0])
+        print(output)
+        assert output is not None
+        assert q_model.lm_head.__class__.__name__ in tagert_modules, "packing model failed."
+
+    def test_int4_dtype(self):
+        fp32_model = copy.deepcopy(self.tiny_llama_model)
+        quant_config = AutoRoundConfig(dtype="int4", nsamples=32, seqlen=10, iters=1, amp=False, scale_dtype="fp32")
+        logger.info(f"Test AutoRound with config {quant_config}")
+
+        # prepare + convert API
+        model = prepare(model=fp32_model, quant_config=quant_config)
+
+        run_fn(model, self.dataloader)
+        q_model = convert(model)
+        _ = q_model(self.inp)  # inference
+        assert q_model.model.layers[0].self_attn.k_proj.__class__.__name__ in tagert_modules, "packing model failed."
+
+    def test_autoround_with_quantize_API(self):
+        fp32_model = copy.deepcopy(self.tiny_llama_model)
+
+        quant_config = AutoRoundConfig(scheme="W4A16", seqlen=10, iters=1, use_sym=False, amp=False, scale_dtype="fp32")
+        logger.info(f"Test AutoRound with config {quant_config}")
+
+        # quantize API
+        q_model = quantize(
+            model=fp32_model,
+            quant_config=quant_config,
+            run_fn=run_fn,
+            run_args=(self.dataloader,),
+        )
+        _ = q_model(self.inp)  # inference
+        tagert_modules = ["WQLinear_GEMM"]  # local name shadows the module-level list for this test only
+        assert q_model.model.layers[0].self_attn.k_proj.__class__.__name__ in tagert_modules, "packing model failed."
diff --git a/test/torch/quantization/test_autoround_xpu.py b/test/torch/quantization/test_autoround_xpu.py
new file mode 100644
index 00000000000..babd06a4009
--- /dev/null
+++ b/test/torch/quantization/test_autoround_xpu.py
@@ -0,0 +1,203 @@
+import shutil
+
+import pytest
+import torch
+import transformers
+
+
+def is_xpu_available():  # True only when this torch build has an `xpu` backend that reports a device
+    return hasattr(torch, "xpu") and torch.xpu.is_available()  # guard: a missing attr would break collection
+
+
+from neural_compressor.torch.quantization import (
+    AutoRoundConfig,
+    convert,
+    prepare,
+)
+
+torch.backends.__allow_nonbracketed_mutation_flag = True
+
+try:
+    import auto_round
+
+    auto_round_installed = True
+except ImportError:
+    auto_round_installed = False
+
+
+tagert_modules = ["QuantLinear", "QuantLinearGPTQ", "QuantLinearAWQ"]
+
+
+@torch.no_grad()
+def run_fn(model, dataloader):  # gradient-free forward pass over each batch
+    for sample in dataloader:
+        # Normalize the batch into positional/keyword arguments, then call once.
+        if isinstance(sample, (tuple, list)):
+            args, kwargs = sample, {}
+        else:
+            args, kwargs = ((), sample) if isinstance(sample, dict) else ((sample,), {})
+        model(*args, **kwargs)
+
+
+@pytest.mark.skipif(not is_xpu_available(), reason="XPU is not available")
+@pytest.mark.skipif(not auto_round_installed, reason="auto_round module is not installed")
+class TestAutoRoundXPU:
+    """AutoRound quantization tests that need an Intel XPU device.
+
+    Tests export through the INC API to ``./saved_inc``; ``test_scheme`` also
+    exports through the native AutoRound API to ``./saved_ar``.
+    """
+
+    def teardown_method(self, method):
+        # pytest runs this after every test, so the export directories are removed even
+        # when a test fails or returns early (the FPW8A16 case in test_scheme).
+        shutil.rmtree("./saved_inc", ignore_errors=True)
+        shutil.rmtree("./saved_ar", ignore_errors=True)
+
+    @pytest.mark.parametrize(
+        "scheme", ["W4A16", "W2A16", "W3A16", "W8A16", "MXFP4", "MXFP8", "NVFP4", "FPW8A16", "FP8_STATIC"]
+    )
+    def test_scheme(self, scheme):
+        """INC and native AutoRound exports of OPT-125m must produce identical outputs."""
+        from transformers import AutoModelForCausalLM, AutoTokenizer
+
+        # INC API
+        fp32_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
+        inp = torch.ones([1, 10], dtype=torch.long)
+        tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m", trust_remote_code=True)
+
+        output_dir = "./saved_inc"
+        quant_config = AutoRoundConfig(
+            tokenizer=tokenizer,
+            nsamples=32,
+            seqlen=10,
+            iters=1,
+            device_map="xpu",
+            scheme=scheme,
+            export_format="auto_round",
+            output_dir=output_dir,  # default is "temp_auto_round"
+        )
+
+        # quantizer execute
+        model = prepare(model=fp32_model, quant_config=quant_config)
+        convert(model)
+        if scheme in ["FPW8A16"]:  # FPW8A16 loading not supported yet
+            return
+        inc_model = AutoModelForCausalLM.from_pretrained(
+            output_dir,
+        )
+        out = inc_model(inp)[0]
+
+        # AutoRound API
+        from auto_round import AutoRound
+
+        fp32_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
+        tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m", trust_remote_code=True)
+        ar = AutoRound(
+            model=fp32_model,
+            tokenizer=tokenizer,
+            nsamples=32,
+            seqlen=10,
+            iters=1,
+            device_map="xpu",
+            scheme=scheme,
+        )
+        quantized_model_path = "./saved_ar"
+        ar.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round")
+        model = AutoModelForCausalLM.from_pretrained(
+            quantized_model_path,
+        )
+        out_ar = model(inp)[0]
+        assert torch.all(out_ar.eq(out))
+
+    @pytest.mark.parametrize("format", ["auto_awq", "auto_gptq", "llm_compressor"])
+    def test_format(self, format):
+        """Quantize OPT-125m and export it in each parametrized export format."""
+        from transformers import AutoModelForCausalLM, AutoTokenizer
+
+        # llm_compressor is exercised with MXFP4, the other formats with W4A16
+        scheme = "W4A16" if format != "llm_compressor" else "MXFP4"
+        fp32_model = AutoModelForCausalLM.from_pretrained(
+            "facebook/opt-125m",
+        )
+        tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m", trust_remote_code=True)
+
+        output_dir = "./saved_inc"
+        quant_config = AutoRoundConfig(
+            tokenizer=tokenizer,
+            nsamples=32,
+            seqlen=10,
+            iters=1,
+            device_map="xpu",
+            scheme=scheme,
+            export_format=format,
+            output_dir=output_dir,  # default is "temp_auto_round"
+        )
+
+        # quantizer execute
+        model = prepare(model=fp32_model, quant_config=quant_config)
+        inc_model = convert(model)
+        assert inc_model is not None
+
+    def test_vlm_model(self):
+        """Quantize Qwen2-VL-2B-Instruct and reload the exported checkpoint."""
+        from transformers import AutoProcessor, AutoTokenizer, Qwen2VLForConditionalGeneration
+
+        scheme = "W4A16"
+        model_name = "Qwen/Qwen2-VL-2B-Instruct"
+        fp32_model = Qwen2VLForConditionalGeneration.from_pretrained(
+            model_name,
+        )
+        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+        processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
+
+        output_dir = "./saved_inc"
+        quant_config = AutoRoundConfig(
+            tokenizer=tokenizer,
+            nsamples=1,
+            iters=1,
+            seqlen=10,
+            # quant_nontext_module=True,
+            processor=processor,
+            device_map="xpu:0",
+            scheme=scheme,
+            export_format="auto_round",
+            output_dir=output_dir,  # default is "temp_auto_round"
+        )
+
+        # quantizer execute
+        model = prepare(model=fp32_model, quant_config=quant_config)
+        convert(model)
+        inc_model = Qwen2VLForConditionalGeneration.from_pretrained(
+            output_dir,
+        )
+        assert inc_model is not None
+
+    def test_quant_lm_head(self):
+        """Quantize Qwen3-8B with RTN (iters=0), lm_head included."""
+        from transformers import AutoModelForCausalLM, AutoTokenizer
+
+        scheme = "W4A16"
+        model_name = "Qwen/Qwen3-8B"
+        fp32_model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+        )
+        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+
+        output_dir = "./saved_inc"
+        quant_config = AutoRoundConfig(
+            tokenizer=tokenizer,
+            nsamples=1,
+            seqlen=10,
+            iters=0,  # rtn
+            device_map="xpu",
+            scheme=scheme,
+            export_format="auto_round",
+            output_dir=output_dir,  # default is "temp_auto_round"
+            quant_lm_head=True,
+        )
+
+        # quantizer execute
+        model = prepare(model=fp32_model, quant_config=quant_config)
+        inc_model = convert(model)
+        assert inc_model is not None
diff --git a/test/torch/requirements.txt b/test/torch/requirements.txt
index 9c4c989f2b7..d8524dc7efb 100644
--- a/test/torch/requirements.txt
+++ b/test/torch/requirements.txt
@@ -1,4 +1,4 @@
-auto-round @ git+https://github.com/intel/auto-round.git@v0.10.1rc
+auto-round @ git+https://github.com/intel/auto-round.git@main
 auto-round-lib
 compressed-tensors
 datasets
diff --git a/test/torch/requirements_xpu.txt b/test/torch/requirements_xpu.txt
new file mode 100644
index 00000000000..6a7670de8c5
--- /dev/null
+++ b/test/torch/requirements_xpu.txt
@@ -0,0 +1,3 @@
+auto-round @ git+https://github.com/intel/auto-round.git@main
+auto-round-lib
+compressed-tensors