From 759dca37eeb9480daeaafdfe337d37d778755b69 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Thu, 18 Sep 2025 16:15:21 +0800 Subject: [PATCH 01/28] TEST: add api evaluate --- .github/workflows/api_eva.yml | 137 +++++++++++++++ autotest/config.yaml | 22 +++ autotest/conftest.py | 10 +- autotest/evaluate/eval_config_base.py | 46 +++++ autotest/evaluate/eval_config_chat.py | 41 +++++ .../evaluate/test_api_evaluate_pytorch.py | 97 +++++++++++ .../evaluate/test_api_evaluate_turbomind.py | 97 +++++++++++ autotest/utils/config_utils.py | 87 +++++++++- autotest/utils/evaluate_utils.py | 163 ++++++++++++++++++ autotest/utils/run_restful_chat.py | 4 +- 10 files changed, 701 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/api_eva.yml create mode 100644 autotest/evaluate/eval_config_base.py create mode 100644 autotest/evaluate/eval_config_chat.py create mode 100644 autotest/evaluate/test_api_evaluate_pytorch.py create mode 100644 autotest/evaluate/test_api_evaluate_turbomind.py create mode 100644 autotest/utils/evaluate_utils.py diff --git a/.github/workflows/api_eva.yml b/.github/workflows/api_eva.yml new file mode 100644 index 0000000000..782158bea4 --- /dev/null +++ b/.github/workflows/api_eva.yml @@ -0,0 +1,137 @@ +name: api_eva + +on: + workflow_dispatch: + inputs: + repo_org: + required: false + description: 'Tested repository organization name. Default is InternLM/lmdeploy' + type: string + default: 'InternLM/lmdeploy' + repo_ref: + required: false + description: 'Set branch or tag or commit id. Default is "main"' + type: string + default: 'main' + backend: + required: true + description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"' + type: string + default: "['turbomind', 'pytorch']" + + +env: + HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache + HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai + OUTPUT_FOLDER: cuda11.8_dist_${{ github.run_id }} + ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true + REPORT_DIR: /nvme/qa_test_models/test-reports/${{ github.run_id }} + COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy + FAIL_CONFIG: '--lf' + TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ github.run_id }} + OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy + OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt + DEEPSEEK_VL: /nvme/qa_test_models/offline_pkg/DeepSeek-VL + +jobs: + linux-build: + if: ${{ !cancelled() }} + strategy: + matrix: + pyver: [py310] + runs-on: ubuntu-latest + env: + PYTHON_VERSION: ${{ matrix.pyver }} + PLAT_NAME: manylinux2014_x86_64 + DOCKER_TAG: cuda12.4 + OUTPUT_FOLDER: cuda12.4_dist_${{ github.run_id }} + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Build + run: | + echo ${PYTHON_VERSION} + echo ${PLAT_NAME} + echo ${DOCKER_TAG} + echo ${OUTPUT_FOLDER} + echo ${GITHUB_RUN_ID} + # remove -it + sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh + bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER} + - name: Upload Artifacts + uses: actions/upload-artifact@v4 + with: + if-no-files-found: error + path: builder/manywheel/${{ env.OUTPUT_FOLDER }} + retention-days: 1 + name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }} + + test_evaluation: + needs: linux-build + if: ${{ 
!cancelled() }} + runs-on: [self-hosted, test-140] + timeout-minutes: 2400 + strategy: + fail-fast: false + matrix: + backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} + container: + image: openmmlab/lmdeploy:latest-cu12.8 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/github-actions/resources:/root/resources + - /nvme/github-actions/opencompass-data:/root/opencompass-data + - /nvme/qa_test_models/evaluation-reports:/root/evaluation-reports + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /mnt/bigdisk:/mnt/bigdisk + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + - /mnt/187:/mnt/187 + steps: + - name: Create and change to _wk directory + run: | + echo "Working directory set to: $(pwd)" + - name: Clone repository + uses: actions/checkout@v2 + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Download Artifacts + uses: actions/download-artifact@v4 + with: + name: my-artifact-${{ github.run_id }}-py310 + - name: Install lmdeploy - dependency + run: | + python3 -m pip install -r requirements_cuda.txt + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Install opencompass + run: | + python3 -m pip install opencompass + - name: Check env + run: | + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + mkdir -p ${{ env.REPORT_DIR }}/.pytest_cache + ln -s ${{ env.REPORT_DIR }}/.pytest_cache autotest + - name: Setup paths for evaluation + if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') + run: | + overall_exit=0 + ln -s /mnt/187/opencompass-data/data ./data + pytest autotest/evaluate/test_api_evaluate_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + pytest autotest/evaluate/test_api_evaluate_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? 
+ exit $overall_exit + - name: Clear workspace + if: always() + run: | + export workdir=$(pwd) + rm -rf $workdir/* \ No newline at end of file diff --git a/autotest/config.yaml b/autotest/config.yaml index fab9a5af89..8dddb7ecdc 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -6,6 +6,7 @@ benchmark_path: /nvme/qa_test_models/benchmark-reports dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json env_tag: a100 + tp_config: Llama-4-Scout-17B-16E-Instruct: 4 Meta-Llama-3-1-70B-Instruct: 4 @@ -22,6 +23,7 @@ tp_config: Qwen3-32B: 2 Qwen3-30B-A3B: 2 Qwen3-30B-A3B-Base: 2 + Qwen2.5-32B-Instruct : 2 Qwen2.5-72B-Instruct: 4 Qwen2.5-VL-32B-Instruct: 2 DeepSeek-V2-Lite-Chat: 2 @@ -37,6 +39,7 @@ tp_config: gpt-oss-120b: 4 + turbomind_chat_model: - meta-llama/Llama-3.2-1B-Instruct - meta-llama/Llama-3.2-3B-Instruct @@ -72,6 +75,7 @@ turbomind_chat_model: - Qwen/Qwen2.5-7B-Instruct - Qwen/Qwen2.5-32B-Instruct - Qwen/Qwen2.5-72B-Instruct + - Qwen/Qwen2-7B-Instruct - Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4 - Qwen/Qwen2.5-VL-7B-Instruct - Qwen/Qwen2.5-VL-32B-Instruct @@ -134,6 +138,8 @@ pytorch_chat_model: - Qwen/Qwen2.5-7B-Instruct - Qwen/Qwen2.5-32B-Instruct - Qwen/Qwen2.5-72B-Instruct + - Qwen/Qwen2-7B-Instruct + - Qwen/Qwen1.5-7B-Chat - Qwen/Qwen1.5-MoE-A2.7B-Chat - Qwen/Qwen2.5-VL-7B-Instruct - Qwen/Qwen2.5-VL-32B-Instruct @@ -370,3 +376,19 @@ benchmark_model: - deepseek-ai/DeepSeek-V2-Lite-Chat - openai/gpt-oss-20b - openai/gpt-oss-120b + + +evaluate_model: + - Qwen/Qwen1.5-7B-Chat + - google/gemma-2-9b-it + - google/gemma-2-27b-it + - internlm/internlm2_5-7b-chat + - internlm/internlm3-8b-instruct + - meta-llama/Meta-Llama-3-8B-Instruct + - meta-llama/Meta-Llama-3-1-8B-Instruct + - Qwen/Qwen2-7B-Instruct + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2.5-32B-Instruct + - meta-llama/Llama-2-7b-chat-hf + - Qwen/Qwen1.5-MoE-A2.7B-Chat + - internlm/internlm2_5-20b-chat diff --git a/autotest/conftest.py b/autotest/conftest.py index dee954d2cb..36392ac1c1 100644 --- a/autotest/conftest.py +++ b/autotest/conftest.py @@ -1,3 +1,4 @@ +import copy import os import pytest @@ -23,7 +24,14 @@ def config(): with open(config_path) as f: env_config = yaml.load(f.read(), Loader=yaml.SafeLoader) - return env_config + + config_copy = copy.deepcopy(env_config) + github_run_id = os.environ.get('GITHUB_RUN_ID', 'local_run') + if 'log_path' in config_copy: + config_copy['log_path'] = os.path.join(config_copy['log_path'], str(github_run_id)) + os.makedirs(config_copy['log_path'], exist_ok=True) + + return config_copy @pytest.fixture(scope='session') diff --git a/autotest/evaluate/eval_config_base.py b/autotest/evaluate/eval_config_base.py new file mode 100644 index 0000000000..80a68fb274 --- /dev/null +++ b/autotest/evaluate/eval_config_base.py @@ -0,0 +1,46 @@ +from mmengine.config import read_base +from opencompass.models import OpenAISDK + +with read_base(): + from opencompass.configs.datasets.gpqa.gpqa_few_shot_ppl_4b5a83 import gpqa_datasets # noqa: F401, E501 + from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import gsm8k_datasets # noqa: F401, E501 + from opencompass.configs.datasets.race.race_few_shot_ppl import race_datasets # noqa: F401, E501 + from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \ + winogrande_datasets # noqa: F401, E501 + +race_datasets = [race_datasets[1]] +datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], []) + +MODEL_NAME = 'internlm2_5-1_8b' +MODEL_PATH = 
'/nvme/qa_test_models/internlm/internlm2_5-1_8b' +API_BASE = 'http://127.0.0.1:23333/v1' + +api_meta_template = dict(round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), +]) + +models = [ + dict( + type=OpenAISDK, + abbr=f'{MODEL_NAME}-lmdeploy-api', + openai_api_base=API_BASE, + key='EMPTY', + path=MODEL_PATH, + meta_template=api_meta_template, + max_out_len=2048, + batch_size=16, + run_cfg=dict(num_gpus=1, communicator='native'), + temperature=0.1, + ) +] + +summarizer = dict( + dataset_abbrs=[ + ['gsm8k', 'accuracy'], + ['GPQA_diamond', 'accuracy'], + ['race-high', 'accuracy'], + ['winogrande', 'accuracy'], + ], + summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []), +) diff --git a/autotest/evaluate/eval_config_chat.py b/autotest/evaluate/eval_config_chat.py new file mode 100644 index 0000000000..ac7d1ed54c --- /dev/null +++ b/autotest/evaluate/eval_config_chat.py @@ -0,0 +1,41 @@ +from mmengine.config import read_base +from opencompass.models import OpenAISDK + +with read_base(): + from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets # noqa: F401, E501 + from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import mmlu_datasets # noqa: F401, E501 + from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups # noqa: F401, E501 + +datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], []) + + +MODEL_NAME = 'Qwen2-7B-Instruct' +MODEL_PATH = '/nvme/qa_test_models/Qwen/Qwen2-7B-Instruct' +API_BASE = 'http://127.0.0.1:65525/v1' + + +api_meta_template = dict(round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), +]) + +models = [ + dict( + type=OpenAISDK, + abbr=f'{MODEL_NAME}-lmdeploy-api', + openai_api_base=API_BASE, + key='EMPTY', + path=MODEL_PATH, + meta_template=api_meta_template, + max_out_len=2048, + temperature=0.1, + ) +] + +summarizer = dict( + dataset_abbrs=[ + ['mmlu', 'naive_average'], + ['gsm8k', 'accuracy'], + ], + summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []), +) diff --git a/autotest/evaluate/test_api_evaluate_pytorch.py b/autotest/evaluate/test_api_evaluate_pytorch.py new file mode 100644 index 0000000000..7be55b505d --- /dev/null +++ b/autotest/evaluate/test_api_evaluate_pytorch.py @@ -0,0 +1,97 @@ +import pytest +from utils.config_utils import get_evaluate_pytorch_model_list, get_workerid +from utils.evaluate_utils import restful_test +from utils.run_restful_chat import start_restful_api, stop_restful_api + +DEFAULT_PORT = 23333 + + +@pytest.fixture(scope='function', autouse=True) +def prepare_environment(request, config, worker_id): + param = request.param + model = param['model'] + backend = param['backend'] + print(param['model'], param['backend'], param['extra']) + model_path = config.get('model_path') + '/' + model + pid, startRes = start_restful_api(config, param, model, model_path, backend, worker_id) + yield param + stop_restful_api(pid, startRes, param) + + +def getModelList(tp_num): + model_list = get_evaluate_pytorch_model_list(tp_num, kvint_list=[4, 8]) + new_model_list = [] + for model in model_list: + if model['backend'] == 'pytorch': + model['extra'] += '--cache-max-entry-count 0.8' + elif 'Llama-2' in model['model']: + model['extra'] += '--cache-max-entry-count 0.95' + elif 'internlm2' in model['model']: + model['extra'] += '--cache-max-entry-count 0.9' + model['cuda_prefix'] = None + new_model_list.append(model) + return 
new_model_list + + +@pytest.mark.gpu_num_1 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('prepare_environment', getModelList(tp_num=1), indirect=True) +def test_restful_tp1(config, run_id, prepare_environment, worker_id): + if get_workerid(worker_id) is None: + result, msg = restful_test(config, run_id, prepare_environment, worker_id=worker_id) + else: + result, msg = restful_test(config, + run_id, + prepare_environment, + worker_id=worker_id, + port=DEFAULT_PORT + get_workerid(worker_id)) + + assert result, msg + + +@pytest.mark.gpu_num_2 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('prepare_environment', getModelList(tp_num=2), indirect=True) +def test_restful_tp2(config, run_id, prepare_environment, worker_id): + if get_workerid(worker_id) is None: + result, msg = restful_test(config, run_id, prepare_environment, worker_id=worker_id) + else: + result, msg = restful_test(config, + run_id, + prepare_environment, + worker_id=worker_id, + port=DEFAULT_PORT + get_workerid(worker_id)) + + assert result, msg + + +@pytest.mark.gpu_num_4 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('prepare_environment', getModelList(tp_num=4), indirect=True) +def test_restful_tp4(config, run_id, prepare_environment, worker_id): + if get_workerid(worker_id) is None: + result, msg = restful_test(config, run_id, prepare_environment, worker_id=worker_id) + else: + result, msg = restful_test(config, + run_id, + prepare_environment, + worker_id=worker_id, + port=DEFAULT_PORT + get_workerid(worker_id)) + + assert result, msg + + +@pytest.mark.gpu_num_8 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('prepare_environment', getModelList(tp_num=8), indirect=True) +def test_restful_tp8(config, run_id, prepare_environment, worker_id): + if get_workerid(worker_id) is None: + result, msg = restful_test(config, run_id, prepare_environment, worker_id=worker_id) + else: + result, msg = restful_test(config, + run_id, + prepare_environment, + worker_id=worker_id, + port=DEFAULT_PORT + get_workerid(worker_id)) + + assert result, msg diff --git a/autotest/evaluate/test_api_evaluate_turbomind.py b/autotest/evaluate/test_api_evaluate_turbomind.py new file mode 100644 index 0000000000..943d7eb78c --- /dev/null +++ b/autotest/evaluate/test_api_evaluate_turbomind.py @@ -0,0 +1,97 @@ +import pytest +from utils.config_utils import get_evaluate_turbomind_model_list, get_workerid +from utils.evaluate_utils import restful_test +from utils.run_restful_chat import start_restful_api, stop_restful_api + +DEFAULT_PORT = 23333 + + +@pytest.fixture(scope='function', autouse=True) +def prepare_environment(request, config, worker_id): + param = request.param + model = param['model'] + backend = param['backend'] + print(param['model'], param['backend'], param['extra']) + model_path = config.get('model_path') + '/' + model + pid, startRes = start_restful_api(config, param, model, model_path, backend, worker_id) + yield param + stop_restful_api(pid, startRes, param) + + +def getModelList(tp_num): + model_list = get_evaluate_turbomind_model_list(tp_num, kvint_list=[4, 8]) + new_model_list = [] + for model in model_list: + if model['backend'] == 'pytorch': + model['extra'] += '--cache-max-entry-count 0.8' + elif 'Llama-2' in model['model']: + model['extra'] += '--cache-max-entry-count 0.95' + elif 'internlm2' in model['model']: + model['extra'] += '--cache-max-entry-count 0.9' + model['cuda_prefix'] = None + new_model_list.append(model) + return new_model_list + + +@pytest.mark.gpu_num_1 
+@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('prepare_environment', getModelList(tp_num=1), indirect=True) +def test_restful_tp1(config, run_id, prepare_environment, worker_id): + if get_workerid(worker_id) is None: + result, msg = restful_test(config, run_id, prepare_environment, worker_id=worker_id) + else: + result, msg = restful_test(config, + run_id, + prepare_environment, + worker_id=worker_id, + port=DEFAULT_PORT + get_workerid(worker_id)) + + assert result, msg + + +@pytest.mark.gpu_num_2 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('prepare_environment', getModelList(tp_num=2), indirect=True) +def test_restful_tp2(config, run_id, prepare_environment, worker_id): + if get_workerid(worker_id) is None: + result, msg = restful_test(config, run_id, prepare_environment, worker_id=worker_id) + else: + result, msg = restful_test(config, + run_id, + prepare_environment, + worker_id=worker_id, + port=DEFAULT_PORT + get_workerid(worker_id)) + + assert result, msg + + +@pytest.mark.gpu_num_4 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('prepare_environment', getModelList(tp_num=4), indirect=True) +def test_restful_tp4(config, run_id, prepare_environment, worker_id): + if get_workerid(worker_id) is None: + result, msg = restful_test(config, run_id, prepare_environment, worker_id=worker_id) + else: + result, msg = restful_test(config, + run_id, + prepare_environment, + worker_id=worker_id, + port=DEFAULT_PORT + get_workerid(worker_id)) + + assert result, msg + + +@pytest.mark.gpu_num_8 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('prepare_environment', getModelList(tp_num=8), indirect=True) +def test_restful_tp8(config, run_id, prepare_environment, worker_id): + if get_workerid(worker_id) is None: + result, msg = restful_test(config, run_id, prepare_environment, worker_id=worker_id) + else: + result, msg = restful_test(config, + run_id, + prepare_environment, + worker_id=worker_id, + port=DEFAULT_PORT + get_workerid(worker_id)) + + assert result, msg diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index 0df8858b2c..9a403655ec 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -176,7 +176,14 @@ def get_config(): with open(config_path) as f: config = yaml.load(f.read(), Loader=yaml.SafeLoader) - return config + + config_copy = copy.deepcopy(config) + github_run_id = os.environ.get('GITHUB_RUN_ID', 'local_run') + if 'log_path' in config_copy: + config_copy['log_path'] = os.path.join(config_copy['log_path'], str(github_run_id)) + os.makedirs(config_copy['log_path'], exist_ok=True) + + return config_copy def get_benchmark_model_list(tp_num, is_longtext: bool = False, kvint_list: list = []): @@ -227,6 +234,84 @@ def get_benchmark_model_list(tp_num, is_longtext: bool = False, kvint_list: list return result +def get_evaluate_turbomind_model_list(tp_num, is_longtext: bool = False, kvint_list: list = []): + config = get_config() + + if is_longtext: + case_list_base = [item for item in config.get('longtext_model', [])] + else: + case_list_base = config.get('evaluate_model', config.get('benchmark_model', [])) + quatization_case_config = config.get('turbomind_quatization') + + case_list = copy.deepcopy(case_list_base) + for key in case_list_base: + if key in config.get('turbomind_chat_model') and key not in quatization_case_config.get( + 'no_awq') and not is_quantization_model(key): + case_list.append(key + '-inner-4bits') + + model_list = [item for item in case_list if get_tp_num(config, item) == 
tp_num] + + result = [] + if len(model_list) > 0: + + communicators = ['native', 'nccl'] + for communicator in communicators: + for item in model_list: + if item.replace('-inner-4bits', '') in config.get('turbomind_chat_model') or item.replace( + '-inner-4bits', '') in config.get('turbomind_base_model'): + model_config = { + 'model': item, + 'backend': 'turbomind', + 'communicator': communicator, + 'quant_policy': 0, + 'tp_num': tp_num, + 'extra': f'--communicator {communicator} ' + } + result.append(model_config) + + for kvint in kvint_list: + for item in model_list: + if item.replace('-inner-4bits', '') in config.get('turbomind_chat_model') and item.replace( + '-inner-4bits', '') not in quatization_case_config.get('no_kvint' + str(kvint)): + model_config = { + 'model': item, + 'backend': 'turbomind', + 'quant_policy': kvint, + 'tp_num': tp_num, + 'extra': '' + } + result.append(model_config) + return result + + +def get_evaluate_pytorch_model_list(tp_num, is_longtext: bool = False, kvint_list: list = []): + config = get_config() + + if is_longtext: + case_list_base = [item for item in config.get('longtext_model', [])] + else: + case_list_base = config.get('evaluate_model', config.get('benchmark_model', [])) + pytorch_quatization_case_config = config.get('pytorch_quatization') + + case_list = copy.deepcopy(case_list_base) + + for key in case_list_base: + if key in config.get('pytorch_chat_model') and key in pytorch_quatization_case_config.get( + 'w8a8') and not is_quantization_model(key): + case_list.append(key + '-inner-w8a8') + + model_list = [item for item in case_list if get_tp_num(config, item) == tp_num] + + result = [] + if len(model_list) > 0: + for item in model_list: + if '4bits' not in item and (item.replace('-inner-w8a8', '') in config.get('pytorch_chat_model') + or item.replace('-inner-w8a8', '') in config.get('pytorch_base_model')): + model_config = {'model': item, 'backend': 'pytorch', 'tp_num': tp_num, 'extra': ''} + result.append(model_config) + return result + + def get_workerid(worker_id): if worker_id is None or 'gw' not in worker_id: return None diff --git a/autotest/utils/evaluate_utils.py b/autotest/utils/evaluate_utils.py new file mode 100644 index 0000000000..edc352b94f --- /dev/null +++ b/autotest/utils/evaluate_utils.py @@ -0,0 +1,163 @@ +import os +import subprocess + +from mmengine.config import Config + +DEFAULT_PORT = 23333 + + +def get_model_type(model_name): + model_name_lower = model_name.lower() + + chat_patterns = [ + 'chat', + 'instruct', + 'gemma', + 'llama3', + 'llama2', + 'llama', + ] + if any(pattern in model_name_lower for pattern in chat_patterns): + return 'chat' + else: + return 'base' + + +def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFAULT_PORT): + try: + model_name = prepare_environment['model'] + backend_type = prepare_environment['backend'] + tp_num = prepare_environment.get('tp_num', 1) + communicator = prepare_environment.get('communicator', 'native') + quant_policy = prepare_environment.get('quant_policy', 0) + + model_type = get_model_type(model_name) + print(f'Model {model_name} identified as {model_type} model') + + current_dir = os.path.dirname(os.path.abspath(__file__)) + parent_dir = os.path.dirname(current_dir) + + if model_type == 'base': + config_file = os.path.join(parent_dir, 'evaluate/eval_config_base.py') + else: + config_file = os.path.join(parent_dir, 'evaluate/eval_config_chat.py') + + model_base_path = config.get('model_path', '/nvme/qa_test_models') + model_path = 
os.path.join(model_base_path, model_name) + + print(f'Starting OpenCompass evaluation for model: {model_name}') + print(f'Model path: {model_path}') + print(f'Backend: {backend_type}') + print(f'Model type: {model_type}') + print(f'Config file: {config_file}') + + log_path = config.get('log_path', '/nvme/qa_test_models/autotest_model/log') + os.makedirs(log_path, exist_ok=True) + + original_cwd = os.getcwd() + work_dir = os.path.join( + log_path, + f"wk_{backend_type}_{model_name.replace('/', '_')}_{model_type}_{communicator}_{worker_id}_{quant_policy}") + os.makedirs(work_dir, exist_ok=True) + + try: + + if not os.path.exists(config_file): + return False, f'Config file {config_file} not found in any expected location' + + cfg = Config.fromfile(config_file) + + cfg.MODEL_NAME = model_name + cfg.MODEL_PATH = model_path + cfg.API_BASE = f'http://127.0.0.1:{port}/v1' + + if cfg.models and len(cfg.models) > 0: + model_cfg = cfg.models[0] + model_cfg['abbr'] = f'{model_name}-lmdeploy-api' + model_cfg['openai_api_base'] = f'http://127.0.0.1:{port}/v1' + model_cfg['path'] = model_path + if 'backend' in model_cfg: + model_cfg['backend'] = backend_type + + if 'engine_config' in model_cfg and 'communicator' in model_cfg['engine_config']: + model_cfg['engine_config']['communicator'] = communicator + + temp_config_file = f'temp_{model_name.replace("/", "_")}_{os.getpid()}.py' + temp_config_path = os.path.join(log_path, temp_config_file) + + cfg.dump(temp_config_path) + print(f'Modified config saved to: {temp_config_path}') + + cmd = ['opencompass', temp_config_path, '--reuse', '--max-num-workers', '16', '-w', work_dir] + print(f"Running command: {' '.join(cmd)}") + print(f'Work directory: {work_dir}') + + result = subprocess.run(cmd, capture_output=True, text=True, timeout=72000) + + stdout_output = result.stdout + stderr_output = result.stderr + + log_filename = (f'eval_{backend_type}_' + f"{model_name.replace('/', '_')}_" + f'{model_type}_' + f'{communicator}_' + f'{worker_id}_' + f'{quant_policy}.log') + log_file = os.path.join(log_path, log_filename) + + with open(log_file, 'w', encoding='utf-8') as f: + f.write(f'Model: {model_name}\n') + f.write(f'Model type: {model_type}\n') + f.write(f'Config file: {temp_config_file}\n') + f.write(f'Backend: {backend_type}\n') + f.write(f'TP Num: {tp_num}\n') + f.write(f'Command: {" ".join(cmd)}\n') + f.write(f'Work directory: {work_dir}\n') + f.write(f'STDOUT:\n{stdout_output}\n') + if stderr_output: + f.write(f'STDERR:\n{stderr_output}\n') + f.write(f'Return code: {result.returncode}\n') + + print(f'STDOUT:\n{stdout_output}') + if stderr_output: + print(f'STDERR:\n{stderr_output}') + print(f'Return code: {result.returncode}') + + evaluation_failed = False + error_keywords = ['ERROR -', 'fail, see', 'task .* fail'] + for line in stdout_output.split('\n'): + if any(keyword in line for keyword in error_keywords): + evaluation_failed = True + break + + if result.returncode == 0 and not evaluation_failed: + return True, f'Evaluation completed successfully for {model_name} ({model_type})' + else: + error_msg = f'Evaluation failed for {model_name} ({model_type}) ' + if result.returncode != 0: + error_msg += f'with return code {result.returncode}' + elif evaluation_failed: + error_msg += 'with internal errors detected in logs' + + if stderr_output: + error_msg += f'\nSTDERR: {stderr_output}' + else: + error_lines = [] + for line in stdout_output.split('\n'): + if any(keyword in line for keyword in error_keywords): + error_lines.append(line) + if error_lines: + 
error_msg += f'\nLog errors: {" | ".join(error_lines[:3])}' + + return False, error_msg + + finally: + os.chdir(original_cwd) + print(f'Returned to directory: {original_cwd}') + + except subprocess.TimeoutExpired: + timeout_msg = (f'Evaluation timed out for {model_name} ' + f'after 7200 seconds') + return False, timeout_msg + except Exception as e: + return False, f'Error during evaluation for {model_name}: {str(e)}' \ No newline at end of file diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index 5aca937681..cc67c559cc 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -1,3 +1,4 @@ +import datetime import json import os import subprocess @@ -84,7 +85,8 @@ def start_restful_api(config, param, model, model_path, backend_type, worker_id) if str(config.get('env_tag')) == '3090' or str(config.get('env_tag')) == '5080': cmd += ' --cache-max-entry-count 0.5' - start_log = os.path.join(log_path, 'start_restful_' + model.split('/')[1] + worker_id + '.log') + timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + start_log = os.path.join(log_path, 'start_restful_' + model.split('/')[1] + worker_id + '_' + timestamp + '.log') print('reproduce command restful: ' + cmd) From a955b7da069b3edbafdce83bc9f4af8eaffcdfb5 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Thu, 18 Sep 2025 17:02:14 +0800 Subject: [PATCH 02/28] TEST: rm qwen1.5_7b test --- .github/workflows/api_eva.yml | 2 +- autotest/config.yaml | 2 -- autotest/evaluate/eval_config_base.py | 2 +- autotest/evaluate/eval_config_chat.py | 4 +--- autotest/evaluate/test_api_evaluate_pytorch.py | 1 - autotest/evaluate/test_api_evaluate_turbomind.py | 1 - autotest/utils/evaluate_utils.py | 2 +- 7 files changed, 4 insertions(+), 10 deletions(-) diff --git a/.github/workflows/api_eva.yml b/.github/workflows/api_eva.yml index 782158bea4..91e596e648 100644 --- a/.github/workflows/api_eva.yml +++ b/.github/workflows/api_eva.yml @@ -134,4 +134,4 @@ jobs: if: always() run: | export workdir=$(pwd) - rm -rf $workdir/* \ No newline at end of file + rm -rf $workdir/* diff --git a/autotest/config.yaml b/autotest/config.yaml index 8dddb7ecdc..637da0c6e3 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -139,7 +139,6 @@ pytorch_chat_model: - Qwen/Qwen2.5-32B-Instruct - Qwen/Qwen2.5-72B-Instruct - Qwen/Qwen2-7B-Instruct - - Qwen/Qwen1.5-7B-Chat - Qwen/Qwen1.5-MoE-A2.7B-Chat - Qwen/Qwen2.5-VL-7B-Instruct - Qwen/Qwen2.5-VL-32B-Instruct @@ -379,7 +378,6 @@ benchmark_model: evaluate_model: - - Qwen/Qwen1.5-7B-Chat - google/gemma-2-9b-it - google/gemma-2-27b-it - internlm/internlm2_5-7b-chat diff --git a/autotest/evaluate/eval_config_base.py b/autotest/evaluate/eval_config_base.py index 80a68fb274..6a193b0402 100644 --- a/autotest/evaluate/eval_config_base.py +++ b/autotest/evaluate/eval_config_base.py @@ -25,7 +25,7 @@ type=OpenAISDK, abbr=f'{MODEL_NAME}-lmdeploy-api', openai_api_base=API_BASE, - key='EMPTY', + key='EMPTY', path=MODEL_PATH, meta_template=api_meta_template, max_out_len=2048, diff --git a/autotest/evaluate/eval_config_chat.py b/autotest/evaluate/eval_config_chat.py index ac7d1ed54c..c24cbb66b5 100644 --- a/autotest/evaluate/eval_config_chat.py +++ b/autotest/evaluate/eval_config_chat.py @@ -8,12 +8,10 @@ datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], []) - MODEL_NAME = 'Qwen2-7B-Instruct' MODEL_PATH = '/nvme/qa_test_models/Qwen/Qwen2-7B-Instruct' API_BASE = 'http://127.0.0.1:65525/v1' - api_meta_template = 
dict(round=[ dict(role='HUMAN', api_role='HUMAN'), dict(role='BOT', api_role='BOT', generate=True), @@ -24,7 +22,7 @@ type=OpenAISDK, abbr=f'{MODEL_NAME}-lmdeploy-api', openai_api_base=API_BASE, - key='EMPTY', + key='EMPTY', path=MODEL_PATH, meta_template=api_meta_template, max_out_len=2048, diff --git a/autotest/evaluate/test_api_evaluate_pytorch.py b/autotest/evaluate/test_api_evaluate_pytorch.py index 7be55b505d..d5c6f99447 100644 --- a/autotest/evaluate/test_api_evaluate_pytorch.py +++ b/autotest/evaluate/test_api_evaluate_pytorch.py @@ -11,7 +11,6 @@ def prepare_environment(request, config, worker_id): param = request.param model = param['model'] backend = param['backend'] - print(param['model'], param['backend'], param['extra']) model_path = config.get('model_path') + '/' + model pid, startRes = start_restful_api(config, param, model, model_path, backend, worker_id) yield param diff --git a/autotest/evaluate/test_api_evaluate_turbomind.py b/autotest/evaluate/test_api_evaluate_turbomind.py index 943d7eb78c..70c6809ca4 100644 --- a/autotest/evaluate/test_api_evaluate_turbomind.py +++ b/autotest/evaluate/test_api_evaluate_turbomind.py @@ -11,7 +11,6 @@ def prepare_environment(request, config, worker_id): param = request.param model = param['model'] backend = param['backend'] - print(param['model'], param['backend'], param['extra']) model_path = config.get('model_path') + '/' + model pid, startRes = start_restful_api(config, param, model, model_path, backend, worker_id) yield param diff --git a/autotest/utils/evaluate_utils.py b/autotest/utils/evaluate_utils.py index edc352b94f..45d1225cc7 100644 --- a/autotest/utils/evaluate_utils.py +++ b/autotest/utils/evaluate_utils.py @@ -160,4 +160,4 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA f'after 7200 seconds') return False, timeout_msg except Exception as e: - return False, f'Error during evaluation for {model_name}: {str(e)}' \ No newline at end of file + return False, f'Error during evaluation for {model_name}: {str(e)}' From aa8a0bd656db565adba0e3f60d5d63f7e2cfdb8f Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Thu, 18 Sep 2025 17:49:43 +0800 Subject: [PATCH 03/28] TEST: add evaluate result to github --- autotest/config.yaml | 3 - autotest/evaluate/eval_config_base.py | 46 -------- autotest/evaluate/eval_config_chat.py | 6 +- .../evaluate/test_api_evaluate_pytorch.py | 6 - .../evaluate/test_api_evaluate_turbomind.py | 6 - autotest/utils/config_utils.py | 5 +- autotest/utils/evaluate_utils.py | 105 ++++++++++++------ 7 files changed, 79 insertions(+), 98 deletions(-) delete mode 100644 autotest/evaluate/eval_config_base.py diff --git a/autotest/config.yaml b/autotest/config.yaml index 637da0c6e3..379fd666bb 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -75,7 +75,6 @@ turbomind_chat_model: - Qwen/Qwen2.5-7B-Instruct - Qwen/Qwen2.5-32B-Instruct - Qwen/Qwen2.5-72B-Instruct - - Qwen/Qwen2-7B-Instruct - Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4 - Qwen/Qwen2.5-VL-7B-Instruct - Qwen/Qwen2.5-VL-32B-Instruct @@ -138,7 +137,6 @@ pytorch_chat_model: - Qwen/Qwen2.5-7B-Instruct - Qwen/Qwen2.5-32B-Instruct - Qwen/Qwen2.5-72B-Instruct - - Qwen/Qwen2-7B-Instruct - Qwen/Qwen1.5-MoE-A2.7B-Chat - Qwen/Qwen2.5-VL-7B-Instruct - Qwen/Qwen2.5-VL-32B-Instruct @@ -384,7 +382,6 @@ evaluate_model: - internlm/internlm3-8b-instruct - meta-llama/Meta-Llama-3-8B-Instruct - meta-llama/Meta-Llama-3-1-8B-Instruct - - Qwen/Qwen2-7B-Instruct - Qwen/Qwen2.5-7B-Instruct - Qwen/Qwen2.5-32B-Instruct - 
meta-llama/Llama-2-7b-chat-hf diff --git a/autotest/evaluate/eval_config_base.py b/autotest/evaluate/eval_config_base.py deleted file mode 100644 index 6a193b0402..0000000000 --- a/autotest/evaluate/eval_config_base.py +++ /dev/null @@ -1,46 +0,0 @@ -from mmengine.config import read_base -from opencompass.models import OpenAISDK - -with read_base(): - from opencompass.configs.datasets.gpqa.gpqa_few_shot_ppl_4b5a83 import gpqa_datasets # noqa: F401, E501 - from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import gsm8k_datasets # noqa: F401, E501 - from opencompass.configs.datasets.race.race_few_shot_ppl import race_datasets # noqa: F401, E501 - from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \ - winogrande_datasets # noqa: F401, E501 - -race_datasets = [race_datasets[1]] -datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], []) - -MODEL_NAME = 'internlm2_5-1_8b' -MODEL_PATH = '/nvme/qa_test_models/internlm/internlm2_5-1_8b' -API_BASE = 'http://127.0.0.1:23333/v1' - -api_meta_template = dict(round=[ - dict(role='HUMAN', api_role='HUMAN'), - dict(role='BOT', api_role='BOT', generate=True), -]) - -models = [ - dict( - type=OpenAISDK, - abbr=f'{MODEL_NAME}-lmdeploy-api', - openai_api_base=API_BASE, - key='EMPTY', - path=MODEL_PATH, - meta_template=api_meta_template, - max_out_len=2048, - batch_size=16, - run_cfg=dict(num_gpus=1, communicator='native'), - temperature=0.1, - ) -] - -summarizer = dict( - dataset_abbrs=[ - ['gsm8k', 'accuracy'], - ['GPQA_diamond', 'accuracy'], - ['race-high', 'accuracy'], - ['winogrande', 'accuracy'], - ], - summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []), -) diff --git a/autotest/evaluate/eval_config_chat.py b/autotest/evaluate/eval_config_chat.py index c24cbb66b5..8d55ec232e 100644 --- a/autotest/evaluate/eval_config_chat.py +++ b/autotest/evaluate/eval_config_chat.py @@ -8,9 +8,9 @@ datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], []) -MODEL_NAME = 'Qwen2-7B-Instruct' -MODEL_PATH = '/nvme/qa_test_models/Qwen/Qwen2-7B-Instruct' -API_BASE = 'http://127.0.0.1:65525/v1' +MODEL_NAME = '' +MODEL_PATH = '' +API_BASE = '' api_meta_template = dict(round=[ dict(role='HUMAN', api_role='HUMAN'), diff --git a/autotest/evaluate/test_api_evaluate_pytorch.py b/autotest/evaluate/test_api_evaluate_pytorch.py index d5c6f99447..79d77bade0 100644 --- a/autotest/evaluate/test_api_evaluate_pytorch.py +++ b/autotest/evaluate/test_api_evaluate_pytorch.py @@ -21,12 +21,6 @@ def getModelList(tp_num): model_list = get_evaluate_pytorch_model_list(tp_num, kvint_list=[4, 8]) new_model_list = [] for model in model_list: - if model['backend'] == 'pytorch': - model['extra'] += '--cache-max-entry-count 0.8' - elif 'Llama-2' in model['model']: - model['extra'] += '--cache-max-entry-count 0.95' - elif 'internlm2' in model['model']: - model['extra'] += '--cache-max-entry-count 0.9' model['cuda_prefix'] = None new_model_list.append(model) return new_model_list diff --git a/autotest/evaluate/test_api_evaluate_turbomind.py b/autotest/evaluate/test_api_evaluate_turbomind.py index 70c6809ca4..38a838ff8b 100644 --- a/autotest/evaluate/test_api_evaluate_turbomind.py +++ b/autotest/evaluate/test_api_evaluate_turbomind.py @@ -21,12 +21,6 @@ def getModelList(tp_num): model_list = get_evaluate_turbomind_model_list(tp_num, kvint_list=[4, 8]) new_model_list = [] for model in model_list: - if model['backend'] == 'pytorch': - model['extra'] += '--cache-max-entry-count 0.8' - elif 
'Llama-2' in model['model']: - model['extra'] += '--cache-max-entry-count 0.95' - elif 'internlm2' in model['model']: - model['extra'] += '--cache-max-entry-count 0.9' model['cuda_prefix'] = None new_model_list.append(model) return new_model_list diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index 9a403655ec..fae0dbb6b9 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -254,7 +254,10 @@ def get_evaluate_turbomind_model_list(tp_num, is_longtext: bool = False, kvint_l result = [] if len(model_list) > 0: - communicators = ['native', 'nccl'] + if tp_num > 1: + communicators = ['native', 'nccl'] + else: + communicators = ['native'] for communicator in communicators: for item in model_list: if item.replace('-inner-4bits', '') in config.get('turbomind_chat_model') or item.replace( diff --git a/autotest/utils/evaluate_utils.py b/autotest/utils/evaluate_utils.py index 45d1225cc7..9ecc0dc724 100644 --- a/autotest/utils/evaluate_utils.py +++ b/autotest/utils/evaluate_utils.py @@ -1,3 +1,5 @@ +import csv +import glob import os import subprocess @@ -6,24 +8,62 @@ DEFAULT_PORT = 23333 -def get_model_type(model_name): - model_name_lower = model_name.lower() +def write_to_summary(model_name, tp_num, result, msg, worker_id, work_dir=None): + status = '✅ PASS' if result else '❌ FAIL' - chat_patterns = [ - 'chat', - 'instruct', - 'gemma', - 'llama3', - 'llama2', - 'llama', - ] - if any(pattern in model_name_lower for pattern in chat_patterns): - return 'chat' + metrics = {} + + if work_dir and os.path.exists(work_dir): + try: + summary_dirs = glob.glob(os.path.join(work_dir, '*', 'summary')) + if summary_dirs: + summary_dir = summary_dirs[0] + csv_files = glob.glob(os.path.join(summary_dir, 'summary_*.csv')) + if csv_files: + csv_file = sorted(csv_files)[-1] + if os.path.exists(csv_file): + with open(csv_file, 'r') as f: + reader = csv.reader(f) + next(reader) + for row in reader: + if len(row) >= 5 and row[4]: + dataset = row[0] + metric_value = row[4] + try: + metrics[dataset] = f'{float(metric_value):.2f}' + except ValueError: + metrics[dataset] = metric_value + except Exception as e: + print(f'Error reading metrics: {str(e)}') + + mmlu_value = metrics.get('mmlu', '') + gsm8k_value = metrics.get('gsm8k', '') + + summary_line = f'| {model_name} | TP{tp_num} | {status} | {mmlu_value} | {gsm8k_value} |\n' + + summary_file = os.environ.get('GITHUB_STEP_SUMMARY', None) + if summary_file: + write_header = False + if not os.path.exists(summary_file) or os.path.getsize(summary_file) == 0: + write_header = True + else: + with open(summary_file, 'r') as f: + first_lines = f.read(200) + if '| Model | TP | Status | mmlu | gsm8k |' not in first_lines: + write_header = True + + with open(summary_file, 'a') as f: + if write_header: + f.write('## Model Evaluation Results\n') + f.write('| Model | TP | Status | mmlu | gsm8k |\n') + f.write('|-------|----|--------|------|-------|\n') + f.write(summary_line) else: - return 'base' + print(f'Summary: {model_name} | TP{tp_num} | {status} | {mmlu_value} | {gsm8k_value}') def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFAULT_PORT): + work_dir = None try: model_name = prepare_environment['model'] backend_type = prepare_environment['backend'] @@ -31,16 +71,10 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA communicator = prepare_environment.get('communicator', 'native') quant_policy = prepare_environment.get('quant_policy', 0) - model_type = 
get_model_type(model_name) - print(f'Model {model_name} identified as {model_type} model') - current_dir = os.path.dirname(os.path.abspath(__file__)) parent_dir = os.path.dirname(current_dir) - if model_type == 'base': - config_file = os.path.join(parent_dir, 'evaluate/eval_config_base.py') - else: - config_file = os.path.join(parent_dir, 'evaluate/eval_config_chat.py') + config_file = os.path.join(parent_dir, 'evaluate/eval_config_chat.py') model_base_path = config.get('model_path', '/nvme/qa_test_models') model_path = os.path.join(model_base_path, model_name) @@ -48,7 +82,6 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA print(f'Starting OpenCompass evaluation for model: {model_name}') print(f'Model path: {model_path}') print(f'Backend: {backend_type}') - print(f'Model type: {model_type}') print(f'Config file: {config_file}') log_path = config.get('log_path', '/nvme/qa_test_models/autotest_model/log') @@ -56,8 +89,7 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA original_cwd = os.getcwd() work_dir = os.path.join( - log_path, - f"wk_{backend_type}_{model_name.replace('/', '_')}_{model_type}_{communicator}_{worker_id}_{quant_policy}") + log_path, f"wk_{backend_type}_{model_name.replace('/', '_')}_{communicator}_{worker_id}_{quant_policy}") os.makedirs(work_dir, exist_ok=True) try: @@ -99,7 +131,6 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA log_filename = (f'eval_{backend_type}_' f"{model_name.replace('/', '_')}_" - f'{model_type}_' f'{communicator}_' f'{worker_id}_' f'{quant_policy}.log') @@ -107,7 +138,6 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA with open(log_file, 'w', encoding='utf-8') as f: f.write(f'Model: {model_name}\n') - f.write(f'Model type: {model_type}\n') f.write(f'Config file: {temp_config_file}\n') f.write(f'Backend: {backend_type}\n') f.write(f'TP Num: {tp_num}\n') @@ -131,25 +161,29 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA break if result.returncode == 0 and not evaluation_failed: - return True, f'Evaluation completed successfully for {model_name} ({model_type})' + final_result = True + final_msg = f'Evaluation completed successfully for {model_name}' else: - error_msg = f'Evaluation failed for {model_name} ({model_type}) ' + final_result = False + final_msg = f'Evaluation failed for {model_name}' if result.returncode != 0: - error_msg += f'with return code {result.returncode}' + final_msg += f'with return code {result.returncode}' elif evaluation_failed: - error_msg += 'with internal errors detected in logs' + final_msg += 'with internal errors detected in logs' if stderr_output: - error_msg += f'\nSTDERR: {stderr_output}' + final_msg += f'\nSTDERR: {stderr_output}' else: error_lines = [] for line in stdout_output.split('\n'): if any(keyword in line for keyword in error_keywords): error_lines.append(line) if error_lines: - error_msg += f'\nLog errors: {" | ".join(error_lines[:3])}' + final_msg += f'\nLog errors: {" | ".join(error_lines[:3])}' + + write_to_summary(model_name, tp_num, final_result, final_msg, worker_id, work_dir) - return False, error_msg + return final_result, final_msg finally: os.chdir(original_cwd) @@ -158,6 +192,11 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA except subprocess.TimeoutExpired: timeout_msg = (f'Evaluation timed out for {model_name} ' f'after 7200 seconds') + if work_dir: + write_to_summary(model_name, 
tp_num, False, timeout_msg, worker_id, work_dir) return False, timeout_msg except Exception as e: - return False, f'Error during evaluation for {model_name}: {str(e)}' + error_msg = f'Error during evaluation for {model_name}: {str(e)}' + if work_dir: + write_to_summary(model_name, tp_num, False, error_msg, worker_id, work_dir) + return False, error_msg From 71022de17d56a197142b342faf0905e31a678c80 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Thu, 18 Sep 2025 19:38:50 +0800 Subject: [PATCH 04/28] CI: update workflow docker --- autotest/evaluate/eval_config_chat.py | 1 + autotest/utils/evaluate_utils.py | 18 +++++++++--------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/autotest/evaluate/eval_config_chat.py b/autotest/evaluate/eval_config_chat.py index 8d55ec232e..549605ac76 100644 --- a/autotest/evaluate/eval_config_chat.py +++ b/autotest/evaluate/eval_config_chat.py @@ -26,6 +26,7 @@ path=MODEL_PATH, meta_template=api_meta_template, max_out_len=2048, + batch_size=500, temperature=0.1, ) ] diff --git a/autotest/utils/evaluate_utils.py b/autotest/utils/evaluate_utils.py index 9ecc0dc724..57a3275c33 100644 --- a/autotest/utils/evaluate_utils.py +++ b/autotest/utils/evaluate_utils.py @@ -8,7 +8,7 @@ DEFAULT_PORT = 23333 -def write_to_summary(model_name, tp_num, result, msg, worker_id, work_dir=None): +def write_to_summary(model_name, tp_num, result, msg, worker_id, backend_type, work_dir=None): status = '✅ PASS' if result else '❌ FAIL' metrics = {} @@ -39,7 +39,7 @@ def write_to_summary(model_name, tp_num, result, msg, worker_id, work_dir=None): mmlu_value = metrics.get('mmlu', '') gsm8k_value = metrics.get('gsm8k', '') - summary_line = f'| {model_name} | TP{tp_num} | {status} | {mmlu_value} | {gsm8k_value} |\n' + summary_line = f'| {model_name} | {backend_type} | TP{tp_num} | {status} | {mmlu_value} | {gsm8k_value} |\n' summary_file = os.environ.get('GITHUB_STEP_SUMMARY', None) if summary_file: @@ -49,17 +49,17 @@ def write_to_summary(model_name, tp_num, result, msg, worker_id, work_dir=None): else: with open(summary_file, 'r') as f: first_lines = f.read(200) - if '| Model | TP | Status | mmlu | gsm8k |' not in first_lines: + if '| Model | Backend | TP | Status | mmlu | gsm8k |' not in first_lines: write_header = True with open(summary_file, 'a') as f: if write_header: f.write('## Model Evaluation Results\n') - f.write('| Model | TP | Status | mmlu | gsm8k |\n') - f.write('|-------|----|--------|------|-------|\n') + f.write('| Model | Backend | TP | Status | mmlu | gsm8k |\n') + f.write('|-------|---------|----|--------|------|-------|\n') f.write(summary_line) else: - print(f'Summary: {model_name} | TP{tp_num} | {status} | {mmlu_value} | {gsm8k_value}') + print(f'Summary: {model_name} | {backend_type} | TP{tp_num} | {status} | {mmlu_value} | {gsm8k_value}') def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFAULT_PORT): @@ -181,7 +181,7 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA if error_lines: final_msg += f'\nLog errors: {" | ".join(error_lines[:3])}' - write_to_summary(model_name, tp_num, final_result, final_msg, worker_id, work_dir) + write_to_summary(model_name, tp_num, final_result, final_msg, worker_id, backend_type, work_dir) return final_result, final_msg @@ -193,10 +193,10 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA timeout_msg = (f'Evaluation timed out for {model_name} ' f'after 7200 seconds') if work_dir: - 
write_to_summary(model_name, tp_num, False, timeout_msg, worker_id, work_dir) + write_to_summary(model_name, tp_num, False, timeout_msg, worker_id, backend_type, work_dir) return False, timeout_msg except Exception as e: error_msg = f'Error during evaluation for {model_name}: {str(e)}' if work_dir: - write_to_summary(model_name, tp_num, False, error_msg, worker_id, work_dir) + write_to_summary(model_name, tp_num, False, error_msg, worker_id, backend_type, work_dir) return False, error_msg From 88a683672198443f99f95a8c7d61214f343a6a1b Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Fri, 19 Sep 2025 14:16:31 +0800 Subject: [PATCH 05/28] TEST: update code based on comments --- .../workflows/{api_eva.yml => api_eval.yml} | 8 +- autotest/config.yaml | 6 +- autotest/evaluate/test_api_evaluate.py | 119 ++++++++++++++++++ .../evaluate/test_api_evaluate_pytorch.py | 90 ------------- .../evaluate/test_api_evaluate_turbomind.py | 90 ------------- autotest/utils/config_utils.py | 4 +- autotest/utils/evaluate_utils.py | 70 ++++++----- 7 files changed, 165 insertions(+), 222 deletions(-) rename .github/workflows/{api_eva.yml => api_eval.yml} (87%) create mode 100644 autotest/evaluate/test_api_evaluate.py delete mode 100644 autotest/evaluate/test_api_evaluate_pytorch.py delete mode 100644 autotest/evaluate/test_api_evaluate_turbomind.py diff --git a/.github/workflows/api_eva.yml b/.github/workflows/api_eval.yml similarity index 87% rename from .github/workflows/api_eva.yml rename to .github/workflows/api_eval.yml index 91e596e648..46393a0ebb 100644 --- a/.github/workflows/api_eva.yml +++ b/.github/workflows/api_eval.yml @@ -1,4 +1,4 @@ -name: api_eva +name: api_eval on: workflow_dispatch: @@ -127,8 +127,10 @@ jobs: run: | overall_exit=0 ln -s /mnt/187/opencompass-data/data ./data - pytest autotest/evaluate/test_api_evaluate_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? - pytest autotest/evaluate/test_api_evaluate_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_1 and not pr_test and ${{matrix.backend}}" -n 8 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_2 and not pr_test and ${{matrix.backend}}" -n 4 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_4 and not pr_test and ${{matrix.backend}}" -n 2 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_8 and not pr_test and ${{matrix.backend}}" -n 1 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? 
exit $overall_exit - name: Clear workspace if: always() diff --git a/autotest/config.yaml b/autotest/config.yaml index 379fd666bb..ddef407ff6 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -378,12 +378,8 @@ benchmark_model: evaluate_model: - google/gemma-2-9b-it - google/gemma-2-27b-it - - internlm/internlm2_5-7b-chat - - internlm/internlm3-8b-instruct - - meta-llama/Meta-Llama-3-8B-Instruct - meta-llama/Meta-Llama-3-1-8B-Instruct - Qwen/Qwen2.5-7B-Instruct - Qwen/Qwen2.5-32B-Instruct - - meta-llama/Llama-2-7b-chat-hf - Qwen/Qwen1.5-MoE-A2.7B-Chat - - internlm/internlm2_5-20b-chat + - Qwen/Qwen3-30B-A3B diff --git a/autotest/evaluate/test_api_evaluate.py b/autotest/evaluate/test_api_evaluate.py new file mode 100644 index 0000000000..ffd3edc97e --- /dev/null +++ b/autotest/evaluate/test_api_evaluate.py @@ -0,0 +1,119 @@ +import pytest +from utils.config_utils import get_evaluate_pytorch_model_list, get_evaluate_turbomind_model_list, get_workerid +from utils.evaluate_utils import restful_test +from utils.run_restful_chat import start_restful_api, stop_restful_api + +DEFAULT_PORT = 23333 + + +@pytest.fixture(scope='function', autouse=True) +def prepare_environment(request, config, worker_id): + param = request.param + model = param['model'] + backend = param['backend'] + model_path = config.get('model_path') + '/' + model + pid, startRes = start_restful_api(config, param, model, model_path, backend, worker_id) + yield param + stop_restful_api(pid, startRes, param) + + +def get_turbomind_model_list(tp_num): + model_list = get_evaluate_turbomind_model_list(tp_num, kvint_list=[4, 8]) + new_model_list = [] + for model in model_list: + model['cuda_prefix'] = None + new_model_list.append(model) + return new_model_list + + +def get_pytorch_model_list(tp_num): + model_list = get_evaluate_pytorch_model_list(tp_num, kvint_list=[4, 8]) + new_model_list = [] + for model in model_list: + model['cuda_prefix'] = None + new_model_list.append(model) + return new_model_list + + +def run_test(config, run_id, prepare_environment, worker_id): + if get_workerid(worker_id) is None: + result, msg = restful_test(config, run_id, prepare_environment, worker_id=worker_id) + else: + result, msg = restful_test(config, + run_id, + prepare_environment, + worker_id=worker_id, + port=DEFAULT_PORT + get_workerid(worker_id)) + return result, msg + + +@pytest.mark.turbomind +@pytest.mark.gpu_num_1 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('prepare_environment', get_turbomind_model_list(tp_num=1), indirect=True) +def test_turbomind_restful_tp1(config, run_id, prepare_environment, worker_id): + result, msg = run_test(config, run_id, prepare_environment, worker_id) + assert result, msg + + +@pytest.mark.turbomind +@pytest.mark.gpu_num_2 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('prepare_environment', get_turbomind_model_list(tp_num=2), indirect=True) +def test_turbomind_restful_tp2(config, run_id, prepare_environment, worker_id): + result, msg = run_test(config, run_id, prepare_environment, worker_id) + assert result, msg + + +@pytest.mark.turbomind +@pytest.mark.gpu_num_4 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('prepare_environment', get_turbomind_model_list(tp_num=4), indirect=True) +def test_turbomind_restful_tp4(config, run_id, prepare_environment, worker_id): + result, msg = run_test(config, run_id, prepare_environment, worker_id) + assert result, msg + + +@pytest.mark.turbomind +@pytest.mark.gpu_num_8 +@pytest.mark.flaky(reruns=0) 
+@pytest.mark.parametrize('prepare_environment', get_turbomind_model_list(tp_num=8), indirect=True) +def test_turbomind_restful_tp8(config, run_id, prepare_environment, worker_id): + result, msg = run_test(config, run_id, prepare_environment, worker_id) + assert result, msg + + +@pytest.mark.pytorch +@pytest.mark.gpu_num_1 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('prepare_environment', get_pytorch_model_list(tp_num=1), indirect=True) +def test_pytorch_restful_tp1(config, run_id, prepare_environment, worker_id): + result, msg = run_test(config, run_id, prepare_environment, worker_id) + assert result, msg + + +@pytest.mark.pytorch +@pytest.mark.gpu_num_2 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('prepare_environment', get_pytorch_model_list(tp_num=2), indirect=True) +def test_pytorch_restful_tp2(config, run_id, prepare_environment, worker_id): + result, msg = run_test(config, run_id, prepare_environment, worker_id) + assert result, msg + + +@pytest.mark.pytorch +@pytest.mark.gpu_num_4 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('prepare_environment', get_pytorch_model_list(tp_num=4), indirect=True) +def test_pytorch_restful_tp4(config, run_id, prepare_environment, worker_id): + result, msg = run_test(config, run_id, prepare_environment, worker_id) + assert result, msg + + +@pytest.mark.pytorch +@pytest.mark.gpu_num_8 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('prepare_environment', get_pytorch_model_list(tp_num=8), indirect=True) +def test_pytorch_restful_tp8(config, run_id, prepare_environment, worker_id): + result, msg = run_test(config, run_id, prepare_environment, worker_id) + assert result, msg diff --git a/autotest/evaluate/test_api_evaluate_pytorch.py b/autotest/evaluate/test_api_evaluate_pytorch.py deleted file mode 100644 index 79d77bade0..0000000000 --- a/autotest/evaluate/test_api_evaluate_pytorch.py +++ /dev/null @@ -1,90 +0,0 @@ -import pytest -from utils.config_utils import get_evaluate_pytorch_model_list, get_workerid -from utils.evaluate_utils import restful_test -from utils.run_restful_chat import start_restful_api, stop_restful_api - -DEFAULT_PORT = 23333 - - -@pytest.fixture(scope='function', autouse=True) -def prepare_environment(request, config, worker_id): - param = request.param - model = param['model'] - backend = param['backend'] - model_path = config.get('model_path') + '/' + model - pid, startRes = start_restful_api(config, param, model, model_path, backend, worker_id) - yield param - stop_restful_api(pid, startRes, param) - - -def getModelList(tp_num): - model_list = get_evaluate_pytorch_model_list(tp_num, kvint_list=[4, 8]) - new_model_list = [] - for model in model_list: - model['cuda_prefix'] = None - new_model_list.append(model) - return new_model_list - - -@pytest.mark.gpu_num_1 -@pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('prepare_environment', getModelList(tp_num=1), indirect=True) -def test_restful_tp1(config, run_id, prepare_environment, worker_id): - if get_workerid(worker_id) is None: - result, msg = restful_test(config, run_id, prepare_environment, worker_id=worker_id) - else: - result, msg = restful_test(config, - run_id, - prepare_environment, - worker_id=worker_id, - port=DEFAULT_PORT + get_workerid(worker_id)) - - assert result, msg - - -@pytest.mark.gpu_num_2 -@pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('prepare_environment', getModelList(tp_num=2), indirect=True) -def test_restful_tp2(config, run_id, prepare_environment, worker_id): - if get_workerid(worker_id) is None: - 
result, msg = restful_test(config, run_id, prepare_environment, worker_id=worker_id) - else: - result, msg = restful_test(config, - run_id, - prepare_environment, - worker_id=worker_id, - port=DEFAULT_PORT + get_workerid(worker_id)) - - assert result, msg - - -@pytest.mark.gpu_num_4 -@pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('prepare_environment', getModelList(tp_num=4), indirect=True) -def test_restful_tp4(config, run_id, prepare_environment, worker_id): - if get_workerid(worker_id) is None: - result, msg = restful_test(config, run_id, prepare_environment, worker_id=worker_id) - else: - result, msg = restful_test(config, - run_id, - prepare_environment, - worker_id=worker_id, - port=DEFAULT_PORT + get_workerid(worker_id)) - - assert result, msg - - -@pytest.mark.gpu_num_8 -@pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('prepare_environment', getModelList(tp_num=8), indirect=True) -def test_restful_tp8(config, run_id, prepare_environment, worker_id): - if get_workerid(worker_id) is None: - result, msg = restful_test(config, run_id, prepare_environment, worker_id=worker_id) - else: - result, msg = restful_test(config, - run_id, - prepare_environment, - worker_id=worker_id, - port=DEFAULT_PORT + get_workerid(worker_id)) - - assert result, msg diff --git a/autotest/evaluate/test_api_evaluate_turbomind.py b/autotest/evaluate/test_api_evaluate_turbomind.py deleted file mode 100644 index 38a838ff8b..0000000000 --- a/autotest/evaluate/test_api_evaluate_turbomind.py +++ /dev/null @@ -1,90 +0,0 @@ -import pytest -from utils.config_utils import get_evaluate_turbomind_model_list, get_workerid -from utils.evaluate_utils import restful_test -from utils.run_restful_chat import start_restful_api, stop_restful_api - -DEFAULT_PORT = 23333 - - -@pytest.fixture(scope='function', autouse=True) -def prepare_environment(request, config, worker_id): - param = request.param - model = param['model'] - backend = param['backend'] - model_path = config.get('model_path') + '/' + model - pid, startRes = start_restful_api(config, param, model, model_path, backend, worker_id) - yield param - stop_restful_api(pid, startRes, param) - - -def getModelList(tp_num): - model_list = get_evaluate_turbomind_model_list(tp_num, kvint_list=[4, 8]) - new_model_list = [] - for model in model_list: - model['cuda_prefix'] = None - new_model_list.append(model) - return new_model_list - - -@pytest.mark.gpu_num_1 -@pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('prepare_environment', getModelList(tp_num=1), indirect=True) -def test_restful_tp1(config, run_id, prepare_environment, worker_id): - if get_workerid(worker_id) is None: - result, msg = restful_test(config, run_id, prepare_environment, worker_id=worker_id) - else: - result, msg = restful_test(config, - run_id, - prepare_environment, - worker_id=worker_id, - port=DEFAULT_PORT + get_workerid(worker_id)) - - assert result, msg - - -@pytest.mark.gpu_num_2 -@pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('prepare_environment', getModelList(tp_num=2), indirect=True) -def test_restful_tp2(config, run_id, prepare_environment, worker_id): - if get_workerid(worker_id) is None: - result, msg = restful_test(config, run_id, prepare_environment, worker_id=worker_id) - else: - result, msg = restful_test(config, - run_id, - prepare_environment, - worker_id=worker_id, - port=DEFAULT_PORT + get_workerid(worker_id)) - - assert result, msg - - -@pytest.mark.gpu_num_4 -@pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('prepare_environment', getModelList(tp_num=4), 
indirect=True) -def test_restful_tp4(config, run_id, prepare_environment, worker_id): - if get_workerid(worker_id) is None: - result, msg = restful_test(config, run_id, prepare_environment, worker_id=worker_id) - else: - result, msg = restful_test(config, - run_id, - prepare_environment, - worker_id=worker_id, - port=DEFAULT_PORT + get_workerid(worker_id)) - - assert result, msg - - -@pytest.mark.gpu_num_8 -@pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('prepare_environment', getModelList(tp_num=8), indirect=True) -def test_restful_tp8(config, run_id, prepare_environment, worker_id): - if get_workerid(worker_id) is None: - result, msg = restful_test(config, run_id, prepare_environment, worker_id=worker_id) - else: - result, msg = restful_test(config, - run_id, - prepare_environment, - worker_id=worker_id, - port=DEFAULT_PORT + get_workerid(worker_id)) - - assert result, msg diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index fae0dbb6b9..1cc56403d8 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -255,9 +255,9 @@ def get_evaluate_turbomind_model_list(tp_num, is_longtext: bool = False, kvint_l if len(model_list) > 0: if tp_num > 1: - communicators = ['native', 'nccl'] + communicators = ['cuda-ipc', 'nccl'] else: - communicators = ['native'] + communicators = ['cuda-ipc'] for communicator in communicators: for item in model_list: if item.replace('-inner-4bits', '') in config.get('turbomind_chat_model') or item.replace( diff --git a/autotest/utils/evaluate_utils.py b/autotest/utils/evaluate_utils.py index 57a3275c33..417c55bff7 100644 --- a/autotest/utils/evaluate_utils.py +++ b/autotest/utils/evaluate_utils.py @@ -16,23 +16,33 @@ def write_to_summary(model_name, tp_num, result, msg, worker_id, backend_type, w if work_dir and os.path.exists(work_dir): try: summary_dirs = glob.glob(os.path.join(work_dir, '*', 'summary')) - if summary_dirs: - summary_dir = summary_dirs[0] - csv_files = glob.glob(os.path.join(summary_dir, 'summary_*.csv')) - if csv_files: - csv_file = sorted(csv_files)[-1] - if os.path.exists(csv_file): - with open(csv_file, 'r') as f: - reader = csv.reader(f) - next(reader) - for row in reader: - if len(row) >= 5 and row[4]: - dataset = row[0] - metric_value = row[4] - try: - metrics[dataset] = f'{float(metric_value):.2f}' - except ValueError: - metrics[dataset] = metric_value + if not summary_dirs: + raise FileNotFoundError('No summary directory found') + + summary_dir = summary_dirs[0] + + csv_files = glob.glob(os.path.join(summary_dir, 'summary_*.csv')) + if not csv_files: + raise FileNotFoundError('No CSV files found') + + csv_file = sorted(csv_files)[-1] + if not os.path.exists(csv_file): + raise FileNotFoundError('CSV file does not exist') + + with open(csv_file, 'r') as f: + reader = csv.reader(f) + next(reader) + for row in reader: + if len(row) < 5 or not row[4]: + continue + + dataset = row[0] + metric_value = row[4] + try: + metrics[dataset] = f'{float(metric_value):.2f}' + except ValueError: + metrics[dataset] = metric_value + except Exception as e: print(f'Error reading metrics: {str(e)}') @@ -43,15 +53,7 @@ def write_to_summary(model_name, tp_num, result, msg, worker_id, backend_type, w summary_file = os.environ.get('GITHUB_STEP_SUMMARY', None) if summary_file: - write_header = False - if not os.path.exists(summary_file) or os.path.getsize(summary_file) == 0: - write_header = True - else: - with open(summary_file, 'r') as f: - first_lines = f.read(200) - if '| Model | Backend | TP | Status | 
mmlu | gsm8k |' not in first_lines: - write_header = True - + write_header = not os.path.exists(summary_file) or os.path.getsize(summary_file) == 0 with open(summary_file, 'a') as f: if write_header: f.write('## Model Evaluation Results\n') @@ -68,9 +70,13 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA model_name = prepare_environment['model'] backend_type = prepare_environment['backend'] tp_num = prepare_environment.get('tp_num', 1) - communicator = prepare_environment.get('communicator', 'native') + communicator = prepare_environment.get('communicator', 'cuda-ipc') quant_policy = prepare_environment.get('quant_policy', 0) + summary_model_name = model_name + if quant_policy in [4, 8]: + summary_model_name = f'{model_name}-kvint{quant_policy}' + current_dir = os.path.dirname(os.path.abspath(__file__)) parent_dir = os.path.dirname(current_dir) @@ -99,13 +105,13 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA cfg = Config.fromfile(config_file) - cfg.MODEL_NAME = model_name + cfg.MODEL_NAME = summary_model_name cfg.MODEL_PATH = model_path cfg.API_BASE = f'http://127.0.0.1:{port}/v1' if cfg.models and len(cfg.models) > 0: model_cfg = cfg.models[0] - model_cfg['abbr'] = f'{model_name}-lmdeploy-api' + model_cfg['abbr'] = f'{summary_model_name}-lmdeploy-api' model_cfg['openai_api_base'] = f'http://127.0.0.1:{port}/v1' model_cfg['path'] = model_path if 'backend' in model_cfg: @@ -181,7 +187,7 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA if error_lines: final_msg += f'\nLog errors: {" | ".join(error_lines[:3])}' - write_to_summary(model_name, tp_num, final_result, final_msg, worker_id, backend_type, work_dir) + write_to_summary(summary_model_name, tp_num, final_result, final_msg, worker_id, backend_type, work_dir) return final_result, final_msg @@ -193,10 +199,10 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA timeout_msg = (f'Evaluation timed out for {model_name} ' f'after 7200 seconds') if work_dir: - write_to_summary(model_name, tp_num, False, timeout_msg, worker_id, backend_type, work_dir) + write_to_summary(summary_model_name, tp_num, False, timeout_msg, worker_id, backend_type, work_dir) return False, timeout_msg except Exception as e: error_msg = f'Error during evaluation for {model_name}: {str(e)}' if work_dir: - write_to_summary(model_name, tp_num, False, error_msg, worker_id, backend_type, work_dir) + write_to_summary(summary_model_name, tp_num, False, error_msg, worker_id, backend_type, work_dir) return False, error_msg From c68f4d2f8a3c807a3b0ac439c623cc9f4256bca5 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Fri, 19 Sep 2025 16:42:19 +0800 Subject: [PATCH 06/28] TEST: update docker --- .github/workflows/api_eval.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/api_eval.yml b/.github/workflows/api_eval.yml index 46393a0ebb..e10b6e01a8 100644 --- a/.github/workflows/api_eval.yml +++ b/.github/workflows/api_eval.yml @@ -79,7 +79,7 @@ jobs: matrix: backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: openmmlab/lmdeploy:latest-cu12 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip From fd244e72504f28c1cce6797377e1b336a0255fab Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Fri, 19 Sep 
2025 17:59:44 +0800 Subject: [PATCH 07/28] add H800 base model eval --- .github/scripts/eval_base_config.py | 113 ++++++------ .github/workflows/evaluate.yml | 25 +-- .github/workflows/evaluate_h800.yml | 165 ++++++++++++++++++ .../test_pipeline_chat_turbomind_mllm.py | 2 +- .../test_restful_chat_hf_turbomind_llm.py | 14 +- autotest/utils/config_utils.py | 2 +- autotest/utils/pipeline_chat.py | 4 +- autotest/utils/run_restful_chat.py | 2 +- 8 files changed, 242 insertions(+), 85 deletions(-) create mode 100644 .github/workflows/evaluate_h800.yml diff --git a/.github/scripts/eval_base_config.py b/.github/scripts/eval_base_config.py index 64bbdfd972..7c9d151715 100644 --- a/.github/scripts/eval_base_config.py +++ b/.github/scripts/eval_base_config.py @@ -39,26 +39,6 @@ wikibench_datasets # noqa: F401, E501 from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \ winogrande_datasets # noqa: F401, E501 - from opencompass.configs.models.baichuan.hf_baichuan_7b import models as hf_baichuan_7b # noqa: F401, E501 - from opencompass.configs.models.gemma.hf_gemma_7b import models as hf_gemma_7b # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import models as hf_internlm2_5_7b # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.hf_internlm2_7b import models as hf_internlm2_7b # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.hf_internlm2_20b import models as hf_internlm2_20b # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.hf_internlm_7b import models as hf_internlm_7b # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.hf_internlm_20b import models as hf_internlm_20b # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \ - models as lmdeploy_internlm2_5_7b # noqa: F401, E501 - from opencompass.configs.models.hf_llama.hf_llama2_7b import models as hf_llama2_7b # noqa: F401, E501 - from opencompass.configs.models.hf_llama.hf_llama3_8b import models as hf_llama3_8b # noqa: F401, E501 - from opencompass.configs.models.mistral.hf_mistral_7b_v0_1 import models as hf_mistral_7b_v0_1 # noqa: F401, E501 - from opencompass.configs.models.mistral.hf_mixtral_8x7b_v0_1 import \ - models as hf_mixtral_8x7b_v0_1 # noqa: F401, E501 - from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b import models as lmdeploy_qwen2_5_7b # noqa: F401, E501 - from opencompass.configs.models.qwen.hf_qwen1_5_7b import models as hf_qwen1_5_7b # noqa: F401, E501 - from opencompass.configs.models.qwen.hf_qwen2_7b import models as hf_qwen2_7b # noqa: F401, E501 - from opencompass.configs.models.qwen.hf_qwen_7b import models as hf_qwen_7b # noqa: F401, E501 - from opencompass.configs.models.qwen.lmdeploy_qwen1_5_7b import models as lmdeploy_qwen1_5_7b # noqa: F401, E501 - from opencompass.configs.models.qwen.lmdeploy_qwen2_7b import models as lmdeploy_qwen2_7b # noqa: F401, E501 # Summary Groups from opencompass.configs.summarizers.groups.cmmlu import cmmlu_summary_groups # noqa: F401, E501 from opencompass.configs.summarizers.groups.GaokaoBench import GaokaoBench_summary_groups # noqa: F401, E501 @@ -69,6 +49,14 @@ # read models race_datasets = [race_datasets[1]] +mmlu_datasets = [ + x for x in mmlu_datasets if x['abbr'].replace('lukaemon_mmlu_', '') in [ + 'business_ethics', 'clinical_knowledge', 'college_medicine', 'global_facts', 'human_aging', 'management', + 'marketing', 'medical_genetics', 'miscellaneous', 'nutrition', 'professional_accounting', + 
'professional_medicine', 'virology' + ] +] + summarizer = dict( dataset_abbrs=[ ['race-high', 'accuracy'], @@ -138,48 +126,69 @@ summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []), ) -turbomind_qwen1_5_7b = deepcopy(*lmdeploy_qwen1_5_7b) -turbomind_qwen2_7b = deepcopy(*lmdeploy_qwen2_7b) -turbomind_qwen2_5_7b = deepcopy(*lmdeploy_qwen2_5_7b) -turbomind_qwen2_5_14b = deepcopy(*lmdeploy_qwen2_5_7b) -turbomind_qwen2_5_14b['path'] = 'Qwen/Qwen2.5-14B' -turbomind_internlm2_5_7b = deepcopy(*lmdeploy_internlm2_5_7b) -turbomind_internlm2_5_7b_4bits = deepcopy(*lmdeploy_internlm2_5_7b) -turbomind_internlm2_5_7b_batch1 = deepcopy(*lmdeploy_internlm2_5_7b) -turbomind_internlm2_5_7b_batch1_4bits = deepcopy(*lmdeploy_internlm2_5_7b) - base_model = dict( type=TurboMindModel, - engine_config=dict(session_len=7168, max_batch_size=128, tp=1), + engine_config=dict(session_len=7168, tp=1), gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), max_seq_len=7168, max_out_len=1024, - batch_size=128, + batch_size=1024, run_cfg=dict(num_gpus=1), ) +turbomind_qwen2_5_1_5b = deepcopy(base_model) +turbomind_qwen2_5_1_5b['path'] = 'Qwen/Qwen2.5-1.5B' +turbomind_qwen2_5_7b = deepcopy(base_model) +turbomind_qwen2_5_7b['path'] = 'Qwen/Qwen2.5-7B' +turbomind_qwen2_5_32b = deepcopy(base_model) +turbomind_qwen2_5_32b['path'] = 'Qwen/Qwen2.5-32B' +turbomind_qwen2_5_32b['run_cfg']['num_gpus'] = 2 +turbomind_qwen2_5_32b['engine_config']['tp'] = 2 +turbomind_internlm2_5_7b = deepcopy(base_model) +turbomind_internlm2_5_7b['path'] = 'internlm/internlm2_5-7b-chat' +turbomind_glm_4_9b = deepcopy(base_model) +turbomind_glm_4_9b['path'] = 'THUDM/glm-4-9b' +turbomind_llama_3_70b = deepcopy(base_model) +turbomind_llama_3_70b['path'] = 'meta-llama/Meta-Llama-3-70B' +turbomind_llama_3_70b['run_cfg']['num_gpus'] = 4 +turbomind_llama_3_70b['engine_config']['tp'] = 4 +turbomind_llama_3_1_8b = deepcopy(base_model) +turbomind_llama_3_1_8b['path'] = 'meta-llama/Llama-3.1-8B' +turbomind_qwen3_0_6b_base = deepcopy(base_model) +turbomind_qwen3_0_6b_base['path'] = 'Qwen/Qwen3-0.6B-Base' turbomind_qwen3_8b_base = deepcopy(base_model) -pytorch_qwen3_8b_base = deepcopy(base_model) -turbomind_qwen3_8b_base_4bits = deepcopy(base_model) -turbomind_qwen3_8b_base_kvint8 = deepcopy(base_model) -for model in [ - v for k, v in locals().items() - if k.startswith('turbomind_qwen3_8b_base') or k.startswith('pytorch_qwen3_8b_base') -]: - model['abbr'] = 'qwen3_8b_base_turbomind' - model['path'] = 'Qwen/Qwen3-8B-Base' - model['run_cfg']['num_gpus'] = 1 - model['engine_config']['tp'] = 1 +turbomind_qwen3_8b_base['path'] = 'Qwen/Qwen3-8B-Base' +turbomind_qwen3_30b_A3B_base = deepcopy(base_model) +turbomind_qwen3_30b_A3B_base['path'] = 'Qwen/Qwen3-30B-A3B-Base' +turbomind_qwen3_30b_A3B_base['run_cfg']['num_gpus'] = 2 +turbomind_qwen3_30b_A3B_base['engine_config']['tp'] = 2 -for model in [v for k, v in locals().items() if k.endswith('_4bits')]: - model['engine_config']['model_format'] = 'awq' - model['abbr'] = model['abbr'] + '_4bits' - model['path'] = model['path'] + '-inner-4bits' - -for model in [v for k, v in locals().items() if '_batch1' in k]: - model['abbr'] = model['abbr'] + '_batch1' - model['engine_config']['max_batch_size'] = 1 - model['batch_size'] = 1 +pytorch_qwen2_5_1_5b = deepcopy(base_model) +pytorch_qwen2_5_1_5b['path'] = 'Qwen/Qwen2.5-1.5B' +pytorch_qwen2_5_7b = deepcopy(base_model) +pytorch_qwen2_5_7b['path'] = 'Qwen/Qwen2.5-7B' +pytorch_qwen2_5_32b = deepcopy(base_model) 
+pytorch_qwen2_5_32b['path'] = 'Qwen/Qwen2.5-32B' +pytorch_qwen2_5_32b['run_cfg']['num_gpus'] = 2 +pytorch_qwen2_5_32b['engine_config']['tp'] = 2 +pytorch_internlm2_5_7b = deepcopy(base_model) +pytorch_internlm2_5_7b['path'] = 'internlm/internlm2_5-7b-chat' +pytorch_gemma_2_9b = deepcopy(base_model) +pytorch_gemma_2_9b['path'] = 'google/gemma-2-9b' +pytorch_llama_3_70b = deepcopy(base_model) +pytorch_llama_3_70b['path'] = 'meta-llama/Meta-Llama-3-70B' +pytorch_llama_3_70b['run_cfg']['num_gpus'] = 4 +pytorch_llama_3_70b['engine_config']['tp'] = 4 +pytorch_llama_3_1_8b = deepcopy(base_model) +pytorch_llama_3_1_8b['path'] = 'meta-llama/Llama-3.1-8B' +pytorch_qwen3_0_6b_base = deepcopy(base_model) +pytorch_qwen3_0_6b_base['path'] = 'Qwen/Qwen3-0.6B-Base' +pytorch_qwen3_8b_base = deepcopy(base_model) +pytorch_qwen3_8b_base['path'] = 'Qwen/Qwen3-8B-Base' +pytorch_qwen3_30b_A3B_base = deepcopy(base_model) +pytorch_qwen3_30b_A3B_base['path'] = 'Qwen/Qwen3-30B-A3B-Base' +pytorch_qwen3_30b_A3B_base['run_cfg']['num_gpus'] = 2 +pytorch_qwen3_30b_A3B_base['engine_config']['tp'] = 2 for model in [v for k, v in locals().items() if k.startswith('pytorch_')]: model['abbr'] = model['abbr'].replace('turbomind', 'pytorch') diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml index be64e8743f..bfb0840b34 100644 --- a/.github/workflows/evaluate.yml +++ b/.github/workflows/evaluate.yml @@ -13,26 +13,16 @@ on: description: 'Set branch or tag or commit id. Default is "main"' type: string default: 'main' - chat_models: - required: true - description: 'Tested TurboMind models list. eg. [internlm_chat_7b,internlm_chat_7b_w8a16]' - type: string - default: '[turbomind_internlm2_5_7b_chat, pytorch_internlm2_5_7b_chat, turbomind_internlm2_5_7b_chat_batch1, turbomind_internlm2_5_7b_chat_batch1_4bits, turbomind_internlm3_8b_instruct, pytorch_internlm3_8b_instruct, turbomind_internlm2_5_20b_chat, pytorch_internlm2_5_20b_chat, turbomind_qwen1_5_7b_chat, pytorch_qwen1_5_7b_chat, turbomind_llama3_8b_instruct, pytorch_llama3_8b_instruct, turbomind_llama3_1_8b_instruct, pytorch_llama3_1_8b_instruct, turbomind_qwen2_7b_instruct, pytorch_qwen2_7b_instruct, turbomind_qwen2_5_7b_instruct, pytorch_qwen2_5_7b_instruct, turbomind_qwen2_5_32b_instruct, pytorch_qwen2_5_32b_instruct, turbomind_llama2_7b_chat, pytorch_qwen1_5_moe_2_7b_chat, pytorch_gemma_2_9b_it, pytorch_gemma_2_27b_it, turbomind_internlm2_5_7b_chat_4bits, turbomind_internlm2_5_7b_chat_kvint4, turbomind_internlm2_5_7b_chat_kvint8, pytorch_internlm2_5_7b_chat_w8a8, turbomind_internlm3_8b_instruct_4bits, turbomind_internlm3_8b_instruct_kvint4, turbomind_internlm3_8b_instruct_kvint8, pytorch_internlm3_8b_instruct_w8a8, turbomind_llama3_8b_instruct_4bits, turbomind_llama3_8b_instruct_kvint4, turbomind_llama3_1_8b_instruct_4bits, turbomind_llama3_1_8b_instruct_kvint4, turbomind_llama3_1_8b_instruct_kvint8,turbomind_llama3_8b_instruct_kvint8, pytorch_llama3_1_8b_instruct_w8a8, turbomind_qwen2_7b_instruct_kvint8, turbomind_qwen2_5_7b_instruct_4bits, turbomind_qwen2_5_7b_instruct_kvint8, pytorch_qwen2_5_7b_instruct_w8a8, turbomind_qwen2_5_32b_instruct_4bits, turbomind_qwen2_5_32b_instruct_kvint8,turbomind_llama2_7b_chat_4bits, turbomind_llama2_7b_chat_kvint4, turbomind_llama2_7b_chat_kvint8]' - chat_datasets: - required: true - description: 'Tested datasets list. eg. 
[*bbh_datasets,*ceval_datasets,*cmmlu_datasets,*GaokaoBench_datasets,*gpqa_datasets,*gsm8k_datasets,*hellaswag_datasets,*humaneval_datasets,*ifeval_datasets,*math_datasets,*sanitized_mbpp_datasets,*mmlu_datasets,*nq_datasets,*race_datasets,*TheoremQA_datasets,*triviaqa_datasets,*winogrande_datasets,*crowspairs_datasets]' - type: string - default: '[*mmlu_datasets, *gsm8k_datasets, *ifeval_datasets]' base_models: required: true - description: 'Tested TurboMind models list. eg. [turbomind_internlm2_5_7b, turbomind_internlm2_5_7b_4bits, turbomind_internlm2_5_7b_batch1, turbomind_internlm2_5_7b_batch1_4bits, turbomind_qwen2_7b, turbomind_qwen2_5_7b, turbomind_qwen2_5_14b]' + description: 'Tested TurboMind models list. eg. [turbomind_qwen2_5_1_5b, turbomind_qwen2_5_7b, turbomind_qwen2_5_32b, turbomind_internlm2_5_7b, turbomind_glm_4_9b, turbomind_llama_3_1_8b, turbomind_llama_3_70b, pytorch_qwen2_5_1_5b, pytorch_qwen2_5_7b, pytorch_qwen2_5_32b, pytorch_internlm2_5_7b, pytorch_gemma_2_9b, pytorch_llama_3_70b, pytorch_llama_3_1_8b]' type: string - default: '[turbomind_internlm2_5_7b, turbomind_internlm2_5_7b_4bits, turbomind_qwen2_7b, turbomind_qwen2_5_7b, turbomind_qwen2_5_14b]' + default: '[turbomind_qwen2_5_1_5b, turbomind_qwen2_5_7b, turbomind_qwen2_5_32b, turbomind_internlm2_5_7b, turbomind_glm_4_9b, turbomind_llama_3_1_8b, turbomind_llama_3_70b, pytorch_qwen2_5_1_5b, pytorch_qwen2_5_7b, pytorch_qwen2_5_32b, pytorch_internlm2_5_7b, pytorch_gemma_2_9b, pytorch_llama_3_70b, pytorch_llama_3_1_8b]' baes_datasets: required: true description: 'Tested datasets list. eg. [*mmlu_datasets, *gsm8k_datasets]' type: string - default: '[*race_datasets, *gsm8k_datasets, *gpqa_datasets, *winogrande_datasets]' + default: '[*mmlu_datasets, *gsm8k_datasets, *gpqa_datasets, *winogrande_datasets]' oc_repo_org: required: false description: 'Tested repository organization name. Default is open-compass/opencompass' @@ -96,7 +86,7 @@ jobs: strategy: fail-fast: false matrix: - evaluate_type: ['chat', 'base'] + evaluate_type: ['base'] container: image: openmmlab/lmdeploy:latest-cu12 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" @@ -158,13 +148,6 @@ jobs: run: | ln -s /root/opencompass-data ./data python3 .github/scripts/action_tools.py create_model_links /root/models . - - name: Evaluate chat models - if: matrix.evaluate_type == 'chat' - run: | - echo ${{github.event.inputs.chat_models}} - echo ${{github.event.inputs.chat_datasets}} - export LMDEPLOY_DIR=$(pwd) - python3 .github/scripts/action_tools.py evaluate "${{github.event.inputs.chat_models}}" "${{github.event.inputs.chat_datasets}}" /root/evaluation-reports/${{ github.run_id }} chat - name: Evaluate base models if: matrix.evaluate_type == 'base' run: | diff --git a/.github/workflows/evaluate_h800.yml b/.github/workflows/evaluate_h800.yml new file mode 100644 index 0000000000..85af1b53d8 --- /dev/null +++ b/.github/workflows/evaluate_h800.yml @@ -0,0 +1,165 @@ +name: evaluate + +on: + workflow_dispatch: + inputs: + repo_org: + required: false + description: 'Tested repository organization name. Default is InternLM/lmdeploy' + type: string + default: 'InternLM/lmdeploy' + repo_ref: + required: false + description: 'Set branch or tag or commit id. Default is "main"' + type: string + default: 'main' + base_models: + required: true + description: 'Tested TurboMind models list. eg. 
[turbomind_qwen3_0_6b_base, turbomind_qwen3_8b_base, turbomind_qwen3_30b_A3B_base, pytorch_qwen3_0_6b_base, pytorch_qwen3_8b_base, pytorch_qwen3_30b_A3B_base]' + type: string + default: '[turbomind_qwen3_0_6b_base, turbomind_qwen3_8b_base, turbomind_qwen3_30b_A3B_base, pytorch_qwen3_0_6b_base, pytorch_qwen3_8b_base, pytorch_qwen3_30b_A3B_base]' + baes_datasets: + required: true + description: 'Tested datasets list. eg. [*mmlu_datasets, *gsm8k_datasets]' + type: string + default: '[*race_datasets, *gsm8k_datasets, *gpqa_datasets, *winogrande_datasets]' + oc_repo_org: + required: false + description: 'Tested repository organization name. Default is open-compass/opencompass' + type: string + default: 'open-compass/opencompass' + oc_repo_ref: + required: false + description: 'Set branch or tag or commit id. Default is "main"' + type: string + default: 'main' + offline_mode: + required: true + description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself' + type: boolean + default: false + +env: + ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true + +jobs: + linux-build: + if: ${{github.event_name == 'schedule' || (!cancelled() && !inputs.offline_mode)}} + strategy: + matrix: + pyver: [py310] + runs-on: ubuntu-latest + env: + PYTHON_VERSION: ${{ matrix.pyver }} + PLAT_NAME: manylinux2014_x86_64 + DOCKER_TAG: cuda12.4 + OUTPUT_FOLDER: cuda12.4_dist_${{ github.run_id }} + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Build + run: | + echo ${PYTHON_VERSION} + echo ${PLAT_NAME} + echo ${DOCKER_TAG} + echo ${OUTPUT_FOLDER} + echo ${GITHUB_RUN_ID} + # remove -it + sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh + bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER} + - name: Upload Artifacts + uses: actions/upload-artifact@v4 + with: + if-no-files-found: error + path: builder/manywheel/${{ env.OUTPUT_FOLDER }} + retention-days: 1 + name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }} + + evaluate: + needs: linux-build + if: ${{github.event_name == 'schedule' || !cancelled()}} + runs-on: [self-hosted, linux-eval] + timeout-minutes: 4320 # 72hours + strategy: + fail-fast: false + matrix: + evaluate_type: ['base'] + container: + image: openmmlab/lmdeploy:latest-cu12.8 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/github-actions/resources:/root/resources + - /nvme/github-actions/opencompass-data:/root/opencompass-data + - /nvme/qa_test_models/evaluation-reports:/root/evaluation-reports + - /nvme/qa_test_models:/root/models + - /mnt/187:/mnt/187 + - /mnt/bigdisk:/mnt/bigdisk + - /mnt/shared:/mnt/shared + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Setup systems + run: | + export TIME_STAMP="$(date +'%Y%m%d-%H%M%S')" + echo "TIME_STAMP=$TIME_STAMP" >> $GITHUB_ENV + - name: Clone repository + uses: actions/checkout@v2 + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Copy repository - offline + if: ${{inputs.offline_mode}} + run: cp -r 
/root/models/offline_pkg/lmdeploy/. . + - name: Download Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + uses: actions/download-artifact@v4 + with: + name: my-artifact-${{ github.run_id }}-py310 + - name: Install lmdeploy - dependency + run: | + python3 -m pip install -r /root/models/offline_pkg/requirements.txt + - name: Install lmdeploy + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: | + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Install lmdeploy - offline + if: ${{inputs.offline_mode}} + run: | + python3 -m pip install /root/models/offline_pkg/py310/lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Install opencompass + run: | + git clone https://github.com/${{ github.event.inputs.oc_repo_org}}.git + cd opencompass + git checkout ${{ github.event.inputs.oc_repo_ref}} + python3 -m pip install . + echo "OPENCOMPASS_DIR=$(pwd)" >> $GITHUB_ENV + - name: Check env + run: | + python3 -m pip list + lmdeploy check_env + - name: Setup paths for evaluation + run: | + ln -s /root/opencompass-data ./data + python3 .github/scripts/action_tools.py create_model_links /root/models . + - name: Evaluate base models + if: matrix.evaluate_type == 'base' + run: | + echo ${{github.event.inputs.base_models}} + echo ${{github.event.inputs.baes_datasets}} + export LMDEPLOY_DIR=$(pwd) + python3 .github/scripts/action_tools.py evaluate "${{github.event.inputs.base_models}}" "${{github.event.inputs.baes_datasets}}" /root/evaluation-reports/${{ github.run_id }} base + - name: Clear workspace + if: always() + run: | + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py index bcfd071eba..da7e255a8e 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py @@ -32,7 +32,7 @@ def test_pipeline_chat_tp2(config, model, communicator, worker_id): set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) if ('MiniCPM-V-2_6' in model or 'InternVL2_5-26B' in model or 'InternVL2-26B' in model - or 'InternVL3-38B' in model) and communicator == 'native': + or 'InternVL3-38B' in model) and communicator == 'cuda-ipc': return run_pipeline_vl_chat_test(config, model, BACKEND, worker_id, {'communicator': communicator}) diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py index daf2664662..33c5b4ba3c 100644 --- a/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py +++ b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py @@ -245,25 +245,25 @@ def test_restful_chat_fallback_backend_tp1(config, common_case_config, worker_id 'model': 'google/gemma-2-27b-it', 'cuda_prefix': None, 'tp_num': 2, - 'extra': ' --communicator native' + 'extra': ' --communicator cuda-ipc' }, { 'model': 'deepseek-ai/deepseek-moe-16b-chat', 'cuda_prefix': None, 'tp_num': 2, - 'extra': ' --communicator native' + 'extra': ' --communicator cuda-ipc' }, { 'model': 'google/gemma-2-27b-it', 'cuda_prefix': None, 'tp_num': 2, - 'extra': ' --quant-policy 8 --communicator native' + 'extra': ' --quant-policy 8 --communicator cuda-ipc' }, { 'model': 
'deepseek-ai/deepseek-moe-16b-chat', 'cuda_prefix': None, 'tp_num': 2, - 'extra': ' --quant-policy 8 --communicator native' + 'extra': ' --quant-policy 8 --communicator cuda-ipc' }, ], indirect=True) @@ -301,19 +301,19 @@ def test_restful_chat_fallback_backend_tp2(config, common_case_config, worker_id 'model': 'internlm/internlm2_5-20b-chat', 'cuda_prefix': 'CUDA_VISIBLE_DEVICES=5,6', 'tp_num': 2, - 'extra': ' --communicator native' + 'extra': ' --communicator cuda-ipc' }, { 'model': 'internlm/internlm2_5-20b-chat-inner-4bits', 'cuda_prefix': 'CUDA_VISIBLE_DEVICES=5,6', 'tp_num': 2, - 'extra': ' --communicator native' + 'extra': ' --communicator cuda-ipc' }, { 'model': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'cuda_prefix': 'CUDA_VISIBLE_DEVICES=5,6', 'tp_num': 2, - 'extra': ' --communicator native' + 'extra': ' --communicator cuda-ipc' }, ], indirect=True) diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index 51de106840..ca0f969c2d 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -85,7 +85,7 @@ def get_all_model_list(tp_num: int = None, quant_policy: int = None, model_type: def get_communicator_list(tp_num: int = None): if tp_num != 1 and _is_bf16_supported_by_device(): - return ['native', 'nccl'] + return ['cuda-ipc', 'nccl'] return ['nccl'] diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index fabc074d37..2a8349b572 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -23,7 +23,7 @@ def run_pipeline_chat_test(config, # temp remove testcase because of issue 3434 if ('InternVL3' in model_case or 'InternVL2_5' in model_case or 'MiniCPM-V-2_6' in model_case ) and 'turbomind' in backend_type and extra is not None and 'communicator' in extra and extra.get( - 'communicator') == 'native' and tp > 1: + 'communicator') == 'cuda-ipc' and tp > 1: return model_name = model_name = get_model_name(model_case) model_path = config.get('model_path') @@ -104,7 +104,7 @@ def run_pipeline_vl_chat_test(config, if ('InternVL3' in model_case or 'InternVL2_5' in model_case or 'MiniCPM-V-2_6' in model_case ) and 'turbomind' in backend_type and extra is not None and 'communicator' in extra and extra.get( - 'communicator') == 'native' and tp > 1: + 'communicator') == 'cuda-ipc' and tp > 1: return pipeline_chat_log = os.path.join( diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index 88c9468823..eca66f2fc0 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -31,7 +31,7 @@ def start_restful_api(config, param, model, model_path, backend_type, worker_id) # temp remove testcase because of issue 3434 if ('InternVL3' in model or 'InternVL2_5' in model or 'MiniCPM-V-2_6' in model): - if 'turbomind' in backend_type and extra is not None and 'communicator native' in extra and tp_num > 1: + if 'turbomind' in backend_type and extra is not None and 'cuda-ipc' in extra and tp_num > 1: return if 'modelscope' in param.keys(): From cf1ddccd17cf73f61da3699c332cfdc26e26e617 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Fri, 19 Sep 2025 18:19:27 +0800 Subject: [PATCH 08/28] update --- .github/workflows/api_eval_h800.yml | 2 +- .github/workflows/evaluate.yml | 2 +- .github/workflows/evaluate_h800.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/api_eval_h800.yml b/.github/workflows/api_eval_h800.yml index 16b9ad07dc..f655b459a6 100644 --- a/.github/workflows/api_eval_h800.yml +++ 
b/.github/workflows/api_eval_h800.yml @@ -73,7 +73,7 @@ jobs: test_evaluation: needs: linux-build if: ${{ !cancelled() }} - runs-on: [self-hosted, test-140] + runs-on: [self-hosted, h800-r1] timeout-minutes: 2400 strategy: fail-fast: false diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml index bfb0840b34..d5c38605dc 100644 --- a/.github/workflows/evaluate.yml +++ b/.github/workflows/evaluate.yml @@ -81,7 +81,7 @@ jobs: evaluate: needs: linux-build if: ${{github.event_name == 'schedule' || !cancelled()}} - runs-on: [self-hosted, linux-eval] + runs-on: [self-hosted, test-140] timeout-minutes: 4320 # 72hours strategy: fail-fast: false diff --git a/.github/workflows/evaluate_h800.yml b/.github/workflows/evaluate_h800.yml index 8b8544e5da..4d5cb73796 100644 --- a/.github/workflows/evaluate_h800.yml +++ b/.github/workflows/evaluate_h800.yml @@ -81,7 +81,7 @@ jobs: evaluate: needs: linux-build if: ${{github.event_name == 'schedule' || !cancelled()}} - runs-on: [self-hosted, linux-eval] + runs-on: [self-hosted, h800-r1] timeout-minutes: 4320 # 72hours strategy: fail-fast: false From b50e1f5c5c77b227f28160bdd193a2be370ea370 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Fri, 19 Sep 2025 19:17:30 +0800 Subject: [PATCH 09/28] update --- .github/workflows/api_eval_h800.yml | 2 +- .github/workflows/evaluate_h800.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/api_eval_h800.yml b/.github/workflows/api_eval_h800.yml index f655b459a6..496e0da25d 100644 --- a/.github/workflows/api_eval_h800.yml +++ b/.github/workflows/api_eval_h800.yml @@ -80,7 +80,7 @@ jobs: matrix: backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: m.daocloud.io/docker.io/openmmlab/lmdeploy:latest-cu12.8 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip diff --git a/.github/workflows/evaluate_h800.yml b/.github/workflows/evaluate_h800.yml index 4d5cb73796..e54e939c53 100644 --- a/.github/workflows/evaluate_h800.yml +++ b/.github/workflows/evaluate_h800.yml @@ -88,7 +88,7 @@ jobs: matrix: evaluate_type: ['base'] container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: m.daocloud.io/docker.io/openmmlab/lmdeploy:latest-cu12.8 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip From 795132580a4c48a9df7d8e02b596c6ae03b3aa57 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Sun, 21 Sep 2025 00:21:39 +0800 Subject: [PATCH 10/28] Update eval_base_config.py --- .github/scripts/eval_base_config.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/.github/scripts/eval_base_config.py b/.github/scripts/eval_base_config.py index 7c9d151715..d8dc388c7d 100644 --- a/.github/scripts/eval_base_config.py +++ b/.github/scripts/eval_base_config.py @@ -138,58 +138,77 @@ turbomind_qwen2_5_1_5b = deepcopy(base_model) turbomind_qwen2_5_1_5b['path'] = 'Qwen/Qwen2.5-1.5B' +turbomind_qwen2_5_1_5b['abbr'] = 'turbomind_qwen2_5_1_5b' turbomind_qwen2_5_7b = deepcopy(base_model) turbomind_qwen2_5_7b['path'] = 'Qwen/Qwen2.5-7B' +turbomind_qwen2_5_7b['abbr'] = 'turbomind_qwen2_5_7b' turbomind_qwen2_5_32b = deepcopy(base_model) turbomind_qwen2_5_32b['path'] = 'Qwen/Qwen2.5-32B' 
+turbomind_qwen2_5_32b['abbr'] = 'turbomind_qwen2_5_32b' turbomind_qwen2_5_32b['run_cfg']['num_gpus'] = 2 turbomind_qwen2_5_32b['engine_config']['tp'] = 2 turbomind_internlm2_5_7b = deepcopy(base_model) turbomind_internlm2_5_7b['path'] = 'internlm/internlm2_5-7b-chat' +turbomind_internlm2_5_7b['abbr'] = 'turbomind_internlm2_5_7b' turbomind_glm_4_9b = deepcopy(base_model) turbomind_glm_4_9b['path'] = 'THUDM/glm-4-9b' +turbomind_glm_4_9b['abbr'] = 'turbomind_glm_4_9b' turbomind_llama_3_70b = deepcopy(base_model) turbomind_llama_3_70b['path'] = 'meta-llama/Meta-Llama-3-70B' +turbomind_llama_3_70b['abbr'] = 'turbomind_llama_3_70b' turbomind_llama_3_70b['run_cfg']['num_gpus'] = 4 turbomind_llama_3_70b['engine_config']['tp'] = 4 turbomind_llama_3_1_8b = deepcopy(base_model) turbomind_llama_3_1_8b['path'] = 'meta-llama/Llama-3.1-8B' +turbomind_llama_3_1_8b['abbr'] = 'turbomind_llama_3_1_8b' turbomind_qwen3_0_6b_base = deepcopy(base_model) turbomind_qwen3_0_6b_base['path'] = 'Qwen/Qwen3-0.6B-Base' +turbomind_qwen3_0_6b_base['abbr'] = 'turbomind_qwen3_0_6b_base' turbomind_qwen3_8b_base = deepcopy(base_model) turbomind_qwen3_8b_base['path'] = 'Qwen/Qwen3-8B-Base' +turbomind_qwen3_8b_base['abbr'] = 'turbomind_qwen3_8b_base' turbomind_qwen3_30b_A3B_base = deepcopy(base_model) turbomind_qwen3_30b_A3B_base['path'] = 'Qwen/Qwen3-30B-A3B-Base' +turbomind_qwen3_30b_A3B_base['abbr'] = 'turbomind_qwen3_30b_A3B_base' turbomind_qwen3_30b_A3B_base['run_cfg']['num_gpus'] = 2 turbomind_qwen3_30b_A3B_base['engine_config']['tp'] = 2 pytorch_qwen2_5_1_5b = deepcopy(base_model) pytorch_qwen2_5_1_5b['path'] = 'Qwen/Qwen2.5-1.5B' +pytorch_qwen2_5_1_5b['abbr'] = 'pytorch_qwen2_5_1_5b' pytorch_qwen2_5_7b = deepcopy(base_model) pytorch_qwen2_5_7b['path'] = 'Qwen/Qwen2.5-7B' +pytorch_qwen2_5_7b['abbr'] = 'pytorch_qwen2_5_7b' pytorch_qwen2_5_32b = deepcopy(base_model) pytorch_qwen2_5_32b['path'] = 'Qwen/Qwen2.5-32B' +pytorch_qwen2_5_32b['abbr'] = 'pytorch_qwen2_5_32b' pytorch_qwen2_5_32b['run_cfg']['num_gpus'] = 2 pytorch_qwen2_5_32b['engine_config']['tp'] = 2 pytorch_internlm2_5_7b = deepcopy(base_model) pytorch_internlm2_5_7b['path'] = 'internlm/internlm2_5-7b-chat' +pytorch_internlm2_5_7b['abbr'] = 'pytorch_internlm2_5_7b' pytorch_gemma_2_9b = deepcopy(base_model) pytorch_gemma_2_9b['path'] = 'google/gemma-2-9b' +pytorch_gemma_2_9b['abbr'] = 'pytorch_gemma_2_9b' pytorch_llama_3_70b = deepcopy(base_model) pytorch_llama_3_70b['path'] = 'meta-llama/Meta-Llama-3-70B' +pytorch_llama_3_70b['abbr'] = 'pytorch_llama_3_70b' pytorch_llama_3_70b['run_cfg']['num_gpus'] = 4 pytorch_llama_3_70b['engine_config']['tp'] = 4 pytorch_llama_3_1_8b = deepcopy(base_model) pytorch_llama_3_1_8b['path'] = 'meta-llama/Llama-3.1-8B' +pytorch_llama_3_1_8b['abbr'] = 'pytorch_llama_3_1_8b' pytorch_qwen3_0_6b_base = deepcopy(base_model) pytorch_qwen3_0_6b_base['path'] = 'Qwen/Qwen3-0.6B-Base' +pytorch_qwen3_0_6b_base['abbr'] = 'pytorch_qwen3_0_6b_base' pytorch_qwen3_8b_base = deepcopy(base_model) pytorch_qwen3_8b_base['path'] = 'Qwen/Qwen3-8B-Base' +pytorch_qwen3_8b_base['abbr'] = 'pytorch_qwen3_8b_base' pytorch_qwen3_30b_A3B_base = deepcopy(base_model) pytorch_qwen3_30b_A3B_base['path'] = 'Qwen/Qwen3-30B-A3B-Base' +pytorch_qwen3_30b_A3B_base['abbr'] = 'pytorch_qwen3_30b_A3B_base' pytorch_qwen3_30b_A3B_base['run_cfg']['num_gpus'] = 2 pytorch_qwen3_30b_A3B_base['engine_config']['tp'] = 2 for model in [v for k, v in locals().items() if k.startswith('pytorch_')]: - model['abbr'] = model['abbr'].replace('turbomind', 'pytorch') model['backend'] = 'pytorch' 
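
The eval_base_config.py changes above repeat the same pattern for every model variant: deep-copy the shared base_model template, then override path and abbr and, for multi-GPU runs, engine_config['tp'] and run_cfg['num_gpus']. Giving each variant an explicit abbr (rather than deriving it with replace('turbomind', 'pytorch')) keeps the names aligned with the base_models identifiers the workflow passes to action_tools.py. A small helper could capture that pattern; the sketch below is only an illustration of the idea — make_variant is not part of these patches, and it assumes base_model keeps the engine_config / run_cfg layout defined earlier in the config.

    from copy import deepcopy

    def make_variant(base, abbr, path, tp=1, backend=None):
        # Clone the shared opencompass template and override the per-model
        # fields, mirroring the hand-written turbomind_* / pytorch_* blocks.
        model = deepcopy(base)
        model['abbr'] = abbr
        model['path'] = path
        model['engine_config']['tp'] = tp
        model['run_cfg']['num_gpus'] = tp
        if backend is not None:
            model['backend'] = backend
        return model

    # e.g. turbomind_qwen2_5_32b = make_variant(base_model, 'turbomind_qwen2_5_32b',
    #                                           'Qwen/Qwen2.5-32B', tp=2)
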
From a4a903bec4c862974f0044d983cb045240955f76 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Sun, 21 Sep 2025 00:49:44 +0800 Subject: [PATCH 11/28] update --- .github/workflows/api_eval_h800.yml | 2 +- .github/workflows/daily_ete_test_h800.yml | 2 -- autotest/config-h800.yaml | 1 + autotest/config.yaml | 1 + 4 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/api_eval_h800.yml b/.github/workflows/api_eval_h800.yml index 496e0da25d..b7cc491d59 100644 --- a/.github/workflows/api_eval_h800.yml +++ b/.github/workflows/api_eval_h800.yml @@ -32,7 +32,6 @@ env: OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt DEEPSEEK_VL: /nvme/qa_test_models/offline_pkg/DeepSeek-VL - DEVICE: h800 jobs: linux-build: @@ -115,6 +114,7 @@ jobs: run: | python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt + mv autotest/config-h800.yaml autotest/config.yaml - name: Install opencompass run: | python3 -m pip install opencompass diff --git a/.github/workflows/daily_ete_test_h800.yml b/.github/workflows/daily_ete_test_h800.yml index 1dab90bebf..9f1db0dce8 100644 --- a/.github/workflows/daily_ete_test_h800.yml +++ b/.github/workflows/daily_ete_test_h800.yml @@ -179,7 +179,6 @@ jobs: lmdeploy check_env rm -rf allure-results # remove tmp log in testcase - rm -rf /nvme/qa_test_models/autotest_model/log/* mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test lmdeploy - chat @@ -270,7 +269,6 @@ jobs: lmdeploy check_env rm -rf allure-results # remove tmp log in testcase - rm -rf /nvme/qa_test_models/autotest_model/log/* mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Start restful api diff --git a/autotest/config-h800.yaml b/autotest/config-h800.yaml index 553039938d..d2ed946ac2 100644 --- a/autotest/config-h800.yaml +++ b/autotest/config-h800.yaml @@ -2,6 +2,7 @@ model_path: /nvme/qa_test_models resource_path: /nvme/qa_test_models/resource dst_path: /nvme/qa_test_models/autotest_model log_path: /nvme/qa_test_models/autotest_model/log +eval_log_path: /nvme/qa_test_models/evaluation_report benchmark_path: /nvme/qa_test_models/benchmark-reports dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json env_tag: h800 diff --git a/autotest/config.yaml b/autotest/config.yaml index 4d82aabc81..8973f21fd5 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -2,6 +2,7 @@ model_path: /nvme/qa_test_models resource_path: /nvme/qa_test_models/resource dst_path: /nvme/qa_test_models/autotest_model log_path: /nvme/qa_test_models/autotest_model/log +eval_log_path: /nvme/qa_test_models/evaluation_report benchmark_path: /nvme/qa_test_models/benchmark-reports dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json env_tag: a100 From cbed0dcf58216bd59466156888481a6431f523bc Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Mon, 22 Sep 2025 09:53:28 +0800 Subject: [PATCH 12/28] update --- autotest/evaluate/eval_config_chat.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/autotest/evaluate/eval_config_chat.py b/autotest/evaluate/eval_config_chat.py index 549605ac76..ea37f858cf 100644 --- a/autotest/evaluate/eval_config_chat.py +++ b/autotest/evaluate/eval_config_chat.py @@ -6,6 +6,14 @@ from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import mmlu_datasets 
# noqa: F401, E501 from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups # noqa: F401, E501 +mmlu_datasets = [ + x for x in mmlu_datasets if x['abbr'].replace('lukaemon_mmlu_', '') in [ + 'business_ethics', 'clinical_knowledge', 'college_medicine', 'global_facts', 'human_aging', 'management', + 'marketing', 'medical_genetics', 'miscellaneous', 'nutrition', 'professional_accounting', + 'professional_medicine', 'virology' + ] +] + datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], []) MODEL_NAME = '' @@ -35,6 +43,7 @@ dataset_abbrs=[ ['mmlu', 'naive_average'], ['gsm8k', 'accuracy'], + 'mmlu-other', ], summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []), ) From a72ea20510beb8fdc519298312815579ebbcbda7 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Mon, 22 Sep 2025 11:21:27 +0800 Subject: [PATCH 13/28] update max_out_len --- autotest/evaluate/eval_config_chat.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/autotest/evaluate/eval_config_chat.py b/autotest/evaluate/eval_config_chat.py index ea37f858cf..122f4e1a94 100644 --- a/autotest/evaluate/eval_config_chat.py +++ b/autotest/evaluate/eval_config_chat.py @@ -1,5 +1,6 @@ from mmengine.config import read_base from opencompass.models import OpenAISDK +from opencompass.utils.text_postprocessors import extract_non_reasoning_content with read_base(): from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets # noqa: F401, E501 @@ -26,17 +27,16 @@ ]) models = [ - dict( - type=OpenAISDK, - abbr=f'{MODEL_NAME}-lmdeploy-api', - openai_api_base=API_BASE, - key='EMPTY', - path=MODEL_PATH, - meta_template=api_meta_template, - max_out_len=2048, - batch_size=500, - temperature=0.1, - ) + dict(type=OpenAISDK, + abbr=f'{MODEL_NAME}-lmdeploy-api', + openai_api_base=API_BASE, + key='EMPTY', + path=MODEL_PATH, + meta_template=api_meta_template, + max_out_len=32768, + batch_size=500, + temperature=0.1, + pred_postprocessor=dict(type=extract_non_reasoning_content)) ] summarizer = dict( From a42ac7ee2adae7c1bc43f51b8be7514e0b802252 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Mon, 22 Sep 2025 11:48:23 +0800 Subject: [PATCH 14/28] set oc data path --- .github/workflows/api_eval.yml | 2 +- .github/workflows/api_eval_h800.yml | 1 + .github/workflows/evaluate.yml | 2 +- .github/workflows/evaluate_h800.yml | 2 +- 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/api_eval.yml b/.github/workflows/api_eval.yml index e10b6e01a8..9e3e327f5a 100644 --- a/.github/workflows/api_eval.yml +++ b/.github/workflows/api_eval.yml @@ -32,6 +32,7 @@ env: OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt DEEPSEEK_VL: /nvme/qa_test_models/offline_pkg/DeepSeek-VL + COMPASS_DATA_CACHE: /nvme/qa_test_models/compass_data_cache jobs: linux-build: @@ -85,7 +86,6 @@ jobs: - /nvme/github-actions/pip-cache:/root/.cache/pip - /nvme/github-actions/packages:/root/packages - /nvme/github-actions/resources:/root/resources - - /nvme/github-actions/opencompass-data:/root/opencompass-data - /nvme/qa_test_models/evaluation-reports:/root/evaluation-reports - /nvme/qa_test_models:/nvme/qa_test_models - /mnt/shared:/mnt/shared diff --git a/.github/workflows/api_eval_h800.yml b/.github/workflows/api_eval_h800.yml index b7cc491d59..0d3518ffde 100644 --- a/.github/workflows/api_eval_h800.yml +++ b/.github/workflows/api_eval_h800.yml @@ -32,6 +32,7 @@ env: 
OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt DEEPSEEK_VL: /nvme/qa_test_models/offline_pkg/DeepSeek-VL + COMPASS_DATA_CACHE: /nvme/qa_test_models/compass_data_cache jobs: linux-build: diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml index d5c38605dc..4395ee69b4 100644 --- a/.github/workflows/evaluate.yml +++ b/.github/workflows/evaluate.yml @@ -41,6 +41,7 @@ on: env: ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true + COMPASS_DATA_CACHE: /nvme/qa_test_models/compass_data_cache jobs: linux-build: @@ -94,7 +95,6 @@ jobs: - /nvme/github-actions/pip-cache:/root/.cache/pip - /nvme/github-actions/packages:/root/packages - /nvme/github-actions/resources:/root/resources - - /nvme/github-actions/opencompass-data:/root/opencompass-data - /nvme/qa_test_models/evaluation-reports:/root/evaluation-reports - /nvme/qa_test_models:/root/models - /mnt/187:/mnt/187 diff --git a/.github/workflows/evaluate_h800.yml b/.github/workflows/evaluate_h800.yml index e54e939c53..afea663963 100644 --- a/.github/workflows/evaluate_h800.yml +++ b/.github/workflows/evaluate_h800.yml @@ -41,6 +41,7 @@ on: env: ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true + COMPASS_DATA_CACHE: /nvme/qa_test_models/compass_data_cache jobs: linux-build: @@ -94,7 +95,6 @@ jobs: - /nvme/github-actions/pip-cache:/root/.cache/pip - /nvme/github-actions/packages:/root/packages - /nvme/github-actions/resources:/root/resources - - /nvme/github-actions/opencompass-data:/root/opencompass-data - /nvme/qa_test_models/evaluation-reports:/root/evaluation-reports - /nvme/qa_test_models:/root/models - /nvme/qa_test_models:/nvme/qa_test_models From 94f8b85810b3e9751b397f9657cbdd5dffbf298c Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Mon, 22 Sep 2025 12:57:21 +0800 Subject: [PATCH 15/28] update --- autotest/utils/evaluate_utils.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/autotest/utils/evaluate_utils.py b/autotest/utils/evaluate_utils.py index 417c55bff7..82dccb92f6 100644 --- a/autotest/utils/evaluate_utils.py +++ b/autotest/utils/evaluate_utils.py @@ -39,7 +39,7 @@ def write_to_summary(model_name, tp_num, result, msg, worker_id, backend_type, w dataset = row[0] metric_value = row[4] try: - metrics[dataset] = f'{float(metric_value):.2f}' + metrics[dataset] = f'{float(metric_value):.2f}' # noqa: E231 except ValueError: metrics[dataset] = metric_value @@ -47,6 +47,7 @@ def write_to_summary(model_name, tp_num, result, msg, worker_id, backend_type, w print(f'Error reading metrics: {str(e)}') mmlu_value = metrics.get('mmlu', '') + mmlu_value = metrics.get('mmlu-other', '') gsm8k_value = metrics.get('gsm8k', '') summary_line = f'| {model_name} | {backend_type} | TP{tp_num} | {status} | {mmlu_value} | {gsm8k_value} |\n' @@ -57,8 +58,8 @@ def write_to_summary(model_name, tp_num, result, msg, worker_id, backend_type, w with open(summary_file, 'a') as f: if write_header: f.write('## Model Evaluation Results\n') - f.write('| Model | Backend | TP | Status | mmlu | gsm8k |\n') - f.write('|-------|---------|----|--------|------|-------|\n') + f.write('| Model | Backend | TP | Status | mmlu | mmlu-other | gsm8k |\n') + f.write('|-------|---------|----|--------|------|------------|-------|\n') f.write(summary_line) else: print(f'Summary: {model_name} | {backend_type} | TP{tp_num} | {status} | {mmlu_value} | {gsm8k_value}') @@ -101,18 +102,18 @@ def restful_test(config, run_id, prepare_environment, 
worker_id='gw0', port=DEFA try: if not os.path.exists(config_file): - return False, f'Config file {config_file} not found in any expected location' + return False, f'Config file {config_file} not found' cfg = Config.fromfile(config_file) cfg.MODEL_NAME = summary_model_name cfg.MODEL_PATH = model_path - cfg.API_BASE = f'http://127.0.0.1:{port}/v1' + cfg.API_BASE = f'http://127.0.0.1:{port}/v1' # noqa: E231 if cfg.models and len(cfg.models) > 0: model_cfg = cfg.models[0] model_cfg['abbr'] = f'{summary_model_name}-lmdeploy-api' - model_cfg['openai_api_base'] = f'http://127.0.0.1:{port}/v1' + model_cfg['openai_api_base'] = f'http://127.0.0.1:{port}/v1' # noqa: E231 model_cfg['path'] = model_path if 'backend' in model_cfg: model_cfg['backend'] = backend_type @@ -120,7 +121,7 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA if 'engine_config' in model_cfg and 'communicator' in model_cfg['engine_config']: model_cfg['engine_config']['communicator'] = communicator - temp_config_file = f'temp_{model_name.replace("/", "_")}_{os.getpid()}.py' + temp_config_file = f'temp_{model_name.replace('/', '_')}_{os.getpid()}.py' temp_config_path = os.path.join(log_path, temp_config_file) cfg.dump(temp_config_path) @@ -147,16 +148,16 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA f.write(f'Config file: {temp_config_file}\n') f.write(f'Backend: {backend_type}\n') f.write(f'TP Num: {tp_num}\n') - f.write(f'Command: {" ".join(cmd)}\n') + f.write(f'Command: {' '.join(cmd)}\n') f.write(f'Work directory: {work_dir}\n') - f.write(f'STDOUT:\n{stdout_output}\n') + f.write(f'STDOUT: \n{stdout_output}\n') if stderr_output: - f.write(f'STDERR:\n{stderr_output}\n') + f.write(f'STDERR: \n{stderr_output}\n') f.write(f'Return code: {result.returncode}\n') - print(f'STDOUT:\n{stdout_output}') + print(f'STDOUT: \n{stdout_output}') if stderr_output: - print(f'STDERR:\n{stderr_output}') + print(f'STDERR: \n{stderr_output}') print(f'Return code: {result.returncode}') evaluation_failed = False @@ -185,7 +186,7 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA if any(keyword in line for keyword in error_keywords): error_lines.append(line) if error_lines: - final_msg += f'\nLog errors: {" | ".join(error_lines[:3])}' + final_msg += f'\nLog errors: {' | '.join(error_lines[:3])}' write_to_summary(summary_model_name, tp_num, final_result, final_msg, worker_id, backend_type, work_dir) From bb37a846f7ff3635fd6d7e57589e93c4fb63585b Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Mon, 22 Sep 2025 13:19:30 +0800 Subject: [PATCH 16/28] update --- autotest/utils/evaluate_utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/autotest/utils/evaluate_utils.py b/autotest/utils/evaluate_utils.py index 82dccb92f6..1c68b11707 100644 --- a/autotest/utils/evaluate_utils.py +++ b/autotest/utils/evaluate_utils.py @@ -121,7 +121,8 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA if 'engine_config' in model_cfg and 'communicator' in model_cfg['engine_config']: model_cfg['engine_config']['communicator'] = communicator - temp_config_file = f'temp_{model_name.replace('/', '_')}_{os.getpid()}.py' + simple_model_name = model_name.replace('/', '_') + temp_config_file = f'temp_{simple_model_name}_{os.getpid()}.py' temp_config_path = os.path.join(log_path, temp_config_file) cfg.dump(temp_config_path) @@ -142,13 +143,14 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA 
f'{worker_id}_' f'{quant_policy}.log') log_file = os.path.join(log_path, log_filename) + cmd_command = ' '.join(cmd) with open(log_file, 'w', encoding='utf-8') as f: f.write(f'Model: {model_name}\n') f.write(f'Config file: {temp_config_file}\n') f.write(f'Backend: {backend_type}\n') f.write(f'TP Num: {tp_num}\n') - f.write(f'Command: {' '.join(cmd)}\n') + f.write(f'Command: {cmd_command}\n') f.write(f'Work directory: {work_dir}\n') f.write(f'STDOUT: \n{stdout_output}\n') if stderr_output: From 440a833239cf0853ca1b8e35539ee77cbad53132 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Mon, 22 Sep 2025 13:38:09 +0800 Subject: [PATCH 17/28] update --- autotest/utils/evaluate_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/autotest/utils/evaluate_utils.py b/autotest/utils/evaluate_utils.py index 1c68b11707..ef19b753fd 100644 --- a/autotest/utils/evaluate_utils.py +++ b/autotest/utils/evaluate_utils.py @@ -188,7 +188,8 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA if any(keyword in line for keyword in error_keywords): error_lines.append(line) if error_lines: - final_msg += f'\nLog errors: {' | '.join(error_lines[:3])}' + error_lines = ' | '.join(error_lines[:3]) + final_msg += f'\nLog errors: {error_lines}' write_to_summary(summary_model_name, tp_num, final_result, final_msg, worker_id, backend_type, work_dir) From db5ac5099a47cd9f66cb043aa01c8bd1a0c30074 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Mon, 22 Sep 2025 15:31:49 +0800 Subject: [PATCH 18/28] update --- .github/workflows/api_eval.yml | 1 + .github/workflows/api_eval_h800.yml | 1 + autotest/config-h800.yaml | 6 +++++- autotest/utils/evaluate_utils.py | 19 +++++++++++-------- 4 files changed, 18 insertions(+), 9 deletions(-) diff --git a/.github/workflows/api_eval.yml b/.github/workflows/api_eval.yml index 9e3e327f5a..0adc782557 100644 --- a/.github/workflows/api_eval.yml +++ b/.github/workflows/api_eval.yml @@ -108,6 +108,7 @@ jobs: - name: Install lmdeploy - dependency run: | python3 -m pip install -r requirements_cuda.txt + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | python3 -m pip install lmdeploy-*.whl --no-deps diff --git a/.github/workflows/api_eval_h800.yml b/.github/workflows/api_eval_h800.yml index 0d3518ffde..72fa84b97a 100644 --- a/.github/workflows/api_eval_h800.yml +++ b/.github/workflows/api_eval_h800.yml @@ -111,6 +111,7 @@ jobs: - name: Install lmdeploy - dependency run: | python3 -m pip install -r requirements_cuda.txt + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | python3 -m pip install lmdeploy-*.whl --no-deps diff --git a/autotest/config-h800.yaml b/autotest/config-h800.yaml index d2ed946ac2..f8a7cd2751 100644 --- a/autotest/config-h800.yaml +++ b/autotest/config-h800.yaml @@ -10,7 +10,7 @@ env_tag: h800 tp_config: Intern-S1: 8 Qwen3-235B-A22B: 8 - Qwen3-235B-A22B-FP8: 8 + Qwen3-235B-A22B-FP8: 4 Qwen3-30B-A3B: 2 Qwen3-32B: 2 gpt-oss-120b: 2 @@ -131,8 +131,12 @@ evaluate_model: - Qwen/Qwen3-4B-FP8 - Qwen/Qwen3-8B-FP8 - Qwen/Qwen3-14B-FP8 + - Qwen/Qwen3-32B + - Qwen/Qwen3-32B-FP8 - Qwen/Qwen3-30B-A3B - Qwen/Qwen3-30B-A3B-FP8 + - Qwen/Qwen3-235B-A22B + - Qwen/Qwen3-235B-A22B-FP8 - openai/gpt-oss-120b - openai/gpt-oss-20b - unsloth/gpt-oss-120b-BF16 diff --git a/autotest/utils/evaluate_utils.py b/autotest/utils/evaluate_utils.py index ef19b753fd..7a7ae5d204 100644 --- a/autotest/utils/evaluate_utils.py +++ b/autotest/utils/evaluate_utils.py @@ -46,23 +46,26 @@ def 
write_to_summary(model_name, tp_num, result, msg, worker_id, backend_type, w except Exception as e: print(f'Error reading metrics: {str(e)}') - mmlu_value = metrics.get('mmlu', '') - mmlu_value = metrics.get('mmlu-other', '') - gsm8k_value = metrics.get('gsm8k', '') + dataset_name = [] + dataset_metrics = [] + for key in metrics.keys(): + dataset_name.append(key) + dataset_metrics.append(metrics.get(key, '')) - summary_line = f'| {model_name} | {backend_type} | TP{tp_num} | {status} | {mmlu_value} | {gsm8k_value} |\n' + summary_dataset_name = ' | '.join(dataset_name) + summary_dataset_metrics = ' | '.join(dataset_metrics) summary_file = os.environ.get('GITHUB_STEP_SUMMARY', None) + summary_line = f'| {model_name} | {backend_type} | TP{tp_num} | {status} | {summary_dataset_metrics} |\n' if summary_file: write_header = not os.path.exists(summary_file) or os.path.getsize(summary_file) == 0 with open(summary_file, 'a') as f: if write_header: f.write('## Model Evaluation Results\n') - f.write('| Model | Backend | TP | Status | mmlu | mmlu-other | gsm8k |\n') - f.write('|-------|---------|----|--------|------|------------|-------|\n') + f.write(f'| Model | Backend | TP | Status | {summary_dataset_name} |\n') f.write(summary_line) else: - print(f'Summary: {model_name} | {backend_type} | TP{tp_num} | {status} | {mmlu_value} | {gsm8k_value}') + print(f'Summary: {model_name} | {backend_type} | TP{tp_num} | {status} | {summary_dataset_metrics}') def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFAULT_PORT): @@ -91,7 +94,7 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA print(f'Backend: {backend_type}') print(f'Config file: {config_file}') - log_path = config.get('log_path', '/nvme/qa_test_models/autotest_model/log') + log_path = config.get('eval_log_path', '/nvme/qa_test_models/autotest_model/log') os.makedirs(log_path, exist_ok=True) original_cwd = os.getcwd() From 7dc54bd3108258758de4e7582808782465746efa Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Mon, 22 Sep 2025 15:36:01 +0800 Subject: [PATCH 19/28] update --- autotest/evaluate/eval_config_chat.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/autotest/evaluate/eval_config_chat.py b/autotest/evaluate/eval_config_chat.py index 122f4e1a94..34bd4300c1 100644 --- a/autotest/evaluate/eval_config_chat.py +++ b/autotest/evaluate/eval_config_chat.py @@ -47,3 +47,7 @@ ], summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []), ) + +for item in datasets: + if 'max_out_len' in item['infer_cfg']['inferencer']: + del item['infer_cfg']['inferencer']['max_out_len'] From 8df42da302c6576b21e5fee03f38485e2b90c1d4 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Mon, 22 Sep 2025 18:45:21 +0800 Subject: [PATCH 20/28] update --- autotest/utils/evaluate_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/autotest/utils/evaluate_utils.py b/autotest/utils/evaluate_utils.py index 7a7ae5d204..481344f512 100644 --- a/autotest/utils/evaluate_utils.py +++ b/autotest/utils/evaluate_utils.py @@ -61,8 +61,10 @@ def write_to_summary(model_name, tp_num, result, msg, worker_id, backend_type, w write_header = not os.path.exists(summary_file) or os.path.getsize(summary_file) == 0 with open(summary_file, 'a') as f: if write_header: + dash_line = '-----|' * (len(metrics.keys())) f.write('## Model Evaluation Results\n') f.write(f'| Model | Backend | TP | Status | {summary_dataset_name} |\n') + f.write(f'|-------|---------|----|--------|{dash_line}\n') 
f.write(summary_line) else: print(f'Summary: {model_name} | {backend_type} | TP{tp_num} | {status} | {summary_dataset_metrics}') @@ -94,7 +96,7 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA print(f'Backend: {backend_type}') print(f'Config file: {config_file}') - log_path = config.get('eval_log_path', '/nvme/qa_test_models/autotest_model/log') + log_path = config.get('eval_log_path', '/nvme/qa_test_models/evaluation_report') + f'/{run_id}' os.makedirs(log_path, exist_ok=True) original_cwd = os.getcwd() From 506c0e37d836af0010a3ee98d21a47ffe4bfbefe Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Mon, 22 Sep 2025 18:58:04 +0800 Subject: [PATCH 21/28] update --- autotest/config.yaml | 12 ++++++------ autotest/utils/run_restful_chat.py | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/autotest/config.yaml b/autotest/config.yaml index 8973f21fd5..87c428fb18 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -36,8 +36,8 @@ tp_config: MiniCPM-V-2_6: 2 gemma-2-27b-it: 2 InternVL2-Llama3-76B-AWQ: 4 - gpt-oss-20b-bf16: 2 - gpt-oss-120b-bf16: 4 + gpt-oss-20b-BF16: 2 + gpt-oss-120b-BF16: 4 @@ -143,8 +143,8 @@ pytorch_chat_model: - Qwen/Qwen2.5-VL-32B-Instruct - Qwen/Qwen2-VL-2B-Instruct - Qwen/Qwen2-VL-7B-Instruct - - lmsys/gpt-oss-20b-bf16 - - lmsys/gpt-oss-120b-bf16 + - unsloth/gpt-oss-20b-BF16 + - unsloth/gpt-oss-120b-BF16 - mistralai/Mistral-7B-Instruct-v0.3 - mistralai/Mixtral-8x7B-Instruct-v0.1 - google/gemma-3-12b-it @@ -372,8 +372,8 @@ benchmark_model: - Qwen/Qwen2.5-7B-Instruct - Qwen/Qwen2.5-72B-Instruct - deepseek-ai/DeepSeek-V2-Lite-Chat - - lmsys/gpt-oss-20b-bf16 - - lmsys/gpt-oss-120b-bf16 + - unsloth/gpt-oss-20b-BF16 + - unsloth/gpt-oss-120b-BF16 evaluate_model: diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index f499ca5df5..876fd295e2 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -98,8 +98,8 @@ def start_restful_api(config, param, model, model_path, backend_type, worker_id) http_url = BASE_HTTP_URL + ':' + str(port) start_time = int(time()) start_timeout = 300 - if not _is_bf16_supported_by_device(): - start_timeout = 600 + if not _is_bf16_supported_by_device() or tp_num >= 4: + start_timeout = 720 sleep(5) for i in range(start_timeout): From d16448a670eb2c154c5ac83e22941930ba7a8d27 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Tue, 23 Sep 2025 15:10:09 +0800 Subject: [PATCH 22/28] update --- .github/workflows/evaluate.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml index 4395ee69b4..9079d1fa68 100644 --- a/.github/workflows/evaluate.yml +++ b/.github/workflows/evaluate.yml @@ -98,6 +98,7 @@ jobs: - /nvme/qa_test_models/evaluation-reports:/root/evaluation-reports - /nvme/qa_test_models:/root/models - /mnt/187:/mnt/187 + - /mnt/140:/mnt/140 - /mnt/bigdisk:/mnt/bigdisk - /mnt/shared:/mnt/shared - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro From aa6a8b88333f40592544e01a80c01ca6ec07ddc5 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Tue, 23 Sep 2025 21:45:13 +0800 Subject: [PATCH 23/28] Update evaluate_h800.yml --- .github/workflows/evaluate_h800.yml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/workflows/evaluate_h800.yml b/.github/workflows/evaluate_h800.yml index afea663963..07e66f9dc3 100644 --- a/.github/workflows/evaluate_h800.yml +++ 
b/.github/workflows/evaluate_h800.yml @@ -95,8 +95,6 @@ jobs: - /nvme/github-actions/pip-cache:/root/.cache/pip - /nvme/github-actions/packages:/root/packages - /nvme/github-actions/resources:/root/resources - - /nvme/qa_test_models/evaluation-reports:/root/evaluation-reports - - /nvme/qa_test_models:/root/models - /nvme/qa_test_models:/nvme/qa_test_models - /nvme1/qa_test_models:/nvme1/qa_test_models - /nvme2/share:/nvme2/share @@ -125,7 +123,7 @@ jobs: name: my-artifact-${{ github.run_id }}-py310 - name: Install lmdeploy - dependency run: | - python3 -m pip install -r /root/models/offline_pkg/requirements.txt + python3 -m pip install -r /nvme/qa_test_models/offline_pkg/requirements.txt - name: Install lmdeploy if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: | @@ -149,8 +147,8 @@ jobs: lmdeploy check_env - name: Setup paths for evaluation run: | - ln -s /root/opencompass-data ./data - python3 .github/scripts/action_tools.py create_model_links /root/models . + ln -s /nvme/qa_test_models/opencompass-data ./data + python3 .github/scripts/action_tools.py create_model_links /nvme/qa_test_models . - name: Evaluate base models if: matrix.evaluate_type == 'base' run: | From a245005cef181c0bf76728776ff1fddd267bc102 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Wed, 24 Sep 2025 19:48:30 +0800 Subject: [PATCH 24/28] update --- .github/scripts/eval_base_config.py | 2 +- autotest/config-h800.yaml | 6 ++++++ .../tools/pipeline/test_pipeline_chat_turbomind_mllm.py | 1 + autotest/utils/evaluate_utils.py | 6 +++--- autotest/utils/pipeline_chat.py | 8 ++++---- 5 files changed, 15 insertions(+), 8 deletions(-) diff --git a/.github/scripts/eval_base_config.py b/.github/scripts/eval_base_config.py index d8dc388c7d..347c032464 100644 --- a/.github/scripts/eval_base_config.py +++ b/.github/scripts/eval_base_config.py @@ -132,7 +132,7 @@ gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), max_seq_len=7168, max_out_len=1024, - batch_size=1024, + batch_size=64, run_cfg=dict(num_gpus=1), ) diff --git a/autotest/config-h800.yaml b/autotest/config-h800.yaml index f8a7cd2751..6e456fdbd9 100644 --- a/autotest/config-h800.yaml +++ b/autotest/config-h800.yaml @@ -89,6 +89,8 @@ turbomind_quatization: gptq: - empty no_kvint4: + - internlm/Intern-S1 + - internlm/Intern-S1-mini - Qwen/Qwen3-0.6B-FP8 - Qwen/Qwen3-1.7B-FP8 - Qwen/Qwen3-4B-FP8 @@ -100,6 +102,8 @@ turbomind_quatization: - Qwen/Qwen3-30B-A3B-FP8 - Qwen/Qwen3-32B - Qwen/Qwen3-32B-FP8 + - openai/gpt-oss-120b + - openai/gpt-oss-20b no_kvint8: - empty @@ -109,6 +113,8 @@ pytorch_quatization: w8a8: - empty no_kvint4: + - internlm/Intern-S1 + - internlm/Intern-S1-mini - Qwen/Qwen3-0.6B-FP8 - Qwen/Qwen3-1.7B-FP8 - Qwen/Qwen3-4B-FP8 diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py index da7e255a8e..3323041d1c 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py @@ -168,6 +168,7 @@ def test_pipeline_chat_fallback_backend_kvint8_tp1(config, model, worker_id): @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.parametrize('model', ['meta-llama/Llama-3.2-11B-Vision-Instruct']) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_fallback_backend_kvint8_tp2(config, model, communicator, worker_id): diff --git a/autotest/utils/evaluate_utils.py 
b/autotest/utils/evaluate_utils.py index 481344f512..79444e97f9 100644 --- a/autotest/utils/evaluate_utils.py +++ b/autotest/utils/evaluate_utils.py @@ -48,7 +48,7 @@ def write_to_summary(model_name, tp_num, result, msg, worker_id, backend_type, w dataset_name = [] dataset_metrics = [] - for key in metrics.keys(): + for key in sorted(metrics.keys()): dataset_name.append(key) dataset_metrics.append(metrics.get(key, '')) @@ -100,8 +100,8 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA os.makedirs(log_path, exist_ok=True) original_cwd = os.getcwd() - work_dir = os.path.join( - log_path, f"wk_{backend_type}_{model_name.replace('/', '_')}_{communicator}_{worker_id}_{quant_policy}") + work_dir = os.path.join(log_path, + f"wk_{backend_type}_{model_name.replace('/', '_')}_{communicator}_{quant_policy}") os.makedirs(work_dir, exist_ok=True) try: diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index 2a8349b572..5cd35ffd46 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -58,9 +58,9 @@ def run_pipeline_chat_test(config, text=True, encoding='utf-8', env=env, - timeout=600) + timeout=900) except subprocess.TimeoutExpired as e: - assert False, f'Test command timed out after 10 minutes: {e.cmd}' + assert False, f'Test command timed out after 15 minutes: {e.cmd}' output_text = response.stdout print(output_text) @@ -133,9 +133,9 @@ def run_pipeline_vl_chat_test(config, text=True, encoding='utf-8', env=env, - timeout=600) + timeout=900) except subprocess.TimeoutExpired as e: - assert False, f'Test command timed out after 10 minutes: {e.cmd}' + assert False, f'Test command timed out after 15 minutes: {e.cmd}' output_text = response.stdout print(output_text) From d72ef372388fe551a6ad86b943a8f66057235848 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Wed, 24 Sep 2025 23:26:07 +0800 Subject: [PATCH 25/28] Update eval_base_config.py --- .github/scripts/eval_base_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/eval_base_config.py b/.github/scripts/eval_base_config.py index 347c032464..25e374639d 100644 --- a/.github/scripts/eval_base_config.py +++ b/.github/scripts/eval_base_config.py @@ -132,7 +132,7 @@ gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), max_seq_len=7168, max_out_len=1024, - batch_size=64, + batch_size=32, run_cfg=dict(num_gpus=1), ) From 7cca1ff4aa6680dad54e46dd83cf782956cecd43 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Thu, 25 Sep 2025 14:24:59 +0800 Subject: [PATCH 26/28] update --- .github/workflows/api_eval.yml | 8 ++++---- .github/workflows/api_eval_h800.yml | 8 ++++---- autotest/conftest.py | 4 ---- autotest/utils/evaluate_utils.py | 21 +++++++++++++-------- 4 files changed, 21 insertions(+), 20 deletions(-) diff --git a/.github/workflows/api_eval.yml b/.github/workflows/api_eval.yml index 0adc782557..5e5d49be36 100644 --- a/.github/workflows/api_eval.yml +++ b/.github/workflows/api_eval.yml @@ -128,10 +128,10 @@ jobs: run: | overall_exit=0 ln -s /mnt/187/opencompass-data/data ./data - pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_1 and not pr_test and ${{matrix.backend}}" -n 8 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? - pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_2 and not pr_test and ${{matrix.backend}}" -n 4 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? 
- pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_4 and not pr_test and ${{matrix.backend}}" -n 2 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? - pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_8 and not pr_test and ${{matrix.backend}}" -n 1 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_1 and not pr_test and ${{matrix.backend}}" -n 8 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_2 and not pr_test and ${{matrix.backend}}" -n 4 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_4 and not pr_test and ${{matrix.backend}}" -n 2 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_8 and not pr_test and ${{matrix.backend}}" -n 1 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$? exit $overall_exit - name: Clear workspace if: always() diff --git a/.github/workflows/api_eval_h800.yml b/.github/workflows/api_eval_h800.yml index 72fa84b97a..906d3da1a6 100644 --- a/.github/workflows/api_eval_h800.yml +++ b/.github/workflows/api_eval_h800.yml @@ -132,10 +132,10 @@ jobs: run: | overall_exit=0 ln -s /mnt/187/opencompass-data/data ./data - pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_1 and not pr_test and ${{matrix.backend}}" -n 8 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? - pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_2 and not pr_test and ${{matrix.backend}}" -n 4 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? - pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_4 and not pr_test and ${{matrix.backend}}" -n 2 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? - pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_8 and not pr_test and ${{matrix.backend}}" -n 1 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_1 and not pr_test and ${{matrix.backend}}" -n 8 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_2 and not pr_test and ${{matrix.backend}}" -n 4 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_4 and not pr_test and ${{matrix.backend}}" -n 2 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_8 and not pr_test and ${{matrix.backend}}" -n 1 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$? 
exit $overall_exit - name: Clear workspace if: always() diff --git a/autotest/conftest.py b/autotest/conftest.py index 36392ac1c1..716e149130 100644 --- a/autotest/conftest.py +++ b/autotest/conftest.py @@ -26,10 +26,6 @@ def config(): env_config = yaml.load(f.read(), Loader=yaml.SafeLoader) config_copy = copy.deepcopy(env_config) - github_run_id = os.environ.get('GITHUB_RUN_ID', 'local_run') - if 'log_path' in config_copy: - config_copy['log_path'] = os.path.join(config_copy['log_path'], str(github_run_id)) - os.makedirs(config_copy['log_path'], exist_ok=True) return config_copy diff --git a/autotest/utils/evaluate_utils.py b/autotest/utils/evaluate_utils.py index 79444e97f9..527bb64994 100644 --- a/autotest/utils/evaluate_utils.py +++ b/autotest/utils/evaluate_utils.py @@ -8,7 +8,7 @@ DEFAULT_PORT = 23333 -def write_to_summary(model_name, tp_num, result, msg, worker_id, backend_type, work_dir=None): +def write_to_summary(model_name, tp_num, result, msg, worker_id, backend_type, communicator, work_dir=None): status = '✅ PASS' if result else '❌ FAIL' metrics = {} @@ -56,18 +56,20 @@ def write_to_summary(model_name, tp_num, result, msg, worker_id, backend_type, w summary_dataset_metrics = ' | '.join(dataset_metrics) summary_file = os.environ.get('GITHUB_STEP_SUMMARY', None) - summary_line = f'| {model_name} | {backend_type} | TP{tp_num} | {status} | {summary_dataset_metrics} |\n' + summary_line = f'| {model_name} | {backend_type} | {communicator} | TP{tp_num} | {status} | {summary_dataset_metrics} |\n' # noqa: E501 if summary_file: write_header = not os.path.exists(summary_file) or os.path.getsize(summary_file) == 0 with open(summary_file, 'a') as f: if write_header: dash_line = '-----|' * (len(metrics.keys())) f.write('## Model Evaluation Results\n') - f.write(f'| Model | Backend | TP | Status | {summary_dataset_name} |\n') - f.write(f'|-------|---------|----|--------|{dash_line}\n') + f.write(f'| Model | Backend | Communicator | TP | Status | {summary_dataset_name} |\n') + f.write(f'|-------|---------|--------------|----|--------|{dash_line}\n') f.write(summary_line) else: - print(f'Summary: {model_name} | {backend_type} | TP{tp_num} | {status} | {summary_dataset_metrics}') + print( + f'Summary: {model_name} | {backend_type} | {communicator} | TP{tp_num} | {status} | {summary_dataset_metrics}' # noqa: E501 + ) def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFAULT_PORT): @@ -196,7 +198,8 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA error_lines = ' | '.join(error_lines[:3]) final_msg += f'\nLog errors: {error_lines}' - write_to_summary(summary_model_name, tp_num, final_result, final_msg, worker_id, backend_type, work_dir) + write_to_summary(summary_model_name, tp_num, final_result, final_msg, worker_id, backend_type, communicator, + work_dir) return final_result, final_msg @@ -208,10 +211,12 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA timeout_msg = (f'Evaluation timed out for {model_name} ' f'after 7200 seconds') if work_dir: - write_to_summary(summary_model_name, tp_num, False, timeout_msg, worker_id, backend_type, work_dir) + write_to_summary(summary_model_name, tp_num, False, timeout_msg, worker_id, backend_type, communicator, + work_dir) return False, timeout_msg except Exception as e: error_msg = f'Error during evaluation for {model_name}: {str(e)}' if work_dir: - write_to_summary(summary_model_name, tp_num, False, error_msg, worker_id, backend_type, work_dir) + 
write_to_summary(summary_model_name, tp_num, False, error_msg, worker_id, backend_type, communicator, + work_dir) return False, error_msg From 54c9c14d210d58b2074511dbd66a79f7b65770a0 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Fri, 26 Sep 2025 09:01:50 +0800 Subject: [PATCH 27/28] update --- autotest/conftest.py | 5 +---- autotest/interface/pipeline/test_pipeline_func.py | 5 +++-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/autotest/conftest.py b/autotest/conftest.py index 716e149130..e4c23c13be 100644 --- a/autotest/conftest.py +++ b/autotest/conftest.py @@ -1,4 +1,3 @@ -import copy import os import pytest @@ -25,9 +24,7 @@ def config(): with open(config_path) as f: env_config = yaml.load(f.read(), Loader=yaml.SafeLoader) - config_copy = copy.deepcopy(env_config) - - return config_copy + return env_config @pytest.fixture(scope='session') diff --git a/autotest/interface/pipeline/test_pipeline_func.py b/autotest/interface/pipeline/test_pipeline_func.py index 42f6e95d86..ff97e8d7f1 100644 --- a/autotest/interface/pipeline/test_pipeline_func.py +++ b/autotest/interface/pipeline/test_pipeline_func.py @@ -327,8 +327,9 @@ def run_pipeline_testcase(config, model, backend, file_name): result = True for i in range(2): - result &= response[i].finish_reason == 'length' + result &= response[i].finish_reason == 'error' result &= response[i].generate_token_len == 0 + result &= response[i].text == 'internal error happened, status code ResponseType.INPUT_LENGTH_ERROR' save_pipeline_common_log(config, file_name, result, response) del pipe _clear_device_cache() @@ -422,7 +423,7 @@ def run_pipeline_testcase_bad_words(config, model, backend, file_name): # test bad_words gen_config = GenerationConfig(bad_words=[' and', '浦', ' to']) response = pipe(['Hi, pls intro yourself', 'Shanghai is'], gen_config=gen_config) - result = '蒲' in response[0].text or 'SenseTime' in response[0].text + result = True for i in range(2): result &= '浦' not in response[i].text and ' and' not in response[i].text and ' to ' not in response[i].text save_pipeline_common_log(config, file_name, result, response) From 055745ac856d19a85c30015b3ea71cb284661051 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Fri, 10 Oct 2025 10:02:05 +0800 Subject: [PATCH 28/28] update api outputfolder name --- .github/workflows/api_eval_h800.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/api_eval_h800.yml b/.github/workflows/api_eval_h800.yml index 906d3da1a6..37b7832fc5 100644 --- a/.github/workflows/api_eval_h800.yml +++ b/.github/workflows/api_eval_h800.yml @@ -23,7 +23,7 @@ on: env: HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai - OUTPUT_FOLDER: cuda11.8_dist_${{ github.run_id }} + OUTPUT_FOLDER: cuda12.8_dist_${{ github.run_id }} ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true REPORT_DIR: /nvme/qa_test_models/test-reports/${{ github.run_id }} COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy
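Note on the f-string rewrites in patches 16 and 17: expressions such as f'temp_{model_name.replace('/', '_')}_{os.getpid()}.py' reuse the enclosing quote character inside the replacement field, which does not parse before Python 3.12 (PEP 701). The CI image in this series targets Python 3.10 (the py310 wheel and the /opt/py3/lib/python3.10 coverage path), so both patches hoist the expression into a local variable instead. A minimal, standalone illustration follows; the model name is made up for the example.

model_name = 'Qwen/Qwen3-8B'

# f'temp_{model_name.replace('/', '_')}.py' is a SyntaxError on Python 3.10:
# the second single quote terminates the literal.  Hoisting keeps it portable:
simple_model_name = model_name.replace('/', '_')
temp_config_file = f'temp_{simple_model_name}.py'
print(temp_config_file)  # temp_Qwen_Qwen3-8B.py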
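Note on the GITHUB_STEP_SUMMARY changes in this series: patches 18, 20 and 26 incrementally replace the hard-coded mmlu/gsm8k columns in write_to_summary with a dynamic Markdown table (sorted dataset columns, a matching dash separator row, and a Communicator column). The sketch below shows that table-building logic in isolation; it is a simplified illustration, not the repository's function. append_summary_row, the sample model name and the metric values are invented for the example.

import os


def append_summary_row(model_name, backend, communicator, tp_num, passed, metrics):
    """Append one evaluation row to the GitHub step summary, writing the header on first use."""
    status = 'PASS' if passed else 'FAIL'
    # Sort dataset names so every worker emits columns in the same order.
    names = sorted(metrics.keys())
    values = [str(metrics[name]) for name in names]

    header = f"| Model | Backend | Communicator | TP | Status | {' | '.join(names)} |\n"
    divider = '|-------|---------|--------------|----|--------|' + '-----|' * len(names) + '\n'
    row = (f"| {model_name} | {backend} | {communicator} | TP{tp_num} | {status} | "
           f"{' | '.join(values)} |\n")

    summary_file = os.environ.get('GITHUB_STEP_SUMMARY')
    if not summary_file:
        # Local runs have no step summary; just log the row.
        print(row.strip())
        return
    first_write = not os.path.exists(summary_file) or os.path.getsize(summary_file) == 0
    with open(summary_file, 'a') as f:
        if first_write:
            f.write('## Model Evaluation Results\n')
            f.write(header)
            f.write(divider)
        f.write(row)


append_summary_row('Qwen/Qwen3-8B', 'turbomind', 'nccl', 1, True, {'gsm8k': 87.2, 'mmlu': 71.5})

As in the patched code, the header row is only written by whichever worker appends first, so the sketch assumes every worker reports the same set of datasets; if the sets differ across models, the columns and values can drift out of alignment.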