From 759dca37eeb9480daeaafdfe337d37d778755b69 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Thu, 18 Sep 2025 16:15:21 +0800 Subject: [PATCH 01/28] TEST: add api evaluate --- .github/workflows/api_eva.yml | 137 +++++++++++++++ autotest/config.yaml | 22 +++ autotest/conftest.py | 10 +- autotest/evaluate/eval_config_base.py | 46 +++++ autotest/evaluate/eval_config_chat.py | 41 +++++ .../evaluate/test_api_evaluate_pytorch.py | 97 +++++++++++ .../evaluate/test_api_evaluate_turbomind.py | 97 +++++++++++ autotest/utils/config_utils.py | 87 +++++++++- autotest/utils/evaluate_utils.py | 163 ++++++++++++++++++ autotest/utils/run_restful_chat.py | 4 +- 10 files changed, 701 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/api_eva.yml create mode 100644 autotest/evaluate/eval_config_base.py create mode 100644 autotest/evaluate/eval_config_chat.py create mode 100644 autotest/evaluate/test_api_evaluate_pytorch.py create mode 100644 autotest/evaluate/test_api_evaluate_turbomind.py create mode 100644 autotest/utils/evaluate_utils.py diff --git a/.github/workflows/api_eva.yml b/.github/workflows/api_eva.yml new file mode 100644 index 0000000000..782158bea4 --- /dev/null +++ b/.github/workflows/api_eva.yml @@ -0,0 +1,137 @@ +name: api_eva + +on: + workflow_dispatch: + inputs: + repo_org: + required: false + description: 'Tested repository organization name. Default is InternLM/lmdeploy' + type: string + default: 'InternLM/lmdeploy' + repo_ref: + required: false + description: 'Set branch or tag or commit id. Default is "main"' + type: string + default: 'main' + backend: + required: true + description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"' + type: string + default: "['turbomind', 'pytorch']" + + +env: + HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache + HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai + OUTPUT_FOLDER: cuda11.8_dist_${{ github.run_id }} + ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true + REPORT_DIR: /nvme/qa_test_models/test-reports/${{ github.run_id }} + COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy + FAIL_CONFIG: '--lf' + TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ github.run_id }} + OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy + OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt + DEEPSEEK_VL: /nvme/qa_test_models/offline_pkg/DeepSeek-VL + +jobs: + linux-build: + if: ${{ !cancelled() }} + strategy: + matrix: + pyver: [py310] + runs-on: ubuntu-latest + env: + PYTHON_VERSION: ${{ matrix.pyver }} + PLAT_NAME: manylinux2014_x86_64 + DOCKER_TAG: cuda12.4 + OUTPUT_FOLDER: cuda12.4_dist_${{ github.run_id }} + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Build + run: | + echo ${PYTHON_VERSION} + echo ${PLAT_NAME} + echo ${DOCKER_TAG} + echo ${OUTPUT_FOLDER} + echo ${GITHUB_RUN_ID} + # remove -it + sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh + bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER} + - name: Upload Artifacts + uses: actions/upload-artifact@v4 + with: + if-no-files-found: error + path: builder/manywheel/${{ env.OUTPUT_FOLDER }} + retention-days: 1 + name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }} + + test_evaluation: + needs: linux-build + if: ${{ 
!cancelled() }} + runs-on: [self-hosted, test-140] + timeout-minutes: 2400 + strategy: + fail-fast: false + matrix: + backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} + container: + image: openmmlab/lmdeploy:latest-cu12.8 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/github-actions/resources:/root/resources + - /nvme/github-actions/opencompass-data:/root/opencompass-data + - /nvme/qa_test_models/evaluation-reports:/root/evaluation-reports + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /mnt/bigdisk:/mnt/bigdisk + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + - /mnt/187:/mnt/187 + steps: + - name: Create and change to _wk directory + run: | + echo "Working directory set to: $(pwd)" + - name: Clone repository + uses: actions/checkout@v2 + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Download Artifacts + uses: actions/download-artifact@v4 + with: + name: my-artifact-${{ github.run_id }}-py310 + - name: Install lmdeploy - dependency + run: | + python3 -m pip install -r requirements_cuda.txt + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Install opencompass + run: | + python3 -m pip install opencompass + - name: Check env + run: | + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + mkdir -p ${{ env.REPORT_DIR }}/.pytest_cache + ln -s ${{ env.REPORT_DIR }}/.pytest_cache autotest + - name: Setup paths for evaluation + if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') + run: | + overall_exit=0 + ln -s /mnt/187/opencompass-data/data ./data + pytest autotest/evaluate/test_api_evaluate_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + pytest autotest/evaluate/test_api_evaluate_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? 
+ exit $overall_exit + - name: Clear workspace + if: always() + run: | + export workdir=$(pwd) + rm -rf $workdir/* \ No newline at end of file diff --git a/autotest/config.yaml b/autotest/config.yaml index fab9a5af89..8dddb7ecdc 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -6,6 +6,7 @@ benchmark_path: /nvme/qa_test_models/benchmark-reports dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json env_tag: a100 + tp_config: Llama-4-Scout-17B-16E-Instruct: 4 Meta-Llama-3-1-70B-Instruct: 4 @@ -22,6 +23,7 @@ tp_config: Qwen3-32B: 2 Qwen3-30B-A3B: 2 Qwen3-30B-A3B-Base: 2 + Qwen2.5-32B-Instruct : 2 Qwen2.5-72B-Instruct: 4 Qwen2.5-VL-32B-Instruct: 2 DeepSeek-V2-Lite-Chat: 2 @@ -37,6 +39,7 @@ tp_config: gpt-oss-120b: 4 + turbomind_chat_model: - meta-llama/Llama-3.2-1B-Instruct - meta-llama/Llama-3.2-3B-Instruct @@ -72,6 +75,7 @@ turbomind_chat_model: - Qwen/Qwen2.5-7B-Instruct - Qwen/Qwen2.5-32B-Instruct - Qwen/Qwen2.5-72B-Instruct + - Qwen/Qwen2-7B-Instruct - Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4 - Qwen/Qwen2.5-VL-7B-Instruct - Qwen/Qwen2.5-VL-32B-Instruct @@ -134,6 +138,8 @@ pytorch_chat_model: - Qwen/Qwen2.5-7B-Instruct - Qwen/Qwen2.5-32B-Instruct - Qwen/Qwen2.5-72B-Instruct + - Qwen/Qwen2-7B-Instruct + - Qwen/Qwen1.5-7B-Chat - Qwen/Qwen1.5-MoE-A2.7B-Chat - Qwen/Qwen2.5-VL-7B-Instruct - Qwen/Qwen2.5-VL-32B-Instruct @@ -370,3 +376,19 @@ benchmark_model: - deepseek-ai/DeepSeek-V2-Lite-Chat - openai/gpt-oss-20b - openai/gpt-oss-120b + + +evaluate_model: + - Qwen/Qwen1.5-7B-Chat + - google/gemma-2-9b-it + - google/gemma-2-27b-it + - internlm/internlm2_5-7b-chat + - internlm/internlm3-8b-instruct + - meta-llama/Meta-Llama-3-8B-Instruct + - meta-llama/Meta-Llama-3-1-8B-Instruct + - Qwen/Qwen2-7B-Instruct + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2.5-32B-Instruct + - meta-llama/Llama-2-7b-chat-hf + - Qwen/Qwen1.5-MoE-A2.7B-Chat + - internlm/internlm2_5-20b-chat diff --git a/autotest/conftest.py b/autotest/conftest.py index dee954d2cb..36392ac1c1 100644 --- a/autotest/conftest.py +++ b/autotest/conftest.py @@ -1,3 +1,4 @@ +import copy import os import pytest @@ -23,7 +24,14 @@ def config(): with open(config_path) as f: env_config = yaml.load(f.read(), Loader=yaml.SafeLoader) - return env_config + + config_copy = copy.deepcopy(env_config) + github_run_id = os.environ.get('GITHUB_RUN_ID', 'local_run') + if 'log_path' in config_copy: + config_copy['log_path'] = os.path.join(config_copy['log_path'], str(github_run_id)) + os.makedirs(config_copy['log_path'], exist_ok=True) + + return config_copy @pytest.fixture(scope='session') diff --git a/autotest/evaluate/eval_config_base.py b/autotest/evaluate/eval_config_base.py new file mode 100644 index 0000000000..80a68fb274 --- /dev/null +++ b/autotest/evaluate/eval_config_base.py @@ -0,0 +1,46 @@ +from mmengine.config import read_base +from opencompass.models import OpenAISDK + +with read_base(): + from opencompass.configs.datasets.gpqa.gpqa_few_shot_ppl_4b5a83 import gpqa_datasets # noqa: F401, E501 + from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import gsm8k_datasets # noqa: F401, E501 + from opencompass.configs.datasets.race.race_few_shot_ppl import race_datasets # noqa: F401, E501 + from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \ + winogrande_datasets # noqa: F401, E501 + +race_datasets = [race_datasets[1]] +datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], []) + +MODEL_NAME = 'internlm2_5-1_8b' +MODEL_PATH = 
'/nvme/qa_test_models/internlm/internlm2_5-1_8b' +API_BASE = 'http://127.0.0.1:23333/v1' + +api_meta_template = dict(round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), +]) + +models = [ + dict( + type=OpenAISDK, + abbr=f'{MODEL_NAME}-lmdeploy-api', + openai_api_base=API_BASE, + key='EMPTY', + path=MODEL_PATH, + meta_template=api_meta_template, + max_out_len=2048, + batch_size=16, + run_cfg=dict(num_gpus=1, communicator='native'), + temperature=0.1, + ) +] + +summarizer = dict( + dataset_abbrs=[ + ['gsm8k', 'accuracy'], + ['GPQA_diamond', 'accuracy'], + ['race-high', 'accuracy'], + ['winogrande', 'accuracy'], + ], + summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []), +) diff --git a/autotest/evaluate/eval_config_chat.py b/autotest/evaluate/eval_config_chat.py new file mode 100644 index 0000000000..ac7d1ed54c --- /dev/null +++ b/autotest/evaluate/eval_config_chat.py @@ -0,0 +1,41 @@ +from mmengine.config import read_base +from opencompass.models import OpenAISDK + +with read_base(): + from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets # noqa: F401, E501 + from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import mmlu_datasets # noqa: F401, E501 + from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups # noqa: F401, E501 + +datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], []) + + +MODEL_NAME = 'Qwen2-7B-Instruct' +MODEL_PATH = '/nvme/qa_test_models/Qwen/Qwen2-7B-Instruct' +API_BASE = 'http://127.0.0.1:65525/v1' + + +api_meta_template = dict(round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), +]) + +models = [ + dict( + type=OpenAISDK, + abbr=f'{MODEL_NAME}-lmdeploy-api', + openai_api_base=API_BASE, + key='EMPTY', + path=MODEL_PATH, + meta_template=api_meta_template, + max_out_len=2048, + temperature=0.1, + ) +] + +summarizer = dict( + dataset_abbrs=[ + ['mmlu', 'naive_average'], + ['gsm8k', 'accuracy'], + ], + summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []), +) diff --git a/autotest/evaluate/test_api_evaluate_pytorch.py b/autotest/evaluate/test_api_evaluate_pytorch.py new file mode 100644 index 0000000000..7be55b505d --- /dev/null +++ b/autotest/evaluate/test_api_evaluate_pytorch.py @@ -0,0 +1,97 @@ +import pytest +from utils.config_utils import get_evaluate_pytorch_model_list, get_workerid +from utils.evaluate_utils import restful_test +from utils.run_restful_chat import start_restful_api, stop_restful_api + +DEFAULT_PORT = 23333 + + +@pytest.fixture(scope='function', autouse=True) +def prepare_environment(request, config, worker_id): + param = request.param + model = param['model'] + backend = param['backend'] + print(param['model'], param['backend'], param['extra']) + model_path = config.get('model_path') + '/' + model + pid, startRes = start_restful_api(config, param, model, model_path, backend, worker_id) + yield param + stop_restful_api(pid, startRes, param) + + +def getModelList(tp_num): + model_list = get_evaluate_pytorch_model_list(tp_num, kvint_list=[4, 8]) + new_model_list = [] + for model in model_list: + if model['backend'] == 'pytorch': + model['extra'] += '--cache-max-entry-count 0.8' + elif 'Llama-2' in model['model']: + model['extra'] += '--cache-max-entry-count 0.95' + elif 'internlm2' in model['model']: + model['extra'] += '--cache-max-entry-count 0.9' + model['cuda_prefix'] = None + new_model_list.append(model) + return 
new_model_list + + +@pytest.mark.gpu_num_1 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('prepare_environment', getModelList(tp_num=1), indirect=True) +def test_restful_tp1(config, run_id, prepare_environment, worker_id): + if get_workerid(worker_id) is None: + result, msg = restful_test(config, run_id, prepare_environment, worker_id=worker_id) + else: + result, msg = restful_test(config, + run_id, + prepare_environment, + worker_id=worker_id, + port=DEFAULT_PORT + get_workerid(worker_id)) + + assert result, msg + + +@pytest.mark.gpu_num_2 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('prepare_environment', getModelList(tp_num=2), indirect=True) +def test_restful_tp2(config, run_id, prepare_environment, worker_id): + if get_workerid(worker_id) is None: + result, msg = restful_test(config, run_id, prepare_environment, worker_id=worker_id) + else: + result, msg = restful_test(config, + run_id, + prepare_environment, + worker_id=worker_id, + port=DEFAULT_PORT + get_workerid(worker_id)) + + assert result, msg + + +@pytest.mark.gpu_num_4 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('prepare_environment', getModelList(tp_num=4), indirect=True) +def test_restful_tp4(config, run_id, prepare_environment, worker_id): + if get_workerid(worker_id) is None: + result, msg = restful_test(config, run_id, prepare_environment, worker_id=worker_id) + else: + result, msg = restful_test(config, + run_id, + prepare_environment, + worker_id=worker_id, + port=DEFAULT_PORT + get_workerid(worker_id)) + + assert result, msg + + +@pytest.mark.gpu_num_8 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('prepare_environment', getModelList(tp_num=8), indirect=True) +def test_restful_tp8(config, run_id, prepare_environment, worker_id): + if get_workerid(worker_id) is None: + result, msg = restful_test(config, run_id, prepare_environment, worker_id=worker_id) + else: + result, msg = restful_test(config, + run_id, + prepare_environment, + worker_id=worker_id, + port=DEFAULT_PORT + get_workerid(worker_id)) + + assert result, msg diff --git a/autotest/evaluate/test_api_evaluate_turbomind.py b/autotest/evaluate/test_api_evaluate_turbomind.py new file mode 100644 index 0000000000..943d7eb78c --- /dev/null +++ b/autotest/evaluate/test_api_evaluate_turbomind.py @@ -0,0 +1,97 @@ +import pytest +from utils.config_utils import get_evaluate_turbomind_model_list, get_workerid +from utils.evaluate_utils import restful_test +from utils.run_restful_chat import start_restful_api, stop_restful_api + +DEFAULT_PORT = 23333 + + +@pytest.fixture(scope='function', autouse=True) +def prepare_environment(request, config, worker_id): + param = request.param + model = param['model'] + backend = param['backend'] + print(param['model'], param['backend'], param['extra']) + model_path = config.get('model_path') + '/' + model + pid, startRes = start_restful_api(config, param, model, model_path, backend, worker_id) + yield param + stop_restful_api(pid, startRes, param) + + +def getModelList(tp_num): + model_list = get_evaluate_turbomind_model_list(tp_num, kvint_list=[4, 8]) + new_model_list = [] + for model in model_list: + if model['backend'] == 'pytorch': + model['extra'] += '--cache-max-entry-count 0.8' + elif 'Llama-2' in model['model']: + model['extra'] += '--cache-max-entry-count 0.95' + elif 'internlm2' in model['model']: + model['extra'] += '--cache-max-entry-count 0.9' + model['cuda_prefix'] = None + new_model_list.append(model) + return new_model_list + + +@pytest.mark.gpu_num_1 
+@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('prepare_environment', getModelList(tp_num=1), indirect=True) +def test_restful_tp1(config, run_id, prepare_environment, worker_id): + if get_workerid(worker_id) is None: + result, msg = restful_test(config, run_id, prepare_environment, worker_id=worker_id) + else: + result, msg = restful_test(config, + run_id, + prepare_environment, + worker_id=worker_id, + port=DEFAULT_PORT + get_workerid(worker_id)) + + assert result, msg + + +@pytest.mark.gpu_num_2 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('prepare_environment', getModelList(tp_num=2), indirect=True) +def test_restful_tp2(config, run_id, prepare_environment, worker_id): + if get_workerid(worker_id) is None: + result, msg = restful_test(config, run_id, prepare_environment, worker_id=worker_id) + else: + result, msg = restful_test(config, + run_id, + prepare_environment, + worker_id=worker_id, + port=DEFAULT_PORT + get_workerid(worker_id)) + + assert result, msg + + +@pytest.mark.gpu_num_4 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('prepare_environment', getModelList(tp_num=4), indirect=True) +def test_restful_tp4(config, run_id, prepare_environment, worker_id): + if get_workerid(worker_id) is None: + result, msg = restful_test(config, run_id, prepare_environment, worker_id=worker_id) + else: + result, msg = restful_test(config, + run_id, + prepare_environment, + worker_id=worker_id, + port=DEFAULT_PORT + get_workerid(worker_id)) + + assert result, msg + + +@pytest.mark.gpu_num_8 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('prepare_environment', getModelList(tp_num=8), indirect=True) +def test_restful_tp8(config, run_id, prepare_environment, worker_id): + if get_workerid(worker_id) is None: + result, msg = restful_test(config, run_id, prepare_environment, worker_id=worker_id) + else: + result, msg = restful_test(config, + run_id, + prepare_environment, + worker_id=worker_id, + port=DEFAULT_PORT + get_workerid(worker_id)) + + assert result, msg diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index 0df8858b2c..9a403655ec 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -176,7 +176,14 @@ def get_config(): with open(config_path) as f: config = yaml.load(f.read(), Loader=yaml.SafeLoader) - return config + + config_copy = copy.deepcopy(config) + github_run_id = os.environ.get('GITHUB_RUN_ID', 'local_run') + if 'log_path' in config_copy: + config_copy['log_path'] = os.path.join(config_copy['log_path'], str(github_run_id)) + os.makedirs(config_copy['log_path'], exist_ok=True) + + return config_copy def get_benchmark_model_list(tp_num, is_longtext: bool = False, kvint_list: list = []): @@ -227,6 +234,84 @@ def get_benchmark_model_list(tp_num, is_longtext: bool = False, kvint_list: list return result +def get_evaluate_turbomind_model_list(tp_num, is_longtext: bool = False, kvint_list: list = []): + config = get_config() + + if is_longtext: + case_list_base = [item for item in config.get('longtext_model', [])] + else: + case_list_base = config.get('evaluate_model', config.get('benchmark_model', [])) + quatization_case_config = config.get('turbomind_quatization') + + case_list = copy.deepcopy(case_list_base) + for key in case_list_base: + if key in config.get('turbomind_chat_model') and key not in quatization_case_config.get( + 'no_awq') and not is_quantization_model(key): + case_list.append(key + '-inner-4bits') + + model_list = [item for item in case_list if get_tp_num(config, item) == 
tp_num] + + result = [] + if len(model_list) > 0: + + communicators = ['native', 'nccl'] + for communicator in communicators: + for item in model_list: + if item.replace('-inner-4bits', '') in config.get('turbomind_chat_model') or item.replace( + '-inner-4bits', '') in config.get('turbomind_base_model'): + model_config = { + 'model': item, + 'backend': 'turbomind', + 'communicator': communicator, + 'quant_policy': 0, + 'tp_num': tp_num, + 'extra': f'--communicator {communicator} ' + } + result.append(model_config) + + for kvint in kvint_list: + for item in model_list: + if item.replace('-inner-4bits', '') in config.get('turbomind_chat_model') and item.replace( + '-inner-4bits', '') not in quatization_case_config.get('no_kvint' + str(kvint)): + model_config = { + 'model': item, + 'backend': 'turbomind', + 'quant_policy': kvint, + 'tp_num': tp_num, + 'extra': '' + } + result.append(model_config) + return result + + +def get_evaluate_pytorch_model_list(tp_num, is_longtext: bool = False, kvint_list: list = []): + config = get_config() + + if is_longtext: + case_list_base = [item for item in config.get('longtext_model', [])] + else: + case_list_base = config.get('evaluate_model', config.get('benchmark_model', [])) + pytorch_quatization_case_config = config.get('pytorch_quatization') + + case_list = copy.deepcopy(case_list_base) + + for key in case_list_base: + if key in config.get('pytorch_chat_model') and key in pytorch_quatization_case_config.get( + 'w8a8') and not is_quantization_model(key): + case_list.append(key + '-inner-w8a8') + + model_list = [item for item in case_list if get_tp_num(config, item) == tp_num] + + result = [] + if len(model_list) > 0: + for item in model_list: + if '4bits' not in item and (item.replace('-inner-w8a8', '') in config.get('pytorch_chat_model') + or item.replace('-inner-w8a8', '') in config.get('pytorch_base_model')): + model_config = {'model': item, 'backend': 'pytorch', 'tp_num': tp_num, 'extra': ''} + result.append(model_config) + return result + + def get_workerid(worker_id): if worker_id is None or 'gw' not in worker_id: return None diff --git a/autotest/utils/evaluate_utils.py b/autotest/utils/evaluate_utils.py new file mode 100644 index 0000000000..edc352b94f --- /dev/null +++ b/autotest/utils/evaluate_utils.py @@ -0,0 +1,163 @@ +import os +import subprocess + +from mmengine.config import Config + +DEFAULT_PORT = 23333 + + +def get_model_type(model_name): + model_name_lower = model_name.lower() + + chat_patterns = [ + 'chat', + 'instruct', + 'gemma', + 'llama3', + 'llama2', + 'llama', + ] + if any(pattern in model_name_lower for pattern in chat_patterns): + return 'chat' + else: + return 'base' + + +def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFAULT_PORT): + try: + model_name = prepare_environment['model'] + backend_type = prepare_environment['backend'] + tp_num = prepare_environment.get('tp_num', 1) + communicator = prepare_environment.get('communicator', 'native') + quant_policy = prepare_environment.get('quant_policy', 0) + + model_type = get_model_type(model_name) + print(f'Model {model_name} identified as {model_type} model') + + current_dir = os.path.dirname(os.path.abspath(__file__)) + parent_dir = os.path.dirname(current_dir) + + if model_type == 'base': + config_file = os.path.join(parent_dir, 'evaluate/eval_config_base.py') + else: + config_file = os.path.join(parent_dir, 'evaluate/eval_config_chat.py') + + model_base_path = config.get('model_path', '/nvme/qa_test_models') + model_path = 
os.path.join(model_base_path, model_name) + + print(f'Starting OpenCompass evaluation for model: {model_name}') + print(f'Model path: {model_path}') + print(f'Backend: {backend_type}') + print(f'Model type: {model_type}') + print(f'Config file: {config_file}') + + log_path = config.get('log_path', '/nvme/qa_test_models/autotest_model/log') + os.makedirs(log_path, exist_ok=True) + + original_cwd = os.getcwd() + work_dir = os.path.join( + log_path, + f"wk_{backend_type}_{model_name.replace('/', '_')}_{model_type}_{communicator}_{worker_id}_{quant_policy}") + os.makedirs(work_dir, exist_ok=True) + + try: + + if not os.path.exists(config_file): + return False, f'Config file {config_file} not found in any expected location' + + cfg = Config.fromfile(config_file) + + cfg.MODEL_NAME = model_name + cfg.MODEL_PATH = model_path + cfg.API_BASE = f'http://127.0.0.1:{port}/v1' + + if cfg.models and len(cfg.models) > 0: + model_cfg = cfg.models[0] + model_cfg['abbr'] = f'{model_name}-lmdeploy-api' + model_cfg['openai_api_base'] = f'http://127.0.0.1:{port}/v1' + model_cfg['path'] = model_path + if 'backend' in model_cfg: + model_cfg['backend'] = backend_type + + if 'engine_config' in model_cfg and 'communicator' in model_cfg['engine_config']: + model_cfg['engine_config']['communicator'] = communicator + + temp_config_file = f'temp_{model_name.replace("/", "_")}_{os.getpid()}.py' + temp_config_path = os.path.join(log_path, temp_config_file) + + cfg.dump(temp_config_path) + print(f'Modified config saved to: {temp_config_path}') + + cmd = ['opencompass', temp_config_path, '--reuse', '--max-num-workers', '16', '-w', work_dir] + print(f"Running command: {' '.join(cmd)}") + print(f'Work directory: {work_dir}') + + result = subprocess.run(cmd, capture_output=True, text=True, timeout=72000) + + stdout_output = result.stdout + stderr_output = result.stderr + + log_filename = (f'eval_{backend_type}_' + f"{model_name.replace('/', '_')}_" + f'{model_type}_' + f'{communicator}_' + f'{worker_id}_' + f'{quant_policy}.log') + log_file = os.path.join(log_path, log_filename) + + with open(log_file, 'w', encoding='utf-8') as f: + f.write(f'Model: {model_name}\n') + f.write(f'Model type: {model_type}\n') + f.write(f'Config file: {temp_config_file}\n') + f.write(f'Backend: {backend_type}\n') + f.write(f'TP Num: {tp_num}\n') + f.write(f'Command: {" ".join(cmd)}\n') + f.write(f'Work directory: {work_dir}\n') + f.write(f'STDOUT:\n{stdout_output}\n') + if stderr_output: + f.write(f'STDERR:\n{stderr_output}\n') + f.write(f'Return code: {result.returncode}\n') + + print(f'STDOUT:\n{stdout_output}') + if stderr_output: + print(f'STDERR:\n{stderr_output}') + print(f'Return code: {result.returncode}') + + evaluation_failed = False + error_keywords = ['ERROR -', 'fail, see', 'task .* fail'] + for line in stdout_output.split('\n'): + if any(keyword in line for keyword in error_keywords): + evaluation_failed = True + break + + if result.returncode == 0 and not evaluation_failed: + return True, f'Evaluation completed successfully for {model_name} ({model_type})' + else: + error_msg = f'Evaluation failed for {model_name} ({model_type}) ' + if result.returncode != 0: + error_msg += f'with return code {result.returncode}' + elif evaluation_failed: + error_msg += 'with internal errors detected in logs' + + if stderr_output: + error_msg += f'\nSTDERR: {stderr_output}' + else: + error_lines = [] + for line in stdout_output.split('\n'): + if any(keyword in line for keyword in error_keywords): + error_lines.append(line) + if error_lines: + 
error_msg += f'\nLog errors: {" | ".join(error_lines[:3])}' + + return False, error_msg + + finally: + os.chdir(original_cwd) + print(f'Returned to directory: {original_cwd}') + + except subprocess.TimeoutExpired: + timeout_msg = (f'Evaluation timed out for {model_name} ' + f'after 7200 seconds') + return False, timeout_msg + except Exception as e: + return False, f'Error during evaluation for {model_name}: {str(e)}' \ No newline at end of file diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index 5aca937681..cc67c559cc 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -1,3 +1,4 @@ +import datetime import json import os import subprocess @@ -84,7 +85,8 @@ def start_restful_api(config, param, model, model_path, backend_type, worker_id) if str(config.get('env_tag')) == '3090' or str(config.get('env_tag')) == '5080': cmd += ' --cache-max-entry-count 0.5' - start_log = os.path.join(log_path, 'start_restful_' + model.split('/')[1] + worker_id + '.log') + timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + start_log = os.path.join(log_path, 'start_restful_' + model.split('/')[1] + worker_id + '_' + timestamp + '.log') print('reproduce command restful: ' + cmd) From a955b7da069b3edbafdce83bc9f4af8eaffcdfb5 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Thu, 18 Sep 2025 17:02:14 +0800 Subject: [PATCH 02/28] TEST: rm qwen1.5_7b test --- .github/workflows/api_eva.yml | 2 +- autotest/config.yaml | 2 -- autotest/evaluate/eval_config_base.py | 2 +- autotest/evaluate/eval_config_chat.py | 4 +--- autotest/evaluate/test_api_evaluate_pytorch.py | 1 - autotest/evaluate/test_api_evaluate_turbomind.py | 1 - autotest/utils/evaluate_utils.py | 2 +- 7 files changed, 4 insertions(+), 10 deletions(-) diff --git a/.github/workflows/api_eva.yml b/.github/workflows/api_eva.yml index 782158bea4..91e596e648 100644 --- a/.github/workflows/api_eva.yml +++ b/.github/workflows/api_eva.yml @@ -134,4 +134,4 @@ jobs: if: always() run: | export workdir=$(pwd) - rm -rf $workdir/* \ No newline at end of file + rm -rf $workdir/* diff --git a/autotest/config.yaml b/autotest/config.yaml index 8dddb7ecdc..637da0c6e3 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -139,7 +139,6 @@ pytorch_chat_model: - Qwen/Qwen2.5-32B-Instruct - Qwen/Qwen2.5-72B-Instruct - Qwen/Qwen2-7B-Instruct - - Qwen/Qwen1.5-7B-Chat - Qwen/Qwen1.5-MoE-A2.7B-Chat - Qwen/Qwen2.5-VL-7B-Instruct - Qwen/Qwen2.5-VL-32B-Instruct @@ -379,7 +378,6 @@ benchmark_model: evaluate_model: - - Qwen/Qwen1.5-7B-Chat - google/gemma-2-9b-it - google/gemma-2-27b-it - internlm/internlm2_5-7b-chat diff --git a/autotest/evaluate/eval_config_base.py b/autotest/evaluate/eval_config_base.py index 80a68fb274..6a193b0402 100644 --- a/autotest/evaluate/eval_config_base.py +++ b/autotest/evaluate/eval_config_base.py @@ -25,7 +25,7 @@ type=OpenAISDK, abbr=f'{MODEL_NAME}-lmdeploy-api', openai_api_base=API_BASE, - key='EMPTY', + key='EMPTY', path=MODEL_PATH, meta_template=api_meta_template, max_out_len=2048, diff --git a/autotest/evaluate/eval_config_chat.py b/autotest/evaluate/eval_config_chat.py index ac7d1ed54c..c24cbb66b5 100644 --- a/autotest/evaluate/eval_config_chat.py +++ b/autotest/evaluate/eval_config_chat.py @@ -8,12 +8,10 @@ datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], []) - MODEL_NAME = 'Qwen2-7B-Instruct' MODEL_PATH = '/nvme/qa_test_models/Qwen/Qwen2-7B-Instruct' API_BASE = 'http://127.0.0.1:65525/v1' - api_meta_template = 
dict(round=[ dict(role='HUMAN', api_role='HUMAN'), dict(role='BOT', api_role='BOT', generate=True), @@ -24,7 +22,7 @@ type=OpenAISDK, abbr=f'{MODEL_NAME}-lmdeploy-api', openai_api_base=API_BASE, - key='EMPTY', + key='EMPTY', path=MODEL_PATH, meta_template=api_meta_template, max_out_len=2048, diff --git a/autotest/evaluate/test_api_evaluate_pytorch.py b/autotest/evaluate/test_api_evaluate_pytorch.py index 7be55b505d..d5c6f99447 100644 --- a/autotest/evaluate/test_api_evaluate_pytorch.py +++ b/autotest/evaluate/test_api_evaluate_pytorch.py @@ -11,7 +11,6 @@ def prepare_environment(request, config, worker_id): param = request.param model = param['model'] backend = param['backend'] - print(param['model'], param['backend'], param['extra']) model_path = config.get('model_path') + '/' + model pid, startRes = start_restful_api(config, param, model, model_path, backend, worker_id) yield param diff --git a/autotest/evaluate/test_api_evaluate_turbomind.py b/autotest/evaluate/test_api_evaluate_turbomind.py index 943d7eb78c..70c6809ca4 100644 --- a/autotest/evaluate/test_api_evaluate_turbomind.py +++ b/autotest/evaluate/test_api_evaluate_turbomind.py @@ -11,7 +11,6 @@ def prepare_environment(request, config, worker_id): param = request.param model = param['model'] backend = param['backend'] - print(param['model'], param['backend'], param['extra']) model_path = config.get('model_path') + '/' + model pid, startRes = start_restful_api(config, param, model, model_path, backend, worker_id) yield param diff --git a/autotest/utils/evaluate_utils.py b/autotest/utils/evaluate_utils.py index edc352b94f..45d1225cc7 100644 --- a/autotest/utils/evaluate_utils.py +++ b/autotest/utils/evaluate_utils.py @@ -160,4 +160,4 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA f'after 7200 seconds') return False, timeout_msg except Exception as e: - return False, f'Error during evaluation for {model_name}: {str(e)}' \ No newline at end of file + return False, f'Error during evaluation for {model_name}: {str(e)}' From aa8a0bd656db565adba0e3f60d5d63f7e2cfdb8f Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Thu, 18 Sep 2025 17:49:43 +0800 Subject: [PATCH 03/28] TEST: add evaluate result to github --- autotest/config.yaml | 3 - autotest/evaluate/eval_config_base.py | 46 -------- autotest/evaluate/eval_config_chat.py | 6 +- .../evaluate/test_api_evaluate_pytorch.py | 6 - .../evaluate/test_api_evaluate_turbomind.py | 6 - autotest/utils/config_utils.py | 5 +- autotest/utils/evaluate_utils.py | 105 ++++++++++++------ 7 files changed, 79 insertions(+), 98 deletions(-) delete mode 100644 autotest/evaluate/eval_config_base.py diff --git a/autotest/config.yaml b/autotest/config.yaml index 637da0c6e3..379fd666bb 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -75,7 +75,6 @@ turbomind_chat_model: - Qwen/Qwen2.5-7B-Instruct - Qwen/Qwen2.5-32B-Instruct - Qwen/Qwen2.5-72B-Instruct - - Qwen/Qwen2-7B-Instruct - Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4 - Qwen/Qwen2.5-VL-7B-Instruct - Qwen/Qwen2.5-VL-32B-Instruct @@ -138,7 +137,6 @@ pytorch_chat_model: - Qwen/Qwen2.5-7B-Instruct - Qwen/Qwen2.5-32B-Instruct - Qwen/Qwen2.5-72B-Instruct - - Qwen/Qwen2-7B-Instruct - Qwen/Qwen1.5-MoE-A2.7B-Chat - Qwen/Qwen2.5-VL-7B-Instruct - Qwen/Qwen2.5-VL-32B-Instruct @@ -384,7 +382,6 @@ evaluate_model: - internlm/internlm3-8b-instruct - meta-llama/Meta-Llama-3-8B-Instruct - meta-llama/Meta-Llama-3-1-8B-Instruct - - Qwen/Qwen2-7B-Instruct - Qwen/Qwen2.5-7B-Instruct - Qwen/Qwen2.5-32B-Instruct - 
meta-llama/Llama-2-7b-chat-hf diff --git a/autotest/evaluate/eval_config_base.py b/autotest/evaluate/eval_config_base.py deleted file mode 100644 index 6a193b0402..0000000000 --- a/autotest/evaluate/eval_config_base.py +++ /dev/null @@ -1,46 +0,0 @@ -from mmengine.config import read_base -from opencompass.models import OpenAISDK - -with read_base(): - from opencompass.configs.datasets.gpqa.gpqa_few_shot_ppl_4b5a83 import gpqa_datasets # noqa: F401, E501 - from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import gsm8k_datasets # noqa: F401, E501 - from opencompass.configs.datasets.race.race_few_shot_ppl import race_datasets # noqa: F401, E501 - from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \ - winogrande_datasets # noqa: F401, E501 - -race_datasets = [race_datasets[1]] -datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], []) - -MODEL_NAME = 'internlm2_5-1_8b' -MODEL_PATH = '/nvme/qa_test_models/internlm/internlm2_5-1_8b' -API_BASE = 'http://127.0.0.1:23333/v1' - -api_meta_template = dict(round=[ - dict(role='HUMAN', api_role='HUMAN'), - dict(role='BOT', api_role='BOT', generate=True), -]) - -models = [ - dict( - type=OpenAISDK, - abbr=f'{MODEL_NAME}-lmdeploy-api', - openai_api_base=API_BASE, - key='EMPTY', - path=MODEL_PATH, - meta_template=api_meta_template, - max_out_len=2048, - batch_size=16, - run_cfg=dict(num_gpus=1, communicator='native'), - temperature=0.1, - ) -] - -summarizer = dict( - dataset_abbrs=[ - ['gsm8k', 'accuracy'], - ['GPQA_diamond', 'accuracy'], - ['race-high', 'accuracy'], - ['winogrande', 'accuracy'], - ], - summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []), -) diff --git a/autotest/evaluate/eval_config_chat.py b/autotest/evaluate/eval_config_chat.py index c24cbb66b5..8d55ec232e 100644 --- a/autotest/evaluate/eval_config_chat.py +++ b/autotest/evaluate/eval_config_chat.py @@ -8,9 +8,9 @@ datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], []) -MODEL_NAME = 'Qwen2-7B-Instruct' -MODEL_PATH = '/nvme/qa_test_models/Qwen/Qwen2-7B-Instruct' -API_BASE = 'http://127.0.0.1:65525/v1' +MODEL_NAME = '' +MODEL_PATH = '' +API_BASE = '' api_meta_template = dict(round=[ dict(role='HUMAN', api_role='HUMAN'), diff --git a/autotest/evaluate/test_api_evaluate_pytorch.py b/autotest/evaluate/test_api_evaluate_pytorch.py index d5c6f99447..79d77bade0 100644 --- a/autotest/evaluate/test_api_evaluate_pytorch.py +++ b/autotest/evaluate/test_api_evaluate_pytorch.py @@ -21,12 +21,6 @@ def getModelList(tp_num): model_list = get_evaluate_pytorch_model_list(tp_num, kvint_list=[4, 8]) new_model_list = [] for model in model_list: - if model['backend'] == 'pytorch': - model['extra'] += '--cache-max-entry-count 0.8' - elif 'Llama-2' in model['model']: - model['extra'] += '--cache-max-entry-count 0.95' - elif 'internlm2' in model['model']: - model['extra'] += '--cache-max-entry-count 0.9' model['cuda_prefix'] = None new_model_list.append(model) return new_model_list diff --git a/autotest/evaluate/test_api_evaluate_turbomind.py b/autotest/evaluate/test_api_evaluate_turbomind.py index 70c6809ca4..38a838ff8b 100644 --- a/autotest/evaluate/test_api_evaluate_turbomind.py +++ b/autotest/evaluate/test_api_evaluate_turbomind.py @@ -21,12 +21,6 @@ def getModelList(tp_num): model_list = get_evaluate_turbomind_model_list(tp_num, kvint_list=[4, 8]) new_model_list = [] for model in model_list: - if model['backend'] == 'pytorch': - model['extra'] += '--cache-max-entry-count 0.8' - elif 
'Llama-2' in model['model']: - model['extra'] += '--cache-max-entry-count 0.95' - elif 'internlm2' in model['model']: - model['extra'] += '--cache-max-entry-count 0.9' model['cuda_prefix'] = None new_model_list.append(model) return new_model_list diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index 9a403655ec..fae0dbb6b9 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -254,7 +254,10 @@ def get_evaluate_turbomind_model_list(tp_num, is_longtext: bool = False, kvint_l result = [] if len(model_list) > 0: - communicators = ['native', 'nccl'] + if tp_num > 1: + communicators = ['native', 'nccl'] + else: + communicators = ['native'] for communicator in communicators: for item in model_list: if item.replace('-inner-4bits', '') in config.get('turbomind_chat_model') or item.replace( diff --git a/autotest/utils/evaluate_utils.py b/autotest/utils/evaluate_utils.py index 45d1225cc7..9ecc0dc724 100644 --- a/autotest/utils/evaluate_utils.py +++ b/autotest/utils/evaluate_utils.py @@ -1,3 +1,5 @@ +import csv +import glob import os import subprocess @@ -6,24 +8,62 @@ DEFAULT_PORT = 23333 -def get_model_type(model_name): - model_name_lower = model_name.lower() +def write_to_summary(model_name, tp_num, result, msg, worker_id, work_dir=None): + status = '✅ PASS' if result else '❌ FAIL' - chat_patterns = [ - 'chat', - 'instruct', - 'gemma', - 'llama3', - 'llama2', - 'llama', - ] - if any(pattern in model_name_lower for pattern in chat_patterns): - return 'chat' + metrics = {} + + if work_dir and os.path.exists(work_dir): + try: + summary_dirs = glob.glob(os.path.join(work_dir, '*', 'summary')) + if summary_dirs: + summary_dir = summary_dirs[0] + csv_files = glob.glob(os.path.join(summary_dir, 'summary_*.csv')) + if csv_files: + csv_file = sorted(csv_files)[-1] + if os.path.exists(csv_file): + with open(csv_file, 'r') as f: + reader = csv.reader(f) + next(reader) + for row in reader: + if len(row) >= 5 and row[4]: + dataset = row[0] + metric_value = row[4] + try: + metrics[dataset] = f'{float(metric_value):.2f}' + except ValueError: + metrics[dataset] = metric_value + except Exception as e: + print(f'Error reading metrics: {str(e)}') + + mmlu_value = metrics.get('mmlu', '') + gsm8k_value = metrics.get('gsm8k', '') + + summary_line = f'| {model_name} | TP{tp_num} | {status} | {mmlu_value} | {gsm8k_value} |\n' + + summary_file = os.environ.get('GITHUB_STEP_SUMMARY', None) + if summary_file: + write_header = False + if not os.path.exists(summary_file) or os.path.getsize(summary_file) == 0: + write_header = True + else: + with open(summary_file, 'r') as f: + first_lines = f.read(200) + if '| Model | TP | Status | mmlu | gsm8k |' not in first_lines: + write_header = True + + with open(summary_file, 'a') as f: + if write_header: + f.write('## Model Evaluation Results\n') + f.write('| Model | TP | Status | mmlu | gsm8k |\n') + f.write('|-------|----|--------|------|-------|\n') + f.write(summary_line) else: - return 'base' + print(f'Summary: {model_name} | TP{tp_num} | {status} | {mmlu_value} | {gsm8k_value}') def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFAULT_PORT): + work_dir = None try: model_name = prepare_environment['model'] backend_type = prepare_environment['backend'] @@ -31,16 +71,10 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA communicator = prepare_environment.get('communicator', 'native') quant_policy = prepare_environment.get('quant_policy', 0) - model_type = 
get_model_type(model_name) - print(f'Model {model_name} identified as {model_type} model') - current_dir = os.path.dirname(os.path.abspath(__file__)) parent_dir = os.path.dirname(current_dir) - if model_type == 'base': - config_file = os.path.join(parent_dir, 'evaluate/eval_config_base.py') - else: - config_file = os.path.join(parent_dir, 'evaluate/eval_config_chat.py') + config_file = os.path.join(parent_dir, 'evaluate/eval_config_chat.py') model_base_path = config.get('model_path', '/nvme/qa_test_models') model_path = os.path.join(model_base_path, model_name) @@ -48,7 +82,6 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA print(f'Starting OpenCompass evaluation for model: {model_name}') print(f'Model path: {model_path}') print(f'Backend: {backend_type}') - print(f'Model type: {model_type}') print(f'Config file: {config_file}') log_path = config.get('log_path', '/nvme/qa_test_models/autotest_model/log') @@ -56,8 +89,7 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA original_cwd = os.getcwd() work_dir = os.path.join( - log_path, - f"wk_{backend_type}_{model_name.replace('/', '_')}_{model_type}_{communicator}_{worker_id}_{quant_policy}") + log_path, f"wk_{backend_type}_{model_name.replace('/', '_')}_{communicator}_{worker_id}_{quant_policy}") os.makedirs(work_dir, exist_ok=True) try: @@ -99,7 +131,6 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA log_filename = (f'eval_{backend_type}_' f"{model_name.replace('/', '_')}_" - f'{model_type}_' f'{communicator}_' f'{worker_id}_' f'{quant_policy}.log') @@ -107,7 +138,6 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA with open(log_file, 'w', encoding='utf-8') as f: f.write(f'Model: {model_name}\n') - f.write(f'Model type: {model_type}\n') f.write(f'Config file: {temp_config_file}\n') f.write(f'Backend: {backend_type}\n') f.write(f'TP Num: {tp_num}\n') @@ -131,25 +161,29 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA break if result.returncode == 0 and not evaluation_failed: - return True, f'Evaluation completed successfully for {model_name} ({model_type})' + final_result = True + final_msg = f'Evaluation completed successfully for {model_name}' else: - error_msg = f'Evaluation failed for {model_name} ({model_type}) ' + final_result = False + final_msg = f'Evaluation failed for {model_name}' if result.returncode != 0: - error_msg += f'with return code {result.returncode}' + final_msg += f'with return code {result.returncode}' elif evaluation_failed: - error_msg += 'with internal errors detected in logs' + final_msg += 'with internal errors detected in logs' if stderr_output: - error_msg += f'\nSTDERR: {stderr_output}' + final_msg += f'\nSTDERR: {stderr_output}' else: error_lines = [] for line in stdout_output.split('\n'): if any(keyword in line for keyword in error_keywords): error_lines.append(line) if error_lines: - error_msg += f'\nLog errors: {" | ".join(error_lines[:3])}' + final_msg += f'\nLog errors: {" | ".join(error_lines[:3])}' + + write_to_summary(model_name, tp_num, final_result, final_msg, worker_id, work_dir) - return False, error_msg + return final_result, final_msg finally: os.chdir(original_cwd) @@ -158,6 +192,11 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA except subprocess.TimeoutExpired: timeout_msg = (f'Evaluation timed out for {model_name} ' f'after 7200 seconds') + if work_dir: + write_to_summary(model_name, 
tp_num, False, timeout_msg, worker_id, work_dir) return False, timeout_msg except Exception as e: - return False, f'Error during evaluation for {model_name}: {str(e)}' + error_msg = f'Error during evaluation for {model_name}: {str(e)}' + if work_dir: + write_to_summary(model_name, tp_num, False, error_msg, worker_id, work_dir) + return False, error_msg From 71022de17d56a197142b342faf0905e31a678c80 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Thu, 18 Sep 2025 19:38:50 +0800 Subject: [PATCH 04/28] CI: update workflow docker --- autotest/evaluate/eval_config_chat.py | 1 + autotest/utils/evaluate_utils.py | 18 +++++++++--------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/autotest/evaluate/eval_config_chat.py b/autotest/evaluate/eval_config_chat.py index 8d55ec232e..549605ac76 100644 --- a/autotest/evaluate/eval_config_chat.py +++ b/autotest/evaluate/eval_config_chat.py @@ -26,6 +26,7 @@ path=MODEL_PATH, meta_template=api_meta_template, max_out_len=2048, + batch_size=500, temperature=0.1, ) ] diff --git a/autotest/utils/evaluate_utils.py b/autotest/utils/evaluate_utils.py index 9ecc0dc724..57a3275c33 100644 --- a/autotest/utils/evaluate_utils.py +++ b/autotest/utils/evaluate_utils.py @@ -8,7 +8,7 @@ DEFAULT_PORT = 23333 -def write_to_summary(model_name, tp_num, result, msg, worker_id, work_dir=None): +def write_to_summary(model_name, tp_num, result, msg, worker_id, backend_type, work_dir=None): status = '✅ PASS' if result else '❌ FAIL' metrics = {} @@ -39,7 +39,7 @@ def write_to_summary(model_name, tp_num, result, msg, worker_id, work_dir=None): mmlu_value = metrics.get('mmlu', '') gsm8k_value = metrics.get('gsm8k', '') - summary_line = f'| {model_name} | TP{tp_num} | {status} | {mmlu_value} | {gsm8k_value} |\n' + summary_line = f'| {model_name} | {backend_type} | TP{tp_num} | {status} | {mmlu_value} | {gsm8k_value} |\n' summary_file = os.environ.get('GITHUB_STEP_SUMMARY', None) if summary_file: @@ -49,17 +49,17 @@ def write_to_summary(model_name, tp_num, result, msg, worker_id, work_dir=None): else: with open(summary_file, 'r') as f: first_lines = f.read(200) - if '| Model | TP | Status | mmlu | gsm8k |' not in first_lines: + if '| Model | Backend | TP | Status | mmlu | gsm8k |' not in first_lines: write_header = True with open(summary_file, 'a') as f: if write_header: f.write('## Model Evaluation Results\n') - f.write('| Model | TP | Status | mmlu | gsm8k |\n') - f.write('|-------|----|--------|------|-------|\n') + f.write('| Model | Backend | TP | Status | mmlu | gsm8k |\n') + f.write('|-------|---------|----|--------|------|-------|\n') f.write(summary_line) else: - print(f'Summary: {model_name} | TP{tp_num} | {status} | {mmlu_value} | {gsm8k_value}') + print(f'Summary: {model_name} | {backend_type} | TP{tp_num} | {status} | {mmlu_value} | {gsm8k_value}') def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFAULT_PORT): @@ -181,7 +181,7 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA if error_lines: final_msg += f'\nLog errors: {" | ".join(error_lines[:3])}' - write_to_summary(model_name, tp_num, final_result, final_msg, worker_id, work_dir) + write_to_summary(model_name, tp_num, final_result, final_msg, worker_id, backend_type, work_dir) return final_result, final_msg @@ -193,10 +193,10 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA timeout_msg = (f'Evaluation timed out for {model_name} ' f'after 7200 seconds') if work_dir: - 
write_to_summary(model_name, tp_num, False, timeout_msg, worker_id, work_dir) + write_to_summary(model_name, tp_num, False, timeout_msg, worker_id, backend_type, work_dir) return False, timeout_msg except Exception as e: error_msg = f'Error during evaluation for {model_name}: {str(e)}' if work_dir: - write_to_summary(model_name, tp_num, False, error_msg, worker_id, work_dir) + write_to_summary(model_name, tp_num, False, error_msg, worker_id, backend_type, work_dir) return False, error_msg From 88a683672198443f99f95a8c7d61214f343a6a1b Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Fri, 19 Sep 2025 14:16:31 +0800 Subject: [PATCH 05/28] TEST: update code based on comments --- .../workflows/{api_eva.yml => api_eval.yml} | 8 +- autotest/config.yaml | 6 +- autotest/evaluate/test_api_evaluate.py | 119 ++++++++++++++++++ .../evaluate/test_api_evaluate_pytorch.py | 90 ------------- .../evaluate/test_api_evaluate_turbomind.py | 90 ------------- autotest/utils/config_utils.py | 4 +- autotest/utils/evaluate_utils.py | 70 ++++++----- 7 files changed, 165 insertions(+), 222 deletions(-) rename .github/workflows/{api_eva.yml => api_eval.yml} (87%) create mode 100644 autotest/evaluate/test_api_evaluate.py delete mode 100644 autotest/evaluate/test_api_evaluate_pytorch.py delete mode 100644 autotest/evaluate/test_api_evaluate_turbomind.py diff --git a/.github/workflows/api_eva.yml b/.github/workflows/api_eval.yml similarity index 87% rename from .github/workflows/api_eva.yml rename to .github/workflows/api_eval.yml index 91e596e648..46393a0ebb 100644 --- a/.github/workflows/api_eva.yml +++ b/.github/workflows/api_eval.yml @@ -1,4 +1,4 @@ -name: api_eva +name: api_eval on: workflow_dispatch: @@ -127,8 +127,10 @@ jobs: run: | overall_exit=0 ln -s /mnt/187/opencompass-data/data ./data - pytest autotest/evaluate/test_api_evaluate_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? - pytest autotest/evaluate/test_api_evaluate_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_1 and not pr_test and ${{matrix.backend}}" -n 8 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_2 and not pr_test and ${{matrix.backend}}" -n 4 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_4 and not pr_test and ${{matrix.backend}}" -n 2 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_8 and not pr_test and ${{matrix.backend}}" -n 1 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? 
exit $overall_exit - name: Clear workspace if: always() diff --git a/autotest/config.yaml b/autotest/config.yaml index 379fd666bb..ddef407ff6 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -378,12 +378,8 @@ benchmark_model: evaluate_model: - google/gemma-2-9b-it - google/gemma-2-27b-it - - internlm/internlm2_5-7b-chat - - internlm/internlm3-8b-instruct - - meta-llama/Meta-Llama-3-8B-Instruct - meta-llama/Meta-Llama-3-1-8B-Instruct - Qwen/Qwen2.5-7B-Instruct - Qwen/Qwen2.5-32B-Instruct - - meta-llama/Llama-2-7b-chat-hf - Qwen/Qwen1.5-MoE-A2.7B-Chat - - internlm/internlm2_5-20b-chat + - Qwen/Qwen3-30B-A3B diff --git a/autotest/evaluate/test_api_evaluate.py b/autotest/evaluate/test_api_evaluate.py new file mode 100644 index 0000000000..ffd3edc97e --- /dev/null +++ b/autotest/evaluate/test_api_evaluate.py @@ -0,0 +1,119 @@ +import pytest +from utils.config_utils import get_evaluate_pytorch_model_list, get_evaluate_turbomind_model_list, get_workerid +from utils.evaluate_utils import restful_test +from utils.run_restful_chat import start_restful_api, stop_restful_api + +DEFAULT_PORT = 23333 + + +@pytest.fixture(scope='function', autouse=True) +def prepare_environment(request, config, worker_id): + param = request.param + model = param['model'] + backend = param['backend'] + model_path = config.get('model_path') + '/' + model + pid, startRes = start_restful_api(config, param, model, model_path, backend, worker_id) + yield param + stop_restful_api(pid, startRes, param) + + +def get_turbomind_model_list(tp_num): + model_list = get_evaluate_turbomind_model_list(tp_num, kvint_list=[4, 8]) + new_model_list = [] + for model in model_list: + model['cuda_prefix'] = None + new_model_list.append(model) + return new_model_list + + +def get_pytorch_model_list(tp_num): + model_list = get_evaluate_pytorch_model_list(tp_num, kvint_list=[4, 8]) + new_model_list = [] + for model in model_list: + model['cuda_prefix'] = None + new_model_list.append(model) + return new_model_list + + +def run_test(config, run_id, prepare_environment, worker_id): + if get_workerid(worker_id) is None: + result, msg = restful_test(config, run_id, prepare_environment, worker_id=worker_id) + else: + result, msg = restful_test(config, + run_id, + prepare_environment, + worker_id=worker_id, + port=DEFAULT_PORT + get_workerid(worker_id)) + return result, msg + + +@pytest.mark.turbomind +@pytest.mark.gpu_num_1 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('prepare_environment', get_turbomind_model_list(tp_num=1), indirect=True) +def test_turbomind_restful_tp1(config, run_id, prepare_environment, worker_id): + result, msg = run_test(config, run_id, prepare_environment, worker_id) + assert result, msg + + +@pytest.mark.turbomind +@pytest.mark.gpu_num_2 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('prepare_environment', get_turbomind_model_list(tp_num=2), indirect=True) +def test_turbomind_restful_tp2(config, run_id, prepare_environment, worker_id): + result, msg = run_test(config, run_id, prepare_environment, worker_id) + assert result, msg + + +@pytest.mark.turbomind +@pytest.mark.gpu_num_4 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('prepare_environment', get_turbomind_model_list(tp_num=4), indirect=True) +def test_turbomind_restful_tp4(config, run_id, prepare_environment, worker_id): + result, msg = run_test(config, run_id, prepare_environment, worker_id) + assert result, msg + + +@pytest.mark.turbomind +@pytest.mark.gpu_num_8 +@pytest.mark.flaky(reruns=0) 
+@pytest.mark.parametrize('prepare_environment', get_turbomind_model_list(tp_num=8), indirect=True) +def test_turbomind_restful_tp8(config, run_id, prepare_environment, worker_id): + result, msg = run_test(config, run_id, prepare_environment, worker_id) + assert result, msg + + +@pytest.mark.pytorch +@pytest.mark.gpu_num_1 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('prepare_environment', get_pytorch_model_list(tp_num=1), indirect=True) +def test_pytorch_restful_tp1(config, run_id, prepare_environment, worker_id): + result, msg = run_test(config, run_id, prepare_environment, worker_id) + assert result, msg + + +@pytest.mark.pytorch +@pytest.mark.gpu_num_2 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('prepare_environment', get_pytorch_model_list(tp_num=2), indirect=True) +def test_pytorch_restful_tp2(config, run_id, prepare_environment, worker_id): + result, msg = run_test(config, run_id, prepare_environment, worker_id) + assert result, msg + + +@pytest.mark.pytorch +@pytest.mark.gpu_num_4 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('prepare_environment', get_pytorch_model_list(tp_num=4), indirect=True) +def test_pytorch_restful_tp4(config, run_id, prepare_environment, worker_id): + result, msg = run_test(config, run_id, prepare_environment, worker_id) + assert result, msg + + +@pytest.mark.pytorch +@pytest.mark.gpu_num_8 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('prepare_environment', get_pytorch_model_list(tp_num=8), indirect=True) +def test_pytorch_restful_tp8(config, run_id, prepare_environment, worker_id): + result, msg = run_test(config, run_id, prepare_environment, worker_id) + assert result, msg diff --git a/autotest/evaluate/test_api_evaluate_pytorch.py b/autotest/evaluate/test_api_evaluate_pytorch.py deleted file mode 100644 index 79d77bade0..0000000000 --- a/autotest/evaluate/test_api_evaluate_pytorch.py +++ /dev/null @@ -1,90 +0,0 @@ -import pytest -from utils.config_utils import get_evaluate_pytorch_model_list, get_workerid -from utils.evaluate_utils import restful_test -from utils.run_restful_chat import start_restful_api, stop_restful_api - -DEFAULT_PORT = 23333 - - -@pytest.fixture(scope='function', autouse=True) -def prepare_environment(request, config, worker_id): - param = request.param - model = param['model'] - backend = param['backend'] - model_path = config.get('model_path') + '/' + model - pid, startRes = start_restful_api(config, param, model, model_path, backend, worker_id) - yield param - stop_restful_api(pid, startRes, param) - - -def getModelList(tp_num): - model_list = get_evaluate_pytorch_model_list(tp_num, kvint_list=[4, 8]) - new_model_list = [] - for model in model_list: - model['cuda_prefix'] = None - new_model_list.append(model) - return new_model_list - - -@pytest.mark.gpu_num_1 -@pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('prepare_environment', getModelList(tp_num=1), indirect=True) -def test_restful_tp1(config, run_id, prepare_environment, worker_id): - if get_workerid(worker_id) is None: - result, msg = restful_test(config, run_id, prepare_environment, worker_id=worker_id) - else: - result, msg = restful_test(config, - run_id, - prepare_environment, - worker_id=worker_id, - port=DEFAULT_PORT + get_workerid(worker_id)) - - assert result, msg - - -@pytest.mark.gpu_num_2 -@pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('prepare_environment', getModelList(tp_num=2), indirect=True) -def test_restful_tp2(config, run_id, prepare_environment, worker_id): - if get_workerid(worker_id) is None: - 
result, msg = restful_test(config, run_id, prepare_environment, worker_id=worker_id) - else: - result, msg = restful_test(config, - run_id, - prepare_environment, - worker_id=worker_id, - port=DEFAULT_PORT + get_workerid(worker_id)) - - assert result, msg - - -@pytest.mark.gpu_num_4 -@pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('prepare_environment', getModelList(tp_num=4), indirect=True) -def test_restful_tp4(config, run_id, prepare_environment, worker_id): - if get_workerid(worker_id) is None: - result, msg = restful_test(config, run_id, prepare_environment, worker_id=worker_id) - else: - result, msg = restful_test(config, - run_id, - prepare_environment, - worker_id=worker_id, - port=DEFAULT_PORT + get_workerid(worker_id)) - - assert result, msg - - -@pytest.mark.gpu_num_8 -@pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('prepare_environment', getModelList(tp_num=8), indirect=True) -def test_restful_tp8(config, run_id, prepare_environment, worker_id): - if get_workerid(worker_id) is None: - result, msg = restful_test(config, run_id, prepare_environment, worker_id=worker_id) - else: - result, msg = restful_test(config, - run_id, - prepare_environment, - worker_id=worker_id, - port=DEFAULT_PORT + get_workerid(worker_id)) - - assert result, msg diff --git a/autotest/evaluate/test_api_evaluate_turbomind.py b/autotest/evaluate/test_api_evaluate_turbomind.py deleted file mode 100644 index 38a838ff8b..0000000000 --- a/autotest/evaluate/test_api_evaluate_turbomind.py +++ /dev/null @@ -1,90 +0,0 @@ -import pytest -from utils.config_utils import get_evaluate_turbomind_model_list, get_workerid -from utils.evaluate_utils import restful_test -from utils.run_restful_chat import start_restful_api, stop_restful_api - -DEFAULT_PORT = 23333 - - -@pytest.fixture(scope='function', autouse=True) -def prepare_environment(request, config, worker_id): - param = request.param - model = param['model'] - backend = param['backend'] - model_path = config.get('model_path') + '/' + model - pid, startRes = start_restful_api(config, param, model, model_path, backend, worker_id) - yield param - stop_restful_api(pid, startRes, param) - - -def getModelList(tp_num): - model_list = get_evaluate_turbomind_model_list(tp_num, kvint_list=[4, 8]) - new_model_list = [] - for model in model_list: - model['cuda_prefix'] = None - new_model_list.append(model) - return new_model_list - - -@pytest.mark.gpu_num_1 -@pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('prepare_environment', getModelList(tp_num=1), indirect=True) -def test_restful_tp1(config, run_id, prepare_environment, worker_id): - if get_workerid(worker_id) is None: - result, msg = restful_test(config, run_id, prepare_environment, worker_id=worker_id) - else: - result, msg = restful_test(config, - run_id, - prepare_environment, - worker_id=worker_id, - port=DEFAULT_PORT + get_workerid(worker_id)) - - assert result, msg - - -@pytest.mark.gpu_num_2 -@pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('prepare_environment', getModelList(tp_num=2), indirect=True) -def test_restful_tp2(config, run_id, prepare_environment, worker_id): - if get_workerid(worker_id) is None: - result, msg = restful_test(config, run_id, prepare_environment, worker_id=worker_id) - else: - result, msg = restful_test(config, - run_id, - prepare_environment, - worker_id=worker_id, - port=DEFAULT_PORT + get_workerid(worker_id)) - - assert result, msg - - -@pytest.mark.gpu_num_4 -@pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('prepare_environment', getModelList(tp_num=4), 
indirect=True) -def test_restful_tp4(config, run_id, prepare_environment, worker_id): - if get_workerid(worker_id) is None: - result, msg = restful_test(config, run_id, prepare_environment, worker_id=worker_id) - else: - result, msg = restful_test(config, - run_id, - prepare_environment, - worker_id=worker_id, - port=DEFAULT_PORT + get_workerid(worker_id)) - - assert result, msg - - -@pytest.mark.gpu_num_8 -@pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('prepare_environment', getModelList(tp_num=8), indirect=True) -def test_restful_tp8(config, run_id, prepare_environment, worker_id): - if get_workerid(worker_id) is None: - result, msg = restful_test(config, run_id, prepare_environment, worker_id=worker_id) - else: - result, msg = restful_test(config, - run_id, - prepare_environment, - worker_id=worker_id, - port=DEFAULT_PORT + get_workerid(worker_id)) - - assert result, msg diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index fae0dbb6b9..1cc56403d8 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -255,9 +255,9 @@ def get_evaluate_turbomind_model_list(tp_num, is_longtext: bool = False, kvint_l if len(model_list) > 0: if tp_num > 1: - communicators = ['native', 'nccl'] + communicators = ['cuda-ipc', 'nccl'] else: - communicators = ['native'] + communicators = ['cuda-ipc'] for communicator in communicators: for item in model_list: if item.replace('-inner-4bits', '') in config.get('turbomind_chat_model') or item.replace( diff --git a/autotest/utils/evaluate_utils.py b/autotest/utils/evaluate_utils.py index 57a3275c33..417c55bff7 100644 --- a/autotest/utils/evaluate_utils.py +++ b/autotest/utils/evaluate_utils.py @@ -16,23 +16,33 @@ def write_to_summary(model_name, tp_num, result, msg, worker_id, backend_type, w if work_dir and os.path.exists(work_dir): try: summary_dirs = glob.glob(os.path.join(work_dir, '*', 'summary')) - if summary_dirs: - summary_dir = summary_dirs[0] - csv_files = glob.glob(os.path.join(summary_dir, 'summary_*.csv')) - if csv_files: - csv_file = sorted(csv_files)[-1] - if os.path.exists(csv_file): - with open(csv_file, 'r') as f: - reader = csv.reader(f) - next(reader) - for row in reader: - if len(row) >= 5 and row[4]: - dataset = row[0] - metric_value = row[4] - try: - metrics[dataset] = f'{float(metric_value):.2f}' - except ValueError: - metrics[dataset] = metric_value + if not summary_dirs: + raise FileNotFoundError('No summary directory found') + + summary_dir = summary_dirs[0] + + csv_files = glob.glob(os.path.join(summary_dir, 'summary_*.csv')) + if not csv_files: + raise FileNotFoundError('No CSV files found') + + csv_file = sorted(csv_files)[-1] + if not os.path.exists(csv_file): + raise FileNotFoundError('CSV file does not exist') + + with open(csv_file, 'r') as f: + reader = csv.reader(f) + next(reader) + for row in reader: + if len(row) < 5 or not row[4]: + continue + + dataset = row[0] + metric_value = row[4] + try: + metrics[dataset] = f'{float(metric_value):.2f}' + except ValueError: + metrics[dataset] = metric_value + except Exception as e: print(f'Error reading metrics: {str(e)}') @@ -43,15 +53,7 @@ def write_to_summary(model_name, tp_num, result, msg, worker_id, backend_type, w summary_file = os.environ.get('GITHUB_STEP_SUMMARY', None) if summary_file: - write_header = False - if not os.path.exists(summary_file) or os.path.getsize(summary_file) == 0: - write_header = True - else: - with open(summary_file, 'r') as f: - first_lines = f.read(200) - if '| Model | Backend | TP | Status | 
mmlu | gsm8k |' not in first_lines: - write_header = True - + write_header = not os.path.exists(summary_file) or os.path.getsize(summary_file) == 0 with open(summary_file, 'a') as f: if write_header: f.write('## Model Evaluation Results\n') @@ -68,9 +70,13 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA model_name = prepare_environment['model'] backend_type = prepare_environment['backend'] tp_num = prepare_environment.get('tp_num', 1) - communicator = prepare_environment.get('communicator', 'native') + communicator = prepare_environment.get('communicator', 'cuda-ipc') quant_policy = prepare_environment.get('quant_policy', 0) + summary_model_name = model_name + if quant_policy in [4, 8]: + summary_model_name = f'{model_name}-kvint{quant_policy}' + current_dir = os.path.dirname(os.path.abspath(__file__)) parent_dir = os.path.dirname(current_dir) @@ -99,13 +105,13 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA cfg = Config.fromfile(config_file) - cfg.MODEL_NAME = model_name + cfg.MODEL_NAME = summary_model_name cfg.MODEL_PATH = model_path cfg.API_BASE = f'http://127.0.0.1:{port}/v1' if cfg.models and len(cfg.models) > 0: model_cfg = cfg.models[0] - model_cfg['abbr'] = f'{model_name}-lmdeploy-api' + model_cfg['abbr'] = f'{summary_model_name}-lmdeploy-api' model_cfg['openai_api_base'] = f'http://127.0.0.1:{port}/v1' model_cfg['path'] = model_path if 'backend' in model_cfg: @@ -181,7 +187,7 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA if error_lines: final_msg += f'\nLog errors: {" | ".join(error_lines[:3])}' - write_to_summary(model_name, tp_num, final_result, final_msg, worker_id, backend_type, work_dir) + write_to_summary(summary_model_name, tp_num, final_result, final_msg, worker_id, backend_type, work_dir) return final_result, final_msg @@ -193,10 +199,10 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA timeout_msg = (f'Evaluation timed out for {model_name} ' f'after 7200 seconds') if work_dir: - write_to_summary(model_name, tp_num, False, timeout_msg, worker_id, backend_type, work_dir) + write_to_summary(summary_model_name, tp_num, False, timeout_msg, worker_id, backend_type, work_dir) return False, timeout_msg except Exception as e: error_msg = f'Error during evaluation for {model_name}: {str(e)}' if work_dir: - write_to_summary(model_name, tp_num, False, error_msg, worker_id, backend_type, work_dir) + write_to_summary(summary_model_name, tp_num, False, error_msg, worker_id, backend_type, work_dir) return False, error_msg From c68f4d2f8a3c807a3b0ac439c623cc9f4256bca5 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Fri, 19 Sep 2025 16:42:19 +0800 Subject: [PATCH 06/28] TEST: update docker --- .github/workflows/api_eval.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/api_eval.yml b/.github/workflows/api_eval.yml index 46393a0ebb..e10b6e01a8 100644 --- a/.github/workflows/api_eval.yml +++ b/.github/workflows/api_eval.yml @@ -79,7 +79,7 @@ jobs: matrix: backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: openmmlab/lmdeploy:latest-cu12 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip From fd244e72504f28c1cce6797377e1b336a0255fab Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Fri, 19 Sep 
2025 17:59:44 +0800 Subject: [PATCH 07/28] add H800 base model eval --- .github/scripts/eval_base_config.py | 113 ++++++------ .github/workflows/evaluate.yml | 25 +-- .github/workflows/evaluate_h800.yml | 165 ++++++++++++++++++ .../test_pipeline_chat_turbomind_mllm.py | 2 +- .../test_restful_chat_hf_turbomind_llm.py | 14 +- autotest/utils/config_utils.py | 2 +- autotest/utils/pipeline_chat.py | 4 +- autotest/utils/run_restful_chat.py | 2 +- 8 files changed, 242 insertions(+), 85 deletions(-) create mode 100644 .github/workflows/evaluate_h800.yml diff --git a/.github/scripts/eval_base_config.py b/.github/scripts/eval_base_config.py index 64bbdfd972..7c9d151715 100644 --- a/.github/scripts/eval_base_config.py +++ b/.github/scripts/eval_base_config.py @@ -39,26 +39,6 @@ wikibench_datasets # noqa: F401, E501 from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \ winogrande_datasets # noqa: F401, E501 - from opencompass.configs.models.baichuan.hf_baichuan_7b import models as hf_baichuan_7b # noqa: F401, E501 - from opencompass.configs.models.gemma.hf_gemma_7b import models as hf_gemma_7b # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import models as hf_internlm2_5_7b # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.hf_internlm2_7b import models as hf_internlm2_7b # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.hf_internlm2_20b import models as hf_internlm2_20b # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.hf_internlm_7b import models as hf_internlm_7b # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.hf_internlm_20b import models as hf_internlm_20b # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \ - models as lmdeploy_internlm2_5_7b # noqa: F401, E501 - from opencompass.configs.models.hf_llama.hf_llama2_7b import models as hf_llama2_7b # noqa: F401, E501 - from opencompass.configs.models.hf_llama.hf_llama3_8b import models as hf_llama3_8b # noqa: F401, E501 - from opencompass.configs.models.mistral.hf_mistral_7b_v0_1 import models as hf_mistral_7b_v0_1 # noqa: F401, E501 - from opencompass.configs.models.mistral.hf_mixtral_8x7b_v0_1 import \ - models as hf_mixtral_8x7b_v0_1 # noqa: F401, E501 - from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b import models as lmdeploy_qwen2_5_7b # noqa: F401, E501 - from opencompass.configs.models.qwen.hf_qwen1_5_7b import models as hf_qwen1_5_7b # noqa: F401, E501 - from opencompass.configs.models.qwen.hf_qwen2_7b import models as hf_qwen2_7b # noqa: F401, E501 - from opencompass.configs.models.qwen.hf_qwen_7b import models as hf_qwen_7b # noqa: F401, E501 - from opencompass.configs.models.qwen.lmdeploy_qwen1_5_7b import models as lmdeploy_qwen1_5_7b # noqa: F401, E501 - from opencompass.configs.models.qwen.lmdeploy_qwen2_7b import models as lmdeploy_qwen2_7b # noqa: F401, E501 # Summary Groups from opencompass.configs.summarizers.groups.cmmlu import cmmlu_summary_groups # noqa: F401, E501 from opencompass.configs.summarizers.groups.GaokaoBench import GaokaoBench_summary_groups # noqa: F401, E501 @@ -69,6 +49,14 @@ # read models race_datasets = [race_datasets[1]] +mmlu_datasets = [ + x for x in mmlu_datasets if x['abbr'].replace('lukaemon_mmlu_', '') in [ + 'business_ethics', 'clinical_knowledge', 'college_medicine', 'global_facts', 'human_aging', 'management', + 'marketing', 'medical_genetics', 'miscellaneous', 'nutrition', 'professional_accounting', + 
'professional_medicine', 'virology' + ] +] + summarizer = dict( dataset_abbrs=[ ['race-high', 'accuracy'], @@ -138,48 +126,69 @@ summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []), ) -turbomind_qwen1_5_7b = deepcopy(*lmdeploy_qwen1_5_7b) -turbomind_qwen2_7b = deepcopy(*lmdeploy_qwen2_7b) -turbomind_qwen2_5_7b = deepcopy(*lmdeploy_qwen2_5_7b) -turbomind_qwen2_5_14b = deepcopy(*lmdeploy_qwen2_5_7b) -turbomind_qwen2_5_14b['path'] = 'Qwen/Qwen2.5-14B' -turbomind_internlm2_5_7b = deepcopy(*lmdeploy_internlm2_5_7b) -turbomind_internlm2_5_7b_4bits = deepcopy(*lmdeploy_internlm2_5_7b) -turbomind_internlm2_5_7b_batch1 = deepcopy(*lmdeploy_internlm2_5_7b) -turbomind_internlm2_5_7b_batch1_4bits = deepcopy(*lmdeploy_internlm2_5_7b) - base_model = dict( type=TurboMindModel, - engine_config=dict(session_len=7168, max_batch_size=128, tp=1), + engine_config=dict(session_len=7168, tp=1), gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), max_seq_len=7168, max_out_len=1024, - batch_size=128, + batch_size=1024, run_cfg=dict(num_gpus=1), ) +turbomind_qwen2_5_1_5b = deepcopy(base_model) +turbomind_qwen2_5_1_5b['path'] = 'Qwen/Qwen2.5-1.5B' +turbomind_qwen2_5_7b = deepcopy(base_model) +turbomind_qwen2_5_7b['path'] = 'Qwen/Qwen2.5-7B' +turbomind_qwen2_5_32b = deepcopy(base_model) +turbomind_qwen2_5_32b['path'] = 'Qwen/Qwen2.5-32B' +turbomind_qwen2_5_32b['run_cfg']['num_gpus'] = 2 +turbomind_qwen2_5_32b['engine_config']['tp'] = 2 +turbomind_internlm2_5_7b = deepcopy(base_model) +turbomind_internlm2_5_7b['path'] = 'internlm/internlm2_5-7b-chat' +turbomind_glm_4_9b = deepcopy(base_model) +turbomind_glm_4_9b['path'] = 'THUDM/glm-4-9b' +turbomind_llama_3_70b = deepcopy(base_model) +turbomind_llama_3_70b['path'] = 'meta-llama/Meta-Llama-3-70B' +turbomind_llama_3_70b['run_cfg']['num_gpus'] = 4 +turbomind_llama_3_70b['engine_config']['tp'] = 4 +turbomind_llama_3_1_8b = deepcopy(base_model) +turbomind_llama_3_1_8b['path'] = 'meta-llama/Llama-3.1-8B' +turbomind_qwen3_0_6b_base = deepcopy(base_model) +turbomind_qwen3_0_6b_base['path'] = 'Qwen/Qwen3-0.6B-Base' turbomind_qwen3_8b_base = deepcopy(base_model) -pytorch_qwen3_8b_base = deepcopy(base_model) -turbomind_qwen3_8b_base_4bits = deepcopy(base_model) -turbomind_qwen3_8b_base_kvint8 = deepcopy(base_model) -for model in [ - v for k, v in locals().items() - if k.startswith('turbomind_qwen3_8b_base') or k.startswith('pytorch_qwen3_8b_base') -]: - model['abbr'] = 'qwen3_8b_base_turbomind' - model['path'] = 'Qwen/Qwen3-8B-Base' - model['run_cfg']['num_gpus'] = 1 - model['engine_config']['tp'] = 1 +turbomind_qwen3_8b_base['path'] = 'Qwen/Qwen3-8B-Base' +turbomind_qwen3_30b_A3B_base = deepcopy(base_model) +turbomind_qwen3_30b_A3B_base['path'] = 'Qwen/Qwen3-30B-A3B-Base' +turbomind_qwen3_30b_A3B_base['run_cfg']['num_gpus'] = 2 +turbomind_qwen3_30b_A3B_base['engine_config']['tp'] = 2 -for model in [v for k, v in locals().items() if k.endswith('_4bits')]: - model['engine_config']['model_format'] = 'awq' - model['abbr'] = model['abbr'] + '_4bits' - model['path'] = model['path'] + '-inner-4bits' - -for model in [v for k, v in locals().items() if '_batch1' in k]: - model['abbr'] = model['abbr'] + '_batch1' - model['engine_config']['max_batch_size'] = 1 - model['batch_size'] = 1 +pytorch_qwen2_5_1_5b = deepcopy(base_model) +pytorch_qwen2_5_1_5b['path'] = 'Qwen/Qwen2.5-1.5B' +pytorch_qwen2_5_7b = deepcopy(base_model) +pytorch_qwen2_5_7b['path'] = 'Qwen/Qwen2.5-7B' +pytorch_qwen2_5_32b = deepcopy(base_model) 
+pytorch_qwen2_5_32b['path'] = 'Qwen/Qwen2.5-32B' +pytorch_qwen2_5_32b['run_cfg']['num_gpus'] = 2 +pytorch_qwen2_5_32b['engine_config']['tp'] = 2 +pytorch_internlm2_5_7b = deepcopy(base_model) +pytorch_internlm2_5_7b['path'] = 'internlm/internlm2_5-7b-chat' +pytorch_gemma_2_9b = deepcopy(base_model) +pytorch_gemma_2_9b['path'] = 'google/gemma-2-9b' +pytorch_llama_3_70b = deepcopy(base_model) +pytorch_llama_3_70b['path'] = 'meta-llama/Meta-Llama-3-70B' +pytorch_llama_3_70b['run_cfg']['num_gpus'] = 4 +pytorch_llama_3_70b['engine_config']['tp'] = 4 +pytorch_llama_3_1_8b = deepcopy(base_model) +pytorch_llama_3_1_8b['path'] = 'meta-llama/Llama-3.1-8B' +pytorch_qwen3_0_6b_base = deepcopy(base_model) +pytorch_qwen3_0_6b_base['path'] = 'Qwen/Qwen3-0.6B-Base' +pytorch_qwen3_8b_base = deepcopy(base_model) +pytorch_qwen3_8b_base['path'] = 'Qwen/Qwen3-8B-Base' +pytorch_qwen3_30b_A3B_base = deepcopy(base_model) +pytorch_qwen3_30b_A3B_base['path'] = 'Qwen/Qwen3-30B-A3B-Base' +pytorch_qwen3_30b_A3B_base['run_cfg']['num_gpus'] = 2 +pytorch_qwen3_30b_A3B_base['engine_config']['tp'] = 2 for model in [v for k, v in locals().items() if k.startswith('pytorch_')]: model['abbr'] = model['abbr'].replace('turbomind', 'pytorch') diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml index be64e8743f..bfb0840b34 100644 --- a/.github/workflows/evaluate.yml +++ b/.github/workflows/evaluate.yml @@ -13,26 +13,16 @@ on: description: 'Set branch or tag or commit id. Default is "main"' type: string default: 'main' - chat_models: - required: true - description: 'Tested TurboMind models list. eg. [internlm_chat_7b,internlm_chat_7b_w8a16]' - type: string - default: '[turbomind_internlm2_5_7b_chat, pytorch_internlm2_5_7b_chat, turbomind_internlm2_5_7b_chat_batch1, turbomind_internlm2_5_7b_chat_batch1_4bits, turbomind_internlm3_8b_instruct, pytorch_internlm3_8b_instruct, turbomind_internlm2_5_20b_chat, pytorch_internlm2_5_20b_chat, turbomind_qwen1_5_7b_chat, pytorch_qwen1_5_7b_chat, turbomind_llama3_8b_instruct, pytorch_llama3_8b_instruct, turbomind_llama3_1_8b_instruct, pytorch_llama3_1_8b_instruct, turbomind_qwen2_7b_instruct, pytorch_qwen2_7b_instruct, turbomind_qwen2_5_7b_instruct, pytorch_qwen2_5_7b_instruct, turbomind_qwen2_5_32b_instruct, pytorch_qwen2_5_32b_instruct, turbomind_llama2_7b_chat, pytorch_qwen1_5_moe_2_7b_chat, pytorch_gemma_2_9b_it, pytorch_gemma_2_27b_it, turbomind_internlm2_5_7b_chat_4bits, turbomind_internlm2_5_7b_chat_kvint4, turbomind_internlm2_5_7b_chat_kvint8, pytorch_internlm2_5_7b_chat_w8a8, turbomind_internlm3_8b_instruct_4bits, turbomind_internlm3_8b_instruct_kvint4, turbomind_internlm3_8b_instruct_kvint8, pytorch_internlm3_8b_instruct_w8a8, turbomind_llama3_8b_instruct_4bits, turbomind_llama3_8b_instruct_kvint4, turbomind_llama3_1_8b_instruct_4bits, turbomind_llama3_1_8b_instruct_kvint4, turbomind_llama3_1_8b_instruct_kvint8,turbomind_llama3_8b_instruct_kvint8, pytorch_llama3_1_8b_instruct_w8a8, turbomind_qwen2_7b_instruct_kvint8, turbomind_qwen2_5_7b_instruct_4bits, turbomind_qwen2_5_7b_instruct_kvint8, pytorch_qwen2_5_7b_instruct_w8a8, turbomind_qwen2_5_32b_instruct_4bits, turbomind_qwen2_5_32b_instruct_kvint8,turbomind_llama2_7b_chat_4bits, turbomind_llama2_7b_chat_kvint4, turbomind_llama2_7b_chat_kvint8]' - chat_datasets: - required: true - description: 'Tested datasets list. eg. 
[*bbh_datasets,*ceval_datasets,*cmmlu_datasets,*GaokaoBench_datasets,*gpqa_datasets,*gsm8k_datasets,*hellaswag_datasets,*humaneval_datasets,*ifeval_datasets,*math_datasets,*sanitized_mbpp_datasets,*mmlu_datasets,*nq_datasets,*race_datasets,*TheoremQA_datasets,*triviaqa_datasets,*winogrande_datasets,*crowspairs_datasets]' - type: string - default: '[*mmlu_datasets, *gsm8k_datasets, *ifeval_datasets]' base_models: required: true - description: 'Tested TurboMind models list. eg. [turbomind_internlm2_5_7b, turbomind_internlm2_5_7b_4bits, turbomind_internlm2_5_7b_batch1, turbomind_internlm2_5_7b_batch1_4bits, turbomind_qwen2_7b, turbomind_qwen2_5_7b, turbomind_qwen2_5_14b]' + description: 'Tested TurboMind models list. eg. [turbomind_qwen2_5_1_5b, turbomind_qwen2_5_7b, turbomind_qwen2_5_32b, turbomind_internlm2_5_7b, turbomind_glm_4_9b, turbomind_llama_3_1_8b, turbomind_llama_3_70b, pytorch_qwen2_5_1_5b, pytorch_qwen2_5_7b, pytorch_qwen2_5_32b, pytorch_internlm2_5_7b, pytorch_gemma_2_9b, pytorch_llama_3_70b, pytorch_llama_3_1_8b]' type: string - default: '[turbomind_internlm2_5_7b, turbomind_internlm2_5_7b_4bits, turbomind_qwen2_7b, turbomind_qwen2_5_7b, turbomind_qwen2_5_14b]' + default: '[turbomind_qwen2_5_1_5b, turbomind_qwen2_5_7b, turbomind_qwen2_5_32b, turbomind_internlm2_5_7b, turbomind_glm_4_9b, turbomind_llama_3_1_8b, turbomind_llama_3_70b, pytorch_qwen2_5_1_5b, pytorch_qwen2_5_7b, pytorch_qwen2_5_32b, pytorch_internlm2_5_7b, pytorch_gemma_2_9b, pytorch_llama_3_70b, pytorch_llama_3_1_8b]' baes_datasets: required: true description: 'Tested datasets list. eg. [*mmlu_datasets, *gsm8k_datasets]' type: string - default: '[*race_datasets, *gsm8k_datasets, *gpqa_datasets, *winogrande_datasets]' + default: '[*mmlu_datasets, *gsm8k_datasets, *gpqa_datasets, *winogrande_datasets]' oc_repo_org: required: false description: 'Tested repository organization name. Default is open-compass/opencompass' @@ -96,7 +86,7 @@ jobs: strategy: fail-fast: false matrix: - evaluate_type: ['chat', 'base'] + evaluate_type: ['base'] container: image: openmmlab/lmdeploy:latest-cu12 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" @@ -158,13 +148,6 @@ jobs: run: | ln -s /root/opencompass-data ./data python3 .github/scripts/action_tools.py create_model_links /root/models . - - name: Evaluate chat models - if: matrix.evaluate_type == 'chat' - run: | - echo ${{github.event.inputs.chat_models}} - echo ${{github.event.inputs.chat_datasets}} - export LMDEPLOY_DIR=$(pwd) - python3 .github/scripts/action_tools.py evaluate "${{github.event.inputs.chat_models}}" "${{github.event.inputs.chat_datasets}}" /root/evaluation-reports/${{ github.run_id }} chat - name: Evaluate base models if: matrix.evaluate_type == 'base' run: | diff --git a/.github/workflows/evaluate_h800.yml b/.github/workflows/evaluate_h800.yml new file mode 100644 index 0000000000..85af1b53d8 --- /dev/null +++ b/.github/workflows/evaluate_h800.yml @@ -0,0 +1,165 @@ +name: evaluate + +on: + workflow_dispatch: + inputs: + repo_org: + required: false + description: 'Tested repository organization name. Default is InternLM/lmdeploy' + type: string + default: 'InternLM/lmdeploy' + repo_ref: + required: false + description: 'Set branch or tag or commit id. Default is "main"' + type: string + default: 'main' + base_models: + required: true + description: 'Tested TurboMind models list. eg. 
[turbomind_qwen3_0_6b_base, turbomind_qwen3_8b_base, turbomind_qwen3_30b_A3B_base, pytorch_qwen3_0_6b_base, pytorch_qwen3_8b_base, pytorch_qwen3_30b_A3B_base]' + type: string + default: '[turbomind_qwen3_0_6b_base, turbomind_qwen3_8b_base, turbomind_qwen3_30b_A3B_base, pytorch_qwen3_0_6b_base, pytorch_qwen3_8b_base, pytorch_qwen3_30b_A3B_base]' + baes_datasets: + required: true + description: 'Tested datasets list. eg. [*mmlu_datasets, *gsm8k_datasets]' + type: string + default: '[*race_datasets, *gsm8k_datasets, *gpqa_datasets, *winogrande_datasets]' + oc_repo_org: + required: false + description: 'Tested repository organization name. Default is open-compass/opencompass' + type: string + default: 'open-compass/opencompass' + oc_repo_ref: + required: false + description: 'Set branch or tag or commit id. Default is "main"' + type: string + default: 'main' + offline_mode: + required: true + description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself' + type: boolean + default: false + +env: + ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true + +jobs: + linux-build: + if: ${{github.event_name == 'schedule' || (!cancelled() && !inputs.offline_mode)}} + strategy: + matrix: + pyver: [py310] + runs-on: ubuntu-latest + env: + PYTHON_VERSION: ${{ matrix.pyver }} + PLAT_NAME: manylinux2014_x86_64 + DOCKER_TAG: cuda12.4 + OUTPUT_FOLDER: cuda12.4_dist_${{ github.run_id }} + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Build + run: | + echo ${PYTHON_VERSION} + echo ${PLAT_NAME} + echo ${DOCKER_TAG} + echo ${OUTPUT_FOLDER} + echo ${GITHUB_RUN_ID} + # remove -it + sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh + bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER} + - name: Upload Artifacts + uses: actions/upload-artifact@v4 + with: + if-no-files-found: error + path: builder/manywheel/${{ env.OUTPUT_FOLDER }} + retention-days: 1 + name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }} + + evaluate: + needs: linux-build + if: ${{github.event_name == 'schedule' || !cancelled()}} + runs-on: [self-hosted, linux-eval] + timeout-minutes: 4320 # 72hours + strategy: + fail-fast: false + matrix: + evaluate_type: ['base'] + container: + image: openmmlab/lmdeploy:latest-cu12.8 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/github-actions/resources:/root/resources + - /nvme/github-actions/opencompass-data:/root/opencompass-data + - /nvme/qa_test_models/evaluation-reports:/root/evaluation-reports + - /nvme/qa_test_models:/root/models + - /mnt/187:/mnt/187 + - /mnt/bigdisk:/mnt/bigdisk + - /mnt/shared:/mnt/shared + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Setup systems + run: | + export TIME_STAMP="$(date +'%Y%m%d-%H%M%S')" + echo "TIME_STAMP=$TIME_STAMP" >> $GITHUB_ENV + - name: Clone repository + uses: actions/checkout@v2 + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Copy repository - offline + if: ${{inputs.offline_mode}} + run: cp -r 
/root/models/offline_pkg/lmdeploy/. . + - name: Download Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + uses: actions/download-artifact@v4 + with: + name: my-artifact-${{ github.run_id }}-py310 + - name: Install lmdeploy - dependency + run: | + python3 -m pip install -r /root/models/offline_pkg/requirements.txt + - name: Install lmdeploy + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: | + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Install lmdeploy - offline + if: ${{inputs.offline_mode}} + run: | + python3 -m pip install /root/models/offline_pkg/py310/lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Install opencompass + run: | + git clone https://github.com/${{ github.event.inputs.oc_repo_org}}.git + cd opencompass + git checkout ${{ github.event.inputs.oc_repo_ref}} + python3 -m pip install . + echo "OPENCOMPASS_DIR=$(pwd)" >> $GITHUB_ENV + - name: Check env + run: | + python3 -m pip list + lmdeploy check_env + - name: Setup paths for evaluation + run: | + ln -s /root/opencompass-data ./data + python3 .github/scripts/action_tools.py create_model_links /root/models . + - name: Evaluate base models + if: matrix.evaluate_type == 'base' + run: | + echo ${{github.event.inputs.base_models}} + echo ${{github.event.inputs.baes_datasets}} + export LMDEPLOY_DIR=$(pwd) + python3 .github/scripts/action_tools.py evaluate "${{github.event.inputs.base_models}}" "${{github.event.inputs.baes_datasets}}" /root/evaluation-reports/${{ github.run_id }} base + - name: Clear workspace + if: always() + run: | + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py index bcfd071eba..da7e255a8e 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py @@ -32,7 +32,7 @@ def test_pipeline_chat_tp2(config, model, communicator, worker_id): set_device_env_variable(worker_id, tp_num=2) os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) if ('MiniCPM-V-2_6' in model or 'InternVL2_5-26B' in model or 'InternVL2-26B' in model - or 'InternVL3-38B' in model) and communicator == 'native': + or 'InternVL3-38B' in model) and communicator == 'cuda-ipc': return run_pipeline_vl_chat_test(config, model, BACKEND, worker_id, {'communicator': communicator}) diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py index daf2664662..33c5b4ba3c 100644 --- a/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py +++ b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py @@ -245,25 +245,25 @@ def test_restful_chat_fallback_backend_tp1(config, common_case_config, worker_id 'model': 'google/gemma-2-27b-it', 'cuda_prefix': None, 'tp_num': 2, - 'extra': ' --communicator native' + 'extra': ' --communicator cuda-ipc' }, { 'model': 'deepseek-ai/deepseek-moe-16b-chat', 'cuda_prefix': None, 'tp_num': 2, - 'extra': ' --communicator native' + 'extra': ' --communicator cuda-ipc' }, { 'model': 'google/gemma-2-27b-it', 'cuda_prefix': None, 'tp_num': 2, - 'extra': ' --quant-policy 8 --communicator native' + 'extra': ' --quant-policy 8 --communicator cuda-ipc' }, { 'model': 
'deepseek-ai/deepseek-moe-16b-chat', 'cuda_prefix': None, 'tp_num': 2, - 'extra': ' --quant-policy 8 --communicator native' + 'extra': ' --quant-policy 8 --communicator cuda-ipc' }, ], indirect=True) @@ -301,19 +301,19 @@ def test_restful_chat_fallback_backend_tp2(config, common_case_config, worker_id 'model': 'internlm/internlm2_5-20b-chat', 'cuda_prefix': 'CUDA_VISIBLE_DEVICES=5,6', 'tp_num': 2, - 'extra': ' --communicator native' + 'extra': ' --communicator cuda-ipc' }, { 'model': 'internlm/internlm2_5-20b-chat-inner-4bits', 'cuda_prefix': 'CUDA_VISIBLE_DEVICES=5,6', 'tp_num': 2, - 'extra': ' --communicator native' + 'extra': ' --communicator cuda-ipc' }, { 'model': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'cuda_prefix': 'CUDA_VISIBLE_DEVICES=5,6', 'tp_num': 2, - 'extra': ' --communicator native' + 'extra': ' --communicator cuda-ipc' }, ], indirect=True) diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index 51de106840..ca0f969c2d 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -85,7 +85,7 @@ def get_all_model_list(tp_num: int = None, quant_policy: int = None, model_type: def get_communicator_list(tp_num: int = None): if tp_num != 1 and _is_bf16_supported_by_device(): - return ['native', 'nccl'] + return ['cuda-ipc', 'nccl'] return ['nccl'] diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index fabc074d37..2a8349b572 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -23,7 +23,7 @@ def run_pipeline_chat_test(config, # temp remove testcase because of issue 3434 if ('InternVL3' in model_case or 'InternVL2_5' in model_case or 'MiniCPM-V-2_6' in model_case ) and 'turbomind' in backend_type and extra is not None and 'communicator' in extra and extra.get( - 'communicator') == 'native' and tp > 1: + 'communicator') == 'cuda-ipc' and tp > 1: return model_name = model_name = get_model_name(model_case) model_path = config.get('model_path') @@ -104,7 +104,7 @@ def run_pipeline_vl_chat_test(config, if ('InternVL3' in model_case or 'InternVL2_5' in model_case or 'MiniCPM-V-2_6' in model_case ) and 'turbomind' in backend_type and extra is not None and 'communicator' in extra and extra.get( - 'communicator') == 'native' and tp > 1: + 'communicator') == 'cuda-ipc' and tp > 1: return pipeline_chat_log = os.path.join( diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index 88c9468823..eca66f2fc0 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -31,7 +31,7 @@ def start_restful_api(config, param, model, model_path, backend_type, worker_id) # temp remove testcase because of issue 3434 if ('InternVL3' in model or 'InternVL2_5' in model or 'MiniCPM-V-2_6' in model): - if 'turbomind' in backend_type and extra is not None and 'communicator native' in extra and tp_num > 1: + if 'turbomind' in backend_type and extra is not None and 'cuda-ipc' in extra and tp_num > 1: return if 'modelscope' in param.keys(): From cf1ddccd17cf73f61da3699c332cfdc26e26e617 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Fri, 19 Sep 2025 18:19:27 +0800 Subject: [PATCH 08/28] update --- .github/workflows/api_eval_h800.yml | 2 +- .github/workflows/evaluate.yml | 2 +- .github/workflows/evaluate_h800.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/api_eval_h800.yml b/.github/workflows/api_eval_h800.yml index 16b9ad07dc..f655b459a6 100644 --- a/.github/workflows/api_eval_h800.yml +++ 
b/.github/workflows/api_eval_h800.yml @@ -73,7 +73,7 @@ jobs: test_evaluation: needs: linux-build if: ${{ !cancelled() }} - runs-on: [self-hosted, test-140] + runs-on: [self-hosted, h800-r1] timeout-minutes: 2400 strategy: fail-fast: false diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml index bfb0840b34..d5c38605dc 100644 --- a/.github/workflows/evaluate.yml +++ b/.github/workflows/evaluate.yml @@ -81,7 +81,7 @@ jobs: evaluate: needs: linux-build if: ${{github.event_name == 'schedule' || !cancelled()}} - runs-on: [self-hosted, linux-eval] + runs-on: [self-hosted, test-140] timeout-minutes: 4320 # 72hours strategy: fail-fast: false diff --git a/.github/workflows/evaluate_h800.yml b/.github/workflows/evaluate_h800.yml index 8b8544e5da..4d5cb73796 100644 --- a/.github/workflows/evaluate_h800.yml +++ b/.github/workflows/evaluate_h800.yml @@ -81,7 +81,7 @@ jobs: evaluate: needs: linux-build if: ${{github.event_name == 'schedule' || !cancelled()}} - runs-on: [self-hosted, linux-eval] + runs-on: [self-hosted, h800-r1] timeout-minutes: 4320 # 72hours strategy: fail-fast: false From b50e1f5c5c77b227f28160bdd193a2be370ea370 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Fri, 19 Sep 2025 19:17:30 +0800 Subject: [PATCH 09/28] update --- .github/workflows/api_eval_h800.yml | 2 +- .github/workflows/evaluate_h800.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/api_eval_h800.yml b/.github/workflows/api_eval_h800.yml index f655b459a6..496e0da25d 100644 --- a/.github/workflows/api_eval_h800.yml +++ b/.github/workflows/api_eval_h800.yml @@ -80,7 +80,7 @@ jobs: matrix: backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: m.daocloud.io/docker.io/openmmlab/lmdeploy:latest-cu12.8 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip diff --git a/.github/workflows/evaluate_h800.yml b/.github/workflows/evaluate_h800.yml index 4d5cb73796..e54e939c53 100644 --- a/.github/workflows/evaluate_h800.yml +++ b/.github/workflows/evaluate_h800.yml @@ -88,7 +88,7 @@ jobs: matrix: evaluate_type: ['base'] container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: m.daocloud.io/docker.io/openmmlab/lmdeploy:latest-cu12.8 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip From 795132580a4c48a9df7d8e02b596c6ae03b3aa57 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Sun, 21 Sep 2025 00:21:39 +0800 Subject: [PATCH 10/28] Update eval_base_config.py --- .github/scripts/eval_base_config.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/.github/scripts/eval_base_config.py b/.github/scripts/eval_base_config.py index 7c9d151715..d8dc388c7d 100644 --- a/.github/scripts/eval_base_config.py +++ b/.github/scripts/eval_base_config.py @@ -138,58 +138,77 @@ turbomind_qwen2_5_1_5b = deepcopy(base_model) turbomind_qwen2_5_1_5b['path'] = 'Qwen/Qwen2.5-1.5B' +turbomind_qwen2_5_1_5b['abbr'] = 'turbomind_qwen2_5_1_5b' turbomind_qwen2_5_7b = deepcopy(base_model) turbomind_qwen2_5_7b['path'] = 'Qwen/Qwen2.5-7B' +turbomind_qwen2_5_7b['abbr'] = 'turbomind_qwen2_5_7b' turbomind_qwen2_5_32b = deepcopy(base_model) turbomind_qwen2_5_32b['path'] = 'Qwen/Qwen2.5-32B' 
+turbomind_qwen2_5_32b['abbr'] = 'turbomind_qwen2_5_32b' turbomind_qwen2_5_32b['run_cfg']['num_gpus'] = 2 turbomind_qwen2_5_32b['engine_config']['tp'] = 2 turbomind_internlm2_5_7b = deepcopy(base_model) turbomind_internlm2_5_7b['path'] = 'internlm/internlm2_5-7b-chat' +turbomind_internlm2_5_7b['abbr'] = 'turbomind_internlm2_5_7b' turbomind_glm_4_9b = deepcopy(base_model) turbomind_glm_4_9b['path'] = 'THUDM/glm-4-9b' +turbomind_glm_4_9b['abbr'] = 'turbomind_glm_4_9b' turbomind_llama_3_70b = deepcopy(base_model) turbomind_llama_3_70b['path'] = 'meta-llama/Meta-Llama-3-70B' +turbomind_llama_3_70b['abbr'] = 'turbomind_llama_3_70b' turbomind_llama_3_70b['run_cfg']['num_gpus'] = 4 turbomind_llama_3_70b['engine_config']['tp'] = 4 turbomind_llama_3_1_8b = deepcopy(base_model) turbomind_llama_3_1_8b['path'] = 'meta-llama/Llama-3.1-8B' +turbomind_llama_3_1_8b['abbr'] = 'turbomind_llama_3_1_8b' turbomind_qwen3_0_6b_base = deepcopy(base_model) turbomind_qwen3_0_6b_base['path'] = 'Qwen/Qwen3-0.6B-Base' +turbomind_qwen3_0_6b_base['abbr'] = 'turbomind_qwen3_0_6b_base' turbomind_qwen3_8b_base = deepcopy(base_model) turbomind_qwen3_8b_base['path'] = 'Qwen/Qwen3-8B-Base' +turbomind_qwen3_8b_base['abbr'] = 'turbomind_qwen3_8b_base' turbomind_qwen3_30b_A3B_base = deepcopy(base_model) turbomind_qwen3_30b_A3B_base['path'] = 'Qwen/Qwen3-30B-A3B-Base' +turbomind_qwen3_30b_A3B_base['abbr'] = 'turbomind_qwen3_30b_A3B_base' turbomind_qwen3_30b_A3B_base['run_cfg']['num_gpus'] = 2 turbomind_qwen3_30b_A3B_base['engine_config']['tp'] = 2 pytorch_qwen2_5_1_5b = deepcopy(base_model) pytorch_qwen2_5_1_5b['path'] = 'Qwen/Qwen2.5-1.5B' +pytorch_qwen2_5_1_5b['abbr'] = 'pytorch_qwen2_5_1_5b' pytorch_qwen2_5_7b = deepcopy(base_model) pytorch_qwen2_5_7b['path'] = 'Qwen/Qwen2.5-7B' +pytorch_qwen2_5_7b['abbr'] = 'pytorch_qwen2_5_7b' pytorch_qwen2_5_32b = deepcopy(base_model) pytorch_qwen2_5_32b['path'] = 'Qwen/Qwen2.5-32B' +pytorch_qwen2_5_32b['abbr'] = 'pytorch_qwen2_5_32b' pytorch_qwen2_5_32b['run_cfg']['num_gpus'] = 2 pytorch_qwen2_5_32b['engine_config']['tp'] = 2 pytorch_internlm2_5_7b = deepcopy(base_model) pytorch_internlm2_5_7b['path'] = 'internlm/internlm2_5-7b-chat' +pytorch_internlm2_5_7b['abbr'] = 'pytorch_internlm2_5_7b' pytorch_gemma_2_9b = deepcopy(base_model) pytorch_gemma_2_9b['path'] = 'google/gemma-2-9b' +pytorch_gemma_2_9b['abbr'] = 'pytorch_gemma_2_9b' pytorch_llama_3_70b = deepcopy(base_model) pytorch_llama_3_70b['path'] = 'meta-llama/Meta-Llama-3-70B' +pytorch_llama_3_70b['abbr'] = 'pytorch_llama_3_70b' pytorch_llama_3_70b['run_cfg']['num_gpus'] = 4 pytorch_llama_3_70b['engine_config']['tp'] = 4 pytorch_llama_3_1_8b = deepcopy(base_model) pytorch_llama_3_1_8b['path'] = 'meta-llama/Llama-3.1-8B' +pytorch_llama_3_1_8b['abbr'] = 'pytorch_llama_3_1_8b' pytorch_qwen3_0_6b_base = deepcopy(base_model) pytorch_qwen3_0_6b_base['path'] = 'Qwen/Qwen3-0.6B-Base' +pytorch_qwen3_0_6b_base['abbr'] = 'pytorch_qwen3_0_6b_base' pytorch_qwen3_8b_base = deepcopy(base_model) pytorch_qwen3_8b_base['path'] = 'Qwen/Qwen3-8B-Base' +pytorch_qwen3_8b_base['abbr'] = 'pytorch_qwen3_8b_base' pytorch_qwen3_30b_A3B_base = deepcopy(base_model) pytorch_qwen3_30b_A3B_base['path'] = 'Qwen/Qwen3-30B-A3B-Base' +pytorch_qwen3_30b_A3B_base['abbr'] = 'pytorch_qwen3_30b_A3B_base' pytorch_qwen3_30b_A3B_base['run_cfg']['num_gpus'] = 2 pytorch_qwen3_30b_A3B_base['engine_config']['tp'] = 2 for model in [v for k, v in locals().items() if k.startswith('pytorch_')]: - model['abbr'] = model['abbr'].replace('turbomind', 'pytorch') model['backend'] = 'pytorch' 
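
The eval_base_config.py changes above repeat the same pattern for every model variant: deep-copy the shared base_model template, then override path and abbr and, for multi-GPU runs, engine_config['tp'] and run_cfg['num_gpus']. Giving each variant an explicit abbr (rather than deriving it with replace('turbomind', 'pytorch')) keeps the names aligned with the base_models identifiers the workflow passes to action_tools.py. A small helper could capture that pattern; the sketch below is only an illustration of the idea — make_variant is not part of these patches, and it assumes base_model keeps the engine_config / run_cfg layout defined earlier in the config.

    from copy import deepcopy

    def make_variant(base, abbr, path, tp=1, backend=None):
        # Clone the shared opencompass template and override the per-model
        # fields, mirroring the hand-written turbomind_* / pytorch_* blocks.
        model = deepcopy(base)
        model['abbr'] = abbr
        model['path'] = path
        model['engine_config']['tp'] = tp
        model['run_cfg']['num_gpus'] = tp
        if backend is not None:
            model['backend'] = backend
        return model

    # e.g. turbomind_qwen2_5_32b = make_variant(base_model, 'turbomind_qwen2_5_32b',
    #                                           'Qwen/Qwen2.5-32B', tp=2)
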
From a4a903bec4c862974f0044d983cb045240955f76 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Sun, 21 Sep 2025 00:49:44 +0800 Subject: [PATCH 11/28] update --- .github/workflows/api_eval_h800.yml | 2 +- .github/workflows/daily_ete_test_h800.yml | 2 -- autotest/config-h800.yaml | 1 + autotest/config.yaml | 1 + 4 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/api_eval_h800.yml b/.github/workflows/api_eval_h800.yml index 496e0da25d..b7cc491d59 100644 --- a/.github/workflows/api_eval_h800.yml +++ b/.github/workflows/api_eval_h800.yml @@ -32,7 +32,6 @@ env: OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt DEEPSEEK_VL: /nvme/qa_test_models/offline_pkg/DeepSeek-VL - DEVICE: h800 jobs: linux-build: @@ -115,6 +114,7 @@ jobs: run: | python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt + mv autotest/config-h800.yaml autotest/config.yaml - name: Install opencompass run: | python3 -m pip install opencompass diff --git a/.github/workflows/daily_ete_test_h800.yml b/.github/workflows/daily_ete_test_h800.yml index 1dab90bebf..9f1db0dce8 100644 --- a/.github/workflows/daily_ete_test_h800.yml +++ b/.github/workflows/daily_ete_test_h800.yml @@ -179,7 +179,6 @@ jobs: lmdeploy check_env rm -rf allure-results # remove tmp log in testcase - rm -rf /nvme/qa_test_models/autotest_model/log/* mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test lmdeploy - chat @@ -270,7 +269,6 @@ jobs: lmdeploy check_env rm -rf allure-results # remove tmp log in testcase - rm -rf /nvme/qa_test_models/autotest_model/log/* mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Start restful api diff --git a/autotest/config-h800.yaml b/autotest/config-h800.yaml index 553039938d..d2ed946ac2 100644 --- a/autotest/config-h800.yaml +++ b/autotest/config-h800.yaml @@ -2,6 +2,7 @@ model_path: /nvme/qa_test_models resource_path: /nvme/qa_test_models/resource dst_path: /nvme/qa_test_models/autotest_model log_path: /nvme/qa_test_models/autotest_model/log +eval_log_path: /nvme/qa_test_models/evaluation_report benchmark_path: /nvme/qa_test_models/benchmark-reports dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json env_tag: h800 diff --git a/autotest/config.yaml b/autotest/config.yaml index 4d82aabc81..8973f21fd5 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -2,6 +2,7 @@ model_path: /nvme/qa_test_models resource_path: /nvme/qa_test_models/resource dst_path: /nvme/qa_test_models/autotest_model log_path: /nvme/qa_test_models/autotest_model/log +eval_log_path: /nvme/qa_test_models/evaluation_report benchmark_path: /nvme/qa_test_models/benchmark-reports dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json env_tag: a100 From cbed0dcf58216bd59466156888481a6431f523bc Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Mon, 22 Sep 2025 09:53:28 +0800 Subject: [PATCH 12/28] update --- autotest/evaluate/eval_config_chat.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/autotest/evaluate/eval_config_chat.py b/autotest/evaluate/eval_config_chat.py index 549605ac76..ea37f858cf 100644 --- a/autotest/evaluate/eval_config_chat.py +++ b/autotest/evaluate/eval_config_chat.py @@ -6,6 +6,14 @@ from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import mmlu_datasets 
# noqa: F401, E501 from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups # noqa: F401, E501 +mmlu_datasets = [ + x for x in mmlu_datasets if x['abbr'].replace('lukaemon_mmlu_', '') in [ + 'business_ethics', 'clinical_knowledge', 'college_medicine', 'global_facts', 'human_aging', 'management', + 'marketing', 'medical_genetics', 'miscellaneous', 'nutrition', 'professional_accounting', + 'professional_medicine', 'virology' + ] +] + datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], []) MODEL_NAME = '' @@ -35,6 +43,7 @@ dataset_abbrs=[ ['mmlu', 'naive_average'], ['gsm8k', 'accuracy'], + 'mmlu-other', ], summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []), ) From a72ea20510beb8fdc519298312815579ebbcbda7 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Mon, 22 Sep 2025 11:21:27 +0800 Subject: [PATCH 13/28] update max_out_len --- autotest/evaluate/eval_config_chat.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/autotest/evaluate/eval_config_chat.py b/autotest/evaluate/eval_config_chat.py index ea37f858cf..122f4e1a94 100644 --- a/autotest/evaluate/eval_config_chat.py +++ b/autotest/evaluate/eval_config_chat.py @@ -1,5 +1,6 @@ from mmengine.config import read_base from opencompass.models import OpenAISDK +from opencompass.utils.text_postprocessors import extract_non_reasoning_content with read_base(): from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets # noqa: F401, E501 @@ -26,17 +27,16 @@ ]) models = [ - dict( - type=OpenAISDK, - abbr=f'{MODEL_NAME}-lmdeploy-api', - openai_api_base=API_BASE, - key='EMPTY', - path=MODEL_PATH, - meta_template=api_meta_template, - max_out_len=2048, - batch_size=500, - temperature=0.1, - ) + dict(type=OpenAISDK, + abbr=f'{MODEL_NAME}-lmdeploy-api', + openai_api_base=API_BASE, + key='EMPTY', + path=MODEL_PATH, + meta_template=api_meta_template, + max_out_len=32768, + batch_size=500, + temperature=0.1, + pred_postprocessor=dict(type=extract_non_reasoning_content)) ] summarizer = dict( From a42ac7ee2adae7c1bc43f51b8be7514e0b802252 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Mon, 22 Sep 2025 11:48:23 +0800 Subject: [PATCH 14/28] set oc data path --- .github/workflows/api_eval.yml | 2 +- .github/workflows/api_eval_h800.yml | 1 + .github/workflows/evaluate.yml | 2 +- .github/workflows/evaluate_h800.yml | 2 +- 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/api_eval.yml b/.github/workflows/api_eval.yml index e10b6e01a8..9e3e327f5a 100644 --- a/.github/workflows/api_eval.yml +++ b/.github/workflows/api_eval.yml @@ -32,6 +32,7 @@ env: OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt DEEPSEEK_VL: /nvme/qa_test_models/offline_pkg/DeepSeek-VL + COMPASS_DATA_CACHE: /nvme/qa_test_models/compass_data_cache jobs: linux-build: @@ -85,7 +86,6 @@ jobs: - /nvme/github-actions/pip-cache:/root/.cache/pip - /nvme/github-actions/packages:/root/packages - /nvme/github-actions/resources:/root/resources - - /nvme/github-actions/opencompass-data:/root/opencompass-data - /nvme/qa_test_models/evaluation-reports:/root/evaluation-reports - /nvme/qa_test_models:/nvme/qa_test_models - /mnt/shared:/mnt/shared diff --git a/.github/workflows/api_eval_h800.yml b/.github/workflows/api_eval_h800.yml index b7cc491d59..0d3518ffde 100644 --- a/.github/workflows/api_eval_h800.yml +++ b/.github/workflows/api_eval_h800.yml @@ -32,6 +32,7 @@ env: 
OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt DEEPSEEK_VL: /nvme/qa_test_models/offline_pkg/DeepSeek-VL + COMPASS_DATA_CACHE: /nvme/qa_test_models/compass_data_cache jobs: linux-build: diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml index d5c38605dc..4395ee69b4 100644 --- a/.github/workflows/evaluate.yml +++ b/.github/workflows/evaluate.yml @@ -41,6 +41,7 @@ on: env: ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true + COMPASS_DATA_CACHE: /nvme/qa_test_models/compass_data_cache jobs: linux-build: @@ -94,7 +95,6 @@ jobs: - /nvme/github-actions/pip-cache:/root/.cache/pip - /nvme/github-actions/packages:/root/packages - /nvme/github-actions/resources:/root/resources - - /nvme/github-actions/opencompass-data:/root/opencompass-data - /nvme/qa_test_models/evaluation-reports:/root/evaluation-reports - /nvme/qa_test_models:/root/models - /mnt/187:/mnt/187 diff --git a/.github/workflows/evaluate_h800.yml b/.github/workflows/evaluate_h800.yml index e54e939c53..afea663963 100644 --- a/.github/workflows/evaluate_h800.yml +++ b/.github/workflows/evaluate_h800.yml @@ -41,6 +41,7 @@ on: env: ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true + COMPASS_DATA_CACHE: /nvme/qa_test_models/compass_data_cache jobs: linux-build: @@ -94,7 +95,6 @@ jobs: - /nvme/github-actions/pip-cache:/root/.cache/pip - /nvme/github-actions/packages:/root/packages - /nvme/github-actions/resources:/root/resources - - /nvme/github-actions/opencompass-data:/root/opencompass-data - /nvme/qa_test_models/evaluation-reports:/root/evaluation-reports - /nvme/qa_test_models:/root/models - /nvme/qa_test_models:/nvme/qa_test_models From 94f8b85810b3e9751b397f9657cbdd5dffbf298c Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Mon, 22 Sep 2025 12:57:21 +0800 Subject: [PATCH 15/28] update --- autotest/utils/evaluate_utils.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/autotest/utils/evaluate_utils.py b/autotest/utils/evaluate_utils.py index 417c55bff7..82dccb92f6 100644 --- a/autotest/utils/evaluate_utils.py +++ b/autotest/utils/evaluate_utils.py @@ -39,7 +39,7 @@ def write_to_summary(model_name, tp_num, result, msg, worker_id, backend_type, w dataset = row[0] metric_value = row[4] try: - metrics[dataset] = f'{float(metric_value):.2f}' + metrics[dataset] = f'{float(metric_value):.2f}' # noqa: E231 except ValueError: metrics[dataset] = metric_value @@ -47,6 +47,7 @@ def write_to_summary(model_name, tp_num, result, msg, worker_id, backend_type, w print(f'Error reading metrics: {str(e)}') mmlu_value = metrics.get('mmlu', '') + mmlu_value = metrics.get('mmlu-other', '') gsm8k_value = metrics.get('gsm8k', '') summary_line = f'| {model_name} | {backend_type} | TP{tp_num} | {status} | {mmlu_value} | {gsm8k_value} |\n' @@ -57,8 +58,8 @@ def write_to_summary(model_name, tp_num, result, msg, worker_id, backend_type, w with open(summary_file, 'a') as f: if write_header: f.write('## Model Evaluation Results\n') - f.write('| Model | Backend | TP | Status | mmlu | gsm8k |\n') - f.write('|-------|---------|----|--------|------|-------|\n') + f.write('| Model | Backend | TP | Status | mmlu | mmlu-other | gsm8k |\n') + f.write('|-------|---------|----|--------|------|------------|-------|\n') f.write(summary_line) else: print(f'Summary: {model_name} | {backend_type} | TP{tp_num} | {status} | {mmlu_value} | {gsm8k_value}') @@ -101,18 +102,18 @@ def restful_test(config, run_id, prepare_environment, 
worker_id='gw0', port=DEFA try: if not os.path.exists(config_file): - return False, f'Config file {config_file} not found in any expected location' + return False, f'Config file {config_file} not found' cfg = Config.fromfile(config_file) cfg.MODEL_NAME = summary_model_name cfg.MODEL_PATH = model_path - cfg.API_BASE = f'http://127.0.0.1:{port}/v1' + cfg.API_BASE = f'http://127.0.0.1:{port}/v1' # noqa: E231 if cfg.models and len(cfg.models) > 0: model_cfg = cfg.models[0] model_cfg['abbr'] = f'{summary_model_name}-lmdeploy-api' - model_cfg['openai_api_base'] = f'http://127.0.0.1:{port}/v1' + model_cfg['openai_api_base'] = f'http://127.0.0.1:{port}/v1' # noqa: E231 model_cfg['path'] = model_path if 'backend' in model_cfg: model_cfg['backend'] = backend_type @@ -120,7 +121,7 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA if 'engine_config' in model_cfg and 'communicator' in model_cfg['engine_config']: model_cfg['engine_config']['communicator'] = communicator - temp_config_file = f'temp_{model_name.replace("/", "_")}_{os.getpid()}.py' + temp_config_file = f'temp_{model_name.replace('/', '_')}_{os.getpid()}.py' temp_config_path = os.path.join(log_path, temp_config_file) cfg.dump(temp_config_path) @@ -147,16 +148,16 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA f.write(f'Config file: {temp_config_file}\n') f.write(f'Backend: {backend_type}\n') f.write(f'TP Num: {tp_num}\n') - f.write(f'Command: {" ".join(cmd)}\n') + f.write(f'Command: {' '.join(cmd)}\n') f.write(f'Work directory: {work_dir}\n') - f.write(f'STDOUT:\n{stdout_output}\n') + f.write(f'STDOUT: \n{stdout_output}\n') if stderr_output: - f.write(f'STDERR:\n{stderr_output}\n') + f.write(f'STDERR: \n{stderr_output}\n') f.write(f'Return code: {result.returncode}\n') - print(f'STDOUT:\n{stdout_output}') + print(f'STDOUT: \n{stdout_output}') if stderr_output: - print(f'STDERR:\n{stderr_output}') + print(f'STDERR: \n{stderr_output}') print(f'Return code: {result.returncode}') evaluation_failed = False @@ -185,7 +186,7 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA if any(keyword in line for keyword in error_keywords): error_lines.append(line) if error_lines: - final_msg += f'\nLog errors: {" | ".join(error_lines[:3])}' + final_msg += f'\nLog errors: {' | '.join(error_lines[:3])}' write_to_summary(summary_model_name, tp_num, final_result, final_msg, worker_id, backend_type, work_dir) From bb37a846f7ff3635fd6d7e57589e93c4fb63585b Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Mon, 22 Sep 2025 13:19:30 +0800 Subject: [PATCH 16/28] update --- autotest/utils/evaluate_utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/autotest/utils/evaluate_utils.py b/autotest/utils/evaluate_utils.py index 82dccb92f6..1c68b11707 100644 --- a/autotest/utils/evaluate_utils.py +++ b/autotest/utils/evaluate_utils.py @@ -121,7 +121,8 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA if 'engine_config' in model_cfg and 'communicator' in model_cfg['engine_config']: model_cfg['engine_config']['communicator'] = communicator - temp_config_file = f'temp_{model_name.replace('/', '_')}_{os.getpid()}.py' + simple_model_name = model_name.replace('/', '_') + temp_config_file = f'temp_{simple_model_name}_{os.getpid()}.py' temp_config_path = os.path.join(log_path, temp_config_file) cfg.dump(temp_config_path) @@ -142,13 +143,14 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA 
f'{worker_id}_' f'{quant_policy}.log') log_file = os.path.join(log_path, log_filename) + cmd_command = ' '.join(cmd) with open(log_file, 'w', encoding='utf-8') as f: f.write(f'Model: {model_name}\n') f.write(f'Config file: {temp_config_file}\n') f.write(f'Backend: {backend_type}\n') f.write(f'TP Num: {tp_num}\n') - f.write(f'Command: {' '.join(cmd)}\n') + f.write(f'Command: {cmd_command}\n') f.write(f'Work directory: {work_dir}\n') f.write(f'STDOUT: \n{stdout_output}\n') if stderr_output: From 440a833239cf0853ca1b8e35539ee77cbad53132 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Mon, 22 Sep 2025 13:38:09 +0800 Subject: [PATCH 17/28] update --- autotest/utils/evaluate_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/autotest/utils/evaluate_utils.py b/autotest/utils/evaluate_utils.py index 1c68b11707..ef19b753fd 100644 --- a/autotest/utils/evaluate_utils.py +++ b/autotest/utils/evaluate_utils.py @@ -188,7 +188,8 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA if any(keyword in line for keyword in error_keywords): error_lines.append(line) if error_lines: - final_msg += f'\nLog errors: {' | '.join(error_lines[:3])}' + error_lines = ' | '.join(error_lines[:3]) + final_msg += f'\nLog errors: {error_lines}' write_to_summary(summary_model_name, tp_num, final_result, final_msg, worker_id, backend_type, work_dir) From db5ac5099a47cd9f66cb043aa01c8bd1a0c30074 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Mon, 22 Sep 2025 15:31:49 +0800 Subject: [PATCH 18/28] update --- .github/workflows/api_eval.yml | 1 + .github/workflows/api_eval_h800.yml | 1 + autotest/config-h800.yaml | 6 +++++- autotest/utils/evaluate_utils.py | 19 +++++++++++-------- 4 files changed, 18 insertions(+), 9 deletions(-) diff --git a/.github/workflows/api_eval.yml b/.github/workflows/api_eval.yml index 9e3e327f5a..0adc782557 100644 --- a/.github/workflows/api_eval.yml +++ b/.github/workflows/api_eval.yml @@ -108,6 +108,7 @@ jobs: - name: Install lmdeploy - dependency run: | python3 -m pip install -r requirements_cuda.txt + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | python3 -m pip install lmdeploy-*.whl --no-deps diff --git a/.github/workflows/api_eval_h800.yml b/.github/workflows/api_eval_h800.yml index 0d3518ffde..72fa84b97a 100644 --- a/.github/workflows/api_eval_h800.yml +++ b/.github/workflows/api_eval_h800.yml @@ -111,6 +111,7 @@ jobs: - name: Install lmdeploy - dependency run: | python3 -m pip install -r requirements_cuda.txt + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | python3 -m pip install lmdeploy-*.whl --no-deps diff --git a/autotest/config-h800.yaml b/autotest/config-h800.yaml index d2ed946ac2..f8a7cd2751 100644 --- a/autotest/config-h800.yaml +++ b/autotest/config-h800.yaml @@ -10,7 +10,7 @@ env_tag: h800 tp_config: Intern-S1: 8 Qwen3-235B-A22B: 8 - Qwen3-235B-A22B-FP8: 8 + Qwen3-235B-A22B-FP8: 4 Qwen3-30B-A3B: 2 Qwen3-32B: 2 gpt-oss-120b: 2 @@ -131,8 +131,12 @@ evaluate_model: - Qwen/Qwen3-4B-FP8 - Qwen/Qwen3-8B-FP8 - Qwen/Qwen3-14B-FP8 + - Qwen/Qwen3-32B + - Qwen/Qwen3-32B-FP8 - Qwen/Qwen3-30B-A3B - Qwen/Qwen3-30B-A3B-FP8 + - Qwen/Qwen3-235B-A22B + - Qwen/Qwen3-235B-A22B-FP8 - openai/gpt-oss-120b - openai/gpt-oss-20b - unsloth/gpt-oss-120b-BF16 diff --git a/autotest/utils/evaluate_utils.py b/autotest/utils/evaluate_utils.py index ef19b753fd..7a7ae5d204 100644 --- a/autotest/utils/evaluate_utils.py +++ b/autotest/utils/evaluate_utils.py @@ -46,23 +46,26 @@ def 
write_to_summary(model_name, tp_num, result, msg, worker_id, backend_type, w except Exception as e: print(f'Error reading metrics: {str(e)}') - mmlu_value = metrics.get('mmlu', '') - mmlu_value = metrics.get('mmlu-other', '') - gsm8k_value = metrics.get('gsm8k', '') + dataset_name = [] + dataset_metrics = [] + for key in metrics.keys(): + dataset_name.append(key) + dataset_metrics.append(metrics.get(key, '')) - summary_line = f'| {model_name} | {backend_type} | TP{tp_num} | {status} | {mmlu_value} | {gsm8k_value} |\n' + summary_dataset_name = ' | '.join(dataset_name) + summary_dataset_metrics = ' | '.join(dataset_metrics) summary_file = os.environ.get('GITHUB_STEP_SUMMARY', None) + summary_line = f'| {model_name} | {backend_type} | TP{tp_num} | {status} | {summary_dataset_metrics} |\n' if summary_file: write_header = not os.path.exists(summary_file) or os.path.getsize(summary_file) == 0 with open(summary_file, 'a') as f: if write_header: f.write('## Model Evaluation Results\n') - f.write('| Model | Backend | TP | Status | mmlu | mmlu-other | gsm8k |\n') - f.write('|-------|---------|----|--------|------|------------|-------|\n') + f.write(f'| Model | Backend | TP | Status | {summary_dataset_name} |\n') f.write(summary_line) else: - print(f'Summary: {model_name} | {backend_type} | TP{tp_num} | {status} | {mmlu_value} | {gsm8k_value}') + print(f'Summary: {model_name} | {backend_type} | TP{tp_num} | {status} | {summary_dataset_metrics}') def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFAULT_PORT): @@ -91,7 +94,7 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA print(f'Backend: {backend_type}') print(f'Config file: {config_file}') - log_path = config.get('log_path', '/nvme/qa_test_models/autotest_model/log') + log_path = config.get('eval_log_path', '/nvme/qa_test_models/autotest_model/log') os.makedirs(log_path, exist_ok=True) original_cwd = os.getcwd() From 7dc54bd3108258758de4e7582808782465746efa Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Mon, 22 Sep 2025 15:36:01 +0800 Subject: [PATCH 19/28] update --- autotest/evaluate/eval_config_chat.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/autotest/evaluate/eval_config_chat.py b/autotest/evaluate/eval_config_chat.py index 122f4e1a94..34bd4300c1 100644 --- a/autotest/evaluate/eval_config_chat.py +++ b/autotest/evaluate/eval_config_chat.py @@ -47,3 +47,7 @@ ], summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []), ) + +for item in datasets: + if 'max_out_len' in item['infer_cfg']['inferencer']: + del item['infer_cfg']['inferencer']['max_out_len'] From 8df42da302c6576b21e5fee03f38485e2b90c1d4 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Mon, 22 Sep 2025 18:45:21 +0800 Subject: [PATCH 20/28] update --- autotest/utils/evaluate_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/autotest/utils/evaluate_utils.py b/autotest/utils/evaluate_utils.py index 7a7ae5d204..481344f512 100644 --- a/autotest/utils/evaluate_utils.py +++ b/autotest/utils/evaluate_utils.py @@ -61,8 +61,10 @@ def write_to_summary(model_name, tp_num, result, msg, worker_id, backend_type, w write_header = not os.path.exists(summary_file) or os.path.getsize(summary_file) == 0 with open(summary_file, 'a') as f: if write_header: + dash_line = '-----|' * (len(metrics.keys())) f.write('## Model Evaluation Results\n') f.write(f'| Model | Backend | TP | Status | {summary_dataset_name} |\n') + f.write(f'|-------|---------|----|--------|{dash_line}\n') 
f.write(summary_line) else: print(f'Summary: {model_name} | {backend_type} | TP{tp_num} | {status} | {summary_dataset_metrics}') @@ -94,7 +96,7 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA print(f'Backend: {backend_type}') print(f'Config file: {config_file}') - log_path = config.get('eval_log_path', '/nvme/qa_test_models/autotest_model/log') + log_path = config.get('eval_log_path', '/nvme/qa_test_models/evaluation_report') + f'/{run_id}' os.makedirs(log_path, exist_ok=True) original_cwd = os.getcwd() From 506c0e37d836af0010a3ee98d21a47ffe4bfbefe Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Mon, 22 Sep 2025 18:58:04 +0800 Subject: [PATCH 21/28] update --- autotest/config.yaml | 12 ++++++------ autotest/utils/run_restful_chat.py | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/autotest/config.yaml b/autotest/config.yaml index 8973f21fd5..87c428fb18 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -36,8 +36,8 @@ tp_config: MiniCPM-V-2_6: 2 gemma-2-27b-it: 2 InternVL2-Llama3-76B-AWQ: 4 - gpt-oss-20b-bf16: 2 - gpt-oss-120b-bf16: 4 + gpt-oss-20b-BF16: 2 + gpt-oss-120b-BF16: 4 @@ -143,8 +143,8 @@ pytorch_chat_model: - Qwen/Qwen2.5-VL-32B-Instruct - Qwen/Qwen2-VL-2B-Instruct - Qwen/Qwen2-VL-7B-Instruct - - lmsys/gpt-oss-20b-bf16 - - lmsys/gpt-oss-120b-bf16 + - unsloth/gpt-oss-20b-BF16 + - unsloth/gpt-oss-120b-BF16 - mistralai/Mistral-7B-Instruct-v0.3 - mistralai/Mixtral-8x7B-Instruct-v0.1 - google/gemma-3-12b-it @@ -372,8 +372,8 @@ benchmark_model: - Qwen/Qwen2.5-7B-Instruct - Qwen/Qwen2.5-72B-Instruct - deepseek-ai/DeepSeek-V2-Lite-Chat - - lmsys/gpt-oss-20b-bf16 - - lmsys/gpt-oss-120b-bf16 + - unsloth/gpt-oss-20b-BF16 + - unsloth/gpt-oss-120b-BF16 evaluate_model: diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index f499ca5df5..876fd295e2 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -98,8 +98,8 @@ def start_restful_api(config, param, model, model_path, backend_type, worker_id) http_url = BASE_HTTP_URL + ':' + str(port) start_time = int(time()) start_timeout = 300 - if not _is_bf16_supported_by_device(): - start_timeout = 600 + if not _is_bf16_supported_by_device() or tp_num >= 4: + start_timeout = 720 sleep(5) for i in range(start_timeout): From d16448a670eb2c154c5ac83e22941930ba7a8d27 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Tue, 23 Sep 2025 15:10:09 +0800 Subject: [PATCH 22/28] update --- .github/workflows/evaluate.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml index 4395ee69b4..9079d1fa68 100644 --- a/.github/workflows/evaluate.yml +++ b/.github/workflows/evaluate.yml @@ -98,6 +98,7 @@ jobs: - /nvme/qa_test_models/evaluation-reports:/root/evaluation-reports - /nvme/qa_test_models:/root/models - /mnt/187:/mnt/187 + - /mnt/140:/mnt/140 - /mnt/bigdisk:/mnt/bigdisk - /mnt/shared:/mnt/shared - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro From aa6a8b88333f40592544e01a80c01ca6ec07ddc5 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Tue, 23 Sep 2025 21:45:13 +0800 Subject: [PATCH 23/28] Update evaluate_h800.yml --- .github/workflows/evaluate_h800.yml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/workflows/evaluate_h800.yml b/.github/workflows/evaluate_h800.yml index afea663963..07e66f9dc3 100644 --- a/.github/workflows/evaluate_h800.yml +++ 
b/.github/workflows/evaluate_h800.yml @@ -95,8 +95,6 @@ jobs: - /nvme/github-actions/pip-cache:/root/.cache/pip - /nvme/github-actions/packages:/root/packages - /nvme/github-actions/resources:/root/resources - - /nvme/qa_test_models/evaluation-reports:/root/evaluation-reports - - /nvme/qa_test_models:/root/models - /nvme/qa_test_models:/nvme/qa_test_models - /nvme1/qa_test_models:/nvme1/qa_test_models - /nvme2/share:/nvme2/share @@ -125,7 +123,7 @@ jobs: name: my-artifact-${{ github.run_id }}-py310 - name: Install lmdeploy - dependency run: | - python3 -m pip install -r /root/models/offline_pkg/requirements.txt + python3 -m pip install -r /nvme/qa_test_models/offline_pkg/requirements.txt - name: Install lmdeploy if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: | @@ -149,8 +147,8 @@ jobs: lmdeploy check_env - name: Setup paths for evaluation run: | - ln -s /root/opencompass-data ./data - python3 .github/scripts/action_tools.py create_model_links /root/models . + ln -s /nvme/qa_test_models/opencompass-data ./data + python3 .github/scripts/action_tools.py create_model_links /nvme/qa_test_models . - name: Evaluate base models if: matrix.evaluate_type == 'base' run: | From a245005cef181c0bf76728776ff1fddd267bc102 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Wed, 24 Sep 2025 19:48:30 +0800 Subject: [PATCH 24/28] update --- .github/scripts/eval_base_config.py | 2 +- autotest/config-h800.yaml | 6 ++++++ .../tools/pipeline/test_pipeline_chat_turbomind_mllm.py | 1 + autotest/utils/evaluate_utils.py | 6 +++--- autotest/utils/pipeline_chat.py | 8 ++++---- 5 files changed, 15 insertions(+), 8 deletions(-) diff --git a/.github/scripts/eval_base_config.py b/.github/scripts/eval_base_config.py index d8dc388c7d..347c032464 100644 --- a/.github/scripts/eval_base_config.py +++ b/.github/scripts/eval_base_config.py @@ -132,7 +132,7 @@ gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), max_seq_len=7168, max_out_len=1024, - batch_size=1024, + batch_size=64, run_cfg=dict(num_gpus=1), ) diff --git a/autotest/config-h800.yaml b/autotest/config-h800.yaml index f8a7cd2751..6e456fdbd9 100644 --- a/autotest/config-h800.yaml +++ b/autotest/config-h800.yaml @@ -89,6 +89,8 @@ turbomind_quatization: gptq: - empty no_kvint4: + - internlm/Intern-S1 + - internlm/Intern-S1-mini - Qwen/Qwen3-0.6B-FP8 - Qwen/Qwen3-1.7B-FP8 - Qwen/Qwen3-4B-FP8 @@ -100,6 +102,8 @@ turbomind_quatization: - Qwen/Qwen3-30B-A3B-FP8 - Qwen/Qwen3-32B - Qwen/Qwen3-32B-FP8 + - openai/gpt-oss-120b + - openai/gpt-oss-20b no_kvint8: - empty @@ -109,6 +113,8 @@ pytorch_quatization: w8a8: - empty no_kvint4: + - internlm/Intern-S1 + - internlm/Intern-S1-mini - Qwen/Qwen3-0.6B-FP8 - Qwen/Qwen3-1.7B-FP8 - Qwen/Qwen3-4B-FP8 diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py index da7e255a8e..3323041d1c 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py @@ -168,6 +168,7 @@ def test_pipeline_chat_fallback_backend_kvint8_tp1(config, model, worker_id): @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 +@pytest.mark.other @pytest.mark.parametrize('model', ['meta-llama/Llama-3.2-11B-Vision-Instruct']) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_fallback_backend_kvint8_tp2(config, model, communicator, worker_id): diff --git a/autotest/utils/evaluate_utils.py 
b/autotest/utils/evaluate_utils.py index 481344f512..79444e97f9 100644 --- a/autotest/utils/evaluate_utils.py +++ b/autotest/utils/evaluate_utils.py @@ -48,7 +48,7 @@ def write_to_summary(model_name, tp_num, result, msg, worker_id, backend_type, w dataset_name = [] dataset_metrics = [] - for key in metrics.keys(): + for key in sorted(metrics.keys()): dataset_name.append(key) dataset_metrics.append(metrics.get(key, '')) @@ -100,8 +100,8 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA os.makedirs(log_path, exist_ok=True) original_cwd = os.getcwd() - work_dir = os.path.join( - log_path, f"wk_{backend_type}_{model_name.replace('/', '_')}_{communicator}_{worker_id}_{quant_policy}") + work_dir = os.path.join(log_path, + f"wk_{backend_type}_{model_name.replace('/', '_')}_{communicator}_{quant_policy}") os.makedirs(work_dir, exist_ok=True) try: diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index 2a8349b572..5cd35ffd46 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -58,9 +58,9 @@ def run_pipeline_chat_test(config, text=True, encoding='utf-8', env=env, - timeout=600) + timeout=900) except subprocess.TimeoutExpired as e: - assert False, f'Test command timed out after 10 minutes: {e.cmd}' + assert False, f'Test command timed out after 15 minutes: {e.cmd}' output_text = response.stdout print(output_text) @@ -133,9 +133,9 @@ def run_pipeline_vl_chat_test(config, text=True, encoding='utf-8', env=env, - timeout=600) + timeout=900) except subprocess.TimeoutExpired as e: - assert False, f'Test command timed out after 10 minutes: {e.cmd}' + assert False, f'Test command timed out after 15 minutes: {e.cmd}' output_text = response.stdout print(output_text) From d72ef372388fe551a6ad86b943a8f66057235848 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Wed, 24 Sep 2025 23:26:07 +0800 Subject: [PATCH 25/28] Update eval_base_config.py --- .github/scripts/eval_base_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/eval_base_config.py b/.github/scripts/eval_base_config.py index 347c032464..25e374639d 100644 --- a/.github/scripts/eval_base_config.py +++ b/.github/scripts/eval_base_config.py @@ -132,7 +132,7 @@ gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), max_seq_len=7168, max_out_len=1024, - batch_size=64, + batch_size=32, run_cfg=dict(num_gpus=1), ) From 7cca1ff4aa6680dad54e46dd83cf782956cecd43 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Thu, 25 Sep 2025 14:24:59 +0800 Subject: [PATCH 26/28] update --- .github/workflows/api_eval.yml | 8 ++++---- .github/workflows/api_eval_h800.yml | 8 ++++---- autotest/conftest.py | 4 ---- autotest/utils/evaluate_utils.py | 21 +++++++++++++-------- 4 files changed, 21 insertions(+), 20 deletions(-) diff --git a/.github/workflows/api_eval.yml b/.github/workflows/api_eval.yml index 0adc782557..5e5d49be36 100644 --- a/.github/workflows/api_eval.yml +++ b/.github/workflows/api_eval.yml @@ -128,10 +128,10 @@ jobs: run: | overall_exit=0 ln -s /mnt/187/opencompass-data/data ./data - pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_1 and not pr_test and ${{matrix.backend}}" -n 8 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? - pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_2 and not pr_test and ${{matrix.backend}}" -n 4 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? 
- pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_4 and not pr_test and ${{matrix.backend}}" -n 2 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? - pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_8 and not pr_test and ${{matrix.backend}}" -n 1 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_1 and not pr_test and ${{matrix.backend}}" -n 8 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_2 and not pr_test and ${{matrix.backend}}" -n 4 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_4 and not pr_test and ${{matrix.backend}}" -n 2 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_8 and not pr_test and ${{matrix.backend}}" -n 1 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$? exit $overall_exit - name: Clear workspace if: always() diff --git a/.github/workflows/api_eval_h800.yml b/.github/workflows/api_eval_h800.yml index 72fa84b97a..906d3da1a6 100644 --- a/.github/workflows/api_eval_h800.yml +++ b/.github/workflows/api_eval_h800.yml @@ -132,10 +132,10 @@ jobs: run: | overall_exit=0 ln -s /mnt/187/opencompass-data/data ./data - pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_1 and not pr_test and ${{matrix.backend}}" -n 8 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? - pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_2 and not pr_test and ${{matrix.backend}}" -n 4 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? - pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_4 and not pr_test and ${{matrix.backend}}" -n 2 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? - pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_8 and not pr_test and ${{matrix.backend}}" -n 1 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_1 and not pr_test and ${{matrix.backend}}" -n 8 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_2 and not pr_test and ${{matrix.backend}}" -n 4 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_4 and not pr_test and ${{matrix.backend}}" -n 2 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_8 and not pr_test and ${{matrix.backend}}" -n 1 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$? 
exit $overall_exit - name: Clear workspace if: always() diff --git a/autotest/conftest.py b/autotest/conftest.py index 36392ac1c1..716e149130 100644 --- a/autotest/conftest.py +++ b/autotest/conftest.py @@ -26,10 +26,6 @@ def config(): env_config = yaml.load(f.read(), Loader=yaml.SafeLoader) config_copy = copy.deepcopy(env_config) - github_run_id = os.environ.get('GITHUB_RUN_ID', 'local_run') - if 'log_path' in config_copy: - config_copy['log_path'] = os.path.join(config_copy['log_path'], str(github_run_id)) - os.makedirs(config_copy['log_path'], exist_ok=True) return config_copy diff --git a/autotest/utils/evaluate_utils.py b/autotest/utils/evaluate_utils.py index 79444e97f9..527bb64994 100644 --- a/autotest/utils/evaluate_utils.py +++ b/autotest/utils/evaluate_utils.py @@ -8,7 +8,7 @@ DEFAULT_PORT = 23333 -def write_to_summary(model_name, tp_num, result, msg, worker_id, backend_type, work_dir=None): +def write_to_summary(model_name, tp_num, result, msg, worker_id, backend_type, communicator, work_dir=None): status = '✅ PASS' if result else '❌ FAIL' metrics = {} @@ -56,18 +56,20 @@ def write_to_summary(model_name, tp_num, result, msg, worker_id, backend_type, w summary_dataset_metrics = ' | '.join(dataset_metrics) summary_file = os.environ.get('GITHUB_STEP_SUMMARY', None) - summary_line = f'| {model_name} | {backend_type} | TP{tp_num} | {status} | {summary_dataset_metrics} |\n' + summary_line = f'| {model_name} | {backend_type} | {communicator} | TP{tp_num} | {status} | {summary_dataset_metrics} |\n' # noqa: E501 if summary_file: write_header = not os.path.exists(summary_file) or os.path.getsize(summary_file) == 0 with open(summary_file, 'a') as f: if write_header: dash_line = '-----|' * (len(metrics.keys())) f.write('## Model Evaluation Results\n') - f.write(f'| Model | Backend | TP | Status | {summary_dataset_name} |\n') - f.write(f'|-------|---------|----|--------|{dash_line}\n') + f.write(f'| Model | Backend | Communicator | TP | Status | {summary_dataset_name} |\n') + f.write(f'|-------|---------|--------------|----|--------|{dash_line}\n') f.write(summary_line) else: - print(f'Summary: {model_name} | {backend_type} | TP{tp_num} | {status} | {summary_dataset_metrics}') + print( + f'Summary: {model_name} | {backend_type} | {communicator} | TP{tp_num} | {status} | {summary_dataset_metrics}' # noqa: E501 + ) def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFAULT_PORT): @@ -196,7 +198,8 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA error_lines = ' | '.join(error_lines[:3]) final_msg += f'\nLog errors: {error_lines}' - write_to_summary(summary_model_name, tp_num, final_result, final_msg, worker_id, backend_type, work_dir) + write_to_summary(summary_model_name, tp_num, final_result, final_msg, worker_id, backend_type, communicator, + work_dir) return final_result, final_msg @@ -208,10 +211,12 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA timeout_msg = (f'Evaluation timed out for {model_name} ' f'after 7200 seconds') if work_dir: - write_to_summary(summary_model_name, tp_num, False, timeout_msg, worker_id, backend_type, work_dir) + write_to_summary(summary_model_name, tp_num, False, timeout_msg, worker_id, backend_type, communicator, + work_dir) return False, timeout_msg except Exception as e: error_msg = f'Error during evaluation for {model_name}: {str(e)}' if work_dir: - write_to_summary(summary_model_name, tp_num, False, error_msg, worker_id, backend_type, work_dir) + 
write_to_summary(summary_model_name, tp_num, False, error_msg, worker_id, backend_type, communicator, + work_dir) return False, error_msg From 54c9c14d210d58b2074511dbd66a79f7b65770a0 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Fri, 26 Sep 2025 09:01:50 +0800 Subject: [PATCH 27/28] update --- autotest/conftest.py | 5 +---- autotest/interface/pipeline/test_pipeline_func.py | 5 +++-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/autotest/conftest.py b/autotest/conftest.py index 716e149130..e4c23c13be 100644 --- a/autotest/conftest.py +++ b/autotest/conftest.py @@ -1,4 +1,3 @@ -import copy import os import pytest @@ -25,9 +24,7 @@ def config(): with open(config_path) as f: env_config = yaml.load(f.read(), Loader=yaml.SafeLoader) - config_copy = copy.deepcopy(env_config) - - return config_copy + return env_config @pytest.fixture(scope='session') diff --git a/autotest/interface/pipeline/test_pipeline_func.py b/autotest/interface/pipeline/test_pipeline_func.py index 42f6e95d86..ff97e8d7f1 100644 --- a/autotest/interface/pipeline/test_pipeline_func.py +++ b/autotest/interface/pipeline/test_pipeline_func.py @@ -327,8 +327,9 @@ def run_pipeline_testcase(config, model, backend, file_name): result = True for i in range(2): - result &= response[i].finish_reason == 'length' + result &= response[i].finish_reason == 'error' result &= response[i].generate_token_len == 0 + result &= response[i].text == 'internal error happened, status code ResponseType.INPUT_LENGTH_ERROR' save_pipeline_common_log(config, file_name, result, response) del pipe _clear_device_cache() @@ -422,7 +423,7 @@ def run_pipeline_testcase_bad_words(config, model, backend, file_name): # test bad_words gen_config = GenerationConfig(bad_words=[' and', '浦', ' to']) response = pipe(['Hi, pls intro yourself', 'Shanghai is'], gen_config=gen_config) - result = '蒲' in response[0].text or 'SenseTime' in response[0].text + result = True for i in range(2): result &= '浦' not in response[i].text and ' and' not in response[i].text and ' to ' not in response[i].text save_pipeline_common_log(config, file_name, result, response) From 055745ac856d19a85c30015b3ea71cb284661051 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Fri, 10 Oct 2025 10:02:05 +0800 Subject: [PATCH 28/28] update api outputfolder name --- .github/workflows/api_eval_h800.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/api_eval_h800.yml b/.github/workflows/api_eval_h800.yml index 906d3da1a6..37b7832fc5 100644 --- a/.github/workflows/api_eval_h800.yml +++ b/.github/workflows/api_eval_h800.yml @@ -23,7 +23,7 @@ on: env: HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai - OUTPUT_FOLDER: cuda11.8_dist_${{ github.run_id }} + OUTPUT_FOLDER: cuda12.8_dist_${{ github.run_id }} ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true REPORT_DIR: /nvme/qa_test_models/test-reports/${{ github.run_id }} COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy
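Note on the f-string rewrites in patches 16 and 17: expressions such as f'temp_{model_name.replace('/', '_')}_{os.getpid()}.py' reuse the enclosing quote character inside the replacement field, which does not parse before Python 3.12 (PEP 701). The CI image in this series targets Python 3.10 (the py310 wheel and the /opt/py3/lib/python3.10 coverage path), so both patches hoist the expression into a local variable instead. A minimal, standalone illustration follows; the model name is made up for the example.

model_name = 'Qwen/Qwen3-8B'

# f'temp_{model_name.replace('/', '_')}.py' is a SyntaxError on Python 3.10:
# the second single quote terminates the literal.  Hoisting keeps it portable:
simple_model_name = model_name.replace('/', '_')
temp_config_file = f'temp_{simple_model_name}.py'
print(temp_config_file)  # temp_Qwen_Qwen3-8B.py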
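Note on the GITHUB_STEP_SUMMARY changes in this series: patches 18, 20 and 26 incrementally replace the hard-coded mmlu/gsm8k columns in write_to_summary with a dynamic Markdown table (sorted dataset columns, a matching dash separator row, and a Communicator column). The sketch below shows that table-building logic in isolation; it is a simplified illustration, not the repository's function. append_summary_row, the sample model name and the metric values are invented for the example.

import os


def append_summary_row(model_name, backend, communicator, tp_num, passed, metrics):
    """Append one evaluation row to the GitHub step summary, writing the header on first use."""
    status = 'PASS' if passed else 'FAIL'
    # Sort dataset names so every worker emits columns in the same order.
    names = sorted(metrics.keys())
    values = [str(metrics[name]) for name in names]

    header = f"| Model | Backend | Communicator | TP | Status | {' | '.join(names)} |\n"
    divider = '|-------|---------|--------------|----|--------|' + '-----|' * len(names) + '\n'
    row = (f"| {model_name} | {backend} | {communicator} | TP{tp_num} | {status} | "
           f"{' | '.join(values)} |\n")

    summary_file = os.environ.get('GITHUB_STEP_SUMMARY')
    if not summary_file:
        # Local runs have no step summary; just log the row.
        print(row.strip())
        return
    first_write = not os.path.exists(summary_file) or os.path.getsize(summary_file) == 0
    with open(summary_file, 'a') as f:
        if first_write:
            f.write('## Model Evaluation Results\n')
            f.write(header)
            f.write(divider)
        f.write(row)


append_summary_row('Qwen/Qwen3-8B', 'turbomind', 'nccl', 1, True, {'gsm8k': 87.2, 'mmlu': 71.5})

As in the patched code, the header row is only written by whichever worker appends first, so the sketch assumes every worker reports the same set of datasets; if the sets differ across models, the columns and values can drift out of alignment.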