20 changes: 10 additions & 10 deletions .github/workflows/all-tests-metax.yml
@@ -6,13 +6,13 @@ on:
     paths:
       - '.github/workflows/*metax*.yml'
       - 'hardware/Metax_C550/**'
-      - 'tests/**'
+      - 'tests/Metax_C550/**'
   pull_request:
     branches: ["main"]
     paths:
       - '.github/workflows/*metax*.yml'
       - 'hardware/Metax_C550/**'
-      - 'tests/**'
+      - 'tests/Metax_C550/**'

 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}
@@ -36,15 +36,15 @@ jobs:
     strategy:
       matrix:
         task:
-          - deepseek_r1_distill_qwen-metax
+          - deepseek_r1_distill_qwen
           # TODO: Need update for [email protected]
-          - deepseek_r1_distill_qwen-flaggems-metax
-          - opi_llama3_1_instruct-metax
-          - opi_llama3_1_instruct-flaggems-metax
-          - qwen3-metax
-          - qwen3-flaggems-metax
-          - robobrain2-metax # TP1
-          - robobrain2-flaggems-metax # TP1
+          - deepseek_r1_distill_qwen-flaggems
+          - opi_llama3_1_instruct
+          - opi_llama3_1_instruct-flaggems
+          - qwen3
+          - qwen3-flaggems
+          - robobrain2 # TP1
+          - robobrain2-flaggems # TP1
     name: "inference-${{ matrix.task }}"
     with:
       task: ${{ matrix.task }}
2 changes: 1 addition & 1 deletion .github/workflows/functional-tests-metax.yml
@@ -146,7 +146,7 @@ jobs:
            echo "Unknown backend type: ${{ inputs.type }}"
            exit 1
          fi
-         cd /__w/FlagScale/FlagScale
+         cd /__w/FlagScale/FlagScale/tests/Metax_C550/
          tests/scripts/functional_tests/test_task.sh --type ${{ inputs.type }} --task ${{ inputs.task }}
          exit_code=$?
          echo "Exit code: $exit_code"
@@ -0,0 +1,51 @@
defaults:
  - _self_
  - inference: 7b-tp2

experiment:
  exp_name: deepseek_r1_distill_qwen-flaggems
  exp_dir: tests/functional_tests/test_cases/inference/deepseek_r1_distill_qwen-flaggems/results_test/7b-tp2
  task:
    type: inference
    backend: vllm
    entrypoint: flagscale/inference/inference_aquila.py
  runner:
    hostfile: null
  cmds:
    before_start:
      source /opt/conda/bin/activate flagscale-inference
  envs:
    HYDRA_FULL_ERROR: 1
    CUBLAS_WORKSPACE_CONFIG: ":4096:8"
    CUDNN_BENCHMARK: "false"
    CUDNN_DETERMINISTIC: "true"
    USE_FLAGGEMS: "true"
Comment on lines +20 to +22 (Contributor, severity: medium):

There is an inconsistent use of boolean values for environment variables in this and other YAML configuration files. Some are defined as strings (e.g., "true"), while others use YAML's native boolean type (e.g., true on line 42). For consistency and to prevent potential parsing issues, it's best to use native YAML booleans throughout.

    CUDNN_BENCHMARK: false
    CUDNN_DETERMINISTIC: true
    USE_FLAGGEMS: true

    GEMS_VENDOR: "metax"
    # Quantitative perception training related
    NVTE_APPLY_QK_LAYER_SCALING: 0
    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
    NVTE_FLASH_ATTN: 0
    NVTE_FUSED_ATTN: 0
    # GPU parallel control
    CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
    CUDA_DEVICE_MAX_CONNECTIONS: 1
    NCCL_ALGO: "Ring"
    NCCL_PROTOCOL: LLC
    # Basic randomness control
    SEED: 1234
    PYTHONHASHSEED: 0
    MKL_NUM_THREADS: 1
    OMP_NUM_THREADS: 1
    NUMEXPR_NUM_THREADS: 1
    SCIPY_RDRANDOM: 0
    TF_DETERMINISTIC_OPS: 1
    TORCH_CUDNN_DETERMINISM: true
    CUDA_LAUNCH_BLOCKING: 1
    NCCL_DEBUG: INFO
    MAGIC_CACHE: disabled

action: run

hydra:
  run:
    dir: ${experiment.exp_dir}/hydra
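To make the reviewer's point about quoted versus native booleans concrete, here is a minimal sketch (illustrative only, not part of this PR, and assuming PyYAML is available) showing that the two spellings load as different Python types, which matters once the values are converted back to strings for the process environment:

    import yaml  # PyYAML, assumed available purely for this illustration

    # The same setting spelled two ways, as in the envs blocks above.
    quoted = yaml.safe_load('CUDNN_DETERMINISTIC: "true"')["CUDNN_DETERMINISTIC"]
    unquoted = yaml.safe_load("CUDNN_DETERMINISTIC: true")["CUDNN_DETERMINISTIC"]

    print(type(quoted), quoted)      # <class 'str'> true
    print(type(unquoted), unquoted)  # <class 'bool'> True

    # Environment variables must be strings, so whatever exports these values
    # has to stringify them; a plain str() turns the boolean into 'True', not 'true'.
    print(str(quoted), str(unquoted))  # true True

Picking one spelling throughout, as the reviewer suggests, keeps that conversion step from silently producing mixed 'true'/'True' values.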
@@ -0,0 +1,20 @@
llm:
  model: /home/gitlab-runner/data/DeepSeek-R1-Distill-Qwen-7B
  tokenizer: /home/gitlab-runner/data/DeepSeek-R1-Distill-Qwen-7B
  trust_remote_code: true
  tensor_parallel_size: 2
  pipeline_parallel_size: 1
  gpu_memory_utilization: 0.9
  seed: 1234
  enforce_eager: true

generate:
  prompts: [
    "The president of the United States",
    "The capital of France",
  ]
  sampling:
    top_p: 0.01
    top_k: 1
    temperature: 0.0
    seed: 1234
@@ -0,0 +1,8 @@
**************************************************
output.prompt='The president of the United States'
output.outputs[0].text=', George W. Bush, is known for his leadership in the 19'
output.outputs[0].token_ids=[11, 9857, 467, 13, 14079, 11, 374, 3881, 369, 806, 11438, 304, 279, 220, 16, 24]
**************************************************
output.prompt='The capital of France'
output.outputs[0].text=', Paris, is known for its iconic landmarks such as the Eiffel Tower'
output.outputs[0].token_ids=[11, 12095, 11, 374, 3881, 369, 1181, 26277, 59924, 1741, 438, 279, 468, 3092, 301, 21938]
@@ -0,0 +1,49 @@
defaults:
  - _self_
  - inference: 7b-tp2

experiment:
  exp_name: deepseek_r1_distill_qwen
  exp_dir: tests/functional_tests/test_cases/inference/deepseek_r1_distill_qwen/results_test/7b-tp2
  task:
    type: inference
    backend: vllm
    entrypoint: flagscale/inference/inference_aquila.py
  runner:
    hostfile: null
  cmds:
    before_start:
      source /opt/conda/bin/activate flagscale-inference
  envs:
    HYDRA_FULL_ERROR: 1
    CUBLAS_WORKSPACE_CONFIG: ":4096:8"
    CUDNN_BENCHMARK: "false"
    CUDNN_DETERMINISTIC: "true"
    # Quantitative perception training related
    NVTE_APPLY_QK_LAYER_SCALING: 0
    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
    NVTE_FLASH_ATTN: 0
    NVTE_FUSED_ATTN: 0
    # GPU parallel control
    CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
    CUDA_DEVICE_MAX_CONNECTIONS: 1
    NCCL_ALGO: "Ring"
    NCCL_PROTOCOL: LLC
    # Basic randomness control
    SEED: 1234
    PYTHONHASHSEED: 0
    MKL_NUM_THREADS: 1
    OMP_NUM_THREADS: 1
    NUMEXPR_NUM_THREADS: 1
    SCIPY_RDRANDOM: 0
    TF_DETERMINISTIC_OPS: 1
    TORCH_CUDNN_DETERMINISM: true
    CUDA_LAUNCH_BLOCKING: 1
    NCCL_DEBUG: INFO
    MAGIC_CACHE: disabled

action: run

hydra:
  run:
    dir: ${experiment.exp_dir}/hydra
@@ -0,0 +1,20 @@
llm:
  model: /home/gitlab-runner/data/DeepSeek-R1-Distill-Qwen-7B
  tokenizer: /home/gitlab-runner/data/DeepSeek-R1-Distill-Qwen-7B
  trust_remote_code: true
  tensor_parallel_size: 2
  pipeline_parallel_size: 1
  gpu_memory_utilization: 0.9
  seed: 1234
  enforce_eager: true

generate:
  prompts: [
    "The president of the United States",
    "The capital of France",
  ]
  sampling:
    top_p: 0.1
    top_k: 1
    temperature: 0.0
    seed: 1234
@@ -0,0 +1,8 @@
**************************************************
output.prompt='The president of the United States'
output.outputs[0].text=', George W. Bush, is known for his leadership in the 19'
output.outputs[0].token_ids=[11, 9857, 467, 13, 14079, 11, 374, 3881, 369, 806, 11438, 304, 279, 220, 16, 24]
**************************************************
output.prompt='The capital of France'
output.outputs[0].text=', Paris, is known for its iconic landmarks such as the Eiffel Tower'
output.outputs[0].token_ids=[11, 12095, 11, 374, 3881, 369, 1181, 26277, 59924, 1741, 438, 279, 468, 3092, 301, 21938]
@@ -0,0 +1,36 @@
defaults:
  - _self_
  - inference: 8b-tp2

experiment:
  exp_name: opi_llama3_1_instruct-flaggems
  exp_dir: tests/functional_tests/test_cases/inference/opi_llama3_1_instruct-flaggems/results_test/8b-tp2
  task:
    type: inference
    backend: vllm
    entrypoint: flagscale/inference/inference_aquila.py
  runner:
    hostfile: null
  cmds:
    before_start:
      source /opt/conda/bin/activate flagscale-inference
  envs:
    HYDRA_FULL_ERROR: 1
    CUDA_VISIBLE_DEVICES: "0,1"
    CUDA_DEVICE_MAX_CONNECTIONS: 1
    CUBLAS_WORKSPACE_CONFIG: ":4096:8"
    NCCL_ALGO: "Ring"
    NVTE_APPLY_QK_LAYER_SCALING: 0
    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
    NVTE_FLASH_ATTN: 0
    NVTE_FUSED_ATTN: 0
    CUDNN_BENCHMARK: "false"
    CUDNN_DETERMINISTIC: "true"
    USE_FLAGGEMS: "true"
    GEMS_VENDOR: "metax"

action: run

hydra:
  run:
    dir: ${experiment.exp_dir}/hydra
@@ -0,0 +1,19 @@
llm:
  model: /home/gitlab-runner/data/OPI-Llama-3.1-8B-Instruct
  tokenizer: /home/gitlab-runner/data/OPI-Llama-3.1-8B-Instruct
  trust_remote_code: true
  tensor_parallel_size: 2
  pipeline_parallel_size: 1
  max_model_len: 2048
  gpu_memory_utilization: 0.9
  seed: 1234
  enforce_eager: true

generate:
  prompts: [
    "The president of the United States",
    "The capital of France",
  ]
Comment on lines +13 to +16 (Contributor, severity: medium):

The current formatting for the prompts list is unconventional and harms readability. Using a standard YAML block sequence format will make the configuration easier to read and maintain. This comment also applies to other similar files in this pull request, such as tests/Metax_C550/tests/functional_tests/test_cases/inference/opi_llama3_1_instruct/conf/inference/8b-tp2.yaml.

  prompts:
    - "The president of the United States"
    - "The capital of France"

  sampling:
    top_k: 1
    seed: 1234
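As a quick sanity check on the reviewer's suggestion, the short sketch below (illustrative only, assuming PyYAML; not code from this PR) confirms that the flow-style list used in these configs and the proposed block sequence parse to the same Python list, so switching styles is purely a readability change:

    import yaml  # PyYAML, assumed available purely for this illustration

    flow_style = (
        'prompts: [\n'
        '  "The president of the United States",\n'
        '  "The capital of France",\n'
        ']\n'
    )

    block_style = (
        'prompts:\n'
        '  - "The president of the United States"\n'
        '  - "The capital of France"\n'
    )

    # Both spellings yield the same data; only the on-disk layout differs.
    assert yaml.safe_load(flow_style) == yaml.safe_load(block_style)
    print(yaml.safe_load(block_style)["prompts"])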
@@ -0,0 +1,8 @@
**************************************************
output.prompt='The president of the United States'
output.outputs[0].text=' is responsible for the repression of the spread of nuclear arms and the maintenance of the'
output.outputs[0].token_ids=[374, 8647, 369, 279, 72498, 315, 279, 9041, 315, 11499, 11977, 323, 279, 13709, 315, 279]
**************************************************
output.prompt='The capital of France'
output.outputs[0].text=', which is also a major component of the globalized culture, is a hub'
output.outputs[0].token_ids=[11, 902, 374, 1101, 264, 3682, 3777, 315, 279, 3728, 1534, 7829, 11, 374, 264, 19240]
@@ -0,0 +1,35 @@
defaults:
  - _self_
  - inference: 8b-tp2

experiment:
  exp_name: opi_llama3_1_instruct
  exp_dir: tests/functional_tests/test_cases/inference/opi_llama3_1_instruct/results_test/8b-tp2
  task:
    type: inference
    backend: vllm
    entrypoint: flagscale/inference/inference_aquila.py
  runner:
    hostfile: null
  cmds:
    before_start:
      source /opt/conda/bin/activate flagscale-inference
  envs:
    HYDRA_FULL_ERROR: 1
    CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
    CUDA_DEVICE_MAX_CONNECTIONS: 1
    CUBLAS_WORKSPACE_CONFIG: ":4096:8"
    NCCL_ALGO: "Ring"
    NVTE_APPLY_QK_LAYER_SCALING: 0
    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
    NVTE_FLASH_ATTN: 0
    NVTE_FUSED_ATTN: 0
    CUDNN_BENCHMARK: "false"
    CUDNN_DETERMINISTIC: "true"
    GEMS_VENDOR: "metax"

action: run

hydra:
  run:
    dir: ${experiment.exp_dir}/hydra
@@ -0,0 +1,19 @@
llm:
  model: /home/gitlab-runner/data/OPI-Llama-3.1-8B-Instruct
  tokenizer: /home/gitlab-runner/data/OPI-Llama-3.1-8B-Instruct
  trust_remote_code: true
  tensor_parallel_size: 2
  pipeline_parallel_size: 1
  max_model_len: 2048
  gpu_memory_utilization: 0.9
  seed: 1234
  enforce_eager: true

generate:
  prompts: [
    "The president of the United States",
    "The capital of France",
  ]
  sampling:
    top_k: 1
    seed: 1234
@@ -0,0 +1,8 @@
**************************************************
output.prompt='The president of the United States'
output.outputs[0].text=' is responsible for the repression of the spread of nuclear arms and the maintenance of the'
output.outputs[0].token_ids=[374, 8647, 369, 279, 72498, 315, 279, 9041, 315, 11499, 11977, 323, 279, 13709, 315, 279]
**************************************************
output.prompt='The capital of France'
output.outputs[0].text=', which is also a major component of the globalized culture, is a hub'
output.outputs[0].token_ids=[11, 902, 374, 1101, 264, 3682, 3777, 315, 279, 3728, 1534, 7829, 11, 374, 264, 19240]
@@ -0,0 +1,50 @@
defaults:
  - _self_
  - inference: 4b-tp2

experiment:
  exp_name: qwen3-flaggems
  exp_dir: tests/functional_tests/test_cases/inference/qwen3-flaggems/results_test/4b-tp2
  task:
    type: inference
    backend: vllm
    entrypoint: flagscale/inference/inference_aquila.py
  runner:
    hostfile: null
  cmds:
    before_start:
      source /opt/conda/bin/activate flagscale-inference
  envs:
    HYDRA_FULL_ERROR: 1
    CUBLAS_WORKSPACE_CONFIG: ":4096:8"
    CUDNN_BENCHMARK: "false"
    CUDNN_DETERMINISTIC: "true"
    USE_FLAGGEMS: "true"
    GEMS_VENDOR: "metax"
    # Quantitative perception training related
    NVTE_APPLY_QK_LAYER_SCALING: 0
    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
    NVTE_FLASH_ATTN: 0
    NVTE_FUSED_ATTN: 0
    # GPU parallel control
    CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
    CUDA_DEVICE_MAX_CONNECTIONS: 1
    NCCL_ALGO: "Ring"
    NCCL_PROTOCOL: LLC
    # Basic randomness control
    SEED: 1234
    PYTHONHASHSEED: 0
    MKL_NUM_THREADS: 1
    OMP_NUM_THREADS: 1
    NUMEXPR_NUM_THREADS: 1
    SCIPY_RDRANDOM: 0
    TF_DETERMINISTIC_OPS: 1
    TORCH_CUDNN_DETERMINISM: true
    CUDA_LAUNCH_BLOCKING: 1
    MAGIC_CACHE: disabled

action: run

hydra:
  run:
    dir: ${experiment.exp_dir}/hydra