
Commit 442386d

infra: Add test stages for sm120 (#3533)
* Add test stages for sm120
* Update chip name and config name
* Split tests to gb202 and gb203
* Don't flash driver for rtx-5090
* Skip the failed cases
* Change the test stage names
* Reduce 5080 jobs and add back gpu list which doesn't support dynamic driver flashing
* Skip failed case on gb202
* Fix condition to dynamic driver flashing

Signed-off-by: EmmaQiaoCh <[email protected]>
Signed-off-by: qqiao <[email protected]>
1 parent: ba4131f

File tree: 3 files changed (+63, -1 lines)

jenkins/L0_Test.groovy (+4, -1)
@@ -279,7 +279,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
     targetCould = "kubernetes"

     // The following GPU types doesn't support dynamic driver flashing.
-    if (type == "b100-ts2" || type.contains("dgx-h100") || type.contains("dgx-h200") || type == "gh200" ) {
+    if (type.contains("dgx-h100") || type.contains("dgx-h200") || type in ["b100-ts2", "gh200", "rtx-5080", "rtx-5090"]) {
         selectors = """
             kubernetes.io/arch: ${arch}
             kubernetes.io/os: linux
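The removed equality chain and the new check are equivalent for the old GPU types; the rewrite folds the exact-match cases into a single list-membership test and adds the two new RTX types. A minimal sketch of the resulting behavior, using a hypothetical helper name (the real code inlines this check inside createKubernetesPodConfig):

    // Hypothetical helper illustrating the updated condition; not code from this commit.
    boolean skipsDynamicDriverFlashing(String type) {
        // DGX H100/H200 variants match by substring; the rest match by exact membership,
        // which now also covers the new RTX 5080/5090 runners.
        return type.contains("dgx-h100") || type.contains("dgx-h200") ||
               type in ["b100-ts2", "gh200", "rtx-5080", "rtx-5090"]
    }

    assert skipsDynamicDriverFlashing("rtx-5090")   // newly excluded from driver flashing
    assert skipsDynamicDriverFlashing("gh200")      // unchanged behavior
    assert !skipsDynamicDriverFlashing("a10")       // still uses dynamic driver flashing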
@@ -1219,6 +1219,9 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
         "B200_PCIe-PyTorch-2": ["b100-ts2", "l0_b200", 2, 2],
         "B200_PCIe-TensorRT-1": ["b100-ts2", "l0_b200", 1, 2],
         "B200_PCIe-TensorRT-2": ["b100-ts2", "l0_b200", 2, 2],
+        "RTX5090-PyTorch-1": ["rtx-5090", "l0_gb202", 1, 1],
+        "RTX5080-TensorRT-1": ["rtx-5080", "l0_gb203", 1, 2],
+        "RTX5080-TensorRT-2": ["rtx-5080", "l0_gb203", 2, 2],
         // Currently post-merge test stages only run tests with "stage: post_merge" mako
         // in the test-db. This behavior may change in the future.
         "A10-TensorRT-[Post-Merge]-1": ["a10", "l0_a10", 1, 2],
New file: test-db config defining l0_gb202 (+21)

@@ -0,0 +1,21 @@
+version: 0.0.1
+l0_gb202:
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 1
+        lte: 1
+    wildcards:
+      gpu:
+      - '*gb202*'
+      linux_distribution_name: ubuntu*
+    terms:
+      stage: pre_merge
+      backend: pytorch
+  tests:
+  # ------------- PyTorch tests ---------------
+  - unittest/_torch/modeling -k "modeling_mllama"
+  - unittest/_torch/modeling -k "modeling_out_of_tree"
+  # - unittest/_torch/modeling -k "modeling_qwen" # https://nvbugs/5234573
+  - test_e2e.py::test_ptp_quickstart_bert[VANILLA-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
+  - test_e2e.py::test_ptp_quickstart_bert[TRTLLM-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
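The condition block above gates when this list is picked up: exactly one GPU in the system, a GPU name matching '*gb202*', an Ubuntu distribution, and the pre_merge stage with the pytorch backend. A hedged sketch of that matching, assuming a simple glob-to-regex translation (the real test-db evaluator is a separate tool, not part of this commit):

    // Hypothetical matcher for the condition block; not the actual test-db evaluator.
    boolean matchesGlob(String value, String glob) {
        return value ==~ glob.replace("*", ".*")   // translate '*' globs into a full-match regex
    }

    // Placeholder machine context; the keys mirror the YAML fields above.
    def ctx = [system_gpu_count: 1,
               gpu: "rtx-5090-gb202",              // hypothetical GPU name
               linux_distribution_name: "ubuntu22.04"]

    boolean selected = ctx.system_gpu_count >= 1 && ctx.system_gpu_count <= 1 &&
                       matchesGlob(ctx.gpu, "*gb202*") &&
                       matchesGlob(ctx.linux_distribution_name, "ubuntu*")
    assert selected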
New file: test-db config defining l0_gb203 (+38)

@@ -0,0 +1,38 @@
+version: 0.0.1
+l0_gb203:
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 1
+        lte: 1
+    wildcards:
+      gpu:
+      - '*gb203*'
+      linux_distribution_name: ubuntu*
+    terms:
+      stage: pre_merge
+      backend: tensorrt
+  tests:
+  # ------------- TRT tests ---------------
+  - unittest/trt/attention/test_gpt_attention.py -k "partition0"
+  - unittest/trt/attention/test_gpt_attention.py -k "partition1"
+  - unittest/trt/attention/test_gpt_attention.py -k "partition2"
+  - unittest/trt/attention/test_gpt_attention.py -k "partition3"
+  - unittest/trt/attention/test_gpt_attention.py -k "xqa_generic"
+  # - unittest/trt/quantization # https://nvbugs/5234573
+  # - unittest/trt/functional # https://nvbugs/5234573
+  - examples/test_llama.py::test_llm_llama_v1_1gpu_kv_cache_reuse_with_prompt_table[llama-7b]
+  - examples/test_llama.py::test_llm_llama_v3_1_1node_single_gpu[llama-3.2-1b-disable_fp8]
+  - examples/test_llama.py::test_llm_llama_wo_1gpu_summary[llama-7b-int4-nb:1]
+  - examples/test_llama.py::test_llm_llama_wo_1gpu_summary[llama-7b-int8-nb:1]
+  - examples/test_llama.py::test_llm_llama_1gpu[llama-3.1-8b-instruct-hf-fp8-enable_fp8-float16-summarization-nb:1]
+  # - examples/test_qwen.py::test_llm_qwen1_5_7b_single_gpu_lora[qwen1.5_7b_chat-Qwen1.5-7B-Chat-750Mb-lora] # https://nvbugs/5234573
+  # - examples/test_qwen.py::test_llm_qwen_single_gpu_summary[qwen2.5_1.5b_instruct-enable_paged_kv_cache-enable_remove_input_padding-enable_weight_only-enable_fmha_fp32_acc] # https://nvbugs/5234573
+  - test_e2e.py::test_llmapi_quickstart
+  - test_e2e.py::test_llmapi_example_inference
+  - test_e2e.py::test_llmapi_example_inference_async
+  - test_e2e.py::test_llmapi_example_inference_async_streaming
+  - test_e2e.py::test_llmapi_example_logits_processor
+  - test_e2e.py::test_llmapi_example_multilora
+  - test_e2e.py::test_llmapi_example_guided_decoding
+  - test_e2e.py::test_llmapi_example_customize
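Each entry in the tests lists reads like a pytest target, optionally with a -k keyword filter (the four partitionN entries slice test_gpt_attention.py into keyword-based groups so they can run as separate chunks). A small sketch of turning one entry into a command line, offered as an assumption about how such entries are consumed rather than as the runner's actual behavior:

    // Hypothetical conversion of a test-db entry into a pytest invocation; not runner code.
    String toPytestCommand(String entry) {
        return "pytest " + entry   // entries already carry their own paths and -k filters
    }

    assert toPytestCommand('unittest/trt/attention/test_gpt_attention.py -k "partition0"') ==
           'pytest unittest/trt/attention/test_gpt_attention.py -k "partition0"'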
