Commit 5b157c2

adjust

Signed-off-by: Ivy Zhang <[email protected]>
Parent: 391a36d

File tree: 9 files changed (+22, −59 lines)

tests/integration/defs/accuracy/references/gsm8k.yaml

Lines changed: 1 addition & 1 deletion
@@ -192,7 +192,7 @@ google/gemma-3-1b-it:
   - accuracy: 25.52 # score getting from lm-eval with HF implementation
   - quant_algo: FP8
     kv_cache_quant_algo: FP8
-    accuracy: 25.52
+    accuracy: 23.96
 google/gemma-3-27b-it:
   - accuracy: 91.66
   - quant_algo: FP8

tests/integration/defs/accuracy/references/json_mode_eval.yaml

Lines changed: 5 additions & 2 deletions
@@ -11,8 +11,11 @@ deepseek-ai/DeepSeek-V3-Lite:
 google/gemma-3-1b-it:
   - quant_algo: FP8
     kv_cache_quant_algo: FP8
-    accuracy: 63.00
+    accuracy: 61.00
 GPT-OSS/120B-MXFP4:
+  - quant_algo: W4A16_MXFP4
+    spec_dec_algo: Eagle
+    accuracy: 62.00
   - quant_algo: W4A8_MXFP4_MXFP8
     spec_dec_algo: Eagle
-    accuracy: 56.00
+    accuracy: 62.00
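Both reference files follow the same shape: each model maps to a list of entries, and an entry's non-accuracy fields (quant_algo, kv_cache_quant_algo, spec_dec_algo) describe the configuration the expected score was measured under. A minimal sketch of how such an entry could be looked up; the lookup_accuracy helper is hypothetical, not the harness's real API, and the YAML is taken from the diff above:

import yaml  # PyYAML

def lookup_accuracy(references: dict, model: str, **config) -> float:
    # Hypothetical helper: return the reference accuracy whose
    # configuration fields all match the requested config.
    for entry in references[model]:
        fields = {k: v for k, v in entry.items() if k != "accuracy"}
        if all(config.get(k) == v for k, v in fields.items()):
            return entry["accuracy"]
    raise KeyError(f"no reference for {model} with {config}")

refs = yaml.safe_load("""
GPT-OSS/120B-MXFP4:
  - quant_algo: W4A16_MXFP4
    spec_dec_algo: Eagle
    accuracy: 62.00
  - quant_algo: W4A8_MXFP4_MXFP8
    spec_dec_algo: Eagle
    accuracy: 62.00
""")
assert lookup_accuracy(refs, "GPT-OSS/120B-MXFP4",
                       quant_algo="W4A16_MXFP4",
                       spec_dec_algo="Eagle") == 62.00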

tests/integration/defs/accuracy/test_disaggregated_serving.py

Lines changed: 1 addition & 1 deletion
@@ -1014,7 +1014,7 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
 
     @pytest.mark.skip_less_device(8)
     @pytest.mark.parametrize("block_reuse", [False, True])
-    def test_mxfp4(self, block_reuse, mocker):
+    def test_auto_dtype(self, block_reuse, mocker):
        mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
        mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
                          {"scores_filter": "exact_match,flexible-extract"})

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 2 additions & 46 deletions
@@ -1068,51 +1068,6 @@ def test_fp8_prequantized(self):
         task = MMLU(self.MODEL_NAME)
         task.evaluate(llm)
 
-    def test_fp8_vswa_reuse(self):
-        # NOTE: Test with VSWA kv cache config.
-        kv_cache_config = KvCacheConfig(
-            enable_block_reuse=True,
-            max_attention_window=[1024, 1024, 1024, 1024, 1024, 65536],
-        )
-        prequantized_model_path = f"{llm_models_root()}/gemma/gemma-3-27b-it-fp8/"
-        with LLM(prequantized_model_path,
-                 kv_cache_config=kv_cache_config) as llm:
-            task = GSM8K(self.MODEL_NAME)
-            task.evaluate(llm)
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
-
-    @pytest.mark.parametrize("backend", ["xgrammar"])
-    def test_fp8_guided_decoding_vswa_reuse(self, backend: str, mocker):
-        mocker.patch.dict(os.environ, {"TRTLLM_XGUIDANCE_LENIENT": "1"})
-        kv_cache_config = KvCacheConfig(
-            enable_block_reuse=True,
-            max_attention_window=[1024, 1024, 1024, 1024, 1024, 65536],
-        )
-        cuda_graph_config = CudaGraphConfig(enable_padding=True)
-        prequantized_model_path = f"{llm_models_root()}/gemma/gemma-3-27b-it-fp8/"
-        llm = LLM(prequantized_model_path,
-                  guided_decoding_backend=backend,
-                  kv_cache_config=kv_cache_config,
-                  cuda_graph_config=cuda_graph_config)
-        with llm:
-            task = JsonModeEval(self.MODEL_NAME)
-            task.evaluate(llm)
-
-    def test_nvfp4_vswa_reuse(self):
-        # NOTE: Test with VSWA kv cache config.
-        kv_cache_config = KvCacheConfig(
-            enable_block_reuse=True,
-            max_attention_window=[1024, 1024, 1024, 1024, 1024, 65536],
-        )
-        prequantized_model_path = f"{llm_models_root()}/gemma/gemma-3-27b-it-FP4/"
-        with LLM(prequantized_model_path,
-                 kv_cache_config=kv_cache_config) as llm:
-            task = GSM8K(self.MODEL_NAME)
-            task.evaluate(llm)
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
-
 
 class TestGemma3_1BInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "google/gemma-3-1b-it"
@@ -1168,12 +1123,13 @@ def test_fp8_vswa_reuse(self):
     @pytest.mark.parametrize("backend", ["xgrammar"])
     def test_fp8_guided_decoding_vswa_reuse(self, backend: str, mocker):
         mocker.patch.dict(os.environ, {"TRTLLM_XGUIDANCE_LENIENT": "1"})
+        prequantized_model_path = f"{llm_models_root()}/gemma/gemma-3-1b-it-fp8/"
         kv_cache_config = KvCacheConfig(
             enable_block_reuse=True,
             max_attention_window=[512, 512, 512, 512, 512, 32768],
         )
         cuda_graph_config = CudaGraphConfig(enable_padding=True)
-        llm = LLM(self.MODEL_PATH,
+        llm = LLM(prequantized_model_path,
                   guided_decoding_backend=backend,
                   kv_cache_config=kv_cache_config,
                   cuda_graph_config=cuda_graph_config)
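Both Gemma 3 variants keep a six-element max_attention_window: five short sliding windows followed by one long window, which lines up with Gemma 3's interleaving of five local-attention layers per global-attention layer. A sketch of the per-layer expansion, under the assumption (hedged here) that a window list shorter than the layer count is tiled cyclically across layers; the layer count of 26 is illustrative:

from itertools import cycle, islice

def per_layer_windows(pattern: list[int], num_layers: int) -> list[int]:
    # Assumption: the pattern repeats cyclically when it is shorter
    # than the number of layers, so [512]*5 + [32768] yields five
    # sliding-window layers, one full-window layer, then repeats.
    return list(islice(cycle(pattern), num_layers))

windows = per_layer_windows([512, 512, 512, 512, 512, 32768], 26)
assert windows[:6] == [512, 512, 512, 512, 512, 32768]
assert windows[6] == 512  # the pattern restarts after the long window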

tests/integration/test_lists/qa/llm_function_core.txt

Lines changed: 3 additions & 3 deletions
@@ -426,7 +426,7 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[
 accuracy/test_llm_api_pytorch.py::TestMistral7B::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_vswa_reuse
-accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_guided_decoding_vswa_reuse
+accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_guided_decoding_vswa_reuse[xgrammar]
 accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_fp8_prequantized
 accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype
@@ -534,8 +534,8 @@ accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding
 accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix
 accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[False]
 accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[True]
-accuracy/test_disaggregated_serving.py::TestGPTOSS::test_mxfp4[True]
-accuracy/test_disaggregated_serving.py::TestGPTOSS::test_mxfp4[False]
+accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[True]
+accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[False]
 accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[fp8-latency]
 accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[mxfp8-latency]
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-CUTLASS]
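The [xgrammar] suffix added in the first hunk is the parametrization ID from @pytest.mark.parametrize("backend", ["xgrammar"]), so the list entry now names one exact test node. Selecting that node is plain pytest; a minimal sketch, assuming it runs from the repo's tests/integration/defs directory:

import pytest

# Run a single parametrized variant by its full node ID, spelled
# exactly as in the QA test list above. Illustrative invocation only.
exit_code = pytest.main([
    "accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::"
    "test_fp8_guided_decoding_vswa_reuse[xgrammar]",
])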

tests/integration/test_lists/qa/llm_function_nim.txt

Lines changed: 2 additions & 2 deletions
@@ -391,8 +391,8 @@ accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_VL_7B::test_auto_dtype
 accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_5_VL_7B::test_auto_dtype
 accuracy/test_llm_api_pytorch_multimodal.py::TestLlava_V1_6_Mistral_7B::test_auto_dtype
 accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_auto_dtype
-accuracy/test_disaggregated_serving.py::TestGPTOSS::test_mxfp4[True]
-accuracy/test_disaggregated_serving.py::TestGPTOSS::test_mxfp4[False]
+accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[True]
+accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[False]
 test_e2e.py::test_openai_chat_harmony
 test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[Nemotron-Ultra-253B-nemotron-nas/Llama-3_1-Nemotron-Ultra-253B-v1-8]
 test_e2e.py::test_ptp_quickstart_advanced[Nemotron4_4B-BF16-nemotron/Minitron-4B-Base]

tests/integration/test_lists/qa/llm_function_rtx6k.txt

Lines changed: 4 additions & 0 deletions
@@ -43,6 +43,10 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model-no_overlap_scheduler]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-overlap_scheduler]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-no_overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[one_model]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[two_model]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[one_model]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[two_model]
 test_e2e.py::test_ptp_quickstart_advanced_mixed_precision
 test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B]
 test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8]

tests/integration/test_lists/test-db/l0_dgx_h200.yml

Lines changed: 2 additions & 2 deletions
@@ -32,8 +32,8 @@ l0_dgx_h200:
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=2-ctx_pp=4]
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=1-ctx_pp=4]
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=4]
-  - accuracy/test_disaggregated_serving.py::TestGPTOSS::test_mxfp4[False]
-  - accuracy/test_disaggregated_serving.py::TestGPTOSS::test_mxfp4[True]
+  - accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[False]
+  - accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[True]
   - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_bf16_4gpu[tp4ep4_cudagraph_overlap]
   - disaggregated/test_disaggregated.py::test_disaggregated_ctxtp2pp2_gentp2pp2[TinyLlama-1.1B-Chat-v1.0]
   - disaggregated/test_disaggregated.py::test_disaggregated_ctxpp4_genpp4[TinyLlama-1.1B-Chat-v1.0]

tests/integration/test_lists/waives.txt

Lines changed: 2 additions & 2 deletions
@@ -326,8 +326,8 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_fp8_prequan
 accuracy/test_llm_api_pytorch.py::TestNemotronH_47B_Base::test_auto_dtype[tp8ep4-cuda_graph=True] SKIP (https://nvbugs/5640697)
 accuracy/test_llm_api_pytorch.py::TestNemotronH_47B_Base::test_reasoning_fp8_prequantized[tp8ep8-cuda_graph=True] SKIP (https://nvbugs/5640697)
 accuracy/test_llm_api_pytorch.py::TestQwQ_32B::test_auto_dtype_tp4 SKIP (https://nvbugs/5640697)
-accuracy/test_disaggregated_serving.py::TestGPTOSS::test_mxfp4[True] SKIP (https://nvbugs/5644632)
-accuracy/test_disaggregated_serving.py::TestGPTOSS::test_mxfp4[False] SKIP (https://nvbugs/5644632)
+accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[True] SKIP (https://nvbugs/5644632)
+accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[False] SKIP (https://nvbugs/5644632)
 test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True] SKIP (https://nvbugs/5648560)
 test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-False] SKIP (https://nvbugs/5648560)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen_adp_lmtp] SKIP (https://nvbugs/5629136)
