@@ -1068,51 +1068,6 @@ def test_fp8_prequantized(self):
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)

-    def test_fp8_vswa_reuse(self):
-        # NOTE: Test with VSWA kv cache config.
-        kv_cache_config = KvCacheConfig(
-            enable_block_reuse=True,
-            max_attention_window=[1024, 1024, 1024, 1024, 1024, 65536],
-        )
-        prequantized_model_path = f"{llm_models_root()}/gemma/gemma-3-27b-it-fp8/"
-        with LLM(prequantized_model_path,
-                 kv_cache_config=kv_cache_config) as llm:
-            task = GSM8K(self.MODEL_NAME)
-            task.evaluate(llm)
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
-
-    @pytest.mark.parametrize("backend", ["xgrammar"])
-    def test_fp8_guided_decoding_vswa_reuse(self, backend: str, mocker):
-        mocker.patch.dict(os.environ, {"TRTLLM_XGUIDANCE_LENIENT": "1"})
-        kv_cache_config = KvCacheConfig(
-            enable_block_reuse=True,
-            max_attention_window=[1024, 1024, 1024, 1024, 1024, 65536],
-        )
-        cuda_graph_config = CudaGraphConfig(enable_padding=True)
-        prequantized_model_path = f"{llm_models_root()}/gemma/gemma-3-27b-it-fp8/"
-        llm = LLM(prequantized_model_path,
-                  guided_decoding_backend=backend,
-                  kv_cache_config=kv_cache_config,
-                  cuda_graph_config=cuda_graph_config)
-        with llm:
-            task = JsonModeEval(self.MODEL_NAME)
-            task.evaluate(llm)
-
-    def test_nvfp4_vswa_reuse(self):
-        # NOTE: Test with VSWA kv cache config.
-        kv_cache_config = KvCacheConfig(
-            enable_block_reuse=True,
-            max_attention_window=[1024, 1024, 1024, 1024, 1024, 65536],
-        )
-        prequantized_model_path = f"{llm_models_root()}/gemma/gemma-3-27b-it-FP4/"
-        with LLM(prequantized_model_path,
-                 kv_cache_config=kv_cache_config) as llm:
-            task = GSM8K(self.MODEL_NAME)
-            task.evaluate(llm)
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
-

 class TestGemma3_1BInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "google/gemma-3-1b-it"
@@ -1168,12 +1123,13 @@ def test_fp8_vswa_reuse(self):
     @pytest.mark.parametrize("backend", ["xgrammar"])
     def test_fp8_guided_decoding_vswa_reuse(self, backend: str, mocker):
         mocker.patch.dict(os.environ, {"TRTLLM_XGUIDANCE_LENIENT": "1"})
+        prequantized_model_path = f"{llm_models_root()}/gemma/gemma-3-1b-it-fp8/"
         kv_cache_config = KvCacheConfig(
             enable_block_reuse=True,
             max_attention_window=[512, 512, 512, 512, 512, 32768],
         )
         cuda_graph_config = CudaGraphConfig(enable_padding=True)
-        llm = LLM(self.MODEL_PATH,
+        llm = LLM(prequantized_model_path,
                   guided_decoding_backend=backend,
                   kv_cache_config=kv_cache_config,
                   cuda_graph_config=cuda_graph_config)