NVIDIA · shuyixiong · Dec 21, 2025 · Dec 19, 2025 · Dec 20, 2025
@@ -2101,9 +2101,9 @@ def load_expert_w3_w1_weight_scale_nvfp4(
                                             TensorParallelMode.COLUMN,
                                             device=device)
 
-        cast_w3_weight_scale = w3_weight_scale.view(
+        cast_w3_weight_scale = w3_weight_scale.contiguous().view(
             dst_w3_w1_weight_scale.dtype)
-        cast_w1_weight_scale = w1_weight_scale.view(
+        cast_w1_weight_scale = w1_weight_scale.contiguous().view(
             dst_w3_w1_weight_scale.dtype)
         cast_w31_weight_scale = torch.cat(
             [cast_w3_weight_scale, cast_w1_weight_scale], dim=0)
@@ -2165,8 +2165,10 @@ def load_expert_w3_w1_weight(self, module: torch.nn.Module,
                                             TensorParallelMode.COLUMN,
                                             device=device)
 
-        cast_w1_weight_shard = w1_weight_shard.view(dst_w3_w1_weight.dtype)
-        cast_w3_weight_shard = w3_weight_shard.view(dst_w3_w1_weight.dtype)
+        cast_w1_weight_shard = w1_weight_shard.contiguous().view(
+            dst_w3_w1_weight.dtype)
+        cast_w3_weight_shard = w3_weight_shard.contiguous().view(
+            dst_w3_w1_weight.dtype)
         cast_w31_weight_shard = torch.cat(
             [cast_w3_weight_shard, cast_w1_weight_shard], dim=0)
         cast_w31_weight_shard = self._maybe_padding_shape(

diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
@@ -476,8 +476,6 @@ accuracy/test_cli_flow.py::TestPhi3Small128kInstruct::test_auto_dtype SKIP (http
 accuracy/test_cli_flow.py::TestPhi3_5MiniInstruct::test_auto_dtype SKIP (https://nvbugs/5744293)
 unittest/_torch/auto_deploy/unit/singlegpu/models/test_llama4_vlm_patch.py::test_build_run_llama4_vlm SKIP (https://nvbugs/5747878)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True-moe_backend=TRTLLM] SKIP (https://nvbugs/5740377)
-accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4_chunked_prefill[tp4ep4-cuda_graph=True] SKIP (https://nvbugs/5702793)
-accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp4-cuda_graph=True] SKIP (https://nvbugs/5702793)
 disaggregated/test_auto_scaling.py::test_minimal_instances[etcd-round_robin] SKIP (https://nvbugs/5748564)
 unittest/llmapi/apps/test_disagg_serving_perf_metrics.py SKIP (https://nvbugs/5752516)
 unittest/_torch/attention/test_trtllm_flashinfer_symbol_collision.py::test_flashinfer_fused_moe_matches_torch_moe SKIP (https://nvbugs/5752521)