
Commit a3a9513

Merge branch 'main' into dev/jberchtold/fix-layernorm-distributed-tests
2 parents: 629dba5 + b6b3abc


54 files changed: +1551 −824 lines

benchmarks/linear/benchmark_grouped_linear.py

Lines changed: 1 addition & 1 deletion

@@ -247,7 +247,7 @@ def run_benchmark_linear(mkns, recipe_name, use_bias, num_gemms=4):
     num_gemms_list = [8]
 
     if args.profile:
-        mkns = [(4096, 4096, 4096)]
+        mkns = [(4096 * 8, 4096, 4096)]
         # in profile mode, only run one recipe specified in args.recipe
         assert args.recipe != "all", (
             "In profile mode, only one recipe can be specified, please specify the recipe as"

qa/L0_pytorch_debug_unittest/test.sh

Lines changed: 8 additions & 0 deletions

@@ -14,11 +14,19 @@
 
 FAIL=0
 
+# It is not installed as a requirement,
+# because it is not available on PyPI.
+pip uninstall -y nvdlfw-inspect
+pip install git+https://github.com/NVIDIA/nvidia-dlfw-inspect.git
+
 pip install pytest==8.2.1
 pytest -v -s $TE_PATH/tests/pytorch/debug/test_sanity.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || FAIL=1
 pytest -v -s $TE_PATH/tests/pytorch/debug/test_config.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || FAIL=1
 pytest -v -s $TE_PATH/tests/pytorch/debug/test_numerics.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || FAIL=1
 NVTE_TORCH_COMPILE=0 pytest -v -s $TE_PATH/tests/pytorch/debug/test_api_features.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR || FAIL=1
+pytest -v -s $TE_PATH/tests/pytorch/debug/test_log.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR || FAIL=1
+pytest -v -s $TE_PATH/tests/pytorch/debug/test_perf.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR || FAIL=1
+
 
 # standard sanity and numerics tests with initialized debug
 NVTE_TEST_NVINSPECT_ENABLED=1 NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 pytest -v -s $TE_PATH/tests/pytorch/test_sanity.py || FAIL=1

qa/L1_pytorch_distributed_unittest/test.sh

Lines changed: 5 additions & 0 deletions

@@ -21,6 +21,11 @@ FAILED_CASES=""
 mkdir -p "$XML_LOG_DIR"
 
 
+# It is not installed as a requirement,
+# because it is not available on PyPI.
+pip uninstall -y nvdlfw-inspect
+pip install git+https://github.com/NVIDIA/nvidia-dlfw-inspect.git
+
 pip3 install pytest==8.2.1 || error_exit "Failed to install pytest"
 
 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_sanity.xml $TE_PATH/tests/pytorch/distributed/test_sanity.py || test_fail "test_sanity.py"

tests/jax/conftest.py

Lines changed: 53 additions & 0 deletions

@@ -5,6 +5,8 @@
 import os
 import jax
 import pytest
+from collections import defaultdict
+import time
 
 
 import transformer_engine.jax

@@ -32,3 +34,54 @@ def enable_fused_attn_after_hopper():
     yield
     if "NVTE_FUSED_ATTN" in os.environ:
         del os.environ["NVTE_FUSED_ATTN"]
+
+
+class TestTimingPlugin:
+    """
+    Plugin to measure test execution time. Enable test timing by setting NVTE_JAX_TEST_TIMING=1
+    in the environment.
+    """
+
+    def __init__(self):
+        self.test_timings = defaultdict(list)
+
+    @pytest.hookimpl(tryfirst=True)
+    def pytest_runtest_setup(self, item):
+        item._timing_start = time.time()
+
+    @pytest.hookimpl(trylast=True)
+    def pytest_runtest_teardown(self, item, nextitem):
+        if hasattr(item, "_timing_start"):
+            duration = time.time() - item._timing_start
+
+            # Extract base function name without parameters
+            test_name = item.name
+            if "[" in test_name:
+                base_name = test_name.split("[")[0]
+            else:
+                base_name = test_name
+
+            self.test_timings[base_name].append(duration)
+
+    def pytest_sessionfinish(self, session, exitstatus):
+        print("\n" + "=" * 80)
+        print("TEST RUNTIME SUMMARY (grouped by function)")
+        print("=" * 80)
+
+        total_overall = 0
+        for test_name, durations in sorted(self.test_timings.items()):
+            total_time = sum(durations)
+            count = len(durations)
+            avg_time = total_time / count if count > 0 else 0
+            total_overall += total_time
+
+            print(f"{test_name:<60} | {count:3}x | {total_time:7.2f}s | avg: {avg_time:6.2f}s")
+
+        print("=" * 80)
+        print(f"{'TOTAL RUNTIME':<60} | {'':>3} | {total_overall:7.2f}s |")
+        print("=" * 80)
+
+
+def pytest_configure(config):
+    if os.getenv("NVTE_JAX_TEST_TIMING", "0") == "1":
+        config.pluginmanager.register(TestTimingPlugin(), "test_timing")
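
For reference, a minimal sketch of driving the new timing summary from a local Python session; the test path below is only illustrative. The plugin registers itself only when NVTE_JAX_TEST_TIMING=1 is in the environment before pytest configures.

    # Hypothetical local run; the target path is illustrative.
    import os
    import pytest

    os.environ["NVTE_JAX_TEST_TIMING"] = "1"  # picked up by pytest_configure above
    pytest.main(["-v", "tests/jax/test_custom_call_compute.py"])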

tests/jax/test_custom_call_compute.py

Lines changed: 13 additions & 24 deletions

@@ -673,10 +673,6 @@ def test_grouped_qdq(
             n_groups=n_groups,
         )
 
-        # grouped_quantize does not work with cudaGraph yet, so the jitting will breaks
-        # To test it locally, export XLA_FLAGS="--xla_gpu_enable_command_buffer= $XLA_FLAGS" to
-        # disable cudaGraph, then use the following jitted function
-
         scaled_tensor = tex.grouped_quantize(
             x, group_sizes=group_sizes, flatten_axis=flatten_axis, quantizer=grouped_quantizer
         )

@@ -1312,16 +1308,14 @@ def test_grouped_gemm_fp16(self, dtype, input_shape, layout):
         )
         ref_out = self._ref_grouped_dense(lhs, rhs, None, group_sizes, contracting_dims)
 
-        # grouped_gemm does not work with cudaGraph yet, so the jitting will breaks
-        # To test it locally, export XLA_FLAGS="--xla_gpu_enable_command_buffer= $XLA_FLAGS" to
-        # disable cudaGraph, then use the following jitted function
-
         # jitting grouped_gemm
-        # prim_out = jax.jit(tex.grouped_gemm, static_argnames=("contracting_dims",))(
-        #     lhs, rhs, group_sizes, contracting_dims,
-        # )
+        prim_out = jax.jit(tex.grouped_gemm, static_argnames=("contracting_dims",))(
+            lhs,
+            rhs,
+            group_sizes,
+            contracting_dims,
+        )
 
-        prim_out = tex.grouped_gemm(lhs, rhs, group_sizes, contracting_dims)
         self._assert_grouped_gemm_output(prim_out, group_sizes, ref_out, dtype)
 
     @pytest.mark.skipif(not is_fp8_supported, reason=fp8_unsupported_reason)

@@ -1350,12 +1344,7 @@ def test_grouped_gemm_fp8(self, fwd_bwd_dtype, scaling_mode, input_shape, layout
         )
         ref_out = self._ref_grouped_dense(lhs, rhs, None, group_sizes, contracting_dims)
 
-        # jitting grouped_gemm
-        # prim_out = jax.jit(tex.grouped_gemm, static_argnames=('contracting_dims',))(
-        #     lhs, rhs, group_sizes, contracting_dims, quantizer_set=quantizer_set
-        # )
-
-        prim_out = tex.grouped_gemm(
+        prim_out = jax.jit(tex.grouped_gemm, static_argnames=("contracting_dims",))(
             lhs, rhs, group_sizes, contracting_dims, quantizer_set=quantizer_set
         )
 

@@ -1391,9 +1380,9 @@ def test_grouped_dense_grad_fp16(self, dtype, input_shape):
 
         value_n_grad_ref_func = value_and_grad(self._ref_sum_grouped_dense, (0, 1, 2))
         # jitting the grouped_dense
-        # value_n_grad_prim_func = jit(value_and_grad(self._primitive_sum_grouped_dense, (0, 1, 2)),
-        #     static_argnums=(4,))
-        value_n_grad_prim_func = value_and_grad(self._primitive_sum_grouped_dense, (0, 1, 2))
+        value_n_grad_prim_func = jit(
+            value_and_grad(self._primitive_sum_grouped_dense, (0, 1, 2)), static_argnums=(4,)
+        )
 
         ref_out_sum, (ref_dgrad, ref_wgrad, ref_dbias) = value_n_grad_ref_func(
             x, kernel, bias, group_sizes, contracting_dims

@@ -1432,9 +1421,9 @@ def test_grouped_dense_grad_fp8(self, fwd_bwd_dtype, scaling_mode, input_shape):
         value_n_grad_ref_func = value_and_grad(self._ref_sum_grouped_dense, (0, 1, 2))
 
         # jitting the grouped_dense
-        # value_n_grad_prim_func = jit(value_and_grad(self._primitive_sum_grouped_dense, (0, 1, 2)),
-        #     static_argnums=(4,))
-        value_n_grad_prim_func = value_and_grad(self._primitive_sum_grouped_dense, (0, 1, 2))
+        value_n_grad_prim_func = jit(
+            value_and_grad(self._primitive_sum_grouped_dense, (0, 1, 2)), static_argnums=(4,)
+        )
 
         ref_out_sum, (ref_dgrad, ref_wgrad, ref_dbias) = value_n_grad_ref_func(
             x,
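
As background for the re-enabled jitted calls above, here is a minimal, self-contained sketch of the same jax.jit pattern; a generic dot_general stands in for tex.grouped_gemm, and the shapes are made up. The point is that contracting_dims changes the traced computation, so it is passed via static_argnames (or static_argnums in the positional variant) rather than traced.

    # Sketch only: generic contraction in place of tex.grouped_gemm.
    import jax
    import jax.numpy as jnp

    def contract(lhs, rhs, contracting_dims=((1,), (0,))):
        # dimension_numbers = ((lhs_contract, rhs_contract), (lhs_batch, rhs_batch))
        return jax.lax.dot_general(lhs, rhs, (contracting_dims, ((), ())))

    jitted = jax.jit(contract, static_argnames=("contracting_dims",))
    out = jitted(jnp.ones((8, 16)), jnp.ones((16, 4)), contracting_dims=((1,), (0,)))
    assert out.shape == (8, 4)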

tests/jax/test_distributed_layernorm_mlp.py

Lines changed: 0 additions & 2 deletions

@@ -333,7 +333,6 @@ def _test_layernorm_mlp(
     with fp8_autocast(enabled=use_fp8, fp8_recipe=fp8_recipe):
         ln_mlp_single = LayerNormMLP(
             layernorm_type=layernorm_type,
-            transpose_batch_sequence=False,  # input: [batch, seqlen, hidden]
             intermediate_dim=INTERMEDIATE,
             activations=activation_type,
             use_bias=use_bias,

@@ -352,7 +351,6 @@ def _test_layernorm_mlp(
     ):
         ln_mlp_sharded = LayerNormMLP(
             layernorm_type=layernorm_type,
-            transpose_batch_sequence=False,
             intermediate_dim=INTERMEDIATE,
             activations=activation_type,
             scale_axes=LN_SCALE_AXES,

tests/jax/test_distributed_softmax.py

Lines changed: 4 additions & 4 deletions

@@ -135,7 +135,7 @@ def impl_test_softmax(
         )
 
     @pytest.mark.parametrize("device_count,mesh_shape,mesh_axes,mesh_resource", generate_configs())
-    @pytest.mark.parametrize("data_shape", [[32, 12, 128, 128], [64, 16, 1024, 1024]])
+    @pytest.mark.parametrize("data_shape", [[32, 12, 128, 128], [8, 8, 1024, 1024]])
     @pytest.mark.parametrize(
         "softmax_type",
         [SoftmaxType.SCALED, SoftmaxType.SCALED_MASKED, SoftmaxType.SCALED_UPPER_TRIANG_MASKED],

@@ -168,14 +168,14 @@ def test_softmax(
             dtype,
             bad_sharding,
             broadcast_batch_mask,
-            use_shardy=False,
+            use_shardy=True,
         )
 
     @pytest.mark.parametrize("device_count,mesh_shape,mesh_axes,mesh_resource", generate_configs())
     @pytest.mark.parametrize("softmax_type", [SoftmaxType.SCALED, SoftmaxType.SCALED_MASKED])
     @pytest.mark.parametrize("bad_sharding", [False, True])
     @pytest.mark.parametrize("broadcast_batch_mask", [False, True])
-    def test_softmax_shardy(
+    def test_softmax_gspmd(
         self,
         device_count,
         mesh_shape,

@@ -196,5 +196,5 @@ def test_softmax_shardy(
             dtype=DTYPES[0],
             bad_sharding=bad_sharding,
             broadcast_batch_mask=broadcast_batch_mask,
-            use_shardy=True,
+            use_shardy=False,
         )
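
For context on the renamed test and the flipped use_shardy defaults: at the JAX level, the Shardy-vs-GSPMD choice is typically controlled by the jax_use_shardy_partitioner config flag. The helper below is only an assumption about what the use_shardy argument maps to, not what impl_test_softmax actually does internally.

    # Hedged sketch: toggle the Shardy partitioner around a callable, then restore GSPMD.
    import jax

    def run_with_partitioner(fn, use_shardy):
        jax.config.update("jax_use_shardy_partitioner", use_shardy)
        try:
            return fn()
        finally:
            jax.config.update("jax_use_shardy_partitioner", False)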
