NVIDIA
diff --git a/‎qa/L0_pytorch_debug_unittest/test.sh‎
Lines changed: 8 additions & 0 deletions b/‎qa/L0_pytorch_debug_unittest/test.sh‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎qa/L1_pytorch_distributed_unittest/test.sh‎
Lines changed: 5 additions & 0 deletions b/‎qa/L1_pytorch_distributed_unittest/test.sh‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎tests/pytorch/debug/test_api_features.py‎
Lines changed: 38 additions & 39 deletions b/‎tests/pytorch/debug/test_api_features.py‎
Lines changed: 38 additions & 39 deletions
diff --git a/‎tests/pytorch/debug/test_configs/log_config.yaml‎
Lines changed: 19 additions & 0 deletions b/‎tests/pytorch/debug/test_configs/log_config.yaml‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎tests/pytorch/debug/test_configs/perf_config.yaml‎
Lines changed: 13 additions & 0 deletions b/‎tests/pytorch/debug/test_configs/perf_config.yaml‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎tests/pytorch/debug/test_log.py‎
Lines changed: 59 additions & 0 deletions b/‎tests/pytorch/debug/test_log.py‎
Lines changed: 59 additions & 0 deletions
@@ -14,11 +14,19 @@
 
 FAIL=0
 
+# nvdlfw-inspect is not declared as a package requirement because it is not
+# available on PyPI, so reinstall it from source before running the tests.
+pip uninstall -y nvdlfw-inspect
+pip install git+https://github.com/NVIDIA/nvidia-dlfw-inspect.git
+
 pip install pytest==8.2.1
 pytest -v -s $TE_PATH/tests/pytorch/debug/test_sanity.py  --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || FAIL=1
 pytest -v -s $TE_PATH/tests/pytorch/debug/test_config.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || FAIL=1
 pytest -v -s $TE_PATH/tests/pytorch/debug/test_numerics.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || FAIL=1
 NVTE_TORCH_COMPILE=0 pytest -v -s $TE_PATH/tests/pytorch/debug/test_api_features.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR || FAIL=1
+pytest -v -s $TE_PATH/tests/pytorch/debug/test_log.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR || FAIL=1
+pytest -v -s $TE_PATH/tests/pytorch/debug/test_perf.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR || FAIL=1
+
 
 # standard sanity and numerics tests with initialized debug
 NVTE_TEST_NVINSPECT_ENABLED=1 NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 pytest -v -s $TE_PATH/tests/pytorch/test_sanity.py || FAIL=1
 
@@ -21,6 +21,11 @@ FAILED_CASES=""
 mkdir -p "$XML_LOG_DIR"
 
 
+# nvdlfw-inspect is not declared as a package requirement because it is not
+# available on PyPI, so reinstall it from source before running the tests.
+pip3 uninstall -y nvdlfw-inspect
+pip3 install git+https://github.com/NVIDIA/nvidia-dlfw-inspect.git || error_exit "Failed to install nvdlfw-inspect"
+
 pip3 install pytest==8.2.1 || error_exit "Failed to install pytest"
 
 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_sanity.xml $TE_PATH/tests/pytorch/distributed/test_sanity.py || test_fail "test_sanity.py"
 
@@ -24,22 +24,22 @@ def test_transformer_engine_no_config(feature_dirs):
         # FP8 enabled - true by the default
         assert debug_api.transformer_engine.fp8_gemm_enabled(
             "decoder.1.attn.qkv", gemm="fprop", iteration=0
-        )
+        )[0]
 
-        # modify_tensor_enabled - False by default
+        # modify_tensor_enabled - (False, None) by default
         assert not debug_api.transformer_engine.modify_tensor_enabled(
             "decoder.1.attn.qkv", gemm="fprop", tensor_name="activation", iteration=0
-        )
+        )[0]
 
-        # inspect_tensor_enabled - False by default
+        # inspect_tensor_enabled - (False, None) by default
         assert not debug_api.transformer_engine.inspect_tensor_enabled(
             "decoder.1.attn.qkv", tensor_name="activation", iteration=0
-        )
+        )[0]
 
-        # inspect_tensor_postquantize - False by default
+        # inspect_tensor_postquantize - (False, None) by default
         assert not debug_api.transformer_engine.inspect_tensor_postquantize_enabled(
             "decoder.1.attn.qkv", gemm="fprop", tensor_name="activation", iteration=0
-        )
+        )[0]
 
     finally:
         debug_api.end_debug()
@@ -51,24 +51,24 @@ def test_disable_fp8_gemm(configs_dir, feature_dirs):
 
         assert debug_api.transformer_engine.fp8_gemm_enabled(
             "decoder.1.attn.qkv", gemm="fprop", iteration=0
-        )
+        )[0]
         assert not debug_api.transformer_engine.fp8_gemm_enabled(
             "decoder.1.attn.qkv", gemm="dgrad", iteration=0
-        )
+        )[0]
         assert not debug_api.transformer_engine.fp8_gemm_enabled(
             "decoder.1.attn.qkv", gemm="wgrad", iteration=0
-        )
+        )[0]
 
         # caching
         assert debug_api.transformer_engine.fp8_gemm_enabled(
             "decoder.1.attn.qkv", gemm="fprop", iteration=0
-        )
+        )[0]
         assert not debug_api.transformer_engine.fp8_gemm_enabled(
             "decoder.1.attn.qkv", gemm="dgrad", iteration=0
-        )
+        )[0]
         assert not debug_api.transformer_engine.fp8_gemm_enabled(
             "decoder.1.attn.qkv", gemm="wgrad", iteration=0
-        )
+        )[0]
 
     finally:
         debug_api.end_debug()
@@ -80,22 +80,22 @@ def test_disable_fp8_layer(configs_dir, feature_dirs):
 
         assert debug_api.transformer_engine.fp8_gemm_enabled(
             "decoder.1.mlp.fc1", gemm="fprop", iteration=0
-        )
+        )[0]
         assert debug_api.transformer_engine.fp8_gemm_enabled(
             "decoder.1.mlp.fc1", gemm="wgrad", iteration=0
-        )
+        )[0]
         assert debug_api.transformer_engine.fp8_gemm_enabled(
             "decoder.1.mlp.fc1", gemm="dgrad", iteration=0
-        )
+        )[0]
         assert not debug_api.transformer_engine.fp8_gemm_enabled(
             "decoder.1.attn.qkv", gemm="fprop", iteration=0
-        )
+        )[0]
         assert not debug_api.transformer_engine.fp8_gemm_enabled(
             "decoder.1.attn.qkv", gemm="wgrad", iteration=0
-        )
+        )[0]
         assert not debug_api.transformer_engine.fp8_gemm_enabled(
             "decoder.1.attn.qkv", gemm="dgrad", iteration=0
-        )
+        )[0]
 
     finally:
         debug_api.end_debug()
@@ -111,22 +111,22 @@ def test_per_tensor_scaling(configs_dir, feature_dirs):
         # check modify_tensor_enabled
         assert debug_api.transformer_engine.modify_tensor_enabled(
             "decoder.1.mlp.fc1", gemm="fprop", tensor_name="activation", iteration=0
-        )
+        )[0]
         assert debug_api.transformer_engine.modify_tensor_enabled(
             "decoder.1.mlp.fc1", gemm="fprop", tensor_name="weight", iteration=0
-        )
+        )[0]
         assert debug_api.transformer_engine.modify_tensor_enabled(
             "decoder.1.mlp.fc1", gemm="dgrad", tensor_name="gradient", iteration=0
-        )
+        )[0]
         assert not debug_api.transformer_engine.modify_tensor_enabled(
             "decoder.1.mlp.fc1", gemm="dgrad", tensor_name="weight", iteration=0
-        )
+        )[0]
         assert not debug_api.transformer_engine.modify_tensor_enabled(
             "decoder.1.mlp.fc1", gemm="wgrad", tensor_name="gradient", iteration=0
-        )
+        )[0]
         assert not debug_api.transformer_engine.modify_tensor_enabled(
             "decoder.1.mlp.fc1", gemm="wgrad", tensor_name="activation", iteration=0
-        )
+        )[0]
 
         # check modify_tensor
 
@@ -168,14 +168,14 @@ def test_per_tensor_scaling(configs_dir, feature_dirs):
             gemm="wgrad",
             tensor_name="gradient",
             iteration=0,
-        )
+        )[0]
 
         assert not debug_api.transformer_engine.modify_tensor_enabled(
             "decoder.1.mlp.fc4",
             gemm="fprop",
             tensor_name="activation",
             iteration=0,
-        )
+        )[0]
     finally:
         debug_api.end_debug()
 
@@ -191,11 +191,11 @@ def test_fake_quant(configs_dir, feature_dirs):
         # modify_tensor_enabled
         assert debug_api.transformer_engine.modify_tensor_enabled(
             "decoder.1.mlp.fc1", gemm="fprop", tensor_name="activation", iteration=0
-        )
+        )[0]
 
         assert debug_api.transformer_engine.modify_tensor_enabled(
             "decoder.1.mlp.fc1", gemm="dgrad", tensor_name="gradient", iteration=0
-        )
+        )[0]
 
         # modify_tensor
         debug_api.transformer_engine.modify_tensor(
@@ -218,11 +218,11 @@ def test_fake_quant(configs_dir, feature_dirs):
 
         assert debug_api.transformer_engine.fp8_gemm_enabled(
             "decoder.1.fc2", gemm="wgrad", iteration=0
-        )
+        )[0]
         # caching
         assert debug_api.transformer_engine.fp8_gemm_enabled(
             "decoder.1.fc2", gemm="wgrad", iteration=0
-        )
+        )[0]
     finally:
         debug_api.end_debug()
 
@@ -265,21 +265,20 @@ def assert_empty():
         assert stats[("decoder.1.mlp.fc1", "activation", "cur_amax", 200)] == tensor.abs().max()
         assert not debug_api.transformer_engine.inspect_tensor_enabled(
             "decoder.1.mlp.fc1", tensor_name="activation", iteration=201
-        )
+        )[0]
         assert not debug_api.transformer_engine.inspect_tensor_enabled(
             "decoder.2.mlp.fc1", tensor_name="activation", iteration=200
-        )
+        )[0]
         assert not debug_api.transformer_engine.inspect_tensor_enabled(
             "decoder.1.mlp.fc1", tensor_name="gradient", iteration=200
-        )
+        )[0]
 
         expected_underflows = (tensor_fp8._data == 0).sum() * 100 / (100 * 100 * 5)
-        expected_overflows = (tensor_fp8._data == 126).sum() * 100 / (100 * 100 * 5)
 
         # TE FP8 tensor stats --
         assert debug_api.transformer_engine.inspect_tensor_postquantize_enabled(
             "decoder.1.mlp.fc1", tensor_name="gradient", gemm="wgrad", iteration=200
-        )
+        )[0]
         debug_api.transformer_engine.inspect_tensor_postquantize(
             "decoder.1.mlp.fc1",
             tensor=tensor_fp8,
@@ -295,10 +294,10 @@ def assert_empty():
 
         assert not debug_api.transformer_engine.inspect_tensor_postquantize_enabled(
             "decoder.1.mlp.fc1", tensor_name="activation", gemm="fprop", iteration=201
-        )
+        )[0]
         assert not debug_api.transformer_engine.inspect_tensor_postquantize_enabled(
             "decoder.2.mlp.fc1", tensor_name="gradient", gemm="wgrad", iteration=200
-        )
+        )[0]
 
         # Second config in same yaml
         tensor = torch.rand((100, 100, 5))
@@ -328,7 +327,7 @@ def assert_empty():
 
         assert not debug_api.transformer_engine.inspect_tensor_enabled(
             "decoder.7.mlp.fc1", tensor_name="weight", iteration=201
-        )
+        )[0]
         assert_empty()
 
     finally:
 
@@ -0,0 +1,19 @@
+# Debug config loaded by test_log.py (as log_config.yaml).
+# That test expects non-quantized statistics to be logged every 3 iterations
+# and quantized (FP8) statistics every 5 iterations.
+test:
+  enabled: True
+  layers:
+    # Pattern ".*" selects every layer name.
+    layer_name_regex_pattern: .*
+  transformer_engine:
+    # Non-quantized statistics of the activation tensor.
+    LogTensorStats:
+      enabled: True
+      tensors_struct:
+        - tensor: activation
+          stats: [cur_amax, dynamic_range, mean, std, l1_norm]
+          start_step: 1
+          freq: 3
+    # Quantized (FP8) statistics of the weight tensor.
+    LogFp8TensorStats:
+      enabled: True
+      tensors: weight
+      stats: [underflows%]
+      start_step: 1
+      freq: 5
+  
@@ -0,0 +1,13 @@
+# Debug config that enables LogTensorStats for the activation tensor of layers
+# whose name matches the pattern ".*1".
+# NOTE(review): presumably consumed by test_perf.py, with the very large `freq`
+# chosen so that almost no iteration actually logs - confirm against that test.
+test:
+  enabled: True
+  layers:
+    layer_name_regex_pattern: .*1
+  transformer_engine:
+    LogTensorStats:
+      enabled: True
+      tensors_struct:
+        - tensor: activation
+          stats: [cur_amax, dynamic_range, mean, std, l1_norm]
+          start_step: 0
+          freq: 100000
+  
@@ -0,0 +1,59 @@
+# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+
+import pytest
+import torch
+import transformer_engine.pytorch as te
+import tempfile
+import os
+
+import nvdlfw_inspect.api as debug_api
+
+from transformer_engine.debug.pytorch.debug_state import TEDebugState
+
+
+@pytest.mark.parametrize("layer", ["linear", "transformer"])
+def test_log_every_3_or_5_layers(layer, configs_dir, feature_dirs):
+    """Check statistics logging when a layer drops in and out of debug mode.
+
+    If a layer does not invoke any feature in the current iteration, it is
+    switched into non-debug mode. With log_config.yaml, non-quantized
+    statistics should be logged every 3 iterations and quantized statistics
+    every 5 iterations, so the statistics log must mention exactly those
+    iterations in 1..10 that are divisible by 3 or by 5.
+
+    Args:
+        layer: Which module to exercise, "linear" or "transformer".
+        configs_dir: Directory containing log_config.yaml.
+        feature_dirs: Feature directories passed to debug_api.initialize.
+    """
+    try:
+        with tempfile.TemporaryDirectory() as temp_dir:
+            debug_api.initialize(
+                config_file=configs_dir + "/log_config.yaml",
+                feature_dirs=feature_dirs,
+                log_dir=temp_dir,
+            )
+
+            if layer == "linear":
+                model = te.Linear(128, 128, name="linear1")
+            elif layer == "transformer":
+                model = te.TransformerLayer(128, 128, 4, name="transformer1")
+            else:
+                raise ValueError(f"Invalid layer: {layer}")
+
+            for _ in range(11):
+                x = torch.randn(4, 128, 128).cuda()
+                with te.fp8_autocast(enabled=True):
+                    y = model(x)
+                y.sum().backward()
+                debug_api.step()
+
+            log_path = os.path.join(
+                temp_dir, "nvdlfw_inspect_statistics_logs/nvdlfw_inspect_globalrank-0.log"
+            )
+            with open(log_path, "r") as f:
+                file_content = f.read()
+
+            for i in range(1, 11):
+                marker = f"iteration={i:06d}"
+                if i % 3 == 0 or i % 5 == 0:
+                    assert marker in file_content, f"{marker} missing from statistics log"
+                else:
+                    assert marker not in file_content, f"unexpected {marker} in statistics log"
+    finally:
+        # Always tear down the global debug state (as the tests in
+        # test_api_features.py do), so that a failure here cannot leak an
+        # initialized debug session into subsequent tests.
+        debug_api.end_debug()
+        TEDebugState._reset()