Skip to content

Commit b6b3abc

Browse files
pggPLpre-commit-ci[bot]ksivaman
authored
[PyTorch debug] Improve precision debug tools performance (#1909)
* turn on userbuffers for layers without debug Signed-off-by: Pawel Gadzinski <[email protected]> * code drop Signed-off-by: Pawel Gadzinski <[email protected]> * working change Signed-off-by: Pawel Gadzinski <[email protected]> * fix Signed-off-by: Pawel Gadzinski <[email protected]> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix Signed-off-by: Pawel Gadzinski <[email protected]> * fix Signed-off-by: Pawel Gadzinski <[email protected]> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix Signed-off-by: Pawel Gadzinski <[email protected]> * fixes Signed-off-by: Pawel Gadzinski <[email protected]> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * tests and fixes Signed-off-by: Pawel Gadzinski <[email protected]> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix Signed-off-by: Pawel Gadzinski <[email protected]> * fixes Signed-off-by: Pawel Gadzinski <[email protected]> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixes Signed-off-by: Pawel Gadzinski <[email protected]> * fix Signed-off-by: Pawel Gadzinski <[email protected]> * update nvinspect version Signed-off-by: Pawel Gadzinski <[email protected]> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix Signed-off-by: Pawel Gadzinski <[email protected]> * fix Signed-off-by: Pawel Gadzinski <[email protected]> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add Signed-off-by: Pawel Gadzinski <[email protected]> * fix Signed-off-by: Pawel Gadzinski <[email protected]> * fixes Signed-off-by: Pawel Gadzinski <[email protected]> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see 
https://pre-commit.ci * fixes Signed-off-by: Pawel Gadzinski <[email protected]> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixes Signed-off-by: Pawel Gadzinski <[email protected]> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixes Signed-off-by: Pawel Gadzinski <[email protected]> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixes Signed-off-by: Pawel Gadzinski <[email protected]> * fixes Signed-off-by: Pawel Gadzinski <[email protected]> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix Signed-off-by: Pawel Gadzinski <[email protected]> * fix Signed-off-by: Pawel Gadzinski <[email protected]> * fix Signed-off-by: Pawel Gadzinski <[email protected]> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix Signed-off-by: Pawel Gadzinski <[email protected]> * fix ci Signed-off-by: Pawel Gadzinski <[email protected]> --------- Signed-off-by: Pawel Gadzinski <[email protected]> Signed-off-by: Kirthi Shankar Sivamani <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Kirthi Shankar Sivamani <[email protected]>
1 parent 9f9b481 commit b6b3abc

File tree

26 files changed

+740
-238
lines changed

26 files changed

+740
-238
lines changed

qa/L0_pytorch_debug_unittest/test.sh

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,19 @@
1414

1515
FAIL=0
1616

17+
# nvdlfw-inspect is installed here rather than as a requirement,
18+
# because it is not available on PyPI.
19+
pip uninstall -y nvdlfw-inspect
20+
pip install git+https://github.com/NVIDIA/nvidia-dlfw-inspect.git
21+
1722
pip install pytest==8.2.1
1823
pytest -v -s $TE_PATH/tests/pytorch/debug/test_sanity.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || FAIL=1
1924
pytest -v -s $TE_PATH/tests/pytorch/debug/test_config.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || FAIL=1
2025
pytest -v -s $TE_PATH/tests/pytorch/debug/test_numerics.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || FAIL=1
2126
NVTE_TORCH_COMPILE=0 pytest -v -s $TE_PATH/tests/pytorch/debug/test_api_features.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR || FAIL=1
27+
pytest -v -s $TE_PATH/tests/pytorch/debug/test_log.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR || FAIL=1
28+
pytest -v -s $TE_PATH/tests/pytorch/debug/test_perf.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR || FAIL=1
29+
2230

2331
# standard sanity and numerics tests with initialized debug
2432
NVTE_TEST_NVINSPECT_ENABLED=1 NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 pytest -v -s $TE_PATH/tests/pytorch/test_sanity.py || FAIL=1

qa/L1_pytorch_distributed_unittest/test.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,11 @@ FAILED_CASES=""
2121
mkdir -p "$XML_LOG_DIR"
2222

2323

24+
# nvdlfw-inspect is installed here rather than as a requirement,
25+
# because it is not available on PyPI.
26+
pip uninstall -y nvdlfw-inspect
27+
pip install git+https://github.com/NVIDIA/nvidia-dlfw-inspect.git
28+
2429
pip3 install pytest==8.2.1 || error_exit "Failed to install pytest"
2530

2631
python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_sanity.xml $TE_PATH/tests/pytorch/distributed/test_sanity.py || test_fail "test_sanity.py"

tests/pytorch/debug/test_api_features.py

Lines changed: 38 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -24,22 +24,22 @@ def test_transformer_engine_no_config(feature_dirs):
2424
# FP8 enabled - True by default
2525
assert debug_api.transformer_engine.fp8_gemm_enabled(
2626
"decoder.1.attn.qkv", gemm="fprop", iteration=0
27-
)
27+
)[0]
2828

29-
# modify_tensor_enabled - False by default
29+
# modify_tensor_enabled - (False, None) by default
3030
assert not debug_api.transformer_engine.modify_tensor_enabled(
3131
"decoder.1.attn.qkv", gemm="fprop", tensor_name="activation", iteration=0
32-
)
32+
)[0]
3333

34-
# inspect_tensor_enabled - False by default
34+
# inspect_tensor_enabled - (False, None) by default
3535
assert not debug_api.transformer_engine.inspect_tensor_enabled(
3636
"decoder.1.attn.qkv", tensor_name="activation", iteration=0
37-
)
37+
)[0]
3838

39-
# inspect_tensor_postquantize - False by default
39+
# inspect_tensor_postquantize - (False, None) by default
4040
assert not debug_api.transformer_engine.inspect_tensor_postquantize_enabled(
4141
"decoder.1.attn.qkv", gemm="fprop", tensor_name="activation", iteration=0
42-
)
42+
)[0]
4343

4444
finally:
4545
debug_api.end_debug()
@@ -51,24 +51,24 @@ def test_disable_fp8_gemm(configs_dir, feature_dirs):
5151

5252
assert debug_api.transformer_engine.fp8_gemm_enabled(
5353
"decoder.1.attn.qkv", gemm="fprop", iteration=0
54-
)
54+
)[0]
5555
assert not debug_api.transformer_engine.fp8_gemm_enabled(
5656
"decoder.1.attn.qkv", gemm="dgrad", iteration=0
57-
)
57+
)[0]
5858
assert not debug_api.transformer_engine.fp8_gemm_enabled(
5959
"decoder.1.attn.qkv", gemm="wgrad", iteration=0
60-
)
60+
)[0]
6161

6262
# caching
6363
assert debug_api.transformer_engine.fp8_gemm_enabled(
6464
"decoder.1.attn.qkv", gemm="fprop", iteration=0
65-
)
65+
)[0]
6666
assert not debug_api.transformer_engine.fp8_gemm_enabled(
6767
"decoder.1.attn.qkv", gemm="dgrad", iteration=0
68-
)
68+
)[0]
6969
assert not debug_api.transformer_engine.fp8_gemm_enabled(
7070
"decoder.1.attn.qkv", gemm="wgrad", iteration=0
71-
)
71+
)[0]
7272

7373
finally:
7474
debug_api.end_debug()
@@ -80,22 +80,22 @@ def test_disable_fp8_layer(configs_dir, feature_dirs):
8080

8181
assert debug_api.transformer_engine.fp8_gemm_enabled(
8282
"decoder.1.mlp.fc1", gemm="fprop", iteration=0
83-
)
83+
)[0]
8484
assert debug_api.transformer_engine.fp8_gemm_enabled(
8585
"decoder.1.mlp.fc1", gemm="wgrad", iteration=0
86-
)
86+
)[0]
8787
assert debug_api.transformer_engine.fp8_gemm_enabled(
8888
"decoder.1.mlp.fc1", gemm="dgrad", iteration=0
89-
)
89+
)[0]
9090
assert not debug_api.transformer_engine.fp8_gemm_enabled(
9191
"decoder.1.attn.qkv", gemm="fprop", iteration=0
92-
)
92+
)[0]
9393
assert not debug_api.transformer_engine.fp8_gemm_enabled(
9494
"decoder.1.attn.qkv", gemm="wgrad", iteration=0
95-
)
95+
)[0]
9696
assert not debug_api.transformer_engine.fp8_gemm_enabled(
9797
"decoder.1.attn.qkv", gemm="dgrad", iteration=0
98-
)
98+
)[0]
9999

100100
finally:
101101
debug_api.end_debug()
@@ -111,22 +111,22 @@ def test_per_tensor_scaling(configs_dir, feature_dirs):
111111
# check modify_tensor_enabled
112112
assert debug_api.transformer_engine.modify_tensor_enabled(
113113
"decoder.1.mlp.fc1", gemm="fprop", tensor_name="activation", iteration=0
114-
)
114+
)[0]
115115
assert debug_api.transformer_engine.modify_tensor_enabled(
116116
"decoder.1.mlp.fc1", gemm="fprop", tensor_name="weight", iteration=0
117-
)
117+
)[0]
118118
assert debug_api.transformer_engine.modify_tensor_enabled(
119119
"decoder.1.mlp.fc1", gemm="dgrad", tensor_name="gradient", iteration=0
120-
)
120+
)[0]
121121
assert not debug_api.transformer_engine.modify_tensor_enabled(
122122
"decoder.1.mlp.fc1", gemm="dgrad", tensor_name="weight", iteration=0
123-
)
123+
)[0]
124124
assert not debug_api.transformer_engine.modify_tensor_enabled(
125125
"decoder.1.mlp.fc1", gemm="wgrad", tensor_name="gradient", iteration=0
126-
)
126+
)[0]
127127
assert not debug_api.transformer_engine.modify_tensor_enabled(
128128
"decoder.1.mlp.fc1", gemm="wgrad", tensor_name="activation", iteration=0
129-
)
129+
)[0]
130130

131131
# check modify_tensor
132132

@@ -168,14 +168,14 @@ def test_per_tensor_scaling(configs_dir, feature_dirs):
168168
gemm="wgrad",
169169
tensor_name="gradient",
170170
iteration=0,
171-
)
171+
)[0]
172172

173173
assert not debug_api.transformer_engine.modify_tensor_enabled(
174174
"decoder.1.mlp.fc4",
175175
gemm="fprop",
176176
tensor_name="activation",
177177
iteration=0,
178-
)
178+
)[0]
179179
finally:
180180
debug_api.end_debug()
181181

@@ -191,11 +191,11 @@ def test_fake_quant(configs_dir, feature_dirs):
191191
# modify_tensor_enabled
192192
assert debug_api.transformer_engine.modify_tensor_enabled(
193193
"decoder.1.mlp.fc1", gemm="fprop", tensor_name="activation", iteration=0
194-
)
194+
)[0]
195195

196196
assert debug_api.transformer_engine.modify_tensor_enabled(
197197
"decoder.1.mlp.fc1", gemm="dgrad", tensor_name="gradient", iteration=0
198-
)
198+
)[0]
199199

200200
# modify_tensor
201201
debug_api.transformer_engine.modify_tensor(
@@ -218,11 +218,11 @@ def test_fake_quant(configs_dir, feature_dirs):
218218

219219
assert debug_api.transformer_engine.fp8_gemm_enabled(
220220
"decoder.1.fc2", gemm="wgrad", iteration=0
221-
)
221+
)[0]
222222
# caching
223223
assert debug_api.transformer_engine.fp8_gemm_enabled(
224224
"decoder.1.fc2", gemm="wgrad", iteration=0
225-
)
225+
)[0]
226226
finally:
227227
debug_api.end_debug()
228228

@@ -265,21 +265,20 @@ def assert_empty():
265265
assert stats[("decoder.1.mlp.fc1", "activation", "cur_amax", 200)] == tensor.abs().max()
266266
assert not debug_api.transformer_engine.inspect_tensor_enabled(
267267
"decoder.1.mlp.fc1", tensor_name="activation", iteration=201
268-
)
268+
)[0]
269269
assert not debug_api.transformer_engine.inspect_tensor_enabled(
270270
"decoder.2.mlp.fc1", tensor_name="activation", iteration=200
271-
)
271+
)[0]
272272
assert not debug_api.transformer_engine.inspect_tensor_enabled(
273273
"decoder.1.mlp.fc1", tensor_name="gradient", iteration=200
274-
)
274+
)[0]
275275

276276
expected_underflows = (tensor_fp8._data == 0).sum() * 100 / (100 * 100 * 5)
277-
expected_overflows = (tensor_fp8._data == 126).sum() * 100 / (100 * 100 * 5)
278277

279278
# TE FP8 tensor stats --
280279
assert debug_api.transformer_engine.inspect_tensor_postquantize_enabled(
281280
"decoder.1.mlp.fc1", tensor_name="gradient", gemm="wgrad", iteration=200
282-
)
281+
)[0]
283282
debug_api.transformer_engine.inspect_tensor_postquantize(
284283
"decoder.1.mlp.fc1",
285284
tensor=tensor_fp8,
@@ -295,10 +294,10 @@ def assert_empty():
295294

296295
assert not debug_api.transformer_engine.inspect_tensor_postquantize_enabled(
297296
"decoder.1.mlp.fc1", tensor_name="activation", gemm="fprop", iteration=201
298-
)
297+
)[0]
299298
assert not debug_api.transformer_engine.inspect_tensor_postquantize_enabled(
300299
"decoder.2.mlp.fc1", tensor_name="gradient", gemm="wgrad", iteration=200
301-
)
300+
)[0]
302301

303302
# Second config in same yaml
304303
tensor = torch.rand((100, 100, 5))
@@ -328,7 +327,7 @@ def assert_empty():
328327

329328
assert not debug_api.transformer_engine.inspect_tensor_enabled(
330329
"decoder.7.mlp.fc1", tensor_name="weight", iteration=201
331-
)
330+
)[0]
332331
assert_empty()
333332

334333
finally:
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
test:
2+
enabled: True
3+
layers:
4+
layer_name_regex_pattern: .*
5+
transformer_engine:
6+
LogTensorStats:
7+
enabled: True
8+
tensors_struct:
9+
- tensor: activation
10+
stats: [cur_amax, dynamic_range, mean, std, l1_norm]
11+
start_step: 1
12+
freq: 3
13+
LogFp8TensorStats:
14+
enabled: True
15+
tensors: weight
16+
stats: [underflows%]
17+
start_step: 1
18+
freq: 5
19+
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
test:
2+
enabled: True
3+
layers:
4+
layer_name_regex_pattern: .*1
5+
transformer_engine:
6+
LogTensorStats:
7+
enabled: True
8+
tensors_struct:
9+
- tensor: activation
10+
stats: [cur_amax, dynamic_range, mean, std, l1_norm]
11+
start_step: 0
12+
freq: 100000
13+

tests/pytorch/debug/test_log.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# See LICENSE for license information.
4+
5+
6+
import pytest
7+
import torch
8+
import transformer_engine.pytorch as te
9+
import tempfile
10+
import os
11+
12+
import nvdlfw_inspect.api as debug_api
13+
14+
from transformer_engine.debug.pytorch.debug_state import TEDebugState
15+
16+
17+
@pytest.mark.parametrize("layer", ["linear", "transformer"])
def test_log_every_3_or_5_layers(layer, configs_dir, feature_dirs):
    """Check that statistics are logged only on the configured iterations.

    If a layer does not invoke any debug feature in the current iteration,
    it is switched into non-debug mode. This test checks that the switch
    works correctly: non-quantized statistics should be logged every
    3 iterations and quantized statistics every 5 iterations, so the log
    must mention exactly the iterations in 1..10 divisible by 3 or 5.

    Args:
        layer: Which module to exercise, ``"linear"`` or ``"transformer"``.
        configs_dir: Directory that contains ``log_config.yaml``.
        feature_dirs: Feature directories forwarded to ``debug_api.initialize``.

    Raises:
        ValueError: If ``layer`` is not one of the supported names.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        debug_api.initialize(
            config_file=configs_dir + "/log_config.yaml",
            feature_dirs=feature_dirs,
            log_dir=temp_dir,
        )
        # Tear the debug session down even when the test body fails, so that
        # a failure here does not leak global debug state into other tests.
        # The teardown stays inside the TemporaryDirectory block so it runs
        # before the log directory is removed.
        try:
            if layer == "linear":
                model = te.Linear(128, 128, name="linear1")
            elif layer == "transformer":
                model = te.TransformerLayer(128, 128, 4, name="transformer1")
            else:
                raise ValueError(f"Invalid layer: {layer}")

            for _ in range(11):
                x = torch.randn(4, 128, 128).cuda()
                with te.fp8_autocast(enabled=True):
                    y = model(x)
                y.sum().backward()
                debug_api.step()

            log_path = os.path.join(
                temp_dir, "nvdlfw_inspect_statistics_logs/nvdlfw_inspect_globalrank-0.log"
            )
            with open(log_path, "r") as f:
                file_content = f.read()

            # The file is fully read above; check the content after closing it.
            for i in range(1, 11):
                marker = f"iteration={i:06d}"
                if i % 3 == 0 or i % 5 == 0:
                    assert marker in file_content, f"{marker} missing from statistics log"
                else:
                    assert marker not in file_content, f"{marker} unexpectedly logged"
        finally:
            debug_api.end_debug()
            TEDebugState._reset()

0 commit comments

Comments
 (0)