
Commit 007b4bf

Upgrade perf_run script to support TRT 10 and fix some issues (#3650)
1 parent 6a56d63 commit 007b4bf

6 files changed (+142, -63 lines)

py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py

Lines changed: 4 additions & 7 deletions

```diff
@@ -2,7 +2,6 @@
 
 import logging
 from contextlib import nullcontext
-from tempfile import tempdir
 from typing import Any, Dict, List, Optional, Sequence, Tuple
 
 import tensorrt as trt
@@ -11,6 +10,7 @@
 from torch.nn import Module
 from torch_tensorrt._Device import Device
 from torch_tensorrt._enums import Platform, dtype
+from torch_tensorrt.dynamo._defaults import DEBUG_LOGGING_DIR
 from torch_tensorrt.dynamo._settings import CompilationSettings
 from torch_tensorrt.dynamo.debug._DebuggerConfig import DebuggerConfig
 from torch_tensorrt.dynamo.debug._supports_debugger import cls_supports_debugger
@@ -535,12 +535,9 @@ def run_standard_execution() -> torch.Tensor | Tuple[torch.Tensor, ...]:
             )
 
             if self.profiling_enabled:
-                import tempfile
-
-                with tempfile.TemporaryDirectory() as tmpdir:
-                    self.cudagraph.debug_dump(
-                        f"{tempdir}/{self.name}_cudagraph.dot"
-                    )
+                self.cudagraph.debug_dump(
+                    f"{DEBUG_LOGGING_DIR}/{self.name}_cudagraph.dot"
+                )
 
             self.cudagraph.replay()  # type: ignore
```
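Worth noting why the removed lines were broken: `from tempfile import tempdir` imports a module-level attribute (typically `None` until something calls `tempfile.gettempdir()`), not the fresh directory created by the context manager, which was bound to `tmpdir` and then never used. A minimal sketch of the pitfall, using only the standard library:

```python
import tempfile
from tempfile import tempdir  # module attribute, typically None -- not a directory

with tempfile.TemporaryDirectory() as tmpdir:
    # The original f-string referenced `tempdir` (the import), not `tmpdir`
    # (the directory bound by the context manager), so the dump path
    # degenerated to "None/...".
    path = f"{tempdir}/example_cudagraph.dot"
    print(path)  # -> None/example_cudagraph.dot
```

The fix also means the `.dot` dump now lands in the persistent `DEBUG_LOGGING_DIR` instead of a temporary directory that would be deleted as soon as the `with` block exited.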

tools/perf/README.md

Lines changed: 2 additions & 2 deletions

```diff
@@ -9,8 +9,6 @@ This is a comprehensive Python benchmark suite to run perf runs using different
 5. TensorRT
 
 
-Note: Please note that for ONNX models, user can convert the ONNX model to TensorRT serialized engine and then use this package.
-
 ## Prerequisite
 
 Benchmark scripts depends on following Python packages in addition to requirements.txt packages
@@ -47,13 +45,15 @@ Here are the list of `CompileSpec` options that can be provided directly to comp
 * `--backends` : Comma separated string of backends. Eg: torch, torch_compile, dynamo, tensorrt
 * `--model` : Name of the model file (Can be a torchscript module or a tensorrt engine (ending in `.plan` extension)). If the backend is `dynamo` or `torch_compile`, the input should be a Pytorch module (instead of a torchscript module).
 * `--model_torch` : Name of the PyTorch model file (optional, only necessary if `dynamo` or `torch_compile` is a chosen backend)
+* `--onnx` : ONNX model file which helps bypass the step of exporting ONNX from `model_torch`. If this argument is provided, the ONNX will be directly converted to TRT engine
 * `--inputs` : List of input shapes & dtypes. Eg: (1, 3, 224, 224)@fp32 for Resnet or (1, 128)@int32;(1, 128)@int32 for BERT
 * `--batch_size` : Batch size
 * `--precision` : Comma separated list of precisions to build TensorRT engine Eg: fp32,fp16
 * `--device` : Device ID
 * `--truncate` : Truncate long and double weights in the network in Torch-TensorRT
 * `--is_trt_engine` : Boolean flag to be enabled if the model file provided is a TensorRT engine.
 * `--report` : Path of the output file where performance summary is written.
+* `--optimization_level` : Builder optimization level for TensorRT (from 1 to 5, 5 is the highest optimization).
 
 Eg:
```
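For instance, a hypothetical invocation that exercises both new flags (the model and file names below are placeholders, not part of the commit):

```sh
python perf_run.py --backends tensorrt \
                   --model_torch vgg16 \
                   --onnx vgg16.onnx \
                   --inputs "(1, 3, 224, 224)@fp32" \
                   --precision fp16 \
                   --optimization_level 5 \
                   --report vgg16_perf.txt
```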

tools/perf/custom_models.py

Lines changed: 14 additions & 0 deletions

```diff
@@ -33,3 +33,17 @@ def StableDiffusionUnet():
         "CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16
     )
     return pipe.unet
+
+
+def UNet():
+    from monai.networks.nets import UNet
+
+    model = UNet(
+        spatial_dims=2,
+        in_channels=32,
+        out_channels=32,
+        channels=(4, 8, 16),
+        strides=(2, 2),
+        num_res_units=2,
+    )
+    return model.eval().cuda()
```
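A quick way to sanity-check the new entry (a sketch, run from `tools/perf`; assumes `monai` is installed and a CUDA device is available — the 64x64 spatial size is an arbitrary choice, it just has to be divisible by the product of the strides):

```python
import torch
from custom_models import UNet

model = UNet()  # 2D UNet, 32 -> 32 channels, two stride-2 downsamplings
x = torch.randn(1, 32, 64, 64, device="cuda")
with torch.no_grad():
    y = model(x)
print(y.shape)  # torch.Size([1, 32, 64, 64])
```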

tools/perf/perf_run.py

Lines changed: 111 additions & 51 deletions

```diff
@@ -174,8 +174,7 @@ def run_ts_trt(model, input_tensors, params, precision, batch_size):
     compile_settings = {
         "inputs": input_tensors,
         "enabled_precisions": {precision_to_dtype(precision)},
-        "truncate_long_and_double": params.get("truncate", False),
-        "use_python_runtime": params.get("use_python_runtime", False),
+        "truncate_double": params.get("truncate", False),
     }
 
     if precision == "int8":
@@ -274,8 +273,7 @@ def run_dynamo(model, input_tensors, params, precision, batch_size):
         ir="dynamo",
         enabled_precisions={precision_to_dtype(precision)},
         min_block_size=params.get("min_block_size", 1),
-        debug=False,
-        truncate_long_and_double=params.get("truncate", False),
+        truncate_double=params.get("truncate", False),
         immutable_weights=params.get("immutable_weights", True),
         strip_engine_weights=params.get("strip_engine_weights", False),
         refit_identical_engine_weights=params.get(
@@ -284,6 +282,7 @@ def run_dynamo(model, input_tensors, params, precision, batch_size):
         cache_built_engines=params.get("cache_built_engines", False),
         reuse_cached_engines=params.get("reuse_cached_engines", False),
         use_python_runtime=params.get("use_python_runtime", False),
+        optimization_level=params.get("optimization_level", 3),
     )
     end_compile = timeit.default_timer()
     compile_time_s = end_compile - start_compile
@@ -437,61 +436,106 @@ def run_tensorrt(
     precision,
     batch_size=1,
 ):
-    # Export an ONNX model and convert to TRT
-    torch.onnx.export(model.eval().cuda(), tuple(input_tensors), "./tmp.onnx")
     logger = trt.Logger(trt.Logger.WARNING)
-    builder = trt.Builder(logger)
-    network = builder.create_network(
-        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
-    )
-    parser = trt.OnnxParser(network, logger)
-    success = parser.parse_from_file("./tmp.onnx")
-    if not success:
-        raise ValueError("ONNX conversion failed")
-
-    config = builder.create_builder_config()
-    if precision == "fp16":
-        config.set_flag(trt.BuilderFlag.FP16)
-    start_compile = timeit.default_timer()
-    serialized_engine = builder.build_serialized_network(network, config)
-    end_compile = timeit.default_timer()
-    compile_time_s = end_compile - start_compile
+    compile_time_s = 0
+    if params["is_trt_engine"]:
+        serialized_engine = model
+    else:
+        if params["onnx"]:
+            onnx_path = params["onnx"]
+        else:
+            onnx_path = "./onnx-trt.onnx"
+            torch.onnx.export(model, tuple(input_tensors), onnx_path, dynamo=True)
+        builder = trt.Builder(logger)
+        network = builder.create_network(
+            1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+        )
+        parser = trt.OnnxParser(network, logger)
+        success = parser.parse_from_file(onnx_path)
+        if not success:
+            raise ValueError("ONNX conversion failed")
+
+        config = builder.create_builder_config()
+        if precision == "fp16":
+            config.set_flag(trt.BuilderFlag.FP16)
+        config.builder_optimization_level = params.get("optimization_level", 3)
+        start_compile = timeit.default_timer()
+        serialized_engine = builder.build_serialized_network(network, config)
+        end_compile = timeit.default_timer()
+        compile_time_s = end_compile - start_compile
     # Deserialize the TensorRT engine
     with trt.Runtime(logger) as runtime:
         engine = runtime.deserialize_cuda_engine(serialized_engine)
 
     print("Running TensorRT for precision: ", precision, " batch_size : ", batch_size)
     iters = params.get("iterations", 20)
 
-    # Compiling the bindings
-    bindings = engine.num_bindings * [None]
-    k = 0
-    for idx, _ in enumerate(bindings):
-        dtype = torch_dtype_from_trt(engine.get_binding_dtype(idx))
-        shape = tuple(engine.get_binding_shape(idx))
-        device = torch_device_from_trt(engine.get_location(idx))
-        if not engine.binding_is_input(idx):
-            # Output bindings
-            output = torch.empty(size=shape, dtype=dtype, device=device)
-            bindings[idx] = output.data_ptr()
-        else:
-            # Input bindings
-            bindings[idx] = input_tensors[k].data_ptr()
-            k += 1
+    start_time = timeit.default_timer()
+    # Get I/O tensor information using TensorRT 10 API
+    input_names = []
+    output_names = []
+    output_dtypes = []
+    output_shapes = []
+
+    for i in range(engine.num_io_tensors):
+        tensor_name = engine.get_tensor_name(i)
+        tensor_mode = engine.get_tensor_mode(tensor_name)
+        tensor_dtype = engine.get_tensor_dtype(tensor_name)
+        tensor_shape = engine.get_tensor_shape(tensor_name)
+
+        if tensor_mode == trt.TensorIOMode.INPUT:
+            input_names.append(tensor_name)
+        else:  # trt.TensorIOMode.OUTPUT
+            output_names.append(tensor_name)
+            output_dtypes.append(torch_dtype_from_trt(tensor_dtype))
+            output_shapes.append(tuple(tensor_shape))
+
+    # Create output tensors
+    output_tensors = []
+    for i, (shape, dtype) in enumerate(zip(output_shapes, output_dtypes)):
+        output = torch.empty(size=shape, dtype=dtype, device="cuda")
+        output_tensors.append(output)
 
     timings = []
     with engine.create_execution_context() as context:
+        # Set input tensor addresses
+        for i, (input_name, input_tensor) in enumerate(zip(input_names, input_tensors)):
+            context.set_tensor_address(input_name, input_tensor.data_ptr())
+
+        # Set output tensor addresses
+        for output_name, output_tensor in zip(output_names, output_tensors):
+            context.set_tensor_address(output_name, output_tensor.data_ptr())
+
+        # Create a dedicated stream for TensorRT execution
+        dedicated_stream = torch.cuda.Stream()
+        current_stream = torch.cuda.current_stream()
+
+        setup_time = timeit.default_timer()
+
+        # Warm up
         for i in range(WARMUP_ITER):
-            context.execute_async_v2(bindings, torch.cuda.current_stream().cuda_stream)
+            # Wait for current stream to finish
+            dedicated_stream.wait_stream(current_stream)
+            context.execute_async_v3(dedicated_stream.cuda_stream)
+            # Wait for TensorRT stream to finish
+            current_stream.wait_stream(dedicated_stream)
         torch.cuda.synchronize()
 
+        infer_start_time = timeit.default_timer()
+        # Performance measurement
         for i in range(iters):
-            start_time = timeit.default_timer()
-            context.execute_async_v2(bindings, torch.cuda.current_stream().cuda_stream)
+            # Wait for current stream to finish
+            dedicated_stream.wait_stream(current_stream)
+            context.execute_async_v3(dedicated_stream.cuda_stream)
+            # Wait for TensorRT stream to finish
+            current_stream.wait_stream(dedicated_stream)
             torch.cuda.synchronize()
-            end_time = timeit.default_timer()
-            meas_time = end_time - start_time
-            timings.append(meas_time)
+
+        end_time = timeit.default_timer()
+
+        # to compare against torch-trt dynamo apples to apples
+        infer_time = (end_time - infer_start_time + setup_time - start_time) / iters
+        timings.append(infer_time)
 
     recordStats("TensorRT", timings, precision, batch_size, compile_time_s)
@@ -504,7 +548,6 @@ def run(
     params,
     precision,
     batch_size=1,
-    is_trt_engine=False,
     model_torch=None,
 ):
     for backend in backends:
@@ -523,7 +566,7 @@ def run(
             print("int8 precision expects calibration cache file for inference")
             return False
 
-        if (model is None) and (backend in ("tensorrt", "ts_trt", "all")):
+        if (model is None) and (backend in ("ts_trt", "all")):
             warnings.warn(
                 f"Requested backend {backend} without specifying a TorchScript Model, "
                 + "skipping this backend"
@@ -547,11 +590,10 @@ def run(
                 batch_size,
             )
             run_tensorrt(
-                model,
+                model_torch,
                 input_tensors,
                 params,
                 precision,
-                is_trt_engine,
                 batch_size,
             )
             run_dynamo(model_torch, input_tensors, params, precision, batch_size)
@@ -604,6 +646,12 @@ def run(
         default="",
         help="Name of torch model file",
     )
+    arg_parser.add_argument(
+        "--onnx",
+        type=str,
+        default="",
+        help="ONNX model file which helps bypass the step of exporting ONNX from torchscript model. If this argument is provided, the ONNX will be directly converted to TRT engine",
+    )
     arg_parser.add_argument(
         "--inputs",
         type=str,
@@ -643,6 +691,12 @@ def run(
         action="store_true",
         help="Truncate long and double weights in the network in Torch-TensorRT",
     )
+    arg_parser.add_argument(
+        "--optimization_level",
+        type=int,
+        default=3,
+        help="Builder optimization level for TensorRT",
+    )
     arg_parser.add_argument(
         "--is_trt_engine",
         action="store_true",
@@ -702,8 +756,13 @@ def run(
 
     # Load TorchScript model, if provided
     if os.path.exists(model_name):
-        print("Loading user provided torchscript model: ", model_name)
-        model = torch.jit.load(model_name).cuda().eval()
+        if params["is_trt_engine"]:
+            with open(model_name, "rb") as f:
+                model = f.read()
+                print("Loading user provided trt engine: ", model_name)
+        else:
+            print("Loading user provided torchscript model: ", model_name)
+            model = torch.jit.load(model_name).cuda().eval()
 
     # Load PyTorch Model, if provided
     if len(model_name_torch) > 0 and os.path.exists(model_name_torch):
@@ -719,7 +778,9 @@ def run(
     )
 
     backends = parse_backends(params["backends"])
-    if ("dynamo" in backends or "torch_compile" in backends) and (model_torch is None):
+    if any(
+        backend in ["dynamo", "torch_compile", "tensorrt"] for backend in backends
+    ) and (model_torch is None):
         raise ValueError(
             "No Pytorch model (nn.Module) is provided for torchdynamo compilation. Please provide a pytorch model using --model_torch argument"
         )
@@ -746,7 +807,6 @@ def run(
             params,
            precision,
             batch_size,
-            is_trt_engine,
             model_torch=model_torch,
         )
```
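The `run_tensorrt` rewrite is essentially a mechanical port from the binding-index API that TensorRT 10 removed (`num_bindings`, `get_binding_dtype`, `binding_is_input`, `execute_async_v2`) to the name-based tensor API (`num_io_tensors`, `get_tensor_name`, `set_tensor_address`, `execute_async_v3`). A condensed sketch of the new pattern — not the script's exact code; it assumes a deserialized float32-only `engine` and CUDA input tensors:

```python
import tensorrt as trt
import torch

def trt10_infer(engine, input_tensors):
    # Bind I/O by tensor name instead of binding index (TensorRT 10 API).
    context = engine.create_execution_context()
    inputs = iter(input_tensors)
    outputs = []
    for i in range(engine.num_io_tensors):       # replaces engine.num_bindings
        name = engine.get_tensor_name(i)
        if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
            context.set_tensor_address(name, next(inputs).data_ptr())
        else:
            out = torch.empty(
                tuple(engine.get_tensor_shape(name)),
                dtype=torch.float32,  # assumption: float32-only engine
                device="cuda",
            )
            outputs.append(out)
            context.set_tensor_address(name, out.data_ptr())

    # execute_async_v3 takes only a stream handle; addresses were set above.
    stream = torch.cuda.Stream()
    stream.wait_stream(torch.cuda.current_stream())
    context.execute_async_v3(stream.cuda_stream)
    torch.cuda.current_stream().wait_stream(stream)
    torch.cuda.synchronize()
    return outputs
```

Note also that the measurement semantics changed: instead of timing each `execute_async_v2` call individually, the loop now records a single value, `(end_time - infer_start_time + setup_time - start_time) / iters`, i.e. total inference time plus the one-time I/O setup amortized over the iterations — which, per the inline comment, is meant to make the number comparable with the `dynamo` backend's reported timing.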
tools/perf/requirements.txt

Lines changed: 3 additions & 3 deletions

```diff
@@ -3,7 +3,7 @@ argparse
 pyyaml
 onnx
 pandas
-transformers
-diffusers==0.21.4
+transformers==4.51.3
+diffusers==0.34.0
 timm==0.9.8
-
+monai==1.5.0
```
tools/perf/utils.py

Lines changed: 8 additions & 0 deletions

```diff
@@ -25,6 +25,7 @@
     "apple/DCLM-7B",
     "mistralai/Mistral-7B-Instruct-v0.3",
     "microsoft/Phi-3-mini-4k-instruct",
+    "monai/unet",
 }
 
 
@@ -108,6 +109,11 @@ def __getitem__(self, name: str):
                 "model": hf_artifact["model"],
                 "path": "pytorch",
             }
+        elif name == "monai/unet":
+            return {
+                "model": cm.UNet(),
+                "path": "pytorch",
+            }
         else:
             raise AssertionError(f"Invalid model name {name}")
 
@@ -176,6 +182,8 @@ def torch_dtype_from_trt(dtype):
         return torch.bool
     elif dtype == trt.int32:
         return torch.int32
+    elif dtype == trt.int64:
+        return torch.int64
     elif dtype == trt.float16:
         return torch.float16
     elif dtype == trt.float32:
```
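The new `trt.int64` branch matters for the ONNX path added in perf_run.py: `torch.onnx.export` emits shape- and index-related tensors as int64, and TensorRT 10 supports INT64 I/O natively, so engines built from such graphs can expose int64 outputs that the old dtype mapping would have rejected.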
