From 255da510f8b160c4b580f75f170e97feed6b084f Mon Sep 17 00:00:00 2001
From: Mikayla Gawarecki
Date: Tue, 8 Apr 2025 07:22:21 -0700
Subject: [PATCH 1/5] GPUDirect Storage prototype tutorial

---
 .jenkins/validate_tutorials_built.py   |   1 +
 prototype_source/gpu_direct_storage.py | 124 +++++++++++++++++++++++++
 prototype_source/prototype_index.rst   |   8 ++
 3 files changed, 133 insertions(+)
 create mode 100644 prototype_source/gpu_direct_storage.py

diff --git a/.jenkins/validate_tutorials_built.py b/.jenkins/validate_tutorials_built.py
index f78ec11e1aa..bf5e03889ba 100644
--- a/.jenkins/validate_tutorials_built.py
+++ b/.jenkins/validate_tutorials_built.py
@@ -31,6 +31,7 @@
     "prototype_source/vmap_recipe",
     "prototype_source/torchscript_freezing",
     "prototype_source/nestedtensor",
+    "prototype_source/gpu_direct_storage",  # requires specific filesystem + GPUDirect Storage to be set up
     "recipes_source/recipes/saving_and_loading_models_for_inference",
     "recipes_source/recipes/saving_multiple_models_in_one_file",
     "recipes_source/recipes/tensorboard_with_pytorch",
diff --git a/prototype_source/gpu_direct_storage.py b/prototype_source/gpu_direct_storage.py
new file mode 100644
index 00000000000..73f3920d641
--- /dev/null
+++ b/prototype_source/gpu_direct_storage.py
@@ -0,0 +1,124 @@
"""
(prototype) Using GPUDirect Storage
====================================

GPUDirect Storage enables a direct data path for direct memory access transfers
between GPU memory and storage, avoiding a bounce buffer through the CPU.

In version ``2.7``, we introduced some prototype APIs to ``torch.cuda.gds`` that serve as thin wrappers around
the `cuFile APIs `_
that can be used with ``torch.Tensor``.

In this tutorial, we will demonstrate how to use the ``torch.cuda.gds`` APIs in conjunction with
checkpoints generated by ``torch.save`` and ``torch.load`` on local filesystem.

.. grid:: 2

    .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn
       :class-card: card-prerequisites

       * Understand how to use the ``torch.cuda.gds`` APIs in conjunction with
         checkpoints generated by ``torch.save`` and ``torch.load`` on local filesystem

    .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites
       :class-card: card-prerequisites

       * PyTorch v2.7.0 or later
       * GPUDirect Storage must be installed per
         `the documentation `_
       * Ensure that the filesystem that you are saving/loading to supports GPUDirect Storage.
"""

################################################################################
# Using GPUDirect Storage with ``torch.save`` and ``torch.load``
# =============================================================
# GPUDirect Storage requires a storage alignment of 4KB. One can toggle this using
# ``torch.utils.serialization.config.save.storage_alignment``:

import torch
from torch.utils.serialization import config as serialization_config

serialization_config.save.storage_alignment = 4096

################################################################################
# Given a state dictionary of tensors that are on the GPU, one can use the ``torch.serialization.skip_data`` context
# manager to save a checkpoint that contains all relevant metadata except the storage bytes. For each ``torch.Storage``
# in the state dictionary, space will be reserved within the checkpoint for the storage bytes.
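################################################################################
# As a quick illustration of the reservation (a sketch, assuming a CUDA device
# is available and that the hypothetical file name ``reserved_only.pt`` can be
# created), a checkpoint saved under ``skip_data`` already has its full, aligned
# size on disk even though no storage bytes were copied out of GPU memory:

import os

t = torch.randn(1000, device='cuda')  # 1000 float32 elements = 4000 data bytes
with torch.serialization.skip_data():
    torch.save(t, "reserved_only.pt")
# The size covers the metadata plus the 4KB-aligned space reserved for t's storage.
print(os.path.getsize("reserved_only.pt"))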
import torch.nn as nn

m = nn.Linear(5, 10, device='cuda')
sd = m.state_dict()

with torch.serialization.skip_data():
    torch.save(sd, "checkpoint.pt")

################################################################################
# We can get the offsets that each storage should be written to within the checkpoint by loading under
# a ``FakeTensorMode``. A FakeTensor is a tensor that has metadata (e.g. sizes, strides, dtype, device)
# information about the tensor but does not have any storage bytes. The following snippet will not materialize
# any data but will tag each ``FakeTensor`` with the offset within the checkpoint that
# corresponds to the tensor.

import os
from torch._subclasses.fake_tensor import FakeTensorMode

with FakeTensorMode() as mode:
    fake_sd = torch.load("checkpoint.pt")

for k, v in fake_sd.items():
    print(f"key={k}, offset={v.untyped_storage()._checkpoint_offset}")

f = torch.cuda.gds.GdsFile("checkpoint.pt", os.O_RDWR)

for k, v in sd.items():
    offset = fake_sd[k].untyped_storage()._checkpoint_offset
    f.save_storage(v.untyped_storage(), offset)

################################################################################
# We verify the correctness of the saved checkpoint by loading it back with ``torch.load`` and comparing.

sd_loaded = torch.load("checkpoint.pt")
for k, v in sd_loaded.items():
    assert torch.equal(v, sd[k])

################################################################################
# The loading flow is the inverse: we can use ``torch.load`` under the ``torch.serialization.skip_data`` context
# manager to load everything except the storage bytes. This means that any tensors in the checkpoint will be
# created but their storages will be empty (i.e. the tensors will be created via ``torch.empty``). If the
# tensors to be loaded to are persistent, one can use the ``torch.cuda.gds.gds_register_buffer`` API to register
# the storages as gds buffers.

with torch.serialization.skip_data():
    sd_loaded = torch.load("checkpoint.pt")

################################################################################
# We once again use the ``FakeTensorMode`` to get the checkpoint offsets and
# ascertain that the loaded checkpoint is the same as the saved checkpoint.

for k, v in sd_loaded.items():
    assert not torch.equal(v, sd[k])
    offset = fake_sd[k].untyped_storage()._checkpoint_offset
    f.load_storage(v.untyped_storage(), offset)
    assert torch.equal(v, sd[k])

del f


################################################################################
# Buffer Registration
# ===================
# We also provide ``torch.cuda.gds.gds_register_buffer`` to register the
# tensor storages as GPUDirect Storage buffers. See `here
# `_
# for when one should do this.

for v in sd.values():
    torch.cuda.gds.gds_register_buffer(v.untyped_storage())

# Summary
# =======
#
# In this tutorial we have demonstrated how to use the prototype ``torch.cuda.gds`` APIs
# in conjunction with ``torch.save`` and ``torch.load`` on local filesystem. Please
# file an issue in the PyTorch GitHub repo if you have any feedback.

diff --git a/prototype_source/prototype_index.rst b/prototype_source/prototype_index.rst
index 927f5f694b8..ffa51dedb4d 100644
--- a/prototype_source/prototype_index.rst
+++ b/prototype_source/prototype_index.rst
@@ -247,6 +247,14 @@ Prototype features are not available as part of binary distributions like PyPI o
     :link: ../prototype/python_extension_autoload.html
     :tags: Extending-PyTorch, Frontend-APIs
 
+.. GPUDirect Storage
+.. customcarditem::
+   :header: (prototype) Using GPUDirect Storage
+   :card_description: Learn how to use GPUDirect Storage in PyTorch.
+   :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png
+   :link: ../prototype/gpu_direct_storage.html
+   :tags: GPUDirect-Storage
+
 .. End of tutorial card section
 
 .. raw:: html

From ba7f4e896969df04ab092d3905cb9aebae09ddca Mon Sep 17 00:00:00 2001
From: Mikayla Gawarecki
Date: Tue, 8 Apr 2025 08:26:40 -0700
Subject: [PATCH 2/5] address comments

---
 prototype_source/gpu_direct_storage.py | 32 ++++++++++++++------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/prototype_source/gpu_direct_storage.py b/prototype_source/gpu_direct_storage.py
index 73f3920d641..aeca372f3b6 100644
--- a/prototype_source/gpu_direct_storage.py
+++ b/prototype_source/gpu_direct_storage.py
@@ -41,6 +41,11 @@
 serialization_config.save.storage_alignment = 4096
 
 ################################################################################
+# The steps involved in the process are as follows:
+# * Write the checkpoint file without any actual data. This reserves the space on disk.
+# * Read the offsets for the storage associated with each tensor in the checkpoint using ``FakeTensor``.
+# * Use ``GdsFile`` to write the appropriate data at these offsets.
+#
 # Given a state dictionary of tensors that are on the GPU, one can use the ``torch.serialization.skip_data`` context
 # manager to save a checkpoint that contains all relevant metadata except the storage bytes. For each ``torch.Storage``
 # in the state dictionary, space will be reserved within the checkpoint for the storage bytes.
@@ -59,6 +64,12 @@
 # information about the tensor but does not have any storage bytes. The following snippet will not materialize
 # any data but will tag each ``FakeTensor`` with the offset within the checkpoint that
 # corresponds to the tensor.
+#
+# If you are continuously saving the same state dictionary during training, you
+# would only need to obtain the offsets once and the same offsets can be re-used. Similarly, if a tensor is going to
+# be loaded repeatedly, one can use ``torch.cuda.gds.gds_register_buffer``, which wraps
+# ``cuFileBufRegister``, to register the storages as GDS buffers.
+
 
 import os
 from torch._subclasses.fake_tensor import FakeTensorMode
@@ -73,8 +84,10 @@
 
 for k, v in sd.items():
     offset = fake_sd[k].untyped_storage()._checkpoint_offset
+    # save_storage is a wrapper around `cuFileWrite`
     f.save_storage(v.untyped_storage(), offset)
 
+
 ################################################################################
 # We verify the correctness of the saved checkpoint by loading it back with ``torch.load`` and comparing.
 
@@ -85,9 +98,7 @@
 ################################################################################
 # The loading flow is the inverse: we can use ``torch.load`` under the ``torch.serialization.skip_data`` context
 # manager to load everything except the storage bytes. This means that any tensors in the checkpoint will be
-# created but their storages will be empty (i.e. the tensors will be created via ``torch.empty``). If the
-# tensors to be loaded to are persistent, one can use the ``torch.cuda.gds.gds_register_buffer`` API to register
-# the storages as gds buffers.
+# created but their storages will be empty (i.e. the tensors will be created via ``torch.empty``).
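+
+################################################################################
+# As an illustrative aside (a sketch, not required for the flow below): if the
+# same storages are going to be written or read many times, they can be
+# registered as GDS buffers once up front using the API mentioned above:
+
+for v in sd.values():
+    torch.cuda.gds.gds_register_buffer(v.untyped_storage())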
 
 with torch.serialization.skip_data():
     sd_loaded = torch.load("checkpoint.pt")
 
@@ -99,23 +110,14 @@
 for k, v in sd_loaded.items():
     assert not torch.equal(v, sd[k])
     offset = fake_sd[k].untyped_storage()._checkpoint_offset
+    # load_storage is a wrapper around `cuFileRead`
     f.load_storage(v.untyped_storage(), offset)
+
+for k, v in sd_loaded.items():
     assert torch.equal(v, sd[k])
 
 del f
 
-
-################################################################################
-# Buffer Registration
-# ===================
-# We also provide ``torch.cuda.gds.gds_register_buffer`` to register the
-# tensor storages as GPUDirect Storage buffers. See `here
-# `_
-# for when one should do this.
-
-for v in sd.values():
-    torch.cuda.gds.gds_register_buffer(v.untyped_storage())
-
 # Summary
 # =======
 #

From a5f98f1f0bff5787e866ec93e70ad5a4468fabcd Mon Sep 17 00:00:00 2001
From: Mikayla Gawarecki
Date: Tue, 8 Apr 2025 08:30:07 -0700
Subject: [PATCH 3/5] one more fix

---
 prototype_source/gpu_direct_storage.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/prototype_source/gpu_direct_storage.py b/prototype_source/gpu_direct_storage.py
index aeca372f3b6..5a6757e6ae7 100644
--- a/prototype_source/gpu_direct_storage.py
+++ b/prototype_source/gpu_direct_storage.py
@@ -67,7 +67,7 @@
 #
 # If you are continuously saving the same state dictionary during training, you
 # would only need to obtain the offsets once and the same offsets can be re-used. Similarly, if a tensor is going to
-# be loaded repeatedly, one can use ``torch.cuda.gds.gds_register_buffer``, which wraps
+# be saved or loaded repeatedly, one can use ``torch.cuda.gds.gds_register_buffer``, which wraps
 # ``cuFileBufRegister``, to register the storages as GDS buffers.

From c4c45c6a9bcb014a840756ece541aa298017328d Mon Sep 17 00:00:00 2001
From: Mikayla Gawarecki
Date: Mon, 14 Apr 2025 14:21:48 -0700
Subject: [PATCH 4/5] address comments

---
 prototype_source/gpu_direct_storage.py | 40 +++++++++++++++-----------
 1 file changed, 23 insertions(+), 17 deletions(-)

diff --git a/prototype_source/gpu_direct_storage.py b/prototype_source/gpu_direct_storage.py
index 5a6757e6ae7..c5475f8fb5c 100644
--- a/prototype_source/gpu_direct_storage.py
+++ b/prototype_source/gpu_direct_storage.py
@@ -1,13 +1,13 @@
 """
-(prototype) Using GPUDirect Storage
-====================================
+(prototype) Accelerating ``torch.save`` and ``torch.load`` with GPUDirect Storage
+=================================================================================
 
 GPUDirect Storage enables a direct data path for direct memory access transfers
 between GPU memory and storage, avoiding a bounce buffer through the CPU.
 
-In version ``2.7``, we introduced some prototype APIs to ``torch.cuda.gds`` that serve as thin wrappers around
+In version **2.7**, we introduced new prototype APIs to ``torch.cuda.gds`` that serve as thin wrappers around
 the `cuFile APIs `_
-that can be used with ``torch.Tensor``.
+that can be used with ``torch.Tensor`` to achieve improved I/O performance.
 
 In this tutorial, we will demonstrate how to use the ``torch.cuda.gds`` APIs in conjunction with
 checkpoints generated by ``torch.save`` and ``torch.load`` on local filesystem.
@@ -32,8 +32,8 @@
 ################################################################################
 # Using GPUDirect Storage with ``torch.save`` and ``torch.load``
 # =============================================================
-# GPUDirect Storage requires a storage alignment of 4KB. One can toggle this using
+# GPUDirect Storage requires a storage alignment of 4KB. You can toggle this by using
 # ``torch.utils.serialization.config.save.storage_alignment``:
 
 import torch
 from torch.utils.serialization import config as serialization_config
@@ -60,15 +60,18 @@
 
 ################################################################################
 # We can get the offsets that each storage should be written to within the checkpoint by loading under
-# a ``FakeTensorMode``. A FakeTensor is a tensor that has metadata (e.g. sizes, strides, dtype, device)
+# a ``FakeTensorMode``. A FakeTensor is a tensor that has metadata (such as sizes, strides, dtype, device)
 # information about the tensor but does not have any storage bytes. The following snippet will not materialize
 # any data but will tag each ``FakeTensor`` with the offset within the checkpoint that
 # corresponds to the tensor.
 #
 # If you are continuously saving the same state dictionary during training, you
 # would only need to obtain the offsets once and the same offsets can be re-used. Similarly, if a tensor is going to
-# be saved or loaded repeatedly, one can use ``torch.cuda.gds.gds_register_buffer``, which wraps
+# be saved or loaded repeatedly, you can use ``torch.cuda.gds.gds_register_buffer``, which wraps
 # ``cuFileBufRegister``, to register the storages as GDS buffers.
+#
+# Note that ``torch.cuda.gds.GdsFile.save_storage`` binds to the synchronous ``cuFileWrite`` API,
+# so no synchronization is needed afterwards.
 
 
 import os
@@ -96,9 +99,9 @@
     assert torch.equal(v, sd[k])
 
 ################################################################################
-# The loading flow is the inverse: we can use ``torch.load`` under the ``torch.serialization.skip_data`` context
+# The loading flow is the inverse: you can use ``torch.load`` with the ``torch.serialization.skip_data`` context
 # manager to load everything except the storage bytes. This means that any tensors in the checkpoint will be
-# created but their storages will be empty (i.e. the tensors will be created via ``torch.empty``).
+# created but their storages will be empty (as if the tensors were created via ``torch.empty``).
 
 with torch.serialization.skip_data():
     sd_loaded = torch.load("checkpoint.pt")
@@ -106,6 +109,9 @@
 ################################################################################
 # We once again use the ``FakeTensorMode`` to get the checkpoint offsets and
 # ascertain that the loaded checkpoint is the same as the saved checkpoint.
+#
+# Similar to ``torch.cuda.gds.GdsFile.save_storage``, ``torch.cuda.gds.GdsFile.load_storage``
+# binds to the synchronous ``cuFileRead`` API, so no synchronization is needed afterwards.
 
 for k, v in sd_loaded.items():
     assert not torch.equal(v, sd[k])
     offset = fake_sd[k].untyped_storage()._checkpoint_offset
@@ -118,9 +124,9 @@
     assert torch.equal(v, sd[k])
 
 del f
 
-# Summary
-# =======
+# Conclusion
+# ==========
 #
 # In this tutorial we have demonstrated how to use the prototype ``torch.cuda.gds`` APIs
 # in conjunction with ``torch.save`` and ``torch.load`` on local filesystem. Please
 # file an issue in the PyTorch GitHub repo if you have any feedback.

From 7d9de66859a020206387614c40ef90fa2d230d2c Mon Sep 17 00:00:00 2001
From: Svetlana Karslioglu
Date: Wed, 16 Apr 2025 13:32:12 -0700
Subject: [PATCH 5/5] Fix formatting

---
 prototype_source/gpu_direct_storage.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/prototype_source/gpu_direct_storage.py b/prototype_source/gpu_direct_storage.py
index c5475f8fb5c..b02ee02261c 100644
--- a/prototype_source/gpu_direct_storage.py
+++ b/prototype_source/gpu_direct_storage.py
@@ -123,7 +123,7 @@
     assert torch.equal(v, sd[k])
 
 del f
-
+##########################################################
 # Conclusion
 # ==========
 #
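################################################################################
# Putting it together
# -------------------
#
# For reference, a condensed sketch of the save path from this tutorial.
# ``gds_save_state_dict`` is a hypothetical helper name, not a PyTorch API; the
# sketch assumes GPUDirect Storage is set up and that ``offsets`` maps each
# state-dict key to the checkpoint offset obtained once under ``FakeTensorMode``
# as shown above:

import os
import torch

def gds_save_state_dict(sd, path, offsets):
    # Write metadata only and reserve aligned space for every storage.
    with torch.serialization.skip_data():
        torch.save(sd, path)
    # Write each storage directly from GPU memory at its reserved offset
    # (save_storage wraps the synchronous cuFileWrite).
    gds_file = torch.cuda.gds.GdsFile(path, os.O_RDWR)
    for k, v in sd.items():
        gds_file.save_storage(v.untyped_storage(), offsets[k])
    del gds_file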