From 255da510f8b160c4b580f75f170e97feed6b084f Mon Sep 17 00:00:00 2001
From: Mikayla Gawarecki
Date: Tue, 8 Apr 2025 07:22:21 -0700
Subject: [PATCH 1/5] GPUDirect Storage prototype tutorial

---
 .jenkins/validate_tutorials_built.py   |   1 +
 prototype_source/gpu_direct_storage.py | 124 +++++++++++++++++++++++++
 prototype_source/prototype_index.rst   |   8 ++
 3 files changed, 133 insertions(+)
 create mode 100644 prototype_source/gpu_direct_storage.py

diff --git a/.jenkins/validate_tutorials_built.py b/.jenkins/validate_tutorials_built.py
index f78ec11e1aa..bf5e03889ba 100644
--- a/.jenkins/validate_tutorials_built.py
+++ b/.jenkins/validate_tutorials_built.py
@@ -31,6 +31,7 @@
     "prototype_source/vmap_recipe",
     "prototype_source/torchscript_freezing",
     "prototype_source/nestedtensor",
+    "prototype_source/gpu_direct_storage",  # requires specific filesystem + GPUDirect Storage to be set up
     "recipes_source/recipes/saving_and_loading_models_for_inference",
     "recipes_source/recipes/saving_multiple_models_in_one_file",
     "recipes_source/recipes/tensorboard_with_pytorch",
diff --git a/prototype_source/gpu_direct_storage.py b/prototype_source/gpu_direct_storage.py
new file mode 100644
index 00000000000..73f3920d641
--- /dev/null
+++ b/prototype_source/gpu_direct_storage.py
@@ -0,0 +1,124 @@
"""
(prototype) Using GPUDirect Storage
====================================

GPUDirect Storage enables a direct data path for direct memory access transfers
between GPU memory and storage, avoiding a bounce buffer through the CPU.

In version ``2.7``, we introduced some prototype APIs to ``torch.cuda.gds`` that serve as thin wrappers around
the `cuFile APIs `_
that can be used with ``torch.Tensor``.

In this tutorial, we will demonstrate how to use the ``torch.cuda.gds`` APIs in conjunction with
checkpoints generated by ``torch.save`` and ``torch.load`` on local filesystem.

.. grid:: 2

    .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn
       :class-card: card-prerequisites

       * Understand how to use the ``torch.cuda.gds`` APIs in conjunction with
         checkpoints generated by ``torch.save`` and ``torch.load`` on local filesystem

    .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites
       :class-card: card-prerequisites

       * PyTorch v2.7.0 or later
       * GPUDirect Storage must be installed per
         `the documentation `_
       * Ensure that the filesystem that you are saving/loading to supports GPUDirect Storage.
"""

################################################################################
# Using GPUDirect Storage with ``torch.save`` and ``torch.load``
# =============================================================
# GPUDirect Storage requires a storage alignment of 4KB. One can toggle this using
# ``torch.utils.serialization.config.save.storage_alignment``:

import torch
from torch.utils.serialization import config as serialization_config

serialization_config.save.storage_alignment = 4096

################################################################################
# Given a state dictionary of tensors that are on the GPU, one can use the ``torch.serialization.skip_data`` context
# manager to save a checkpoint that contains all relevant metadata except the storage bytes. For each ``torch.Storage``
# in the state dictionary, space will be reserved within the checkpoint for the storage bytes.
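################################################################################
# As a quick illustration of the reservation (a sketch, assuming a CUDA device
# is available and that the hypothetical file name ``reserved_only.pt`` can be
# created), a checkpoint saved under ``skip_data`` already has its full, aligned
# size on disk even though no storage bytes were copied out of GPU memory:

import os

t = torch.randn(1000, device='cuda')  # 1000 float32 elements = 4000 data bytes
with torch.serialization.skip_data():
    torch.save(t, "reserved_only.pt")
# The size covers the metadata plus the 4KB-aligned space reserved for t's storage.
print(os.path.getsize("reserved_only.pt"))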
import torch.nn as nn

m = nn.Linear(5, 10, device='cuda')
sd = m.state_dict()

with torch.serialization.skip_data():
    torch.save(sd, "checkpoint.pt")

################################################################################
# We can get the offsets that each storage should be written to within the checkpoint by loading under
# a ``FakeTensorMode``. A FakeTensor is a tensor that has metadata (e.g. sizes, strides, dtype, device)
# information about the tensor but does not have any storage bytes. The following snippet will not materialize
# any data but will tag each ``FakeTensor`` with the offset within the checkpoint that
# corresponds to the tensor.

import os
from torch._subclasses.fake_tensor import FakeTensorMode

with FakeTensorMode() as mode:
    fake_sd = torch.load("checkpoint.pt")

for k, v in fake_sd.items():
    print(f"key={k}, offset={v.untyped_storage()._checkpoint_offset}")

f = torch.cuda.gds.GdsFile("checkpoint.pt", os.O_RDWR)

for k, v in sd.items():
    offset = fake_sd[k].untyped_storage()._checkpoint_offset
    f.save_storage(v.untyped_storage(), offset)

################################################################################
# We verify the correctness of the saved checkpoint by loading it back with ``torch.load`` and comparing.

sd_loaded = torch.load("checkpoint.pt")
for k, v in sd_loaded.items():
    assert torch.equal(v, sd[k])

################################################################################
# The loading flow is the inverse: we can use ``torch.load`` under the ``torch.serialization.skip_data`` context
# manager to load everything except the storage bytes. This means that any tensors in the checkpoint will be
# created but their storages will be empty (i.e. the tensors will be created via ``torch.empty``). If the
# tensors to be loaded to are persistent, one can use the ``torch.cuda.gds.gds_register_buffer`` API to register
# the storages as gds buffers.

with torch.serialization.skip_data():
    sd_loaded = torch.load("checkpoint.pt")

################################################################################
# We once again use the ``FakeTensorMode`` to get the checkpoint offsets and
# ascertain that the loaded checkpoint is the same as the saved checkpoint.

for k, v in sd_loaded.items():
    assert not torch.equal(v, sd[k])
    offset = fake_sd[k].untyped_storage()._checkpoint_offset
    f.load_storage(v.untyped_storage(), offset)
    assert torch.equal(v, sd[k])

del f


################################################################################
# Buffer Registration
# ===================
# We also provide ``torch.cuda.gds.gds_register_buffer`` to register the
# tensor storages as GPUDirect Storage buffers. See `here
# `_
# for when one should do this.

for v in sd.values():
    torch.cuda.gds.gds_register_buffer(v.untyped_storage())

# Summary
# =======
#
# In this tutorial we have demonstrated how to use the prototype ``torch.cuda.gds`` APIs
# in conjunction with ``torch.save`` and ``torch.load`` on local filesystem. Please
# file an issue in the PyTorch GitHub repo if you have any feedback.

diff --git a/prototype_source/prototype_index.rst b/prototype_source/prototype_index.rst
index 927f5f694b8..ffa51dedb4d 100644
--- a/prototype_source/prototype_index.rst
+++ b/prototype_source/prototype_index.rst
@@ -247,6 +247,14 @@ Prototype features are not available as part of binary distributions like PyPI o
     :link: ../prototype/python_extension_autoload.html
     :tags: Extending-PyTorch, Frontend-APIs
 
+.. GPUDirect Storage
+.. customcarditem::
+   :header: (prototype) Using GPUDirect Storage
+   :card_description: Learn how to use GPUDirect Storage in PyTorch.
+   :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png
+   :link: ../prototype/gpu_direct_storage.html
+   :tags: GPUDirect-Storage
+
 .. End of tutorial card section
 
 .. raw:: html

From ba7f4e896969df04ab092d3905cb9aebae09ddca Mon Sep 17 00:00:00 2001
From: Mikayla Gawarecki
Date: Tue, 8 Apr 2025 08:26:40 -0700
Subject: [PATCH 2/5] address comments

---
 prototype_source/gpu_direct_storage.py | 32 ++++++++++++++------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/prototype_source/gpu_direct_storage.py b/prototype_source/gpu_direct_storage.py
index 73f3920d641..aeca372f3b6 100644
--- a/prototype_source/gpu_direct_storage.py
+++ b/prototype_source/gpu_direct_storage.py
@@ -41,6 +41,11 @@
 serialization_config.save.storage_alignment = 4096
 
 ################################################################################
+# The steps involved in the process are as follows:
+# * Write the checkpoint file without any actual data. This reserves the space on disk.
+# * Read the offsets for the storage associated with each tensor in the checkpoint using ``FakeTensor``.
+# * Use ``GdsFile`` to write the appropriate data at these offsets.
+#
 # Given a state dictionary of tensors that are on the GPU, one can use the ``torch.serialization.skip_data`` context
 # manager to save a checkpoint that contains all relevant metadata except the storage bytes. For each ``torch.Storage``
 # in the state dictionary, space will be reserved within the checkpoint for the storage bytes.
@@ -59,6 +64,12 @@
 # information about the tensor but does not have any storage bytes. The following snippet will not materialize
 # any data but will tag each ``FakeTensor`` with the offset within the checkpoint that
 # corresponds to the tensor.
+#
+# If you are continuously saving the same state dictionary during training, you
+# would only need to obtain the offsets once and the same offsets can be re-used. Similarly, if a tensor is going to
+# be loaded repeatedly, one can use ``torch.cuda.gds.gds_register_buffer``, which wraps
+# ``cuFileBufRegister``, to register the storages as GDS buffers.
+
 
 import os
 from torch._subclasses.fake_tensor import FakeTensorMode
@@ -73,8 +84,10 @@
 
 for k, v in sd.items():
     offset = fake_sd[k].untyped_storage()._checkpoint_offset
+    # save_storage is a wrapper around `cuFileWrite`
     f.save_storage(v.untyped_storage(), offset)
 
+
 ################################################################################
 # We verify the correctness of the saved checkpoint by loading it back with ``torch.load`` and comparing.
 
@@ -85,9 +98,7 @@
 ################################################################################
 # The loading flow is the inverse: we can use ``torch.load`` under the ``torch.serialization.skip_data`` context
 # manager to load everything except the storage bytes. This means that any tensors in the checkpoint will be
-# created but their storages will be empty (i.e. the tensors will be created via ``torch.empty``). If the
-# tensors to be loaded to are persistent, one can use the ``torch.cuda.gds.gds_register_buffer`` API to register
-# the storages as gds buffers.
+# created but their storages will be empty (i.e. the tensors will be created via ``torch.empty``).
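+
+################################################################################
+# As an illustrative aside (a sketch, not required for the flow below): if the
+# same storages are going to be written or read many times, they can be
+# registered as GDS buffers once up front using the API mentioned above:
+
+for v in sd.values():
+    torch.cuda.gds.gds_register_buffer(v.untyped_storage())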
 
 with torch.serialization.skip_data():
     sd_loaded = torch.load("checkpoint.pt")
 
@@ -99,23 +110,14 @@
 for k, v in sd_loaded.items():
     assert not torch.equal(v, sd[k])
     offset = fake_sd[k].untyped_storage()._checkpoint_offset
+    # load_storage is a wrapper around `cuFileRead`
     f.load_storage(v.untyped_storage(), offset)
+
+for k, v in sd_loaded.items():
     assert torch.equal(v, sd[k])
 
 del f
 
-
-################################################################################
-# Buffer Registration
-# ===================
-# We also provide ``torch.cuda.gds.gds_register_buffer`` to register the
-# tensor storages as GPUDirect Storage buffers. See `here
-# `_
-# for when one should do this.
-
-for v in sd.values():
-    torch.cuda.gds.gds_register_buffer(v.untyped_storage())
-
 # Summary
 # =======
 #

From a5f98f1f0bff5787e866ec93e70ad5a4468fabcd Mon Sep 17 00:00:00 2001
From: Mikayla Gawarecki
Date: Tue, 8 Apr 2025 08:30:07 -0700
Subject: [PATCH 3/5] one more fix

---
 prototype_source/gpu_direct_storage.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/prototype_source/gpu_direct_storage.py b/prototype_source/gpu_direct_storage.py
index aeca372f3b6..5a6757e6ae7 100644
--- a/prototype_source/gpu_direct_storage.py
+++ b/prototype_source/gpu_direct_storage.py
@@ -67,7 +67,7 @@
 #
 # If you are continuously saving the same state dictionary during training, you
 # would only need to obtain the offsets once and the same offsets can be re-used. Similarly, if a tensor is going to
-# be loaded repeatedly, one can use ``torch.cuda.gds.gds_register_buffer``, which wraps
+# be saved or loaded repeatedly, one can use ``torch.cuda.gds.gds_register_buffer``, which wraps
 # ``cuFileBufRegister``, to register the storages as GDS buffers.

From c4c45c6a9bcb014a840756ece541aa298017328d Mon Sep 17 00:00:00 2001
From: Mikayla Gawarecki
Date: Mon, 14 Apr 2025 14:21:48 -0700
Subject: [PATCH 4/5] address comments

---
 prototype_source/gpu_direct_storage.py | 40 +++++++++++++++-----------
 1 file changed, 23 insertions(+), 17 deletions(-)

diff --git a/prototype_source/gpu_direct_storage.py b/prototype_source/gpu_direct_storage.py
index 5a6757e6ae7..c5475f8fb5c 100644
--- a/prototype_source/gpu_direct_storage.py
+++ b/prototype_source/gpu_direct_storage.py
@@ -1,13 +1,13 @@
 """
-(prototype) Using GPUDirect Storage
-====================================
+(prototype) Accelerating ``torch.save`` and ``torch.load`` with GPUDirect Storage
+=================================================================================
 
 GPUDirect Storage enables a direct data path for direct memory access transfers
 between GPU memory and storage, avoiding a bounce buffer through the CPU.
 
-In version ``2.7``, we introduced some prototype APIs to ``torch.cuda.gds`` that serve as thin wrappers around
+In version **2.7**, we introduced new prototype APIs to ``torch.cuda.gds`` that serve as thin wrappers around
 the `cuFile APIs `_
-that can be used with ``torch.Tensor``.
+that can be used with ``torch.Tensor`` to achieve improved I/O performance.
 
 In this tutorial, we will demonstrate how to use the ``torch.cuda.gds`` APIs in conjunction with
 checkpoints generated by ``torch.save`` and ``torch.load`` on local filesystem.
@@ -32,8 +32,8 @@
 ################################################################################
 # Using GPUDirect Storage with ``torch.save`` and ``torch.load``
 # =============================================================
-# GPUDirect Storage requires a storage alignment of 4KB. One can toggle this using
+# GPUDirect Storage requires a storage alignment of 4KB. You can toggle this by using
 # ``torch.utils.serialization.config.save.storage_alignment``:
 
 import torch
 from torch.utils.serialization import config as serialization_config
@@ -60,15 +60,18 @@
 
 ################################################################################
 # We can get the offsets that each storage should be written to within the checkpoint by loading under
-# a ``FakeTensorMode``. A FakeTensor is a tensor that has metadata (e.g. sizes, strides, dtype, device)
+# a ``FakeTensorMode``. A FakeTensor is a tensor that has metadata (such as sizes, strides, dtype, device)
 # information about the tensor but does not have any storage bytes. The following snippet will not materialize
 # any data but will tag each ``FakeTensor`` with the offset within the checkpoint that
 # corresponds to the tensor.
 #
 # If you are continuously saving the same state dictionary during training, you
 # would only need to obtain the offsets once and the same offsets can be re-used. Similarly, if a tensor is going to
-# be saved or loaded repeatedly, one can use ``torch.cuda.gds.gds_register_buffer``, which wraps
+# be saved or loaded repeatedly, you can use ``torch.cuda.gds.gds_register_buffer``, which wraps
 # ``cuFileBufRegister``, to register the storages as GDS buffers.
+#
+# Note that ``torch.cuda.gds.GdsFile.save_storage`` binds to the synchronous ``cuFileWrite`` API,
+# so no synchronization is needed afterwards.
 
 
 import os
@@ -96,9 +99,9 @@
     assert torch.equal(v, sd[k])
 
 ################################################################################
-# The loading flow is the inverse: we can use ``torch.load`` under the ``torch.serialization.skip_data`` context
+# The loading flow is the inverse: you can use ``torch.load`` with the ``torch.serialization.skip_data`` context
 # manager to load everything except the storage bytes. This means that any tensors in the checkpoint will be
-# created but their storages will be empty (i.e. the tensors will be created via ``torch.empty``).
+# created but their storages will be empty (as if the tensors were created via ``torch.empty``).
 
 with torch.serialization.skip_data():
     sd_loaded = torch.load("checkpoint.pt")
@@ -106,6 +109,9 @@
 ################################################################################
 # We once again use the ``FakeTensorMode`` to get the checkpoint offsets and
 # ascertain that the loaded checkpoint is the same as the saved checkpoint.
+#
+# Similar to ``torch.cuda.gds.GdsFile.save_storage``, ``torch.cuda.gds.GdsFile.load_storage``
+# binds to the synchronous ``cuFileRead`` API, so no synchronization is needed afterwards.
 
 for k, v in sd_loaded.items():
     assert not torch.equal(v, sd[k])
     offset = fake_sd[k].untyped_storage()._checkpoint_offset
@@ -118,9 +124,9 @@
     assert torch.equal(v, sd[k])
 
 del f
 
-# Summary
-# =======
+# Conclusion
+# ==========
 #
 # In this tutorial we have demonstrated how to use the prototype ``torch.cuda.gds`` APIs
 # in conjunction with ``torch.save`` and ``torch.load`` on local filesystem. Please
 # file an issue in the PyTorch GitHub repo if you have any feedback.

From 7d9de66859a020206387614c40ef90fa2d230d2c Mon Sep 17 00:00:00 2001
From: Svetlana Karslioglu
Date: Wed, 16 Apr 2025 13:32:12 -0700
Subject: [PATCH 5/5] Fix formatting

---
 prototype_source/gpu_direct_storage.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/prototype_source/gpu_direct_storage.py b/prototype_source/gpu_direct_storage.py
index c5475f8fb5c..b02ee02261c 100644
--- a/prototype_source/gpu_direct_storage.py
+++ b/prototype_source/gpu_direct_storage.py
@@ -123,7 +123,7 @@
     assert torch.equal(v, sd[k])
 
 del f
-
+##########################################################
 # Conclusion
 # ==========
 #
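################################################################################
# Putting it together
# -------------------
#
# For reference, a condensed sketch of the save path from this tutorial.
# ``gds_save_state_dict`` is a hypothetical helper name, not a PyTorch API; the
# sketch assumes GPUDirect Storage is set up and that ``offsets`` maps each
# state-dict key to the checkpoint offset obtained once under ``FakeTensorMode``
# as shown above:

import os
import torch

def gds_save_state_dict(sd, path, offsets):
    # Write metadata only and reserve aligned space for every storage.
    with torch.serialization.skip_data():
        torch.save(sd, path)
    # Write each storage directly from GPU memory at its reserved offset
    # (save_storage wraps the synchronous cuFileWrite).
    gds_file = torch.cuda.gds.GdsFile(path, os.O_RDWR)
    for k, v in sd.items():
        gds_file.save_storage(v.untyped_storage(), offsets[k])
    del gds_file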