@@ -369,6 +369,7 @@ typedef struct DLManagedTensorVersioned {
369369 * \brief A generic C-style allocator that exposes allocation of a Tensor/Array.
370370 *
371- * This information can then be used to set allocators of a callee to run allocations.
372+ * This information can then be used to set the callee's allocator to perform allocations.
372373 * This function can be exposed by the framework through the DLPackExchangeAPI.
373374 *
374375 * This particular function does not assume a Python environment; as a result,
@@ -394,44 +395,66 @@ typedef int (*DLPackManagedTensorAllocator)( //
394395 * \brief Exports a PyObject* Tensor/NDArray to a DLManagedTensorVersioned.
395396 *
396397 * This function is a C-style function pointer to quickly convert a PyObject* Tensor/NDArray
397- * to a DLManagedTensorVersioned without going through the Python Interpreter .
398+ * to a DLManagedTensorVersioned without going through the Python interpreter.
398399 *
399400 * It also provides an option to query the current context stream of the device provided
400401 * by the tensor.
401402 *
402403 * This function is exposed by the framework through the DLPackExchangeAPI.
403404 *
404- * This information can then be picked up by importers and libraries to run the speed conversion.
405+ * The function pointer can then be picked up by importers and libraries to perform a fast conversion.
405406 * This function should not throw any exceptions; if it fails, it should return -1 and
406407 * set the error message via PyErr_SetXXX.
407408 *
408409 * \param py_object The Python object to convert; this should be PyObject*.
409410 * We use void* to avoid dependency on Python.h.
410411 *
411- * \param max_version The maximum version of DLPack support that consumer supports.
412- * Consumer should fill in their own version here, this parameter is not null.
413- * Producer can use this information to produce the appropriate
414- * DLManagedTensorVersioned for maximum compatibility if needed.
415- * This field is primarily used for future compatibility in case
416- * of major version bump and ABI-breaking changes.
417- *
418412 * \param out The output DLManagedTensorVersioned.
419413 *
420- * \param optional_out_env_stream Outputs the current context stream of the device provided
421- * by the tensor; it can be NULL, in which case the stream will not be queried.
422- * optional_out_env_stream should points to cudaStream_t in the case of CUDA.
414+ * \param optional_out_last_active_stream Outputs the stream the tensor was last active on.
415+ * It can be NULL, in which case the stream will not be queried.
416+ * optional_out_last_active_stream should point to a cudaStream_t in the case of CUDA.
417+ * Note that for frameworks that use a stream context manager, optional_out_last_active_stream
418+ * can be the stream on which the context manager was most recently active.
419+ * The stream is owned by the producer, and the consumer cannot retain it.
420+ * Instead, the consumer can record an event or add wait dependencies to it.
421+ * It is the responsibility of the consumer to synchronize with the stream if necessary.
422+ * The producer may output `reinterpret_cast<void*>(-1)` to indicate that the last active stream
423+ * is not available; in such a case, a device sync is needed to ensure data is ready.
423424 *
424425 * \return 0 on success, -1 on failure. PyError should be set if -1 is returned.
425426 * \note We use void* to avoid dependency on Python.h, so this specific type is
426427 * not dependent on Python.h and can be copied to dlpack.h.
427428 *
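+ * Below is a minimal consumer-side sketch (non-normative); `api`, `py_obj`, and
+ * `my_consumer_stream` are illustrative names, and a CUDA consumer is assumed.
+ *
+ * \code
+ *   DLManagedTensorVersioned* tensor = nullptr;
+ *   void* last_stream = nullptr;
+ *   if (api->managed_tensor_from_py_object(py_obj, &tensor, &last_stream) != 0) {
+ *     return nullptr;  // Python error already set by the producer.
+ *   }
+ *   if (last_stream == reinterpret_cast<void*>(-1)) {
+ *     // Last active stream unavailable: fall back to a full device sync.
+ *     cudaDeviceSynchronize();
+ *   } else if (last_stream != nullptr) {
+ *     // Order my_consumer_stream after the producer's last active stream.
+ *     cudaEvent_t ev;
+ *     cudaEventCreateWithFlags(&ev, cudaEventDisableTiming);
+ *     cudaEventRecord(ev, static_cast<cudaStream_t>(last_stream));
+ *     cudaStreamWaitEvent(my_consumer_stream, ev, 0);
+ *     cudaEventDestroy(ev);
+ *   }
+ * \endcode
+ *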
428- * \sa DLPackExchangeAPI
429+ * \sa DLPackExchangeAPI, DLPackCurrentWorkStream
429430 */
430431typedef int (*DLPackManagedTensorFromPyObject)( //
431432 void * py_object, //
432- const DLPackVersion* max_version, //
433433 DLManagedTensorVersioned** out, //
434- void ** optional_out_env_stream //
434+ void ** optional_out_last_active_stream //
435+ );
436+
437+ /* !
438+ * \brief Obtain the current work stream of a device.
439+ *
440+ * This function is a C-style function pointer to obtain the current work stream of a device
441+ * for frameworks that rely on a context manager to manage the stream.
442+ * For example, it should map to torch.cuda.current_stream in PyTorch.
443+ *
444+ * This function can be set to NULL if the framework does not rely on a context manager to
445+ * manage the stream.
446+ *
447+ * \param device_type The device type.
448+ * \param device_id The device id.
449+ * \param optional_out_current_stream The output current work stream.
450+ * \return 0 on success, -1 on failure.
451+ *
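+ * A hedged usage sketch for the library-call path; `api`, `tensor`, and
+ * `LaunchMyKernel` are illustrative names, and a CUDA device is assumed:
+ *
+ * \code
+ *   void* cur = nullptr;
+ *   DLDevice dev = tensor->dl_tensor.device;
+ *   if (api->optional_current_work_stream != nullptr &&
+ *       api->optional_current_work_stream(dev.device_type, dev.device_id, &cur) == 0) {
+ *     // Launch on the producer's current stream to stay in its stream context.
+ *     LaunchMyKernel(static_cast<cudaStream_t>(cur), tensor);
+ *   }
+ * \endcode
+ *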
452+ * \sa DLPackExchangeAPI
453+ */
454+ typedef int (*DLPackCurrentWorkStream)( //
455+    DLDeviceType device_type, //
456+    int32_t device_id, //
457+ void ** optional_out_current_stream //
435458);
436459
437460/* !
@@ -457,13 +480,36 @@ typedef int (*DLPackManagedTensorToPyObject)( //
457480/* !
458481 * \brief Framework-specific function pointers table for DLPack exchange.
459482 *
460- * Array/Tensor librarie should statically create and initialize this structure
483+ * Guidelines for leveraging DLPackExchangeAPI:
484+ *
485+ * There are generally two kinds of consumer needs for DLPack exchange:
486+ * - N0: library support, where consumer.kernel(x, y, z) needs to run a kernel
487+ * on the data from x, y, z. The consumer is expected to run the kernel in the same
488+ * stream context as the producer. For example, when x, y, z are torch.Tensor objects,
489+ * the consumer should query exchange_api->optional_current_work_stream to get the
490+ * current stream and launch the kernel on that same stream.
491+ * This setup avoids synchronization at kernel launch and gives maximum compatibility
492+ * with CUDA graph capture in the producer.
493+ * This is the desirable behavior for library extension support for frameworks like PyTorch.
494+ * - N1: data ingestion and retention, where the consumer obtains
495+ * the data from the producer and runs further computation on its own stream.
496+ * In this case, the consumer can query the optional_out_last_active_stream output of
497+ * managed_tensor_from_py_object to get the last active stream and record a dependency on it.
498+ *
499+ * Consumers should consider their needs (N0 or N1) and act accordingly based on the
500+ * availability of the corresponding function pointer, as sketched below.
501+ *
502+ * Importantly, optional_current_work_stream may be NULL for frameworks that
503+ * do not rely on a context manager to manage the stream, in which case the consumer
504+ * should rely on the information in optional_out_last_active_stream.
505+ *
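+ * A hedged sketch of the consumer-side branch; `api`, `dev`, `my_own_stream`,
+ * and `last_stream` (the optional_out_last_active_stream output of
+ * managed_tensor_from_py_object) are illustrative:
+ *
+ * \code
+ *   void* stream = nullptr;
+ *   if (api->optional_current_work_stream != nullptr) {
+ *     // N0: run in the producer's current stream context.
+ *     api->optional_current_work_stream(dev.device_type, dev.device_id, &stream);
+ *   } else if (last_stream != nullptr && last_stream != reinterpret_cast<void*>(-1)) {
+ *     // N1: keep our own stream, ordered after the producer's last active stream
+ *     // (e.g., via cudaEventRecord + cudaStreamWaitEvent).
+ *     stream = my_own_stream;
+ *   }
+ * \endcode
+ *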
506+ * Array/Tensor libraries should statically create and initialize this structure
461507 * then return a pointer to DLPackExchangeAPI as an int value in Tensor/Array.
462- * The DLPackExchangeAPI* should stay alive throughout the lifetime of process.
508+ * The DLPackExchangeAPI* should stay alive throughout the lifetime of the process.
463509 *
464510 * One simple way to do so is to create a static instance of DLPackExchangeAPI
465- * within the framework and return a pointer to it, the following code
466- * shows an example to do so in c ++. It should also be reasonably easy
511+ * within the framework and return a pointer to it. The following code
512+ * shows an example to do so in C++. It should also be reasonably easy
467513 * to do so in other languages.
468514 *
469515 * \code
@@ -474,6 +520,8 @@ typedef int (*DLPackManagedTensorToPyObject)( //
474520 * managed_tensor_allocator = MyDLPackManagedTensorAllocator;
475521 * managed_tensor_from_py_object = MyDLPackManagedTensorFromPyObject;
476522 * managed_tensor_to_py_object = MyDLPackManagedTensorToPyObject;
523+ * optional_current_work_stream = MyDLPackCurrentWorkStream;
524+ * prev_version_api = nullptr;
477525 * }
478526 *
479527 * static const DLPackExchangeAPI* Global() {
@@ -484,9 +532,9 @@ typedef int (*DLPackManagedTensorToPyObject)( //
484532 * \endcode
485533 *
486534 * Each framework should attach a dunder `__c_dlpack_exchange_api__` integer
487- * to point to the pointer of the DLPackExchangeAPI*
535+ * that holds the DLPackExchangeAPI* pointer value.
488536 *
489- * Importantly the attributed should be attached to the class of the Tensor, not the instance.
537+ * Importantly, the attribute should be attached to the class of the Tensor, not the instance.
490538 *
491539 * mypackage.Tensor.__c_dlpack_exchange_api__ = MyPackageDLPackExchangeAPI
492540 *
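+ * A hedged sketch of a consumer recovering the table from the class attribute
+ * (error handling elided; `tensor_obj` is an illustrative PyObject*):
+ *
+ * \code
+ *   PyObject* cls = reinterpret_cast<PyObject*>(Py_TYPE(tensor_obj));
+ *   PyObject* attr = PyObject_GetAttrString(cls, "__c_dlpack_exchange_api__");
+ *   const DLPackExchangeAPI* api = nullptr;
+ *   if (attr != nullptr) {
+ *     api = static_cast<const DLPackExchangeAPI*>(PyLong_AsVoidPtr(attr));
+ *     Py_DECREF(attr);
+ *   }
+ * \endcode
+ *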
@@ -499,6 +547,14 @@ struct DLPackExchangeAPI {
499547 * \brief The current DLPack version.
500548 */
501549 DLPackVersion version;
550+ /* !
551+ * \brief Optional pointer to an older DLPackExchangeAPI in the chain.
552+ *
553+ * It should be set to NULL if the framework does not support older versions.
554+ *
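+ * A hedged sketch of walking the chain to find a table matching the consumer's
+ * supported major version; `latest_api` is illustrative:
+ *
+ * \code
+ *   const DLPackExchangeAPI* api = latest_api;
+ *   while (api != nullptr && api->version.major != DLPACK_MAJOR_VERSION) {
+ *     api = api->prev_version_api;
+ *   }
+ *   // api == NULL here means no compatible table was found.
+ * \endcode
+ *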
555+ * \sa DLPackExchangeAPI
556+ */
557+  struct DLPackExchangeAPI* prev_version_api;
502558 /* !
503559 * \brief Framework-specific function pointer for DLPackManagedTensorAllocator
504560 * \sa DLPackManagedTensorAllocator
@@ -514,6 +570,14 @@ struct DLPackExchangeAPI {
514570 * \sa DLPackManagedTensorToPyObject
515571 */
516572 DLPackManagedTensorToPyObject managed_tensor_to_py_object;
573+ /* !
574+ * \brief Framework-specific function pointer for DLPackCurrentWorkStream
575+ *
576+ * This function can be set to NULL if the framework does not rely on a context manager to manage the stream.
577+ *
578+ * \sa DLPackCurrentWorkStream
579+ */
580+ DLPackCurrentWorkStream optional_current_work_stream;
517581};
518582
519583#ifdef __cplusplus