|
1 | 1 | /*! |
2 | 2 |  * Copyright (c) 2017 by Contributors |
3 | 3 | * \file dlpack.h |
4 | 4 | * \brief The common header of DLPack. |
5 | 5 | */ |
@@ -338,7 +338,7 @@ typedef struct DLManagedTensor { |
338 | 338 | * |
339 | 339 | * \note This is the current standard DLPack exchange data structure. |
340 | 340 | */ |
341 | | -struct DLManagedTensorVersioned { |
| 341 | +typedef struct DLManagedTensorVersioned { |
342 | 342 | /*! |
343 | 343 | * \brief The API and ABI version of the current managed Tensor |
344 | 344 | */ |
@@ -372,7 +372,266 @@ struct DLManagedTensorVersioned { |
372 | 372 | uint64_t flags; |
373 | 373 | /*! \brief DLTensor which is being memory managed */ |
374 | 374 | DLTensor dl_tensor; |
375 | | -}; |
| 375 | +} DLManagedTensorVersioned; |
| 376 | + |
| 377 | +//---------------------------------------------------------------------- |
| 378 | +// DLPack `__c_dlpack_exchange_api__` fast exchange protocol definitions |
| 379 | +//---------------------------------------------------------------------- |
| 380 | +/*! |
| 381 | + * \brief Request a producer library to create a new tensor. |
| 382 | + * |
| 383 | + * Create a new `DLManagedTensorVersioned` within the context of the producer |
| 384 | + * library. The allocation is defined via the prototype DLTensor. |
| 385 | + * |
| 386 | + * This function is exposed by the framework through the DLPackExchangeAPI. |
| 387 | + * |
| 388 | + * \param prototype The prototype DLTensor. Only the dtype, ndim, shape, |
| 389 | + * and device fields are used. |
| 390 | + * \param out The output DLManagedTensorVersioned. |
| 391 | + * \param error_ctx Context for `SetError`. |
| 392 | + * \param SetError The function to set the error. |
| 393 | + * \return 0 on success, -1 on failure; on success the owning |
| 394 | + *     DLManagedTensorVersioned* is stored in `out`. SetError is called |
| 395 | + *     exactly when -1 is returned (the implementor must ensure this). |
| 396 | + * \note - As a C function, must not throw C++ exceptions. |
| 397 | + *       - Errors are propagated via SetError to avoid any direct dependency |
| 398 | + *         on the Python API. Because of this, `SetError` may have to ensure |
| 399 | + *         the GIL is held, since it will presumably set a Python error. |
| 400 | + * |
| 401 | + * \sa DLPackExchangeAPI |
| 402 | + */ |
| 403 | +typedef int (*DLPackManagedTensorAllocator)( // |
| 404 | + DLTensor* prototype, DLManagedTensorVersioned** out, void* error_ctx, // |
| 405 | + void (*SetError)(void* error_ctx, const char* kind, const char* message) // |
| 406 | +); |
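|     | + |
|     | +/* |
|     | + * For illustration only: a minimal sketch of how a CPU-only producer might |
|     | + * implement DLPackManagedTensorAllocator. The names `MyAllocator` and |
|     | + * `MyDeleter`, and the compact row-major malloc-based layout, are assumptions |
|     | + * of this sketch, not part of the protocol. It illustrates the contract that |
|     | + * SetError is called exactly once, and only on the failure path. |
|     | + * |
|     | + * \code |
|     | + * // assumes <stdlib.h> and <string.h>; error handling is simplified |
|     | + * static void MyDeleter(DLManagedTensorVersioned* self) { |
|     | + *   free(self->dl_tensor.shape); |
|     | + *   free(self->dl_tensor.data); |
|     | + *   free(self); |
|     | + * } |
|     | + * |
|     | + * static int MyAllocator(DLTensor* prototype, DLManagedTensorVersioned** out, |
|     | + *                        void* error_ctx, |
|     | + *                        void (*SetError)(void*, const char*, const char*)) { |
|     | + *   // nbytes = itemsize * numel, assuming a compact row-major layout |
|     | + *   size_t nbytes = (size_t)(prototype->dtype.bits * prototype->dtype.lanes + 7) / 8; |
|     | + *   for (int i = 0; i < prototype->ndim; ++i) nbytes *= (size_t)prototype->shape[i]; |
|     | + *   DLManagedTensorVersioned* ret = calloc(1, sizeof(*ret)); |
|     | + *   void* data = malloc(nbytes); |
|     | + *   int64_t* shape = malloc(sizeof(int64_t) * (size_t)prototype->ndim); |
|     | + *   if (ret == NULL || data == NULL || shape == NULL) { |
|     | + *     free(shape); free(data); free(ret); |
|     | + *     SetError(error_ctx, "MemoryError", "allocation failed");  // exactly once |
|     | + *     return -1; |
|     | + *   } |
|     | + *   memcpy(shape, prototype->shape, sizeof(int64_t) * (size_t)prototype->ndim); |
|     | + *   ret->version.major = DLPACK_MAJOR_VERSION; |
|     | + *   ret->version.minor = DLPACK_MINOR_VERSION; |
|     | + *   ret->deleter = MyDeleter; |
|     | + *   ret->dl_tensor.data = data; |
|     | + *   ret->dl_tensor.device = prototype->device; |
|     | + *   ret->dl_tensor.dtype = prototype->dtype; |
|     | + *   ret->dl_tensor.ndim = prototype->ndim; |
|     | + *   ret->dl_tensor.shape = shape; |
|     | + *   ret->dl_tensor.strides = NULL;  // NULL means compact row-major |
|     | + *   *out = ret; |
|     | + *   return 0; |
|     | + * } |
|     | + * \endcode |
|     | + */ |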
| 407 | + |
| 408 | +/*! |
| 409 | + * \brief Exports a PyObject* Tensor/NDArray to a DLManagedTensorVersioned. |
| 410 | + * |
| 411 | + * This function does not perform any stream synchronization. The consumer should query |
| 412 | + * DLPackCurrentWorkStream to get the current work stream and launch kernels on it. |
| 413 | + * |
| 414 | + * This function is exposed by the framework through the DLPackExchangeAPI. |
| 415 | + * |
| 416 | + * \param py_object The Python object to convert. Must have the same type |
| 417 | + * as the one the `DLPackExchangeAPI` was discovered from. |
| 418 | + * \param out The output DLManagedTensorVersioned. |
| 419 | + * \return 0 on success, -1 on failure with a Python exception set. If the data |
| 420 | + *     cannot be described using DLPack this should be a BufferError if possible. |
| 421 | + * \note - As a C function, must not throw C++ exceptions. |
| 422 | + * |
| 423 | + * \sa DLPackExchangeAPI, DLPackCurrentWorkStream |
| 424 | + */ |
| 425 | +typedef int (*DLPackManagedTensorFromPyObjectNoSync)( // |
| 426 | + void* py_object, // |
| 427 | + DLManagedTensorVersioned** out // |
| 428 | +); |
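|     | + |
|     | +/* |
|     | + * A hypothetical consumer-side sketch (`api` and `obj` are assumed to be a |
|     | + * DLPackExchangeAPI* and a producer tensor PyObject*): import an owning |
|     | + * DLManagedTensorVersioned and release it through its deleter when done. |
|     | + * |
|     | + * \code |
|     | + * DLManagedTensorVersioned* nd = NULL; |
|     | + * if (api->managed_tensor_from_py_object_no_sync(obj, &nd) != 0) { |
|     | + *   return NULL;  // a Python exception is already set |
|     | + * } |
|     | + * // ... use nd->dl_tensor, launching work on the producer's current |
|     | + * // work stream (see DLPackCurrentWorkStream below) ... |
|     | + * if (nd->deleter != NULL) nd->deleter(nd); |
|     | + * \endcode |
|     | + */ |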
| 429 | + |
| 430 | +/*! |
| 431 | + * \brief Exports a PyObject* Tensor/NDArray to a provided DLTensor. |
| 432 | + * |
| 433 | + * This function provides a faster interface for temporary, non-owning exchange. |
| 434 | + * The producer (implementor) retains ownership of the data, strides, and shape |
| 435 | + * memory. The DLTensor and the data it views are only guaranteed to remain |
| 436 | + * valid until control is returned. |
| 437 | + * |
| 438 | + * This function currently assumes that the producer (implementor) can fill |
| 439 | + * in the DLTensor shape and strides without the need for temporary allocations. |
| 440 | + * |
| 441 | + * This function does not perform any stream synchronization. The consumer should query |
| 442 | + * DLPackCurrentWorkStream to get the current work stream and launch kernels on it. |
| 443 | + * |
| 444 | + * This function is exposed by the framework through the DLPackExchangeAPI. |
| 445 | + * |
| 446 | + * \param py_object The Python object to convert. Must have the same type |
| 447 | + * as the one the `DLPackExchangeAPI` was discovered from. |
| 448 | + * \param out The output DLTensor, whose space is pre-allocated by the caller (e.g., on the stack). |
| 449 | + * \return 0 on success, -1 on failure with a Python exception set. |
| 450 | + * \note - As a C function, must not throw C++ exceptions. |
| 451 | + * |
| 452 | + * \sa DLPackExchangeAPI, DLPackCurrentWorkStream |
| 453 | + */ |
| 454 | +typedef int (*DLPackDLTensorFromPyObjectNoSync)( // |
| 455 | + void* py_object, // |
| 456 | + DLTensor* out // |
| 457 | +); |
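|     | + |
|     | +/* |
|     | + * A hypothetical consumer-side sketch of the non-owning fast path (`api` and |
|     | + * `obj` are assumptions). The DLTensor lives on the consumer's stack and must |
|     | + * not be retained after control is returned. |
|     | + * |
|     | + * \code |
|     | + * DLTensor view; |
|     | + * if (api->dltensor_from_py_object_no_sync == NULL) { |
|     | + *   // optional entry: fall back to managed_tensor_from_py_object_no_sync |
|     | + * } else if (api->dltensor_from_py_object_no_sync(obj, &view) != 0) { |
|     | + *   return NULL;  // a Python exception is already set |
|     | + * } |
|     | + * // use `view` immediately; do not keep its pointers past this scope |
|     | + * \endcode |
|     | + */ |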
| 458 | + |
| 459 | +/*! |
| 460 | + * \brief Obtain the current work stream of a device. |
| 461 | + * |
| 462 | + * Obtain the current work stream of a device from the producer framework. |
| 463 | + * For example, it should map to torch.cuda.current_stream in PyTorch. |
| 464 | + * |
| 465 | + * When device_type is kDLCPU, the consumer does not have to query the stream, |
| 466 | + * and the producer can simply return NULL when queried. |
| 467 | + * The consumer does not have to do anything about stream synchronization or |
| 468 | + * stream setting, so a CPU-only framework can simply provide a dummy |
| 469 | + * implementation that always sets out_current_stream[0] to NULL. |
| 470 | + * |
| 471 | + * \param device_type The device type. |
| 472 | + * \param device_id The device id. |
| 473 | + * \param out_current_stream The output current work stream. |
| 474 | + * |
| 475 | + * \return 0 on success, -1 on failure with a Python exception set. |
| 476 | + * \note - As a C function, must not throw C++ exceptions. |
| 477 | + * |
| 478 | + * \sa DLPackExchangeAPI |
| 479 | + */ |
| 480 | +typedef int (*DLPackCurrentWorkStream)( // |
| 481 | + DLDeviceType device_type, // |
| 482 | + int32_t device_id, // |
| 483 | + void** out_current_stream // |
| 484 | +); |
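|     | + |
|     | +/* |
|     | + * A minimal sketch of the dummy implementation mentioned above for a |
|     | + * CPU-only framework (`MyCurrentWorkStream` is an assumed name): it always |
|     | + * reports a NULL stream. |
|     | + * |
|     | + * \code |
|     | + * static int MyCurrentWorkStream(DLDeviceType device_type, int32_t device_id, |
|     | + *                                void** out_current_stream) { |
|     | + *   (void)device_type; |
|     | + *   (void)device_id; |
|     | + *   out_current_stream[0] = NULL;  // CPU-only: there is no work stream |
|     | + *   return 0; |
|     | + * } |
|     | + * \endcode |
|     | + */ |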
| 485 | + |
| 486 | +/*! |
| 487 | + * \brief Imports a DLManagedTensorVersioned to a PyObject* Tensor/NDArray. |
| 488 | + * |
| 489 | + * Convert an owning DLManagedTensorVersioned* to a Python tensor of the |
| 490 | + * producer (implementor) library's type. |
| 491 | + * |
| 492 | + * This function does not perform any stream synchronization. |
| 493 | + * |
| 494 | + * This function is exposed by the framework through the DLPackExchangeAPI. |
| 495 | + * |
| 496 | + * \param tensor The DLManagedTensorVersioned to convert; ownership of the |
| 497 | + *     tensor is stolen by the callee. |
| 498 | + * \param out_py_object The output Python object. |
| 499 | + * \return 0 on success, -1 on failure with a Python exception set. |
| 500 | + * |
| 501 | + * \sa DLPackExchangeAPI |
| 502 | + */ |
| 503 | +typedef int (*DLPackManagedTensorToPyObjectNoSync)( // |
| 504 | + DLManagedTensorVersioned* tensor, // |
| 505 | + void** out_py_object // |
| 506 | +); |
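|     | + |
|     | +/* |
|     | + * A hypothetical sketch (`api` is assumed; `nd` is an owning |
|     | + * DLManagedTensorVersioned, e.g. obtained from managed_tensor_allocator): |
|     | + * hand ownership to the producer and receive a Python object back. |
|     | + * |
|     | + * \code |
|     | + * void* py_ret = NULL; |
|     | + * if (api->managed_tensor_to_py_object_no_sync(nd, &py_ret) != 0) { |
|     | + *   return NULL;  // ownership of nd was stolen; a Python exception is set |
|     | + * } |
|     | + * return (PyObject*)py_ret; |
|     | + * \endcode |
|     | + */ |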
| 507 | + |
| 508 | +/*! |
| 509 | + * \brief DLPackExchangeAPI stable header. |
| 510 | + * \sa DLPackExchangeAPI |
| 511 | + */ |
| 512 | +typedef struct DLPackExchangeAPIHeader { |
| 513 | + /*! |
| 514 | + * \brief The DLPack version provided by the producer. The consumer must |
| 515 | + *     check major version compatibility before using this struct. |
| 516 | + */ |
| 517 | + DLPackVersion version; |
| 518 | + /*! |
| 519 | + * \brief Optional pointer to an older DLPackExchangeAPI in the chain. |
| 520 | + * |
| 521 | + * It must be NULL if the framework does not support older versions. |
| 522 | + * If the current major version is larger than the one supported by the |
| 523 | + * consumer, the consumer may walk this chain to find an earlier supported version. |
| 524 | + * |
| 525 | + * \sa DLPackExchangeAPI |
| 526 | + */ |
| 527 | + struct DLPackExchangeAPIHeader* prev_api; |
| 528 | +} DLPackExchangeAPIHeader; |
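|     | + |
|     | +/* |
|     | + * A hypothetical consumer-side sketch of the version negotiation described |
|     | + * above: starting from a discovered header, walk `prev_api` until a table |
|     | + * whose major version the consumer supports is found. |
|     | + * |
|     | + * \code |
|     | + * static const DLPackExchangeAPIHeader* FindCompatible( |
|     | + *     const DLPackExchangeAPIHeader* header) { |
|     | + *   while (header != NULL && header->version.major > DLPACK_MAJOR_VERSION) { |
|     | + *     header = header->prev_api; |
|     | + *   } |
|     | + *   return header;  // NULL if no supported version exists |
|     | + * } |
|     | + * \endcode |
|     | + */ |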
| 529 | + |
| 530 | +/*! |
| 531 | + * \brief Framework-specific function pointers table for DLPack exchange. |
| 532 | + * |
| 533 | + * In addition to `__dlpack__()`, we define a C function table that Python |
| 534 | + * implementations can share via `__c_dlpack_exchange_api__`. |
| 535 | + * This attribute must be set on the type as a Python integer compatible |
| 536 | + * with `PyLong_FromVoidPtr`/`PyLong_AsVoidPtr`. |
| 537 | + * |
| 538 | + * A consumer library may use a pattern such as: |
| 539 | + * |
| 540 | + * \code |
| 541 | + * |
| 542 | + *   PyObject *api_obj = PyObject_GetAttrString( |
| 543 | + *       (PyObject *)Py_TYPE(tensor_obj), "__c_dlpack_exchange_api__"); |
| 544 | + *   if (api_obj == NULL) { goto handle_error; } |
|     | + *   MyDLPackExchangeAPI *api = (MyDLPackExchangeAPI *)PyLong_AsVoidPtr(api_obj); |
|     | + *   Py_DECREF(api_obj); |
|     | + *   if (api == NULL && PyErr_Occurred()) { goto handle_error; } |
| 545 | + * |
| 546 | + * \endcode |
| 547 | + * |
| 548 | + * Note that this must be defined on the type. The consumer should look up the |
| 549 | + * attribute on the type and may cache the result for each unique type. |
| 550 | + * |
| 551 | + * A framework can create and populate the API table as follows (C++): |
| 552 | + * \code |
| 553 | + * struct MyDLPackExchangeAPI : public DLPackExchangeAPI { |
| 554 | + * MyDLPackExchangeAPI() { |
| 555 | + * header.version.major = DLPACK_MAJOR_VERSION; |
| 556 | + * header.version.minor = DLPACK_MINOR_VERSION; |
| 557 | + * header.prev_api = nullptr; |
| 558 | + * |
| 559 | + * managed_tensor_allocator = MyDLPackManagedTensorAllocator; |
| 560 | + * managed_tensor_from_py_object_no_sync = MyDLPackManagedTensorFromPyObjectNoSync; |
| 561 | + * managed_tensor_to_py_object_no_sync = MyDLPackManagedTensorToPyObjectNoSync; |
| 562 | + * dltensor_from_py_object_no_sync = MyDLPackDLTensorFromPyObjectNoSync; |
| 563 | + * current_work_stream = MyDLPackCurrentWorkStream; |
| 564 | + * } |
| 565 | + * |
| 566 | + * static const DLPackExchangeAPI* Global() { |
| 567 | + * static MyDLPackExchangeAPI inst; |
| 568 | + * return &inst; |
| 569 | + * } |
| 570 | + * }; |
| 571 | + * \endcode |
| 572 | + * |
| 573 | + * Guidelines for leveraging DLPackExchangeAPI: |
| 574 | + * |
| 575 | + * There are generally two kinds of consumer needs for DLPack exchange: |
| 576 | + * - N0: library support, where consumer.kernel(x, y, z) wants to run a kernel |
| 577 | + *   on the data from x, y, z. The consumer is also expected to run the kernel |
| 578 | + *   in the same stream context as the producer. For example, when x, y, z are |
| 579 | + *   torch.Tensor, the consumer should query exchange_api->current_work_stream |
| 580 | + *   to get the current stream and launch the kernel on that stream. This setup |
| 581 | + *   avoids synchronization on kernel launch and maximizes compatibility with |
| 582 | + *   CUDA graph capture in the producer. This is the desirable behavior for |
| 583 | + *   library extension support in frameworks such as PyTorch. |
| 584 | + * - N1: data ingestion and retention. |
| 585 | + * |
| 586 | + * Note that the obj.__dlpack__() API already provides suitable means for N1. |
| 587 | + * The primary focus of the current DLPackExchangeAPI is to enable faster |
| 588 | + * exchange for N0, with the support of the current_work_stream function pointer. |
| 589 | + * |
| 590 | + * Array/Tensor libraries should statically create and initialize this structure, |
| 591 | + * then expose a pointer to the DLPackExchangeAPI as an integer attribute on the |
| 592 | + * Tensor/Array type. The DLPackExchangeAPI* must stay alive throughout the lifetime of the process. |
| 593 | + * |
| 594 | + * One simple way to do so is to create a static instance of DLPackExchangeAPI |
| 595 | + * within the framework and return a pointer to it, as the C++ example |
| 596 | + * above shows. It should also be reasonably easy to do the same in other |
| 597 | + * languages. |
| 598 | + */ |
| 599 | +typedef struct DLPackExchangeAPI { |
| 600 | + /*! |
| 601 | + * \brief The header that remains stable across versions. |
| 602 | + */ |
| 603 | + DLPackExchangeAPIHeader header; |
| 604 | + /*! |
| 605 | + * \brief Producer function pointer for DLPackManagedTensorAllocator |
| 606 | + * This function must not be NULL. |
| 607 | + * \sa DLPackManagedTensorAllocator |
| 608 | + */ |
| 609 | + DLPackManagedTensorAllocator managed_tensor_allocator; |
| 610 | + /*! |
| 611 | + * \brief Producer function pointer for DLPackManagedTensorFromPyObjectNoSync. |
| 612 | + *     This function must not be NULL. |
| 613 | + * \sa DLPackManagedTensorFromPyObjectNoSync |
| 614 | + */ |
| 615 | + DLPackManagedTensorFromPyObjectNoSync managed_tensor_from_py_object_no_sync; |
| 616 | + /*! |
| 617 | + * \brief Producer function pointer for DLPackManagedTensorToPyObjectNoSync. |
| 618 | + *     This function must not be NULL. |
| 619 | + * \sa DLPackManagedTensorToPyObjectNoSync |
| 620 | + */ |
| 621 | + DLPackManagedTensorToPyObjectNoSync managed_tensor_to_py_object_no_sync; |
| 622 | + /*! |
| 623 | + * \brief Producer function pointer for DLPackDLTensorFromPyObjectNoSync. |
| 624 | + *     This function can be NULL if the producer does not support it. |
| 625 | + * \sa DLPackDLTensorFromPyObjectNoSync |
| 626 | + */ |
| 627 | + DLPackDLTensorFromPyObjectNoSync dltensor_from_py_object_no_sync; |
| 628 | + /*! |
| 629 | + * \brief Producer function pointer for DLPackCurrentWorkStream. |
| 630 | + *     This function must not be NULL. |
| 631 | + * \sa DLPackCurrentWorkStream |
| 632 | + */ |
| 633 | + DLPackCurrentWorkStream current_work_stream; |
| 634 | +} DLPackExchangeAPI; |
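|     | + |
|     | +/* |
|     | + * A hypothetical end-to-end sketch of the N0 flow described above (`api`, |
|     | + * `obj`, and `MyLaunchKernel` are assumptions): import without |
|     | + * synchronization, query the producer's current work stream, and launch on |
|     | + * that stream. |
|     | + * |
|     | + * \code |
|     | + * DLManagedTensorVersioned* nd = NULL; |
|     | + * if (api->managed_tensor_from_py_object_no_sync(obj, &nd) != 0) return NULL; |
|     | + * void* stream = NULL; |
|     | + * if (api->current_work_stream(nd->dl_tensor.device.device_type, |
|     | + *                              nd->dl_tensor.device.device_id, &stream) != 0) { |
|     | + *   if (nd->deleter != NULL) nd->deleter(nd); |
|     | + *   return NULL; |
|     | + * } |
|     | + * MyLaunchKernel(&nd->dl_tensor, stream);  // enqueue on the producer's stream |
|     | + * if (nd->deleter != NULL) nd->deleter(nd);  // release; lifetime across |
|     | + *                                            // async work is the consumer's |
|     | + *                                            // responsibility |
|     | + * \endcode |
|     | + */ |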
376 | 635 |
|
377 | 636 | #ifdef __cplusplus |
378 | 637 | } // DLPACK_EXTERN_C |
|