@@ -362,98 +362,79 @@ typedef struct DLManagedTensorVersioned {
362362 DLTensor dl_tensor;
363363} DLManagedTensorVersioned;
364364
365- // --------------------------------------------------------------------
366- // DLPack C functions for speed exchange
367- // --------------------------------------------------------------------
365+ // ----------------------------------------------------------------------
366+ // DLPack `__c_dlpack_exchange_api__` fast exchange protocol definitions
367+ // ----------------------------------------------------------------------
368368/*!
369- * \brief A generic C-style allocator that exposes allocation of a Tensor/Array .
369+ * \brief Request a producer library to create a new tensor.
370370 *
371- * This information can then be used to set allocators of a callee to run allocations.
372- * This information can then be used to set the callee's allocator to perform allocations.
373- * This function can be exposed by the framework through the DLPackExchangeAPI.
371+ * Create a new `DLManagedTensorVersioned` within the context of the producer
372+ * library. The allocation is defined via the prototype DLTensor.
374373 *
375- * This particular function does not assume a Python environment; as a result,
376- * the error handling mechanism is different from Python-related functions.
374+ * This function is exposed by the framework through the DLPackExchangeAPI.
377375 *
378- * \param prototype The prototype DLTensor to offer details about the device and shape.
379- * Other field information will be ignored during allocation .
376+ * \param prototype The prototype DLTensor. Only the dtype, ndim, shape,
377+ * and device fields are used.
380378 * \param out The output DLManagedTensorVersioned.
381- * \param error_ctx The context to set the error .
379+ * \param error_ctx Context for `SetError`.
382380 * \param SetError The function to set the error.
383- * \return 0 on success, -1 on failure.
384- * The callee should call SetError(error_ctx, kind, message) to set the error kind and message.
385- * \note Error propagation via SetError.
381+ * \return 0 on success, -1 on failure. On failure, the implementor must
382+ * call SetError(error_ctx, kind, message) exactly once; SetError must
383+ * not be called on success.
384+ * \note - As a C function, this must not throw C++ exceptions.
385+ * - Errors are propagated via SetError to avoid any direct dependency
386+ * on the Python API. Because of this, `SetError` may have to ensure the
387+ * GIL is held, since it will presumably set a Python error.
386388 *
387389 * \sa DLPackExchangeAPI
388390 */
389- typedef int (*DLPackManagedTensorAllocator)( //
390- DLTensor* prototype, DLManagedTensorVersioned** out, void * error_ctx, //
391- void (*SetError)(void * error_ctx, const char * kind, const char * message) //
391+ typedef int (*DLPackManagedTensorAllocator)( //
392+ DLTensor* prototype, DLManagedTensorVersioned** out, void * error_ctx, //
393+ void (*SetError)(void * error_ctx, const char * kind, const char * message) //
392394);
393395
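+/*
+ * Illustrative consumer-side sketch (not part of the header): allocating a
+ * new tensor via `managed_tensor_allocator`. The error callback and context
+ * (`MySetError`, `my_error_ctx`) are hypothetical consumer-side code.
+ * \code
+ * static void MySetError(void* error_ctx, const char* kind, const char* message) {
+ *   // Record kind/message, e.g. to raise a Python exception later.
+ * }
+ *
+ * void* my_error_ctx = NULL;  // consumer-defined error context
+ * int64_t shape[2] = {16, 16};
+ * DLTensor prototype = {0};
+ * prototype.device = (DLDevice){kDLCPU, 0};
+ * prototype.dtype = (DLDataType){kDLFloat, 32, 1};
+ * prototype.ndim = 2;
+ * prototype.shape = shape;
+ * DLManagedTensorVersioned* tensor = NULL;
+ * if (api->managed_tensor_allocator(&prototype, &tensor,
+ *                                   my_error_ctx, MySetError) != 0) {
+ *   // Allocation failed; MySetError was called exactly once.
+ * }
+ * \endcode
+ */
+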
394396/*!
395397 * \brief Exports a PyObject* Tensor/NDArray to a DLManagedTensorVersioned.
396398 *
397- * This function is a C-style function pointer to quickly convert a PyObject* Tensor/NDArray
398- * to a DLManagedTensorVersioned without going through the Python interpreter.
399- *
400399 * This function does not perform any stream synchronization. The consumer should query
401400 * DLPackCurrentWorkStream to get the current work stream and launch kernels on it.
402401 *
403402 * This function is exposed by the framework through the DLPackExchangeAPI.
404403 *
405- * This information can then be picked up by importers and libraries to perform a fast conversion.
406- * This function should not throw any exceptions; if it fails, it should return -1 and
407- * set the error message via PyErr_SetXXX.
408- *
409- * \param py_object The Python object to convert; this should be PyObject*.
410- * We use void* to avoid dependency on Python.h.
411- *
412- * \param out The output DLManagedTensorVersioned.
413- * \return 0 on success, -1 on failure. PyError should be set if -1 is returned.
414- * \note We use void* to avoid dependency on Python.h, so this specific type is
415- * not dependent on Python.h and can be copied to dlpack.h.
404+ * \param py_object The Python object to convert. Must have the same type
405+ * as the one the `DLPackExchangeAPI` was discovered from.
406+ * \param out The output DLManagedTensorVersioned (ownership moves to the consumer).
407+ * \return 0 on success, -1 on failure with a Python exception set. If the data
408+ * cannot be described using DLPack, this should be a BufferError if possible.
409+ * \note As a C function, this must not throw C++ exceptions.
416410 *
417411 * \sa DLPackExchangeAPI, DLPackCurrentWorkStream
418412 */
419- typedef int (*DLPackManagedTensorFromPyObjectNoSync)( //
420- void * py_object, //
421- DLManagedTensorVersioned** out //
413+ typedef int (*DLPackManagedTensorFromPyObjectNoSync)( //
414+ void * py_object, DLManagedTensorVersioned** out //
422415);
423416
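+/*
+ * Illustrative consumer-side sketch (not part of the header): importing a
+ * Python tensor object through `managed_tensor_from_py_object_no_sync`.
+ * `obj` is a hypothetical PyObject*.
+ * \code
+ * DLManagedTensorVersioned* tensor = NULL;
+ * if (api->managed_tensor_from_py_object_no_sync(obj, &tensor) != 0) {
+ *   // A Python exception is set; propagate it to the caller.
+ * }
+ * // ... consume tensor->dl_tensor, then release ownership:
+ * if (tensor->deleter != NULL) tensor->deleter(tensor);
+ * \endcode
+ */
+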
424417/*!
425- * \brief Exports a PyObject* Tensor/NDArray to a DLTensor whose space is pre-allocated on stack .
418+ * \brief Exports a PyObject* Tensor/NDArray to a caller-provided DLTensor.
426419 *
427- * This function is a C-style function pointer to quickly convert a PyObject* Tensor/NDArray
428- * to a DLTensor whose space is pre-allocated on stack without going through the Python interpreter.
420+ * This function provides a faster interface for temporary, non-owning exchange.
421+ * The producer (implementor) retains ownership of the data, strides, and shape
422+ * memory. The DLTensor and the data it views are only guaranteed to stay alive
423+ * until the consumer returns control to the producer.
429424 *
430- * This is an non-owning conversion, the producer still owns the memory of data, strides, shape.
431- * The liveness of DLTensor is only guaranteed until the consumer returns control to the caller.
432- *
433- * In the context of this function, we expect the producer to allocated space for data, strides and shape.
425+ * This function currently assumes that the producer (implementor) can fill
426+ * in the DLTensor shape and strides without the need for temporary allocations.
434427 *
435428 * This function does not perform any stream synchronization. The consumer should query
436429 * DLPackCurrentWorkStream to get the current work stream and launch kernels on it.
437430 *
438- * This function is useful when the consumer do not need to retain the tensor memory.
439- * It generally can provide about 2x faster conversion than DLPackManagedTensorFromPyObjectNoSync.
440- *
441- * For cases where consumer may needs to reorganize the tensor memory via temporary managed copy,
442- * DLPackManagedTensorFromPyObjectNoSync should be used.
443- *
444431 * This function is exposed by the framework through the DLPackExchangeAPI.
445432 *
446- * This information can then be picked up by importers and libraries to perform a fast conversion.
447- * This function should not throw any exceptions; if it fails, it should return -1 and
448- * set the error message via PyErr_SetXXX.
449- *
450- * \param py_object The Python object to convert; this should be PyObject*.
451- * We use void* to avoid dependency on Python.h.
452- *
433+ * \param py_object The Python object to convert. Must have the same type
434+ * as the one the `DLPackExchangeAPI` was discovered from.
453435 * \param out The output DLTensor, whose space is pre-allocated on the stack.
454- * \return 0 on success, -1 on failure. PyError should be set if -1 is returned.
455- * \note We use void* to avoid dependency on Python.h, so this specific type is
456- * not dependent on Python.h and can be copied to dlpack.h.
436+ * \return 0 on success, -1 on failure with a Python exception set.
437+ * \note As a C function, this must not throw C++ exceptions.
457438 *
458439 * \sa DLPackExchangeAPI, DLPackCurrentWorkStream
459440 */
@@ -465,21 +446,18 @@ typedef int (*DLPackDLTensorFromPyObjectNoSync)( //
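+/*
+ * Illustrative consumer-side sketch (not part of the header): a temporary,
+ * non-owning view via `dltensor_from_py_object_no_sync`. The DLTensor lives on
+ * the consumer's stack; `obj` is a hypothetical PyObject*.
+ * \code
+ * DLTensor view;  // producer fills all fields; shape/strides stay producer-owned
+ * if (api->dltensor_from_py_object_no_sync(obj, &view) != 0) {
+ *   // A Python exception is set; propagate it to the caller.
+ * }
+ * // `view` and the data it describes are only valid until control returns
+ * // to the producer; do not retain these pointers.
+ * \endcode
+ */
+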
465446/*!
466447 * \brief Obtain the current work stream of a device.
467448 *
468- * This function is a C-style function pointer to obtain the current work stream
469- * of a device for frameworks that rely on a context manager to manage the stream.
449+ * Obtain the current work stream of a device from the producer framework.
470450 * For example, it should map to torch.cuda.current_stream in PyTorch.
471451 *
472- * This function can be set to NULL if the framework does not rely on a context manager
473- * to manage the stream. However, we encourage frameworks to provide this function
474- * if possible.
475- *
476- * As if this field is not set, likely consumer cannot safely do stream based
477- * exchange based on the
478- *
479452 * \param device_type The device type.
480453 * \param device_id The device id.
481454 * \param out_current_stream The output current work stream.
482- * \return 0 on success, -1 on failure.
455+ * The producer can set this to (void*)-1 to indicate that
456+ * no stream is available; in that case the consumer should
457+ * not perform stream synchronization.
458+ *
459+ * \return 0 on success, -1 on failure with a Python exception set.
460+ * \note As a C function, this must not throw C++ exceptions.
483461 *
484462 * \sa DLPackExchangeAPI
485463 */
@@ -492,54 +470,43 @@ typedef int (*DLPackCurrentWorkStream)( //
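+/*
+ * Illustrative consumer-side sketch (not part of the header): querying the
+ * producer's current work stream before launching a kernel. `device_id` is a
+ * hypothetical consumer variable.
+ * \code
+ * void* stream = NULL;
+ * if (api->current_work_stream(kDLCUDA, device_id, &stream) != 0) {
+ *   // A Python exception is set; propagate it to the caller.
+ * }
+ * if (stream != (void*)-1) {
+ *   // Launch kernels on `stream` to stay ordered with the producer.
+ * } else {
+ *   // No stream available; skip stream-based synchronization.
+ * }
+ * \endcode
+ */
+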
492470/*!
493471 * \brief Imports a DLManagedTensorVersioned to a PyObject* Tensor/NDArray.
494472 *
495- * This function is a C-style function pointer to quickly convert a DLManagedTensorVersioned
496- * to a PyObject* without going through the Python Interpreter .
473+ * Convert an owning DLManagedTensorVersioned* to a Python tensor object of
474+ * the correct type for the producer (implementor) library.
497475 *
498476 * This function does not perform any stream synchronization.
499477 *
500478 * This function is exposed by the framework through the DLPackExchangeAPI.
501479 *
502- * \param tensor The DLManagedTensorVersioned to convert.
480+ * \param tensor The DLManagedTensorVersioned to convert; ownership of
481+ * the data is stolen.
503482 * \param out_py_object The output Python object.
504- * \return 0 on success, -1 on failure. PyError should be set if -1 is returned.
505- * \note We use void* to avoid dependency on Python.h, so this specific type is
506- * not dependent on Python.h and can be copied to dlpack.h.
483+ * \return 0 on success, -1 on failure with a Python exception set.
507484 *
508485 * \sa DLPackExchangeAPI
509486 */
510- typedef int (*DLPackManagedTensorToPyObjectNoSync)( //
511- DLManagedTensorVersioned* tensor, void ** out_py_object //
487+ typedef int (*DLPackManagedTensorToPyObjectNoSync)( //
488+ DLManagedTensorVersioned* tensor, //
489+ void ** out_py_object //
512490);
513491
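+/*
+ * Illustrative consumer-side sketch (not part of the header): handing a
+ * result back to Python via `managed_tensor_to_py_object_no_sync`. Here
+ * `tensor` is an owning DLManagedTensorVersioned*, e.g. from the allocator.
+ * \code
+ * void* py_result = NULL;
+ * if (api->managed_tensor_to_py_object_no_sync(tensor, &py_result) != 0) {
+ *   // A Python exception is set; propagate it to the caller.
+ * }
+ * // On success, ownership of `tensor` has been stolen and `py_result`
+ * // is the resulting PyObject*.
+ * \endcode
+ */
+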
514492/*!
515493 * \brief Framework-specific function pointers table for DLPack exchange.
516494 *
517- * Guidelines for leveraging DLPackExchangeAPI:
495+ * In addition to `__dlpack__()`, we define a C function table sharable by
496+ * Python implementations via `__c_dlpack_exchange_api__`.
497+ * This attribute must be set on the type as a Python integer compatible
498+ * with `PyLong_FromVoidPtr`/`PyLong_AsVoidPtr`.
518499 *
519- * There are generally two kinds of consumer needs for DLPack exchange:
520- * - N0: library support, where consumer.kernel(x, y, z) would like to run a kernel
521- * with the data from x, y, z. The consumer is also expected to run the kernel with the same
522- * stream context as the producer. For example, when x, y, z is torch.Tensor,
523- * consumer should query exchange_api->current_work_stream to get the
524- * current stream and launch the kernel with the same stream.
525- * This setup is necessary for no synchronization in kernel launch and maximum compatibility
526- * with CUDA graph capture in the producer.
527- * This is the desirable behavior for library extension support for frameworks like PyTorch.
528- * - N1: data ingestion and retention
529- *
530- * Note that obj.__dlpack__() API should provide useful ways for N1.
531- * The primary focus of the current DLPackExchangeAPI is to enable faster exchange N0
532- * with the support of the function pointer current_work_stream.
533- *
534- * Array/Tensor libraries should statically create and initialize this structure
535- * then return a pointer to DLPackExchangeAPI as an int value in Tensor/Array.
536- * The DLPackExchangeAPI* should stay alive throughout the lifetime of the process.
537- *
538- * One simple way to do so is to create a static instance of DLPackExchangeAPI
539- * within the framework and return a pointer to it. The following code
540- * shows an example to do so in C++. It should also be reasonably easy
541- * to do so in other languages.
500+ * A consumer library may use a pattern such as:
501+ * \code
502+ * PyObject* api_obj = PyObject_GetAttrString(
503+ *     (PyObject*)Py_TYPE(tensor_obj), "__c_dlpack_exchange_api__");
504+ * if (api_obj == NULL) { goto handle_error; }
505+ * DLPackExchangeAPI* api = (DLPackExchangeAPI*)PyLong_AsVoidPtr(api_obj);
506+ * Py_DECREF(api_obj);
507+ * if (api == NULL && PyErr_Occurred()) { goto handle_error; }
508+ * \endcode
506+ * Note that this must be defined on the type. The consumer should look up the
507+ * attribute on the type and may cache the result for each unique type.
542508 *
509+ * A producer can define and statically initialize the API table as follows:
543510 * \code
544511 * struct MyDLPackExchangeAPI : public DLPackExchangeAPI {
545512 * MyDLPackExchangeAPI() {
@@ -560,55 +527,75 @@ typedef int (*DLPackManagedTensorToPyObjectNoSync)( //
560527 * };
561528 * \endcode
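+ *
+ * A producer might then register the table on its tensor type (an
+ * illustrative sketch; `my_tensor_type` is a hypothetical heap type
+ * created with PyType_FromSpec):
+ * \code
+ * static MyDLPackExchangeAPI my_api;  // must live for the whole process
+ * PyObject* api_int = PyLong_FromVoidPtr(&my_api);
+ * if (api_int == NULL ||
+ *     PyObject_SetAttrString((PyObject*)my_tensor_type,
+ *                            "__c_dlpack_exchange_api__", api_int) != 0) {
+ *   // handle error
+ * }
+ * Py_XDECREF(api_int);
+ * \endcode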
562529 *
563- * Each framework should attach a dunder `__c_dlpack_exchange_api__` integer
564- * to point to the DLPackExchangeAPI* pointer.
530+ * Guidelines for leveraging DLPackExchangeAPI:
565531 *
566- * Importantly, the attribute should be attached to the class of the Tensor, not the instance.
532+ * There are generally two kinds of consumer needs for DLPack exchange:
533+ * - N0: library support, where consumer.kernel(x, y, z) wants to run a kernel
534+ * on the data from x, y, z. The consumer is expected to run the kernel in the
535+ * same stream context as the producer. For example, when x, y, z are
536+ * torch.Tensor, the consumer should query exchange_api->current_work_stream
537+ * to get the current stream and launch the kernel on that stream.
538+ * This setup avoids synchronization at kernel launch and gives maximum
539+ * compatibility with CUDA graph capture in the producer.
540+ * This is the desirable behavior for library extension support in frameworks
541+ * like PyTorch.
542+ * - N1: data ingestion and retention, where the consumer retains the data
543+ * beyond the call.
567542 *
568- * mypackage.Tensor.__c_dlpack_exchange_api__ = MyPackageDLPackExchangeAPI
543+ * Note that the obj.__dlpack__() API should provide a suitable path for N1.
544+ * The primary focus of the current DLPackExchangeAPI is to enable faster
545+ * exchange for N0, with the support of the current_work_stream function pointer.
569546 *
570- * or equivalently:
547+ * Array/Tensor libraries should statically create and initialize this
548+ * structure, then expose a pointer to the DLPackExchangeAPI as an int
549+ * attribute on the Tensor/Array type.
550+ * The DLPackExchangeAPI* must stay alive throughout the lifetime of the process.
571550 *
572- * type(tensor_obj).__c_dlpack_exchange_api__ = MyPackageDLPackExchangeAPI
551+ * One simple way to do so is to create a static instance of DLPackExchangeAPI
552+ * within the framework and expose a pointer to it, as the MyDLPackExchangeAPI
553+ * example above shows in C++. It should be reasonably easy to do the same in
554+ * other languages.
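+ *
+ * A consumer may check major-version compatibility and, if needed, walk the
+ * prev_version_api chain (an illustrative sketch):
+ * \code
+ * while (api != NULL && api->version.major > DLPACK_MAJOR_VERSION) {
+ *   api = api->prev_version_api;
+ * }
+ * if (api == NULL) {
+ *   // No compatible version exposed; fall back to obj.__dlpack__().
+ * }
+ * \endcode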
573555 */
574556struct DLPackExchangeAPI {
575557 /*!
576- * \brief The current DLPack version.
558+ * \brief The DLPack version provided. The consumer must check major
559+ * version compatibility before using this struct.
577560 */
578561 DLPackVersion version;
579562 /*!
580563 * \brief Optional pointer to an older DLPackExchangeAPI in the chain.
581564 *
582- * It should be set to NULL if the framework does not support older versions.
565+ * It must be NULL if the framework does not support older versions.
566+ * If the current major version is larger than the one supported by the
567+ * consumer, the consumer may walk this chain to find an earlier supported version.
583568 *
584569 * \sa DLPackExchangeAPI
585570 */
586571 struct DLPackExchangeAPI * prev_version_api;
587572 /*!
588- * \brief Framework-specific function pointer for DLPackManagedTensorAllocator
573+ * \brief Producer function pointer for DLPackManagedTensorAllocator.
574+ * This function must not be NULL.
589575 * \sa DLPackManagedTensorAllocator
590576 */
591577 DLPackManagedTensorAllocator managed_tensor_allocator;
592578 /*!
593- * \brief Framework-specific function pointer for DLPackManagedTensorFromPyObject
579+ * \brief Producer function pointer for DLPackManagedTensorFromPyObjectNoSync.
580+ * This function must not be NULL.
594581 * \sa DLPackManagedTensorFromPyObjectNoSync
595582 */
596583 DLPackManagedTensorFromPyObjectNoSync managed_tensor_from_py_object_no_sync;
597584 /*!
598- * \brief Framework-specific function pointer for DLPackManagedTensorToPyObject
585+ * \brief Producer function pointer for DLPackManagedTensorToPyObjectNoSync.
586+ * This function must not be NULL.
599587 * \sa DLPackManagedTensorToPyObjectNoSync
600588 */
601589 DLPackManagedTensorToPyObjectNoSync managed_tensor_to_py_object_no_sync;
602590 /*!
603- * \brief Framework-specific function pointer for DLPackDLTensorFromPyObject
591+ * \brief Producer function pointer for DLPackDLTensorFromPyObjectNoSync.
592+ * This function may be NULL if the producer does not support it.
604593 * \sa DLPackDLTensorFromPyObjectNoSync
605594 */
606595 DLPackDLTensorFromPyObjectNoSync dltensor_from_py_object_no_sync;
607596 /*!
608- * \brief Framework-specific function pointer for DLPackCurrentWorkStream
609- *
610- * This function can be set to NULL if the framework does not rely on context manager to manage the stream.
611- *
597+ * \brief Producer function pointer for DLPackCurrentWorkStream.
598+ * This function must not be NULL.
612599 * \sa DLPackCurrentWorkStream
613600 */
614601 DLPackCurrentWorkStream current_work_stream;