|
1 | 1 | /*! |
2 | 2 |  * Copyright (c) 2017 by Contributors |
3 | 3 | * \file dlpack.h |
4 | 4 | * \brief The common header of DLPack. |
5 | 5 | */ |
@@ -338,7 +338,7 @@ typedef struct DLManagedTensor { |
338 | 338 | * |
339 | 339 | * \note This is the current standard DLPack exchange data structure. |
340 | 340 | */ |
341 | | -struct DLManagedTensorVersioned { |
| 341 | +typedef struct DLManagedTensorVersioned { |
342 | 342 | /*! |
343 | 343 | * \brief The API and ABI version of the current managed Tensor |
344 | 344 | */ |
@@ -372,7 +372,266 @@ struct DLManagedTensorVersioned { |
372 | 372 | uint64_t flags; |
373 | 373 | /*! \brief DLTensor which is being memory managed */ |
374 | 374 | DLTensor dl_tensor; |
375 | | -}; |
| 375 | +} DLManagedTensorVersioned; |
| 376 | + |
| 377 | +//---------------------------------------------------------------------- |
| 378 | +// DLPack `__c_dlpack_exchange_api__` fast exchange protocol definitions |
| 379 | +//---------------------------------------------------------------------- |
| 380 | +/*! |
| 381 | + * \brief Request a producer library to create a new tensor. |
| 382 | + * |
| 383 | + * Create a new `DLManagedTensorVersioned` within the context of the producer |
| 384 | + * library. The allocation is defined via the prototype DLTensor. |
| 385 | + * |
| 386 | + * This function is exposed by the framework through the DLPackExchangeAPI. |
| 387 | + * |
| 388 | + * \param prototype The prototype DLTensor. Only the dtype, ndim, shape, |
| 389 | + * and device fields are used. |
| 390 | + * \param out The output DLManagedTensorVersioned. |
| 391 | + * \param error_ctx Context for `SetError`. |
| 392 | + * \param SetError The function to set the error. |
| 393 | + * \return 0 on success, -1 on failure; on success the owning |
| 394 | + *     DLManagedTensorVersioned* is stored in `out`. SetError is called |
| 395 | + *     exactly when -1 is returned (the implementor must ensure this). |
| 396 | + * \note - As a C function, must not throw C++ exceptions. |
| 397 | + *       - Errors are propagated via SetError to avoid any direct dependency |
| 398 | + *         on the Python API. Because of this, `SetError` may have to ensure |
| 399 | + *         the GIL is held, since it will presumably set a Python error. |
| 400 | + * |
| 401 | + * \sa DLPackExchangeAPI |
| 402 | + */ |
| 403 | +typedef int (*DLPackManagedTensorAllocator)( // |
| 404 | + DLTensor* prototype, DLManagedTensorVersioned** out, void* error_ctx, // |
| 405 | + void (*SetError)(void* error_ctx, const char* kind, const char* message) // |
| 406 | +); |
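|     | + |
|     | +/* |
|     | + * For illustration only: a minimal sketch of how a CPU-only producer might |
|     | + * implement DLPackManagedTensorAllocator. The names `MyAllocator` and |
|     | + * `MyDeleter`, and the compact row-major malloc-based layout, are assumptions |
|     | + * of this sketch, not part of the protocol. It illustrates the contract that |
|     | + * SetError is called exactly once, and only on the failure path. |
|     | + * |
|     | + * \code |
|     | + * // assumes <stdlib.h> and <string.h>; error handling is simplified |
|     | + * static void MyDeleter(DLManagedTensorVersioned* self) { |
|     | + *   free(self->dl_tensor.shape); |
|     | + *   free(self->dl_tensor.data); |
|     | + *   free(self); |
|     | + * } |
|     | + * |
|     | + * static int MyAllocator(DLTensor* prototype, DLManagedTensorVersioned** out, |
|     | + *                        void* error_ctx, |
|     | + *                        void (*SetError)(void*, const char*, const char*)) { |
|     | + *   // nbytes = itemsize * numel, assuming a compact row-major layout |
|     | + *   size_t nbytes = (size_t)(prototype->dtype.bits * prototype->dtype.lanes + 7) / 8; |
|     | + *   for (int i = 0; i < prototype->ndim; ++i) nbytes *= (size_t)prototype->shape[i]; |
|     | + *   DLManagedTensorVersioned* ret = calloc(1, sizeof(*ret)); |
|     | + *   void* data = malloc(nbytes); |
|     | + *   int64_t* shape = malloc(sizeof(int64_t) * (size_t)prototype->ndim); |
|     | + *   if (ret == NULL || data == NULL || shape == NULL) { |
|     | + *     free(shape); free(data); free(ret); |
|     | + *     SetError(error_ctx, "MemoryError", "allocation failed");  // exactly once |
|     | + *     return -1; |
|     | + *   } |
|     | + *   memcpy(shape, prototype->shape, sizeof(int64_t) * (size_t)prototype->ndim); |
|     | + *   ret->version.major = DLPACK_MAJOR_VERSION; |
|     | + *   ret->version.minor = DLPACK_MINOR_VERSION; |
|     | + *   ret->deleter = MyDeleter; |
|     | + *   ret->dl_tensor.data = data; |
|     | + *   ret->dl_tensor.device = prototype->device; |
|     | + *   ret->dl_tensor.dtype = prototype->dtype; |
|     | + *   ret->dl_tensor.ndim = prototype->ndim; |
|     | + *   ret->dl_tensor.shape = shape; |
|     | + *   ret->dl_tensor.strides = NULL;  // NULL means compact row-major |
|     | + *   *out = ret; |
|     | + *   return 0; |
|     | + * } |
|     | + * \endcode |
|     | + */ |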
| 407 | + |
| 408 | +/*! |
| 409 | + * \brief Exports a PyObject* Tensor/NDArray to a DLManagedTensorVersioned. |
| 410 | + * |
| 411 | + * This function does not perform any stream synchronization. The consumer should query |
| 412 | + * DLPackCurrentWorkStream to get the current work stream and launch kernels on it. |
| 413 | + * |
| 414 | + * This function is exposed by the framework through the DLPackExchangeAPI. |
| 415 | + * |
| 416 | + * \param py_object The Python object to convert. Must have the same type |
| 417 | + * as the one the `DLPackExchangeAPI` was discovered from. |
| 418 | + * \param out The output DLManagedTensorVersioned. |
| 419 | + * \return 0 on success, -1 on failure with a Python exception set. If the data |
| 420 | + *     cannot be described using DLPack this should be a BufferError if possible. |
| 421 | + * \note - As a C function, must not throw C++ exceptions. |
| 422 | + * |
| 423 | + * \sa DLPackExchangeAPI, DLPackCurrentWorkStream |
| 424 | + */ |
| 425 | +typedef int (*DLPackManagedTensorFromPyObjectNoSync)( // |
| 426 | + void* py_object, // |
| 427 | + DLManagedTensorVersioned** out // |
| 428 | +); |
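|     | + |
|     | +/* |
|     | + * A hypothetical consumer-side sketch (`api` and `obj` are assumed to be a |
|     | + * DLPackExchangeAPI* and a producer tensor PyObject*): import an owning |
|     | + * DLManagedTensorVersioned and release it through its deleter when done. |
|     | + * |
|     | + * \code |
|     | + * DLManagedTensorVersioned* nd = NULL; |
|     | + * if (api->managed_tensor_from_py_object_no_sync(obj, &nd) != 0) { |
|     | + *   return NULL;  // a Python exception is already set |
|     | + * } |
|     | + * // ... use nd->dl_tensor, launching work on the producer's current |
|     | + * // work stream (see DLPackCurrentWorkStream below) ... |
|     | + * if (nd->deleter != NULL) nd->deleter(nd); |
|     | + * \endcode |
|     | + */ |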
| 429 | + |
| 430 | +/*! |
| 431 | + * \brief Exports a PyObject* Tensor/NDArray to a provided DLTensor. |
| 432 | + * |
| 433 | + * This function provides a faster interface for temporary, non-owning exchange. |
| 434 | + * The producer (implementor) retains ownership of the data, strides, and shape |
| 435 | + * memory. The DLTensor and the data it views are only guaranteed to remain |
| 436 | + * valid until control is returned. |
| 437 | + * |
| 438 | + * This function currently assumes that the producer (implementor) can fill |
| 439 | + * in the DLTensor shape and strides without the need for temporary allocations. |
| 440 | + * |
| 441 | + * This function does not perform any stream synchronization. The consumer should query |
| 442 | + * DLPackCurrentWorkStream to get the current work stream and launch kernels on it. |
| 443 | + * |
| 444 | + * This function is exposed by the framework through the DLPackExchangeAPI. |
| 445 | + * |
| 446 | + * \param py_object The Python object to convert. Must have the same type |
| 447 | + * as the one the `DLPackExchangeAPI` was discovered from. |
| 448 | + * \param out The output DLTensor, whose space is pre-allocated by the caller (e.g., on the stack). |
| 449 | + * \return 0 on success, -1 on failure with a Python exception set. |
| 450 | + * \note - As a C function, must not throw C++ exceptions. |
| 451 | + * |
| 452 | + * \sa DLPackExchangeAPI, DLPackCurrentWorkStream |
| 453 | + */ |
| 454 | +typedef int (*DLPackDLTensorFromPyObjectNoSync)( // |
| 455 | + void* py_object, // |
| 456 | + DLTensor* out // |
| 457 | +); |
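|     | + |
|     | +/* |
|     | + * A hypothetical consumer-side sketch of the non-owning fast path (`api` and |
|     | + * `obj` are assumptions). The DLTensor lives on the consumer's stack and must |
|     | + * not be retained after control is returned. |
|     | + * |
|     | + * \code |
|     | + * DLTensor view; |
|     | + * if (api->dltensor_from_py_object_no_sync == NULL) { |
|     | + *   // optional entry: fall back to managed_tensor_from_py_object_no_sync |
|     | + * } else if (api->dltensor_from_py_object_no_sync(obj, &view) != 0) { |
|     | + *   return NULL;  // a Python exception is already set |
|     | + * } |
|     | + * // use `view` immediately; do not keep its pointers past this scope |
|     | + * \endcode |
|     | + */ |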
| 458 | + |
| 459 | +/*! |
| 460 | + * \brief Obtain the current work stream of a device. |
| 461 | + * |
| 462 | + * Obtain the current work stream of a device from the producer framework. |
| 463 | + * For example, it should map to torch.cuda.current_stream in PyTorch. |
| 464 | + * |
| 465 | + * When device_type is kDLCPU, the consumer does not have to query the stream, |
| 466 | + * and the producer can simply return NULL when queried. |
| 467 | + * The consumer does not have to do anything about stream synchronization or |
| 468 | + * stream setting, so a CPU-only framework can simply provide a dummy |
| 469 | + * implementation that always sets out_current_stream[0] to NULL. |
| 470 | + * |
| 471 | + * \param device_type The device type. |
| 472 | + * \param device_id The device id. |
| 473 | + * \param out_current_stream The output current work stream. |
| 474 | + * |
| 475 | + * \return 0 on success, -1 on failure with a Python exception set. |
| 476 | + * \note - As a C function, must not throw C++ exceptions. |
| 477 | + * |
| 478 | + * \sa DLPackExchangeAPI |
| 479 | + */ |
| 480 | +typedef int (*DLPackCurrentWorkStream)( // |
| 481 | + DLDeviceType device_type, // |
| 482 | + int32_t device_id, // |
| 483 | + void** out_current_stream // |
| 484 | +); |
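|     | + |
|     | +/* |
|     | + * A minimal sketch of the dummy implementation mentioned above for a |
|     | + * CPU-only framework (`MyCurrentWorkStream` is an assumed name): it always |
|     | + * reports a NULL stream. |
|     | + * |
|     | + * \code |
|     | + * static int MyCurrentWorkStream(DLDeviceType device_type, int32_t device_id, |
|     | + *                                void** out_current_stream) { |
|     | + *   (void)device_type; |
|     | + *   (void)device_id; |
|     | + *   out_current_stream[0] = NULL;  // CPU-only: there is no work stream |
|     | + *   return 0; |
|     | + * } |
|     | + * \endcode |
|     | + */ |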
| 485 | + |
| 486 | +/*! |
| 487 | + * \brief Imports a DLManagedTensorVersioned to a PyObject* Tensor/NDArray. |
| 488 | + * |
| 489 | + * Convert an owning DLManagedTensorVersioned* to a Python tensor of the |
| 490 | + * producer (implementor) library's type. |
| 491 | + * |
| 492 | + * This function does not perform any stream synchronization. |
| 493 | + * |
| 494 | + * This function is exposed by the framework through the DLPackExchangeAPI. |
| 495 | + * |
| 496 | + * \param tensor The DLManagedTensorVersioned to convert; ownership of the |
| 497 | + *     tensor is stolen by the callee. |
| 498 | + * \param out_py_object The output Python object. |
| 499 | + * \return 0 on success, -1 on failure with a Python exception set. |
| 500 | + * |
| 501 | + * \sa DLPackExchangeAPI |
| 502 | + */ |
| 503 | +typedef int (*DLPackManagedTensorToPyObjectNoSync)( // |
| 504 | + DLManagedTensorVersioned* tensor, // |
| 505 | + void** out_py_object // |
| 506 | +); |
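|     | + |
|     | +/* |
|     | + * A hypothetical sketch (`api` is assumed; `nd` is an owning |
|     | + * DLManagedTensorVersioned, e.g. obtained from managed_tensor_allocator): |
|     | + * hand ownership to the producer and receive a Python object back. |
|     | + * |
|     | + * \code |
|     | + * void* py_ret = NULL; |
|     | + * if (api->managed_tensor_to_py_object_no_sync(nd, &py_ret) != 0) { |
|     | + *   return NULL;  // ownership of nd was stolen; a Python exception is set |
|     | + * } |
|     | + * return (PyObject*)py_ret; |
|     | + * \endcode |
|     | + */ |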
| 507 | + |
| 508 | +/*! |
| 509 | + * \brief DLPackExchangeAPI stable header. |
| 510 | + * \sa DLPackExchangeAPI |
| 511 | + */ |
| 512 | +typedef struct DLPackExchangeAPIHeader { |
| 513 | + /*! |
| 514 | + * \brief The DLPack version provided by the producer. The consumer must |
| 515 | + *     check major version compatibility before using this struct. |
| 516 | + */ |
| 517 | + DLPackVersion version; |
| 518 | + /*! |
| 519 | + * \brief Optional pointer to an older DLPackExchangeAPI in the chain. |
| 520 | + * |
| 521 | + * It must be NULL if the framework does not support older versions. |
| 522 | + * If the current major version is larger than the one supported by the |
| 523 | + * consumer, the consumer may walk this chain to find an earlier supported version. |
| 524 | + * |
| 525 | + * \sa DLPackExchangeAPI |
| 526 | + */ |
| 527 | + struct DLPackExchangeAPIHeader* prev_api; |
| 528 | +} DLPackExchangeAPIHeader; |
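|     | + |
|     | +/* |
|     | + * A hypothetical consumer-side sketch of the version negotiation described |
|     | + * above: starting from a discovered header, walk `prev_api` until a table |
|     | + * whose major version the consumer supports is found. |
|     | + * |
|     | + * \code |
|     | + * static const DLPackExchangeAPIHeader* FindCompatible( |
|     | + *     const DLPackExchangeAPIHeader* header) { |
|     | + *   while (header != NULL && header->version.major > DLPACK_MAJOR_VERSION) { |
|     | + *     header = header->prev_api; |
|     | + *   } |
|     | + *   return header;  // NULL if no supported version exists |
|     | + * } |
|     | + * \endcode |
|     | + */ |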
| 529 | + |
| 530 | +/*! |
| 531 | + * \brief Framework-specific function pointers table for DLPack exchange. |
| 532 | + * |
| 533 | + * In addition to `__dlpack__()`, we define a C function table that Python |
| 534 | + * implementations can share via `__c_dlpack_exchange_api__`. |
| 535 | + * This attribute must be set on the type as a Python integer compatible |
| 536 | + * with `PyLong_FromVoidPtr`/`PyLong_AsVoidPtr`. |
| 537 | + * |
| 538 | + * A consumer library may use a pattern such as: |
| 539 | + * |
| 540 | + * \code |
| 541 | + * |
| 542 | + *   PyObject *api_obj = PyObject_GetAttrString( |
| 543 | + *       (PyObject *)Py_TYPE(tensor_obj), "__c_dlpack_exchange_api__"); |
| 544 | + *   if (api_obj == NULL) { goto handle_error; } |
|     | + *   MyDLPackExchangeAPI *api = (MyDLPackExchangeAPI *)PyLong_AsVoidPtr(api_obj); |
|     | + *   Py_DECREF(api_obj); |
|     | + *   if (api == NULL && PyErr_Occurred()) { goto handle_error; } |
| 545 | + * |
| 546 | + * \endcode |
| 547 | + * |
| 548 | + * Note that this must be defined on the type. The consumer should look up the |
| 549 | + * attribute on the type and may cache the result for each unique type. |
| 550 | + * |
| 551 | + * A framework can create and populate the API table as follows (C++): |
| 552 | + * \code |
| 553 | + * struct MyDLPackExchangeAPI : public DLPackExchangeAPI { |
| 554 | + * MyDLPackExchangeAPI() { |
| 555 | + * header.version.major = DLPACK_MAJOR_VERSION; |
| 556 | + * header.version.minor = DLPACK_MINOR_VERSION; |
| 557 | + * header.prev_api = nullptr; |
| 558 | + * |
| 559 | + * managed_tensor_allocator = MyDLPackManagedTensorAllocator; |
| 560 | + * managed_tensor_from_py_object_no_sync = MyDLPackManagedTensorFromPyObjectNoSync; |
| 561 | + * managed_tensor_to_py_object_no_sync = MyDLPackManagedTensorToPyObjectNoSync; |
| 562 | + * dltensor_from_py_object_no_sync = MyDLPackDLTensorFromPyObjectNoSync; |
| 563 | + * current_work_stream = MyDLPackCurrentWorkStream; |
| 564 | + * } |
| 565 | + * |
| 566 | + * static const DLPackExchangeAPI* Global() { |
| 567 | + * static MyDLPackExchangeAPI inst; |
| 568 | + * return &inst; |
| 569 | + * } |
| 570 | + * }; |
| 571 | + * \endcode |
| 572 | + * |
| 573 | + * Guidelines for leveraging DLPackExchangeAPI: |
| 574 | + * |
| 575 | + * There are generally two kinds of consumer needs for DLPack exchange: |
| 576 | + * - N0: library support, where consumer.kernel(x, y, z) wants to run a kernel |
| 577 | + *   on the data from x, y, z. The consumer is also expected to run the kernel |
| 578 | + *   in the same stream context as the producer. For example, when x, y, z are |
| 579 | + *   torch.Tensor, the consumer should query exchange_api->current_work_stream |
| 580 | + *   to get the current stream and launch the kernel on that stream. This setup |
| 581 | + *   avoids synchronization on kernel launch and maximizes compatibility with |
| 582 | + *   CUDA graph capture in the producer. This is the desirable behavior for |
| 583 | + *   library extension support in frameworks such as PyTorch. |
| 584 | + * - N1: data ingestion and retention. |
| 585 | + * |
| 586 | + * Note that the obj.__dlpack__() API already provides suitable means for N1. |
| 587 | + * The primary focus of the current DLPackExchangeAPI is to enable faster |
| 588 | + * exchange for N0, with the support of the current_work_stream function pointer. |
| 589 | + * |
| 590 | + * Array/Tensor libraries should statically create and initialize this structure, |
| 591 | + * then expose a pointer to the DLPackExchangeAPI as an integer attribute on the |
| 592 | + * Tensor/Array type. The DLPackExchangeAPI* must stay alive throughout the lifetime of the process. |
| 593 | + * |
| 594 | + * One simple way to do so is to create a static instance of DLPackExchangeAPI |
| 595 | + * within the framework and return a pointer to it, as the C++ example |
| 596 | + * above shows. It should also be reasonably easy to do the same in other |
| 597 | + * languages. |
| 598 | + */ |
| 599 | +typedef struct DLPackExchangeAPI { |
| 600 | + /*! |
| 601 | + * \brief The header that remains stable across versions. |
| 602 | + */ |
| 603 | + DLPackExchangeAPIHeader header; |
| 604 | + /*! |
| 605 | + * \brief Producer function pointer for DLPackManagedTensorAllocator |
| 606 | + * This function must not be NULL. |
| 607 | + * \sa DLPackManagedTensorAllocator |
| 608 | + */ |
| 609 | + DLPackManagedTensorAllocator managed_tensor_allocator; |
| 610 | + /*! |
| 611 | + * \brief Producer function pointer for DLPackManagedTensorFromPyObjectNoSync. |
| 612 | + *     This function must not be NULL. |
| 613 | + * \sa DLPackManagedTensorFromPyObjectNoSync |
| 614 | + */ |
| 615 | + DLPackManagedTensorFromPyObjectNoSync managed_tensor_from_py_object_no_sync; |
| 616 | + /*! |
| 617 | + * \brief Producer function pointer for DLPackManagedTensorToPyObjectNoSync. |
| 618 | + *     This function must not be NULL. |
| 619 | + * \sa DLPackManagedTensorToPyObjectNoSync |
| 620 | + */ |
| 621 | + DLPackManagedTensorToPyObjectNoSync managed_tensor_to_py_object_no_sync; |
| 622 | + /*! |
| 623 | + * \brief Producer function pointer for DLPackDLTensorFromPyObjectNoSync. |
| 624 | + *     This function can be NULL if the producer does not support it. |
| 625 | + * \sa DLPackDLTensorFromPyObjectNoSync |
| 626 | + */ |
| 627 | + DLPackDLTensorFromPyObjectNoSync dltensor_from_py_object_no_sync; |
| 628 | + /*! |
| 629 | + * \brief Producer function pointer for DLPackCurrentWorkStream. |
| 630 | + *     This function must not be NULL. |
| 631 | + * \sa DLPackCurrentWorkStream |
| 632 | + */ |
| 633 | + DLPackCurrentWorkStream current_work_stream; |
| 634 | +} DLPackExchangeAPI; |
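|     | + |
|     | +/* |
|     | + * A hypothetical end-to-end sketch of the N0 flow described above (`api`, |
|     | + * `obj`, and `MyLaunchKernel` are assumptions): import without |
|     | + * synchronization, query the producer's current work stream, and launch on |
|     | + * that stream. |
|     | + * |
|     | + * \code |
|     | + * DLManagedTensorVersioned* nd = NULL; |
|     | + * if (api->managed_tensor_from_py_object_no_sync(obj, &nd) != 0) return NULL; |
|     | + * void* stream = NULL; |
|     | + * if (api->current_work_stream(nd->dl_tensor.device.device_type, |
|     | + *                              nd->dl_tensor.device.device_id, &stream) != 0) { |
|     | + *   if (nd->deleter != NULL) nd->deleter(nd); |
|     | + *   return NULL; |
|     | + * } |
|     | + * MyLaunchKernel(&nd->dl_tensor, stream);  // enqueue on the producer's stream |
|     | + * if (nd->deleter != NULL) nd->deleter(nd);  // release; lifetime across |
|     | + *                                            // async work is the consumer's |
|     | + *                                            // responsibility |
|     | + * \endcode |
|     | + */ |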
376 | 635 |
|
377 | 636 | #ifdef __cplusplus |
378 | 637 | } // DLPACK_EXTERN_C |
|