From 256a5eee9d19a6df1387937b2e8796f83bb55df1 Mon Sep 17 00:00:00 2001
From: SigureMo
Date: Thu, 2 Oct 2025 06:22:49 +0000
Subject: [PATCH 01/17] [DLPack] Implement C functions exchange API

---
 paddle/fluid/framework/dlpack_tensor.cc  |  4 +-
 paddle/fluid/framework/dlpack_tensor.h   | 14 ++--
 paddle/fluid/pybind/pybind.cc            | 70 +++++++++++++++++++
 .../base/dygraph/tensor_patch_methods.py |  3 +
 4 files changed, 83 insertions(+), 8 deletions(-)

diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc
index 793d0bbdf6e695..e01964966d2727 100644
--- a/paddle/fluid/framework/dlpack_tensor.cc
+++ b/paddle/fluid/framework/dlpack_tensor.cc
@@ -265,7 +265,7 @@ ::DLDataType PhiDataTypeToDLDataType(phi::DataType dtype) {
       framework::TransToProtoVarType(dtype));
 }
 
-phi::Place DLDeviceToPlace(const DLDevice &dl_device) {
+phi::Place DLDeviceToPlace(const ::DLDevice &dl_device) {
   phi::Place place;
   if (dl_device.device_type == kDLCPU) {
     place = phi::CPUPlace();
@@ -279,7 +279,7 @@ phi::Place DLDeviceToPlace(const DLDevice &dl_device) {
   return place;
 }
 
-DLDevice PlaceToDLDevice(const phi::Place &place) {
+::DLDevice PlaceToDLDevice(const phi::Place &place) {
   return phi::VisitPlace(place, internal::DLDeviceVisitor());
 }
 
diff --git a/paddle/fluid/framework/dlpack_tensor.h b/paddle/fluid/framework/dlpack_tensor.h
index e287ce342fa78c..ed799a192f83f9 100644
--- a/paddle/fluid/framework/dlpack_tensor.h
+++ b/paddle/fluid/framework/dlpack_tensor.h
@@ -29,15 +29,17 @@ and paddle/phi/api/lib/tensor_utils.cc
 */
 using Deleter = std::function<void(void *)>;
 
-phi::Place DLDeviceToPlace(const DLDevice& device);
-DLDevice PlaceToDLDevice(const phi::Place& place);
+::DLDataType PhiDataTypeToDLDataType(phi::DataType dtype);
+phi::DataType DLDataTypeToPhiDataType(::DLDataType type);
+phi::Place DLDeviceToPlace(const ::DLDevice& device);
+::DLDevice PlaceToDLDevice(const phi::Place& place);
 
 TEST_API DLManagedTensor* ToDLPack(const phi::DenseTensor& src,
                                    uint64_t flags = 0);
-DLManagedTensorVersioned* ToDLPackVersioned(const phi::DenseTensor& src,
-                                            uint64_t flags = 0);
-TEST_API phi::DenseTensor FromDLPack(DLManagedTensor* src);
-phi::DenseTensor FromDLPackVersioned(DLManagedTensorVersioned* src);
+::DLManagedTensorVersioned* ToDLPackVersioned(const phi::DenseTensor& src,
+                                              uint64_t flags = 0);
+TEST_API phi::DenseTensor FromDLPack(::DLManagedTensor* src);
+phi::DenseTensor FromDLPackVersioned(::DLManagedTensorVersioned* src);
 
 // A traits to support both DLManagedTensor and DLManagedTensorVersioned
 template <typename T>
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index d3b17ad377b7cf..d2c7b52f272af4 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -763,6 +763,64 @@ class PyLayerBlockContextManager {
   PyLayerBlockContextManager() = default;
 };
 
+int DLPackFromPyObject(void *py_obj,
+                       DLManagedTensorVersioned **out,
+                       void **env_stream) {
+  try {
+    py::handle handle(static_cast<PyObject *>(py_obj));
+    paddle::Tensor tensor = handle.cast<paddle::Tensor>();
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
+    defined(PADDLE_WITH_CUSTOM_DEVICE)
+    if (env_stream != nullptr && tensor.is_gpu()) {
+      int device_index = tensor.place().GetDeviceId();
+      *env_stream = platform::get_current_stream(device_index)->raw_stream();
+    }
+#endif
+    std::shared_ptr<phi::DenseTensor> dense_tensor =
+        std::static_pointer_cast<phi::DenseTensor>(tensor.impl());
+    *out = paddle::framework::ToDLPackVersioned(*dense_tensor);
+    return 0;
+  } catch (const std::exception &e) {
+    PyErr_SetString(PyExc_RuntimeError, e.what());
+    return -1;
+  }
+}
+
+int DLPackToPyObject(DLManagedTensorVersioned *src, void **py_obj_out) {
+  try {
+    phi::DenseTensor dense_tensor = paddle::framework::FromDLPackVersioned(src);
+    paddle::Tensor tensor(std::make_shared<phi::DenseTensor>(dense_tensor));
+    egr::EagerUtils::autograd_meta(&tensor)->SetPersistable(false);
+    *py_obj_out = ToPyObject(tensor);
+    return 0;
+  } catch (const std::exception &e) {
+    PyErr_SetString(PyExc_RuntimeError, e.what());
+    return -1;
+  }
+}
+
+int DLPackTensorAllocator(::DLTensor *prototype,
+                          ::DLManagedTensorVersioned **out,
+                          void *error_ctx,
+                          void (*SetError)(void *error_ctx,
+                                           const char *kind,
+                                           const char *message)) {
+  try {
+    phi::IntArray shape(prototype->shape, prototype->ndim);
+    phi::Place place(paddle::framework::DLDeviceToPlace(prototype->device));
+    phi::DataType dtype =
+        paddle::framework::DLDataTypeToPhiDataType(prototype->dtype);
+    paddle::Tensor tensor = paddle::empty(shape, dtype, place);
+    std::shared_ptr<phi::DenseTensor> dense_tensor =
+        std::static_pointer_cast<phi::DenseTensor>(tensor.impl());
+    *out = paddle::framework::ToDLPackVersioned(*dense_tensor);
+    return 0;
+  } catch (const std::exception &e) {
+    SetError(error_ctx, "DLPackTensorAllocator", e.what());
+    return -1;
+  }
+}
+
 // NOTE: use to load file by Mmap
 enum MMapLoadModes {
   ALLOCATOR_MAPPED_SHARED = 1,
@@ -1773,6 +1831,18 @@ PYBIND11_MODULE(libpaddle, m) {
                            dl_device.device_id);
   });
 
+  m.def("dlpack_from_pyobject_ptr", []() -> int64_t {
+    return reinterpret_cast<int64_t>(DLPackFromPyObject);
+  });
+
+  m.def("dlpack_to_pyobject_ptr", []() -> int64_t {
+    return reinterpret_cast<int64_t>(DLPackToPyObject);
+  });
+
+  m.def("dlpack_tensor_allocator_ptr", []() -> int64_t {
+    return reinterpret_cast<int64_t>(DLPackTensorAllocator);
+  });
+
   m.def("from_dlpack", [](py::object data) {
     if (PyCapsule_IsValid(data.ptr(), DLPackTraits<DLManagedTensor>::capsule)) {
 
diff --git a/python/paddle/base/dygraph/tensor_patch_methods.py b/python/paddle/base/dygraph/tensor_patch_methods.py
index e19d5e7f8405d1..2650ebd77f5a29 100644
--- a/python/paddle/base/dygraph/tensor_patch_methods.py
+++ b/python/paddle/base/dygraph/tensor_patch_methods.py
@@ -1586,6 +1586,9 @@ def __tvm_ffi_env_stream__(self) -> int:
         ("__dlpack_device__", __dlpack_device__),
         ("get_device", get_device),
         ("__tvm_ffi_env_stream__", __tvm_ffi_env_stream__),
+        ("__c_dlpack_from_pyobject__", core.dlpack_from_pyobject_ptr()),
+        ("__c_dlpack_to_pyobject__", core.dlpack_to_pyobject_ptr()),
+        ("__c_dlpack_tensor_allocator__", core.dlpack_tensor_allocator_ptr()),
     ):
         setattr(core.eager.Tensor, method_name, method)
 
From 1fb670edb3f6a248dd6bb7af1e9bedeb7264f2bd Mon Sep 17 00:00:00 2001
From: SigureMo
Date: Sat, 11 Oct 2025 04:51:31 +0000
Subject: [PATCH 02/17] Implement the latest C dlpack exchange API, refer to apache/tvm-ffi#96

---
 paddle/fluid/framework/dlpack_tensor.cc  |  16 +++
 paddle/fluid/framework/dlpack_tensor.h   |   6 +-
 paddle/fluid/pybind/pybind.cc            | 132 ++++++++++++++++--
 .../base/dygraph/tensor_patch_methods.py |   1 +
 python/paddle/utils/dlpack.py            |   3 +-
 .../test_tensor_attr_consistency.py      |   4 +
 third_party/dlpack                       |   2 +-
 7 files changed, 147 insertions(+), 17 deletions(-)

diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc
index e01964966d2727..02b27cbe0ef9ad 100644
--- a/paddle/fluid/framework/dlpack_tensor.cc
+++ b/paddle/fluid/framework/dlpack_tensor.cc
@@ -358,6 +358,22 @@ DLManagedTensorVersioned *ToDLPackVersioned(const phi::DenseTensor &src,
   return ToDLPackImpl<DLManagedTensorVersioned>(src, flags);
 }
 
+void ToDLPackNonOwningImpl(const phi::DenseTensor &tensor,
+                           ::DLTensor &out) {  // NOLINT
+  // Fill in the pre-allocated DLTensor struct with direct pointers
+  // This is a non-owning conversion - the caller owns the tensor
+  // and must keep it alive for the duration of DLTensor usage
+  out.data = const_cast<void *>(tensor.data());
+  out.device = PlaceToDLDevice(tensor.place());
+  out.ndim = static_cast<int32_t>(tensor.dims().size());
+  out.dtype = PhiDataTypeToDLDataType(tensor.dtype());
+  // sizes() and strides() return pointers to TensorImpl's stable storage
+  // which remains valid as long as the tensor is alive
+  out.shape = const_cast<int64_t *>(tensor.dims().Get());
+  out.strides = const_cast<int64_t *>(tensor.strides().Get());
+  out.byte_offset = 0;
+}
+
 template <typename T>
 phi::DenseTensor FromDLPackImpl(T *src, Deleter deleter) {
   std::vector<int64_t> shape_vec;
diff --git a/paddle/fluid/framework/dlpack_tensor.h b/paddle/fluid/framework/dlpack_tensor.h
index ed799a192f83f9..1aa8e79f93e7de 100644
--- a/paddle/fluid/framework/dlpack_tensor.h
+++ b/paddle/fluid/framework/dlpack_tensor.h
@@ -34,10 +34,12 @@ phi::DataType DLDataTypeToPhiDataType(::DLDataType type);
 phi::Place DLDeviceToPlace(const ::DLDevice& device);
 ::DLDevice PlaceToDLDevice(const phi::Place& place);
 
-TEST_API DLManagedTensor* ToDLPack(const phi::DenseTensor& src,
-                                   uint64_t flags = 0);
+TEST_API ::DLManagedTensor* ToDLPack(const phi::DenseTensor& src,
+                                     uint64_t flags = 0);
 ::DLManagedTensorVersioned* ToDLPackVersioned(const phi::DenseTensor& src,
                                               uint64_t flags = 0);
+void ToDLPackNonOwningImpl(const phi::DenseTensor& tensor,
+                           ::DLTensor& out);  // NOLINT
 TEST_API phi::DenseTensor FromDLPack(::DLManagedTensor* src);
 phi::DenseTensor FromDLPackVersioned(::DLManagedTensorVersioned* src);
 
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index d2c7b52f272af4..57edb7db13bfc6 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -763,9 +763,9 @@ class PyLayerBlockContextManager {
   PyLayerBlockContextManager() = default;
 };
 
-int DLPackFromPyObject(void *py_obj,
-                       DLManagedTensorVersioned **out,
-                       void **env_stream) {
+int DLPackFromPyObjectLegacy(void *py_obj,
+                             DLManagedTensorVersioned **out,
+                             void **env_stream) {
   try {
     py::handle handle(static_cast<PyObject *>(py_obj));
     paddle::Tensor tensor = handle.cast<paddle::Tensor>();
@@ -786,7 +786,7 @@ int DLPackFromPyObject(void *py_obj,
   }
 }
 
-int DLPackToPyObject(DLManagedTensorVersioned *src, void **py_obj_out) {
+int DLPackToPyObjectLegacy(DLManagedTensorVersioned *src, void **py_obj_out) {
   try {
     phi::DenseTensor dense_tensor = paddle::framework::FromDLPackVersioned(src);
     paddle::Tensor tensor(std::make_shared<phi::DenseTensor>(dense_tensor));
@@ -799,12 +799,12 @@ int DLPackToPyObject(DLManagedTensorVersioned *src, void **py_obj_out) {
   }
 }
 
-int DLPackTensorAllocator(::DLTensor *prototype,
-                          ::DLManagedTensorVersioned **out,
-                          void *error_ctx,
-                          void (*SetError)(void *error_ctx,
-                                           const char *kind,
-                                           const char *message)) {
+int DLPackTensorAllocatorLegacy(::DLTensor *prototype,
+                                ::DLManagedTensorVersioned **out,
+                                void *error_ctx,
+                                void (*SetError)(void *error_ctx,
+                                                 const char *kind,
+                                                 const char *message)) {
   try {
     phi::IntArray shape(prototype->shape, prototype->ndim);
     phi::Place place(paddle::framework::DLDeviceToPlace(prototype->device));
@@ -821,6 +821,108 @@
+int DLPackDLTensorFromPyObjectNoSync(void *py_obj, DLTensor *out) {
+  try {
+    // Use handle (non-owning) to avoid unnecessary refcount operations
+    py::handle handle(static_cast<PyObject *>(py_obj));
+    paddle::Tensor tensor = handle.cast<paddle::Tensor>();
+    std::shared_ptr<phi::DenseTensor> dense_tensor =
+        std::static_pointer_cast<phi::DenseTensor>(tensor.impl());
+    paddle::framework::ToDLPackNonOwningImpl(*dense_tensor, *out);
+    return 0;
+  } catch (const std::exception &e) {
+    PyErr_SetString(PyExc_RuntimeError, e.what());
+    return -1;
+  }
+}
+
+int DLPackManagedTensorFromPyObjectNoSync(void *py_obj,
+                                          DLManagedTensorVersioned **out) {
+  try {
+    py::handle handle(static_cast<PyObject *>(py_obj));
+    paddle::Tensor tensor = handle.cast<paddle::Tensor>();
+    std::shared_ptr<phi::DenseTensor> dense_tensor =
+        std::static_pointer_cast<phi::DenseTensor>(tensor.impl());
+    *out = paddle::framework::ToDLPackVersioned(*dense_tensor);
+    return 0;
+  } catch (const std::exception &e) {
+    PyErr_SetString(PyExc_RuntimeError, e.what());
+    return -1;
+  }
+}
+
+int DLPackManagedTensorToPyObjectNoSync(DLManagedTensorVersioned *src,
+                                        void **py_obj_out) {
+  try {
+    phi::DenseTensor dense_tensor = paddle::framework::FromDLPackVersioned(src);
+    paddle::Tensor tensor(std::make_shared<phi::DenseTensor>(dense_tensor));
+    egr::EagerUtils::autograd_meta(&tensor)->SetPersistable(false);
+    *py_obj_out = ToPyObject(tensor);
+    return 0;
+  } catch (const std::exception &e) {
+    PyErr_SetString(PyExc_RuntimeError, e.what());
+    return -1;
+  }
+}
+
+int DLPackManagedTensorAllocator(::DLTensor *prototype,
+                                 ::DLManagedTensorVersioned **out,
+                                 void *error_ctx,
+                                 void (*SetError)(void *error_ctx,
+                                                  const char *kind,
+                                                  const char *message)) {
+  try {
+    phi::IntArray shape(prototype->shape, prototype->ndim);
+    phi::Place place(paddle::framework::DLDeviceToPlace(prototype->device));
+    phi::DataType dtype =
+        paddle::framework::DLDataTypeToPhiDataType(prototype->dtype);
+    paddle::Tensor tensor = paddle::empty(shape, dtype, place);
+    std::shared_ptr<phi::DenseTensor> dense_tensor =
+        std::static_pointer_cast<phi::DenseTensor>(tensor.impl());
+    *out = paddle::framework::ToDLPackVersioned(*dense_tensor);
+    return 0;
+  } catch (const std::exception &e) {
+    SetError(error_ctx, "DLPackManagedTensorAllocator", e.what());
+    return -1;
+  }
+}
+
+int DLPackCurrentWorkStream(DLDeviceType device_type,
+                            int32_t device_id,
+                            void **out_stream) {
+  try {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
+    defined(PADDLE_WITH_CUSTOM_DEVICE)
+    if (device_type == kDLCUDA || device_type == kDLROCM) {
+      *out_stream = platform::get_current_stream(device_id)->raw_stream();
+    }
+#endif
+    return 0;
+  } catch (const std::exception &e) {
+    PyErr_SetString(PyExc_RuntimeError, e.what());
+    return -1;
+  }
+}
+
+struct PaddleDLPackExchangeAPI : public ::DLPackExchangeAPI {
+  PaddleDLPackExchangeAPI() {
+    header.version.major = DLPACK_MAJOR_VERSION;
+    header.version.minor = DLPACK_MINOR_VERSION;
+    header.prev_api = nullptr;
+    managed_tensor_allocator = DLPackManagedTensorAllocator;
+    managed_tensor_from_py_object_no_sync =
+        DLPackManagedTensorFromPyObjectNoSync;
+    managed_tensor_to_py_object_no_sync = DLPackManagedTensorToPyObjectNoSync;
+    dltensor_from_py_object_no_sync = DLPackDLTensorFromPyObjectNoSync;
+    current_work_stream = DLPackCurrentWorkStream;
+  }
+
+  static const DLPackExchangeAPI *Instance() {
+    static PaddleDLPackExchangeAPI inst;
+    return &inst;
+  }
+};
+
 // NOTE: use to load file by Mmap
 enum MMapLoadModes {
   ALLOCATOR_MAPPED_SHARED = 1,
@@ -1832,15 +1934,19 @@ PYBIND11_MODULE(libpaddle, m) {
                            dl_device.device_id);
   });
 
   m.def("dlpack_from_pyobject_ptr", []() -> int64_t {
-    return reinterpret_cast<int64_t>(DLPackFromPyObject);
+    return reinterpret_cast<int64_t>(DLPackFromPyObjectLegacy);
   });
 
   m.def("dlpack_to_pyobject_ptr", []() -> int64_t {
-    return reinterpret_cast<int64_t>(DLPackToPyObject);
+    return reinterpret_cast<int64_t>(DLPackToPyObjectLegacy);
   });
 
   m.def("dlpack_tensor_allocator_ptr", []() -> int64_t {
-    return reinterpret_cast<int64_t>(DLPackTensorAllocator);
+    return reinterpret_cast<int64_t>(DLPackTensorAllocatorLegacy);
+  });
+
+  m.def("dlpack_exchange_api_ptr", []() -> int64_t {
+    return reinterpret_cast<int64_t>(PaddleDLPackExchangeAPI::Instance());
   });
 
   m.def("from_dlpack", [](py::object data) {
     if (PyCapsule_IsValid(data.ptr(), DLPackTraits<DLManagedTensor>::capsule)) {
 
diff --git a/python/paddle/base/dygraph/tensor_patch_methods.py b/python/paddle/base/dygraph/tensor_patch_methods.py
index 2650ebd77f5a29..12720cdc7ceda9 100644
--- a/python/paddle/base/dygraph/tensor_patch_methods.py
+++ b/python/paddle/base/dygraph/tensor_patch_methods.py
@@ -1589,6 +1589,7 @@ def __tvm_ffi_env_stream__(self) -> int:
         ("__c_dlpack_from_pyobject__", core.dlpack_from_pyobject_ptr()),
         ("__c_dlpack_to_pyobject__", core.dlpack_to_pyobject_ptr()),
         ("__c_dlpack_tensor_allocator__", core.dlpack_tensor_allocator_ptr()),
+        ("__c_dlpack_exchange_api__", core.dlpack_exchange_api_ptr()),
     ):
         setattr(core.eager.Tensor, method_name, method)
 
diff --git a/python/paddle/utils/dlpack.py b/python/paddle/utils/dlpack.py
index c1b3c21afaea86..68b44cc27f89ce 100644
--- a/python/paddle/utils/dlpack.py
+++ b/python/paddle/utils/dlpack.py
@@ -75,6 +75,7 @@ class DLDeviceType(enum.IntEnum):
     kDLWebGPU = (15,)
     kDLHexagon = (16,)
     kDLMAIA = (17,)
+    kDLTrn = (18,)
 
 
 def to_dlpack(x: Tensor) -> CapsuleType:
@@ -215,7 +216,7 @@ def from_dlpack(
 
     if hasattr(dlpack, "__dlpack__"):
         kwargs = {}
-        kwargs["max_version"] = (1, 1)
+        kwargs["max_version"] = (1, 2)
         if copy is not None:
             kwargs["copy"] = copy
 
diff --git a/test/dygraph_to_static/test_tensor_attr_consistency.py b/test/dygraph_to_static/test_tensor_attr_consistency.py
index 86a4437a7c69ce..7176daa31928c2 100644
--- a/test/dygraph_to_static/test_tensor_attr_consistency.py
+++ b/test/dygraph_to_static/test_tensor_attr_consistency.py
@@ -81,6 +81,10 @@
         '__dlpack__',
         "__dlpack_device__",
         "__tvm_ffi_env_stream__",
+        "__c_dlpack_from_pyobject__",
+        "__c_dlpack_to_pyobject__",
+        "__c_dlpack_tensor_allocator__",
+        "__c_dlpack_exchange_api__",
     ]
 )
 STATIC_ONLY_TENSOR_ATTRS_ALLOW_LIST = OrderedSet(
diff --git a/third_party/dlpack b/third_party/dlpack
index 3ea601bb413074..111736618e8d10 160000
--- a/third_party/dlpack
+++ b/third_party/dlpack
@@ -1 +1 @@
-Subproject commit 3ea601bb413074c49a77c4ce3218bc08f8c4703c
+Subproject commit 111736618e8d1028b23605f76dcaa6a38cfea809

From 32cb37502aa81161545b2226737cde0f06392054 Mon Sep 17 00:00:00 2001
From: SigureMo
Date: Sun, 12 Oct 2025 12:59:26 +0800
Subject: [PATCH 03/17] bump dlpack to v1.2

---
 third_party/dlpack | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/dlpack b/third_party/dlpack
index 111736618e8d10..93c8f2a3c774b8 160000
--- a/third_party/dlpack
+++ b/third_party/dlpack
@@ -1 +1 @@
-Subproject commit 111736618e8d1028b23605f76dcaa6a38cfea809
+Subproject commit 93c8f2a3c774b84af6f652b1992c48164fae60fc

From d49fb45c6c3b4082fe729ba4ef3ab6bd7dfdf5b5 Mon Sep 17 00:00:00 2001
From: SigureMo
Date: Sun, 12 Oct 2025 05:35:09 +0000
Subject: [PATCH 04/17] cleanup legacy impls

---
 paddle/fluid/pybind/pybind.cc            | 70 -------------------
 .../base/dygraph/tensor_patch_methods.py |  3 -
 .../test_tensor_attr_consistency.py      |  3 -
 3 files changed, 76 deletions(-)

diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 57edb7db13bfc6..3119464f9cb974 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -763,64 +763,6 @@ class PyLayerBlockContextManager {
   PyLayerBlockContextManager() = default;
 };
 
-int DLPackFromPyObjectLegacy(void *py_obj,
-                             DLManagedTensorVersioned **out,
-                             void **env_stream) {
-  try {
-    py::handle handle(static_cast<PyObject *>(py_obj));
-    paddle::Tensor tensor = handle.cast<paddle::Tensor>();
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
-    defined(PADDLE_WITH_CUSTOM_DEVICE)
-    if (env_stream != nullptr && tensor.is_gpu()) {
-      int device_index = tensor.place().GetDeviceId();
-      *env_stream = platform::get_current_stream(device_index)->raw_stream();
-    }
-#endif
-    std::shared_ptr<phi::DenseTensor> dense_tensor =
-        std::static_pointer_cast<phi::DenseTensor>(tensor.impl());
-    *out = paddle::framework::ToDLPackVersioned(*dense_tensor);
-    return 0;
-  } catch (const std::exception &e) {
-    PyErr_SetString(PyExc_RuntimeError, e.what());
-    return -1;
-  }
-}
-
-int DLPackToPyObjectLegacy(DLManagedTensorVersioned *src, void **py_obj_out) {
-  try {
-    phi::DenseTensor dense_tensor = paddle::framework::FromDLPackVersioned(src);
-    paddle::Tensor tensor(std::make_shared<phi::DenseTensor>(dense_tensor));
-    egr::EagerUtils::autograd_meta(&tensor)->SetPersistable(false);
-    *py_obj_out = ToPyObject(tensor);
-    return 0;
-  } catch (const std::exception &e) {
-    PyErr_SetString(PyExc_RuntimeError, e.what());
-    return -1;
-  }
-}
-
-int DLPackTensorAllocatorLegacy(::DLTensor *prototype,
-                                ::DLManagedTensorVersioned **out,
-                                void *error_ctx,
-                                void (*SetError)(void *error_ctx,
-                                                 const char *kind,
-                                                 const char *message)) {
-  try {
-    phi::IntArray shape(prototype->shape, prototype->ndim);
-    phi::Place place(paddle::framework::DLDeviceToPlace(prototype->device));
-    phi::DataType dtype =
-        paddle::framework::DLDataTypeToPhiDataType(prototype->dtype);
-    paddle::Tensor tensor = paddle::empty(shape, dtype, place);
-    std::shared_ptr<phi::DenseTensor> dense_tensor =
-        std::static_pointer_cast<phi::DenseTensor>(tensor.impl());
-    *out = paddle::framework::ToDLPackVersioned(*dense_tensor);
-    return 0;
-  } catch (const std::exception &e) {
-    SetError(error_ctx, "DLPackTensorAllocator", e.what());
-    return -1;
-  }
-}
-
 int DLPackDLTensorFromPyObjectNoSync(void *py_obj, DLTensor *out) {
   try {
     // Use handle (non-owning) to avoid unnecessary refcount operations
@@ -1933,18 +1875,6 @@ PYBIND11_MODULE(libpaddle, m) {
                            dl_device.device_id);
   });
 
-  m.def("dlpack_from_pyobject_ptr", []() -> int64_t {
-    return reinterpret_cast<int64_t>(DLPackFromPyObjectLegacy);
-  });
-
-  m.def("dlpack_to_pyobject_ptr", []() -> int64_t {
-    return reinterpret_cast<int64_t>(DLPackToPyObjectLegacy);
-  });
-
-  m.def("dlpack_tensor_allocator_ptr", []() -> int64_t {
-    return reinterpret_cast<int64_t>(DLPackTensorAllocatorLegacy);
-  });
-
   m.def("dlpack_exchange_api_ptr", []() -> int64_t {
     return reinterpret_cast<int64_t>(PaddleDLPackExchangeAPI::Instance());
   });
 
   m.def("from_dlpack", [](py::object data) {
diff --git a/python/paddle/base/dygraph/tensor_patch_methods.py b/python/paddle/base/dygraph/tensor_patch_methods.py
index 12720cdc7ceda9..f9545777153f21 100644
--- a/python/paddle/base/dygraph/tensor_patch_methods.py
+++ b/python/paddle/base/dygraph/tensor_patch_methods.py
@@ -1586,9 +1586,6 @@ def __tvm_ffi_env_stream__(self) -> int:
         ("__dlpack_device__", __dlpack_device__),
         ("get_device", get_device),
         ("__tvm_ffi_env_stream__", __tvm_ffi_env_stream__),
-        ("__c_dlpack_from_pyobject__", core.dlpack_from_pyobject_ptr()),
-        ("__c_dlpack_to_pyobject__", core.dlpack_to_pyobject_ptr()),
-        ("__c_dlpack_tensor_allocator__", core.dlpack_tensor_allocator_ptr()),
         ("__c_dlpack_exchange_api__", core.dlpack_exchange_api_ptr()),
     ):
         setattr(core.eager.Tensor, method_name, method)
diff --git a/test/dygraph_to_static/test_tensor_attr_consistency.py b/test/dygraph_to_static/test_tensor_attr_consistency.py
index 7176daa31928c2..b68c2db87fe609 100644
--- a/test/dygraph_to_static/test_tensor_attr_consistency.py
+++ b/test/dygraph_to_static/test_tensor_attr_consistency.py
@@ -81,9 +81,6 @@
         '__dlpack__',
         "__dlpack_device__",
         "__tvm_ffi_env_stream__",
-        "__c_dlpack_from_pyobject__",
-        "__c_dlpack_to_pyobject__",
-        "__c_dlpack_tensor_allocator__",
         "__c_dlpack_exchange_api__",
     ]
 )

From bb3c59d3e70ec78c8d953293cfbf9d477703b1d0 Mon Sep 17 00:00:00 2001
From: SigureMo
Date: Sun, 12 Oct 2025 06:12:45 +0000
Subject: [PATCH 05/17] add unittests for tvm_ffi

---
 python/unittest_py/requirements.txt |   1 +
 test/legacy_test/test_tvm_ffi.py    | 104 +++++++++++++++++++++++++++-
 2 files changed, 104 insertions(+), 1 deletion(-)

diff --git a/python/unittest_py/requirements.txt b/python/unittest_py/requirements.txt
index ddfccc8090f240..9d547fd9357d1d 100644
--- a/python/unittest_py/requirements.txt
+++ b/python/unittest_py/requirements.txt
@@ -20,3 +20,4 @@ xdoctest==1.3.0
 ubelt==1.3.3  # just for xdoctest
 mypy==1.17.1
 soundfile
+apache-tvm-ffi @ git+https://github.com/apache/tvm-ffi.git@22a78943b78306a73011757fa635afa9dce35114
diff --git a/test/legacy_test/test_tvm_ffi.py b/test/legacy_test/test_tvm_ffi.py
index aa6a91b4aa24de..1e95a5db412902 100644
--- a/test/legacy_test/test_tvm_ffi.py
+++ b/test/legacy_test/test_tvm_ffi.py
@@ -14,10 +14,14 @@
 
 import unittest
 
+import numpy as np
+import tvm_ffi.cpp
+from tvm_ffi import Module
+
 import paddle
 
 
-class TestTVMFFI(unittest.TestCase):
+class TestTVMFFIEnvStream(unittest.TestCase):
     def test_tvm_ffi_env_stream_for_gpu_tensor(self):
         if not paddle.is_compiled_with_cuda():
             return
@@ -34,5 +38,103 @@ def test_tvm_ffi_env_stream_for_cpu_tensor(self):
             tensor.__tvm_ffi_env_stream__()
 
 
+class TestCDLPackExchangeAPI(unittest.TestCase):
+    def test_c_dlpack_exchange_api_cpu(self):
+        cpp_source = r"""
+        void add_one_cpu(tvm::ffi::TensorView x, tvm::ffi::TensorView y) {
+          // implementation of a library function
+          TVM_FFI_ICHECK(x->ndim == 1) << "x must be a 1D tensor";
+          DLDataType f32_dtype{kDLFloat, 32, 1};
+          TVM_FFI_ICHECK(x->dtype == f32_dtype) << "x must be a float tensor";
+          TVM_FFI_ICHECK(y->ndim == 1) << "y must be a 1D tensor";
+          TVM_FFI_ICHECK(y->dtype == f32_dtype) << "y must be a float tensor";
+          TVM_FFI_ICHECK(x->shape[0] == y->shape[0]) << "x and y must have the same shape";
+          for (int i = 0; i < x->shape[0]; ++i) {
+            static_cast<float*>(y->data)[i] = static_cast<float*>(x->data)[i] + 1;
+          }
+        }
+        """
+
+        mod: Module = tvm_ffi.cpp.load_inline(
+            name='mod', cpp_sources=cpp_source, functions='add_one_cpu'
+        )
+
+        x = paddle.full((3,), 1.0, dtype='float32').cpu()
+        y = paddle.zeros((3,), dtype='float32').cpu()
+        mod.add_one_cpu(x, y)
+        np.testing.assert_allclose(y.numpy(), [2.0, 2.0, 2.0])
+
+    def test_c_dlpack_exchange_api_gpu(self):
+        if not paddle.is_compiled_with_cuda():
+            return
+        cpp_sources = r"""
+        void add_one_cuda(tvm::ffi::TensorView x, tvm::ffi::TensorView y);
+        """
+        cuda_sources = r"""
+        __global__ void AddOneKernel(float* x, float* y, int n) {
+          int idx = blockIdx.x * blockDim.x + threadIdx.x;
+          if (idx < n) {
+            y[idx] = x[idx] + 1;
+          }
+        }
+
+        void add_one_cuda(tvm::ffi::TensorView x, tvm::ffi::TensorView y) {
+          // implementation of a library function
+          TVM_FFI_ICHECK(x->ndim == 1) << "x must be a 1D tensor";
+          DLDataType f32_dtype{kDLFloat, 32, 1};
+          TVM_FFI_ICHECK(x->dtype == f32_dtype) << "x must be a float tensor";
+          TVM_FFI_ICHECK(y->ndim == 1) << "y must be a 1D tensor";
+          TVM_FFI_ICHECK(y->dtype == f32_dtype) << "y must be a float tensor";
+          TVM_FFI_ICHECK(x->shape[0] == y->shape[0]) << "x and y must have the same shape";
+
+          int64_t n = x->shape[0];
+          int64_t nthread_per_block = 256;
+          int64_t nblock = (n + nthread_per_block - 1) / nthread_per_block;
+          // Obtain the current stream from the environment by calling TVMFFIEnvGetStream
+          cudaStream_t stream = static_cast<cudaStream_t>(
+              TVMFFIEnvGetStream(x->device.device_type, x->device.device_id));
+          // launch the kernel
+          AddOneKernel<<<nblock, nthread_per_block, 0, stream>>>(
+              static_cast<float*>(x->data), static_cast<float*>(y->data), n);
+        }
+        """
+        mod: Module = tvm_ffi.cpp.load_inline(
+            name='mod',
+            cpp_sources=cpp_sources,
+            cuda_sources=cuda_sources,
+            functions=['add_one_cuda'],
+        )
+
+        x = paddle.full((3,), 1.0, dtype='float32').cuda()
+        y = paddle.zeros((3,), dtype='float32').cuda()
+        mod.add_one_cuda(x, y)
+        np.testing.assert_allclose(y.numpy(), [2.0, 2.0, 2.0])
+
+    def test_c_dlpack_exchange_api_alloc_tensor(self):
+        cpp_source = r"""
+        inline tvm::ffi::Tensor alloc_tensor(tvm::ffi::Shape shape, DLDataType dtype, DLDevice device) {
+          return tvm::ffi::Tensor::FromDLPackAlloc(TVMFFIEnvGetTensorAllocator(), shape, dtype, device);
+        }
+
+        tvm::ffi::Tensor add_one_cpu(tvm::ffi::TensorView x) {
+          TVM_FFI_ICHECK(x->ndim == 1) << "x must be a 1D tensor";
+          DLDataType f32_dtype{kDLFloat, 32, 1};
+          TVM_FFI_ICHECK(x->dtype == f32_dtype) << "x must be a float tensor";
+          tvm::ffi::Shape x_shape(x->shape, x->shape + x->ndim);
+          tvm::ffi::Tensor y = alloc_tensor(x_shape, f32_dtype, x->device);
+          for (int i = 0; i < x->shape[0]; ++i) {
+            static_cast<float*>(y->data)[i] = static_cast<float*>(x->data)[i] + 1;
+          }
+          return y;
+        }
+        """
+        mod: Module = tvm_ffi.cpp.load_inline(
+            name='mod', cpp_sources=cpp_source, functions=['add_one_cpu']
+        )
+        x = paddle.full((3,), 1.0, dtype='float32').cpu()
+        y = mod.add_one_cpu(x)
+        np.testing.assert_allclose(y.numpy(), [2.0, 2.0, 2.0])
+
+
 if __name__ == '__main__':
     unittest.main()

From c702c5ef691071a3de962dd2c6f47c689b10e8f5 Mon Sep 17 00:00:00 2001
From: SigureMo
Date: Sun, 12 Oct 2025 06:15:20 +0000
Subject: [PATCH 06/17] refine ut style

---
 test/legacy_test/test_tvm_ffi.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/test/legacy_test/test_tvm_ffi.py b/test/legacy_test/test_tvm_ffi.py
index 1e95a5db412902..c1cfef4a44d95e 100644
--- a/test/legacy_test/test_tvm_ffi.py
+++ b/test/legacy_test/test_tvm_ffi.py
@@ -12,14 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
 import unittest
+from typing import TYPE_CHECKING
 
 import numpy as np
 import tvm_ffi.cpp
-from tvm_ffi import Module
 
 import paddle
 
+if TYPE_CHECKING:
+    from tvm_ffi import Module
+
 
 class TestTVMFFIEnvStream(unittest.TestCase):
     def test_tvm_ffi_env_stream_for_gpu_tensor(self):

From bc16a585e58111d10c566998998954678fbb62a7 Mon Sep 17 00:00:00 2001
From: SigureMo
Date: Mon, 13 Oct 2025 09:27:23 +0800
Subject: [PATCH 07/17] pin tvm-ffi to new release 0.1.0b16

---
 python/unittest_py/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/unittest_py/requirements.txt b/python/unittest_py/requirements.txt
index 9d547fd9357d1d..0ccf6d98680f22 100644
--- a/python/unittest_py/requirements.txt
+++ b/python/unittest_py/requirements.txt
@@ -20,4 +20,4 @@ xdoctest==1.3.0
 ubelt==1.3.3  # just for xdoctest
 mypy==1.17.1
 soundfile
-apache-tvm-ffi @ git+https://github.com/apache/tvm-ffi.git@22a78943b78306a73011757fa635afa9dce35114
+apache-tvm-ffi==0.1.0b16

From d0c21455bf2e5f5bef8b359144de59f6d3b11dcc Mon Sep 17 00:00:00 2001
From: SigureMo
Date: Mon, 13 Oct 2025 06:49:16 +0000
Subject: [PATCH 08/17] skip 2 cases on Windows

---
 test/legacy_test/test_tvm_ffi.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/test/legacy_test/test_tvm_ffi.py b/test/legacy_test/test_tvm_ffi.py
index c1cfef4a44d95e..917e8e924b6956 100644
--- a/test/legacy_test/test_tvm_ffi.py
+++ b/test/legacy_test/test_tvm_ffi.py
@@ -14,6 +14,7 @@
 
 from __future__ import annotations
 
+import platform
 import unittest
 from typing import TYPE_CHECKING
 
@@ -72,6 +73,9 @@ def test_c_dlpack_exchange_api_cpu(self):
     def test_c_dlpack_exchange_api_gpu(self):
         if not paddle.is_compiled_with_cuda():
             return
+        if platform.system() == "Windows":
+            # Temporarily skip this test case on Windows because of a compile bug in TVM FFI
+            return
         cpp_sources = r"""
         void add_one_cuda(tvm::ffi::TensorView x, tvm::ffi::TensorView y);
         """
@@ -116,6 +120,10 @@ def test_c_dlpack_exchange_api_gpu(self):
         np.testing.assert_allclose(y.numpy(), [2.0, 2.0, 2.0])
 
     def test_c_dlpack_exchange_api_alloc_tensor(self):
+        if platform.system() == "Windows":
+            # Temporarily skip this test case on Windows because returning an owned tensor
+            # created by TVMFFIEnvGetTensorAllocator will cause a double-free error
+            return
         cpp_source = r"""
         inline tvm::ffi::Tensor alloc_tensor(tvm::ffi::Shape shape, DLDataType dtype, DLDevice device) {
           return tvm::ffi::Tensor::FromDLPackAlloc(TVMFFIEnvGetTensorAllocator(), shape, dtype, device);
         }

From 27e18e758380220d984e9f3c4041806bc1582dd8 Mon Sep 17 00:00:00 2001
From: SigureMo
Date: Mon, 13 Oct 2025 06:53:23 +0000
Subject: [PATCH 09/17] install python/unittest_py/requirements.txt in dcu
 workflow

---
 .github/workflows/_Linux-DCU.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/_Linux-DCU.yml b/.github/workflows/_Linux-DCU.yml
index 63008000cf5af6..ba82f790b35f61 100644
--- a/.github/workflows/_Linux-DCU.yml
+++ b/.github/workflows/_Linux-DCU.yml
@@ -290,6 +290,7 @@ jobs:
           ln -sf $(which python3.10) /usr/local/bin/python
           ln -sf $(which pip3.10) /usr/local/bin/pip
           pip3.10 install ./dist/paddlepaddle_dcu-0.0.0-cp310-cp310-linux_x86_64.whl
+          pip3.10 install -r python/unittest_py/requirements.txt
          wget -q --no-proxy https://paddle-device.bj.bcebos.com/dcu/hyhal-Z100.tar.gz
           tar -zxf hyhal-Z100.tar.gz -C /opt
           source /opt/dtk-24.04.1/env.sh

From ffe9d24194b1ba7b63f825a155051b0feeaa054a Mon Sep 17 00:00:00 2001
From: SigureMo
Date: Mon, 13 Oct 2025 12:21:36 +0000
Subject: [PATCH 10/17] add a blank line to trigger docker rebuild

---
 tools/dockerfile/Dockerfile.develop.dtk | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/dockerfile/Dockerfile.develop.dtk b/tools/dockerfile/Dockerfile.develop.dtk
index 8426d8282a7f25..90e0b3069c70cd 100644
--- a/tools/dockerfile/Dockerfile.develop.dtk
+++ b/tools/dockerfile/Dockerfile.develop.dtk
@@ -105,6 +105,7 @@ RUN wget -q https://github.com/ccache/ccache/releases/download/v4.6.3/ccache-4.6
     make -j16 > /dev/null && make install > /dev/null && \
     cd ../../ && rm -rf ccache-4.6.3.tar.gz && rm -rf ccache-4.6.3 && \
     ln -s /usr/local/ccache-4.6.3/bin/ccache /usr/local/bin/ccache
+
 ENV CCACHE_MAXSIZE=50G \
     CCACHE_LIMIT_MULTIPLE=0.8 \
     CCACHE_SLOPPINESS=clang_index_store,time_macros,include_file_mtime

From 489f02dadf7cbc50b5b558f0ef4c41ff1fbb59e5 Mon Sep 17 00:00:00 2001
From: SigureMo
Date: Mon, 13 Oct 2025 23:32:47 +0800
Subject: [PATCH 11/17] update other dockerfiles to trigger requirements.txt
 install

---
 tools/dockerfile/Dockerfile.develop.npu | 1 +
 tools/dockerfile/Dockerfile.develop.xre | 1 +
 tools/dockerfile/Dockerfile.ubuntu20    | 1 +
 3 files changed, 3 insertions(+)

diff --git a/tools/dockerfile/Dockerfile.develop.npu b/tools/dockerfile/Dockerfile.develop.npu
index f0ad07ec9b90be..b03b23c6c88f3f 100644
--- a/tools/dockerfile/Dockerfile.develop.npu
+++ b/tools/dockerfile/Dockerfile.develop.npu
@@ -12,6 +12,7 @@ RUN groupadd -g 1000 HwHiAiUser && \
     useradd -u 1000 -g 1000 -m -d /home/HwHiAiUser HwHiAiUser
 
 RUN mkdir -p /usr/local/Ascend/driver
+
 WORKDIR /usr/local/Ascend
 
 # install CANN requirement
diff --git a/tools/dockerfile/Dockerfile.develop.xre b/tools/dockerfile/Dockerfile.develop.xre
index 2bdbe56d7cde1e..422a6e8b667545 100644
--- a/tools/dockerfile/Dockerfile.develop.xre
+++ b/tools/dockerfile/Dockerfile.develop.xre
@@ -76,6 +76,7 @@ RUN wget -q https://github.com/ccache/ccache/releases/download/v4.6.3/ccache-4.6
     make -j16 > /dev/null && make install > /dev/null && \
     cd ../../ && rm -rf ccache-4.6.3.tar.gz && rm -rf ccache-4.6.3 && \
     ln -s /usr/local/ccache-4.6.3/bin/ccache /usr/local/bin/ccache
+
 ENV CCACHE_MAXSIZE=80G \
     CCACHE_LIMIT_MULTIPLE=0.8 \
     CCACHE_SLOPPINESS=clang_index_store,time_macros,include_file_mtime
diff --git a/tools/dockerfile/Dockerfile.ubuntu20 b/tools/dockerfile/Dockerfile.ubuntu20
index fc5b56f3c6ec5a..6c6f73c2a6d9d9 100644
--- a/tools/dockerfile/Dockerfile.ubuntu20
+++ b/tools/dockerfile/Dockerfile.ubuntu20
@@ -99,6 +99,7 @@ RUN wget --no-check-certificate -qO- https://paddle-ci.gz.bcebos.com/go1.17.2.li
     mkdir /root/gopath && \
     mkdir /root/gopath/bin && \
     mkdir /root/gopath/src
+
 ENV GOROOT=/usr/local/go GOPATH=/root/gopath
 # should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT.
 ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin

From 597c51938786acd2b524ed6143d131e53e4b1ba1 Mon Sep 17 00:00:00 2001
From: SigureMo
Date: Tue, 14 Oct 2025 00:45:02 +0800
Subject: [PATCH 12/17] Revert "update other dockerfiles to trigger
 requirements.txt install"

This reverts commit 2a9fc214cce8414c809bc60bda2139e0c9562538.

---
 tools/dockerfile/Dockerfile.develop.npu | 1 -
 tools/dockerfile/Dockerfile.develop.xre | 1 -
 tools/dockerfile/Dockerfile.ubuntu20    | 1 -
 3 files changed, 3 deletions(-)

diff --git a/tools/dockerfile/Dockerfile.develop.npu b/tools/dockerfile/Dockerfile.develop.npu
index b03b23c6c88f3f..f0ad07ec9b90be 100644
--- a/tools/dockerfile/Dockerfile.develop.npu
+++ b/tools/dockerfile/Dockerfile.develop.npu
@@ -12,7 +12,6 @@ RUN groupadd -g 1000 HwHiAiUser && \
     useradd -u 1000 -g 1000 -m -d /home/HwHiAiUser HwHiAiUser
 
 RUN mkdir -p /usr/local/Ascend/driver
-
 WORKDIR /usr/local/Ascend
 
 # install CANN requirement
diff --git a/tools/dockerfile/Dockerfile.develop.xre b/tools/dockerfile/Dockerfile.develop.xre
index 422a6e8b667545..2bdbe56d7cde1e 100644
--- a/tools/dockerfile/Dockerfile.develop.xre
+++ b/tools/dockerfile/Dockerfile.develop.xre
@@ -76,7 +76,6 @@ RUN wget -q https://github.com/ccache/ccache/releases/download/v4.6.3/ccache-4.6
     make -j16 > /dev/null && make install > /dev/null && \
     cd ../../ && rm -rf ccache-4.6.3.tar.gz && rm -rf ccache-4.6.3 && \
     ln -s /usr/local/ccache-4.6.3/bin/ccache /usr/local/bin/ccache
-
 ENV CCACHE_MAXSIZE=80G \
     CCACHE_LIMIT_MULTIPLE=0.8 \
     CCACHE_SLOPPINESS=clang_index_store,time_macros,include_file_mtime
diff --git a/tools/dockerfile/Dockerfile.ubuntu20 b/tools/dockerfile/Dockerfile.ubuntu20
index 6c6f73c2a6d9d9..fc5b56f3c6ec5a 100644
--- a/tools/dockerfile/Dockerfile.ubuntu20
+++ b/tools/dockerfile/Dockerfile.ubuntu20
@@ -99,7 +99,6 @@ RUN wget --no-check-certificate -qO- https://paddle-ci.gz.bcebos.com/go1.17.2.li
     mkdir /root/gopath && \
     mkdir /root/gopath/bin && \
     mkdir /root/gopath/src
-
 ENV GOROOT=/usr/local/go GOPATH=/root/gopath
 # should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT.
 ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin

From 65d1e1423fa19a272eaccd7dd0bbbb187a05af70 Mon Sep 17 00:00:00 2001
From: SigureMo
Date: Tue, 14 Oct 2025 00:45:06 +0800
Subject: [PATCH 13/17] Revert "add a blank line to trigger docker rebuild"

This reverts commit 89ce02d3685dd0c9171ec0db9d62795c1304d5ee.

---
 tools/dockerfile/Dockerfile.develop.dtk | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tools/dockerfile/Dockerfile.develop.dtk b/tools/dockerfile/Dockerfile.develop.dtk
index 90e0b3069c70cd..8426d8282a7f25 100644
--- a/tools/dockerfile/Dockerfile.develop.dtk
+++ b/tools/dockerfile/Dockerfile.develop.dtk
@@ -105,7 +105,6 @@ RUN wget -q https://github.com/ccache/ccache/releases/download/v4.6.3/ccache-4.6
     make -j16 > /dev/null && make install > /dev/null && \
     cd ../../ && rm -rf ccache-4.6.3.tar.gz && rm -rf ccache-4.6.3 && \
     ln -s /usr/local/ccache-4.6.3/bin/ccache /usr/local/bin/ccache
-
 ENV CCACHE_MAXSIZE=50G \
     CCACHE_LIMIT_MULTIPLE=0.8 \
     CCACHE_SLOPPINESS=clang_index_store,time_macros,include_file_mtime

From b4a4a348a978e9e426b2c090ba998b3142811c18 Mon Sep 17 00:00:00 2001
From: SigureMo
Date: Tue, 14 Oct 2025 00:45:23 +0800
Subject: [PATCH 14/17] Revert "install python/unittest_py/requirements.txt in
 dcu workflow"

This reverts commit aefca0e95b441e3d1abbd71e5f657ea287381bc5.

---
 .github/workflows/_Linux-DCU.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/_Linux-DCU.yml b/.github/workflows/_Linux-DCU.yml
index ba82f790b35f61..63008000cf5af6 100644
--- a/.github/workflows/_Linux-DCU.yml
+++ b/.github/workflows/_Linux-DCU.yml
@@ -290,7 +290,6 @@ jobs:
           ln -sf $(which python3.10) /usr/local/bin/python
           ln -sf $(which pip3.10) /usr/local/bin/pip
           pip3.10 install ./dist/paddlepaddle_dcu-0.0.0-cp310-cp310-linux_x86_64.whl
-          pip3.10 install -r python/unittest_py/requirements.txt
          wget -q --no-proxy https://paddle-device.bj.bcebos.com/dcu/hyhal-Z100.tar.gz
           tar -zxf hyhal-Z100.tar.gz -C /opt
           source /opt/dtk-24.04.1/env.sh

From c1735e41d5c4e947767b6813ae4f5b07092e9d2e Mon Sep 17 00:00:00 2001
From: SigureMo
Date: Tue, 14 Oct 2025 00:46:20 +0800
Subject: [PATCH 15/17] install python/unittest_py/requirements.txt in
 ci/dcu_test.sh

---
 ci/dcu_test.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ci/dcu_test.sh b/ci/dcu_test.sh
index be2d0e96369c75..7b4bbca440a4f7 100644
--- a/ci/dcu_test.sh
+++ b/ci/dcu_test.sh
@@ -75,6 +75,7 @@ function hybrid_paddlex() {
 function main(){
     cd ${PADDLE_ROOT}/build
     pip install hypothesis
+    pip install -r ${PADDLE_ROOT}/python/unittest_py/requirements.txt
     /opt/py310/bin/pip install safetensors
     if ls ${PADDLE_ROOT}/build/python/dist/*whl >/dev/null 2>&1; then
         pip install ${PADDLE_ROOT}/build/python/dist/*whl

From be4654b8e0af5bcaab3dbebf7776ce6f2ec7c808 Mon Sep 17 00:00:00 2001
From: SigureMo
Date: Tue, 14 Oct 2025 08:34:53 +0000
Subject: [PATCH 16/17] install requirements with `/opt/py310/bin/pip` in ci

---
 ci/dcu_test.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/dcu_test.sh b/ci/dcu_test.sh
index 7b4bbca440a4f7..cc303f5466ea50 100644
--- a/ci/dcu_test.sh
+++ b/ci/dcu_test.sh
@@ -75,7 +75,7 @@ function hybrid_paddlex() {
 function main(){
     cd ${PADDLE_ROOT}/build
     pip install hypothesis
-    pip install -r ${PADDLE_ROOT}/python/unittest_py/requirements.txt
+    /opt/py310/bin/pip install -r ${PADDLE_ROOT}/python/unittest_py/requirements.txt
     /opt/py310/bin/pip install safetensors
     if ls ${PADDLE_ROOT}/build/python/dist/*whl >/dev/null 2>&1; then
         pip install ${PADDLE_ROOT}/build/python/dist/*whl

From 567c660753f0fc4c35d674761b20735b0d9957f2 Mon Sep 17 00:00:00 2001
From: SigureMo
Date: Tue, 14 Oct 2025 22:00:08 +0800
Subject: [PATCH 17/17] skip gpu case on DCU

---
 test/legacy_test/test_tvm_ffi.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test/legacy_test/test_tvm_ffi.py b/test/legacy_test/test_tvm_ffi.py
index 917e8e924b6956..ce1a955932ebe4 100644
--- a/test/legacy_test/test_tvm_ffi.py
+++ b/test/legacy_test/test_tvm_ffi.py
@@ -73,6 +73,9 @@ def test_c_dlpack_exchange_api_cpu(self):
     def test_c_dlpack_exchange_api_gpu(self):
         if not paddle.is_compiled_with_cuda():
             return
+        if paddle.is_compiled_with_rocm():
+            # Skip on DCU because CUDA_HOME is not available
+            return
         if platform.system() == "Windows":
             # Temporarily skip this test case on Windows because of a compile bug in TVM FFI
             return
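
Usage sketch (not part of the patch series): a C/C++ extension that speaks DLPack can reach the exchange API published above without linking against Paddle. The attribute name and the managed_tensor_from_py_object_no_sync member below are exactly what the patches wire up in pybind.cc and tensor_patch_methods.py; the surrounding glue (includes, error handling, the FromPaddleTensor helper name) is illustrative only and assumes the DLPackExchangeAPI definition from DLPack v1.2's dlpack.h.

// Hypothetical consumer-side helper; only the attribute name and the
// struct member names are taken from this series.
#include <Python.h>
#include <dlpack/dlpack.h>  // DLPack v1.2: defines DLPackExchangeAPI

// Convert a paddle.Tensor (passed as PyObject*) into a
// DLManagedTensorVersioned via the published exchange API.
static DLManagedTensorVersioned *FromPaddleTensor(PyObject *tensor) {
  // __c_dlpack_exchange_api__ is an int attribute set on core.eager.Tensor
  // holding the address of the process-wide PaddleDLPackExchangeAPI.
  PyObject *addr = PyObject_GetAttrString(tensor, "__c_dlpack_exchange_api__");
  if (addr == NULL) return NULL;
  const DLPackExchangeAPI *api =
      (const DLPackExchangeAPI *)PyLong_AsVoidPtr(addr);
  Py_DECREF(addr);
  if (api == NULL) return NULL;
  DLManagedTensorVersioned *out = NULL;
  // On failure the producer sets a Python exception and returns -1
  // (see DLPackManagedTensorFromPyObjectNoSync above).
  if (api->managed_tensor_from_py_object_no_sync(tensor, &out) != 0) {
    return NULL;
  }
  return out;  // the consumer must eventually call out->deleter(out)
}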