Skip to content

Commit ed39eb4

Browse files
committed
Add async allocations to L0 adapter v2
1 parent 53fced1 commit ed39eb4

File tree

11 files changed

+251
-58
lines changed

11 files changed

+251
-58
lines changed

unified-runtime/source/adapters/level_zero/CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,7 @@ if(UR_BUILD_ADAPTER_L0_V2)
134134
${CMAKE_CURRENT_SOURCE_DIR}/adapter.cpp
135135
${CMAKE_CURRENT_SOURCE_DIR}/common.cpp
136136
${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
137+
${CMAKE_CURRENT_SOURCE_DIR}/enqueued_pool.cpp
137138
${CMAKE_CURRENT_SOURCE_DIR}/image_common.cpp
138139
${CMAKE_CURRENT_SOURCE_DIR}/ur_interface_loader.cpp
139140
${CMAKE_CURRENT_SOURCE_DIR}/platform.cpp

unified-runtime/source/adapters/level_zero/enqueued_pool.cpp

+5-6
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,7 @@
1616
EnqueuedPool::~EnqueuedPool() { cleanup(); }
1717

1818
std::optional<EnqueuedPool::Allocation>
19-
EnqueuedPool::getBestFit(size_t Size, size_t Alignment,
20-
ur_queue_handle_t Queue) {
19+
EnqueuedPool::getBestFit(size_t Size, size_t Alignment, void *Queue) {
2120
auto Lock = std::lock_guard(Mutex);
2221

2322
Allocation Alloc = {nullptr, Size, nullptr, Queue, Alignment};
@@ -47,7 +46,7 @@ EnqueuedPool::getBestFit(size_t Size, size_t Alignment,
4746
}
4847

4948
void EnqueuedPool::insert(void *Ptr, size_t Size, ur_event_handle_t Event,
50-
ur_queue_handle_t Queue) {
49+
void *Queue) {
5150
auto Lock = std::lock_guard(Mutex);
5251

5352
uintptr_t Address = (uintptr_t)Ptr;
@@ -67,14 +66,14 @@ bool EnqueuedPool::cleanup() {
6766
auto umfRet [[maybe_unused]] = umfPoolFree(hPool, It.Ptr);
6867
assert(umfRet == UMF_RESULT_SUCCESS);
6968

70-
urEventReleaseInternal(It.Event);
69+
eventRelease(It.Event);
7170
}
7271
Freelist.clear();
7372

7473
return FreedAllocations;
7574
}
7675

77-
bool EnqueuedPool::cleanupForQueue(ur_queue_handle_t Queue) {
76+
bool EnqueuedPool::cleanupForQueue(void *Queue) {
7877
auto Lock = std::lock_guard(Mutex);
7978

8079
Allocation Alloc = {nullptr, 0, nullptr, Queue, 0};
@@ -90,7 +89,7 @@ bool EnqueuedPool::cleanupForQueue(ur_queue_handle_t Queue) {
9089
auto umfRet [[maybe_unused]] = umfPoolFree(hPool, It->Ptr);
9190
assert(umfRet == UMF_RESULT_SUCCESS);
9291

93-
urEventReleaseInternal(It->Event);
92+
eventRelease(It->Event);
9493

9594
// Erase the current allocation and move to the next one
9695
It = Freelist.erase(It);

unified-runtime/source/adapters/level_zero/enqueued_pool.hpp

+11-5
Original file line numberDiff line numberDiff line change
@@ -22,17 +22,23 @@ class EnqueuedPool {
2222
void *Ptr;
2323
size_t Size;
2424
ur_event_handle_t Event;
25-
ur_queue_handle_t Queue;
25+
// Queue handle, used as an identifier for the associated queue.
26+
// This can either be a `ur_queue_handle_t` or a pointer to a v2 queue
27+
// object.
28+
void *Queue;
2629
size_t Alignment;
2730
};
2831

32+
EnqueuedPool(ur_result_t (*eventRelease)(ur_event_handle_t))
33+
: eventRelease(eventRelease) {}
34+
2935
~EnqueuedPool();
3036
std::optional<Allocation> getBestFit(size_t Size, size_t Alignment,
31-
ur_queue_handle_t Queue);
32-
void insert(void *Ptr, size_t Size, ur_event_handle_t Event,
33-
ur_queue_handle_t Queue);
37+
void *Queue);
38+
void insert(void *Ptr, size_t Size, ur_event_handle_t Event, void *Queue);
3439
bool cleanup();
35-
bool cleanupForQueue(ur_queue_handle_t Queue);
40+
bool cleanupForQueue(void *Queue);
41+
ur_result_t (*eventRelease)(ur_event_handle_t);
3642

3743
private:
3844
struct Comparator {

unified-runtime/source/adapters/level_zero/usm.hpp

+5-1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include "common.hpp"
1313

1414
#include "enqueued_pool.hpp"
15+
#include "event.hpp"
1516
#include "ur_api.h"
1617
#include "ur_pool_manager.hpp"
1718
#include <set>
@@ -20,7 +21,10 @@
2021
usm::DisjointPoolAllConfigs InitializeDisjointPoolConfig();
2122

2223
struct UsmPool {
23-
UsmPool(umf::pool_unique_handle_t Pool) : UmfPool(std::move(Pool)) {}
24+
UsmPool(umf::pool_unique_handle_t Pool)
25+
: UmfPool(std::move(Pool)), AsyncPool([](ur_event_handle_t Event) {
26+
return urEventReleaseInternal(Event);
27+
}) {}
2428
umf::pool_unique_handle_t UmfPool;
2529
EnqueuedPool AsyncPool;
2630
};

unified-runtime/source/adapters/level_zero/v2/context.cpp

+15-11
Original file line numberDiff line numberDiff line change
@@ -52,22 +52,22 @@ ur_context_handle_t_::ur_context_handle_t_(ze_context_handle_t hContext,
5252
hDevices(phDevices, phDevices + numDevices),
5353
commandListCache(hContext,
5454
phDevices[0]->Platform->ZeCopyOffloadExtensionSupported),
55-
eventPoolCache(this, phDevices[0]->Platform->getNumDevices(),
56-
[context = this](DeviceId /* deviceId*/,
57-
v2::event_flags_t flags)
58-
-> std::unique_ptr<v2::event_provider> {
59-
assert((flags & v2::EVENT_FLAGS_COUNTER) != 0);
60-
61-
// TODO: just use per-context id?
62-
return std::make_unique<v2::provider_normal>(
63-
context, v2::QUEUE_IMMEDIATE, flags);
64-
}),
55+
eventPoolCache(
56+
this, phDevices[0]->Platform->getNumDevices(),
57+
[context = this](DeviceId /* deviceId*/, v2::event_flags_t flags)
58+
-> std::unique_ptr<v2::event_provider> {
59+
assert((flags & v2::EVENT_FLAGS_COUNTER) != 0);
60+
61+
// TODO: just use per-context id?
62+
return std::make_unique<v2::provider_normal>(
63+
context, v2::QUEUE_IMMEDIATE, flags);
64+
}),
6565
nativeEventsPool(this, std::make_unique<v2::provider_normal>(
6666
this, v2::QUEUE_IMMEDIATE,
6767
v2::EVENT_FLAGS_PROFILING_ENABLED)),
6868
p2pAccessDevices(populateP2PDevices(
6969
phDevices[0]->Platform->getNumDevices(), this->hDevices)),
70-
defaultUSMPool(this, nullptr) {}
70+
defaultUSMPool(this, nullptr), asyncPool(this, nullptr) {}
7171

7272
ur_result_t ur_context_handle_t_::retain() {
7373
RefCount.increment();
@@ -78,6 +78,8 @@ ur_result_t ur_context_handle_t_::release() {
7878
if (!RefCount.decrementAndTest())
7979
return UR_RESULT_SUCCESS;
8080

81+
asyncPool.cleanupPools();
82+
8183
delete this;
8284
return UR_RESULT_SUCCESS;
8385
}
@@ -104,6 +106,8 @@ ur_usm_pool_handle_t ur_context_handle_t_::getDefaultUSMPool() {
104106
return &defaultUSMPool;
105107
}
106108

109+
ur_usm_pool_handle_t ur_context_handle_t_::getAsyncPool() { return &asyncPool; }
110+
107111
const std::vector<ur_device_handle_t> &
108112
ur_context_handle_t_::getP2PDevices(ur_device_handle_t hDevice) const {
109113
return p2pAccessDevices[hDevice->Id.value()];

unified-runtime/source/adapters/level_zero/v2/context.hpp

+3
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ struct ur_context_handle_t_ : ur_object {
2929

3030
const std::vector<ur_device_handle_t> &getDevices() const;
3131
ur_usm_pool_handle_t getDefaultUSMPool();
32+
ur_usm_pool_handle_t getAsyncPool();
3233

3334
const std::vector<ur_device_handle_t> &
3435
getP2PDevices(ur_device_handle_t hDevice) const;
@@ -55,4 +56,6 @@ struct ur_context_handle_t_ : ur_object {
5556
const std::vector<std::vector<ur_device_handle_t>> p2pAccessDevices;
5657

5758
ur_usm_pool_handle_t_ defaultUSMPool;
59+
60+
ur_usm_pool_handle_t_ asyncPool;
5861
};

unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp

+129-16
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,8 @@ ur_result_t ur_queue_immediate_in_order_t::queueGetNativeHandle(
150150
ur_result_t ur_queue_immediate_in_order_t::queueFinish() {
151151
TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::queueFinish");
152152

153+
hContext->getAsyncPool()->cleanupPoolsForQueue(this);
154+
153155
auto commandListLocked = commandListManager.lock();
154156
// TODO: use zeEventHostSynchronize instead?
155157
TRACK_SCOPE_LATENCY(
@@ -703,31 +705,142 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueWriteHostPipe(
703705
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
704706
}
705707

708+
// Shared implementation behind the enqueueUSM{Host,Device,Shared}AllocExp
// entry points.
//
// First asks the pool for a block that was handed back through an enqueued
// free (allocateEnqueued); when none is returned it falls back to a regular
// pool allocation. The resulting pointer is stored in *ppMem. Afterwards the
// queue's command list is made to wait on the caller's wait list (plus the
// event that came with a reused block, if any) and to signal *phEvent when a
// signal event was produced.
//
// pPool               pool to allocate from; the context's async pool is used
//                     when this is null.
// size                number of bytes requested.
// (properties)        currently ignored.
// numEventsInWaitList / phEventWaitList
//                     events the allocation command has to wait for.
// ppMem               receives the allocated pointer.
// phEvent             forwarded to getSignalEvent.
// type                host, device or shared; anything else is rejected.
//
// Returns UR_RESULT_SUCCESS, the error reported by the fallback allocation,
// or the error produced by a failing Level Zero call. Throws
// UR_RESULT_ERROR_UNKNOWN for an unsupported USM type.
ur_result_t ur_queue_immediate_in_order_t::enqueueUSMAllocHelper(
    ur_usm_pool_handle_t pPool, const size_t size,
    const ur_exp_async_usm_alloc_properties_t *, uint32_t numEventsInWaitList,
    const ur_event_handle_t *phEventWaitList, void **ppMem,
    ur_event_handle_t *phEvent, ur_usm_type_t type) {
  // Resolve the command type up front so that an unsupported USM type is
  // rejected before any memory has been taken from the pool.
  ur_command_t commandType = UR_COMMAND_FORCE_UINT32;
  switch (type) {
  case UR_USM_TYPE_HOST:
    commandType = UR_COMMAND_ENQUEUE_USM_HOST_ALLOC_EXP;
    break;
  case UR_USM_TYPE_DEVICE:
    commandType = UR_COMMAND_ENQUEUE_USM_DEVICE_ALLOC_EXP;
    break;
  case UR_USM_TYPE_SHARED:
    commandType = UR_COMMAND_ENQUEUE_USM_SHARED_ALLOC_EXP;
    break;
  default:
    logger::error("enqueueUSMAllocHelper: unsupported USM type");
    throw UR_RESULT_ERROR_UNKNOWN;
  }

  auto commandListLocked = commandListManager.lock();

  if (!pPool) {
    pPool = hContext->getAsyncPool();
  }

  // Host allocations are not tied to a device.
  auto device = (type == UR_USM_TYPE_HOST) ? nullptr : hDevice;

  // Event that accompanies a block reused from the pool; stays null when the
  // memory comes from a regular allocation.
  ur_event_handle_t originAllocEvent = nullptr;

  auto asyncAlloc =
      pPool->allocateEnqueued(hContext, this, device, nullptr, type, size);
  if (asyncAlloc) {
    std::tie(*ppMem, originAllocEvent) = *asyncAlloc;
  } else {
    auto Ret = pPool->allocate(hContext, device, nullptr, type, size, ppMem);
    if (Ret) {
      return Ret;
    }
  }

  // Build the wait list exactly once: a reused block additionally has to wait
  // for the event it was returned to the pool with.
  auto waitListView =
      asyncAlloc ? getWaitListView(commandListLocked, phEventWaitList,
                                   numEventsInWaitList, originAllocEvent)
                 : getWaitListView(commandListLocked, phEventWaitList,
                                   numEventsInWaitList);

  auto zeSignalEvent = getSignalEvent(commandListLocked, phEvent, commandType);
  auto [pWaitEvents, numWaitEvents] = waitListView;
  if (numWaitEvents > 0) {
    ZE2UR_CALL(
        zeCommandListAppendWaitOnEvents,
        (commandListLocked->getZeCommandList(), numWaitEvents, pWaitEvents));
  }
  if (zeSignalEvent) {
    ZE2UR_CALL(zeCommandListAppendSignalEvent,
               (commandListLocked->getZeCommandList(), zeSignalEvent));
  }

  return UR_RESULT_SUCCESS;
}
767+
706768
ur_result_t ur_queue_immediate_in_order_t::enqueueUSMDeviceAllocExp(
707-
ur_usm_pool_handle_t, const size_t,
708-
const ur_exp_async_usm_alloc_properties_t *, uint32_t,
709-
const ur_event_handle_t *, void **, ur_event_handle_t *) {
710-
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
769+
ur_usm_pool_handle_t pPool, const size_t size,
770+
const ur_exp_async_usm_alloc_properties_t *pProperties,
771+
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
772+
void **ppMem, ur_event_handle_t *phEvent) {
773+
TRACK_SCOPE_LATENCY(
774+
"ur_queue_immediate_in_order_t::enqueueUSMDeviceAllocExp");
775+
776+
return enqueueUSMAllocHelper(pPool, size, pProperties, numEventsInWaitList,
777+
phEventWaitList, ppMem, phEvent,
778+
UR_USM_TYPE_DEVICE);
711779
}
712780

713781
ur_result_t ur_queue_immediate_in_order_t::enqueueUSMSharedAllocExp(
714-
ur_usm_pool_handle_t, const size_t,
715-
const ur_exp_async_usm_alloc_properties_t *, uint32_t,
716-
const ur_event_handle_t *, void **, ur_event_handle_t *) {
717-
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
782+
ur_usm_pool_handle_t pPool, const size_t size,
783+
const ur_exp_async_usm_alloc_properties_t *pProperties,
784+
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
785+
void **ppMem, ur_event_handle_t *phEvent) {
786+
TRACK_SCOPE_LATENCY(
787+
"ur_queue_immediate_in_order_t::enqueueUSMSharedAllocExp");
788+
789+
return enqueueUSMAllocHelper(pPool, size, pProperties, numEventsInWaitList,
790+
phEventWaitList, ppMem, phEvent,
791+
UR_USM_TYPE_SHARED);
718792
}
719793

720794
ur_result_t ur_queue_immediate_in_order_t::enqueueUSMHostAllocExp(
721-
ur_usm_pool_handle_t, const size_t,
722-
const ur_exp_async_usm_alloc_properties_t *, uint32_t,
723-
const ur_event_handle_t *, void **, ur_event_handle_t *) {
724-
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
795+
ur_usm_pool_handle_t pPool, const size_t size,
796+
const ur_exp_async_usm_alloc_properties_t *pProperties,
797+
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
798+
void **ppMem, ur_event_handle_t *phEvent) {
799+
TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueUSMHostAllocExp");
800+
801+
return enqueueUSMAllocHelper(pPool, size, pProperties, numEventsInWaitList,
802+
phEventWaitList, ppMem, phEvent,
803+
UR_USM_TYPE_HOST);
725804
}
726805

727806
// Enqueues an asynchronous free of a USM allocation.
//
// The memory is not released immediately: it is looked up in UMF to find the
// owning pool, and then handed to that pool's enqueued-allocation free list
// together with this queue and the event in *phEvent. The queue's command
// list is then made to wait on the caller's wait list and to signal the
// event when a signal event was produced.
//
// (pool)              unused; the owning pool is derived from pMem.
// pMem                pointer to free; must belong to a UMF pool.
// numEventsInWaitList / phEventWaitList
//                     events the free command has to wait for.
// phEvent             forwarded to getSignalEvent; its value is stored with
//                     the pooled block.
//
// Returns UR_RESULT_ERROR_INVALID_MEM_OBJECT when pMem does not belong to
// any UMF pool, UR_RESULT_ERROR_UNKNOWN when the pool carries no UsmPool
// tag, the error of a failing Level Zero call, or UR_RESULT_SUCCESS.
ur_result_t ur_queue_immediate_in_order_t::enqueueUSMFreeExp(
    ur_usm_pool_handle_t /* pPool */, void *pMem, uint32_t numEventsInWaitList,
    const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
  TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueUSMFreeExp");
  auto commandListLocked = commandListManager.lock();

  // Validate the pointer before touching any event state, so the error
  // returns below have no side effects on phEvent.
  umf_memory_pool_handle_t hPool = umfPoolByPtr(pMem);
  if (!hPool) {
    return UR_RESULT_ERROR_INVALID_MEM_OBJECT;
  }

  UsmPool *usmPool = nullptr;
  // umfPoolGetTag reports a UMF status, so compare against the UMF constant.
  auto ret = umfPoolGetTag(hPool, reinterpret_cast<void **>(&usmPool));
  if (ret != UMF_RESULT_SUCCESS || !usmPool) {
    // This should never happen
    return UR_RESULT_ERROR_UNKNOWN;
  }

  auto zeSignalEvent = getSignalEvent(commandListLocked, phEvent,
                                      UR_COMMAND_ENQUEUE_USM_FREE_EXP);
  auto [pWaitEvents, numWaitEvents] =
      getWaitListView(commandListLocked, phEventWaitList, numEventsInWaitList);

  size_t size = umfPoolMallocUsableSize(hPool, pMem);
  // NOTE(review): phEvent is dereferenced unconditionally here although the
  // code below tolerates a null zeSignalEvent - confirm that callers always
  // pass a non-null phEvent, or create an internal event for this case.
  // NOTE(review): the pool releases the stored event when it is cleaned up;
  // confirm that a reference is taken on *phEvent for the pool's copy.
  usmPool->asyncPool.insert(pMem, size, *phEvent, this);

  if (numWaitEvents > 0) {
    ZE2UR_CALL(
        zeCommandListAppendWaitOnEvents,
        (commandListLocked->getZeCommandList(), numWaitEvents, pWaitEvents));
  }

  if (zeSignalEvent) {
    ZE2UR_CALL(zeCommandListAppendSignalEvent,
               (commandListLocked->getZeCommandList(), zeSignalEvent));
  }

  return UR_RESULT_SUCCESS;
}
732845

733846
ur_result_t ur_queue_immediate_in_order_t::bindlessImagesImageCopyExp(
@@ -866,9 +979,9 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericCommandListsExp(
866979
"ur_queue_immediate_in_order_t::enqueueGenericCommandListsExp");
867980

868981
auto commandListLocked = commandListManager.lock();
982+
869983
auto zeSignalEvent =
870984
getSignalEvent(commandListLocked, phEvent, callerCommand);
871-
872985
auto [pWaitEvents, numWaitEvents] =
873986
getWaitListView(commandListLocked, phEventWaitList, numEventsInWaitList,
874987
additionalWaitEvent);

unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp

+7
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,13 @@ struct ur_queue_immediate_in_order_t : ur_object, public ur_queue_t_ {
6464

6565
void recordSubmittedKernel(ur_kernel_handle_t hKernel);
6666

67+
ur_result_t
68+
enqueueUSMAllocHelper(ur_usm_pool_handle_t pPool, const size_t size,
69+
const ur_exp_async_usm_alloc_properties_t *pProperties,
70+
uint32_t numEventsInWaitList,
71+
const ur_event_handle_t *phEventWaitList, void **ppMem,
72+
ur_event_handle_t *phEvent, ur_usm_type_t Type);
73+
6774
public:
6875
ur_queue_immediate_in_order_t(ur_context_handle_t, ur_device_handle_t,
6976
const ur_queue_properties_t *);

0 commit comments

Comments
 (0)