Skip to content

Commit bcde052

Browse files
committed
Add async allocations to L0 adapter v2
1 parent 53fced1 commit bcde052

File tree

13 files changed

+261
-62
lines changed

13 files changed

+261
-62
lines changed
-2
Original file line numberDiff line numberDiff line change
@@ -1,3 +1 @@
11
config.required_features += ['aspect-ext_oneapi_async_memory_alloc']
2-
# V2 adapter does not support async alloc api yet
3-
config.unsupported_features += ['level_zero_v2_adapter']

unified-runtime/source/adapters/level_zero/CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,7 @@ if(UR_BUILD_ADAPTER_L0_V2)
134134
${CMAKE_CURRENT_SOURCE_DIR}/adapter.cpp
135135
${CMAKE_CURRENT_SOURCE_DIR}/common.cpp
136136
${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
137+
${CMAKE_CURRENT_SOURCE_DIR}/enqueued_pool.cpp
137138
${CMAKE_CURRENT_SOURCE_DIR}/image_common.cpp
138139
${CMAKE_CURRENT_SOURCE_DIR}/ur_interface_loader.cpp
139140
${CMAKE_CURRENT_SOURCE_DIR}/platform.cpp

unified-runtime/source/adapters/level_zero/async_alloc.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,7 @@ ur_result_t urEnqueueUSMFreeExp(
247247
}
248248

249249
size_t size = umfPoolMallocUsableSize(hPool, Mem);
250+
Event->RefCount.increment();
250251
usmPool->AsyncPool.insert(Mem, size, *Event, Queue);
251252

252253
// Signal that USM free event was finished

unified-runtime/source/adapters/level_zero/enqueued_pool.cpp

+7-8
Original file line numberDiff line numberDiff line change
@@ -9,15 +9,13 @@
99
//===----------------------------------------------------------------------===//
1010

1111
#include "enqueued_pool.hpp"
12-
#include "event.hpp"
1312

1413
#include <ur_api.h>
1514

1615
// Destructor: delegates to cleanup(), whose body appears further down in this
// file's diff.
EnqueuedPool::~EnqueuedPool() { cleanup(); }
1716

1817
std::optional<EnqueuedPool::Allocation>
19-
EnqueuedPool::getBestFit(size_t Size, size_t Alignment,
20-
ur_queue_handle_t Queue) {
18+
EnqueuedPool::getBestFit(size_t Size, size_t Alignment, void *Queue) {
2119
auto Lock = std::lock_guard(Mutex);
2220

2321
Allocation Alloc = {nullptr, Size, nullptr, Queue, Alignment};
@@ -47,12 +45,11 @@ EnqueuedPool::getBestFit(size_t Size, size_t Alignment,
4745
}
4846

4947
// Records a freed allocation in Freelist so that it can be handed out again
// later.
//
// Ptr    pointer being returned to the pool.
// Size   size stored alongside the pointer.
// Event  event stored with the entry; after this change it may be null
//        (cleanup()/cleanupForQueue() test it before calling eventRelease).
// Queue  opaque queue identifier; the `-`/`+` pair below changes its type
//        from ur_queue_handle_t to void *.
//
// Diff view: lines preceded by an `NN-` gutter are removed by this commit,
// lines preceded by `NN+` are added.
void EnqueuedPool::insert(void *Ptr, size_t Size, ur_event_handle_t Event,
50-
ur_queue_handle_t Queue) {
48+
void *Queue) {
5149
// Serialises access to Freelist for the duration of the call.
auto Lock = std::lock_guard(Mutex);
5250

5351
uintptr_t Address = (uintptr_t)Ptr;
5452
// `x & (~x + 1)` isolates the lowest set bit of the address, i.e. the
// largest power of two that divides it (0 when Address is 0).
size_t Alignment = Address & (~Address + 1);
55-
// Removed here: the reference-count increment is no longer done inside
// insert(). The async_alloc.cpp hunk of this commit adds the same increment
// at the call site instead.
Event->RefCount.increment();
5653

5754
Freelist.emplace(Allocation{Ptr, Size, Event, Queue, Alignment});
5855
}
@@ -67,14 +64,15 @@ bool EnqueuedPool::cleanup() {
6764
auto umfRet [[maybe_unused]] = umfPoolFree(hPool, It.Ptr);
6865
assert(umfRet == UMF_RESULT_SUCCESS);
6966

70-
urEventReleaseInternal(It.Event);
67+
if (It.Event)
68+
eventRelease(It.Event);
7169
}
7270
Freelist.clear();
7371

7472
return FreedAllocations;
7573
}
7674

77-
bool EnqueuedPool::cleanupForQueue(ur_queue_handle_t Queue) {
75+
bool EnqueuedPool::cleanupForQueue(void *Queue) {
7876
auto Lock = std::lock_guard(Mutex);
7977

8078
Allocation Alloc = {nullptr, 0, nullptr, Queue, 0};
@@ -90,7 +88,8 @@ bool EnqueuedPool::cleanupForQueue(ur_queue_handle_t Queue) {
9088
auto umfRet [[maybe_unused]] = umfPoolFree(hPool, It->Ptr);
9189
assert(umfRet == UMF_RESULT_SUCCESS);
9290

93-
urEventReleaseInternal(It->Event);
91+
if (It->Event)
92+
eventRelease(It->Event);
9493

9594
// Erase the current allocation and move to the next one
9695
It = Freelist.erase(It);

unified-runtime/source/adapters/level_zero/enqueued_pool.hpp

+11-5
Original file line numberDiff line numberDiff line change
@@ -22,17 +22,23 @@ class EnqueuedPool {
2222
void *Ptr;
2323
size_t Size;
2424
ur_event_handle_t Event;
25-
ur_queue_handle_t Queue;
25+
// Queue handle, used as an identifier for the associated queue.
26+
// This can either be a `ur_queue_handle_t` or a pointer to a v2 queue
27+
// object.
28+
void *Queue;
2629
size_t Alignment;
2730
};
2831

32+
// Stores the function pointer that cleanup() and cleanupForQueue() call to
// release the event attached to a free-list entry. Passing it in lets each
// user of the pool supply its own release routine.
EnqueuedPool(ur_result_t (*eventRelease)(ur_event_handle_t))
33+
: eventRelease(eventRelease) {}
34+
2935
~EnqueuedPool();
3036
std::optional<Allocation> getBestFit(size_t Size, size_t Alignment,
31-
ur_queue_handle_t Queue);
32-
void insert(void *Ptr, size_t Size, ur_event_handle_t Event,
33-
ur_queue_handle_t Queue);
37+
void *Queue);
38+
void insert(void *Ptr, size_t Size, ur_event_handle_t Event, void *Queue);
3439
bool cleanup();
35-
bool cleanupForQueue(ur_queue_handle_t Queue);
40+
bool cleanupForQueue(void *Queue);
41+
ur_result_t (*eventRelease)(ur_event_handle_t);
3642

3743
private:
3844
struct Comparator {

unified-runtime/source/adapters/level_zero/usm.hpp

+5-1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include "common.hpp"
1313

1414
#include "enqueued_pool.hpp"
15+
#include "event.hpp"
1516
#include "ur_api.h"
1617
#include "ur_pool_manager.hpp"
1718
#include <set>
@@ -20,7 +21,10 @@
2021
usm::DisjointPoolAllConfigs InitializeDisjointPoolConfig();
2122

2223
// Pairs a UMF pool handle with the EnqueuedPool that tracks asynchronously
// freed allocations for it.
//
// Diff view: the `23-` line is the previous constructor; the `24+`..`27+`
// lines are its replacement.
struct UsmPool {
23-
UsmPool(umf::pool_unique_handle_t Pool) : UmfPool(std::move(Pool)) {}
24+
// Takes ownership of Pool and constructs AsyncPool with a capture-less
// lambda (convertible to the function pointer EnqueuedPool's constructor
// expects) that forwards to urEventReleaseInternal.
UsmPool(umf::pool_unique_handle_t Pool)
25+
: UmfPool(std::move(Pool)), AsyncPool([](ur_event_handle_t Event) {
26+
return urEventReleaseInternal(Event);
27+
}) {}
2428
umf::pool_unique_handle_t UmfPool;
2529
EnqueuedPool AsyncPool;
2630
};

unified-runtime/source/adapters/level_zero/v2/context.cpp

+15-11
Original file line numberDiff line numberDiff line change
@@ -52,22 +52,22 @@ ur_context_handle_t_::ur_context_handle_t_(ze_context_handle_t hContext,
5252
hDevices(phDevices, phDevices + numDevices),
5353
commandListCache(hContext,
5454
phDevices[0]->Platform->ZeCopyOffloadExtensionSupported),
55-
eventPoolCache(this, phDevices[0]->Platform->getNumDevices(),
56-
[context = this](DeviceId /* deviceId*/,
57-
v2::event_flags_t flags)
58-
-> std::unique_ptr<v2::event_provider> {
59-
assert((flags & v2::EVENT_FLAGS_COUNTER) != 0);
60-
61-
// TODO: just use per-context id?
62-
return std::make_unique<v2::provider_normal>(
63-
context, v2::QUEUE_IMMEDIATE, flags);
64-
}),
55+
eventPoolCache(
56+
this, phDevices[0]->Platform->getNumDevices(),
57+
[context = this](DeviceId /* deviceId*/, v2::event_flags_t flags)
58+
-> std::unique_ptr<v2::event_provider> {
59+
assert((flags & v2::EVENT_FLAGS_COUNTER) != 0);
60+
61+
// TODO: just use per-context id?
62+
return std::make_unique<v2::provider_normal>(
63+
context, v2::QUEUE_IMMEDIATE, flags);
64+
}),
6565
nativeEventsPool(this, std::make_unique<v2::provider_normal>(
6666
this, v2::QUEUE_IMMEDIATE,
6767
v2::EVENT_FLAGS_PROFILING_ENABLED)),
6868
p2pAccessDevices(populateP2PDevices(
6969
phDevices[0]->Platform->getNumDevices(), this->hDevices)),
70-
defaultUSMPool(this, nullptr) {}
70+
defaultUSMPool(this, nullptr), asyncPool(this, nullptr) {}
7171

7272
ur_result_t ur_context_handle_t_::retain() {
7373
RefCount.increment();
@@ -78,6 +78,8 @@ ur_result_t ur_context_handle_t_::release() {
7878
if (!RefCount.decrementAndTest())
7979
return UR_RESULT_SUCCESS;
8080

81+
asyncPool.cleanupPools();
82+
8183
delete this;
8284
return UR_RESULT_SUCCESS;
8385
}
@@ -104,6 +106,8 @@ ur_usm_pool_handle_t ur_context_handle_t_::getDefaultUSMPool() {
104106
return &defaultUSMPool;
105107
}
106108

109+
// Returns the address of the context's `asyncPool` member. The pointer refers
// to storage inside this context object.
ur_usm_pool_handle_t ur_context_handle_t_::getAsyncPool() { return &asyncPool; }
110+
107111
const std::vector<ur_device_handle_t> &
108112
ur_context_handle_t_::getP2PDevices(ur_device_handle_t hDevice) const {
109113
return p2pAccessDevices[hDevice->Id.value()];

unified-runtime/source/adapters/level_zero/v2/context.hpp

+3
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ struct ur_context_handle_t_ : ur_object {
2929

3030
const std::vector<ur_device_handle_t> &getDevices() const;
3131
ur_usm_pool_handle_t getDefaultUSMPool();
32+
ur_usm_pool_handle_t getAsyncPool();
3233

3334
const std::vector<ur_device_handle_t> &
3435
getP2PDevices(ur_device_handle_t hDevice) const;
@@ -55,4 +56,6 @@ struct ur_context_handle_t_ : ur_object {
5556
const std::vector<std::vector<ur_device_handle_t>> p2pAccessDevices;
5657

5758
ur_usm_pool_handle_t_ defaultUSMPool;
59+
60+
ur_usm_pool_handle_t_ asyncPool;
5861
};

unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp

+136-16
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,8 @@ ur_result_t ur_queue_immediate_in_order_t::queueGetNativeHandle(
150150
ur_result_t ur_queue_immediate_in_order_t::queueFinish() {
151151
TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::queueFinish");
152152

153+
hContext->getAsyncPool()->cleanupPoolsForQueue(this);
154+
153155
auto commandListLocked = commandListManager.lock();
154156
// TODO: use zeEventHostSynchronize instead?
155157
TRACK_SCOPE_LATENCY(
@@ -703,31 +705,149 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueWriteHostPipe(
703705
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
704706
}
705707

708+
// Common implementation behind enqueueUSMDeviceAllocExp,
// enqueueUSMSharedAllocExp and enqueueUSMHostAllocExp, each of which forwards
// here with a fixed `type`.
//
// pPool                 pool to allocate from; when null the context's async
//                       pool (hContext->getAsyncPool()) is used instead.
// size                  requested size, forwarded to the pool.
// (unnamed properties)  accepted for signature parity; not read here.
// numEventsInWaitList,
// phEventWaitList       events the command list is made to wait on.
// ppMem                 receives the allocated pointer.
// phEvent               forwarded to getSignalEvent().
// type                  USM kind; selects the command type, and a null device
//                       for UR_USM_TYPE_HOST.
//
// Returns UR_RESULT_SUCCESS on the normal path, or the non-zero result of
// pPool->allocate(). Throws UR_RESULT_ERROR_UNKNOWN when `type` is none of
// HOST / DEVICE / SHARED.
ur_result_t ur_queue_immediate_in_order_t::enqueueUSMAllocHelper(
709+
ur_usm_pool_handle_t pPool, const size_t size,
710+
const ur_exp_async_usm_alloc_properties_t *, uint32_t numEventsInWaitList,
711+
const ur_event_handle_t *phEventWaitList, void **ppMem,
712+
ur_event_handle_t *phEvent, ur_usm_type_t type) {
713+
// Locked command list, passed to every helper below.
auto commandListLocked = commandListManager.lock();
714+
715+
if (!pPool) {
716+
pPool = hContext->getAsyncPool();
717+
}
718+
719+
// Host allocations are requested with a null device.
auto device = (type == UR_USM_TYPE_HOST) ? nullptr : hDevice;
720+
auto waitListView =
721+
getWaitListView(commandListLocked, phEventWaitList, numEventsInWaitList);
722+
723+
// First ask the pool for an enqueued allocation associated with this queue.
auto asyncAlloc =
724+
pPool->allocateEnqueued(hContext, this, device, nullptr, type, size);
725+
if (!asyncAlloc) {
726+
// Nothing was returned: fall back to a regular pool allocation.
auto Ret = pPool->allocate(hContext, device, nullptr, type, size, ppMem);
727+
if (Ret) {
728+
return Ret;
729+
}
730+
} else {
731+
// The result unpacks to (pointer, event). A non-null event is appended to
// the wait list by rebuilding the view with the extra event.
// Presumably this is the event recorded when the memory was handed back
// through enqueueUSMFreeExp — confirm.
ur_event_handle_t originAllocEvent;
732+
std::tie(*ppMem, originAllocEvent) = *asyncAlloc;
733+
if (originAllocEvent) {
734+
waitListView = getWaitListView(commandListLocked, phEventWaitList,
735+
numEventsInWaitList, originAllocEvent);
736+
}
737+
}
738+
739+
// Placeholder value; every non-throwing case below overwrites it.
ur_command_t commandType = UR_COMMAND_FORCE_UINT32;
740+
switch (type) {
741+
case UR_USM_TYPE_HOST:
742+
commandType = UR_COMMAND_ENQUEUE_USM_HOST_ALLOC_EXP;
743+
break;
744+
case UR_USM_TYPE_DEVICE:
745+
commandType = UR_COMMAND_ENQUEUE_USM_DEVICE_ALLOC_EXP;
746+
break;
747+
case UR_USM_TYPE_SHARED:
748+
commandType = UR_COMMAND_ENQUEUE_USM_SHARED_ALLOC_EXP;
749+
break;
750+
default:
751+
// NOTE(review): this path is reached after *ppMem has already been
// populated above; confirm whether the memory should be returned to the
// pool before throwing.
logger::error("enqueueUSMAllocHelper: unsupported USM type");
752+
throw UR_RESULT_ERROR_UNKNOWN;
753+
}
754+
755+
auto zeSignalEvent = getSignalEvent(commandListLocked, phEvent, commandType);
756+
// Unpack the wait-list view into its two components.
auto [pWaitEvents, numWaitEvents] = waitListView;
757+
if (numWaitEvents > 0) {
758+
// NOTE(review): ZE2UR_CALL is defined outside this view; presumably it
// returns early on a Level Zero error. If so, the memory obtained above is
// not released on that path — confirm.
ZE2UR_CALL(
759+
zeCommandListAppendWaitOnEvents,
760+
(commandListLocked->getZeCommandList(), numWaitEvents, pWaitEvents));
761+
}
762+
// Only append a signal when getSignalEvent() produced an event.
if (zeSignalEvent) {
763+
ZE2UR_CALL(zeCommandListAppendSignalEvent,
764+
(commandListLocked->getZeCommandList(), zeSignalEvent));
765+
}
766+
767+
return UR_RESULT_SUCCESS;
768+
}
769+
706770
// Asynchronous device USM allocation: forwards every argument to
// enqueueUSMAllocHelper() with UR_USM_TYPE_DEVICE and returns its result.
//
// Diff view: the `707-`..`710-` lines are the previous stub, which returned
// UR_RESULT_ERROR_UNSUPPORTED_FEATURE; the `+` lines are the new body.
ur_result_t ur_queue_immediate_in_order_t::enqueueUSMDeviceAllocExp(
707-
ur_usm_pool_handle_t, const size_t,
708-
const ur_exp_async_usm_alloc_properties_t *, uint32_t,
709-
const ur_event_handle_t *, void **, ur_event_handle_t *) {
710-
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
771+
ur_usm_pool_handle_t pPool, const size_t size,
772+
const ur_exp_async_usm_alloc_properties_t *pProperties,
773+
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
774+
void **ppMem, ur_event_handle_t *phEvent) {
775+
TRACK_SCOPE_LATENCY(
776+
"ur_queue_immediate_in_order_t::enqueueUSMDeviceAllocExp");
777+
778+
return enqueueUSMAllocHelper(pPool, size, pProperties, numEventsInWaitList,
779+
phEventWaitList, ppMem, phEvent,
780+
UR_USM_TYPE_DEVICE);
711781
}
712782

713783
// Asynchronous shared USM allocation: forwards every argument to
// enqueueUSMAllocHelper() with UR_USM_TYPE_SHARED and returns its result.
//
// Diff view: the `714-`..`717-` lines are the previous stub, which returned
// UR_RESULT_ERROR_UNSUPPORTED_FEATURE; the `+` lines are the new body.
ur_result_t ur_queue_immediate_in_order_t::enqueueUSMSharedAllocExp(
714-
ur_usm_pool_handle_t, const size_t,
715-
const ur_exp_async_usm_alloc_properties_t *, uint32_t,
716-
const ur_event_handle_t *, void **, ur_event_handle_t *) {
717-
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
784+
ur_usm_pool_handle_t pPool, const size_t size,
785+
const ur_exp_async_usm_alloc_properties_t *pProperties,
786+
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
787+
void **ppMem, ur_event_handle_t *phEvent) {
788+
TRACK_SCOPE_LATENCY(
789+
"ur_queue_immediate_in_order_t::enqueueUSMSharedAllocExp");
790+
791+
return enqueueUSMAllocHelper(pPool, size, pProperties, numEventsInWaitList,
792+
phEventWaitList, ppMem, phEvent,
793+
UR_USM_TYPE_SHARED);
718794
}
719795

720796
// Asynchronous host USM allocation: forwards every argument to
// enqueueUSMAllocHelper() with UR_USM_TYPE_HOST and returns its result.
//
// Diff view: the `721-`..`724-` lines are the previous stub, which returned
// UR_RESULT_ERROR_UNSUPPORTED_FEATURE; the `+` lines are the new body.
ur_result_t ur_queue_immediate_in_order_t::enqueueUSMHostAllocExp(
721-
ur_usm_pool_handle_t, const size_t,
722-
const ur_exp_async_usm_alloc_properties_t *, uint32_t,
723-
const ur_event_handle_t *, void **, ur_event_handle_t *) {
724-
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
797+
ur_usm_pool_handle_t pPool, const size_t size,
798+
const ur_exp_async_usm_alloc_properties_t *pProperties,
799+
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
800+
void **ppMem, ur_event_handle_t *phEvent) {
801+
TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueUSMHostAllocExp");
802+
803+
return enqueueUSMAllocHelper(pPool, size, pProperties, numEventsInWaitList,
804+
phEventWaitList, ppMem, phEvent,
805+
UR_USM_TYPE_HOST);
725806
}
726807

727808
// Asynchronous USM free. Looks up the UMF pool that owns pMem, records the
// pointer in that pool's enqueued free list together with the output event
// and this queue, then appends the wait and signal commands to the command
// list.
//
// (unnamed pool)        ignored; the owning pool is found via
//                       umfPoolByPtr(pMem).
// pMem                  pointer being freed.
// numEventsInWaitList,
// phEventWaitList       events the command list is made to wait on.
// phEvent               optional output event; when non-null, *phEvent is
//                       stored with the free-list entry and its RefCount is
//                       incremented.
//
// Returns UR_RESULT_ERROR_INVALID_MEM_OBJECT when no UMF pool owns pMem,
// UR_RESULT_ERROR_UNKNOWN when the pool tag cannot be read or is null, and
// UR_RESULT_SUCCESS otherwise.
//
// Diff view: the `728-`..`730-` lines are the previous stub, which returned
// UR_RESULT_ERROR_UNSUPPORTED_FEATURE; the `+` lines are the new body.
ur_result_t ur_queue_immediate_in_order_t::enqueueUSMFreeExp(
728-
ur_usm_pool_handle_t, void *, uint32_t, const ur_event_handle_t *,
729-
ur_event_handle_t *) {
730-
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
809+
ur_usm_pool_handle_t, void *pMem, uint32_t numEventsInWaitList,
810+
const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
811+
TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueUSMFreeExp");
812+
auto commandListLocked = commandListManager.lock();
813+
814+
// NOTE(review): getSignalEvent() runs before pMem is validated. If it
// creates or publishes an event through phEvent, the early returns below
// would leave that event without a signal command — confirm.
auto zeSignalEvent = getSignalEvent(commandListLocked, phEvent,
815+
UR_COMMAND_ENQUEUE_USM_FREE_EXP);
816+
auto [pWaitEvents, numWaitEvents] =
817+
getWaitListView(commandListLocked, phEventWaitList, numEventsInWaitList);
818+
819+
// Find the UMF pool that owns the pointer.
umf_memory_pool_handle_t hPool = umfPoolByPtr(pMem);
820+
if (!hPool) {
821+
return UR_RESULT_ERROR_INVALID_MEM_OBJECT;
822+
}
823+
824+
// The pool's tag is read back as a UsmPool pointer.
UsmPool *usmPool = nullptr;
825+
auto ret = umfPoolGetTag(hPool, (void **)&usmPool);
826+
if (ret != UMF_RESULT_SUCCESS || !usmPool) {
827+
// This should never happen
828+
return UR_RESULT_ERROR_UNKNOWN;
829+
}
830+
831+
size_t size = umfPoolMallocUsableSize(hPool, pMem);
832+
// The event stays null when the caller did not ask for one.
ur_event_handle_t poolEvent = nullptr;
833+
if (phEvent) {
834+
// Presumably *phEvent was populated by getSignalEvent() above — confirm.
// The extra reference is taken for the free-list entry; EnqueuedPool's
// cleanup routines call eventRelease on a non-null stored event.
poolEvent = *phEvent;
835+
poolEvent->RefCount.increment();
836+
}
837+
// NOTE(review): the UsmPool shown in level_zero/usm.hpp in this commit names
// this member `AsyncPool`; confirm which UsmPool definition is in scope here.
// NOTE(review): the entry is inserted before the commands below are
// appended; confirm the intended behaviour if a ZE2UR_CALL below fails.
usmPool->asyncPool.insert(pMem, size, poolEvent, this);
838+
839+
if (numWaitEvents > 0) {
840+
ZE2UR_CALL(
841+
zeCommandListAppendWaitOnEvents,
842+
(commandListLocked->getZeCommandList(), numWaitEvents, pWaitEvents));
843+
}
844+
845+
// Only append a signal when getSignalEvent() produced an event.
if (zeSignalEvent) {
846+
ZE2UR_CALL(zeCommandListAppendSignalEvent,
847+
(commandListLocked->getZeCommandList(), zeSignalEvent));
848+
}
849+
850+
return UR_RESULT_SUCCESS;
731851
}
732852

733853
ur_result_t ur_queue_immediate_in_order_t::bindlessImagesImageCopyExp(
@@ -866,9 +986,9 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericCommandListsExp(
866986
"ur_queue_immediate_in_order_t::enqueueGenericCommandListsExp");
867987

868988
auto commandListLocked = commandListManager.lock();
989+
869990
auto zeSignalEvent =
870991
getSignalEvent(commandListLocked, phEvent, callerCommand);
871-
872992
auto [pWaitEvents, numWaitEvents] =
873993
getWaitListView(commandListLocked, phEventWaitList, numEventsInWaitList,
874994
additionalWaitEvent);

unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp

+7
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,13 @@ struct ur_queue_immediate_in_order_t : ur_object, public ur_queue_t_ {
6464

6565
void recordSubmittedKernel(ur_kernel_handle_t hKernel);
6666

67+
// Shared helper for the three asynchronous USM allocation entry points
// (device / shared / host); `Type` tells it which kind is being requested.
// Declared ahead of the `public:` label that follows it in this struct.
ur_result_t
68+
enqueueUSMAllocHelper(ur_usm_pool_handle_t pPool, const size_t size,
69+
const ur_exp_async_usm_alloc_properties_t *pProperties,
70+
uint32_t numEventsInWaitList,
71+
const ur_event_handle_t *phEventWaitList, void **ppMem,
72+
ur_event_handle_t *phEvent, ur_usm_type_t Type);
73+
6774
public:
6875
ur_queue_immediate_in_order_t(ur_context_handle_t, ur_device_handle_t,
6976
const ur_queue_properties_t *);

0 commit comments

Comments
 (0)