Skip to content

Commit 443b275

Browse files
committed
[SYCL][UR][L0 v2] Implement ooo queue using ooo command-list
1 parent 121c876 commit 443b275

File tree

3 files changed

+153
-225
lines changed

3 files changed

+153
-225
lines changed

unified-runtime/source/adapters/level_zero/v2/queue_create.cpp

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,8 @@ static ze_command_queue_priority_t getZePriority(ur_queue_flags_t flags) {
4545
}
4646

4747
static event_flags_t eventFlagsFromQueueFlags(ur_queue_flags_t flags) {
48-
event_flags_t eventFlags = EVENT_FLAGS_COUNTER;
48+
bool ooo = flags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE;
49+
event_flags_t eventFlags = ooo ? 0 : EVENT_FLAGS_COUNTER;
4950
if (flags & UR_QUEUE_FLAG_PROFILING_ENABLE)
5051
eventFlags |= EVENT_FLAGS_PROFILING_ENABLED;
5152
return eventFlags;
@@ -90,8 +91,6 @@ ur_result_t urQueueCreateWithNativeHandle(
9091
ur_native_handle_t hNativeQueue, ur_context_handle_t hContext,
9192
ur_device_handle_t hDevice, const ur_queue_native_properties_t *pProperties,
9293
ur_queue_handle_t *phQueue) try {
93-
// TODO: For now, always assume it's immediate, in-order
94-
9594
bool ownNativeHandle = pProperties ? pProperties->isNativeHandleOwned : false;
9695
ur_queue_flags_t flags = 0;
9796

@@ -119,9 +118,12 @@ ur_result_t urQueueCreateWithNativeHandle(
119118
}
120119
});
121120

122-
*phQueue = ur_queue_handle_t_::create<v2::ur_queue_immediate_in_order_t>(
123-
hContext, hDevice, std::move(commandListHandle),
124-
v2::eventFlagsFromQueueFlags(flags), flags);
121+
v2::event_flags_t eventFlags = 0;
122+
if (flags & UR_QUEUE_FLAG_PROFILING_ENABLE)
123+
eventFlags = v2::EVENT_FLAGS_PROFILING_ENABLED;
124+
125+
*phQueue = ur_queue_handle_t_::create<v2::ur_queue_immediate_out_of_order_t>(
126+
hContext, hDevice, std::move(commandListHandle), eventFlags, flags);
125127

126128
return UR_RESULT_SUCCESS;
127129
} catch (...) {

unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp

Lines changed: 29 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -14,34 +14,30 @@
1414

1515
namespace v2 {
1616

17-
template <size_t N>
18-
std::array<ur_command_list_manager, N> createCommandListManagers(
19-
ur_context_handle_t hContext, ur_device_handle_t hDevice, uint32_t ordinal,
20-
ze_command_queue_priority_t priority, std::optional<int32_t> index) {
21-
return createArrayOf<ur_command_list_manager, N>([&](size_t) {
22-
return ur_command_list_manager(
23-
hContext, hDevice,
24-
hContext->getCommandListCache().getImmediateCommandList(
25-
hDevice->ZeDevice,
26-
{true, ordinal, true /* always enable copy offload */},
27-
ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, priority, index));
28-
});
29-
}
30-
3117
ur_queue_immediate_out_of_order_t::ur_queue_immediate_out_of_order_t(
3218
ur_context_handle_t hContext, ur_device_handle_t hDevice, uint32_t ordinal,
3319
ze_command_queue_priority_t priority, std::optional<int32_t> index,
3420
event_flags_t eventFlags, ur_queue_flags_t flags)
3521
: hContext(hContext), hDevice(hDevice),
3622
eventPool(hContext->getEventPoolCache(PoolCacheType::Immediate)
3723
.borrow(hDevice->Id.value(), eventFlags)),
38-
commandListManagers(createCommandListManagers<numCommandLists>(
39-
hContext, hDevice, ordinal, priority, index)),
40-
flags(flags) {
41-
for (size_t i = 0; i < numCommandLists; i++) {
42-
barrierEvents[i] = eventPool->allocate();
43-
}
44-
}
24+
commandListManager(
25+
hContext, hDevice,
26+
hContext->getCommandListCache().getImmediateCommandList(
27+
hDevice->ZeDevice,
28+
{false, ordinal, true /* always enable copy offload */},
29+
ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, priority, index)),
30+
flags(flags) {}
31+
32+
ur_queue_immediate_out_of_order_t::ur_queue_immediate_out_of_order_t(
33+
ur_context_handle_t hContext, ur_device_handle_t hDevice,
34+
raii::command_list_unique_handle commandListHandle,
35+
event_flags_t eventFlags, ur_queue_flags_t flags)
36+
: hContext(hContext), hDevice(hDevice),
37+
eventPool(hContext->getEventPoolCache(PoolCacheType::Immediate)
38+
.borrow(hDevice->Id.value(), eventFlags)),
39+
commandListManager(hContext, hDevice, std::move(commandListHandle)),
40+
flags(flags) {}
4541

4642
ur_result_t ur_queue_immediate_out_of_order_t::queueGetInfo(
4743
ur_queue_info_t propName, size_t propSize, void *pPropValue,
@@ -72,13 +68,7 @@ ur_result_t ur_queue_immediate_out_of_order_t::queueGetInfo(
7268
}
7369
};
7470

75-
auto commandListManagersLocked = commandListManagers.lock();
76-
77-
bool empty = std::all_of(
78-
commandListManagersLocked->begin(), commandListManagersLocked->end(),
79-
[&](auto &cmdListManager) {
80-
return isCmdListEmpty(cmdListManager.getZeCommandList());
81-
});
71+
bool empty = isCmdListEmpty(commandListManager.lock()->getZeCommandList());
8272

8373
return ReturnValue(empty);
8474
}
@@ -96,8 +86,7 @@ ur_result_t ur_queue_immediate_out_of_order_t::queueGetInfo(
9686
ur_result_t ur_queue_immediate_out_of_order_t::queueGetNativeHandle(
9787
ur_queue_native_desc_t *pDesc, ur_native_handle_t *phNativeQueue) {
9888
*phNativeQueue = reinterpret_cast<ur_native_handle_t>(
99-
(*commandListManagers.get_no_lock())[getNextCommandListId()]
100-
.getZeCommandList());
89+
commandListManager.get_no_lock()->getZeCommandList());
10190
if (pDesc && pDesc->pNativeData) {
10291
// pNativeData == isImmediateQueue
10392
*(reinterpret_cast<int32_t *>(pDesc->pNativeData)) = 1;
@@ -108,13 +97,15 @@ ur_result_t ur_queue_immediate_out_of_order_t::queueGetNativeHandle(
10897
ur_result_t ur_queue_immediate_out_of_order_t::queueFinish() {
10998
TRACK_SCOPE_LATENCY("ur_queue_immediate_out_of_order_t::queueFinish");
11099

111-
auto commandListManagersLocked = commandListManagers.lock();
100+
auto commandListManagerLocked = commandListManager.lock();
101+
102+
ZE2UR_CALL(zeCommandListHostSynchronize,
103+
(commandListManagerLocked->getZeCommandList(), UINT64_MAX));
112104

113-
for (size_t i = 0; i < numCommandLists; i++) {
114-
ZE2UR_CALL(zeCommandListHostSynchronize,
115-
(commandListManagersLocked[i].getZeCommandList(), UINT64_MAX));
116-
UR_CALL(commandListManagersLocked[i].releaseSubmittedKernels());
105+
for (auto &event : pendingEvents) {
106+
event->release();
117107
}
108+
pendingEvents.clear();
118109

119110
hContext->getAsyncPool()->cleanupPoolsForQueue(this);
120111
hContext->forEachUsmPool([this](ur_usm_pool_handle_t hPool) {
@@ -132,10 +123,6 @@ ur_result_t ur_queue_immediate_out_of_order_t::queueFlush() {
132123
ur_queue_immediate_out_of_order_t::~ur_queue_immediate_out_of_order_t() {
133124
try {
134125
UR_CALL_THROWS(queueFinish());
135-
136-
for (size_t i = 0; i < numCommandLists; i++) {
137-
barrierEvents[i]->release();
138-
}
139126
} catch (...) {
140127
// Ignore errors during destruction
141128
}
@@ -146,43 +133,9 @@ ur_result_t ur_queue_immediate_out_of_order_t::enqueueEventsWaitWithBarrier(
146133
ur_event_handle_t *phEvent) {
147134
TRACK_SCOPE_LATENCY(
148135
"ur_queue_immediate_out_of_order_t::enqueueEventsWaitWithBarrier");
149-
// Since we use L0 in-order command lists, we don't need a real L0 barrier,
150-
// just wait for requested events in potentially different queues and add a
151-
// "barrier" event signal because it is already guaranteed that previous
152-
// commands in this queue are completed when the signal is started. However,
153-
// we do need to use barrier if profiling is enabled: see
154-
// zeCommandListAppendWaitOnEvents
155-
bool needsRealBarrier = (flags & UR_QUEUE_FLAG_PROFILING_ENABLE) != 0;
156-
auto barrierFn = needsRealBarrier
157-
? &ur_command_list_manager::appendEventsWaitWithBarrier
158-
: &ur_command_list_manager::appendEventsWait;
159-
160-
auto commandListManagersLocked = commandListManagers.lock();
161-
162-
// Enqueue wait for the user-provider events on the first command list.
163-
UR_CALL(commandListManagersLocked[0].appendEventsWait(
164-
numEventsInWaitList, phEventWaitList, barrierEvents[0]));
165-
166-
// Request barrierEvents[id] to be signaled on remaining command lists.
167-
for (size_t id = 1; id < numCommandLists; id++) {
168-
UR_CALL(commandListManagersLocked[id].appendEventsWait(0, nullptr,
169-
barrierEvents[id]));
170-
}
171-
172-
// Enqueue barriers on all command lists by waiting on barrierEvents.
173-
174-
if (phEvent) {
175-
UR_CALL(
176-
std::invoke(barrierFn, commandListManagersLocked[0], numCommandLists,
177-
barrierEvents.data(),
178-
createEventIfRequested(eventPool.get(), phEvent, this)));
179-
}
180-
181-
for (size_t id = phEvent ? 1 : 0; id < numCommandLists; id++) {
182-
UR_CALL(std::invoke(barrierFn, commandListManagersLocked[0],
183-
numCommandLists, barrierEvents.data(), nullptr));
184-
}
185-
136+
UR_CALL(commandListManager.lock()->appendEventsWaitWithBarrier(
137+
numEventsInWaitList, phEventWaitList,
138+
createEventAndStoreIfRequested(phEvent)));
186139
return UR_RESULT_SUCCESS;
187140
}
188141

0 commit comments

Comments
 (0)