Skip to content

[UR][Offload] Event waiting #19594

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: sycl
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
182 changes: 135 additions & 47 deletions unified-runtime/source/adapters/offload/enqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,126 @@
#include "queue.hpp"
#include "ur2offload.hpp"

namespace {
// Enqueues a wait on `Queue` for the Offload events backing the given UR
// events. A no-op (returning success) when NumEvents is 0.
//
// Returns the result of olWaitEvents, or OL_SUCCESS if nothing was enqueued.
ol_result_t waitOnEvents(ol_queue_handle_t Queue,
                         const ur_event_handle_t *UrEvents, size_t NumEvents) {
  if (NumEvents) {
    // Gather the underlying Offload event handles from the UR events.
    std::vector<ol_event_handle_t> OlEvents;
    OlEvents.reserve(NumEvents);
    for (size_t I = 0; I < NumEvents; I++) {
      OlEvents.push_back(UrEvents[I]->OffloadEvent);
    }

    // Propagate the result to the caller; previously it was silently
    // discarded, so enqueue failures went unnoticed.
    return olWaitEvents(Queue, OlEvents.data(), NumEvents);
  }
  return OL_SUCCESS;
}

// Creates a UR event of command type `Type`, backed by a fresh Offload event
// recorded on `OlQueue`, and stores it in *UrEvent. A no-op when UrEvent is
// null (the caller did not request an output event).
//
// On failure the partially constructed UR event is freed and the Offload
// error is returned; *UrEvent is only written on success.
ol_result_t makeEvent(ur_command_t Type, ol_queue_handle_t OlQueue,
                      ur_queue_handle_t UrQueue, ur_event_handle_t *UrEvent) {
  if (UrEvent) {
    auto *Event = new ur_event_handle_t_(Type, UrQueue);
    if (auto Res = olCreateEvent(OlQueue, &Event->OffloadEvent)) {
      delete Event;
      return Res;
    }
    *UrEvent = Event;
  }
  return OL_SUCCESS;
}

// Shared implementation of urEnqueueEventsWait (Barrier = false) and
// urEnqueueEventsWaitWithBarrier (Barrier = true).
//
// Makes work later submitted to hQueue wait on phEventWaitList (or, when the
// wait list is empty, on all previously enqueued commands). With Barrier set,
// additionally blocks every other queue in the out-of-order queue bank on the
// barrier event and publishes it via hQueue->Barrier so that queues created
// afterwards also honour it.
template <bool Barrier>
ur_result_t doWait(ur_queue_handle_t hQueue, uint32_t numEventsInWaitList,
                   const ur_event_handle_t *phEventWaitList,
                   ur_event_handle_t *phEvent) {
  constexpr ur_command_t TYPE =
      Barrier ? UR_COMMAND_EVENTS_WAIT_WITH_BARRIER : UR_COMMAND_EVENTS_WAIT;
  ol_queue_handle_t TargetQueue;
  if (!numEventsInWaitList && hQueue->isInOrder()) {
    // In order queue so all work is done in submission order, so it's a
    // no-op; we only need to materialize an output event if one was asked for.
    if (phEvent) {
      OL_RETURN_ON_ERR(hQueue->nextQueue(TargetQueue));
      OL_RETURN_ON_ERR(makeEvent(TYPE, TargetQueue, hQueue, phEvent));
    }
    return UR_RESULT_SUCCESS;
  }
  OL_RETURN_ON_ERR(hQueue->nextQueue(TargetQueue));

  if (!numEventsInWaitList) {
    // "If the event list is empty, it waits for all previously enqueued
    // commands to complete."

    // Create events on each active queue for an arbitrary thread to block on
    // TODO: Can we efficiently check if each thread is "finished" rather than
    // creating an event?
    // NOTE(review): stream_queue_t::syncEvents has optimizations to only sync
    // the streams that actually need it — consider adopting them here.
    std::vector<ol_event_handle_t> OffloadHandles{};
    for (auto *Q : hQueue->OffloadQueues) {
      // The queue bank is populated front-to-back; the first null slot marks
      // the end of the active queues.
      if (Q == nullptr) {
        break;
      }
      // No need to wait on the queue we are enqueuing the wait onto.
      if (Q == TargetQueue) {
        continue;
      }
      OL_RETURN_ON_ERR(olCreateEvent(Q, &OffloadHandles.emplace_back()));
    }
    OL_RETURN_ON_ERR(olWaitEvents(TargetQueue, OffloadHandles.data(),
                                  OffloadHandles.size()));
  } else {
    // Explicit wait list: block TargetQueue on exactly those events.
    OL_RETURN_ON_ERR(
        waitOnEvents(TargetQueue, phEventWaitList, numEventsInWaitList));
  }

  OL_RETURN_ON_ERR(makeEvent(TYPE, TargetQueue, hQueue, phEvent));

  if constexpr (Barrier) {
    // Reuse the caller's output event as the barrier when available;
    // otherwise create a dedicated one on TargetQueue.
    ol_event_handle_t BarrierEvent;
    if (phEvent) {
      BarrierEvent = (*phEvent)->OffloadEvent;
    } else {
      OL_RETURN_ON_ERR(olCreateEvent(TargetQueue, &BarrierEvent));
    }

    // Ensure any newly created work waits on this barrier
    hQueue->Barrier.store(BarrierEvent);

    // Block all existing threads on the barrier
    for (auto *Q : hQueue->OffloadQueues) {
      if (Q == nullptr) {
        break;
      }
      if (Q == TargetQueue) {
        continue;
      }
      OL_RETURN_ON_ERR(olWaitEvents(Q, &BarrierEvent, 1));
    }
  }

  return UR_RESULT_SUCCESS;
}
} // namespace

// Enqueues a wait on the given event list (or on all previously enqueued
// commands when the list is empty). Thin forwarder to doWait<false>.
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait(
    ur_queue_handle_t hQueue, uint32_t numEventsInWaitList,
    const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
  return doWait<false>(hQueue, numEventsInWaitList, phEventWaitList, phEvent);
}

// Same as urEnqueueEventsWait, but also acts as a barrier: subsequent
// commands on the queue may not start until the wait completes. Thin
// forwarder to doWait<true>.
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
    ur_queue_handle_t hQueue, uint32_t numEventsInWaitList,
    const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
  return doWait<true>(hQueue, numEventsInWaitList, phEventWaitList, phEvent);
}

UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
const size_t *pLocalWorkSize, uint32_t, const ur_kernel_launch_property_t *,
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
ur_event_handle_t *phEvent) {
// Ignore wait list for now
(void)numEventsInWaitList;
(void)phEventWaitList;
//
ol_queue_handle_t Queue;
OL_RETURN_ON_ERR(hQueue->nextQueue(Queue));
OL_RETURN_ON_ERR(waitOnEvents(Queue, phEventWaitList, numEventsInWaitList));

(void)pGlobalWorkOffset;

Expand Down Expand Up @@ -67,20 +177,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
LaunchArgs.GroupSize.z = GroupSize[2];
LaunchArgs.DynSharedMemory = 0;

ol_queue_handle_t Queue;
OL_RETURN_ON_ERR(hQueue->nextQueue(Queue));
OL_RETURN_ON_ERR(olLaunchKernel(
Queue, hQueue->OffloadDevice, hKernel->OffloadKernel,
hKernel->Args.getStorage(), hKernel->Args.getStorageSize(), &LaunchArgs));

if (phEvent) {
auto *Event = new ur_event_handle_t_(UR_COMMAND_KERNEL_LAUNCH, hQueue);
if (auto Res = olCreateEvent(Queue, &Event->OffloadEvent)) {
delete Event;
return offloadResultToUR(Res);
};
*phEvent = Event;
}
OL_RETURN_ON_ERR(makeEvent(UR_COMMAND_KERNEL_LAUNCH, Queue, hQueue, phEvent));
return UR_RESULT_SUCCESS;
}

Expand All @@ -103,10 +204,9 @@ ur_result_t doMemcpy(ur_command_t Command, ur_queue_handle_t hQueue,
size_t size, bool blocking, uint32_t numEventsInWaitList,
const ur_event_handle_t *phEventWaitList,
ur_event_handle_t *phEvent) {
// Ignore wait list for now
(void)numEventsInWaitList;
(void)phEventWaitList;
//
ol_queue_handle_t Queue;
OL_RETURN_ON_ERR(hQueue->nextQueue(Queue));
OL_RETURN_ON_ERR(waitOnEvents(Queue, phEventWaitList, numEventsInWaitList));

if (blocking) {
OL_RETURN_ON_ERR(
Expand All @@ -117,8 +217,6 @@ ur_result_t doMemcpy(ur_command_t Command, ur_queue_handle_t hQueue,
return UR_RESULT_SUCCESS;
}

ol_queue_handle_t Queue;
OL_RETURN_ON_ERR(hQueue->nextQueue(Queue));
OL_RETURN_ON_ERR(
olMemcpy(Queue, DestPtr, DestDevice, SrcPtr, SrcDevice, size));
if (phEvent) {
Expand Down Expand Up @@ -192,17 +290,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite(
numEventsInWaitList, phEventWaitList, phEvent);
}

ur_result_t enqueueNoOp(ur_command_t Type, ur_queue_handle_t hQueue,
ur_event_handle_t *phEvent) {
// This path is a no-op, but we can't output a real event because
// Offload doesn't currently support creating arbitrary events, and we
// don't know the last real event in the queue. Instead we just have to
// wait on the whole queue and then return an empty (implicitly
// finished) event.
*phEvent = ur_event_handle_t_::createEmptyEvent(Type, hQueue);
return urQueueFinish(hQueue);
}

UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap(
ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingMap,
ur_map_flags_t mapFlags, size_t offset, size_t size,
Expand All @@ -226,15 +313,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap(
Result = urEnqueueMemBufferRead(hQueue, hBuffer, blockingMap, offset, size,
MapPtr, numEventsInWaitList,
phEventWaitList, phEvent);
} else {
if (IsPinned) {
// TODO: Ignore the event waits list for now. When urEnqueueEventsWait is
// implemented we can call it on the wait list.
}

if (phEvent) {
enqueueNoOp(UR_COMMAND_MEM_BUFFER_MAP, hQueue, phEvent);
} else if (numEventsInWaitList || phEvent) {
ol_queue_handle_t Queue;
OL_RETURN_ON_ERR(hQueue->nextQueue(Queue));
if ((!hQueue->isInOrder() && phEvent) || hQueue->isInOrder()) {
// Out-of-order queues running no-op work only have side effects if there
// is an output event
waitOnEvents(Queue, phEventWaitList, numEventsInWaitList);
}
OL_RETURN_ON_ERR(
makeEvent(UR_COMMAND_MEM_BUFFER_MAP, Queue, hQueue, phEvent));
}
*ppRetMap = MapPtr;

Expand All @@ -260,15 +348,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap(
Result = urEnqueueMemBufferWrite(
hQueue, hMem, true, Map->MapOffset, Map->MapSize, pMappedPtr,
numEventsInWaitList, phEventWaitList, phEvent);
} else {
if (IsPinned) {
// TODO: Ignore the event waits list for now. When urEnqueueEventsWait is
// implemented we can call it on the wait list.
}

if (phEvent) {
enqueueNoOp(UR_COMMAND_MEM_UNMAP, hQueue, phEvent);
} else if (numEventsInWaitList || phEvent) {
ol_queue_handle_t Queue;
OL_RETURN_ON_ERR(hQueue->nextQueue(Queue));
if ((!hQueue->isInOrder() && phEvent) || hQueue->isInOrder()) {
// Out-of-order queues running no-op work only have side effects if there
// is an output event
waitOnEvents(Queue, phEventWaitList, numEventsInWaitList);
}
OL_RETURN_ON_ERR(makeEvent(UR_COMMAND_MEM_UNMAP, Queue, hQueue, phEvent));
}
BufferImpl.unmap(pMappedPtr);

Expand Down
18 changes: 13 additions & 5 deletions unified-runtime/source/adapters/offload/queue.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ struct ur_queue_handle_t_ : RefCounted {
: OffloadQueues((Flags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE)
? 1
: OOO_QUEUE_POOL_SIZE),
QueueOffset(0), OffloadDevice(Device), UrContext(UrContext),
Flags(Flags) {}
QueueOffset(0), Barrier(nullptr), OffloadDevice(Device),
UrContext(UrContext), Flags(Flags) {}

// In-order queues only have one element here, while out of order queues have
// a bank of queues to use. We rotate through them round robin instead of
Expand All @@ -35,19 +35,27 @@ struct ur_queue_handle_t_ : RefCounted {
// `stream_queue_t`. In the future, if we want more performance or it
// simplifies the implementation of a feature, we can consider using it.
std::vector<ol_queue_handle_t> OffloadQueues;
size_t QueueOffset;
std::atomic<size_t> QueueOffset;
std::atomic<ol_event_handle_t> Barrier;
ol_device_handle_t OffloadDevice;
ur_context_handle_t UrContext;
ur_queue_flags_t Flags;

bool isInOrder() const { return OffloadQueues.size() == 1; }

ol_result_t nextQueue(ol_queue_handle_t &Handle) {
auto &Slot = OffloadQueues[QueueOffset++];
QueueOffset %= OffloadQueues.size();
auto &Slot = OffloadQueues[(QueueOffset++) % OffloadQueues.size()];

if (!Slot) {
if (auto Res = olCreateQueue(OffloadDevice, &Slot)) {
return Res;
}

if (auto Event = Barrier.load()) {
if (auto Res = olWaitEvents(Slot, &Event, 1)) {
return Res;
}
}
}

Handle = Slot;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -170,8 +170,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable(
}
pDdiTable->pfnDeviceGlobalVariableRead = urEnqueueDeviceGlobalVariableRead;
pDdiTable->pfnDeviceGlobalVariableWrite = urEnqueueDeviceGlobalVariableWrite;
pDdiTable->pfnEventsWait = nullptr;
pDdiTable->pfnEventsWaitWithBarrier = nullptr;
pDdiTable->pfnEventsWait = urEnqueueEventsWait;
pDdiTable->pfnEventsWaitWithBarrier = urEnqueueEventsWaitWithBarrier;
pDdiTable->pfnKernelLaunch = urEnqueueKernelLaunch;
pDdiTable->pfnMemBufferCopy = nullptr;
pDdiTable->pfnMemBufferCopyRect = nullptr;
Expand Down
Loading