Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions sycl/source/detail/event_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,11 +87,6 @@ class event_impl {
return std::make_shared<event_impl>(queue, private_tag{});
}

static std::shared_ptr<event_impl> create_discarded_event() {
return std::make_shared<event_impl>(HostEventState::HES_Discarded,
private_tag{});
}

static std::shared_ptr<event_impl> create_completed_host_event() {
return std::make_shared<event_impl>(HostEventState::HES_Complete,
private_tag{});
Expand Down
98 changes: 46 additions & 52 deletions sycl/source/detail/queue_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,12 +77,12 @@ template <> device queue_impl::get_info<info::queue::device>() const {
return get_device();
}

static event
static EventImplPtr
prepareSYCLEventAssociatedWithQueue(detail::queue_impl &QueueImpl) {
auto EventImpl = detail::event_impl::create_device_event(QueueImpl);
EventImpl->setContextImpl(QueueImpl.getContextImpl());
EventImpl->setStateIncomplete();
return detail::createSyclObjFromImpl<event>(std::move(EventImpl));
return EventImpl;
}

const std::vector<event> &
Expand All @@ -109,9 +109,9 @@ queue_impl::getExtendDependencyList(const std::vector<event> &DepEvents,
return MutableVec;
}

event queue_impl::memset(void *Ptr, int Value, size_t Count,
const std::vector<event> &DepEvents,
bool CallerNeedsEvent) {
EventImplPtr queue_impl::memset(void *Ptr, int Value, size_t Count,
const std::vector<event> &DepEvents,
bool CallerNeedsEvent) {
#if XPTI_ENABLE_INSTRUMENTATION
// We need a code pointer value and we use the object ptr; if code location
// information is available, we will have function name and source file
Expand Down Expand Up @@ -172,9 +172,10 @@ void report(const code_location &CodeLoc) {
std::cout << '\n';
}

event queue_impl::memcpy(void *Dest, const void *Src, size_t Count,
const std::vector<event> &DepEvents,
bool CallerNeedsEvent, const code_location &CodeLoc) {
EventImplPtr queue_impl::memcpy(void *Dest, const void *Src, size_t Count,
const std::vector<event> &DepEvents,
bool CallerNeedsEvent,
const code_location &CodeLoc) {
#if XPTI_ENABLE_INSTRUMENTATION
// We need a code pointer value and we use the object ptr; if code location
// is available, we use the source file information along with the object
Expand Down Expand Up @@ -216,21 +217,20 @@ event queue_impl::memcpy(void *Dest, const void *Src, size_t Count,
MemoryManager::copy_usm, Src, *this, Count, Dest);
}

event queue_impl::mem_advise(const void *Ptr, size_t Length,
ur_usm_advice_flags_t Advice,
const std::vector<event> &DepEvents,
bool CallerNeedsEvent) {
EventImplPtr queue_impl::mem_advise(const void *Ptr, size_t Length,
ur_usm_advice_flags_t Advice,
const std::vector<event> &DepEvents,
bool CallerNeedsEvent) {
return submitMemOpHelper(
DepEvents, CallerNeedsEvent,
[&](handler &CGH) { CGH.mem_advise(Ptr, Length, Advice); },
MemoryManager::advise_usm, Ptr, *this, Length, Advice);
}

event queue_impl::memcpyToDeviceGlobal(void *DeviceGlobalPtr, const void *Src,
bool IsDeviceImageScope, size_t NumBytes,
size_t Offset,
const std::vector<event> &DepEvents,
bool CallerNeedsEvent) {
EventImplPtr queue_impl::memcpyToDeviceGlobal(
void *DeviceGlobalPtr, const void *Src, bool IsDeviceImageScope,
size_t NumBytes, size_t Offset, const std::vector<event> &DepEvents,
bool CallerNeedsEvent) {
return submitMemOpHelper(
DepEvents, CallerNeedsEvent,
[&](handler &CGH) {
Expand All @@ -241,12 +241,10 @@ event queue_impl::memcpyToDeviceGlobal(void *DeviceGlobalPtr, const void *Src,
*this, NumBytes, Offset, Src);
}

event queue_impl::memcpyFromDeviceGlobal(void *Dest,
const void *DeviceGlobalPtr,
bool IsDeviceImageScope,
size_t NumBytes, size_t Offset,
const std::vector<event> &DepEvents,
bool CallerNeedsEvent) {
EventImplPtr queue_impl::memcpyFromDeviceGlobal(
void *Dest, const void *DeviceGlobalPtr, bool IsDeviceImageScope,
size_t NumBytes, size_t Offset, const std::vector<event> &DepEvents,
bool CallerNeedsEvent) {
return submitMemOpHelper(
DepEvents, CallerNeedsEvent,
[&](handler &CGH) {
Expand Down Expand Up @@ -384,7 +382,8 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
}
}

return EventImpl;
// TODO Avoid event creation in the first place if it's not needed
return CallerNeedsEvent ? EventImpl : nullptr;
Comment on lines +385 to +386
Copy link

Copilot AI Mar 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In queue_impl::submit_impl(), the new "return CallerNeedsEvent ? EventImpl : nullptr" only applies to the main path; the earlier noLastEventPath branch returns finalizeHandlerInOrderNoEventsUnlocked(Handler) directly, which can still yield a non-null EventImplPtr even when CallerNeedsEvent=false. This can violate the CallerNeedsEvent contract and can trip the assert in submitWithHandler() (ReturnEvent==false but submit_impl returns a non-null pointer). Consider routing the noLastEventPath through the same tail return logic (or at least applying the same CallerNeedsEvent filtering there).

Copilot uses AI. Check for mistakes.
}

EventImplPtr queue_impl::submit_kernel_scheduler_bypass(
Expand Down Expand Up @@ -786,31 +785,28 @@ detail::EventImplPtr queue_impl::submit_direct(
}

template <typename HandlerFuncT>
event queue_impl::submitWithHandler(const std::vector<event> &DepEvents,
bool CallerNeedsEvent,
HandlerFuncT HandlerFunc) {
detail::SubmissionInfo SI{};
EventImplPtr queue_impl::submitWithHandler(const std::vector<event> &DepEvents,
bool CallerNeedsEvent,
HandlerFuncT HandlerFunc) {
auto L = [&](handler &CGH) {
CGH.depends_on(DepEvents);
HandlerFunc(CGH);
};
detail::type_erased_cgfo_ty CGF{L};

if (!CallerNeedsEvent && isInOrder()) {
submit_without_event(CGF, SI,
/*CodeLoc*/ {}, /*IsTopCodeLoc*/ true);
return createSyclObjFromImpl<event>(event_impl::create_discarded_event());
}
return submit_with_event(CGF, SI,
/*CodeLoc*/ {}, /*IsTopCodeLoc*/ true);
const bool ReturnEvent = CallerNeedsEvent || !isInOrder();
EventImplPtr EventImpl =
submit_impl(CGF, ReturnEvent, /*CodeLoc*/ {}, /*IsTopCodeLoc*/ true,
/*SubmissionInfo*/ {});
assert(ReturnEvent == !!EventImpl);
return EventImpl;
}

template <typename HandlerFuncT, typename MemOpFuncT, typename... MemOpArgTs>
event queue_impl::submitMemOpHelper(const std::vector<event> &DepEvents,
bool CallerNeedsEvent,
HandlerFuncT HandlerFunc,
MemOpFuncT MemOpFunc,
MemOpArgTs &&...MemOpArgs) {
EventImplPtr
queue_impl::submitMemOpHelper(const std::vector<event> &DepEvents,
bool CallerNeedsEvent, HandlerFuncT HandlerFunc,
MemOpFuncT MemOpFunc, MemOpArgTs &&...MemOpArgs) {
// We need to submit command and update the last event under same lock if we
// have in-order queue.
{
Expand All @@ -833,41 +829,39 @@ event queue_impl::submitMemOpHelper(const std::vector<event> &DepEvents,
getUrEvents(ExpandedDepEvents),
/*PiEvent*/ nullptr);

return createSyclObjFromImpl<event>(
event_impl::create_discarded_event());
return nullptr;
}

event ResEvent = prepareSYCLEventAssociatedWithQueue(*this);
const auto &EventImpl = detail::getSyclObjImpl(ResEvent);
EventImplPtr ResEventImpl = prepareSYCLEventAssociatedWithQueue(*this);
{
NestedCallsTracker tracker;
ur_event_handle_t UREvent = nullptr;
EventImpl->setSubmissionTime();
ResEventImpl->setSubmissionTime();
MemOpFunc(std::forward<MemOpArgTs>(MemOpArgs)...,
getUrEvents(ExpandedDepEvents), &UREvent);
EventImpl->setHandle(UREvent);
EventImpl->setEnqueued();
ResEventImpl->setHandle(UREvent);
ResEventImpl->setEnqueued();
// connect returned event with dependent events
if (!isInOrder()) {
std::vector<EventImplPtr> &ExpandedDepEventImplPtrs =
EventImpl->getPreparedDepsEvents();
ResEventImpl->getPreparedDepsEvents();
ExpandedDepEventImplPtrs.reserve(ExpandedDepEvents.size());
for (const event &DepEvent : ExpandedDepEvents)
ExpandedDepEventImplPtrs.push_back(
detail::getSyclObjImpl(DepEvent));

// EventImpl is local for current thread, no need to lock.
EventImpl->cleanDepEventsThroughOneLevelUnlocked();
// ResEventImpl is local for current thread, no need to lock.
ResEventImpl->cleanDepEventsThroughOneLevelUnlocked();
}
}

if (isInOrder() && !isNoEventsMode) {
auto &EventToStoreIn = MGraph.expired() ? MDefaultGraphDeps.LastEventPtr
: MExtGraphDeps.LastEventPtr;
EventToStoreIn = EventImpl;
EventToStoreIn = ResEventImpl;
}

return ResEvent;
return ResEventImpl;
}
}
return submitWithHandler(DepEvents, CallerNeedsEvent, HandlerFunc);
Expand Down
49 changes: 27 additions & 22 deletions sycl/source/detail/queue_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -562,8 +562,9 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
/// dependencies.
/// \param CallerNeedsEvent specifies if the caller expects a usable event.
/// \return an event representing fill operation.
event memset(void *Ptr, int Value, size_t Count,
const std::vector<event> &DepEvents, bool CallerNeedsEvent);
EventImplPtr memset(void *Ptr, int Value, size_t Count,
const std::vector<event> &DepEvents,
bool CallerNeedsEvent);
Comment on lines 563 to +567
Copy link

Copilot AI Mar 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The doxygen for memset/memcpy/mem_advise still reads as if a (non-null) event is always returned, but these functions now return EventImplPtr and can return nullptr when CallerNeedsEvent is false. Please update the "\return"/API comment to explicitly document the nullptr behavior so internal callers know they must handle it.

Copilot uses AI. Check for mistakes.
/// Copies data from one memory region to another, both pointed by
/// USM pointers.
///
Expand All @@ -574,9 +575,9 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
/// dependencies.
/// \param CallerNeedsEvent specifies if the caller expects a usable event.
/// \return an event representing copy operation.
event memcpy(void *Dest, const void *Src, size_t Count,
const std::vector<event> &DepEvents, bool CallerNeedsEvent,
const code_location &CodeLoc);
EventImplPtr memcpy(void *Dest, const void *Src, size_t Count,
const std::vector<event> &DepEvents,
bool CallerNeedsEvent, const code_location &CodeLoc);
/// Provides additional information to the underlying runtime about how
/// different allocations are used.
///
Expand All @@ -587,8 +588,10 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
/// dependencies.
/// \param CallerNeedsEvent specifies if the caller expects a usable event.
/// \return an event representing advise operation.
event mem_advise(const void *Ptr, size_t Length, ur_usm_advice_flags_t Advice,
const std::vector<event> &DepEvents, bool CallerNeedsEvent);
EventImplPtr mem_advise(const void *Ptr, size_t Length,
ur_usm_advice_flags_t Advice,
const std::vector<event> &DepEvents,
bool CallerNeedsEvent);

static ThreadPool &getThreadPool() {
return GlobalHandler::instance().getHostTaskThreadPool();
Expand All @@ -606,15 +609,16 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {

bool queue_empty() const;

event memcpyToDeviceGlobal(void *DeviceGlobalPtr, const void *Src,
bool IsDeviceImageScope, size_t NumBytes,
size_t Offset, const std::vector<event> &DepEvents,
bool CallerNeedsEvent);
event memcpyFromDeviceGlobal(void *Dest, const void *DeviceGlobalPtr,
bool IsDeviceImageScope, size_t NumBytes,
size_t Offset,
const std::vector<event> &DepEvents,
bool CallerNeedsEvent);
EventImplPtr memcpyToDeviceGlobal(void *DeviceGlobalPtr, const void *Src,
bool IsDeviceImageScope, size_t NumBytes,
size_t Offset,
const std::vector<event> &DepEvents,
bool CallerNeedsEvent);
EventImplPtr memcpyFromDeviceGlobal(void *Dest, const void *DeviceGlobalPtr,
bool IsDeviceImageScope, size_t NumBytes,
size_t Offset,
const std::vector<event> &DepEvents,
bool CallerNeedsEvent);

void setCommandGraphUnlocked(
const std::shared_ptr<ext::oneapi::experimental::detail::graph_impl>
Expand Down Expand Up @@ -960,8 +964,9 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
/// \param HandlerFunc is a function that submits the operation with a
/// handler.
template <typename HandlerFuncT>
event submitWithHandler(const std::vector<event> &DepEvents,
bool CallerNeedsEvent, HandlerFuncT HandlerFunc);
EventImplPtr submitWithHandler(const std::vector<event> &DepEvents,
bool CallerNeedsEvent,
HandlerFuncT HandlerFunc);

/// Performs submission of a memory operation directly if scheduler can be
/// bypassed, or with a handler otherwise.
Expand All @@ -979,10 +984,10 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
/// \return an event representing the submitted operation.
template <typename HandlerFuncT, typename MemMngrFuncT,
typename... MemMngrArgTs>
event submitMemOpHelper(const std::vector<event> &DepEvents,
bool CallerNeedsEvent, HandlerFuncT HandlerFunc,
MemMngrFuncT MemMngrFunc,
MemMngrArgTs &&...MemOpArgs);
EventImplPtr
submitMemOpHelper(const std::vector<event> &DepEvents, bool CallerNeedsEvent,
HandlerFuncT HandlerFunc, MemMngrFuncT MemMngrFunc,
MemMngrArgTs &&...MemOpArgs);

#ifdef XPTI_ENABLE_INSTRUMENTATION
// When instrumentation is enabled emits trace event for wait begin and
Expand Down
Loading
Loading