Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -202,13 +202,15 @@ void host_task(sycl::queue Q, T &&hostTaskCallable,
submit(
std::move(Q),
[&](sycl::handler &cgh) {
cgh.host_task(std::forward<T>(hostTaskCallable));
sycl::detail::HandlerAccess::hostTaskFromEnqueueFunction(
cgh, std::forward<T>(hostTaskCallable));
},
CodeLoc);
}

template <typename T> void host_task(handler &CGH, T &&hostTaskCallable) {
CGH.host_task(std::forward<T>(hostTaskCallable));
sycl::detail::HandlerAccess::hostTaskFromEnqueueFunction(
CGH, std::forward<T>(hostTaskCallable));
}

// TODO: Make overloads for scalar arguments for range.
Expand Down
21 changes: 21 additions & 0 deletions sycl/include/sycl/handler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1019,6 +1019,7 @@ class __SYCL_EXPORT handler {
void setHandlerKernelBundle(SharedPtrT &&NewKernelBundleImpPtr);

void SetHostTask(std::function<void()> Func);
void SetHostTaskFromExtEnqueueFunctions(std::function<void()> Func);
void SetHostTask(std::function<void(interop_handle)> Func);

template <typename FuncT>
Expand All @@ -1036,6 +1037,19 @@ class __SYCL_EXPORT handler {
SetHostTask(std::forward<FuncT>(Func));
}

template <typename FuncT>
std::enable_if_t<
detail::check_fn_signature<std::remove_reference_t<FuncT>, void()>::value>
host_task_from_enqueue_function_impl(FuncT &&Func) {
throwIfActionIsCreated();

// Need to copy these rather than move so that we can check associated
// accessors during finalize
setArgsToAssociatedAccessors();

SetHostTaskFromExtEnqueueFunctions(std::forward<FuncT>(Func));
}

template <typename FuncT>
std::enable_if_t<detail::check_fn_signature<std::remove_reference_t<FuncT>,
void(interop_handle)>::value>
Expand Down Expand Up @@ -3004,6 +3018,13 @@ class HandlerAccess {
Handler.internalProfilingTagImpl();
}

template <typename FuncT>
static std::enable_if_t<
detail::check_fn_signature<std::remove_reference_t<FuncT>, void()>::value>
hostTaskFromEnqueueFunction(handler &Handler, FuncT &&Func) {
Handler.host_task_from_enqueue_function_impl(std::forward<FuncT>(Func));
}

template <typename RangeT, typename PropertiesT>
static void parallelForImpl(handler &Handler, RangeT Range, PropertiesT Props,
kernel Kernel) {
Expand Down
22 changes: 19 additions & 3 deletions sycl/source/detail/host_task.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,32 @@ inline namespace _V1 {
class interop_handle;
namespace detail {
class HostTask {
enum class HostTaskOrigin {
SYCLCoreAPI,
ExtEnqueueFunctionsAPI,
};

std::function<void()> MHostTask;
std::function<void(interop_handle)> MInteropTask;
HostTaskOrigin MOrigin;

public:
HostTask() : MHostTask([]() {}) {}
HostTask(std::function<void()> &&Func) : MHostTask(Func) {}
HostTask(std::function<void(interop_handle)> &&Func) : MInteropTask(Func) {}
HostTask() : MHostTask([]() {}), MOrigin(HostTaskOrigin::SYCLCoreAPI) {}
HostTask(std::function<void()> &&Func,
bool IsFromExtEnqueueFunctionsAPI = false)
: MHostTask(std::move(Func)),
MOrigin(IsFromExtEnqueueFunctionsAPI
? HostTaskOrigin::ExtEnqueueFunctionsAPI
: HostTaskOrigin::SYCLCoreAPI) {}
HostTask(std::function<void(interop_handle)> &&Func)
: MInteropTask(std::move(Func)), MOrigin(HostTaskOrigin::SYCLCoreAPI) {}

bool isInteropTask() const { return !!MInteropTask; }

bool isCreatedFromEnqueueFunction() const {
return MOrigin == HostTaskOrigin::ExtEnqueueFunctionsAPI;
}

void call(HostProfilingInfo *HPI) {
if (!GlobalHandler::instance().isOkToDefer()) {
return;
Expand Down
43 changes: 39 additions & 4 deletions sycl/source/detail/scheduler/commands.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,21 @@ void InteropFreeFunc(ur_queue_handle_t, void *InteropData) {
auto *Data = reinterpret_cast<EnqueueNativeCommandData *>(InteropData);
return Data->func(Data->ih);
}

struct EnqueueHostTaskData {
explicit EnqueueHostTaskData(std::function<void()> HostTask)
: Func(std::move(HostTask)) {}

std::function<void()> Func;
};

void NativeHostTask(void *Data) {
// Callback data is heap-allocated at enqueue time and released here once
// the backend invokes the host task callback.
auto HostTaskData = std::unique_ptr<EnqueueHostTaskData>(
static_cast<EnqueueHostTaskData *>(Data));
HostTaskData->Func();
}
} // namespace

class DispatchHostTask {
Expand Down Expand Up @@ -331,14 +346,14 @@ class DispatchHostTask {
}

try {
auto &Queue = HostTask.MQueue;
// we're ready to call the user-defined lambda now
if (HostTask.MHostTask->isInteropTask()) {
assert(HostTask.MQueue &&
"Host task submissions should have an associated queue");
interop_handle IH{MReqToMem, HostTask.MQueue};
// TODO: should all the backends that support this entry point use this
// for host task?
auto &Queue = HostTask.MQueue;
bool NativeCommandSupport = false;
Queue->getAdapter().call<UrApiKind::urDeviceGetInfo>(
detail::getSyclObjImpl(Queue->get_device())->getHandleRef(),
Expand All @@ -356,14 +371,34 @@ class DispatchHostTask {
// This entry point is needed in order to migrate memory across
// devices in the same context for CUDA and HIP backends
Queue->getAdapter().call<UrApiKind::urEnqueueNativeCommandExp>(
HostTask.MQueue->getHandleRef(), InteropFreeFunc, &CustomOpData,
Queue->getHandleRef(), InteropFreeFunc, &CustomOpData,
MReqUrMem.size(), MReqUrMem.data(), nullptr, 0, nullptr, nullptr);
} else {
HostTask.MHostTask->call(MThisCmd->MEvent->getHostProfilingInfo(),
IH);
}
} else
HostTask.MHostTask->call(MThisCmd->MEvent->getHostProfilingInfo());
} else {
if (HostTask.MHostTask->isCreatedFromEnqueueFunction()) {
bool NativeHostTaskSupport = false;
Queue->getAdapter().call<UrApiKind::urDeviceGetInfo>(
detail::getSyclObjImpl(Queue->get_device())->getHandleRef(),
UR_DEVICE_INFO_ENQUEUE_HOST_TASK_SUPPORT_EXP,
sizeof(NativeHostTaskSupport), &NativeHostTaskSupport, nullptr);
if (NativeHostTaskSupport) {
auto NativeHostTaskData = std::make_unique<EnqueueHostTaskData>(
std::move(HostTask.MHostTask->MHostTask));
Queue->getAdapter().call<UrApiKind::urEnqueueHostTaskExp>(
Queue->getHandleRef(), NativeHostTask, NativeHostTaskData.get(),
nullptr, 0, nullptr, nullptr);
// Ownership is transferred to NativeHostTask callback on success.
(void)NativeHostTaskData.release();
} else {
HostTask.MHostTask->call(MThisCmd->MEvent->getHostProfilingInfo());
}
} else {
HostTask.MHostTask->call(MThisCmd->MEvent->getHostProfilingInfo());
}
Comment on lines +390 to +400
Copy link

Copilot AI Apr 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This path enqueues urEnqueueHostTaskExp without requesting an output event (phEvent == nullptr) and then immediately proceeds to NotifyHostTaskCompletion later in DispatchHostTask::operator(). If the UR host task executes asynchronously (as implied by UR’s API + conformance tests), the scheduler may treat the host task as complete too early, breaking dependency ordering across queues and making host-task profiling/async-exception behavior diverge. Request a UR event here and delay completion notification until that event signals (either urEventWait in this thread or an event callback-based completion path).

Copilot uses AI. Check for mistakes.
}
} catch (...) {
auto CurrentException = std::current_exception();
#ifdef XPTI_ENABLE_INSTRUMENTATION
Expand Down
10 changes: 10 additions & 0 deletions sycl/source/handler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -771,6 +771,7 @@ detail::EventImplPtr handler::finalize() {
// For commands other than kernel and host task submission, if an event has
// not been requested, the queue supports events discarding, and the scheduler
// could have been bypassed (not supported yet), the event can be skipped.
// TODO: check if it's possible to discard an event for host task.
bool DiscardEvent =
(type != detail::CGType::Kernel &&
type != detail::CGType::CodeplayHostTask && KernelSchedulerBypass &&
Expand Down Expand Up @@ -1653,6 +1654,15 @@ void handler::SetHostTask(std::function<void()> Func) {
setType(detail::CGType::CodeplayHostTask);
}

void handler::SetHostTaskFromExtEnqueueFunctions(std::function<void()> Func) {
range<1> r(1);
setNDRangeDescriptor(detail::nd_range_view(r));
impl->MHostTask.reset(
new detail::HostTask(std::move(Func), /*IsFromExtEnqueueFunctionsAPI=*/
true));
setType(detail::CGType::CodeplayHostTask);
}

void handler::SetHostTask(std::function<void(interop_handle)> Func) {
range<1> r(1);
setNDRangeDescriptor(detail::nd_range_view(r));
Expand Down
1 change: 1 addition & 0 deletions sycl/test-e2e/EnqueueFunctions/kernel_submit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,7 @@ int main() {
});
}
}
Q.wait();
for (size_t I = 0; I < N; ++I)
Failed += Check(Memory, 55, I, "host_task");

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -236,9 +236,10 @@ int main() {
for (size_t I = 0; I < N; ++I)
MemAcc[I] = 55;
});
}).wait();
});
}
}
Q.wait();
for (size_t I = 0; I < N; ++I)
Failed += Check(Memory, 55, I, "host_task");

Expand Down
22 changes: 22 additions & 0 deletions sycl/test-e2e/EnqueueFunctions/native_host_task.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
// Only L0V2 supports urEnqueueHostTaskExp.
// REQUIRES: level_zero

// RUN: %{build} -o %t.out
// RUN: %{run} SYCL_UR_USE_LEVEL_ZERO_V2=1 SYCL_UR_TRACE=2 %t.out | FileCheck %s
Copy link

Copilot AI Apr 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The RUN line is passing SYCL_UR_USE_LEVEL_ZERO_V2=1 / SYCL_UR_TRACE=2 as command-line arguments to %{run}, not as environment variables. Other e2e tests set UR tracing via env ... %{run} ... and also redirect stderr to stdout so FileCheck can see trace output. Update this RUN line to use env SYCL_UR_USE_LEVEL_ZERO_V2=1 SYCL_UR_TRACE=2 %{run} %t.out 2>&1 | FileCheck %s (or the repo’s established equivalent).

Suggested change
// RUN: %{run} SYCL_UR_USE_LEVEL_ZERO_V2=1 SYCL_UR_TRACE=2 %t.out | FileCheck %s
// RUN: env SYCL_UR_USE_LEVEL_ZERO_V2=1 SYCL_UR_TRACE=2 %{run} %t.out 2>&1 | FileCheck %s

Copilot uses AI. Check for mistakes.

// CHECK: UR_DEVICE_INFO_ENQUEUE_HOST_TASK_SUPPORT_EXP
// CHECK: ---> urEnqueueHostTaskExp
// CHECK: <--- urEnqueueHostTaskExp

#include <sycl/detail/core.hpp>
#include <sycl/ext/oneapi/experimental/enqueue_functions.hpp>
#include <sycl/properties/all_properties.hpp>

namespace syclex = sycl::ext::oneapi::experimental;

int main() {
sycl::queue q;

syclex::host_task(q, [=] {});
q.wait();
}
1 change: 1 addition & 0 deletions sycl/test/abi/sycl_symbols_linux.dump
Original file line number Diff line number Diff line change
Expand Up @@ -3569,6 +3569,7 @@ _ZN4sycl3_V17handler28setArgsToAssociatedAccessorsEv
_ZN4sycl3_V17handler28setStateExplicitKernelBundleEv
_ZN4sycl3_V17handler30memcpyFromHostOnlyDeviceGlobalEPvPKvbmm
_ZN4sycl3_V17handler30verifyUsedKernelBundleInternalENS0_6detail11string_viewE
_ZN4sycl3_V17handler34SetHostTaskFromExtEnqueueFunctionsESt8functionIFvvEE
_ZN4sycl3_V17handler34ext_oneapi_wait_external_semaphoreENS0_3ext6oneapi12experimental18external_semaphoreE
_ZN4sycl3_V17handler34ext_oneapi_wait_external_semaphoreENS0_3ext6oneapi12experimental18external_semaphoreEm
_ZN4sycl3_V17handler36ext_oneapi_signal_external_semaphoreENS0_3ext6oneapi12experimental18external_semaphoreE
Expand Down
1 change: 1 addition & 0 deletions sycl/test/abi/sycl_symbols_windows.dump
Original file line number Diff line number Diff line change
Expand Up @@ -634,6 +634,7 @@
?RangeRoundingTrace@detail@_V1@sycl@@YA_NXZ
?SetHostTask@handler@_V1@sycl@@AEAAXV?$function@$$A6AXVinterop_handle@_V1@sycl@@@Z@std@@@Z
?SetHostTask@handler@_V1@sycl@@AEAAXV?$function@$$A6AXXZ@std@@@Z
?SetHostTaskFromExtEnqueueFunctions@handler@_V1@sycl@@AEAAXV?$function@$$A6AXXZ@std@@@Z
?SetKernelLaunchpropertiesIfNotEmpty@handler@_V1@sycl@@AEAAXAEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@Usub_group_progress_key@23456@Uwork_item_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@detail@23@@Z
?__abs_diff_impl@_V1@sycl@@YA?AV?$vec@C$00@12@V312@0@Z
?__abs_diff_impl@_V1@sycl@@YA?AV?$vec@C$01@12@V312@0@Z
Expand Down
Loading