From 82a5e886798cd517f2a46d73dc0f8336c2d3119a Mon Sep 17 00:00:00 2001 From: "Kornev, Nikita" Date: Mon, 30 Mar 2026 20:00:35 +0200 Subject: [PATCH 1/5] [SYCL] Improve enqueue function host task Spec: https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_enqueue_functions.asciidoc SYCL headers patch: https://github.com/intel/llvm/pull/21456 This is the second part of the host_task enqueue function implementation. L0 provides an API to launch host tasks - zeCommandListAppendHostFunction. This is API is used by the urEnqueueHostTaskExp UR function. This patch switch enqueue function host_task to use this API if it's possible. --- .../oneapi/experimental/enqueue_functions.hpp | 6 ++-- sycl/include/sycl/handler.hpp | 21 ++++++++++++++ sycl/source/detail/host_task.hpp | 22 +++++++++++++-- sycl/source/detail/scheduler/commands.cpp | 28 ++++++++++++++++--- sycl/source/handler.cpp | 10 +++++++ .../EnqueueFunctions/native_host_task.cpp | 22 +++++++++++++++ 6 files changed, 100 insertions(+), 9 deletions(-) create mode 100644 sycl/test-e2e/EnqueueFunctions/native_host_task.cpp diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index dd8ddcea5ac33..b364e94090360 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -202,13 +202,15 @@ void host_task(sycl::queue Q, T &&hostTaskCallable, submit( std::move(Q), [&](sycl::handler &cgh) { - cgh.host_task(std::forward(hostTaskCallable)); + sycl::detail::HandlerAccess::hostTaskFromEnqueueFunction( + cgh, std::forward(hostTaskCallable)); }, CodeLoc); } template void host_task(handler &CGH, T &&hostTaskCallable) { - CGH.host_task(std::forward(hostTaskCallable)); + sycl::detail::HandlerAccess::hostTaskFromEnqueueFunction( + CGH, std::forward(hostTaskCallable)); } // TODO: Make overloads for scalar arguments for range. diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 9ab1eb179b05d..e7dea52d07029 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -1019,6 +1019,7 @@ class __SYCL_EXPORT handler { void setHandlerKernelBundle(SharedPtrT &&NewKernelBundleImpPtr); void SetHostTask(std::function Func); + void SetHostTaskFromExtEnqueueFunctions(std::function Func); void SetHostTask(std::function Func); template @@ -1036,6 +1037,19 @@ class __SYCL_EXPORT handler { SetHostTask(std::forward(Func)); } + template + std::enable_if_t< + detail::check_fn_signature, void()>::value> + host_task_from_enqueue_function_impl(FuncT &&Func) { + throwIfActionIsCreated(); + + // Need to copy these rather than move so that we can check associated + // accessors during finalize + setArgsToAssociatedAccessors(); + + SetHostTaskFromExtEnqueueFunctions(std::forward(Func)); + } + template std::enable_if_t, void(interop_handle)>::value> @@ -3004,6 +3018,13 @@ class HandlerAccess { Handler.internalProfilingTagImpl(); } + template + static std::enable_if_t< + detail::check_fn_signature, void()>::value> + hostTaskFromEnqueueFunction(handler &Handler, FuncT &&Func) { + Handler.host_task_from_enqueue_function_impl(std::forward(Func)); + } + template static void parallelForImpl(handler &Handler, RangeT Range, PropertiesT Props, kernel Kernel) { diff --git a/sycl/source/detail/host_task.hpp b/sycl/source/detail/host_task.hpp index f7e3feff8d0ef..83c9c503a5d3e 100644 --- a/sycl/source/detail/host_task.hpp +++ b/sycl/source/detail/host_task.hpp @@ -23,16 +23,32 @@ inline namespace _V1 { class interop_handle; namespace detail { class HostTask { + enum class HostTaskOrigin { + SYCLCoreAPI, + ExtEnqueueFunctionsAPI, + }; + std::function MHostTask; std::function MInteropTask; + HostTaskOrigin MOrigin; public: - HostTask() : MHostTask([]() {}) {} - HostTask(std::function &&Func) : MHostTask(Func) {} - HostTask(std::function &&Func) : MInteropTask(Func) {} + HostTask() : MHostTask([]() {}), MOrigin(HostTaskOrigin::SYCLCoreAPI) {} + HostTask(std::function &&Func, + bool IsFromExtEnqueueFunctionsAPI = false) + : MHostTask(std::move(Func)), + MOrigin(IsFromExtEnqueueFunctionsAPI + ? HostTaskOrigin::ExtEnqueueFunctionsAPI + : HostTaskOrigin::SYCLCoreAPI) {} + HostTask(std::function &&Func) + : MInteropTask(std::move(Func)), MOrigin(HostTaskOrigin::SYCLCoreAPI) {} bool isInteropTask() const { return !!MInteropTask; } + bool isCreatedFromEnqueueFunction() const { + return MOrigin == HostTaskOrigin::ExtEnqueueFunctionsAPI; + } + void call(HostProfilingInfo *HPI) { if (!GlobalHandler::instance().isOkToDefer()) { return; diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 322c2a96bc191..9655a34b9d078 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -241,6 +241,11 @@ void InteropFreeFunc(ur_queue_handle_t, void *InteropData) { auto *Data = reinterpret_cast(InteropData); return Data->func(Data->ih); } + +void NativeHostTask(void *funcPtr) { + auto *func = static_cast *>(funcPtr); + (*func)(); +} } // namespace class DispatchHostTask { @@ -331,6 +336,7 @@ class DispatchHostTask { } try { + auto &Queue = HostTask.MQueue; // we're ready to call the user-defined lambda now if (HostTask.MHostTask->isInteropTask()) { assert(HostTask.MQueue && @@ -338,7 +344,6 @@ class DispatchHostTask { interop_handle IH{MReqToMem, HostTask.MQueue}; // TODO: should all the backends that support this entry point use this // for host task? - auto &Queue = HostTask.MQueue; bool NativeCommandSupport = false; Queue->getAdapter().call( detail::getSyclObjImpl(Queue->get_device())->getHandleRef(), @@ -356,14 +361,29 @@ class DispatchHostTask { // This entry point is needed in order to migrate memory across // devices in the same context for CUDA and HIP backends Queue->getAdapter().call( - HostTask.MQueue->getHandleRef(), InteropFreeFunc, &CustomOpData, + Queue->getHandleRef(), InteropFreeFunc, &CustomOpData, MReqUrMem.size(), MReqUrMem.data(), nullptr, 0, nullptr, nullptr); } else { HostTask.MHostTask->call(MThisCmd->MEvent->getHostProfilingInfo(), IH); } - } else - HostTask.MHostTask->call(MThisCmd->MEvent->getHostProfilingInfo()); + } else { + if (HostTask.MHostTask->isCreatedFromEnqueueFunction()) { + bool NativeHostTaskSupport = false; + Queue->getAdapter().call( + detail::getSyclObjImpl(Queue->get_device())->getHandleRef(), + UR_DEVICE_INFO_ENQUEUE_HOST_TASK_SUPPORT_EXP, + sizeof(NativeHostTaskSupport), &NativeHostTaskSupport, nullptr); + if (NativeHostTaskSupport) + Queue->getAdapter().call( + Queue->getHandleRef(), NativeHostTask, + &HostTask.MHostTask->MHostTask, nullptr, 0, nullptr, nullptr); + else + HostTask.MHostTask->call(MThisCmd->MEvent->getHostProfilingInfo()); + } else { + HostTask.MHostTask->call(MThisCmd->MEvent->getHostProfilingInfo()); + } + } } catch (...) { auto CurrentException = std::current_exception(); #ifdef XPTI_ENABLE_INSTRUMENTATION diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 069d0226b3e61..4d1c689d6e377 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -771,6 +771,7 @@ detail::EventImplPtr handler::finalize() { // For commands other than kernel and host task submission, if an event has // not been requested, the queue supports events discarding, and the scheduler // could have been bypassed (not supported yet), the event can be skipped. + // TODO: check if it's possible to discard an event for host task. bool DiscardEvent = (type != detail::CGType::Kernel && type != detail::CGType::CodeplayHostTask && KernelSchedulerBypass && @@ -1653,6 +1654,15 @@ void handler::SetHostTask(std::function Func) { setType(detail::CGType::CodeplayHostTask); } +void handler::SetHostTaskFromExtEnqueueFunctions(std::function Func) { + range<1> r(1); + setNDRangeDescriptor(detail::nd_range_view(r)); + impl->MHostTask.reset( + new detail::HostTask(std::move(Func), /*IsFromExtEnqueueFunctionsAPI=*/ + true)); + setType(detail::CGType::CodeplayHostTask); +} + void handler::SetHostTask(std::function Func) { range<1> r(1); setNDRangeDescriptor(detail::nd_range_view(r)); diff --git a/sycl/test-e2e/EnqueueFunctions/native_host_task.cpp b/sycl/test-e2e/EnqueueFunctions/native_host_task.cpp new file mode 100644 index 0000000000000..b4ba40602ecf4 --- /dev/null +++ b/sycl/test-e2e/EnqueueFunctions/native_host_task.cpp @@ -0,0 +1,22 @@ +// Only L0V2 supports urEnqueueHostTaskExp. +// REQUIRES: level_zero + +// RUN: %{build} -o %t.out +// RUN: %{run} SYCL_UR_USE_LEVEL_ZERO_V2=1 SYCL_UR_TRACE=2 %t.out | FileCheck %s + +// CHECK: UR_DEVICE_INFO_ENQUEUE_HOST_TASK_SUPPORT_EXP +// CHECK: ---> urEnqueueHostTaskExp +// CHECK: <--- urEnqueueHostTaskExp + +#include +#include +#include + +namespace syclex = sycl::ext::oneapi::experimental; + +int main() { + sycl::queue q; + + syclex::host_task(q, [=] {}); + q.wait(); +} From 31a12a5e995c2b5dd24e5cb7ca74059cdb9c6b6f Mon Sep 17 00:00:00 2001 From: "Kornev, Nikita" Date: Thu, 2 Apr 2026 18:43:54 +0200 Subject: [PATCH 2/5] fix-abi-tests --- sycl/test/abi/sycl_symbols_linux.dump | 1 + sycl/test/abi/sycl_symbols_windows.dump | 1 + 2 files changed, 2 insertions(+) diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 0ca131c35a9e3..0e80426ea02c2 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3569,6 +3569,7 @@ _ZN4sycl3_V17handler28setArgsToAssociatedAccessorsEv _ZN4sycl3_V17handler28setStateExplicitKernelBundleEv _ZN4sycl3_V17handler30memcpyFromHostOnlyDeviceGlobalEPvPKvbmm _ZN4sycl3_V17handler30verifyUsedKernelBundleInternalENS0_6detail11string_viewE +_ZN4sycl3_V17handler34SetHostTaskFromExtEnqueueFunctionsESt8functionIFvvEE _ZN4sycl3_V17handler34ext_oneapi_wait_external_semaphoreENS0_3ext6oneapi12experimental18external_semaphoreE _ZN4sycl3_V17handler34ext_oneapi_wait_external_semaphoreENS0_3ext6oneapi12experimental18external_semaphoreEm _ZN4sycl3_V17handler36ext_oneapi_signal_external_semaphoreENS0_3ext6oneapi12experimental18external_semaphoreE diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index 17cc90d9ddf4a..b966cfc4f25ab 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -634,6 +634,7 @@ ?RangeRoundingTrace@detail@_V1@sycl@@YA_NXZ ?SetHostTask@handler@_V1@sycl@@AEAAXV?$function@$$A6AXVinterop_handle@_V1@sycl@@@Z@std@@@Z ?SetHostTask@handler@_V1@sycl@@AEAAXV?$function@$$A6AXXZ@std@@@Z +?SetHostTaskFromExtEnqueueFunctions@handler@_V1@sycl@@AEAAXV?$function@$$A6AXXZ@std@@@Z ?SetKernelLaunchpropertiesIfNotEmpty@handler@_V1@sycl@@AEAAXAEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@Usub_group_progress_key@23456@Uwork_item_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@detail@23@@Z ?__abs_diff_impl@_V1@sycl@@YA?AV?$vec@C$00@12@V312@0@Z ?__abs_diff_impl@_V1@sycl@@YA?AV?$vec@C$01@12@V312@0@Z From f1fb80aac749ae591fef7d09099119c7e8d5f353 Mon Sep 17 00:00:00 2001 From: "Kornev, Nikita" Date: Tue, 7 Apr 2026 18:22:50 +0200 Subject: [PATCH 3/5] Fix pre-commit --- sycl/source/detail/scheduler/commands.cpp | 29 ++++++++++++++----- .../EnqueueFunctions/kernel_submit.cpp | 1 + .../kernel_submit_with_event.cpp | 3 +- 3 files changed, 25 insertions(+), 8 deletions(-) diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 9655a34b9d078..25900e4fa798e 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -242,9 +242,19 @@ void InteropFreeFunc(ur_queue_handle_t, void *InteropData) { return Data->func(Data->ih); } -void NativeHostTask(void *funcPtr) { - auto *func = static_cast *>(funcPtr); - (*func)(); +struct EnqueueHostTaskData { + explicit EnqueueHostTaskData(std::function HostTask) + : Func(std::move(HostTask)) {} + + std::function Func; +}; + +void NativeHostTask(void *Data) { + // Callback data is heap-allocated at enqueue time and released here once + // the backend invokes the host task callback. + auto HostTaskData = std::unique_ptr( + static_cast(Data)); + HostTaskData->Func(); } } // namespace @@ -374,12 +384,17 @@ class DispatchHostTask { detail::getSyclObjImpl(Queue->get_device())->getHandleRef(), UR_DEVICE_INFO_ENQUEUE_HOST_TASK_SUPPORT_EXP, sizeof(NativeHostTaskSupport), &NativeHostTaskSupport, nullptr); - if (NativeHostTaskSupport) + if (NativeHostTaskSupport) { + auto NativeHostTaskData = std::make_unique( + std::move(HostTask.MHostTask->MHostTask)); Queue->getAdapter().call( - Queue->getHandleRef(), NativeHostTask, - &HostTask.MHostTask->MHostTask, nullptr, 0, nullptr, nullptr); - else + Queue->getHandleRef(), NativeHostTask, NativeHostTaskData.get(), + nullptr, 0, nullptr, nullptr); + // Ownership is transferred to NativeHostTask callback on success. + (void)NativeHostTaskData.release(); + } else { HostTask.MHostTask->call(MThisCmd->MEvent->getHostProfilingInfo()); + } } else { HostTask.MHostTask->call(MThisCmd->MEvent->getHostProfilingInfo()); } diff --git a/sycl/test-e2e/EnqueueFunctions/kernel_submit.cpp b/sycl/test-e2e/EnqueueFunctions/kernel_submit.cpp index a7a8202a68af9..6fbda0ec8c35d 100644 --- a/sycl/test-e2e/EnqueueFunctions/kernel_submit.cpp +++ b/sycl/test-e2e/EnqueueFunctions/kernel_submit.cpp @@ -239,6 +239,7 @@ int main() { }); } } + Q.wait(); for (size_t I = 0; I < N; ++I) Failed += Check(Memory, 55, I, "host_task"); diff --git a/sycl/test-e2e/EnqueueFunctions/kernel_submit_with_event.cpp b/sycl/test-e2e/EnqueueFunctions/kernel_submit_with_event.cpp index eff7c24312b41..4fad62cf7182e 100644 --- a/sycl/test-e2e/EnqueueFunctions/kernel_submit_with_event.cpp +++ b/sycl/test-e2e/EnqueueFunctions/kernel_submit_with_event.cpp @@ -236,9 +236,10 @@ int main() { for (size_t I = 0; I < N; ++I) MemAcc[I] = 55; }); - }).wait(); + }); } } + Q.wait(); for (size_t I = 0; I < N; ++I) Failed += Check(Memory, 55, I, "host_task"); From 248c85138d4c047f1c18e8f7d7447d187cbc5ac1 Mon Sep 17 00:00:00 2001 From: "Kornev, Nikita" Date: Wed, 8 Apr 2026 14:48:17 +0200 Subject: [PATCH 4/5] fixes & review suggestions --- sycl/source/detail/scheduler/commands.cpp | 20 ++++++++++++++++++- .../kernel_submit_with_event.cpp | 3 +-- .../EnqueueFunctions/native_host_task.cpp | 8 ++++++-- 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 25900e4fa798e..b8223008dd855 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -387,11 +387,29 @@ class DispatchHostTask { if (NativeHostTaskSupport) { auto NativeHostTaskData = std::make_unique( std::move(HostTask.MHostTask->MHostTask)); + ur_event_handle_t HostTaskEvent{}; Queue->getAdapter().call( Queue->getHandleRef(), NativeHostTask, NativeHostTaskData.get(), - nullptr, 0, nullptr, nullptr); + nullptr, 0, nullptr, &HostTaskEvent); // Ownership is transferred to NativeHostTask callback on success. (void)NativeHostTaskData.release(); + + // Wait for the host task to complete asynchronously. Since + // urEnqueueHostTaskExp executes the callback asynchronously when + // UR host task support is available, we must wait for the returned + // event before notifying completion. This ensures proper dependency + // ordering and allows profiling/async-exception handlers to see the + // actual task completion rather than the enqueue time. + if (HostTaskEvent) { + try { + Queue->getAdapter().call(1, &HostTaskEvent); + } catch (...) { + auto CurrentException = std::current_exception(); + Queue->getAdapter().call(HostTaskEvent); + throw; + } + Queue->getAdapter().call(HostTaskEvent); + } } else { HostTask.MHostTask->call(MThisCmd->MEvent->getHostProfilingInfo()); } diff --git a/sycl/test-e2e/EnqueueFunctions/kernel_submit_with_event.cpp b/sycl/test-e2e/EnqueueFunctions/kernel_submit_with_event.cpp index 4fad62cf7182e..eff7c24312b41 100644 --- a/sycl/test-e2e/EnqueueFunctions/kernel_submit_with_event.cpp +++ b/sycl/test-e2e/EnqueueFunctions/kernel_submit_with_event.cpp @@ -236,10 +236,9 @@ int main() { for (size_t I = 0; I < N; ++I) MemAcc[I] = 55; }); - }); + }).wait(); } } - Q.wait(); for (size_t I = 0; I < N; ++I) Failed += Check(Memory, 55, I, "host_task"); diff --git a/sycl/test-e2e/EnqueueFunctions/native_host_task.cpp b/sycl/test-e2e/EnqueueFunctions/native_host_task.cpp index b4ba40602ecf4..bda195ebabac3 100644 --- a/sycl/test-e2e/EnqueueFunctions/native_host_task.cpp +++ b/sycl/test-e2e/EnqueueFunctions/native_host_task.cpp @@ -1,8 +1,12 @@ // Only L0V2 supports urEnqueueHostTaskExp. -// REQUIRES: level_zero +// REQUIRES: level_zero_v2_adapter + +// UNSUPPORTED: windows && gpu-intel-gen12 +// UNSUPPORTED-INTENDED: UR_DEVICE_INFO_ENQUEUE_HOST_TASK_SUPPORT_EXP is not +// supported on win&gen12. // RUN: %{build} -o %t.out -// RUN: %{run} SYCL_UR_USE_LEVEL_ZERO_V2=1 SYCL_UR_TRACE=2 %t.out | FileCheck %s +// RUN: env SYCL_UR_TRACE=2 %{run} %t.out | FileCheck %s // CHECK: UR_DEVICE_INFO_ENQUEUE_HOST_TASK_SUPPORT_EXP // CHECK: ---> urEnqueueHostTaskExp From 928130e7fe91bec73ac91388174c449ea9b62b67 Mon Sep 17 00:00:00 2001 From: "Kornev, Nikita" Date: Wed, 8 Apr 2026 16:01:06 +0200 Subject: [PATCH 5/5] format --- sycl/source/detail/scheduler/commands.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index b8223008dd855..d0ca0667b2139 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -402,13 +402,16 @@ class DispatchHostTask { // actual task completion rather than the enqueue time. if (HostTaskEvent) { try { - Queue->getAdapter().call(1, &HostTaskEvent); + Queue->getAdapter().call( + 1, &HostTaskEvent); } catch (...) { auto CurrentException = std::current_exception(); - Queue->getAdapter().call(HostTaskEvent); + Queue->getAdapter().call( + HostTaskEvent); throw; } - Queue->getAdapter().call(HostTaskEvent); + Queue->getAdapter().call( + HostTaskEvent); } } else { HostTask.MHostTask->call(MThisCmd->MEvent->getHostProfilingInfo());