diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index dd8ddcea5ac33..b364e94090360 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -202,13 +202,15 @@ void host_task(sycl::queue Q, T &&hostTaskCallable, submit( std::move(Q), [&](sycl::handler &cgh) { - cgh.host_task(std::forward(hostTaskCallable)); + sycl::detail::HandlerAccess::hostTaskFromEnqueueFunction( + cgh, std::forward(hostTaskCallable)); }, CodeLoc); } template void host_task(handler &CGH, T &&hostTaskCallable) { - CGH.host_task(std::forward(hostTaskCallable)); + sycl::detail::HandlerAccess::hostTaskFromEnqueueFunction( + CGH, std::forward(hostTaskCallable)); } // TODO: Make overloads for scalar arguments for range. diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 9ab1eb179b05d..e7dea52d07029 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -1019,6 +1019,7 @@ class __SYCL_EXPORT handler { void setHandlerKernelBundle(SharedPtrT &&NewKernelBundleImpPtr); void SetHostTask(std::function Func); + void SetHostTaskFromExtEnqueueFunctions(std::function Func); void SetHostTask(std::function Func); template @@ -1036,6 +1037,19 @@ class __SYCL_EXPORT handler { SetHostTask(std::forward(Func)); } + template + std::enable_if_t< + detail::check_fn_signature, void()>::value> + host_task_from_enqueue_function_impl(FuncT &&Func) { + throwIfActionIsCreated(); + + // Need to copy these rather than move so that we can check associated + // accessors during finalize + setArgsToAssociatedAccessors(); + + SetHostTaskFromExtEnqueueFunctions(std::forward(Func)); + } + template std::enable_if_t, void(interop_handle)>::value> @@ -3004,6 +3018,13 @@ class HandlerAccess { Handler.internalProfilingTagImpl(); } + template + static std::enable_if_t< + detail::check_fn_signature, void()>::value> + hostTaskFromEnqueueFunction(handler &Handler, FuncT &&Func) { + Handler.host_task_from_enqueue_function_impl(std::forward(Func)); + } + template static void parallelForImpl(handler &Handler, RangeT Range, PropertiesT Props, kernel Kernel) { diff --git a/sycl/source/detail/host_task.hpp b/sycl/source/detail/host_task.hpp index f7e3feff8d0ef..83c9c503a5d3e 100644 --- a/sycl/source/detail/host_task.hpp +++ b/sycl/source/detail/host_task.hpp @@ -23,16 +23,32 @@ inline namespace _V1 { class interop_handle; namespace detail { class HostTask { + enum class HostTaskOrigin { + SYCLCoreAPI, + ExtEnqueueFunctionsAPI, + }; + std::function MHostTask; std::function MInteropTask; + HostTaskOrigin MOrigin; public: - HostTask() : MHostTask([]() {}) {} - HostTask(std::function &&Func) : MHostTask(Func) {} - HostTask(std::function &&Func) : MInteropTask(Func) {} + HostTask() : MHostTask([]() {}), MOrigin(HostTaskOrigin::SYCLCoreAPI) {} + HostTask(std::function &&Func, + bool IsFromExtEnqueueFunctionsAPI = false) + : MHostTask(std::move(Func)), + MOrigin(IsFromExtEnqueueFunctionsAPI + ? HostTaskOrigin::ExtEnqueueFunctionsAPI + : HostTaskOrigin::SYCLCoreAPI) {} + HostTask(std::function &&Func) + : MInteropTask(std::move(Func)), MOrigin(HostTaskOrigin::SYCLCoreAPI) {} bool isInteropTask() const { return !!MInteropTask; } + bool isCreatedFromEnqueueFunction() const { + return MOrigin == HostTaskOrigin::ExtEnqueueFunctionsAPI; + } + void call(HostProfilingInfo *HPI) { if (!GlobalHandler::instance().isOkToDefer()) { return; diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 322c2a96bc191..25900e4fa798e 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -241,6 +241,21 @@ void InteropFreeFunc(ur_queue_handle_t, void *InteropData) { auto *Data = reinterpret_cast(InteropData); return Data->func(Data->ih); } + +struct EnqueueHostTaskData { + explicit EnqueueHostTaskData(std::function HostTask) + : Func(std::move(HostTask)) {} + + std::function Func; +}; + +void NativeHostTask(void *Data) { + // Callback data is heap-allocated at enqueue time and released here once + // the backend invokes the host task callback. + auto HostTaskData = std::unique_ptr( + static_cast(Data)); + HostTaskData->Func(); +} } // namespace class DispatchHostTask { @@ -331,6 +346,7 @@ class DispatchHostTask { } try { + auto &Queue = HostTask.MQueue; // we're ready to call the user-defined lambda now if (HostTask.MHostTask->isInteropTask()) { assert(HostTask.MQueue && @@ -338,7 +354,6 @@ class DispatchHostTask { interop_handle IH{MReqToMem, HostTask.MQueue}; // TODO: should all the backends that support this entry point use this // for host task? - auto &Queue = HostTask.MQueue; bool NativeCommandSupport = false; Queue->getAdapter().call( detail::getSyclObjImpl(Queue->get_device())->getHandleRef(), @@ -356,14 +371,34 @@ class DispatchHostTask { // This entry point is needed in order to migrate memory across // devices in the same context for CUDA and HIP backends Queue->getAdapter().call( - HostTask.MQueue->getHandleRef(), InteropFreeFunc, &CustomOpData, + Queue->getHandleRef(), InteropFreeFunc, &CustomOpData, MReqUrMem.size(), MReqUrMem.data(), nullptr, 0, nullptr, nullptr); } else { HostTask.MHostTask->call(MThisCmd->MEvent->getHostProfilingInfo(), IH); } - } else - HostTask.MHostTask->call(MThisCmd->MEvent->getHostProfilingInfo()); + } else { + if (HostTask.MHostTask->isCreatedFromEnqueueFunction()) { + bool NativeHostTaskSupport = false; + Queue->getAdapter().call( + detail::getSyclObjImpl(Queue->get_device())->getHandleRef(), + UR_DEVICE_INFO_ENQUEUE_HOST_TASK_SUPPORT_EXP, + sizeof(NativeHostTaskSupport), &NativeHostTaskSupport, nullptr); + if (NativeHostTaskSupport) { + auto NativeHostTaskData = std::make_unique( + std::move(HostTask.MHostTask->MHostTask)); + Queue->getAdapter().call( + Queue->getHandleRef(), NativeHostTask, NativeHostTaskData.get(), + nullptr, 0, nullptr, nullptr); + // Ownership is transferred to NativeHostTask callback on success. + (void)NativeHostTaskData.release(); + } else { + HostTask.MHostTask->call(MThisCmd->MEvent->getHostProfilingInfo()); + } + } else { + HostTask.MHostTask->call(MThisCmd->MEvent->getHostProfilingInfo()); + } + } } catch (...) { auto CurrentException = std::current_exception(); #ifdef XPTI_ENABLE_INSTRUMENTATION diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 069d0226b3e61..4d1c689d6e377 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -771,6 +771,7 @@ detail::EventImplPtr handler::finalize() { // For commands other than kernel and host task submission, if an event has // not been requested, the queue supports events discarding, and the scheduler // could have been bypassed (not supported yet), the event can be skipped. + // TODO: check if it's possible to discard an event for host task. bool DiscardEvent = (type != detail::CGType::Kernel && type != detail::CGType::CodeplayHostTask && KernelSchedulerBypass && @@ -1653,6 +1654,15 @@ void handler::SetHostTask(std::function Func) { setType(detail::CGType::CodeplayHostTask); } +void handler::SetHostTaskFromExtEnqueueFunctions(std::function Func) { + range<1> r(1); + setNDRangeDescriptor(detail::nd_range_view(r)); + impl->MHostTask.reset( + new detail::HostTask(std::move(Func), /*IsFromExtEnqueueFunctionsAPI=*/ + true)); + setType(detail::CGType::CodeplayHostTask); +} + void handler::SetHostTask(std::function Func) { range<1> r(1); setNDRangeDescriptor(detail::nd_range_view(r)); diff --git a/sycl/test-e2e/EnqueueFunctions/kernel_submit.cpp b/sycl/test-e2e/EnqueueFunctions/kernel_submit.cpp index a7a8202a68af9..6fbda0ec8c35d 100644 --- a/sycl/test-e2e/EnqueueFunctions/kernel_submit.cpp +++ b/sycl/test-e2e/EnqueueFunctions/kernel_submit.cpp @@ -239,6 +239,7 @@ int main() { }); } } + Q.wait(); for (size_t I = 0; I < N; ++I) Failed += Check(Memory, 55, I, "host_task"); diff --git a/sycl/test-e2e/EnqueueFunctions/kernel_submit_with_event.cpp b/sycl/test-e2e/EnqueueFunctions/kernel_submit_with_event.cpp index eff7c24312b41..4fad62cf7182e 100644 --- a/sycl/test-e2e/EnqueueFunctions/kernel_submit_with_event.cpp +++ b/sycl/test-e2e/EnqueueFunctions/kernel_submit_with_event.cpp @@ -236,9 +236,10 @@ int main() { for (size_t I = 0; I < N; ++I) MemAcc[I] = 55; }); - }).wait(); + }); } } + Q.wait(); for (size_t I = 0; I < N; ++I) Failed += Check(Memory, 55, I, "host_task"); diff --git a/sycl/test-e2e/EnqueueFunctions/native_host_task.cpp b/sycl/test-e2e/EnqueueFunctions/native_host_task.cpp new file mode 100644 index 0000000000000..b4ba40602ecf4 --- /dev/null +++ b/sycl/test-e2e/EnqueueFunctions/native_host_task.cpp @@ -0,0 +1,22 @@ +// Only L0V2 supports urEnqueueHostTaskExp. +// REQUIRES: level_zero + +// RUN: %{build} -o %t.out +// RUN: %{run} SYCL_UR_USE_LEVEL_ZERO_V2=1 SYCL_UR_TRACE=2 %t.out | FileCheck %s + +// CHECK: UR_DEVICE_INFO_ENQUEUE_HOST_TASK_SUPPORT_EXP +// CHECK: ---> urEnqueueHostTaskExp +// CHECK: <--- urEnqueueHostTaskExp + +#include +#include +#include + +namespace syclex = sycl::ext::oneapi::experimental; + +int main() { + sycl::queue q; + + syclex::host_task(q, [=] {}); + q.wait(); +} diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 0ca131c35a9e3..0e80426ea02c2 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3569,6 +3569,7 @@ _ZN4sycl3_V17handler28setArgsToAssociatedAccessorsEv _ZN4sycl3_V17handler28setStateExplicitKernelBundleEv _ZN4sycl3_V17handler30memcpyFromHostOnlyDeviceGlobalEPvPKvbmm _ZN4sycl3_V17handler30verifyUsedKernelBundleInternalENS0_6detail11string_viewE +_ZN4sycl3_V17handler34SetHostTaskFromExtEnqueueFunctionsESt8functionIFvvEE _ZN4sycl3_V17handler34ext_oneapi_wait_external_semaphoreENS0_3ext6oneapi12experimental18external_semaphoreE _ZN4sycl3_V17handler34ext_oneapi_wait_external_semaphoreENS0_3ext6oneapi12experimental18external_semaphoreEm _ZN4sycl3_V17handler36ext_oneapi_signal_external_semaphoreENS0_3ext6oneapi12experimental18external_semaphoreE diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index 17cc90d9ddf4a..b966cfc4f25ab 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -634,6 +634,7 @@ ?RangeRoundingTrace@detail@_V1@sycl@@YA_NXZ ?SetHostTask@handler@_V1@sycl@@AEAAXV?$function@$$A6AXVinterop_handle@_V1@sycl@@@Z@std@@@Z ?SetHostTask@handler@_V1@sycl@@AEAAXV?$function@$$A6AXXZ@std@@@Z +?SetHostTaskFromExtEnqueueFunctions@handler@_V1@sycl@@AEAAXV?$function@$$A6AXXZ@std@@@Z ?SetKernelLaunchpropertiesIfNotEmpty@handler@_V1@sycl@@AEAAXAEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@Usub_group_progress_key@23456@Uwork_item_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@detail@23@@Z ?__abs_diff_impl@_V1@sycl@@YA?AV?$vec@C$00@12@V312@0@Z ?__abs_diff_impl@_V1@sycl@@YA?AV?$vec@C$01@12@V312@0@Z