diff --git a/sycl/include/sycl/detail/property_helper.hpp b/sycl/include/sycl/detail/property_helper.hpp index dc10a5f079d53..12837c482fbbd 100644 --- a/sycl/include/sycl/detail/property_helper.hpp +++ b/sycl/include/sycl/detail/property_helper.hpp @@ -44,8 +44,9 @@ enum DataLessPropKind { GraphDependOnAllLeaves = 19, GraphUpdatable = 20, GraphEnableProfiling = 21, + GraphEnableNativeRecording = 22, // Indicates the last known dataless property. - LastKnownDataLessPropKind = 21, + LastKnownDataLessPropKind = 22, // Exceeding 32 may cause ABI breaking change on some of OSes. DataLessPropKindSize = 32 }; diff --git a/sycl/include/sycl/ext/oneapi/experimental/detail/properties/graph_properties.def b/sycl/include/sycl/ext/oneapi/experimental/detail/properties/graph_properties.def index ce177a9bb0a16..451ddbb834be4 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/detail/properties/graph_properties.def +++ b/sycl/include/sycl/ext/oneapi/experimental/detail/properties/graph_properties.def @@ -22,5 +22,9 @@ __SYCL_DATA_LESS_PROP(property::graph, updatable, GraphUpdatable) /// events returned by submissions of the executable graph __SYCL_DATA_LESS_PROP(property::graph, enable_profiling, GraphEnableProfiling) +/// Property passed to command_graph constructor to enable native recording +/// capability for improved performance when supported by the backend. +__SYCL_DATA_LESS_PROP(property::graph, enable_native_recording, GraphEnableNativeRecording) + #undef __SYCL_DATA_LESS_PROP #undef __SYCL_MANUALLY_DEFINED_PROP diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph/modifiable_graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph/modifiable_graph.hpp index 089eeb3c7cf59..f01967af0e6ad 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph/modifiable_graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph/modifiable_graph.hpp @@ -159,7 +159,7 @@ class __SYCL_EXPORT modifiable_command_graph /// Get a list of all root nodes (nodes without dependencies) in this graph. std::vector get_root_nodes() const; - /// Query whether the graph contains no nodes. + /// Returns true if the graph contains no nodes. bool empty() const; /// Common Reference Semantics diff --git a/sycl/source/detail/async_alloc.cpp b/sycl/source/detail/async_alloc.cpp index c67666d74f44d..4316a5d6b658c 100644 --- a/sycl/source/detail/async_alloc.cpp +++ b/sycl/source/detail/async_alloc.cpp @@ -67,6 +67,14 @@ void *async_malloc(sycl::handler &h, sycl::usm::alloc kind, size_t size) { sycl::make_error_code(sycl::errc::feature_not_supported), "Only device backed asynchronous allocations are supported!"); + // Allocations not supported in graph native recording mode + if (auto *Queue = h.impl->get_queue_or_null(); + Queue && Queue->isNativeRecording()) { + throw sycl::exception( + sycl::make_error_code(sycl::errc::invalid), + "async_malloc is not supported in native recording mode."); + } + detail::adapter_impl &Adapter = h.getContextImpl().getAdapter(); // Get CG event dependencies for this allocation. @@ -117,6 +125,14 @@ __SYCL_EXPORT void *async_malloc(const sycl::queue &q, sycl::usm::alloc kind, __SYCL_EXPORT void *async_malloc_from_pool(sycl::handler &h, size_t size, const memory_pool &pool) { + // Allocations not supported in graph native recording mode + if (auto *Queue = h.impl->get_queue_or_null(); + Queue && Queue->isNativeRecording()) { + throw sycl::exception( + sycl::make_error_code(sycl::errc::invalid), + "async_malloc is not supported in native recording mode."); + } + detail::adapter_impl &Adapter = h.getContextImpl().getAdapter(); detail::memory_pool_impl &memPoolImpl = *detail::getSyclObjImpl(pool); @@ -181,6 +197,14 @@ __SYCL_EXPORT void async_free(sycl::handler &h, void *ptr) { } } + // Not supported in graph native recording mode + if (auto *Queue = h.impl->get_queue_or_null(); + Queue && Queue->isNativeRecording()) { + throw sycl::exception( + sycl::make_error_code(sycl::errc::invalid), + "async_free is not supported in native recording mode."); + } + h.impl->MFreePtr = ptr; h.setType(detail::CGType::AsyncFree); } diff --git a/sycl/source/detail/graph/graph_impl.cpp b/sycl/source/detail/graph/graph_impl.cpp index 5ff579d1fca24..9e6694050ba68 100644 --- a/sycl/source/detail/graph/graph_impl.cpp +++ b/sycl/source/detail/graph/graph_impl.cpp @@ -19,6 +19,7 @@ #include // ProgramManager #include // for queue_impl #include // for SYCLMemObjT +#include // for UR APIs #include // for stack #include // for tls_code_loc_t etc.. #include // for kernel_param_kind_t @@ -313,6 +314,40 @@ graph_impl::graph_impl(const sycl::context &SyclContext, if (PropList.has_property()) { MAllowBuffers = true; } + if (PropList.has_property()) { + // Create native UR graph when native recording is enabled + // Note: Native recording only works with immediate command lists, + // this is validated when recording begins + context_impl &ContextImpl = *sycl::detail::getSyclObjImpl(MContext); + sycl::detail::adapter_impl &Adapter = ContextImpl.getAdapter(); + + // Check if the device supports graph record and replay + sycl::detail::device_impl &DeviceImpl = + *sycl::detail::getSyclObjImpl(MDevice); + + ur_bool_t SupportsGraphRecordReplay = false; + ur_result_t Result = + Adapter.call_nocheck( + DeviceImpl.getHandleRef(), + UR_DEVICE_INFO_GRAPH_RECORD_AND_REPLAY_SUPPORT_EXP, + sizeof(ur_bool_t), &SupportsGraphRecordReplay, nullptr); + if (Result != UR_RESULT_SUCCESS || !SupportsGraphRecordReplay) { + throw sycl::exception( + sycl::make_error_code(errc::invalid), + "Device does not support graph record and replay feature " + "(UR_DEVICE_INFO_GRAPH_RECORD_AND_REPLAY_SUPPORT_EXP)."); + } + + Result = Adapter.call_nocheck( + ContextImpl.getHandleRef(), &MNativeGraphHandle); + if (Result != UR_RESULT_SUCCESS) { + throw sycl::exception(sycl::make_error_code(errc::runtime), + "Failed to create native UR graph"); + } + assert(MNativeGraphHandle != nullptr && + "Native UR graph handle should not be null if graph creation " + "succeeded"); + } if (!SyclDevice.has(aspect::ext_oneapi_limited_graph) && !SyclDevice.has(aspect::ext_oneapi_graph)) { @@ -331,6 +366,20 @@ graph_impl::~graph_impl() { for (auto &MemObj : MMemObjs) { MemObj->markNoLongerBeingUsedInGraph(); } + // Clean up native UR graph if it was created + if (MNativeGraphHandle) { + context_impl &ContextImpl = *sycl::detail::getSyclObjImpl(MContext); + sycl::detail::adapter_impl &Adapter = ContextImpl.getAdapter(); + + ur_result_t Result = + Adapter.call_nocheck( + MNativeGraphHandle); + if (Result != UR_RESULT_SUCCESS) { + throw sycl::exception(sycl::make_error_code(errc::runtime), + "Failed to destroy native UR graph"); + } + MNativeGraphHandle = nullptr; + } } catch (std::exception &e) { __SYCL_REPORT_EXCEPTION_TO_STREAM("exception in ~graph_impl", e); } @@ -403,6 +452,14 @@ void graph_impl::markCGMemObjs( } node_impl &graph_impl::add(nodes_range Deps) { + // Native recording limitation: explicit API not supported + if (MNativeGraphHandle) { + throw sycl::exception( + make_error_code(errc::feature_not_supported), + "graph.add(): The explicit graph API is not supported in native " + "recording mode. Use the record-and-replay API instead."); + } + node_impl &NodeImpl = createNode(); addDepsToNode(NodeImpl, Deps); @@ -415,6 +472,14 @@ node_impl &graph_impl::add(nodes_range Deps) { node_impl &graph_impl::add(std::function CGF, const std::vector &Args, nodes_range Deps) { + // Native recording limitation: explicit API not supported + if (MNativeGraphHandle) { + throw sycl::exception( + make_error_code(errc::feature_not_supported), + "graph.add(): The explicit graph API is not supported in native " + "recording mode. Use the record-and-replay API instead."); + } + (void)Args; detail::handler_impl HandlerImpl{*this}; sycl::handler Handler{HandlerImpl}; @@ -485,6 +550,13 @@ node_impl &graph_impl::add(std::function CGF, node_impl &graph_impl::add(node_type NodeType, std::shared_ptr CommandGroup, nodes_range Deps) { + // Native recording limitation: explicit API not supported + if (MNativeGraphHandle) { + throw sycl::exception( + make_error_code(errc::feature_not_supported), + "graph.add(): The explicit graph API is not supported in native " + "recording mode. Use the record-and-replay API instead."); + } // A unique set of dependencies obtained by checking requirements and events std::set UniqueDeps = getCGEdges(CommandGroup); @@ -512,6 +584,14 @@ node_impl &graph_impl::add(node_type NodeType, node_impl & graph_impl::add(std::shared_ptr &DynCGImpl, nodes_range Deps) { + // Native recording limitation: explicit API not supported + if (MNativeGraphHandle) { + throw sycl::exception( + make_error_code(errc::feature_not_supported), + "graph.add(): The explicit graph API is not supported in native " + "recording mode. Use the record-and-replay API instead."); + } + // Set of Dependent nodes based on CG event and accessor dependencies. std::set DynCGDeps = getCGEdges(DynCGImpl->MCommandGroups[0]); for (unsigned i = 1; i < DynCGImpl->getNumCGs(); i++) { @@ -559,6 +639,11 @@ void graph_impl::removeQueue(sycl::detail::queue_impl &RecordingQueue) { MRecordingQueues.erase(RecordingQueue.weak_from_this()); } +bool graph_impl::isQueueRecording(sycl::detail::queue_impl &Queue) { + + return MRecordingQueues.count(Queue.weak_from_this()) > 0; +} + void graph_impl::clearQueues(bool NeedsLock) { graph_impl::RecQueuesStorage SwappedQueues; { @@ -571,12 +656,44 @@ void graph_impl::clearQueues(bool NeedsLock) { for (auto &Queue : SwappedQueues) { if (auto ValidQueue = Queue.lock(); ValidQueue) { - ValidQueue->setCommandGraph(nullptr); + if (MNativeGraphHandle) { + // End native UR graph capture + auto UrQueue = ValidQueue->getHandleRef(); + ur_exp_graph_handle_t CapturedGraph = nullptr; + context_impl &ContextImpl = *sycl::detail::getSyclObjImpl(MContext); + sycl::detail::adapter_impl &Adapter = ContextImpl.getAdapter(); + ur_result_t Result = Adapter.call_nocheck< + sycl::detail::UrApiKind::urQueueEndGraphCaptureExp>(UrQueue, + &CapturedGraph); + if (Result != UR_RESULT_SUCCESS) { + throw sycl::exception(sycl::make_error_code(errc::runtime), + "Failed to end native graph capture"); + } + // CapturedGraph should be the same as MNativeGraphHandle + } else { + // Only call setCommandGraph for traditional recording + ValidQueue->setCommandGraph(nullptr); + } } } } -bool graph_impl::empty() const { return MNodeStorage.empty(); } +bool graph_impl::empty() const { + + if (!MNativeGraphHandle) { + return MNodeStorage.empty(); + } + + bool IsEmptyResult = true; + if (getSyclObjImpl(MContext) + ->getAdapter() + .call_nocheck( + MNativeGraphHandle, &IsEmptyResult) != UR_RESULT_SUCCESS) { + throw sycl::exception(sycl::make_error_code(errc::runtime), + "Failed to check if graph is empty"); + } + return IsEmptyResult; +} bool graph_impl::checkForCycles() { std::list SortedNodes; @@ -691,20 +808,61 @@ std::vector graph_impl::getExitNodesEvents( return Events; } -void graph_impl::beginRecordingUnlockedQueue(sycl::detail::queue_impl &Queue) { +void graph_impl::beginRecordingImpl(sycl::detail::queue_impl &Queue, + bool AcquireQueueLock) { graph_impl::WriteLock Lock(MMutex); + + // Native recording limitation: single queue at a time + if (MNativeGraphHandle && !MRecordingQueues.empty()) { + throw sycl::exception(make_error_code(errc::feature_not_supported), + "Recording the same graph to multiple queues is not " + "supported in native mode"); + } + + // Native recording limitation: in-order queues only + if (MNativeGraphHandle && !Queue.isInOrder()) { + throw sycl::exception(make_error_code(errc::feature_not_supported), + "Native recording only works with in-order queues"); + } + if (!Queue.hasCommandGraph()) { - Queue.setCommandGraphUnlocked(shared_from_this()); + + // Use native UR graph recording if enabled + if (MNativeGraphHandle) { + auto UrQueue = Queue.getHandleRef(); + context_impl &ContextImpl = *sycl::detail::getSyclObjImpl(MContext); + sycl::detail::adapter_impl &Adapter = ContextImpl.getAdapter(); + + if (Queue.isNativeRecording()) { + throw sycl::exception(sycl::make_error_code(errc::invalid), + "Queue is already in native graph capture mode"); + } + + ur_result_t Result = Adapter.call_nocheck< + sycl::detail::UrApiKind::urQueueBeginCaptureIntoGraphExp>( + UrQueue, MNativeGraphHandle); + if (Result != UR_RESULT_SUCCESS) { + throw sycl::exception(sycl::make_error_code(errc::runtime), + "Failed to begin native UR graph capture"); + } + } else { + // Non-native recording path + if (AcquireQueueLock) { + Queue.setCommandGraph(shared_from_this()); + } else { + Queue.setCommandGraphUnlocked(shared_from_this()); + } + } addQueue(Queue); } } +void graph_impl::beginRecordingUnlockedQueue(sycl::detail::queue_impl &Queue) { + beginRecordingImpl(Queue, /*AcquireQueueLock=*/false); +} + void graph_impl::beginRecording(sycl::detail::queue_impl &Queue) { - graph_impl::WriteLock Lock(MMutex); - if (!Queue.hasCommandGraph()) { - Queue.setCommandGraph(shared_from_this()); - addQueue(Queue); - } + beginRecordingImpl(Queue, /*AcquireQueueLock=*/true); } // Check if nodes do not require enqueueing and if so loop back through @@ -930,16 +1088,35 @@ exec_graph_impl::exec_graph_impl(sycl::context Context, "Device does not support Command Graph update"); } } - // Copy nodes from GraphImpl and merge any subgraph nodes into this graph. - duplicateNodes(); - if (auto PlaceholderQueuePtr = GraphImpl->getLastRecordedQueue()) { - MQueueImpl = std::move(PlaceholderQueuePtr); + // Create native UR executable graph if the modifiable graph uses native + // recording + if (isNativeRecordingEnabledForGraph(*GraphImpl)) { + context_impl &ContextImpl = *sycl::detail::getSyclObjImpl(MContext); + sycl::detail::adapter_impl &Adapter = ContextImpl.getAdapter(); + ur_result_t Result = + Adapter + .call_nocheck( + GraphImpl->getNativeGraphHandle(), + &MNativeExecutableGraphHandle); + if (Result != UR_RESULT_SUCCESS) { + throw sycl::exception(sycl::make_error_code(errc::runtime), + "Failed to instantiate native UR executable graph"); + } } else { - MQueueImpl = sycl::detail::queue_impl::create( - *sycl::detail::getSyclObjImpl(GraphImpl->getDevice()), - *sycl::detail::getSyclObjImpl(Context), sycl::async_handler{}, - sycl::property_list{}); + // Copy nodes from GraphImpl and merge any subgraph nodes into this graph. + duplicateNodes(); + + // A placeholder queue is only required for enqueueNode and update + // operations which are only possible with the command buffer path. + if (auto PlaceholderQueuePtr = GraphImpl->getLastRecordedQueue()) { + MQueueImpl = std::move(PlaceholderQueuePtr); + } else { + MQueueImpl = sycl::detail::queue_impl::create( + *sycl::detail::getSyclObjImpl(GraphImpl->getDevice()), + *sycl::detail::getSyclObjImpl(Context), sycl::async_handler{}, + sycl::property_list{}); + } } } @@ -951,6 +1128,16 @@ exec_graph_impl::~exec_graph_impl() { sycl::detail::getSyclObjImpl(MContext)->getAdapter(); MSchedule.clear(); + // Clean up native UR executable graph if it was created + if (MNativeExecutableGraphHandle) { + ur_result_t Res = Adapter.call_nocheck< + sycl::detail::UrApiKind::urGraphExecutableGraphDestroyExp>( + MNativeExecutableGraphHandle); + if (Res == UR_RESULT_SUCCESS) { + MNativeExecutableGraphHandle = nullptr; + } + } + // Clean up any graph-owned allocations that were allocated MGraphImpl->getMemPool().deallocateAndUnmapAll(); @@ -1205,12 +1392,60 @@ exec_graph_impl::enqueuePartitions(sycl::detail::queue_impl &Queue, return SignalEvent; } +EventImplPtr +exec_graph_impl::enqueueNative(sycl::detail::queue_impl &Queue, + sycl::detail::CG::StorageInitHelper CGData, + bool EventNeeded) { + // Create a list containing all the UR event handles in WaitEvents. + // WaitEvents is assumed to be safe for scheduler bypass and any + // host-task events that it contains can be ignored. + auto &WaitEvents = CGData.MEvents; + std::vector UrEventHandles{}; + UrEventHandles.reserve(WaitEvents.size()); + for (auto &SyclWaitEvent : WaitEvents) { + if (auto URHandle = SyclWaitEvent->getHandle()) { + UrEventHandles.push_back(URHandle); + } + } + + const size_t UrEnqueueWaitListSize = UrEventHandles.size(); + ur_event_handle_t *UrEnqueueWaitList = + UrEnqueueWaitListSize == 0 ? nullptr : UrEventHandles.data(); + EventImplPtr NewEvent = nullptr; + if (!EventNeeded) { + Queue.getAdapter().call( + Queue.getHandleRef(), MNativeExecutableGraphHandle, + UrEnqueueWaitListSize, UrEnqueueWaitList, nullptr); + } else { + NewEvent = sycl::detail::event_impl::create_device_event(Queue); + NewEvent->setContextImpl(Queue.getContextImpl()); + NewEvent->setStateIncomplete(); + NewEvent->setSubmissionTime(); + ur_event_handle_t UrEvent = nullptr; + Queue.getAdapter().call( + Queue.getHandleRef(), MNativeExecutableGraphHandle, + UrEnqueueWaitListSize, UrEnqueueWaitList, &UrEvent); + NewEvent->setHandle(UrEvent); + NewEvent->setEventFromSubmittedExecCommandBuffer(true); + if (MEnableProfiling) { + NewEvent->setProfilingEnabled(MEnableProfiling); + } + } + return NewEvent; +} + std::pair exec_graph_impl::enqueue(sycl::detail::queue_impl &Queue, sycl::detail::CG::StorageInitHelper CGData, bool EventNeeded) { WriteLock Lock(MMutex); + // Use native recording path if available + if (MNativeExecutableGraphHandle) { + return {enqueueNative(Queue, std::move(CGData), EventNeeded), + /*SkipScheduler=*/true}; + } + // Command buffer path cleanupExecutionEvents(MSchedulerDependencies); CGData.MEvents.insert(CGData.MEvents.end(), MSchedulerDependencies.begin(), MSchedulerDependencies.end()); @@ -1412,6 +1647,11 @@ void exec_graph_impl::duplicateNodes() { } void exec_graph_impl::update(std::shared_ptr GraphImpl) { + if (MNativeExecutableGraphHandle) { + throw sycl::exception( + sycl::make_error_code(errc::feature_not_supported), + "Graph update is not supported in native recording mode"); + } if (MDevice != GraphImpl->getDevice()) { throw sycl::exception( @@ -1484,6 +1724,11 @@ void exec_graph_impl::update(node_impl &Node) { } void exec_graph_impl::update(nodes_range Nodes) { + if (MNativeExecutableGraphHandle) { + throw sycl::exception( + sycl::make_error_code(errc::feature_not_supported), + "Graph update is not supported in native recording mode"); + } if (!MIsUpdatable) { throw sycl::exception(sycl::make_error_code(errc::invalid), "update() cannot be called on a executable graph " @@ -1974,12 +2219,47 @@ void modifiable_command_graph::end_recording() { void modifiable_command_graph::end_recording(queue &RecordingQueue) { queue_impl &QueueImpl = *sycl::detail::getSyclObjImpl(RecordingQueue); - if (QueueImpl.getCommandGraph() == impl) { - QueueImpl.setCommandGraph(nullptr); + + // Check if this queue is recording to this graph + bool IsRecordingToThisGraph = false; + + if (isNativeRecordingEnabledForGraph(*impl)) { + // For native recording, check if queue is in our recording queue list graph_impl::WriteLock Lock(impl->MMutex); - impl->removeQueue(QueueImpl); + IsRecordingToThisGraph = impl->isQueueRecording(QueueImpl); + + if (IsRecordingToThisGraph) { + // End native UR graph capture + assert(impl->getNativeGraphHandle() && + "Native graph handle must be valid when ending native recording"); + auto UrQueue = QueueImpl.getHandleRef(); + ur_exp_graph_handle_t CapturedGraph = nullptr; + context_impl &ContextImpl = + *sycl::detail::getSyclObjImpl(impl->getContext()); + sycl::detail::adapter_impl &Adapter = ContextImpl.getAdapter(); + ur_result_t Result = + Adapter + .call_nocheck( + UrQueue, &CapturedGraph); + if (Result != UR_RESULT_SUCCESS) { + throw sycl::exception(sycl::make_error_code(errc::runtime), + "Failed to end native UR graph capture"); + } + assert(CapturedGraph == impl->getNativeGraphHandle() && + "Captured graph handle must match the graph's native handle"); + impl->removeQueue(QueueImpl); + } + } else { + // Traditional recording path + if (QueueImpl.getCommandGraph() == impl) { + QueueImpl.setCommandGraph(nullptr); + graph_impl::WriteLock Lock(impl->MMutex); + impl->removeQueue(QueueImpl); + IsRecordingToThisGraph = true; + } } - if (QueueImpl.hasCommandGraph()) + + if (QueueImpl.hasCommandGraph() && !IsRecordingToThisGraph) throw sycl::exception(sycl::make_error_code(errc::invalid), "end_recording called for a queue which is recording " "to a different graph."); @@ -2007,10 +2287,22 @@ void modifiable_command_graph::print_graph(sycl::detail::string_view pathstr, std::vector modifiable_command_graph::get_nodes() const { graph_impl::ReadLock Lock(impl->MMutex); + if (impl->getNativeGraphHandle()) { + throw sycl::exception( + sycl::make_error_code(errc::invalid), + "get_nodes() is not supported for graphs created with the " + "enable_native_recording property."); + } return impl->nodes().to>(); } std::vector modifiable_command_graph::get_root_nodes() const { graph_impl::ReadLock Lock(impl->MMutex); + if (impl->getNativeGraphHandle()) { + throw sycl::exception( + sycl::make_error_code(errc::invalid), + "get_root_nodes() is not supported for graphs created with the " + "enable_native_recording property."); + } return impl->roots().to>(); } @@ -2057,18 +2349,22 @@ executable_command_graph::executable_command_graph( } void executable_command_graph::finalizeImpl() { - impl->makePartitions(); - - // Handle any work required for graph-owned memory allocations - impl->finalizeMemoryAllocations(); - - auto Device = impl->getGraphImpl()->getDevice(); - for (auto Partition : impl->getPartitions()) { - if (!Partition->MIsHostTask) { - impl->createCommandBuffers(Device, Partition); + // Partitions and command buffers are not used for native recording and + // instantiation is fully performed in the exec_graph_impl constructor. + if (!impl->getNativeExecutableGraphHandle()) { + impl->makePartitions(); + + // Handle any work required for graph-owned memory allocations + impl->finalizeMemoryAllocations(); + + auto Device = impl->getGraphImpl()->getDevice(); + for (auto Partition : impl->getPartitions()) { + if (!Partition->MIsHostTask) { + impl->createCommandBuffers(Device, Partition); + } } + impl->buildRequirements(); } - impl->buildRequirements(); } void executable_command_graph::update( diff --git a/sycl/source/detail/graph/graph_impl.hpp b/sycl/source/detail/graph/graph_impl.hpp index a6c0110bd4ff4..8151285368fe1 100644 --- a/sycl/source/detail/graph/graph_impl.hpp +++ b/sycl/source/detail/graph/graph_impl.hpp @@ -23,6 +23,12 @@ #include // for shared_mutex #include // for vector +// Forward declare UR types +struct ur_exp_graph_handle_t_; +using ur_exp_graph_handle_t = ur_exp_graph_handle_t_ *; +struct ur_exp_executable_graph_handle_t_; +using ur_exp_executable_graph_handle_t = ur_exp_executable_graph_handle_t_ *; + // For testing of graph internals class GraphImplTest; @@ -308,21 +314,34 @@ class graph_impl : public std::enable_shared_from_this { /// Prints the contents of the graph to a text file in DOT format. /// @param FilePath Path to the output file. - /// @param Verbose If true, print additional information about the nodes such - /// as kernel args or memory access where applicable. + /// @param Verbose If true (and native recording is not enabled), print + /// additional information about the nodes such as kernel args or memory + /// access where applicable. void printGraphAsDot(const std::string FilePath, bool Verbose) const { - /// Vector of nodes visited during the graph printing - std::vector VisitedNodes; + if (MNativeGraphHandle) { + context_impl &ContextImpl = *sycl::detail::getSyclObjImpl(MContext); + sycl::detail::adapter_impl &Adapter = ContextImpl.getAdapter(); + ur_result_t Result = + Adapter.call_nocheck( + MNativeGraphHandle, FilePath.c_str()); + if (Result != UR_RESULT_SUCCESS) { + throw sycl::exception(sycl::make_error_code(errc::runtime), + "Failed to dump native UR graph contents"); + } + } else { + /// Vector of nodes visited during the graph printing + std::vector VisitedNodes; - std::fstream Stream(FilePath, std::ios::out); - Stream << "digraph dot {" << std::endl; + std::fstream Stream(FilePath, std::ios::out); + Stream << "digraph dot {" << std::endl; - for (node_impl &Node : roots()) - Node.printDotRecursive(Stream, VisitedNodes, Verbose); + for (node_impl &Node : roots()) + Node.printDotRecursive(Stream, VisitedNodes, Verbose); - Stream << "}" << std::endl; + Stream << "}" << std::endl; - Stream.close(); + Stream.close(); + } } /// Make an edge between two nodes in the graph. Performs some mandatory @@ -519,7 +538,26 @@ class graph_impl : public std::enable_shared_from_this { } } + /// Get the native UR graph handle for this graph. + /// @return Native UR graph handle, or nullptr if native recording is not + /// enabled. + ur_exp_graph_handle_t getNativeGraphHandle() const { + return MNativeGraphHandle; + } + + /// Check if a queue is currently recording to this graph. + /// @param Queue The queue to check. + /// @return True if the queue is recording to this graph, false otherwise. + bool isQueueRecording(sycl::detail::queue_impl &Queue); + private: + /// Common implementation for beginRecording and beginRecordingUnlockedQueue. + /// @param[in] Queue The queue to be recorded from. + /// @param[in] AcquireQueueLock Whether to acquire the queue lock when setting + /// command graph. + void beginRecordingImpl(sycl::detail::queue_impl &Queue, + bool AcquireQueueLock); + template node_impl &createNode(Ts &&...Args) { MNodeStorage.push_back( std::make_shared(std::forward(Args)...)); @@ -582,6 +620,14 @@ class graph_impl : public std::enable_shared_from_this { /// presence of the assume_buffer_outlives_graph property. bool MAllowBuffers = false; + /// Native UR graph handle used for native recording mode. + /// + /// This handle is non-null only when native recording is enabled via the + /// enable_native_recording property. + /// + /// @note Native recording requires immediate command lists. + ur_exp_graph_handle_t MNativeGraphHandle = nullptr; + /// Mapping from queues to barrier nodes. For each queue the last barrier /// node recorded to the graph from the queue is stored. std::map, node_impl *, @@ -600,6 +646,13 @@ class graph_impl : public std::enable_shared_from_this { std::atomic MExecGraphCount = 0; }; +/// Get whether native recording is enabled for this graph. +/// @param graph The graph_impl to check. +/// @return True if native recording is enabled, false otherwise. +inline bool isNativeRecordingEnabledForGraph(graph_impl const &graph) { + return graph.getNativeGraphHandle() != nullptr; +} + /// Class representing the implementation of command_graph. class exec_graph_impl { public: @@ -647,6 +700,16 @@ class exec_graph_impl { enqueue(sycl::detail::queue_impl &Queue, sycl::detail::CG::StorageInitHelper CGData, bool EventNeeded); + /// Enqueue a native UR graph (used when native recording is enabled). + /// @param Queue Command-queue to schedule execution on. + /// @param CGData Command-group data for waitlist event dependencies. + /// @param EventNeeded Whether an event signalling the completion of this + /// operation needs to be returned. + /// @return Returns an event if requested and nullptr otherwise. + EventImplPtr enqueueNative(sycl::detail::queue_impl &Queue, + sycl::detail::CG::StorageInitHelper CGData, + bool EventNeeded); + /// Iterates through all the nodes in the graph to build the list of /// accessor requirements for the whole graph and for each partition. void buildRequirements(); @@ -675,6 +738,13 @@ class exec_graph_impl { /// @return pointer to the graph_impl MGraphImpl const std::shared_ptr &getGraphImpl() const { return MGraphImpl; } + /// Query the native executable graph handle. + /// @return Native UR executable graph handle, or nullptr if not using native + /// recording. + ur_exp_executable_graph_handle_t getNativeExecutableGraphHandle() const { + return MNativeExecutableGraphHandle; + } + /// Query the vector of the partitions composing the exec_graph. /// @return Vector of partitions in execution order. const std::vector> &getPartitions() const { @@ -947,6 +1017,11 @@ class exec_graph_impl { /// If true, the graph profiling is enabled. bool MEnableProfiling; + /// Native UR executable graph handle for native recording mode + /// Only valid when the original modifiable graph was created with native + /// recording enabled + ur_exp_executable_graph_handle_t MNativeExecutableGraphHandle = nullptr; + // Stores a cache of node ids from modifiable graph nodes to the companion // node(s) in this graph. Used for quick access when updating this graph. std::multimap MIDCache; diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index b669a0fa63400..5719496c5ac33 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -531,6 +531,24 @@ EventImplPtr queue_impl::submit_barrier_direct_impl( return ResEvent; } +bool queue_impl::isNativeRecording() const { + bool IsGraphCaptureEnabled = false; + ur_result_t Result = + getAdapter().call_nocheck( + MQueue, &IsGraphCaptureEnabled); + return Result == UR_RESULT_SUCCESS && IsGraphCaptureEnabled; +} + +ext::oneapi::experimental::queue_state +queue_impl::ext_oneapi_get_state_impl() const { + // A graph may either be recording at the SYCL level or recording at a lower + // level API (e.g. UR) + if (hasCommandGraph() || isNativeRecording()) { + return ext::oneapi::experimental::queue_state::recording; + } + return ext::oneapi::experimental::queue_state::executing; +} + EventImplPtr queue_impl::submit_command_to_graph( ext::oneapi::experimental::detail::graph_impl &GraphImpl, std::unique_ptr CommandGroup, sycl::detail::CGType CGType, diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 72844e7e796ef..cfea2e3b933c0 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -646,6 +646,10 @@ class queue_impl : public std::enable_shared_from_this { bool hasCommandGraph() const { return !MGraph.expired(); } + bool isNativeRecording() const; + + ext::oneapi::experimental::queue_state ext_oneapi_get_state_impl() const; + EventImplPtr submit_command_to_graph( ext::oneapi::experimental::detail::graph_impl &GraphImpl, std::unique_ptr CommandGroup, sycl::detail::CGType CGType, diff --git a/sycl/source/feature_test.hpp.in b/sycl/source/feature_test.hpp.in index 9a3464f887d15..9033d8640f0d9 100644 --- a/sycl/source/feature_test.hpp.in +++ b/sycl/source/feature_test.hpp.in @@ -80,7 +80,7 @@ inline namespace _V1 { #define SYCL_EXT_ONEAPI_VIRTUAL_MEM 1 #define SYCL_EXT_ONEAPI_USM_MALLOC_PROPERTIES 1 #define SYCL_EXT_INTEL_CACHE_CONFIG 1 -#define SYCL_EXT_ONEAPI_GRAPH 1 +#define SYCL_EXT_ONEAPI_GRAPH 2 #define SYCL_EXT_CODEPLAY_MAX_REGISTERS_PER_WORK_GROUP_QUERY 1 #define SYCL_EXT_ONEAPI_DEVICE_GLOBAL 1 #define SYCL_EXT_INTEL_QUEUE_IMMEDIATE_COMMAND_LIST 1 diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 434b2be35b1fe..868ec56f3c888 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -752,9 +752,24 @@ detail::EventImplPtr handler::finalize() { return EventImpl; } - // Because graph case is handled right above. + // Because command graph case is handled right above. assert(Queue); + // Native graph recording limitation + if (Queue->isNativeRecording()) { + if (type == detail::CGType::CodeplayHostTask) { + throw sycl::exception( + make_error_code(errc::feature_not_supported), + "SYCL host_task is not supported in native recording mode. Use " + "zeCommandListAppendHostFunction as a workaround."); + } + if (!CommandGroup->getRequirements().empty()) { + throw sycl::exception( + make_error_code(errc::feature_not_supported), + "sycl::buffer accessors are not supported in native recording mode."); + } + } + // If the queue has an associated graph then we need to take the CG and pass // it to the graph to create a node, rather than submit it to the scheduler. if (auto GraphImpl = Queue->getCommandGraph(); GraphImpl) { diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 11271eaba796c..355e8fcb4c02b 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -76,9 +76,7 @@ context queue::get_context() const { return impl->get_context(); } device queue::get_device() const { return impl->get_device(); } ext::oneapi::experimental::queue_state queue::ext_oneapi_get_state() const { - return impl->hasCommandGraph() - ? ext::oneapi::experimental::queue_state::recording - : ext::oneapi::experimental::queue_state::executing; + return impl->ext_oneapi_get_state_impl(); } ext::oneapi::experimental::command_graph< diff --git a/sycl/test-e2e/Graph/Explicit/empty.cpp b/sycl/test-e2e/Graph/Explicit/empty.cpp new file mode 100644 index 0000000000000..2a76632891965 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/empty.cpp @@ -0,0 +1,8 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/empty.cpp" diff --git a/sycl/test-e2e/Graph/Inputs/empty.cpp b/sycl/test-e2e/Graph/Inputs/empty.cpp new file mode 100644 index 0000000000000..f97c292d827a6 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/empty.cpp @@ -0,0 +1,28 @@ +// Tests the empty() method on modifiable command graphs. + +#include "../graph_common.hpp" + +#include + +int main() { + queue Queue{{property::queue::in_order{}}}; + +#ifdef GRAPH_E2E_NATIVE_RECORDING + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::enable_native_recording{}}}; +#else + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; +#endif + + // Test that empty() returns true when graph has 0 nodes + assert(Graph.empty() && "Graph should be empty with 0 nodes"); + + add_node(Graph, Queue, [&](handler &CGH) { CGH.single_task([=]() {}); }); + + // Test that empty() returns false when graph has 1 node + assert(!Graph.empty() && "Graph should not be empty with 1 node"); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/work_group_scratch_memory.cpp b/sycl/test-e2e/Graph/Inputs/work_group_scratch_memory.cpp new file mode 100644 index 0000000000000..4e42bf307d18a --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/work_group_scratch_memory.cpp @@ -0,0 +1,79 @@ +#include +#include +#include +#include +#include +#include + +using DataType = int; + +namespace sycl_ext = sycl::ext::oneapi::experimental; + +void copy_via_smem(DataType *a, DataType *b, sycl::nd_item<1> it) { + // one work-group copies data to shared memory from A + // And then puts in back into B + + DataType *smem_ptr = + reinterpret_cast(sycl_ext::get_work_group_scratch_memory()); + auto threadIdx_x = it.get_local_linear_id(); + + smem_ptr[threadIdx_x] = a[threadIdx_x]; + sycl::group_barrier(it.get_group()); + + b[threadIdx_x] = smem_ptr[threadIdx_x]; +} + +template struct KernelFunctor { + T m_props; + DataType *m_a; + DataType *m_b; + KernelFunctor(T props, DataType *a, DataType *b) + : m_props(props), m_a(a), m_b(b) {} + + void operator()(sycl::nd_item<1> it) const { copy_via_smem(m_a, m_b, it); } + auto get(sycl_ext::properties_tag) const { return m_props; } +}; + +int main() { + sycl::queue queue{sycl::property::queue::in_order()}; +#ifdef GRAPH_E2E_NATIVE_RECORDING + sycl::ext::oneapi::experimental::command_graph graph( + queue.get_device(), + {sycl_ext::property::graph::enable_native_recording{}}); +#else + sycl::ext::oneapi::experimental::command_graph graph(queue.get_device()); +#endif + + auto size = std::min( + queue.get_device().get_info(), + size_t{1024}); + + DataType *a = sycl::malloc_device(size, queue); + DataType *b = sycl::malloc_device(size, queue); + std::vector a_host(size, 1.0); + std::vector b_host(size, -5.0); + + graph.begin_recording(queue); + queue.copy(a_host.data(), a, size); + + queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<1>({size}, {size}), + KernelFunctor(sycl_ext::properties{sycl_ext::work_group_scratch_size( + size * sizeof(DataType))}, + a, b)); + }); + + queue.copy(b, b_host.data(), size); + graph.end_recording(); + auto exec_graph = graph.finalize(); + + queue.submit([&](sycl::handler &cgh) { cgh.ext_oneapi_graph(exec_graph); }) + .wait_and_throw(); + + for (size_t i = 0; i < b_host.size(); i++) { + assert(b_host[i] == a_host[i]); + } + sycl::free(a, queue); + sycl::free(b, queue); +} diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/basic.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/basic.cpp new file mode 100644 index 0000000000000..e72250382fbc3 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/basic.cpp @@ -0,0 +1,67 @@ +// REQUIRES: level_zero_v2_adapter && arch-intel_gpu_bmg_g21 + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// Test for enable_native_recording property using queue recording mode + +#include "../../graph_common.hpp" + +#include + +int main() { + // Create a queue with immediate command list property for native recording + queue Queue{property::queue::in_order{}}; + + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::enable_native_recording{}}}; + + const size_t N = 1024; + int *Data = malloc_device(N, Queue); + + QueueStateVerifier verifier(Queue); + verifier.verify(EXECUTING); + + // Use queue recording mode to create the graph + Graph.begin_recording(Queue); + verifier.verify(RECORDING); + + // Record initialization kernel + Queue.submit([&](handler &CGH) { + CGH.parallel_for(range<1>{N}, + [=](id<1> idx) { Data[idx] = static_cast(idx); }); + }); + + // Record computation kernel + Queue.submit([&](handler &CGH) { + CGH.parallel_for(range<1>{N}, + [=](id<1> idx) { Data[idx] = Data[idx] * 2; }); + }); + + Graph.end_recording(Queue); + verifier.verify(EXECUTING); + + // Finalize and execute the graph + auto ExecutableGraph = Graph.finalize(); + + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecutableGraph); }); + + Queue.wait(); + + // Verify results + std::vector HostData(N); + Queue.memcpy(HostData.data(), Data, N * sizeof(int)).wait(); + + for (size_t i = 0; i < N; i++) { + int Expected = static_cast(i) * 2; + assert(check_value(i, Expected, HostData[i], "HostData")); + } + + free(Data, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/debug_print_graph.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/debug_print_graph.cpp new file mode 100644 index 0000000000000..08eca94d52703 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/debug_print_graph.cpp @@ -0,0 +1,52 @@ +// REQUIRES: level_zero_v2_adapter && arch-intel_gpu_bmg_g21 + +// RUN: %{build} -o %t.out +// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out ; FileCheck %s --implicit-check-not=LEAK --input-file graph_native.dot %} + +// Test for native recording with debug print graph functionality. SYCL graph +// does not control the output format, so we only verify high level details of +// the output and rely on validation tests in L0 graph + +// CHECK: digraph +// CHECK: zeCommandListAppendMemoryFill +// CHECK: MyKernel +// CHECK: zeCommandListAppendMemoryCopy + +#include "../../graph_common.hpp" + +#include +#include + +class MyKernel; + +int main() { + queue Queue{property::queue::in_order{}}; + + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::enable_native_recording{}}}; + + const size_t N = 128; + + int *DeviceData = malloc_device(N, Queue); + int *DeviceTemp = malloc_device(N, Queue); + + Graph.begin_recording(Queue); + + exp_ext::memset(Queue, DeviceData, 0x42, N * sizeof(int)); + Queue.submit([&](handler &CGH) { + CGH.parallel_for( + range<1>{N}, [=](id<1> idx) { DeviceData[idx] = DeviceData[idx] + 1; }); + }); + exp_ext::memcpy(Queue, DeviceTemp, DeviceData, N * sizeof(int)); + + Graph.end_recording(Queue); + + Graph.print_graph("graph_native.dot"); + + free(DeviceData, Queue); + free(DeviceTemp, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/empty.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/empty.cpp new file mode 100644 index 0000000000000..e31efe8a06118 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/empty.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero_v2_adapter && arch-intel_gpu_bmg_g21 + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_RECORD_REPLAY +#define GRAPH_E2E_NATIVE_RECORDING + +#include "../../Inputs/empty.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_async_alloc.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_async_alloc.cpp new file mode 100644 index 0000000000000..790d0e048fb23 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_async_alloc.cpp @@ -0,0 +1,61 @@ +// REQUIRES: level_zero_v2_adapter && arch-intel_gpu_bmg_g21 + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#include "../../graph_common.hpp" + +#include +#include + +int main() { + queue Queue{property::queue::in_order{}}; + + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::enable_native_recording{}}}; + + constexpr size_t N = 1024; + + Graph.begin_recording(Queue); + + // Test 1: async_malloc should throw during recording + void *AsyncPtr = nullptr; + if (!expectException( + [&]() { + Queue.submit([&](handler &CGH) { + AsyncPtr = exp_ext::async_malloc(CGH, usm::alloc::device, + N * sizeof(int)); + }); + }, + "async_malloc during native recording", sycl::errc::invalid)) { + Graph.end_recording(); + return 1; + } + + // Test 2: async_free should also throw during recording + // First allocate memory outside of recording + Graph.end_recording(); + int *PreAllocatedPtr = malloc_device(N, Queue); + Graph.begin_recording(Queue); + + if (!expectException( + [&]() { + Queue.submit([&](handler &CGH) { + exp_ext::async_free(CGH, PreAllocatedPtr); + }); + }, + "async_free during native recording", sycl::errc::invalid)) { + Graph.end_recording(); + free(PreAllocatedPtr, Queue); + return 1; + } + + Graph.end_recording(); + free(PreAllocatedPtr, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_buffer_accessor.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_buffer_accessor.cpp new file mode 100644 index 0000000000000..1e253ca136f89 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_buffer_accessor.cpp @@ -0,0 +1,47 @@ +// REQUIRES: level_zero_v2_adapter && arch-intel_gpu_bmg_g21 + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#include "../../graph_common.hpp" + +#include + +int main() { + queue Queue{property::queue::in_order{}}; + + constexpr size_t N = 1024; + sycl::buffer Buf{N}; + + // Use assume_buffer_outlives_graph so the graph-level buffer check doesn't + // fire before the native-recording check in the handler. + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::enable_native_recording{}, + exp_ext::property::graph::assume_buffer_outlives_graph{}}}; + + Graph.begin_recording(Queue); + + // Submitting a kernel that uses a buffer accessor should throw + // errc::feature_not_supported in native recording mode. + if (!expectException( + [&]() { + Queue.submit([&](handler &CGH) { + auto Acc = Buf.get_access(CGH); + CGH.parallel_for(sycl::range<1>{N}, + [=](sycl::id<1> Idx) { Acc[Idx] = 0; }); + }); + }, + "buffer accessor in native recording mode", + sycl::errc::feature_not_supported)) { + Graph.end_recording(); + return 1; + } + + Graph.end_recording(); + + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_explicit_api.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_explicit_api.cpp new file mode 100644 index 0000000000000..5f7ba8452ff8a --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_explicit_api.cpp @@ -0,0 +1,61 @@ +// REQUIRES: level_zero_v2_adapter && arch-intel_gpu_bmg_g21 + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#include "../../graph_common.hpp" + +#include + +int main() { + queue Queue{property::queue::in_order{}}; + + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::enable_native_recording{}}}; + + const size_t N = 1024; + int *Data = malloc_device(N, Queue); + + // Test 1: Graph.add() should throw when native recording is enabled + if (!expectException( + [&]() { + auto node = Graph.add([&](handler &CGH) { + CGH.parallel_for(range<1>{N}, + [=](id<1> idx) { Data[idx] = idx; }); + }); + }, + "Graph.add() with native recording enabled", + sycl::errc::feature_not_supported)) { + free(Data, Queue); + return 1; + } + + // Test 2: Try using Graph.add() with dependencies - should also throw + if (!expectException( + [&]() { + auto node1 = Graph.add([&](handler &CGH) { + CGH.parallel_for(range<1>{N}, + [=](id<1> idx) { Data[idx] = idx; }); + }); + + auto node2 = Graph.add( + [&](handler &CGH) { + CGH.parallel_for(range<1>{N}, + [=](id<1> idx) { Data[idx] *= 2; }); + }, + {exp_ext::property::node::depends_on{node1}}); + }, + "Graph.add() with dependencies", + sycl::errc::feature_not_supported)) { + free(Data, Queue); + return 1; + } + + free(Data, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_get_nodes.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_get_nodes.cpp new file mode 100644 index 0000000000000..2d9beb56c1700 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_get_nodes.cpp @@ -0,0 +1,35 @@ +// REQUIRES: level_zero_v2_adapter && arch-intel_gpu_bmg_g21 + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#include "../../graph_common.hpp" + +#include + +int main() { + queue Queue{property::queue::in_order{}}; + + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::enable_native_recording{}}}; + + // get_nodes() should throw errc::invalid for native recording graphs + if (!expectException([&]() { Graph.get_nodes(); }, + "get_nodes() with enable_native_recording", + sycl::errc::invalid)) { + return 1; + } + + // get_root_nodes() should throw errc::invalid for native recording graphs + if (!expectException([&]() { Graph.get_root_nodes(); }, + "get_root_nodes() with enable_native_recording", + sycl::errc::invalid)) { + return 1; + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_host_task.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_host_task.cpp new file mode 100644 index 0000000000000..e009df3cde814 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_host_task.cpp @@ -0,0 +1,48 @@ +// REQUIRES: level_zero_v2_adapter && arch-intel_gpu_bmg_g21 + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#include "../../graph_common.hpp" + +#include + +int main() { + queue Queue{property::queue::in_order{}}; + + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::enable_native_recording{}}}; + + constexpr size_t N = 1024; + int *Data = malloc_shared(N, Queue); + + Graph.begin_recording(Queue); + + // Try to record a SYCL host_task - this should throw an exception + if (!expectException( + [&]() { + Queue.submit([&](handler &CGH) { + CGH.host_task([=]() { + // This host task should not execute in native recording mode + for (size_t i = 0; i < N; i++) { + Data[i] = i + 100; + } + }); + }); + }, + "host_task in native recording", + sycl::errc::feature_not_supported)) { + Graph.end_recording(); + free(Data, Queue); + return 1; + } + + Graph.end_recording(); + free(Data, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_multi_queue_recording.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_multi_queue_recording.cpp new file mode 100644 index 0000000000000..5c433f49cd2b0 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_multi_queue_recording.cpp @@ -0,0 +1,50 @@ +// REQUIRES: level_zero_v2_adapter && arch-intel_gpu_bmg_g21 + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#include "../../graph_common.hpp" + +#include + +int main() { + device Dev; + context Ctx{Dev}; + + // Create two in-order queues sharing the same device and context + queue Queue1{Ctx, Dev, {property::queue::in_order{}}}; + queue Queue2{Ctx, Dev, {property::queue::in_order{}}}; + + exp_ext::command_graph Graph{ + Ctx, Dev, {exp_ext::property::graph::enable_native_recording{}}}; + + constexpr size_t N = 1024; + int *Data = malloc_device(N, Dev, Ctx); + + Graph.begin_recording(Queue1); + + Queue1.submit([&](handler &CGH) { + CGH.parallel_for(range<1>{N}, [=](id<1> idx) { Data[idx] = idx; }); + }); + + // Try to start recording on Queue2 while Queue1 is still recording + const bool passed = expectException([&]() { Graph.begin_recording(Queue2); }, + "begin_recording on second queue", + sycl::errc::feature_not_supported); + + assert(Queue1.ext_oneapi_get_state() == exp_ext::queue_state::recording); + assert(Queue2.ext_oneapi_get_state() == exp_ext::queue_state::executing); + + Graph.end_recording(Queue1); + free(Data, Ctx); + + if (!passed) { + std::cerr << "Expected a thrown exception when starting recording twice" + << std::endl; + return 1; + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_out_of_order_queue.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_out_of_order_queue.cpp new file mode 100644 index 0000000000000..b006446283f48 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_out_of_order_queue.cpp @@ -0,0 +1,27 @@ +// REQUIRES: level_zero_v2_adapter && arch-intel_gpu_bmg_g21 + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#include "../../graph_common.hpp" + +#include + +int main() { + device Dev; + context Ctx{Dev}; + queue OutOfOrderQueue{Ctx, Dev}; + exp_ext::command_graph Graph{ + Ctx, Dev, {exp_ext::property::graph::enable_native_recording{}}}; + + if (!expectException([&]() { Graph.begin_recording(OutOfOrderQueue); }, + "begin_recording with out-of-order queue", + sycl::errc::feature_not_supported)) { + std::cerr << "Out-of-order queue should throw exception" << std::endl; + return 1; + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_update.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_update.cpp new file mode 100644 index 0000000000000..0b0392c5fc8e1 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_update.cpp @@ -0,0 +1,50 @@ +// REQUIRES: level_zero_v2_adapter && arch-intel_gpu_bmg_g21 + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// Test that update() throws when called on a graph in native recording mode + +#include "../../graph_common.hpp" + +#include + +int main() { + queue Queue{property::queue::in_order{}}; + + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::enable_native_recording{}}}; + + const size_t N = 1024; + int *Data = malloc_device(N, Queue); + + Graph.begin_recording(Queue); + Queue.submit([&](handler &CGH) { + CGH.parallel_for(range<1>{N}, [=](id<1> idx) { Data[idx] = idx; }); + }); + Graph.end_recording(); + + auto ExecGraph = Graph.finalize({exp_ext::property::graph::updatable{}}); + + if (!expectException([&]() { ExecGraph.update(Graph); }, + "update(graph) with native recording enabled", + sycl::errc::feature_not_supported)) { + free(Data, Queue); + return 1; + } + + if (!expectException( + [&]() { ExecGraph.update(std::vector{}); }, + "update(nodes) with native recording enabled", + sycl::errc::feature_not_supported)) { + free(Data, Queue); + return 1; + } + + free(Data, Queue); + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/indirect_update.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/indirect_update.cpp new file mode 100644 index 0000000000000..4c75294b4aea3 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/indirect_update.cpp @@ -0,0 +1,125 @@ +// REQUIRES: level_zero_v2_adapter && arch-intel_gpu_bmg_g21 + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// A pointer to a structure may be utilized to hold mutable kernel arguments +// without using mutable command lists as the pointer itself remains constant. +// This test demonstrates updating input/output pointers and scalar arguments +// between graph submissions with a single wait at the end. + +#include "../../graph_common.hpp" + +#include + +struct MutableArguments { + int *InputPtr; + int *OutputPtr; + int Multiplier; + int Addend; +}; + +int main() { + queue Queue{property::queue::in_order{}}; + + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::enable_native_recording{}}}; + + const size_t N = 1024; + + int *Input1 = malloc_device(N, Queue); + int *Input2 = malloc_device(N, Queue); + int *Output1 = malloc_device(N, Queue); + int *Output2 = malloc_device(N, Queue); + + MutableArguments *DeviceArgs = malloc_device(1, Queue); + + std::vector HostInput1(N); + for (size_t i = 0; i < N; i++) { + HostInput1[i] = static_cast(i); + } + Queue.memcpy(Input1, HostInput1.data(), N * sizeof(int)).wait(); + + std::vector HostInput2(N); + for (size_t i = 0; i < N; i++) { + HostInput2[i] = static_cast(i * 2); + } + Queue.memcpy(Input2, HostInput2.data(), N * sizeof(int)).wait(); + + QueueStateVerifier verifier(Queue); + verifier.verify(EXECUTING); + + Graph.begin_recording(Queue); + verifier.verify(RECORDING); + + Queue.submit([&](handler &CGH) { + CGH.parallel_for(range<1>{N}, [=](id<1> idx) { + int input = DeviceArgs->InputPtr[idx]; + int multiplier = DeviceArgs->Multiplier; + int addend = DeviceArgs->Addend; + DeviceArgs->OutputPtr[idx] = input * multiplier + addend; + }); + }); + + Graph.end_recording(Queue); + verifier.verify(EXECUTING); + + auto ExecutableGraph = Graph.finalize(); + + MutableArguments HostArgs1; + HostArgs1.InputPtr = Input1; + HostArgs1.OutputPtr = Output1; + HostArgs1.Multiplier = 2; + HostArgs1.Addend = 10; + + MutableArguments HostArgs2; + HostArgs2.InputPtr = Input2; + HostArgs2.OutputPtr = Output2; + HostArgs2.Multiplier = 3; + HostArgs2.Addend = 20; + + // Submit graph with first set of arguments (Input1 -> Output1) + Queue.memcpy(DeviceArgs, &HostArgs1, sizeof(MutableArguments)).wait(); + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecutableGraph); }); + + // Update the device arguments via memcpy + Queue.memcpy(DeviceArgs, &HostArgs2, sizeof(MutableArguments)); + + // Submit graph again with second set of arguments (Input2 -> Output2) + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecutableGraph); }); + + // Single wait for both graph submissions + Queue.wait(); + + // Read back both output buffers + std::vector HostOutput1(N); + std::vector HostOutput2(N); + Queue.memcpy(HostOutput1.data(), Output1, N * sizeof(int)).wait(); + Queue.memcpy(HostOutput2.data(), Output2, N * sizeof(int)).wait(); + + // Verify first execution results: + // Output1[i] = Input1[i] * 2 + 10 = i * 2 + 10 + for (size_t i = 0; i < N; i++) { + int Expected = static_cast(i) * 2 + 10; + assert(check_value(i, Expected, HostOutput1[i], "Output1")); + } + + // Verify second execution results: + // Output2[i] = Input2[i] * 3 + 20 = (i * 2) * 3 + 20 + for (size_t i = 0; i < N; i++) { + int Expected = static_cast(i) * 6 + 20; + assert(check_value(i, Expected, HostOutput2[i], "Output2")); + } + + free(Input1, Queue); + free(Input2, Queue); + free(Output1, Queue); + free(Output2, Queue); + free(DeviceArgs, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/memory_ops.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/memory_ops.cpp new file mode 100644 index 0000000000000..1afebb84f8066 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/memory_ops.cpp @@ -0,0 +1,74 @@ +// REQUIRES: level_zero_v2_adapter && arch-intel_gpu_bmg_g21 + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// Test for native recording with non-kernel memory operations using handlerless +// APIs: memcpy, memset, fill, copy + +#include "../../graph_common.hpp" + +#include +#include + +int main() { + queue Queue{property::queue::in_order{}}; + + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::enable_native_recording{}}}; + + const size_t N = 128; + + // Allocate two buffers + int *Data1 = malloc_device(2 * N, Queue); + int *Data2 = malloc_device(2 * N, Queue); + + // Host buffers for verification + std::vector HostData1(N); + std::vector HostData2(N); + + QueueStateVerifier verifier(Queue); + verifier.verify(EXECUTING); + + // Use queue recording mode to create the graph + Graph.begin_recording(Queue); + verifier.verify(RECORDING); + + // Test memory operations on two buffers: + // Data1: memset first half (byte-wise to 0x2A), D2D memcpy to second half and + // D2H memcpy for verification Data2: fill first half, D2D copy to second half + // and D2H copy for verification + + exp_ext::memset(Queue, Data1, 0x2A, N * sizeof(int)); + exp_ext::fill(Queue, Data2, 7, N); + + exp_ext::memcpy(Queue, Data1 + N, Data1, N * sizeof(int)); + exp_ext::memcpy(Queue, HostData1.data(), Data1 + N, N * sizeof(int)); + + exp_ext::copy(Queue, Data2, Data2 + N, N); + exp_ext::copy(Queue, Data2 + N, HostData2.data(), N); + + Graph.end_recording(Queue); + verifier.verify(EXECUTING); + + // Finalize and execute the graph + auto ExecutableGraph = Graph.finalize(); + + exp_ext::execute_graph(Queue, ExecutableGraph); + + Queue.wait(); + + for (size_t i = 0; i < N; i++) { + assert(check_value(i, 0x2A2A2A2A, HostData1[i], "Data1")); + assert(check_value(i, 7, HostData2[i], "Data2")); + } + + free(Data1, Queue); + free(Data2, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/multi_queue.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/multi_queue.cpp new file mode 100644 index 0000000000000..baa1792fe0d6a --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/multi_queue.cpp @@ -0,0 +1,112 @@ +// REQUIRES: level_zero_v2_adapter && arch-intel_gpu_bmg_g21 + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// Test for enable_native_recording property with multi-queue dot product +// Assesses event dependencies to, from, and within a native recording graph + +#include "../../graph_common.hpp" + +#include +#include + +int main() { + // Create context and device + device Dev; + context Ctx{Dev}; + + // Create two in-order queues sharing the same device and context + queue Queue1{Ctx, Dev, {property::queue::in_order{}}}; + queue Queue2{Ctx, Dev, {property::queue::in_order{}}}; + + QueueStateVerifier verifier(Queue1, Queue2); + + exp_ext::command_graph Graph{ + Ctx, Dev, {exp_ext::property::graph::enable_native_recording{}}}; + + const size_t N = 1024; + + // Allocate input vectors and partial result buffers + int *VecA = malloc_device(N, Dev, Ctx); + int *VecB = malloc_device(N, Dev, Ctx); + int *PartialResult1 = malloc_device(1, Dev, Ctx); + int *PartialResult2 = malloc_device(1, Dev, Ctx); + int *FinalResult = malloc_device(1, Dev, Ctx); + + verifier.verify(EXECUTING, EXECUTING); + + // Begin graph recording on Queue1 only + Graph.begin_recording(Queue1); + verifier.verify(RECORDING, EXECUTING); + + // Transform VecA + event Fork = + Queue1.parallel_for(range<1>{N}, [=](item<1> idx) { VecA[idx] *= 2; }); + + // Record partial dot product on first half with Queue2, transitioning to + // recording (fork) + event Join = Queue2.single_task({Fork}, [=]() { + int sum = 0; + for (size_t i = 0; i < N / 2; i++) { + sum += VecA[i] * VecB[i]; + } + PartialResult1[0] = sum; + }); + verifier.verify(RECORDING, RECORDING); + + // Record partial dot product on second half (Queue1) + exp_ext::single_task(Queue1, [=]() { + int sum = 0; + for (size_t i = N / 2; i < N; i++) { + sum += VecA[i] * VecB[i]; + } + PartialResult2[0] = sum; + }); + + // Record final reduction kernel with dependency on Queue2 event + Queue1.single_task({Join}, [=]() { + FinalResult[0] = PartialResult1[0] + PartialResult2[0]; + }); + + Graph.end_recording(); + verifier.verify(EXECUTING, EXECUTING); + + // Finalize and execute the graph + auto ExecutableGraph = Graph.finalize(); + + // Initialize input vector outside of graph. Use Queue2 to be able to test + // graph dependent event + event InitEvent = Queue2.parallel_for(range<1>{N}, [=](item<1> idx) { + VecA[idx] = static_cast(idx); + VecB[idx] = static_cast(idx) + 1; + }); + + auto GraphEvent = Queue1.ext_oneapi_graph(ExecutableGraph, {InitEvent}); + + // Wait for graph completion + GraphEvent.wait(); + + // Verify result + int HostResult = 0; + Queue1.memcpy(&HostResult, FinalResult, sizeof(int)); + Queue1.wait(); + + // Compute expected result + int Expected = 0; + for (int i = 0; i < N; i++) { + Expected += 2 * i * (i + 1); + } + + assert(check_value(0, Expected, HostResult, "DotProduct")); + + free(VecA, Ctx); + free(VecB, Ctx); + free(PartialResult1, Ctx); + free(PartialResult2, Ctx); + free(FinalResult, Ctx); + + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/multi_queue_barrier.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/multi_queue_barrier.cpp new file mode 100644 index 0000000000000..bf79c40f82ce1 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/multi_queue_barrier.cpp @@ -0,0 +1,109 @@ +// REQUIRES: level_zero_v2_adapter && arch-intel_gpu_bmg_g21 + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// Test for native recording with fork-join using barriers. Barriers are used to +// record two independent streams of operations without any dependencies between +// each other apart from the queue recording transition and join. + +#include "../../graph_common.hpp" + +#include +#include + +#include +#include + +int main() { + device Dev; + context Ctx{Dev}; + + queue Queue1{Ctx, Dev, {property::queue::in_order{}}}; + queue Queue2{Ctx, Dev, {property::queue::in_order{}}}; + + QueueStateVerifier verifier(Queue1, Queue2); + + exp_ext::command_graph Graph{ + Ctx, Dev, {exp_ext::property::graph::enable_native_recording{}}}; + + const size_t N = 1024; + + int *VecA = malloc_device(N, Dev, Ctx); + int *VecB = malloc_device(N, Dev, Ctx); + + // Initialize both arrays to 555 + Queue1.fill(VecA, 555, N).wait(); + Queue1.fill(VecB, 555, N).wait(); + verifier.verify(EXECUTING, EXECUTING); + + // 1) record on Queue1 + Graph.begin_recording(Queue1); + verifier.verify(RECORDING, EXECUTING); + + // 2) barrier (ext_oneapi_barrier) and recording transition on Queue2 + event ForkBarrier = Queue1.ext_oneapi_submit_barrier(); + exp_ext::partial_barrier(Queue2, {ForkBarrier}); + verifier.verify(RECORDING, RECORDING); + + // 3) two streams of independent kernels on Queue1 and Queue2 + exp_ext::parallel_for(Queue1, range<1>{N}, + [=](item<1> idx) { VecA[idx] = 1; }); + exp_ext::parallel_for(Queue1, range<1>{N}, + [=](item<1> idx) { VecA[idx] += 1; }); + exp_ext::parallel_for(Queue1, range<1>{N}, + [=](item<1> idx) { VecA[idx] *= 2; }); + + exp_ext::parallel_for(Queue2, range<1>{N}, + [=](item<1> idx) { VecB[idx] = 2; }); + exp_ext::parallel_for(Queue2, range<1>{N}, + [=](item<1> idx) { VecB[idx] *= 3; }); + exp_ext::parallel_for(Queue2, range<1>{N}, + [=](item<1> idx) { VecB[idx] += 1; }); + + // 4) join barrier + event JoinBarrier = Queue2.ext_oneapi_submit_barrier(); + exp_ext::partial_barrier(Queue1, {JoinBarrier}); + + Graph.end_recording(); + verifier.verify(EXECUTING, EXECUTING); + + // Wait on both queues after recording + Queue1.wait(); + Queue2.wait(); + + // Verify that values are still at initial value (no eager execution) + std::vector HostVecA_Check(N); + std::vector HostVecB_Check(N); + Queue1.memcpy(HostVecA_Check.data(), VecA, N * sizeof(int)).wait(); + Queue1.memcpy(HostVecB_Check.data(), VecB, N * sizeof(int)).wait(); + + assert(std::count(HostVecA_Check.begin(), HostVecA_Check.end(), 555) == N); + assert(std::count(HostVecB_Check.begin(), HostVecB_Check.end(), 555) == N); + + // Finalize and execute the graph + auto ExecutableGraph = Graph.finalize(); + + exp_ext::execute_graph(Queue1, ExecutableGraph); + Queue1.wait(); + + // Verify results + std::vector HostVecA(N); + std::vector HostVecB(N); + + Queue1.memcpy(HostVecA.data(), VecA, N * sizeof(int)).wait(); + Queue1.memcpy(HostVecB.data(), VecB, N * sizeof(int)).wait(); + + // VecA: 1 + 1 * 2 = 4 + assert(std::count(HostVecA.begin(), HostVecA.end(), 4) == N); + + // VecB: 2 * 3 + 1 = 7 + assert(std::count(HostVecB.begin(), HostVecB.end(), 7) == N); + + free(VecA, Ctx); + free(VecB, Ctx); + + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/pipeline_non_blocking.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/pipeline_non_blocking.cpp new file mode 100644 index 0000000000000..fea5be477cb03 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/pipeline_non_blocking.cpp @@ -0,0 +1,78 @@ +// REQUIRES: level_zero_v2_adapter && arch-intel_gpu_bmg_g21 + +// RUN: %{build} -o %t.out +// RUN: env UR_L0_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s + +// Test for native recording with pipelined graph execution. This test verifies +// that multiple graph executions can be batched with only a single host wait +// call at the end, demonstrating that all graph submissions are non-blocking. + +#include "../../graph_common.hpp" + +#include +#include + +#include + +int main() { + queue Queue{property::queue::in_order{}}; + + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::enable_native_recording{}}}; + + const size_t N = 1024; + int *Data = malloc_device(N, Queue); + + QueueStateVerifier verifier(Queue); + verifier.verify(EXECUTING); + + // Record graph with two kernels: add 3, then subtract 1 + Graph.begin_recording(Queue); + verifier.verify(RECORDING); + + exp_ext::parallel_for(Queue, range<1>{N}, [=](id<1> idx) { Data[idx] += 3; }); + exp_ext::parallel_for(Queue, range<1>{N}, [=](id<1> idx) { Data[idx] -= 1; }); + + Graph.end_recording(Queue); + verifier.verify(EXECUTING); + + auto ExecutableGraph = Graph.finalize(); + + // Initialize buffer, execute graph 20 times, copy results, then wait once + exp_ext::memset(Queue, Data, 0, N * sizeof(int)); + + std::cerr << "BEGIN_GRAPH_PIPELINE" << std::endl; + for (int i = 0; i < 20; i++) { + exp_ext::execute_graph(Queue, ExecutableGraph); + } + + std::vector HostData(N); + exp_ext::memcpy(Queue, HostData.data(), Data, N * sizeof(int)); + Queue.wait(); + std::cerr << "END_GRAPH_PIPELINE" << std::endl; + + // Verify results: 20 iterations of (add 3, subtract 1) = 40 + const int Expected = 40; + for (size_t i = 0; i < N; i++) { + assert(check_value(i, Expected, HostData[i], "HostData")); + } + + free(Data, Queue); + return 0; +} + +// Verify that there is exactly one host synchronization in the pipeline +// execution. The graph executions and memcpy should be batched asynchronously +// with only a single synchronization point at Queue.wait(). +// +// At the L0 level, we should see exactly one zeCommandListHostSynchronize call +// (from Queue.wait()) and zero zeEventHostSynchronize calls in the execution +// region. +// +// CHECK-LABEL: BEGIN_GRAPH_PIPELINE +// CHECK-NOT: zeEventHostSynchronize( +// CHECK-COUNT-1: zeCommandListHostSynchronize( +// CHECK-NOT: zeCommandListHostSynchronize( +// CHECK: END_GRAPH_PIPELINE diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/work_group_scratch_memory.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/work_group_scratch_memory.cpp new file mode 100644 index 0000000000000..0e0bff07e173d --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/work_group_scratch_memory.cpp @@ -0,0 +1,9 @@ +// REQUIRES: level_zero_v2_adapter && arch-intel_gpu_bmg_g21 + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_NATIVE_RECORDING +#include "../../Inputs/work_group_scratch_memory.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/ze_host_task.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/ze_host_task.cpp new file mode 100644 index 0000000000000..7e8fa31ffc86e --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/ze_host_task.cpp @@ -0,0 +1,91 @@ +// REQUIRES: level_zero_v2_adapter && arch-intel_gpu_bmg_g21 +// REQUIRES: level_zero_dev_kit + +// RUN: %{build} %level_zero_options -o %t.out +// RUN: %{run} %t.out +// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#include "../../graph_common.hpp" +#include "../../ze_common.hpp" + +#include +#include + +constexpr size_t N = 1024; + +void ZE_APICALL HostFunction(void *UserData) { + uint32_t *Data = static_cast(UserData); + for (size_t i = 0; i < N; i++) { + Data[i] = Data[i] * 3; + } +} + +int main() { + queue Queue{property::queue::in_order{}}; + + const sycl::context Context = Queue.get_context(); + const sycl::device Device = Queue.get_device(); + + ze_driver_handle_t ZeDriver; + ASSERT_ZE_RESULT_SUCCESS(getDriver(ZeDriver)); + + zeCommandListAppendHostFunction_fn zeCommandListAppendHostFunction = nullptr; + ASSERT_ZE_RESULT_SUCCESS( + loadZeExtensionFunction(ZeDriver, "zeCommandListAppendHostFunction", + zeCommandListAppendHostFunction)); + + // Allocate shared memory (accessible from both device and host) + uint32_t *DataShared = malloc_shared(N, Queue); + + // Get the command list before recording starts + ze_command_list_handle_t ZeCommandList; + bool success = getCommandListFromQueue(Queue, ZeCommandList); + assert(success); + + exp_ext::command_graph Graph{ + Context, Device, {exp_ext::property::graph::enable_native_recording{}}}; + + CommandListStateVerifier verifier(ZeCommandList); + verifier.verify(EXECUTING); + + Graph.begin_recording(Queue); + verifier.verify(RECORDING); + + // 1. Record SYCL kernel - initialize data + Queue.submit([&](handler &CGH) { + CGH.parallel_for(range<1>{N}, + [=](id<1> idx) { DataShared[idx] = idx + 10; }); + }); + + // 2. Record SYCL kernel - multiply by 2 + Queue.submit([&](handler &CGH) { + CGH.parallel_for(range<1>{N}, + [=](id<1> idx) { DataShared[idx] = DataShared[idx] * 2; }); + }); + + // 3. Record L0 host function directly to the recording command list + // The host function will run on the host but is part of the command list, + // so it will execute after the kernels complete + + ASSERT_ZE_RESULT_SUCCESS(zeCommandListAppendHostFunction( + ZeCommandList, reinterpret_cast(HostFunction), + static_cast(DataShared), nullptr, nullptr, 0, nullptr)); + + Graph.end_recording(Queue); + verifier.verify(EXECUTING); + + auto ExecutableGraph = Graph.finalize(); + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecutableGraph); }); + Queue.wait(); + + // Verify results + // SYCL kernel: Data = (i + 10) * 2 + // Host function: Data = (i + 10) * 2 * 3 + for (size_t i = 0; i < N; i++) { + uint32_t Expected = (i + 10) * 2 * 3; + assert(check_value(i, Expected, DataShared[i], "DataShared")); + } + + free(DataShared, Queue); + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/ze_launch_kernel_with_args.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/ze_launch_kernel_with_args.cpp new file mode 100644 index 0000000000000..068b9b6aaa4ee --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/ze_launch_kernel_with_args.cpp @@ -0,0 +1,108 @@ +// REQUIRES: level_zero_v2_adapter && arch-intel_gpu_bmg_g21 +// REQUIRES: level_zero_dev_kit + +// RUN: %{build} %level_zero_options -o %t.out +// RUN: %{run} %t.out %S/../../Inputs/Kernels/saxpy.spv +// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out %S/../../Inputs/Kernels/saxpy.spv 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// Test native recording with intermixed SYCL and Level-Zero kernels. + +#include "../../graph_common.hpp" +#include "../../ze_common.hpp" + +#include +#include + +int main(int, char **argv) { + queue Queue{property::queue::in_order{}}; + + const sycl::context Context = Queue.get_context(); + const sycl::device Device = Queue.get_device(); + + const size_t N = 1024; + uint32_t *DataX = malloc_device(N, Queue); + uint32_t *DataZ = malloc_device(N, Queue); + + std::vector HostX(N); + std::vector HostZ(N); + for (size_t i = 0; i < N; i++) { + HostX[i] = i + 10; + HostZ[i] = i + 1; + } + + Queue.memcpy(DataX, HostX.data(), N * sizeof(uint32_t)).wait(); + Queue.memcpy(DataZ, HostZ.data(), N * sizeof(uint32_t)).wait(); + + ZeKernelFactory KernelFactory(Queue); + ze_module_handle_t ZeModule = + KernelFactory.createModule(loadSpirvFromFile(argv[1])); + ze_kernel_handle_t ZeKernel = KernelFactory.createKernel( + ZeModule, "_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E5saxpy"); + + exp_ext::command_graph Graph{ + Context, Device, {exp_ext::property::graph::enable_native_recording{}}}; + + ze_command_list_handle_t ZeCommandList; + bool result = getCommandListFromQueue(Queue, ZeCommandList); + assert(result); + + CommandListStateVerifier verifier(ZeCommandList); + verifier.verify(EXECUTING); + + // Begin recording + Graph.begin_recording(Queue); + verifier.verify(RECORDING); + + // 1. Record SYCL kernel - multiply X by 2 + Queue.submit([&](handler &CGH) { + CGH.parallel_for(range<1>{N}, + [=](id<1> idx) { DataX[idx] = DataX[idx] * 2; }); + }); + + // 2. Record L0 kernel directly to the recording command list + + // Suggest and prepare group size + uint32_t GroupSizeX; + uint32_t GroupSizeY; + uint32_t GroupSizeZ; + + ASSERT_ZE_RESULT_SUCCESS(zeKernelSuggestGroupSize( + ZeKernel, N, 1, 1, &GroupSizeX, &GroupSizeY, &GroupSizeZ)); + + // Prepare kernel arguments using the WithArguments API + // saxpy computes Z = X * 2 + Z + // Arguments are passed as an array of pointers + void *ArgPointers[] = {&DataZ, &DataX}; + + ze_group_count_t ZeGroupCount{static_cast(N) / GroupSizeX, 1, 1}; + ze_group_size_t ZeGroupSize{GroupSizeX, GroupSizeY, GroupSizeZ}; + + ASSERT_ZE_RESULT_SUCCESS(zeCommandListAppendLaunchKernelWithArguments( + ZeCommandList, ZeKernel, ZeGroupCount, ZeGroupSize, ArgPointers, nullptr, + nullptr, 0, nullptr)); + + Graph.end_recording(Queue); + verifier.verify(EXECUTING); + + auto ExecutableGraph = Graph.finalize(); + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecutableGraph); }); + Queue.wait(); + + Queue.memcpy(HostX.data(), DataX, N * sizeof(uint32_t)).wait(); + Queue.memcpy(HostZ.data(), DataZ, N * sizeof(uint32_t)).wait(); + + // Verify results + // SYCL kernel: X = (i + 10) * 2 + // L0 saxpy kernel: Z = X * 2 + Z = (i + 10) * 2 * 2 + (i + 1) + for (size_t i = 0; i < N; i++) { + uint32_t ExpectedX = (i + 10) * 2; + uint32_t ExpectedZ = (i + 10) * 2 * 2 + (i + 1); + assert(check_value(i, ExpectedX, HostX[i], "HostX")); + assert(check_value(i, ExpectedZ, HostZ[i], "HostZ")); + } + + free(DataX, Queue); + free(DataZ, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/ze_mem_ops.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/ze_mem_ops.cpp new file mode 100644 index 0000000000000..866579304315d --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/ze_mem_ops.cpp @@ -0,0 +1,81 @@ +// REQUIRES: level_zero_v2_adapter && arch-intel_gpu_bmg_g21 +// REQUIRES: level_zero_dev_kit + +// RUN: %{build} %level_zero_options -o %t.out +// RUN: %{run} %t.out +// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// Test native recording with Level Zero memory operations. +// Records L0 memset, device-to-device copy, and device-to-host copy +// directly to the recording command list over an in-order queue. + +#include "../../graph_common.hpp" +#include "../../ze_common.hpp" + +#include +#include + +int main() { + queue Queue{property::queue::in_order{}}; + + const sycl::context Context = Queue.get_context(); + const sycl::device Device = Queue.get_device(); + + const size_t N = 1024; + uint32_t *DeviceSrc = malloc_device(N, Queue); + uint32_t *DeviceDst = malloc_device(N, Queue); + uint32_t *HostDst = malloc_host(N, Queue); + + for (size_t i = 0; i < N; i++) { + HostDst[i] = 0; + } + + ze_command_list_handle_t ZeCommandList; + bool success = getCommandListFromQueue(Queue, ZeCommandList); + assert(success); + + exp_ext::command_graph Graph{ + Context, Device, {exp_ext::property::graph::enable_native_recording{}}}; + + CommandListStateVerifier verifier(ZeCommandList); + verifier.verify(EXECUTING); + + Graph.begin_recording(Queue); + + verifier.verify(RECORDING); + + // 1. Level Zero memset - fill DeviceSrc with pattern 0x42 (byte pattern) + uint32_t Pattern = 0x42; + ASSERT_ZE_RESULT_SUCCESS(zeCommandListAppendMemoryFill( + ZeCommandList, DeviceSrc, &Pattern, sizeof(uint32_t), + N * sizeof(uint32_t), nullptr, 0, nullptr)); + + // 2. Level Zero device-to-device copy - copy DeviceSrc to DeviceDst + ASSERT_ZE_RESULT_SUCCESS( + zeCommandListAppendMemoryCopy(ZeCommandList, DeviceDst, DeviceSrc, + N * sizeof(uint32_t), nullptr, 0, nullptr)); + + // 3. Level Zero device-to-host copy - copy DeviceDst to HostDst + ASSERT_ZE_RESULT_SUCCESS( + zeCommandListAppendMemoryCopy(ZeCommandList, HostDst, DeviceDst, + N * sizeof(uint32_t), nullptr, 0, nullptr)); + + Graph.end_recording(Queue); + + verifier.verify(EXECUTING); + + auto ExecutableGraph = Graph.finalize(); + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecutableGraph); }); + Queue.wait(); + + // Verify results on host + for (size_t i = 0; i < N; i++) { + assert(check_value(i, Pattern, HostDst[i], "HostDst")); + } + + free(DeviceSrc, Queue); + free(DeviceDst, Queue); + free(HostDst, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/empty.cpp b/sycl/test-e2e/Graph/RecordReplay/empty.cpp new file mode 100644 index 0000000000000..c73141de90611 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/empty.cpp @@ -0,0 +1,8 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/empty.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/work_group_scratch_memory.cpp b/sycl/test-e2e/Graph/RecordReplay/work_group_scratch_memory.cpp index a3f01010dcb90..ab8fae8f59b0f 100644 --- a/sycl/test-e2e/Graph/RecordReplay/work_group_scratch_memory.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/work_group_scratch_memory.cpp @@ -13,76 +13,4 @@ // XFAIL: hip // XFAIL-TRACKER: https://github.com/intel/llvm/issues/16072 -#include -#include -#include -#include -#include -#include - -using DataType = int; - -namespace sycl_ext = sycl::ext::oneapi::experimental; - -void copy_via_smem(DataType *a, DataType *b, sycl::nd_item<1> it) { - // one work-group copies data to shared memory from A - // And then puts in back into B - - DataType *smem_ptr = - reinterpret_cast(sycl_ext::get_work_group_scratch_memory()); - auto threadIdx_x = it.get_local_linear_id(); - - smem_ptr[threadIdx_x] = a[threadIdx_x]; - sycl::group_barrier(it.get_group()); - - b[threadIdx_x] = smem_ptr[threadIdx_x]; -} - -template struct KernelFunctor { - T m_props; - DataType *m_a; - DataType *m_b; - KernelFunctor(T props, DataType *a, DataType *b) - : m_props(props), m_a(a), m_b(b) {} - - void operator()(sycl::nd_item<1> it) const { copy_via_smem(m_a, m_b, it); } - auto get(sycl_ext::properties_tag) const { return m_props; } -}; - -int main() { - sycl::queue queue{sycl::property::queue::in_order()}; - sycl::ext::oneapi::experimental::command_graph graph(queue.get_device()); - - auto size = std::min( - queue.get_device().get_info(), - size_t{1024}); - - DataType *a = sycl::malloc_device(size, queue); - DataType *b = sycl::malloc_device(size, queue); - std::vector a_host(size, 1.0); - std::vector b_host(size, -5.0); - - graph.begin_recording(queue); - queue.copy(a_host.data(), a, size); - - queue.submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl::nd_range<1>({size}, {size}), - KernelFunctor(sycl_ext::properties{sycl_ext::work_group_scratch_size( - size * sizeof(DataType))}, - a, b)); - }); - - queue.copy(b, b_host.data(), size); - graph.end_recording(); - auto exec_graph = graph.finalize(); - - queue.submit([&](sycl::handler &cgh) { cgh.ext_oneapi_graph(exec_graph); }) - .wait_and_throw(); - - for (size_t i = 0; i < b_host.size(); i++) { - assert(b_host[i] == a_host[i]); - } - sycl::free(a, queue); - sycl::free(b, queue); -} +#include "../Inputs/work_group_scratch_memory.cpp" diff --git a/sycl/test-e2e/Graph/graph_common.hpp b/sycl/test-e2e/Graph/graph_common.hpp index c99782732ec21..fb6da631bdc4e 100644 --- a/sycl/test-e2e/Graph/graph_common.hpp +++ b/sycl/test-e2e/Graph/graph_common.hpp @@ -6,10 +6,12 @@ #include +#include #include // std::conditional_variable #include #include // std::mutex, std::unique_lock #include +#include #if GRAPH_TESTS_VERBOSE_PRINT #include @@ -25,6 +27,12 @@ namespace exp_ext = sycl::ext::oneapi::experimental; // Make tests less verbose by using sycl namespace. using namespace sycl; +// Queue state constants for recording tests +inline constexpr exp_ext::queue_state RECORDING = + exp_ext::queue_state::recording; +inline constexpr exp_ext::queue_state EXECUTING = + exp_ext::queue_state::executing; + // Helper functions for wrapping depends_on calls when add_node is used so they // are not used in the explicit API template inline void depends_on_helper(sycl::handler &CGH, T Dep) { @@ -442,6 +450,42 @@ class Barrier { std::size_t threadNum; }; +// Verify recording states of one or more queues +template class QueueStateVerifier { + std::array queues; + +public: + template + QueueStateVerifier(Queues... qs) : queues{qs...} {} + + template void verify(States... expected_states) { + verifyImpl(std::index_sequence_for{}, expected_states...); + } + +private: + template + void verifyImpl(std::index_sequence, States... expected_states) { + (checkQueue(Is, queues[Is], expected_states), ...); + } + + void checkQueue(size_t index, queue q, exp_ext::queue_state expected) { + auto actual = q.ext_oneapi_get_state(); + if (actual != expected) { + std::cerr << "Queue " << index << " SYCL state mismatch: expected " + << stateToString(expected) << " but got " + << stateToString(actual) << std::endl; + assert(false); + } + } + + const char *stateToString(exp_ext::queue_state state) { + return state == exp_ext::queue_state::recording ? "recording" : "executing"; + } +}; + +template +QueueStateVerifier(Queues...) -> QueueStateVerifier; + template bool inline check_value(const T &Ref, const T &Got, const std::string &VariableName) { @@ -545,3 +589,25 @@ bool compareProfiling(event Event1, event Event2) { return (Pass1 && Pass2); } + +/// Helper to test that a callable throws a sycl::exception with the given +/// error code. Returns true if the exception was thrown with the expected +/// code, false otherwise. +template +bool expectException(Func &&Operation, const char *OperationName, + sycl::errc ExpectedCode) { + try { + Operation(); + std::cerr << "ERROR: Expected exception was not thrown for " + << OperationName << std::endl; + return false; + } catch (const sycl::exception &e) { + if (e.code() != sycl::make_error_code(ExpectedCode)) { + std::cerr << "ERROR: Wrong exception error code for " << OperationName + << ": expected " << sycl::make_error_code(ExpectedCode).message() + << ", got " << e.code().message() << std::endl; + return false; + } + return true; + } +} diff --git a/sycl/test-e2e/Graph/ze_common.hpp b/sycl/test-e2e/Graph/ze_common.hpp new file mode 100644 index 0000000000000..9bd1c87cd6699 --- /dev/null +++ b/sycl/test-e2e/Graph/ze_common.hpp @@ -0,0 +1,208 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ASSERT_ZE_RESULT_SUCCESS(status) assert((status) == ZE_RESULT_SUCCESS); + +// These are currently defined in experimental graph headers. If they are aren't +// defined, then define them manually. +#ifndef ZE_RESULT_QUERY_TRUE +#define ZE_RESULT_QUERY_TRUE ((ze_result_t)0x7fff0000) +#endif +#ifndef ZE_RESULT_QUERY_FALSE +#define ZE_RESULT_QUERY_FALSE ((ze_result_t)0x7fff0001) +#endif + +inline ze_result_t getDriver(ze_driver_handle_t &ZeDriver) { + uint32_t DriverCount = 0; + ze_result_t status = zeDriverGet(&DriverCount, nullptr); + if (status != ZE_RESULT_SUCCESS) { + return status; + } + + if (DriverCount == 0) { + std::cout << "No Level Zero drivers found" << std::endl; + return ZE_RESULT_ERROR_DEVICE_LOST; + } + + std::vector Drivers(DriverCount); + status = zeDriverGet(&DriverCount, Drivers.data()); + ZeDriver = Drivers[0]; + return status; +} + +inline std::vector loadSpirvFromFile(const std::string &FileName) { + std::ifstream SpvStream(FileName, std::ios::binary); + SpvStream.seekg(0, std::ios::end); + size_t sz = SpvStream.tellg(); + SpvStream.seekg(0); + std::vector Spv(sz); + SpvStream.read(reinterpret_cast(Spv.data()), sz); + return Spv; +} + +inline bool getCommandListFromQueue(sycl::queue &Queue, + ze_command_list_handle_t &ZeCommandList) { + using namespace sycl; + auto ZeQueueNative = get_native(Queue); + + if (!std::holds_alternative(ZeQueueNative)) { + return false; + } + + ZeCommandList = std::get(ZeQueueNative); + return true; +} + +typedef ze_result_t(ZE_APICALL *zeCommandListAppendHostFunction_fn)( + ze_command_list_handle_t, void *, void *, void *, ze_event_handle_t, + uint32_t, ze_event_handle_t *); + +typedef ze_result_t(ZE_APICALL *zeCommandListIsGraphCaptureEnabledExp_fn)( + ze_command_list_handle_t); + +template +inline ze_result_t loadZeExtensionFunction(ze_driver_handle_t ZeDriver, + const char *FunctionName, + FunctionPtr &Fn) { + ze_result_t status = zeDriverGetExtensionFunctionAddress( + ZeDriver, FunctionName, reinterpret_cast(&Fn)); + return status; +} + +// Factory for creating and managing Level Zero kernels and modules +// All resources are associated with the same context and device +// The factory stores a copy of the SYCL queue to ensure the underlying +// Level Zero context and device remain valid for the factory's lifetime +class ZeKernelFactory { +public: + explicit ZeKernelFactory(sycl::queue Queue) + : Queue(Queue), + Context(sycl::get_native( + Queue.get_context())), + Device(sycl::get_native( + Queue.get_device())) {} + + ~ZeKernelFactory() { cleanup(); } + + ZeKernelFactory() = delete; + ZeKernelFactory(const ZeKernelFactory &) = delete; + ZeKernelFactory &operator=(const ZeKernelFactory &) = delete; + ZeKernelFactory(ZeKernelFactory &&) noexcept = default; + ZeKernelFactory &operator=(ZeKernelFactory &&) noexcept = default; + + ze_module_handle_t createModule(const std::vector &Spirv) { + ze_module_desc_t moduleDesc = {ZE_STRUCTURE_TYPE_MODULE_DESC, + nullptr, + ZE_MODULE_FORMAT_IL_SPIRV, + Spirv.size(), + Spirv.data(), + nullptr, + nullptr}; + ze_module_handle_t module; + ze_result_t status = + zeModuleCreate(Context, Device, &moduleDesc, &module, nullptr); + ASSERT_ZE_RESULT_SUCCESS(status); + Modules.push_back(module); + return module; + } + + ze_kernel_handle_t createKernel(ze_module_handle_t Module, + const char *KernelName) { + ze_kernel_desc_t kernelDesc = {ZE_STRUCTURE_TYPE_KERNEL_DESC, nullptr, 0, + KernelName}; + ze_kernel_handle_t kernel; + ze_result_t status = zeKernelCreate(Module, &kernelDesc, &kernel); + ASSERT_ZE_RESULT_SUCCESS(status); + Kernels.push_back(kernel); + return kernel; + } + + void cleanup() { + // Destroy kernels first (they depend on modules) + for (auto kernel : Kernels) { + ASSERT_ZE_RESULT_SUCCESS(zeKernelDestroy(kernel)); + } + Kernels.clear(); + + // Then destroy modules + for (auto module : Modules) { + ASSERT_ZE_RESULT_SUCCESS(zeModuleDestroy(module)); + } + Modules.clear(); + } + +private: + sycl::queue Queue; // Ensures context and device lifetime + ze_context_handle_t Context; + ze_device_handle_t Device; + std::vector Modules; + std::vector Kernels; +}; + +// Verify recording states of one or more command lists +template class CommandListStateVerifier { + std::array commandLists; + zeCommandListIsGraphCaptureEnabledExp_fn pfnIsGraphCaptureEnabled = nullptr; + +public: + template + CommandListStateVerifier(CommandLists... cmdLists) + : commandLists{cmdLists...} { + loadGraphIsCapturingExtension(); + } + + template void verify(States... expected_states) { + verifyImpl(std::index_sequence_for{}, expected_states...); + } + +private: + void loadGraphIsCapturingExtension() { + ze_driver_handle_t driver; + ASSERT_ZE_RESULT_SUCCESS(getDriver(driver)); + ASSERT_ZE_RESULT_SUCCESS( + loadZeExtensionFunction(driver, "zeCommandListIsGraphCaptureEnabledExp", + pfnIsGraphCaptureEnabled)); + } + + template + void verifyImpl(std::index_sequence, States... expected_states) { + (checkCommandList(Is, commandLists[Is], expected_states), ...); + } + + void checkCommandList(size_t index, ze_command_list_handle_t cmdList, + exp_ext::queue_state expected) { + exp_ext::queue_state actual = getCommandListState(cmdList); + + if (actual != expected) { + std::cerr << "CommandList " << index << " L0 state mismatch: expected " + << stateToString(expected) << " but got " + << stateToString(actual) << std::endl; + assert(false); + } + } + + exp_ext::queue_state + getCommandListState(ze_command_list_handle_t cmdList) const { + ze_result_t captureStatus = pfnIsGraphCaptureEnabled(cmdList); + return (captureStatus == ZE_RESULT_QUERY_TRUE) + ? exp_ext::queue_state::recording + : exp_ext::queue_state::executing; + } + + const char *stateToString(exp_ext::queue_state state) { + return state == exp_ext::queue_state::recording ? "recording" : "executing"; + } +}; + +template +CommandListStateVerifier(CommandLists...) + -> CommandListStateVerifier; diff --git a/sycl/test/basic_tests/property_traits.cpp b/sycl/test/basic_tests/property_traits.cpp index 808ba179b36f1..5d276cef62001 100644 --- a/sycl/test/basic_tests/property_traits.cpp +++ b/sycl/test/basic_tests/property_traits.cpp @@ -82,6 +82,8 @@ int main() { CHECK_IS_PROPERTY(ext::oneapi::experimental::property::graph::updatable); CHECK_IS_PROPERTY( ext::oneapi::experimental::property::graph::enable_profiling); + CHECK_IS_PROPERTY( + ext::oneapi::experimental::property::graph::enable_native_recording); // Node is_property CHECK_IS_PROPERTY( @@ -149,6 +151,10 @@ int main() { ext::oneapi::experimental::property::graph::enable_profiling, ext::oneapi::experimental::command_graph< ext::oneapi::experimental::graph_state::modifiable>); + CHECK_IS_PROPERTY_OF( + ext::oneapi::experimental::property::graph::enable_native_recording, + ext::oneapi::experimental::command_graph< + ext::oneapi::experimental::graph_state::modifiable>); // Node is_property_of CHECK_IS_PROPERTY_OF( @@ -193,6 +199,9 @@ int main() { CHECK_IS_NOT_PROPERTY_OF( ext::oneapi::experimental::property::graph::enable_profiling, NotASYCLObject); + CHECK_IS_NOT_PROPERTY_OF( + ext::oneapi::experimental::property::graph::enable_native_recording, + NotASYCLObject); CHECK_IS_NOT_PROPERTY_OF( ext::oneapi::experimental::property::node::depends_on_all_leaves, @@ -252,6 +261,8 @@ int main() { CHECK_IS_PROPERTY_V(ext::oneapi::experimental::property::graph::updatable); CHECK_IS_PROPERTY_V( ext::oneapi::experimental::property::graph::enable_profiling); + CHECK_IS_PROPERTY_V( + ext::oneapi::experimental::property::graph::enable_native_recording); // Node is_property_v CHECK_IS_PROPERTY_V( @@ -340,6 +351,10 @@ int main() { ext::oneapi::experimental::property::graph::enable_profiling, ext::oneapi::experimental::command_graph< ext::oneapi::experimental::graph_state::modifiable>); + CHECK_IS_PROPERTY_OF_V( + ext::oneapi::experimental::property::graph::enable_native_recording, + ext::oneapi::experimental::command_graph< + ext::oneapi::experimental::graph_state::modifiable>); // Node is_property_of_v CHECK_IS_PROPERTY_OF_V( @@ -384,6 +399,9 @@ int main() { CHECK_IS_NOT_PROPERTY_OF_V( ext::oneapi::experimental::property::graph::enable_profiling, NotASYCLObject); + CHECK_IS_NOT_PROPERTY_OF_V( + ext::oneapi::experimental::property::graph::enable_native_recording, + NotASYCLObject); CHECK_IS_NOT_PROPERTY_OF_V( ext::oneapi::experimental::property::node::depends_on_all_leaves, diff --git a/sycl/tools/abi_check.py b/sycl/tools/abi_check.py index 2b285e8917c98..9df8658619c67 100644 --- a/sycl/tools/abi_check.py +++ b/sycl/tools/abi_check.py @@ -205,7 +205,11 @@ def main(): if args.output is None: print("Please specify --output option. Quiting.") sys.exit(-2) - dump_symbols(args.target_library, args.output) + try: + dump_symbols(args.target_library, args.output) + except FileNotFoundError as e: + print(f"Could not find file named: {e.filename}") + sys.exit(-2) if __name__ == "__main__": diff --git a/unified-runtime/source/adapters/level_zero/v2/event_provider_counter.cpp b/unified-runtime/source/adapters/level_zero/v2/event_provider_counter.cpp index 35de3c0efef94..ebff77b6248a9 100644 --- a/unified-runtime/source/adapters/level_zero/v2/event_provider_counter.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/event_provider_counter.cpp @@ -52,9 +52,9 @@ static zex_counter_based_event_exp_flags_t createZeFlags(queue_type queueType, if (queueType == QUEUE_IMMEDIATE) { zeFlags |= ZEX_COUNTER_BASED_EVENT_FLAG_IMMEDIATE; - } else { - zeFlags |= ZEX_COUNTER_BASED_EVENT_FLAG_NON_IMMEDIATE; } + // Always set non immediate flag for compatibility with graph record & replay + zeFlags |= ZEX_COUNTER_BASED_EVENT_FLAG_NON_IMMEDIATE; return zeFlags; }