diff --git a/sycl/include/sycl/detail/property_helper.hpp b/sycl/include/sycl/detail/property_helper.hpp
index dc10a5f079d53..12837c482fbbd 100644
--- a/sycl/include/sycl/detail/property_helper.hpp
+++ b/sycl/include/sycl/detail/property_helper.hpp
@@ -44,8 +44,9 @@ enum DataLessPropKind {
   GraphDependOnAllLeaves = 19,
   GraphUpdatable = 20,
   GraphEnableProfiling = 21,
+  GraphEnableNativeRecording = 22,
   // Indicates the last known dataless property.
-  LastKnownDataLessPropKind = 21,
+  LastKnownDataLessPropKind = 22,
   // Exceeding 32 may cause ABI breaking change on some of OSes.
   DataLessPropKindSize = 32
 };
diff --git a/sycl/include/sycl/ext/oneapi/experimental/detail/properties/graph_properties.def b/sycl/include/sycl/ext/oneapi/experimental/detail/properties/graph_properties.def
index ce177a9bb0a16..451ddbb834be4 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/detail/properties/graph_properties.def
+++ b/sycl/include/sycl/ext/oneapi/experimental/detail/properties/graph_properties.def
@@ -22,5 +22,9 @@ __SYCL_DATA_LESS_PROP(property::graph, updatable, GraphUpdatable)
 /// events returned by submissions of the executable graph
 __SYCL_DATA_LESS_PROP(property::graph, enable_profiling, GraphEnableProfiling)
 
+/// Property passed to command_graph constructor to enable native recording
+/// capability for improved performance when supported by the backend.
+__SYCL_DATA_LESS_PROP(property::graph, enable_native_recording, GraphEnableNativeRecording)
+
 #undef __SYCL_DATA_LESS_PROP
 #undef __SYCL_MANUALLY_DEFINED_PROP
diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph/modifiable_graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph/modifiable_graph.hpp
index 089eeb3c7cf59..f01967af0e6ad 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/graph/modifiable_graph.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/graph/modifiable_graph.hpp
@@ -159,7 +159,7 @@ class __SYCL_EXPORT modifiable_command_graph
   /// Get a list of all root nodes (nodes without dependencies) in this graph.
   std::vector<node> get_root_nodes() const;
 
-  /// Query whether the graph contains no nodes.
+  /// Returns true if the graph contains no nodes.
   bool empty() const;
 
   /// Common Reference Semantics
diff --git a/sycl/source/detail/async_alloc.cpp b/sycl/source/detail/async_alloc.cpp
index c67666d74f44d..4316a5d6b658c 100644
--- a/sycl/source/detail/async_alloc.cpp
+++ b/sycl/source/detail/async_alloc.cpp
@@ -67,6 +67,14 @@ void *async_malloc(sycl::handler &h, sycl::usm::alloc kind, size_t size) {
         sycl::make_error_code(sycl::errc::feature_not_supported),
         "Only device backed asynchronous allocations are supported!");
 
+  // Allocations not supported in graph native recording mode
+  if (auto *Queue = h.impl->get_queue_or_null();
+      Queue && Queue->isNativeRecording()) {
+    throw sycl::exception(
+        sycl::make_error_code(sycl::errc::invalid),
+        "async_malloc is not supported in native recording mode.");
+  }
+
   detail::adapter_impl &Adapter = h.getContextImpl().getAdapter();
 
   // Get CG event dependencies for this allocation.
@@ -117,6 +125,14 @@ __SYCL_EXPORT void *async_malloc(const sycl::queue &q, sycl::usm::alloc kind,
 __SYCL_EXPORT void *async_malloc_from_pool(sycl::handler &h, size_t size,
                                            const memory_pool &pool) {
 
+  // Allocations not supported in graph native recording mode
+  if (auto *Queue = h.impl->get_queue_or_null();
+      Queue && Queue->isNativeRecording()) {
+    throw sycl::exception(
+        sycl::make_error_code(sycl::errc::invalid),
+        "async_malloc is not supported in native recording mode.");
+  }
+
   detail::adapter_impl &Adapter = h.getContextImpl().getAdapter();
   detail::memory_pool_impl &memPoolImpl = *detail::getSyclObjImpl(pool);
 
@@ -181,6 +197,14 @@ __SYCL_EXPORT void async_free(sycl::handler &h, void *ptr) {
     }
   }
 
+  // Not supported in graph native recording mode
+  if (auto *Queue = h.impl->get_queue_or_null();
+      Queue && Queue->isNativeRecording()) {
+    throw sycl::exception(
+        sycl::make_error_code(sycl::errc::invalid),
+        "async_free is not supported in native recording mode.");
+  }
+
   h.impl->MFreePtr = ptr;
   h.setType(detail::CGType::AsyncFree);
 }
diff --git a/sycl/source/detail/graph/graph_impl.cpp b/sycl/source/detail/graph/graph_impl.cpp
index 5ff579d1fca24..9e6694050ba68 100644
--- a/sycl/source/detail/graph/graph_impl.cpp
+++ b/sycl/source/detail/graph/graph_impl.cpp
@@ -19,6 +19,7 @@
 #include <detail/program_manager/program_manager.hpp> // ProgramManager
 #include <detail/queue_impl.hpp>                      // for queue_impl
 #include <detail/sycl_mem_obj_t.hpp>                  // for SYCLMemObjT
+#include <detail/ur.hpp>                              // for UR APIs
 #include <stack>                                      // for stack
 #include <sycl/detail/common.hpp>      // for tls_code_loc_t etc..
 #include <sycl/detail/kernel_desc.hpp> // for kernel_param_kind_t
@@ -313,6 +314,40 @@ graph_impl::graph_impl(const sycl::context &SyclContext,
   if (PropList.has_property<property::graph::assume_buffer_outlives_graph>()) {
     MAllowBuffers = true;
   }
+  if (PropList.has_property<property::graph::enable_native_recording>()) {
+    // Create native UR graph when native recording is enabled
+    // Note: Native recording only works with immediate command lists,
+    // this is validated when recording begins
+    context_impl &ContextImpl = *sycl::detail::getSyclObjImpl(MContext);
+    sycl::detail::adapter_impl &Adapter = ContextImpl.getAdapter();
+
+    // Check if the device supports graph record and replay
+    sycl::detail::device_impl &DeviceImpl =
+        *sycl::detail::getSyclObjImpl(MDevice);
+
+    ur_bool_t SupportsGraphRecordReplay = false;
+    ur_result_t Result =
+        Adapter.call_nocheck<sycl::detail::UrApiKind::urDeviceGetInfo>(
+            DeviceImpl.getHandleRef(),
+            UR_DEVICE_INFO_GRAPH_RECORD_AND_REPLAY_SUPPORT_EXP,
+            sizeof(ur_bool_t), &SupportsGraphRecordReplay, nullptr);
+    if (Result != UR_RESULT_SUCCESS || !SupportsGraphRecordReplay) {
+      throw sycl::exception(
+          sycl::make_error_code(errc::invalid),
+          "Device does not support graph record and replay feature "
+          "(UR_DEVICE_INFO_GRAPH_RECORD_AND_REPLAY_SUPPORT_EXP).");
+    }
+
+    Result = Adapter.call_nocheck<sycl::detail::UrApiKind::urGraphCreateExp>(
+        ContextImpl.getHandleRef(), &MNativeGraphHandle);
+    if (Result != UR_RESULT_SUCCESS) {
+      throw sycl::exception(sycl::make_error_code(errc::runtime),
+                            "Failed to create native UR graph");
+    }
+    assert(MNativeGraphHandle != nullptr &&
+           "Native UR graph handle should not be null if graph creation "
+           "succeeded");
+  }
 
   if (!SyclDevice.has(aspect::ext_oneapi_limited_graph) &&
       !SyclDevice.has(aspect::ext_oneapi_graph)) {
@@ -331,6 +366,20 @@ graph_impl::~graph_impl() {
     for (auto &MemObj : MMemObjs) {
       MemObj->markNoLongerBeingUsedInGraph();
     }
+    // Clean up native UR graph if it was created
+    if (MNativeGraphHandle) {
+      context_impl &ContextImpl = *sycl::detail::getSyclObjImpl(MContext);
+      sycl::detail::adapter_impl &Adapter = ContextImpl.getAdapter();
+
+      ur_result_t Result =
+          Adapter.call_nocheck<sycl::detail::UrApiKind::urGraphDestroyExp>(
+              MNativeGraphHandle);
+      if (Result != UR_RESULT_SUCCESS) {
+        throw sycl::exception(sycl::make_error_code(errc::runtime),
+                              "Failed to destroy native UR graph");
+      }
+      MNativeGraphHandle = nullptr;
+    }
   } catch (std::exception &e) {
     __SYCL_REPORT_EXCEPTION_TO_STREAM("exception in ~graph_impl", e);
   }
@@ -403,6 +452,14 @@ void graph_impl::markCGMemObjs(
 }
 
 node_impl &graph_impl::add(nodes_range Deps) {
+  // Native recording limitation: explicit API not supported
+  if (MNativeGraphHandle) {
+    throw sycl::exception(
+        make_error_code(errc::feature_not_supported),
+        "graph.add(): The explicit graph API is not supported in native "
+        "recording mode. Use the record-and-replay API instead.");
+  }
+
   node_impl &NodeImpl = createNode();
 
   addDepsToNode(NodeImpl, Deps);
@@ -415,6 +472,14 @@ node_impl &graph_impl::add(nodes_range Deps) {
 node_impl &graph_impl::add(std::function<void(handler &)> CGF,
                            const std::vector<sycl::detail::ArgDesc> &Args,
                            nodes_range Deps) {
+  // Native recording limitation: explicit API not supported
+  if (MNativeGraphHandle) {
+    throw sycl::exception(
+        make_error_code(errc::feature_not_supported),
+        "graph.add(): The explicit graph API is not supported in native "
+        "recording mode. Use the record-and-replay API instead.");
+  }
+
   (void)Args;
   detail::handler_impl HandlerImpl{*this};
   sycl::handler Handler{HandlerImpl};
@@ -485,6 +550,13 @@ node_impl &graph_impl::add(std::function<void(handler &)> CGF,
 node_impl &graph_impl::add(node_type NodeType,
                            std::shared_ptr<sycl::detail::CG> CommandGroup,
                            nodes_range Deps) {
+  // Native recording limitation: explicit API not supported
+  if (MNativeGraphHandle) {
+    throw sycl::exception(
+        make_error_code(errc::feature_not_supported),
+        "graph.add(): The explicit graph API is not supported in native "
+        "recording mode. Use the record-and-replay API instead.");
+  }
 
   // A unique set of dependencies obtained by checking requirements and events
   std::set<node_impl *> UniqueDeps = getCGEdges(CommandGroup);
@@ -512,6 +584,14 @@ node_impl &graph_impl::add(node_type NodeType,
 node_impl &
 graph_impl::add(std::shared_ptr<dynamic_command_group_impl> &DynCGImpl,
                 nodes_range Deps) {
+  // Native recording limitation: explicit API not supported
+  if (MNativeGraphHandle) {
+    throw sycl::exception(
+        make_error_code(errc::feature_not_supported),
+        "graph.add(): The explicit graph API is not supported in native "
+        "recording mode. Use the record-and-replay API instead.");
+  }
+
   // Set of Dependent nodes based on CG event and accessor dependencies.
   std::set<node_impl *> DynCGDeps = getCGEdges(DynCGImpl->MCommandGroups[0]);
   for (unsigned i = 1; i < DynCGImpl->getNumCGs(); i++) {
@@ -559,6 +639,11 @@ void graph_impl::removeQueue(sycl::detail::queue_impl &RecordingQueue) {
   MRecordingQueues.erase(RecordingQueue.weak_from_this());
 }
 
+bool graph_impl::isQueueRecording(sycl::detail::queue_impl &Queue) {
+
+  return MRecordingQueues.count(Queue.weak_from_this()) > 0;
+}
+
 void graph_impl::clearQueues(bool NeedsLock) {
   graph_impl::RecQueuesStorage SwappedQueues;
   {
@@ -571,12 +656,44 @@ void graph_impl::clearQueues(bool NeedsLock) {
 
   for (auto &Queue : SwappedQueues) {
     if (auto ValidQueue = Queue.lock(); ValidQueue) {
-      ValidQueue->setCommandGraph(nullptr);
+      if (MNativeGraphHandle) {
+        // End native UR graph capture
+        auto UrQueue = ValidQueue->getHandleRef();
+        ur_exp_graph_handle_t CapturedGraph = nullptr;
+        context_impl &ContextImpl = *sycl::detail::getSyclObjImpl(MContext);
+        sycl::detail::adapter_impl &Adapter = ContextImpl.getAdapter();
+        ur_result_t Result = Adapter.call_nocheck<
+            sycl::detail::UrApiKind::urQueueEndGraphCaptureExp>(UrQueue,
+                                                                &CapturedGraph);
+        if (Result != UR_RESULT_SUCCESS) {
+          throw sycl::exception(sycl::make_error_code(errc::runtime),
+                                "Failed to end native graph capture");
+        }
+        // CapturedGraph should be the same as MNativeGraphHandle
+      } else {
+        // Only call setCommandGraph for traditional recording
+        ValidQueue->setCommandGraph(nullptr);
+      }
     }
   }
 }
 
-bool graph_impl::empty() const { return MNodeStorage.empty(); }
+bool graph_impl::empty() const {
+
+  if (!MNativeGraphHandle) {
+    return MNodeStorage.empty();
+  }
+
+  bool IsEmptyResult = true;
+  if (getSyclObjImpl(MContext)
+          ->getAdapter()
+          .call_nocheck<UrApiKind::urGraphIsEmptyExp>(
+              MNativeGraphHandle, &IsEmptyResult) != UR_RESULT_SUCCESS) {
+    throw sycl::exception(sycl::make_error_code(errc::runtime),
+                          "Failed to check if graph is empty");
+  }
+  return IsEmptyResult;
+}
 
 bool graph_impl::checkForCycles() {
   std::list<node_impl *> SortedNodes;
@@ -691,20 +808,61 @@ std::vector<sycl::detail::EventImplPtr> graph_impl::getExitNodesEvents(
   return Events;
 }
 
-void graph_impl::beginRecordingUnlockedQueue(sycl::detail::queue_impl &Queue) {
+void graph_impl::beginRecordingImpl(sycl::detail::queue_impl &Queue,
+                                    bool AcquireQueueLock) {
   graph_impl::WriteLock Lock(MMutex);
+
+  // Native recording limitation: single queue at a time
+  if (MNativeGraphHandle && !MRecordingQueues.empty()) {
+    throw sycl::exception(make_error_code(errc::feature_not_supported),
+                          "Recording the same graph to multiple queues is not "
+                          "supported in native mode");
+  }
+
+  // Native recording limitation: in-order queues only
+  if (MNativeGraphHandle && !Queue.isInOrder()) {
+    throw sycl::exception(make_error_code(errc::feature_not_supported),
+                          "Native recording only works with in-order queues");
+  }
+
   if (!Queue.hasCommandGraph()) {
-    Queue.setCommandGraphUnlocked(shared_from_this());
+
+    // Use native UR graph recording if enabled
+    if (MNativeGraphHandle) {
+      auto UrQueue = Queue.getHandleRef();
+      context_impl &ContextImpl = *sycl::detail::getSyclObjImpl(MContext);
+      sycl::detail::adapter_impl &Adapter = ContextImpl.getAdapter();
+
+      if (Queue.isNativeRecording()) {
+        throw sycl::exception(sycl::make_error_code(errc::invalid),
+                              "Queue is already in native graph capture mode");
+      }
+
+      ur_result_t Result = Adapter.call_nocheck<
+          sycl::detail::UrApiKind::urQueueBeginCaptureIntoGraphExp>(
+          UrQueue, MNativeGraphHandle);
+      if (Result != UR_RESULT_SUCCESS) {
+        throw sycl::exception(sycl::make_error_code(errc::runtime),
+                              "Failed to begin native UR graph capture");
+      }
+    } else {
+      // Non-native recording path
+      if (AcquireQueueLock) {
+        Queue.setCommandGraph(shared_from_this());
+      } else {
+        Queue.setCommandGraphUnlocked(shared_from_this());
+      }
+    }
     addQueue(Queue);
   }
 }
 
+void graph_impl::beginRecordingUnlockedQueue(sycl::detail::queue_impl &Queue) {
+  beginRecordingImpl(Queue, /*AcquireQueueLock=*/false);
+}
+
 void graph_impl::beginRecording(sycl::detail::queue_impl &Queue) {
-  graph_impl::WriteLock Lock(MMutex);
-  if (!Queue.hasCommandGraph()) {
-    Queue.setCommandGraph(shared_from_this());
-    addQueue(Queue);
-  }
+  beginRecordingImpl(Queue, /*AcquireQueueLock=*/true);
 }
 
 // Check if nodes do not require enqueueing and if so loop back through
@@ -930,16 +1088,35 @@ exec_graph_impl::exec_graph_impl(sycl::context Context,
                             "Device does not support Command Graph update");
     }
   }
-  // Copy nodes from GraphImpl and merge any subgraph nodes into this graph.
-  duplicateNodes();
 
-  if (auto PlaceholderQueuePtr = GraphImpl->getLastRecordedQueue()) {
-    MQueueImpl = std::move(PlaceholderQueuePtr);
+  // Create native UR executable graph if the modifiable graph uses native
+  // recording
+  if (isNativeRecordingEnabledForGraph(*GraphImpl)) {
+    context_impl &ContextImpl = *sycl::detail::getSyclObjImpl(MContext);
+    sycl::detail::adapter_impl &Adapter = ContextImpl.getAdapter();
+    ur_result_t Result =
+        Adapter
+            .call_nocheck<sycl::detail::UrApiKind::urGraphInstantiateGraphExp>(
+                GraphImpl->getNativeGraphHandle(),
+                &MNativeExecutableGraphHandle);
+    if (Result != UR_RESULT_SUCCESS) {
+      throw sycl::exception(sycl::make_error_code(errc::runtime),
+                            "Failed to instantiate native UR executable graph");
+    }
   } else {
-    MQueueImpl = sycl::detail::queue_impl::create(
-        *sycl::detail::getSyclObjImpl(GraphImpl->getDevice()),
-        *sycl::detail::getSyclObjImpl(Context), sycl::async_handler{},
-        sycl::property_list{});
+    // Copy nodes from GraphImpl and merge any subgraph nodes into this graph.
+    duplicateNodes();
+
+    // A placeholder queue is only required for enqueueNode and update
+    // operations which are only possible with the command buffer path.
+    if (auto PlaceholderQueuePtr = GraphImpl->getLastRecordedQueue()) {
+      MQueueImpl = std::move(PlaceholderQueuePtr);
+    } else {
+      MQueueImpl = sycl::detail::queue_impl::create(
+          *sycl::detail::getSyclObjImpl(GraphImpl->getDevice()),
+          *sycl::detail::getSyclObjImpl(Context), sycl::async_handler{},
+          sycl::property_list{});
+    }
   }
 }
 
@@ -951,6 +1128,16 @@ exec_graph_impl::~exec_graph_impl() {
         sycl::detail::getSyclObjImpl(MContext)->getAdapter();
     MSchedule.clear();
 
+    // Clean up native UR executable graph if it was created
+    if (MNativeExecutableGraphHandle) {
+      ur_result_t Res = Adapter.call_nocheck<
+          sycl::detail::UrApiKind::urGraphExecutableGraphDestroyExp>(
+          MNativeExecutableGraphHandle);
+      if (Res == UR_RESULT_SUCCESS) {
+        MNativeExecutableGraphHandle = nullptr;
+      }
+    }
+
     // Clean up any graph-owned allocations that were allocated
     MGraphImpl->getMemPool().deallocateAndUnmapAll();
 
@@ -1205,12 +1392,60 @@ exec_graph_impl::enqueuePartitions(sycl::detail::queue_impl &Queue,
   return SignalEvent;
 }
 
+EventImplPtr
+exec_graph_impl::enqueueNative(sycl::detail::queue_impl &Queue,
+                               sycl::detail::CG::StorageInitHelper CGData,
+                               bool EventNeeded) {
+  // Create a list containing all the UR event handles in WaitEvents.
+  // WaitEvents is assumed to be safe for scheduler bypass and any
+  // host-task events that it contains can be ignored.
+  auto &WaitEvents = CGData.MEvents;
+  std::vector<ur_event_handle_t> UrEventHandles{};
+  UrEventHandles.reserve(WaitEvents.size());
+  for (auto &SyclWaitEvent : WaitEvents) {
+    if (auto URHandle = SyclWaitEvent->getHandle()) {
+      UrEventHandles.push_back(URHandle);
+    }
+  }
+
+  const size_t UrEnqueueWaitListSize = UrEventHandles.size();
+  ur_event_handle_t *UrEnqueueWaitList =
+      UrEnqueueWaitListSize == 0 ? nullptr : UrEventHandles.data();
+  EventImplPtr NewEvent = nullptr;
+  if (!EventNeeded) {
+    Queue.getAdapter().call<sycl::detail::UrApiKind::urEnqueueGraphExp>(
+        Queue.getHandleRef(), MNativeExecutableGraphHandle,
+        UrEnqueueWaitListSize, UrEnqueueWaitList, nullptr);
+  } else {
+    NewEvent = sycl::detail::event_impl::create_device_event(Queue);
+    NewEvent->setContextImpl(Queue.getContextImpl());
+    NewEvent->setStateIncomplete();
+    NewEvent->setSubmissionTime();
+    ur_event_handle_t UrEvent = nullptr;
+    Queue.getAdapter().call<sycl::detail::UrApiKind::urEnqueueGraphExp>(
+        Queue.getHandleRef(), MNativeExecutableGraphHandle,
+        UrEnqueueWaitListSize, UrEnqueueWaitList, &UrEvent);
+    NewEvent->setHandle(UrEvent);
+    NewEvent->setEventFromSubmittedExecCommandBuffer(true);
+    if (MEnableProfiling) {
+      NewEvent->setProfilingEnabled(MEnableProfiling);
+    }
+  }
+  return NewEvent;
+}
+
 std::pair<EventImplPtr, bool>
 exec_graph_impl::enqueue(sycl::detail::queue_impl &Queue,
                          sycl::detail::CG::StorageInitHelper CGData,
                          bool EventNeeded) {
   WriteLock Lock(MMutex);
+  // Use native recording path if available
+  if (MNativeExecutableGraphHandle) {
+    return {enqueueNative(Queue, std::move(CGData), EventNeeded),
+            /*SkipScheduler=*/true};
+  }
 
+  // Command buffer path
   cleanupExecutionEvents(MSchedulerDependencies);
   CGData.MEvents.insert(CGData.MEvents.end(), MSchedulerDependencies.begin(),
                         MSchedulerDependencies.end());
@@ -1412,6 +1647,11 @@ void exec_graph_impl::duplicateNodes() {
 }
 
 void exec_graph_impl::update(std::shared_ptr<graph_impl> GraphImpl) {
+  if (MNativeExecutableGraphHandle) {
+    throw sycl::exception(
+        sycl::make_error_code(errc::feature_not_supported),
+        "Graph update is not supported in native recording mode");
+  }
 
   if (MDevice != GraphImpl->getDevice()) {
     throw sycl::exception(
@@ -1484,6 +1724,11 @@ void exec_graph_impl::update(node_impl &Node) {
 }
 
 void exec_graph_impl::update(nodes_range Nodes) {
+  if (MNativeExecutableGraphHandle) {
+    throw sycl::exception(
+        sycl::make_error_code(errc::feature_not_supported),
+        "Graph update is not supported in native recording mode");
+  }
   if (!MIsUpdatable) {
     throw sycl::exception(sycl::make_error_code(errc::invalid),
                           "update() cannot be called on a executable graph "
@@ -1974,12 +2219,47 @@ void modifiable_command_graph::end_recording() {
 
 void modifiable_command_graph::end_recording(queue &RecordingQueue) {
   queue_impl &QueueImpl = *sycl::detail::getSyclObjImpl(RecordingQueue);
-  if (QueueImpl.getCommandGraph() == impl) {
-    QueueImpl.setCommandGraph(nullptr);
+
+  // Check if this queue is recording to this graph
+  bool IsRecordingToThisGraph = false;
+
+  if (isNativeRecordingEnabledForGraph(*impl)) {
+    // For native recording, check if queue is in our recording queue list
     graph_impl::WriteLock Lock(impl->MMutex);
-    impl->removeQueue(QueueImpl);
+    IsRecordingToThisGraph = impl->isQueueRecording(QueueImpl);
+
+    if (IsRecordingToThisGraph) {
+      // End native UR graph capture
+      assert(impl->getNativeGraphHandle() &&
+             "Native graph handle must be valid when ending native recording");
+      auto UrQueue = QueueImpl.getHandleRef();
+      ur_exp_graph_handle_t CapturedGraph = nullptr;
+      context_impl &ContextImpl =
+          *sycl::detail::getSyclObjImpl(impl->getContext());
+      sycl::detail::adapter_impl &Adapter = ContextImpl.getAdapter();
+      ur_result_t Result =
+          Adapter
+              .call_nocheck<sycl::detail::UrApiKind::urQueueEndGraphCaptureExp>(
+                  UrQueue, &CapturedGraph);
+      if (Result != UR_RESULT_SUCCESS) {
+        throw sycl::exception(sycl::make_error_code(errc::runtime),
+                              "Failed to end native UR graph capture");
+      }
+      assert(CapturedGraph == impl->getNativeGraphHandle() &&
+             "Captured graph handle must match the graph's native handle");
+      impl->removeQueue(QueueImpl);
+    }
+  } else {
+    // Traditional recording path
+    if (QueueImpl.getCommandGraph() == impl) {
+      QueueImpl.setCommandGraph(nullptr);
+      graph_impl::WriteLock Lock(impl->MMutex);
+      impl->removeQueue(QueueImpl);
+      IsRecordingToThisGraph = true;
+    }
   }
-  if (QueueImpl.hasCommandGraph())
+
+  if (QueueImpl.hasCommandGraph() && !IsRecordingToThisGraph)
     throw sycl::exception(sycl::make_error_code(errc::invalid),
                           "end_recording called for a queue which is recording "
                           "to a different graph.");
@@ -2007,10 +2287,22 @@ void modifiable_command_graph::print_graph(sycl::detail::string_view pathstr,
 
 std::vector<node> modifiable_command_graph::get_nodes() const {
   graph_impl::ReadLock Lock(impl->MMutex);
+  if (impl->getNativeGraphHandle()) {
+    throw sycl::exception(
+        sycl::make_error_code(errc::invalid),
+        "get_nodes() is not supported for graphs created with the "
+        "enable_native_recording property.");
+  }
   return impl->nodes().to<std::vector<node>>();
 }
 std::vector<node> modifiable_command_graph::get_root_nodes() const {
   graph_impl::ReadLock Lock(impl->MMutex);
+  if (impl->getNativeGraphHandle()) {
+    throw sycl::exception(
+        sycl::make_error_code(errc::invalid),
+        "get_root_nodes() is not supported for graphs created with the "
+        "enable_native_recording property.");
+  }
   return impl->roots().to<std::vector<node>>();
 }
 
@@ -2057,18 +2349,22 @@ executable_command_graph::executable_command_graph(
 }
 
 void executable_command_graph::finalizeImpl() {
-  impl->makePartitions();
-
-  // Handle any work required for graph-owned memory allocations
-  impl->finalizeMemoryAllocations();
-
-  auto Device = impl->getGraphImpl()->getDevice();
-  for (auto Partition : impl->getPartitions()) {
-    if (!Partition->MIsHostTask) {
-      impl->createCommandBuffers(Device, Partition);
+  // Partitions and command buffers are not used for native recording and
+  // instantiation is fully performed in the exec_graph_impl constructor.
+  if (!impl->getNativeExecutableGraphHandle()) {
+    impl->makePartitions();
+
+    // Handle any work required for graph-owned memory allocations
+    impl->finalizeMemoryAllocations();
+
+    auto Device = impl->getGraphImpl()->getDevice();
+    for (auto Partition : impl->getPartitions()) {
+      if (!Partition->MIsHostTask) {
+        impl->createCommandBuffers(Device, Partition);
+      }
     }
+    impl->buildRequirements();
   }
-  impl->buildRequirements();
 }
 
 void executable_command_graph::update(
diff --git a/sycl/source/detail/graph/graph_impl.hpp b/sycl/source/detail/graph/graph_impl.hpp
index a6c0110bd4ff4..8151285368fe1 100644
--- a/sycl/source/detail/graph/graph_impl.hpp
+++ b/sycl/source/detail/graph/graph_impl.hpp
@@ -23,6 +23,12 @@
 #include <shared_mutex> // for shared_mutex
 #include <vector>       // for vector
 
+// Forward declare UR types
+struct ur_exp_graph_handle_t_;
+using ur_exp_graph_handle_t = ur_exp_graph_handle_t_ *;
+struct ur_exp_executable_graph_handle_t_;
+using ur_exp_executable_graph_handle_t = ur_exp_executable_graph_handle_t_ *;
+
 // For testing of graph internals
 class GraphImplTest;
 
@@ -308,21 +314,34 @@ class graph_impl : public std::enable_shared_from_this<graph_impl> {
 
   /// Prints the contents of the graph to a text file in DOT format.
   /// @param FilePath Path to the output file.
-  /// @param Verbose If true, print additional information about the nodes such
-  /// as kernel args or memory access where applicable.
+  /// @param Verbose If true (and native recording is not enabled), print
+  /// additional information about the nodes such as kernel args or memory
+  /// access where applicable.
   void printGraphAsDot(const std::string FilePath, bool Verbose) const {
-    /// Vector of nodes visited during the graph printing
-    std::vector<node_impl *> VisitedNodes;
+    if (MNativeGraphHandle) {
+      context_impl &ContextImpl = *sycl::detail::getSyclObjImpl(MContext);
+      sycl::detail::adapter_impl &Adapter = ContextImpl.getAdapter();
+      ur_result_t Result =
+          Adapter.call_nocheck<sycl::detail::UrApiKind::urGraphDumpContentsExp>(
+              MNativeGraphHandle, FilePath.c_str());
+      if (Result != UR_RESULT_SUCCESS) {
+        throw sycl::exception(sycl::make_error_code(errc::runtime),
+                              "Failed to dump native UR graph contents");
+      }
+    } else {
+      /// Vector of nodes visited during the graph printing
+      std::vector<node_impl *> VisitedNodes;
 
-    std::fstream Stream(FilePath, std::ios::out);
-    Stream << "digraph dot {" << std::endl;
+      std::fstream Stream(FilePath, std::ios::out);
+      Stream << "digraph dot {" << std::endl;
 
-    for (node_impl &Node : roots())
-      Node.printDotRecursive(Stream, VisitedNodes, Verbose);
+      for (node_impl &Node : roots())
+        Node.printDotRecursive(Stream, VisitedNodes, Verbose);
 
-    Stream << "}" << std::endl;
+      Stream << "}" << std::endl;
 
-    Stream.close();
+      Stream.close();
+    }
   }
 
   /// Make an edge between two nodes in the graph. Performs some mandatory
@@ -519,7 +538,26 @@ class graph_impl : public std::enable_shared_from_this<graph_impl> {
     }
   }
 
+  /// Get the native UR graph handle for this graph.
+  /// @return Native UR graph handle, or nullptr if native recording is not
+  /// enabled.
+  ur_exp_graph_handle_t getNativeGraphHandle() const {
+    return MNativeGraphHandle;
+  }
+
+  /// Check if a queue is currently recording to this graph.
+  /// @param Queue The queue to check.
+  /// @return True if the queue is recording to this graph, false otherwise.
+  bool isQueueRecording(sycl::detail::queue_impl &Queue);
+
 private:
+  /// Common implementation for beginRecording and beginRecordingUnlockedQueue.
+  /// @param[in] Queue The queue to be recorded from.
+  /// @param[in] AcquireQueueLock Whether to acquire the queue lock when setting
+  /// command graph.
+  void beginRecordingImpl(sycl::detail::queue_impl &Queue,
+                          bool AcquireQueueLock);
+
   template <typename... Ts> node_impl &createNode(Ts &&...Args) {
     MNodeStorage.push_back(
         std::make_shared<node_impl>(std::forward<Ts>(Args)...));
@@ -582,6 +620,14 @@ class graph_impl : public std::enable_shared_from_this<graph_impl> {
   /// presence of the assume_buffer_outlives_graph property.
   bool MAllowBuffers = false;
 
+  /// Native UR graph handle used for native recording mode.
+  ///
+  /// This handle is non-null only when native recording is enabled via the
+  /// enable_native_recording property.
+  ///
+  /// @note Native recording requires immediate command lists.
+  ur_exp_graph_handle_t MNativeGraphHandle = nullptr;
+
   /// Mapping from queues to barrier nodes. For each queue the last barrier
   /// node recorded to the graph from the queue is stored.
   std::map<std::weak_ptr<sycl::detail::queue_impl>, node_impl *,
@@ -600,6 +646,13 @@ class graph_impl : public std::enable_shared_from_this<graph_impl> {
   std::atomic<size_t> MExecGraphCount = 0;
 };
 
+/// Get whether native recording is enabled for this graph.
+/// @param graph The graph_impl to check.
+/// @return True if native recording is enabled, false otherwise.
+inline bool isNativeRecordingEnabledForGraph(graph_impl const &graph) {
+  return graph.getNativeGraphHandle() != nullptr;
+}
+
 /// Class representing the implementation of command_graph<executable>.
 class exec_graph_impl {
 public:
@@ -647,6 +700,16 @@ class exec_graph_impl {
   enqueue(sycl::detail::queue_impl &Queue,
           sycl::detail::CG::StorageInitHelper CGData, bool EventNeeded);
 
+  /// Enqueue a native UR graph (used when native recording is enabled).
+  /// @param Queue Command-queue to schedule execution on.
+  /// @param CGData Command-group data for waitlist event dependencies.
+  /// @param EventNeeded Whether an event signalling the completion of this
+  /// operation needs to be returned.
+  /// @return Returns an event if requested and nullptr otherwise.
+  EventImplPtr enqueueNative(sycl::detail::queue_impl &Queue,
+                             sycl::detail::CG::StorageInitHelper CGData,
+                             bool EventNeeded);
+
   /// Iterates through all the nodes in the graph to build the list of
   /// accessor requirements for the whole graph and for each partition.
   void buildRequirements();
@@ -675,6 +738,13 @@ class exec_graph_impl {
   /// @return pointer to the graph_impl MGraphImpl
   const std::shared_ptr<graph_impl> &getGraphImpl() const { return MGraphImpl; }
 
+  /// Query the native executable graph handle.
+  /// @return Native UR executable graph handle, or nullptr if not using native
+  /// recording.
+  ur_exp_executable_graph_handle_t getNativeExecutableGraphHandle() const {
+    return MNativeExecutableGraphHandle;
+  }
+
   /// Query the vector of the partitions composing the exec_graph.
   /// @return Vector of partitions in execution order.
   const std::vector<std::shared_ptr<partition>> &getPartitions() const {
@@ -947,6 +1017,11 @@ class exec_graph_impl {
   /// If true, the graph profiling is enabled.
   bool MEnableProfiling;
 
+  /// Native UR executable graph handle for native recording mode
+  /// Only valid when the original modifiable graph was created with native
+  /// recording enabled
+  ur_exp_executable_graph_handle_t MNativeExecutableGraphHandle = nullptr;
+
   // Stores a cache of node ids from modifiable graph nodes to the companion
   // node(s) in this graph. Used for quick access when updating this graph.
   std::multimap<node_impl::id_type, node_impl *> MIDCache;
diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp
index b669a0fa63400..5719496c5ac33 100644
--- a/sycl/source/detail/queue_impl.cpp
+++ b/sycl/source/detail/queue_impl.cpp
@@ -531,6 +531,24 @@ EventImplPtr queue_impl::submit_barrier_direct_impl(
   return ResEvent;
 }
 
+bool queue_impl::isNativeRecording() const {
+  bool IsGraphCaptureEnabled = false;
+  ur_result_t Result =
+      getAdapter().call_nocheck<UrApiKind::urQueueIsGraphCaptureEnabledExp>(
+          MQueue, &IsGraphCaptureEnabled);
+  return Result == UR_RESULT_SUCCESS && IsGraphCaptureEnabled;
+}
+
+ext::oneapi::experimental::queue_state
+queue_impl::ext_oneapi_get_state_impl() const {
+  // A graph may either be recording at the SYCL level or recording at a lower
+  // level API (e.g. UR)
+  if (hasCommandGraph() || isNativeRecording()) {
+    return ext::oneapi::experimental::queue_state::recording;
+  }
+  return ext::oneapi::experimental::queue_state::executing;
+}
+
 EventImplPtr queue_impl::submit_command_to_graph(
     ext::oneapi::experimental::detail::graph_impl &GraphImpl,
     std::unique_ptr<detail::CG> CommandGroup, sycl::detail::CGType CGType,
diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp
index 72844e7e796ef..cfea2e3b933c0 100644
--- a/sycl/source/detail/queue_impl.hpp
+++ b/sycl/source/detail/queue_impl.hpp
@@ -646,6 +646,10 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
 
   bool hasCommandGraph() const { return !MGraph.expired(); }
 
+  bool isNativeRecording() const;
+
+  ext::oneapi::experimental::queue_state ext_oneapi_get_state_impl() const;
+
   EventImplPtr submit_command_to_graph(
       ext::oneapi::experimental::detail::graph_impl &GraphImpl,
       std::unique_ptr<detail::CG> CommandGroup, sycl::detail::CGType CGType,
diff --git a/sycl/source/feature_test.hpp.in b/sycl/source/feature_test.hpp.in
index 9a3464f887d15..9033d8640f0d9 100644
--- a/sycl/source/feature_test.hpp.in
+++ b/sycl/source/feature_test.hpp.in
@@ -80,7 +80,7 @@ inline namespace _V1 {
 #define SYCL_EXT_ONEAPI_VIRTUAL_MEM 1
 #define SYCL_EXT_ONEAPI_USM_MALLOC_PROPERTIES 1
 #define SYCL_EXT_INTEL_CACHE_CONFIG 1
-#define SYCL_EXT_ONEAPI_GRAPH 1
+#define SYCL_EXT_ONEAPI_GRAPH 2
 #define SYCL_EXT_CODEPLAY_MAX_REGISTERS_PER_WORK_GROUP_QUERY 1
 #define SYCL_EXT_ONEAPI_DEVICE_GLOBAL 1
 #define SYCL_EXT_INTEL_QUEUE_IMMEDIATE_COMMAND_LIST 1
diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp
index 434b2be35b1fe..868ec56f3c888 100644
--- a/sycl/source/handler.cpp
+++ b/sycl/source/handler.cpp
@@ -752,9 +752,24 @@ detail::EventImplPtr handler::finalize() {
     return EventImpl;
   }
 
-  // Because graph case is handled right above.
+  // Because command graph case is handled right above.
   assert(Queue);
 
+  // Native graph recording limitation
+  if (Queue->isNativeRecording()) {
+    if (type == detail::CGType::CodeplayHostTask) {
+      throw sycl::exception(
+          make_error_code(errc::feature_not_supported),
+          "SYCL host_task is not supported in native recording mode. Use "
+          "zeCommandListAppendHostFunction as a workaround.");
+    }
+    if (!CommandGroup->getRequirements().empty()) {
+      throw sycl::exception(
+          make_error_code(errc::feature_not_supported),
+          "sycl::buffer accessors are not supported in native recording mode.");
+    }
+  }
+
   // If the queue has an associated graph then we need to take the CG and pass
   // it to the graph to create a node, rather than submit it to the scheduler.
   if (auto GraphImpl = Queue->getCommandGraph(); GraphImpl) {
diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp
index 11271eaba796c..355e8fcb4c02b 100644
--- a/sycl/source/queue.cpp
+++ b/sycl/source/queue.cpp
@@ -76,9 +76,7 @@ context queue::get_context() const { return impl->get_context(); }
 device queue::get_device() const { return impl->get_device(); }
 
 ext::oneapi::experimental::queue_state queue::ext_oneapi_get_state() const {
-  return impl->hasCommandGraph()
-             ? ext::oneapi::experimental::queue_state::recording
-             : ext::oneapi::experimental::queue_state::executing;
+  return impl->ext_oneapi_get_state_impl();
 }
 
 ext::oneapi::experimental::command_graph<
diff --git a/sycl/test-e2e/Graph/Explicit/empty.cpp b/sycl/test-e2e/Graph/Explicit/empty.cpp
new file mode 100644
index 0000000000000..2a76632891965
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/empty.cpp
@@ -0,0 +1,8 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/empty.cpp"
diff --git a/sycl/test-e2e/Graph/Inputs/empty.cpp b/sycl/test-e2e/Graph/Inputs/empty.cpp
new file mode 100644
index 0000000000000..f97c292d827a6
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/empty.cpp
@@ -0,0 +1,28 @@
+// Tests the empty() method on modifiable command graphs.
+
+#include "../graph_common.hpp"
+
+#include <sycl/properties/all_properties.hpp>
+
+int main() {
+  queue Queue{{property::queue::in_order{}}};
+
+#ifdef GRAPH_E2E_NATIVE_RECORDING
+  exp_ext::command_graph Graph{
+      Queue.get_context(),
+      Queue.get_device(),
+      {exp_ext::property::graph::enable_native_recording{}}};
+#else
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+#endif
+
+  // Test that empty() returns true when graph has 0 nodes
+  assert(Graph.empty() && "Graph should be empty with 0 nodes");
+
+  add_node(Graph, Queue, [&](handler &CGH) { CGH.single_task([=]() {}); });
+
+  // Test that empty() returns false when graph has 1 node
+  assert(!Graph.empty() && "Graph should not be empty with 1 node");
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/work_group_scratch_memory.cpp b/sycl/test-e2e/Graph/Inputs/work_group_scratch_memory.cpp
new file mode 100644
index 0000000000000..4e42bf307d18a
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/work_group_scratch_memory.cpp
@@ -0,0 +1,79 @@
+#include <sycl/detail/core.hpp>
+#include <sycl/ext/oneapi/experimental/graph.hpp>
+#include <sycl/ext/oneapi/work_group_scratch_memory.hpp>
+#include <sycl/group_barrier.hpp>
+#include <sycl/properties/all_properties.hpp>
+#include <sycl/usm.hpp>
+
+using DataType = int;
+
+namespace sycl_ext = sycl::ext::oneapi::experimental;
+
+void copy_via_smem(DataType *a, DataType *b, sycl::nd_item<1> it) {
+  // one work-group copies data to shared memory from A
+  // And then puts in back into B
+
+  DataType *smem_ptr =
+      reinterpret_cast<DataType *>(sycl_ext::get_work_group_scratch_memory());
+  auto threadIdx_x = it.get_local_linear_id();
+
+  smem_ptr[threadIdx_x] = a[threadIdx_x];
+  sycl::group_barrier(it.get_group());
+
+  b[threadIdx_x] = smem_ptr[threadIdx_x];
+}
+
+template <typename T> struct KernelFunctor {
+  T m_props;
+  DataType *m_a;
+  DataType *m_b;
+  KernelFunctor(T props, DataType *a, DataType *b)
+      : m_props(props), m_a(a), m_b(b) {}
+
+  void operator()(sycl::nd_item<1> it) const { copy_via_smem(m_a, m_b, it); }
+  auto get(sycl_ext::properties_tag) const { return m_props; }
+};
+
+int main() {
+  sycl::queue queue{sycl::property::queue::in_order()};
+#ifdef GRAPH_E2E_NATIVE_RECORDING
+  sycl::ext::oneapi::experimental::command_graph graph(
+      queue.get_device(),
+      {sycl_ext::property::graph::enable_native_recording{}});
+#else
+  sycl::ext::oneapi::experimental::command_graph graph(queue.get_device());
+#endif
+
+  auto size = std::min(
+      queue.get_device().get_info<sycl::info::device::max_work_group_size>(),
+      size_t{1024});
+
+  DataType *a = sycl::malloc_device<DataType>(size, queue);
+  DataType *b = sycl::malloc_device<DataType>(size, queue);
+  std::vector<DataType> a_host(size, 1.0);
+  std::vector<DataType> b_host(size, -5.0);
+
+  graph.begin_recording(queue);
+  queue.copy(a_host.data(), a, size);
+
+  queue.submit([&](sycl::handler &cgh) {
+    cgh.parallel_for(
+        sycl::nd_range<1>({size}, {size}),
+        KernelFunctor(sycl_ext::properties{sycl_ext::work_group_scratch_size(
+                          size * sizeof(DataType))},
+                      a, b));
+  });
+
+  queue.copy(b, b_host.data(), size);
+  graph.end_recording();
+  auto exec_graph = graph.finalize();
+
+  queue.submit([&](sycl::handler &cgh) { cgh.ext_oneapi_graph(exec_graph); })
+      .wait_and_throw();
+
+  for (size_t i = 0; i < b_host.size(); i++) {
+    assert(b_host[i] == a_host[i]);
+  }
+  sycl::free(a, queue);
+  sycl::free(b, queue);
+}
diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/basic.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/basic.cpp
new file mode 100644
index 0000000000000..e72250382fbc3
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/basic.cpp
@@ -0,0 +1,67 @@
+// REQUIRES: level_zero_v2_adapter && arch-intel_gpu_bmg_g21
+
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+
+// Test for enable_native_recording property using queue recording mode
+
+#include "../../graph_common.hpp"
+
+#include <sycl/properties/all_properties.hpp>
+
+int main() {
+  // Create a queue with immediate command list property for native recording
+  queue Queue{property::queue::in_order{}};
+
+  exp_ext::command_graph Graph{
+      Queue.get_context(),
+      Queue.get_device(),
+      {exp_ext::property::graph::enable_native_recording{}}};
+
+  const size_t N = 1024;
+  int *Data = malloc_device<int>(N, Queue);
+
+  QueueStateVerifier verifier(Queue);
+  verifier.verify(EXECUTING);
+
+  // Use queue recording mode to create the graph
+  Graph.begin_recording(Queue);
+  verifier.verify(RECORDING);
+
+  // Record initialization kernel
+  Queue.submit([&](handler &CGH) {
+    CGH.parallel_for(range<1>{N},
+                     [=](id<1> idx) { Data[idx] = static_cast<int>(idx); });
+  });
+
+  // Record computation kernel
+  Queue.submit([&](handler &CGH) {
+    CGH.parallel_for(range<1>{N},
+                     [=](id<1> idx) { Data[idx] = Data[idx] * 2; });
+  });
+
+  Graph.end_recording(Queue);
+  verifier.verify(EXECUTING);
+
+  // Finalize and execute the graph
+  auto ExecutableGraph = Graph.finalize();
+
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecutableGraph); });
+
+  Queue.wait();
+
+  // Verify results
+  std::vector<int> HostData(N);
+  Queue.memcpy(HostData.data(), Data, N * sizeof(int)).wait();
+
+  for (size_t i = 0; i < N; i++) {
+    int Expected = static_cast<int>(i) * 2;
+    assert(check_value(i, Expected, HostData[i], "HostData"));
+  }
+
+  free(Data, Queue);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/debug_print_graph.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/debug_print_graph.cpp
new file mode 100644
index 0000000000000..08eca94d52703
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/debug_print_graph.cpp
@@ -0,0 +1,52 @@
+// REQUIRES: level_zero_v2_adapter && arch-intel_gpu_bmg_g21
+
+// RUN: %{build} -o %t.out
+// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out ; FileCheck %s --implicit-check-not=LEAK --input-file graph_native.dot %}
+
+// Test for native recording with debug print graph functionality. SYCL graph
+// does not control the output format, so we only verify high level details of
+// the output and rely on validation tests in L0 graph
+
+// CHECK: digraph
+// CHECK: zeCommandListAppendMemoryFill
+// CHECK: MyKernel
+// CHECK: zeCommandListAppendMemoryCopy
+
+#include "../../graph_common.hpp"
+
+#include <sycl/ext/oneapi/experimental/enqueue_functions.hpp>
+#include <sycl/properties/all_properties.hpp>
+
+class MyKernel;
+
+int main() {
+  queue Queue{property::queue::in_order{}};
+
+  exp_ext::command_graph Graph{
+      Queue.get_context(),
+      Queue.get_device(),
+      {exp_ext::property::graph::enable_native_recording{}}};
+
+  const size_t N = 128;
+
+  int *DeviceData = malloc_device<int>(N, Queue);
+  int *DeviceTemp = malloc_device<int>(N, Queue);
+
+  Graph.begin_recording(Queue);
+
+  exp_ext::memset(Queue, DeviceData, 0x42, N * sizeof(int));
+  Queue.submit([&](handler &CGH) {
+    CGH.parallel_for<MyKernel>(
+        range<1>{N}, [=](id<1> idx) { DeviceData[idx] = DeviceData[idx] + 1; });
+  });
+  exp_ext::memcpy(Queue, DeviceTemp, DeviceData, N * sizeof(int));
+
+  Graph.end_recording(Queue);
+
+  Graph.print_graph("graph_native.dot");
+
+  free(DeviceData, Queue);
+  free(DeviceTemp, Queue);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/empty.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/empty.cpp
new file mode 100644
index 0000000000000..e31efe8a06118
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/empty.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero_v2_adapter && arch-intel_gpu_bmg_g21
+
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+
+#define GRAPH_E2E_RECORD_REPLAY
+#define GRAPH_E2E_NATIVE_RECORDING
+
+#include "../../Inputs/empty.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_async_alloc.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_async_alloc.cpp
new file mode 100644
index 0000000000000..790d0e048fb23
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_async_alloc.cpp
@@ -0,0 +1,61 @@
+// REQUIRES: level_zero_v2_adapter && arch-intel_gpu_bmg_g21
+
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+
+#include "../../graph_common.hpp"
+
+#include <sycl/ext/oneapi/experimental/async_alloc/async_alloc.hpp>
+#include <sycl/properties/all_properties.hpp>
+
+int main() {
+  queue Queue{property::queue::in_order{}};
+
+  exp_ext::command_graph Graph{
+      Queue.get_context(),
+      Queue.get_device(),
+      {exp_ext::property::graph::enable_native_recording{}}};
+
+  constexpr size_t N = 1024;
+
+  Graph.begin_recording(Queue);
+
+  // Test 1: async_malloc should throw during recording
+  void *AsyncPtr = nullptr;
+  if (!expectException(
+          [&]() {
+            Queue.submit([&](handler &CGH) {
+              AsyncPtr = exp_ext::async_malloc(CGH, usm::alloc::device,
+                                               N * sizeof(int));
+            });
+          },
+          "async_malloc during native recording", sycl::errc::invalid)) {
+    Graph.end_recording();
+    return 1;
+  }
+
+  // Test 2: async_free should also throw during recording
+  // First allocate memory outside of recording
+  Graph.end_recording();
+  int *PreAllocatedPtr = malloc_device<int>(N, Queue);
+  Graph.begin_recording(Queue);
+
+  if (!expectException(
+          [&]() {
+            Queue.submit([&](handler &CGH) {
+              exp_ext::async_free(CGH, PreAllocatedPtr);
+            });
+          },
+          "async_free during native recording", sycl::errc::invalid)) {
+    Graph.end_recording();
+    free(PreAllocatedPtr, Queue);
+    return 1;
+  }
+
+  Graph.end_recording();
+  free(PreAllocatedPtr, Queue);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_buffer_accessor.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_buffer_accessor.cpp
new file mode 100644
index 0000000000000..1e253ca136f89
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_buffer_accessor.cpp
@@ -0,0 +1,47 @@
+// REQUIRES: level_zero_v2_adapter && arch-intel_gpu_bmg_g21
+
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+
+#include "../../graph_common.hpp"
+
+#include <sycl/properties/all_properties.hpp>
+
+int main() {
+  queue Queue{property::queue::in_order{}};
+
+  constexpr size_t N = 1024;
+  sycl::buffer<int> Buf{N};
+
+  // Use assume_buffer_outlives_graph so the graph-level buffer check doesn't
+  // fire before the native-recording check in the handler.
+  exp_ext::command_graph Graph{
+      Queue.get_context(),
+      Queue.get_device(),
+      {exp_ext::property::graph::enable_native_recording{},
+       exp_ext::property::graph::assume_buffer_outlives_graph{}}};
+
+  Graph.begin_recording(Queue);
+
+  // Submitting a kernel that uses a buffer accessor should throw
+  // errc::feature_not_supported in native recording mode.
+  if (!expectException(
+          [&]() {
+            Queue.submit([&](handler &CGH) {
+              auto Acc = Buf.get_access<sycl::access::mode::write>(CGH);
+              CGH.parallel_for(sycl::range<1>{N},
+                               [=](sycl::id<1> Idx) { Acc[Idx] = 0; });
+            });
+          },
+          "buffer accessor in native recording mode",
+          sycl::errc::feature_not_supported)) {
+    Graph.end_recording();
+    return 1;
+  }
+
+  Graph.end_recording();
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_explicit_api.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_explicit_api.cpp
new file mode 100644
index 0000000000000..5f7ba8452ff8a
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_explicit_api.cpp
@@ -0,0 +1,61 @@
+// REQUIRES: level_zero_v2_adapter && arch-intel_gpu_bmg_g21
+
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+
+#include "../../graph_common.hpp"
+
+#include <sycl/properties/all_properties.hpp>
+
+int main() {
+  queue Queue{property::queue::in_order{}};
+
+  exp_ext::command_graph Graph{
+      Queue.get_context(),
+      Queue.get_device(),
+      {exp_ext::property::graph::enable_native_recording{}}};
+
+  const size_t N = 1024;
+  int *Data = malloc_device<int>(N, Queue);
+
+  // Test 1: Graph.add() should throw when native recording is enabled
+  if (!expectException(
+          [&]() {
+            auto node = Graph.add([&](handler &CGH) {
+              CGH.parallel_for(range<1>{N},
+                               [=](id<1> idx) { Data[idx] = idx; });
+            });
+          },
+          "Graph.add() with native recording enabled",
+          sycl::errc::feature_not_supported)) {
+    free(Data, Queue);
+    return 1;
+  }
+
+  // Test 2: Try using Graph.add() with dependencies - should also throw
+  if (!expectException(
+          [&]() {
+            auto node1 = Graph.add([&](handler &CGH) {
+              CGH.parallel_for(range<1>{N},
+                               [=](id<1> idx) { Data[idx] = idx; });
+            });
+
+            auto node2 = Graph.add(
+                [&](handler &CGH) {
+                  CGH.parallel_for(range<1>{N},
+                                   [=](id<1> idx) { Data[idx] *= 2; });
+                },
+                {exp_ext::property::node::depends_on{node1}});
+          },
+          "Graph.add() with dependencies",
+          sycl::errc::feature_not_supported)) {
+    free(Data, Queue);
+    return 1;
+  }
+
+  free(Data, Queue);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_get_nodes.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_get_nodes.cpp
new file mode 100644
index 0000000000000..2d9beb56c1700
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_get_nodes.cpp
@@ -0,0 +1,35 @@
+// REQUIRES: level_zero_v2_adapter && arch-intel_gpu_bmg_g21
+
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+
+#include "../../graph_common.hpp"
+
+#include <sycl/properties/all_properties.hpp>
+
+int main() {
+  queue Queue{property::queue::in_order{}};
+
+  exp_ext::command_graph Graph{
+      Queue.get_context(),
+      Queue.get_device(),
+      {exp_ext::property::graph::enable_native_recording{}}};
+
+  // get_nodes() should throw errc::invalid for native recording graphs
+  if (!expectException([&]() { Graph.get_nodes(); },
+                       "get_nodes() with enable_native_recording",
+                       sycl::errc::invalid)) {
+    return 1;
+  }
+
+  // get_root_nodes() should throw errc::invalid for native recording graphs
+  if (!expectException([&]() { Graph.get_root_nodes(); },
+                       "get_root_nodes() with enable_native_recording",
+                       sycl::errc::invalid)) {
+    return 1;
+  }
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_host_task.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_host_task.cpp
new file mode 100644
index 0000000000000..e009df3cde814
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_host_task.cpp
@@ -0,0 +1,48 @@
+// REQUIRES: level_zero_v2_adapter && arch-intel_gpu_bmg_g21
+
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+
+#include "../../graph_common.hpp"
+
+#include <sycl/properties/all_properties.hpp>
+
+int main() {
+  queue Queue{property::queue::in_order{}};
+
+  exp_ext::command_graph Graph{
+      Queue.get_context(),
+      Queue.get_device(),
+      {exp_ext::property::graph::enable_native_recording{}}};
+
+  constexpr size_t N = 1024;
+  int *Data = malloc_shared<int>(N, Queue);
+
+  Graph.begin_recording(Queue);
+
+  // Try to record a SYCL host_task - this should throw an exception
+  if (!expectException(
+          [&]() {
+            Queue.submit([&](handler &CGH) {
+              CGH.host_task([=]() {
+                // This host task should not execute in native recording mode
+                for (size_t i = 0; i < N; i++) {
+                  Data[i] = i + 100;
+                }
+              });
+            });
+          },
+          "host_task in native recording",
+          sycl::errc::feature_not_supported)) {
+    Graph.end_recording();
+    free(Data, Queue);
+    return 1;
+  }
+
+  Graph.end_recording();
+  free(Data, Queue);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_multi_queue_recording.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_multi_queue_recording.cpp
new file mode 100644
index 0000000000000..5c433f49cd2b0
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_multi_queue_recording.cpp
@@ -0,0 +1,50 @@
+// REQUIRES: level_zero_v2_adapter && arch-intel_gpu_bmg_g21
+
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+
+#include "../../graph_common.hpp"
+
+#include <sycl/properties/all_properties.hpp>
+
+int main() {
+  device Dev;
+  context Ctx{Dev};
+
+  // Create two in-order queues sharing the same device and context
+  queue Queue1{Ctx, Dev, {property::queue::in_order{}}};
+  queue Queue2{Ctx, Dev, {property::queue::in_order{}}};
+
+  exp_ext::command_graph Graph{
+      Ctx, Dev, {exp_ext::property::graph::enable_native_recording{}}};
+
+  constexpr size_t N = 1024;
+  int *Data = malloc_device<int>(N, Dev, Ctx);
+
+  Graph.begin_recording(Queue1);
+
+  Queue1.submit([&](handler &CGH) {
+    CGH.parallel_for(range<1>{N}, [=](id<1> idx) { Data[idx] = idx; });
+  });
+
+  // Try to start recording on Queue2 while Queue1 is still recording
+  const bool passed = expectException([&]() { Graph.begin_recording(Queue2); },
+                                      "begin_recording on second queue",
+                                      sycl::errc::feature_not_supported);
+
+  assert(Queue1.ext_oneapi_get_state() == exp_ext::queue_state::recording);
+  assert(Queue2.ext_oneapi_get_state() == exp_ext::queue_state::executing);
+
+  Graph.end_recording(Queue1);
+  free(Data, Ctx);
+
+  if (!passed) {
+    std::cerr << "Expected a thrown exception when starting recording twice"
+              << std::endl;
+    return 1;
+  }
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_out_of_order_queue.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_out_of_order_queue.cpp
new file mode 100644
index 0000000000000..b006446283f48
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_out_of_order_queue.cpp
@@ -0,0 +1,27 @@
+// REQUIRES: level_zero_v2_adapter && arch-intel_gpu_bmg_g21
+
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+
+#include "../../graph_common.hpp"
+
+#include <sycl/properties/all_properties.hpp>
+
+int main() {
+  device Dev;
+  context Ctx{Dev};
+  queue OutOfOrderQueue{Ctx, Dev};
+  exp_ext::command_graph Graph{
+      Ctx, Dev, {exp_ext::property::graph::enable_native_recording{}}};
+
+  if (!expectException([&]() { Graph.begin_recording(OutOfOrderQueue); },
+                       "begin_recording with out-of-order queue",
+                       sycl::errc::feature_not_supported)) {
+    std::cerr << "Out-of-order queue should throw exception" << std::endl;
+    return 1;
+  }
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_update.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_update.cpp
new file mode 100644
index 0000000000000..0b0392c5fc8e1
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/exception_update.cpp
@@ -0,0 +1,50 @@
+// REQUIRES: level_zero_v2_adapter && arch-intel_gpu_bmg_g21
+
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+
+// Test that update() throws when called on a graph in native recording mode
+
+#include "../../graph_common.hpp"
+
+#include <sycl/properties/all_properties.hpp>
+
+int main() {
+  queue Queue{property::queue::in_order{}};
+
+  exp_ext::command_graph Graph{
+      Queue.get_context(),
+      Queue.get_device(),
+      {exp_ext::property::graph::enable_native_recording{}}};
+
+  const size_t N = 1024;
+  int *Data = malloc_device<int>(N, Queue);
+
+  Graph.begin_recording(Queue);
+  Queue.submit([&](handler &CGH) {
+    CGH.parallel_for(range<1>{N}, [=](id<1> idx) { Data[idx] = idx; });
+  });
+  Graph.end_recording();
+
+  auto ExecGraph = Graph.finalize({exp_ext::property::graph::updatable{}});
+
+  if (!expectException([&]() { ExecGraph.update(Graph); },
+                       "update(graph) with native recording enabled",
+                       sycl::errc::feature_not_supported)) {
+    free(Data, Queue);
+    return 1;
+  }
+
+  if (!expectException(
+          [&]() { ExecGraph.update(std::vector<exp_ext::node>{}); },
+          "update(nodes) with native recording enabled",
+          sycl::errc::feature_not_supported)) {
+    free(Data, Queue);
+    return 1;
+  }
+
+  free(Data, Queue);
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/indirect_update.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/indirect_update.cpp
new file mode 100644
index 0000000000000..4c75294b4aea3
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/indirect_update.cpp
@@ -0,0 +1,125 @@
+// REQUIRES: level_zero_v2_adapter && arch-intel_gpu_bmg_g21
+
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+
+// A pointer to a structure may be utilized to hold mutable kernel arguments
+// without using mutable command lists as the pointer itself remains constant.
+// This test demonstrates updating input/output pointers and scalar arguments
+// between graph submissions with a single wait at the end.
+
+#include "../../graph_common.hpp"
+
+#include <sycl/properties/all_properties.hpp>
+
+struct MutableArguments {
+  int *InputPtr;
+  int *OutputPtr;
+  int Multiplier;
+  int Addend;
+};
+
+int main() {
+  queue Queue{property::queue::in_order{}};
+
+  exp_ext::command_graph Graph{
+      Queue.get_context(),
+      Queue.get_device(),
+      {exp_ext::property::graph::enable_native_recording{}}};
+
+  const size_t N = 1024;
+
+  int *Input1 = malloc_device<int>(N, Queue);
+  int *Input2 = malloc_device<int>(N, Queue);
+  int *Output1 = malloc_device<int>(N, Queue);
+  int *Output2 = malloc_device<int>(N, Queue);
+
+  MutableArguments *DeviceArgs = malloc_device<MutableArguments>(1, Queue);
+
+  std::vector<int> HostInput1(N);
+  for (size_t i = 0; i < N; i++) {
+    HostInput1[i] = static_cast<int>(i);
+  }
+  Queue.memcpy(Input1, HostInput1.data(), N * sizeof(int)).wait();
+
+  std::vector<int> HostInput2(N);
+  for (size_t i = 0; i < N; i++) {
+    HostInput2[i] = static_cast<int>(i * 2);
+  }
+  Queue.memcpy(Input2, HostInput2.data(), N * sizeof(int)).wait();
+
+  QueueStateVerifier verifier(Queue);
+  verifier.verify(EXECUTING);
+
+  Graph.begin_recording(Queue);
+  verifier.verify(RECORDING);
+
+  Queue.submit([&](handler &CGH) {
+    CGH.parallel_for(range<1>{N}, [=](id<1> idx) {
+      int input = DeviceArgs->InputPtr[idx];
+      int multiplier = DeviceArgs->Multiplier;
+      int addend = DeviceArgs->Addend;
+      DeviceArgs->OutputPtr[idx] = input * multiplier + addend;
+    });
+  });
+
+  Graph.end_recording(Queue);
+  verifier.verify(EXECUTING);
+
+  auto ExecutableGraph = Graph.finalize();
+
+  MutableArguments HostArgs1;
+  HostArgs1.InputPtr = Input1;
+  HostArgs1.OutputPtr = Output1;
+  HostArgs1.Multiplier = 2;
+  HostArgs1.Addend = 10;
+
+  MutableArguments HostArgs2;
+  HostArgs2.InputPtr = Input2;
+  HostArgs2.OutputPtr = Output2;
+  HostArgs2.Multiplier = 3;
+  HostArgs2.Addend = 20;
+
+  // Submit graph with first set of arguments (Input1 -> Output1)
+  Queue.memcpy(DeviceArgs, &HostArgs1, sizeof(MutableArguments)).wait();
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecutableGraph); });
+
+  // Update the device arguments via memcpy
+  Queue.memcpy(DeviceArgs, &HostArgs2, sizeof(MutableArguments));
+
+  // Submit graph again with second set of arguments (Input2 -> Output2)
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecutableGraph); });
+
+  // Single wait for both graph submissions
+  Queue.wait();
+
+  // Read back both output buffers
+  std::vector<int> HostOutput1(N);
+  std::vector<int> HostOutput2(N);
+  Queue.memcpy(HostOutput1.data(), Output1, N * sizeof(int)).wait();
+  Queue.memcpy(HostOutput2.data(), Output2, N * sizeof(int)).wait();
+
+  // Verify first execution results:
+  // Output1[i] = Input1[i] * 2 + 10 = i * 2 + 10
+  for (size_t i = 0; i < N; i++) {
+    int Expected = static_cast<int>(i) * 2 + 10;
+    assert(check_value(i, Expected, HostOutput1[i], "Output1"));
+  }
+
+  // Verify second execution results:
+  // Output2[i] = Input2[i] * 3 + 20 = (i * 2) * 3 + 20
+  for (size_t i = 0; i < N; i++) {
+    int Expected = static_cast<int>(i) * 6 + 20;
+    assert(check_value(i, Expected, HostOutput2[i], "Output2"));
+  }
+
+  free(Input1, Queue);
+  free(Input2, Queue);
+  free(Output1, Queue);
+  free(Output2, Queue);
+  free(DeviceArgs, Queue);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/memory_ops.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/memory_ops.cpp
new file mode 100644
index 0000000000000..1afebb84f8066
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/memory_ops.cpp
@@ -0,0 +1,74 @@
+// REQUIRES: level_zero_v2_adapter && arch-intel_gpu_bmg_g21
+
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+
+// Test for native recording with non-kernel memory operations using handlerless
+// APIs: memcpy, memset, fill, copy
+
+#include "../../graph_common.hpp"
+
+#include <sycl/ext/oneapi/experimental/enqueue_functions.hpp>
+#include <sycl/properties/all_properties.hpp>
+
+int main() {
+  queue Queue{property::queue::in_order{}};
+
+  exp_ext::command_graph Graph{
+      Queue.get_context(),
+      Queue.get_device(),
+      {exp_ext::property::graph::enable_native_recording{}}};
+
+  const size_t N = 128;
+
+  // Allocate two buffers
+  int *Data1 = malloc_device<int>(2 * N, Queue);
+  int *Data2 = malloc_device<int>(2 * N, Queue);
+
+  // Host buffers for verification
+  std::vector<int> HostData1(N);
+  std::vector<int> HostData2(N);
+
+  QueueStateVerifier verifier(Queue);
+  verifier.verify(EXECUTING);
+
+  // Use queue recording mode to create the graph
+  Graph.begin_recording(Queue);
+  verifier.verify(RECORDING);
+
+  // Test memory operations on two buffers:
+  // Data1: memset first half (byte-wise to 0x2A), D2D memcpy to second half and
+  // D2H memcpy for verification Data2: fill first half, D2D copy to second half
+  // and D2H copy for verification
+
+  exp_ext::memset(Queue, Data1, 0x2A, N * sizeof(int));
+  exp_ext::fill(Queue, Data2, 7, N);
+
+  exp_ext::memcpy(Queue, Data1 + N, Data1, N * sizeof(int));
+  exp_ext::memcpy(Queue, HostData1.data(), Data1 + N, N * sizeof(int));
+
+  exp_ext::copy(Queue, Data2, Data2 + N, N);
+  exp_ext::copy(Queue, Data2 + N, HostData2.data(), N);
+
+  Graph.end_recording(Queue);
+  verifier.verify(EXECUTING);
+
+  // Finalize and execute the graph
+  auto ExecutableGraph = Graph.finalize();
+
+  exp_ext::execute_graph(Queue, ExecutableGraph);
+
+  Queue.wait();
+
+  for (size_t i = 0; i < N; i++) {
+    assert(check_value(i, 0x2A2A2A2A, HostData1[i], "Data1"));
+    assert(check_value(i, 7, HostData2[i], "Data2"));
+  }
+
+  free(Data1, Queue);
+  free(Data2, Queue);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/multi_queue.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/multi_queue.cpp
new file mode 100644
index 0000000000000..baa1792fe0d6a
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/multi_queue.cpp
@@ -0,0 +1,112 @@
+// REQUIRES: level_zero_v2_adapter && arch-intel_gpu_bmg_g21
+
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+
+// Test for enable_native_recording property with multi-queue dot product
+// Assesses event dependencies to, from, and within a native recording graph
+
+#include "../../graph_common.hpp"
+
+#include <sycl/ext/oneapi/experimental/enqueue_functions.hpp>
+#include <sycl/properties/all_properties.hpp>
+
+int main() {
+  // Create context and device
+  device Dev;
+  context Ctx{Dev};
+
+  // Create two in-order queues sharing the same device and context
+  queue Queue1{Ctx, Dev, {property::queue::in_order{}}};
+  queue Queue2{Ctx, Dev, {property::queue::in_order{}}};
+
+  QueueStateVerifier verifier(Queue1, Queue2);
+
+  exp_ext::command_graph Graph{
+      Ctx, Dev, {exp_ext::property::graph::enable_native_recording{}}};
+
+  const size_t N = 1024;
+
+  // Allocate input vectors and partial result buffers
+  int *VecA = malloc_device<int>(N, Dev, Ctx);
+  int *VecB = malloc_device<int>(N, Dev, Ctx);
+  int *PartialResult1 = malloc_device<int>(1, Dev, Ctx);
+  int *PartialResult2 = malloc_device<int>(1, Dev, Ctx);
+  int *FinalResult = malloc_device<int>(1, Dev, Ctx);
+
+  verifier.verify(EXECUTING, EXECUTING);
+
+  // Begin graph recording on Queue1 only
+  Graph.begin_recording(Queue1);
+  verifier.verify(RECORDING, EXECUTING);
+
+  // Transform VecA
+  event Fork =
+      Queue1.parallel_for(range<1>{N}, [=](item<1> idx) { VecA[idx] *= 2; });
+
+  // Record partial dot product on first half with Queue2, transitioning to
+  // recording (fork)
+  event Join = Queue2.single_task({Fork}, [=]() {
+    int sum = 0;
+    for (size_t i = 0; i < N / 2; i++) {
+      sum += VecA[i] * VecB[i];
+    }
+    PartialResult1[0] = sum;
+  });
+  verifier.verify(RECORDING, RECORDING);
+
+  // Record partial dot product on second half (Queue1)
+  exp_ext::single_task(Queue1, [=]() {
+    int sum = 0;
+    for (size_t i = N / 2; i < N; i++) {
+      sum += VecA[i] * VecB[i];
+    }
+    PartialResult2[0] = sum;
+  });
+
+  // Record final reduction kernel with dependency on Queue2 event
+  Queue1.single_task({Join}, [=]() {
+    FinalResult[0] = PartialResult1[0] + PartialResult2[0];
+  });
+
+  Graph.end_recording();
+  verifier.verify(EXECUTING, EXECUTING);
+
+  // Finalize and execute the graph
+  auto ExecutableGraph = Graph.finalize();
+
+  // Initialize input vector outside of graph. Use Queue2 to be able to test
+  // graph dependent event
+  event InitEvent = Queue2.parallel_for(range<1>{N}, [=](item<1> idx) {
+    VecA[idx] = static_cast<int>(idx);
+    VecB[idx] = static_cast<int>(idx) + 1;
+  });
+
+  auto GraphEvent = Queue1.ext_oneapi_graph(ExecutableGraph, {InitEvent});
+
+  // Wait for graph completion
+  GraphEvent.wait();
+
+  // Verify result
+  int HostResult = 0;
+  Queue1.memcpy(&HostResult, FinalResult, sizeof(int));
+  Queue1.wait();
+
+  // Compute expected result
+  int Expected = 0;
+  for (int i = 0; i < N; i++) {
+    Expected += 2 * i * (i + 1);
+  }
+
+  assert(check_value(0, Expected, HostResult, "DotProduct"));
+
+  free(VecA, Ctx);
+  free(VecB, Ctx);
+  free(PartialResult1, Ctx);
+  free(PartialResult2, Ctx);
+  free(FinalResult, Ctx);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/multi_queue_barrier.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/multi_queue_barrier.cpp
new file mode 100644
index 0000000000000..bf79c40f82ce1
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/multi_queue_barrier.cpp
@@ -0,0 +1,109 @@
+// REQUIRES: level_zero_v2_adapter && arch-intel_gpu_bmg_g21
+
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+
+// Test for native recording with fork-join using barriers. Barriers are used to
+// record two independent streams of operations without any dependencies between
+// each other apart from the queue recording transition and join.
+
+#include "../../graph_common.hpp"
+
+#include <sycl/ext/oneapi/experimental/enqueue_functions.hpp>
+#include <sycl/properties/all_properties.hpp>
+
+#include <algorithm>
+#include <vector>
+
+int main() {
+  device Dev;
+  context Ctx{Dev};
+
+  queue Queue1{Ctx, Dev, {property::queue::in_order{}}};
+  queue Queue2{Ctx, Dev, {property::queue::in_order{}}};
+
+  QueueStateVerifier verifier(Queue1, Queue2);
+
+  exp_ext::command_graph Graph{
+      Ctx, Dev, {exp_ext::property::graph::enable_native_recording{}}};
+
+  const size_t N = 1024;
+
+  int *VecA = malloc_device<int>(N, Dev, Ctx);
+  int *VecB = malloc_device<int>(N, Dev, Ctx);
+
+  // Initialize both arrays to 555
+  Queue1.fill(VecA, 555, N).wait();
+  Queue1.fill(VecB, 555, N).wait();
+  verifier.verify(EXECUTING, EXECUTING);
+
+  // 1) record on Queue1
+  Graph.begin_recording(Queue1);
+  verifier.verify(RECORDING, EXECUTING);
+
+  // 2) barrier (ext_oneapi_barrier) and recording transition on Queue2
+  event ForkBarrier = Queue1.ext_oneapi_submit_barrier();
+  exp_ext::partial_barrier(Queue2, {ForkBarrier});
+  verifier.verify(RECORDING, RECORDING);
+
+  // 3) two streams of independent kernels on Queue1 and Queue2
+  exp_ext::parallel_for(Queue1, range<1>{N},
+                        [=](item<1> idx) { VecA[idx] = 1; });
+  exp_ext::parallel_for(Queue1, range<1>{N},
+                        [=](item<1> idx) { VecA[idx] += 1; });
+  exp_ext::parallel_for(Queue1, range<1>{N},
+                        [=](item<1> idx) { VecA[idx] *= 2; });
+
+  exp_ext::parallel_for(Queue2, range<1>{N},
+                        [=](item<1> idx) { VecB[idx] = 2; });
+  exp_ext::parallel_for(Queue2, range<1>{N},
+                        [=](item<1> idx) { VecB[idx] *= 3; });
+  exp_ext::parallel_for(Queue2, range<1>{N},
+                        [=](item<1> idx) { VecB[idx] += 1; });
+
+  // 4) join barrier
+  event JoinBarrier = Queue2.ext_oneapi_submit_barrier();
+  exp_ext::partial_barrier(Queue1, {JoinBarrier});
+
+  Graph.end_recording();
+  verifier.verify(EXECUTING, EXECUTING);
+
+  // Wait on both queues after recording
+  Queue1.wait();
+  Queue2.wait();
+
+  // Verify that values are still at initial value (no eager execution)
+  std::vector<int> HostVecA_Check(N);
+  std::vector<int> HostVecB_Check(N);
+  Queue1.memcpy(HostVecA_Check.data(), VecA, N * sizeof(int)).wait();
+  Queue1.memcpy(HostVecB_Check.data(), VecB, N * sizeof(int)).wait();
+
+  assert(std::count(HostVecA_Check.begin(), HostVecA_Check.end(), 555) == N);
+  assert(std::count(HostVecB_Check.begin(), HostVecB_Check.end(), 555) == N);
+
+  // Finalize and execute the graph
+  auto ExecutableGraph = Graph.finalize();
+
+  exp_ext::execute_graph(Queue1, ExecutableGraph);
+  Queue1.wait();
+
+  // Verify results
+  std::vector<int> HostVecA(N);
+  std::vector<int> HostVecB(N);
+
+  Queue1.memcpy(HostVecA.data(), VecA, N * sizeof(int)).wait();
+  Queue1.memcpy(HostVecB.data(), VecB, N * sizeof(int)).wait();
+
+  // VecA: 1 + 1 * 2 = 4
+  assert(std::count(HostVecA.begin(), HostVecA.end(), 4) == N);
+
+  // VecB: 2 * 3 + 1 = 7
+  assert(std::count(HostVecB.begin(), HostVecB.end(), 7) == N);
+
+  free(VecA, Ctx);
+  free(VecB, Ctx);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/pipeline_non_blocking.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/pipeline_non_blocking.cpp
new file mode 100644
index 0000000000000..fea5be477cb03
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/pipeline_non_blocking.cpp
@@ -0,0 +1,78 @@
+// REQUIRES: level_zero_v2_adapter && arch-intel_gpu_bmg_g21
+
+// RUN: %{build} -o %t.out
+// RUN: env UR_L0_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s
+
+// Test for native recording with pipelined graph execution. This test verifies
+// that multiple graph executions can be batched with only a single host wait
+// call at the end, demonstrating that all graph submissions are non-blocking.
+
+#include "../../graph_common.hpp"
+
+#include <sycl/ext/oneapi/experimental/enqueue_functions.hpp>
+#include <sycl/properties/all_properties.hpp>
+
+#include <iostream>
+
+int main() {
+  queue Queue{property::queue::in_order{}};
+
+  exp_ext::command_graph Graph{
+      Queue.get_context(),
+      Queue.get_device(),
+      {exp_ext::property::graph::enable_native_recording{}}};
+
+  const size_t N = 1024;
+  int *Data = malloc_device<int>(N, Queue);
+
+  QueueStateVerifier verifier(Queue);
+  verifier.verify(EXECUTING);
+
+  // Record graph with two kernels: add 3, then subtract 1
+  Graph.begin_recording(Queue);
+  verifier.verify(RECORDING);
+
+  exp_ext::parallel_for(Queue, range<1>{N}, [=](id<1> idx) { Data[idx] += 3; });
+  exp_ext::parallel_for(Queue, range<1>{N}, [=](id<1> idx) { Data[idx] -= 1; });
+
+  Graph.end_recording(Queue);
+  verifier.verify(EXECUTING);
+
+  auto ExecutableGraph = Graph.finalize();
+
+  // Initialize buffer, execute graph 20 times, copy results, then wait once
+  exp_ext::memset(Queue, Data, 0, N * sizeof(int));
+
+  std::cerr << "BEGIN_GRAPH_PIPELINE" << std::endl;
+  for (int i = 0; i < 20; i++) {
+    exp_ext::execute_graph(Queue, ExecutableGraph);
+  }
+
+  std::vector<int> HostData(N);
+  exp_ext::memcpy(Queue, HostData.data(), Data, N * sizeof(int));
+  Queue.wait();
+  std::cerr << "END_GRAPH_PIPELINE" << std::endl;
+
+  // Verify results: 20 iterations of (add 3, subtract 1) = 40
+  const int Expected = 40;
+  for (size_t i = 0; i < N; i++) {
+    assert(check_value(i, Expected, HostData[i], "HostData"));
+  }
+
+  free(Data, Queue);
+  return 0;
+}
+
+// Verify that there is exactly one host synchronization in the pipeline
+// execution. The graph executions and memcpy should be batched asynchronously
+// with only a single synchronization point at Queue.wait().
+//
+// At the L0 level, we should see exactly one zeCommandListHostSynchronize call
+// (from Queue.wait()) and zero zeEventHostSynchronize calls in the execution
+// region.
+//
+// CHECK-LABEL: BEGIN_GRAPH_PIPELINE
+// CHECK-NOT: zeEventHostSynchronize(
+// CHECK-COUNT-1: zeCommandListHostSynchronize(
+// CHECK-NOT: zeCommandListHostSynchronize(
+// CHECK: END_GRAPH_PIPELINE
diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/work_group_scratch_memory.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/work_group_scratch_memory.cpp
new file mode 100644
index 0000000000000..0e0bff07e173d
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/work_group_scratch_memory.cpp
@@ -0,0 +1,9 @@
+// REQUIRES: level_zero_v2_adapter && arch-intel_gpu_bmg_g21
+
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+
+#define GRAPH_E2E_NATIVE_RECORDING
+#include "../../Inputs/work_group_scratch_memory.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/ze_host_task.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/ze_host_task.cpp
new file mode 100644
index 0000000000000..7e8fa31ffc86e
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/ze_host_task.cpp
@@ -0,0 +1,91 @@
+// REQUIRES: level_zero_v2_adapter && arch-intel_gpu_bmg_g21
+// REQUIRES: level_zero_dev_kit
+
+// RUN: %{build} %level_zero_options -o %t.out
+// RUN: %{run} %t.out
+// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+
+#include "../../graph_common.hpp"
+#include "../../ze_common.hpp"
+
+#include <level_zero/ze_api.h>
+#include <sycl/properties/all_properties.hpp>
+
+constexpr size_t N = 1024;
+
+void ZE_APICALL HostFunction(void *UserData) {
+  uint32_t *Data = static_cast<uint32_t *>(UserData);
+  for (size_t i = 0; i < N; i++) {
+    Data[i] = Data[i] * 3;
+  }
+}
+
+int main() {
+  queue Queue{property::queue::in_order{}};
+
+  const sycl::context Context = Queue.get_context();
+  const sycl::device Device = Queue.get_device();
+
+  ze_driver_handle_t ZeDriver;
+  ASSERT_ZE_RESULT_SUCCESS(getDriver(ZeDriver));
+
+  zeCommandListAppendHostFunction_fn zeCommandListAppendHostFunction = nullptr;
+  ASSERT_ZE_RESULT_SUCCESS(
+      loadZeExtensionFunction(ZeDriver, "zeCommandListAppendHostFunction",
+                              zeCommandListAppendHostFunction));
+
+  // Allocate shared memory (accessible from both device and host)
+  uint32_t *DataShared = malloc_shared<uint32_t>(N, Queue);
+
+  // Get the command list before recording starts
+  ze_command_list_handle_t ZeCommandList;
+  bool success = getCommandListFromQueue(Queue, ZeCommandList);
+  assert(success);
+
+  exp_ext::command_graph Graph{
+      Context, Device, {exp_ext::property::graph::enable_native_recording{}}};
+
+  CommandListStateVerifier verifier(ZeCommandList);
+  verifier.verify(EXECUTING);
+
+  Graph.begin_recording(Queue);
+  verifier.verify(RECORDING);
+
+  // 1. Record SYCL kernel - initialize data
+  Queue.submit([&](handler &CGH) {
+    CGH.parallel_for(range<1>{N},
+                     [=](id<1> idx) { DataShared[idx] = idx + 10; });
+  });
+
+  // 2. Record SYCL kernel - multiply by 2
+  Queue.submit([&](handler &CGH) {
+    CGH.parallel_for(range<1>{N},
+                     [=](id<1> idx) { DataShared[idx] = DataShared[idx] * 2; });
+  });
+
+  // 3. Record L0 host function directly to the recording command list
+  // The host function will run on the host but is part of the command list,
+  // so it will execute after the kernels complete
+
+  ASSERT_ZE_RESULT_SUCCESS(zeCommandListAppendHostFunction(
+      ZeCommandList, reinterpret_cast<void *>(HostFunction),
+      static_cast<void *>(DataShared), nullptr, nullptr, 0, nullptr));
+
+  Graph.end_recording(Queue);
+  verifier.verify(EXECUTING);
+
+  auto ExecutableGraph = Graph.finalize();
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecutableGraph); });
+  Queue.wait();
+
+  // Verify results
+  // SYCL kernel: Data = (i + 10) * 2
+  // Host function: Data = (i + 10) * 2 * 3
+  for (size_t i = 0; i < N; i++) {
+    uint32_t Expected = (i + 10) * 2 * 3;
+    assert(check_value(i, Expected, DataShared[i], "DataShared"));
+  }
+
+  free(DataShared, Queue);
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/ze_launch_kernel_with_args.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/ze_launch_kernel_with_args.cpp
new file mode 100644
index 0000000000000..068b9b6aaa4ee
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/ze_launch_kernel_with_args.cpp
@@ -0,0 +1,108 @@
+// REQUIRES: level_zero_v2_adapter && arch-intel_gpu_bmg_g21
+// REQUIRES: level_zero_dev_kit
+
+// RUN: %{build} %level_zero_options -o %t.out
+// RUN: %{run} %t.out %S/../../Inputs/Kernels/saxpy.spv
+// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out %S/../../Inputs/Kernels/saxpy.spv 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+
+// Test native recording with intermixed SYCL and Level-Zero kernels.
+
+#include "../../graph_common.hpp"
+#include "../../ze_common.hpp"
+
+#include <level_zero/ze_api.h>
+#include <sycl/properties/all_properties.hpp>
+
+int main(int, char **argv) {
+  queue Queue{property::queue::in_order{}};
+
+  const sycl::context Context = Queue.get_context();
+  const sycl::device Device = Queue.get_device();
+
+  const size_t N = 1024;
+  uint32_t *DataX = malloc_device<uint32_t>(N, Queue);
+  uint32_t *DataZ = malloc_device<uint32_t>(N, Queue);
+
+  std::vector<uint32_t> HostX(N);
+  std::vector<uint32_t> HostZ(N);
+  for (size_t i = 0; i < N; i++) {
+    HostX[i] = i + 10;
+    HostZ[i] = i + 1;
+  }
+
+  Queue.memcpy(DataX, HostX.data(), N * sizeof(uint32_t)).wait();
+  Queue.memcpy(DataZ, HostZ.data(), N * sizeof(uint32_t)).wait();
+
+  ZeKernelFactory KernelFactory(Queue);
+  ze_module_handle_t ZeModule =
+      KernelFactory.createModule(loadSpirvFromFile(argv[1]));
+  ze_kernel_handle_t ZeKernel = KernelFactory.createKernel(
+      ZeModule, "_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E5saxpy");
+
+  exp_ext::command_graph Graph{
+      Context, Device, {exp_ext::property::graph::enable_native_recording{}}};
+
+  ze_command_list_handle_t ZeCommandList;
+  bool result = getCommandListFromQueue(Queue, ZeCommandList);
+  assert(result);
+
+  CommandListStateVerifier verifier(ZeCommandList);
+  verifier.verify(EXECUTING);
+
+  // Begin recording
+  Graph.begin_recording(Queue);
+  verifier.verify(RECORDING);
+
+  // 1. Record SYCL kernel - multiply X by 2
+  Queue.submit([&](handler &CGH) {
+    CGH.parallel_for(range<1>{N},
+                     [=](id<1> idx) { DataX[idx] = DataX[idx] * 2; });
+  });
+
+  // 2. Record L0 kernel directly to the recording command list
+
+  // Suggest and prepare group size
+  uint32_t GroupSizeX;
+  uint32_t GroupSizeY;
+  uint32_t GroupSizeZ;
+
+  ASSERT_ZE_RESULT_SUCCESS(zeKernelSuggestGroupSize(
+      ZeKernel, N, 1, 1, &GroupSizeX, &GroupSizeY, &GroupSizeZ));
+
+  // Prepare kernel arguments using the WithArguments API
+  // saxpy computes Z = X * 2 + Z
+  // Arguments are passed as an array of pointers
+  void *ArgPointers[] = {&DataZ, &DataX};
+
+  ze_group_count_t ZeGroupCount{static_cast<uint32_t>(N) / GroupSizeX, 1, 1};
+  ze_group_size_t ZeGroupSize{GroupSizeX, GroupSizeY, GroupSizeZ};
+
+  ASSERT_ZE_RESULT_SUCCESS(zeCommandListAppendLaunchKernelWithArguments(
+      ZeCommandList, ZeKernel, ZeGroupCount, ZeGroupSize, ArgPointers, nullptr,
+      nullptr, 0, nullptr));
+
+  Graph.end_recording(Queue);
+  verifier.verify(EXECUTING);
+
+  auto ExecutableGraph = Graph.finalize();
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecutableGraph); });
+  Queue.wait();
+
+  Queue.memcpy(HostX.data(), DataX, N * sizeof(uint32_t)).wait();
+  Queue.memcpy(HostZ.data(), DataZ, N * sizeof(uint32_t)).wait();
+
+  // Verify results
+  // SYCL kernel: X = (i + 10) * 2
+  // L0 saxpy kernel: Z = X * 2 + Z = (i + 10) * 2 * 2 + (i + 1)
+  for (size_t i = 0; i < N; i++) {
+    uint32_t ExpectedX = (i + 10) * 2;
+    uint32_t ExpectedZ = (i + 10) * 2 * 2 + (i + 1);
+    assert(check_value(i, ExpectedX, HostX[i], "HostX"));
+    assert(check_value(i, ExpectedZ, HostZ[i], "HostZ"));
+  }
+
+  free(DataX, Queue);
+  free(DataZ, Queue);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/ze_mem_ops.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/ze_mem_ops.cpp
new file mode 100644
index 0000000000000..866579304315d
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/ze_mem_ops.cpp
@@ -0,0 +1,81 @@
+// REQUIRES: level_zero_v2_adapter && arch-intel_gpu_bmg_g21
+// REQUIRES: level_zero_dev_kit
+
+// RUN: %{build} %level_zero_options -o %t.out
+// RUN: %{run} %t.out
+// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+
+// Test native recording with Level Zero memory operations.
+// Records L0 memset, device-to-device copy, and device-to-host copy
+// directly to the recording command list over an in-order queue.
+
+#include "../../graph_common.hpp"
+#include "../../ze_common.hpp"
+
+#include <level_zero/ze_api.h>
+#include <sycl/properties/all_properties.hpp>
+
+int main() {
+  queue Queue{property::queue::in_order{}};
+
+  const sycl::context Context = Queue.get_context();
+  const sycl::device Device = Queue.get_device();
+
+  const size_t N = 1024;
+  uint32_t *DeviceSrc = malloc_device<uint32_t>(N, Queue);
+  uint32_t *DeviceDst = malloc_device<uint32_t>(N, Queue);
+  uint32_t *HostDst = malloc_host<uint32_t>(N, Queue);
+
+  for (size_t i = 0; i < N; i++) {
+    HostDst[i] = 0;
+  }
+
+  ze_command_list_handle_t ZeCommandList;
+  bool success = getCommandListFromQueue(Queue, ZeCommandList);
+  assert(success);
+
+  exp_ext::command_graph Graph{
+      Context, Device, {exp_ext::property::graph::enable_native_recording{}}};
+
+  CommandListStateVerifier verifier(ZeCommandList);
+  verifier.verify(EXECUTING);
+
+  Graph.begin_recording(Queue);
+
+  verifier.verify(RECORDING);
+
+  // 1. Level Zero memset - fill DeviceSrc with pattern 0x42 (byte pattern)
+  uint32_t Pattern = 0x42;
+  ASSERT_ZE_RESULT_SUCCESS(zeCommandListAppendMemoryFill(
+      ZeCommandList, DeviceSrc, &Pattern, sizeof(uint32_t),
+      N * sizeof(uint32_t), nullptr, 0, nullptr));
+
+  // 2. Level Zero device-to-device copy - copy DeviceSrc to DeviceDst
+  ASSERT_ZE_RESULT_SUCCESS(
+      zeCommandListAppendMemoryCopy(ZeCommandList, DeviceDst, DeviceSrc,
+                                    N * sizeof(uint32_t), nullptr, 0, nullptr));
+
+  // 3. Level Zero device-to-host copy - copy DeviceDst to HostDst
+  ASSERT_ZE_RESULT_SUCCESS(
+      zeCommandListAppendMemoryCopy(ZeCommandList, HostDst, DeviceDst,
+                                    N * sizeof(uint32_t), nullptr, 0, nullptr));
+
+  Graph.end_recording(Queue);
+
+  verifier.verify(EXECUTING);
+
+  auto ExecutableGraph = Graph.finalize();
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecutableGraph); });
+  Queue.wait();
+
+  // Verify results on host
+  for (size_t i = 0; i < N; i++) {
+    assert(check_value(i, Pattern, HostDst[i], "HostDst"));
+  }
+
+  free(DeviceSrc, Queue);
+  free(DeviceDst, Queue);
+  free(HostDst, Queue);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/RecordReplay/empty.cpp b/sycl/test-e2e/Graph/RecordReplay/empty.cpp
new file mode 100644
index 0000000000000..c73141de90611
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/empty.cpp
@@ -0,0 +1,8 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/empty.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/work_group_scratch_memory.cpp b/sycl/test-e2e/Graph/RecordReplay/work_group_scratch_memory.cpp
index a3f01010dcb90..ab8fae8f59b0f 100644
--- a/sycl/test-e2e/Graph/RecordReplay/work_group_scratch_memory.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/work_group_scratch_memory.cpp
@@ -13,76 +13,4 @@
 // XFAIL: hip
 // XFAIL-TRACKER: https://github.com/intel/llvm/issues/16072
 
-#include <sycl/detail/core.hpp>
-#include <sycl/ext/oneapi/experimental/graph.hpp>
-#include <sycl/ext/oneapi/work_group_scratch_memory.hpp>
-#include <sycl/group_barrier.hpp>
-#include <sycl/properties/all_properties.hpp>
-#include <sycl/usm.hpp>
-
-using DataType = int;
-
-namespace sycl_ext = sycl::ext::oneapi::experimental;
-
-void copy_via_smem(DataType *a, DataType *b, sycl::nd_item<1> it) {
-  // one work-group copies data to shared memory from A
-  // And then puts in back into B
-
-  DataType *smem_ptr =
-      reinterpret_cast<DataType *>(sycl_ext::get_work_group_scratch_memory());
-  auto threadIdx_x = it.get_local_linear_id();
-
-  smem_ptr[threadIdx_x] = a[threadIdx_x];
-  sycl::group_barrier(it.get_group());
-
-  b[threadIdx_x] = smem_ptr[threadIdx_x];
-}
-
-template <typename T> struct KernelFunctor {
-  T m_props;
-  DataType *m_a;
-  DataType *m_b;
-  KernelFunctor(T props, DataType *a, DataType *b)
-      : m_props(props), m_a(a), m_b(b) {}
-
-  void operator()(sycl::nd_item<1> it) const { copy_via_smem(m_a, m_b, it); }
-  auto get(sycl_ext::properties_tag) const { return m_props; }
-};
-
-int main() {
-  sycl::queue queue{sycl::property::queue::in_order()};
-  sycl::ext::oneapi::experimental::command_graph graph(queue.get_device());
-
-  auto size = std::min(
-      queue.get_device().get_info<sycl::info::device::max_work_group_size>(),
-      size_t{1024});
-
-  DataType *a = sycl::malloc_device<DataType>(size, queue);
-  DataType *b = sycl::malloc_device<DataType>(size, queue);
-  std::vector<DataType> a_host(size, 1.0);
-  std::vector<DataType> b_host(size, -5.0);
-
-  graph.begin_recording(queue);
-  queue.copy(a_host.data(), a, size);
-
-  queue.submit([&](sycl::handler &cgh) {
-    cgh.parallel_for(
-        sycl::nd_range<1>({size}, {size}),
-        KernelFunctor(sycl_ext::properties{sycl_ext::work_group_scratch_size(
-                          size * sizeof(DataType))},
-                      a, b));
-  });
-
-  queue.copy(b, b_host.data(), size);
-  graph.end_recording();
-  auto exec_graph = graph.finalize();
-
-  queue.submit([&](sycl::handler &cgh) { cgh.ext_oneapi_graph(exec_graph); })
-      .wait_and_throw();
-
-  for (size_t i = 0; i < b_host.size(); i++) {
-    assert(b_host[i] == a_host[i]);
-  }
-  sycl::free(a, queue);
-  sycl::free(b, queue);
-}
+#include "../Inputs/work_group_scratch_memory.cpp"
diff --git a/sycl/test-e2e/Graph/graph_common.hpp b/sycl/test-e2e/Graph/graph_common.hpp
index c99782732ec21..fb6da631bdc4e 100644
--- a/sycl/test-e2e/Graph/graph_common.hpp
+++ b/sycl/test-e2e/Graph/graph_common.hpp
@@ -6,10 +6,12 @@
 
 #include <sycl/ext/oneapi/experimental/graph.hpp>
 
+#include <array>
 #include <condition_variable> // std::conditional_variable
 #include <fstream>
 #include <mutex> // std::mutex, std::unique_lock
 #include <numeric>
+#include <utility>
 
 #if GRAPH_TESTS_VERBOSE_PRINT
 #include <chrono>
@@ -25,6 +27,12 @@ namespace exp_ext = sycl::ext::oneapi::experimental;
 // Make tests less verbose by using sycl namespace.
 using namespace sycl;
 
+// Queue state constants for recording tests
+inline constexpr exp_ext::queue_state RECORDING =
+    exp_ext::queue_state::recording;
+inline constexpr exp_ext::queue_state EXECUTING =
+    exp_ext::queue_state::executing;
+
 // Helper functions for wrapping depends_on calls when add_node is used so they
 // are not used in the explicit API
 template <typename T> inline void depends_on_helper(sycl::handler &CGH, T Dep) {
@@ -442,6 +450,42 @@ class Barrier {
   std::size_t threadNum;
 };
 
+// Verify recording states of one or more queues
+template <size_t N> class QueueStateVerifier {
+  std::array<queue, N> queues;
+
+public:
+  template <typename... Queues>
+  QueueStateVerifier(Queues... qs) : queues{qs...} {}
+
+  template <typename... States> void verify(States... expected_states) {
+    verifyImpl(std::index_sequence_for<States...>{}, expected_states...);
+  }
+
+private:
+  template <size_t... Is, typename... States>
+  void verifyImpl(std::index_sequence<Is...>, States... expected_states) {
+    (checkQueue(Is, queues[Is], expected_states), ...);
+  }
+
+  void checkQueue(size_t index, queue q, exp_ext::queue_state expected) {
+    auto actual = q.ext_oneapi_get_state();
+    if (actual != expected) {
+      std::cerr << "Queue " << index << " SYCL state mismatch: expected "
+                << stateToString(expected) << " but got "
+                << stateToString(actual) << std::endl;
+      assert(false);
+    }
+  }
+
+  const char *stateToString(exp_ext::queue_state state) {
+    return state == exp_ext::queue_state::recording ? "recording" : "executing";
+  }
+};
+
+template <typename... Queues>
+QueueStateVerifier(Queues...) -> QueueStateVerifier<sizeof...(Queues)>;
+
 template <typename T>
 bool inline check_value(const T &Ref, const T &Got,
                         const std::string &VariableName) {
@@ -545,3 +589,25 @@ bool compareProfiling(event Event1, event Event2) {
 
   return (Pass1 && Pass2);
 }
+
+/// Helper to test that a callable throws a sycl::exception with the given
+/// error code. Returns true if the exception was thrown with the expected
+/// code, false otherwise.
+template <typename Func>
+bool expectException(Func &&Operation, const char *OperationName,
+                     sycl::errc ExpectedCode) {
+  try {
+    Operation();
+    std::cerr << "ERROR: Expected exception was not thrown for "
+              << OperationName << std::endl;
+    return false;
+  } catch (const sycl::exception &e) {
+    if (e.code() != sycl::make_error_code(ExpectedCode)) {
+      std::cerr << "ERROR: Wrong exception error code for " << OperationName
+                << ": expected " << sycl::make_error_code(ExpectedCode).message()
+                << ", got " << e.code().message() << std::endl;
+      return false;
+    }
+    return true;
+  }
+}
diff --git a/sycl/test-e2e/Graph/ze_common.hpp b/sycl/test-e2e/Graph/ze_common.hpp
new file mode 100644
index 0000000000000..9bd1c87cd6699
--- /dev/null
+++ b/sycl/test-e2e/Graph/ze_common.hpp
@@ -0,0 +1,208 @@
+#pragma once
+
+#include <array>
+#include <cassert>
+#include <fstream>
+#include <iostream>
+#include <level_zero/ze_api.h>
+#include <sycl/ext/oneapi/backend/level_zero.hpp>
+#include <utility>
+#include <variant>
+#include <vector>
+
+#define ASSERT_ZE_RESULT_SUCCESS(status) assert((status) == ZE_RESULT_SUCCESS);
+
+// These are currently defined in experimental graph headers. If they are aren't
+// defined, then define them manually.
+#ifndef ZE_RESULT_QUERY_TRUE
+#define ZE_RESULT_QUERY_TRUE ((ze_result_t)0x7fff0000)
+#endif
+#ifndef ZE_RESULT_QUERY_FALSE
+#define ZE_RESULT_QUERY_FALSE ((ze_result_t)0x7fff0001)
+#endif
+
+inline ze_result_t getDriver(ze_driver_handle_t &ZeDriver) {
+  uint32_t DriverCount = 0;
+  ze_result_t status = zeDriverGet(&DriverCount, nullptr);
+  if (status != ZE_RESULT_SUCCESS) {
+    return status;
+  }
+
+  if (DriverCount == 0) {
+    std::cout << "No Level Zero drivers found" << std::endl;
+    return ZE_RESULT_ERROR_DEVICE_LOST;
+  }
+
+  std::vector<ze_driver_handle_t> Drivers(DriverCount);
+  status = zeDriverGet(&DriverCount, Drivers.data());
+  ZeDriver = Drivers[0];
+  return status;
+}
+
+inline std::vector<uint8_t> loadSpirvFromFile(const std::string &FileName) {
+  std::ifstream SpvStream(FileName, std::ios::binary);
+  SpvStream.seekg(0, std::ios::end);
+  size_t sz = SpvStream.tellg();
+  SpvStream.seekg(0);
+  std::vector<uint8_t> Spv(sz);
+  SpvStream.read(reinterpret_cast<char *>(Spv.data()), sz);
+  return Spv;
+}
+
+inline bool getCommandListFromQueue(sycl::queue &Queue,
+                                    ze_command_list_handle_t &ZeCommandList) {
+  using namespace sycl;
+  auto ZeQueueNative = get_native<backend::ext_oneapi_level_zero>(Queue);
+
+  if (!std::holds_alternative<ze_command_list_handle_t>(ZeQueueNative)) {
+    return false;
+  }
+
+  ZeCommandList = std::get<ze_command_list_handle_t>(ZeQueueNative);
+  return true;
+}
+
+typedef ze_result_t(ZE_APICALL *zeCommandListAppendHostFunction_fn)(
+    ze_command_list_handle_t, void *, void *, void *, ze_event_handle_t,
+    uint32_t, ze_event_handle_t *);
+
+typedef ze_result_t(ZE_APICALL *zeCommandListIsGraphCaptureEnabledExp_fn)(
+    ze_command_list_handle_t);
+
+template <typename FunctionPtr>
+inline ze_result_t loadZeExtensionFunction(ze_driver_handle_t ZeDriver,
+                                           const char *FunctionName,
+                                           FunctionPtr &Fn) {
+  ze_result_t status = zeDriverGetExtensionFunctionAddress(
+      ZeDriver, FunctionName, reinterpret_cast<void **>(&Fn));
+  return status;
+}
+
+// Factory for creating and managing Level Zero kernels and modules
+// All resources are associated with the same context and device
+// The factory stores a copy of the SYCL queue to ensure the underlying
+// Level Zero context and device remain valid for the factory's lifetime
+class ZeKernelFactory {
+public:
+  explicit ZeKernelFactory(sycl::queue Queue)
+      : Queue(Queue),
+        Context(sycl::get_native<sycl::backend::ext_oneapi_level_zero>(
+            Queue.get_context())),
+        Device(sycl::get_native<sycl::backend::ext_oneapi_level_zero>(
+            Queue.get_device())) {}
+
+  ~ZeKernelFactory() { cleanup(); }
+
+  ZeKernelFactory() = delete;
+  ZeKernelFactory(const ZeKernelFactory &) = delete;
+  ZeKernelFactory &operator=(const ZeKernelFactory &) = delete;
+  ZeKernelFactory(ZeKernelFactory &&) noexcept = default;
+  ZeKernelFactory &operator=(ZeKernelFactory &&) noexcept = default;
+
+  ze_module_handle_t createModule(const std::vector<uint8_t> &Spirv) {
+    ze_module_desc_t moduleDesc = {ZE_STRUCTURE_TYPE_MODULE_DESC,
+                                   nullptr,
+                                   ZE_MODULE_FORMAT_IL_SPIRV,
+                                   Spirv.size(),
+                                   Spirv.data(),
+                                   nullptr,
+                                   nullptr};
+    ze_module_handle_t module;
+    ze_result_t status =
+        zeModuleCreate(Context, Device, &moduleDesc, &module, nullptr);
+    ASSERT_ZE_RESULT_SUCCESS(status);
+    Modules.push_back(module);
+    return module;
+  }
+
+  ze_kernel_handle_t createKernel(ze_module_handle_t Module,
+                                  const char *KernelName) {
+    ze_kernel_desc_t kernelDesc = {ZE_STRUCTURE_TYPE_KERNEL_DESC, nullptr, 0,
+                                   KernelName};
+    ze_kernel_handle_t kernel;
+    ze_result_t status = zeKernelCreate(Module, &kernelDesc, &kernel);
+    ASSERT_ZE_RESULT_SUCCESS(status);
+    Kernels.push_back(kernel);
+    return kernel;
+  }
+
+  void cleanup() {
+    // Destroy kernels first (they depend on modules)
+    for (auto kernel : Kernels) {
+      ASSERT_ZE_RESULT_SUCCESS(zeKernelDestroy(kernel));
+    }
+    Kernels.clear();
+
+    // Then destroy modules
+    for (auto module : Modules) {
+      ASSERT_ZE_RESULT_SUCCESS(zeModuleDestroy(module));
+    }
+    Modules.clear();
+  }
+
+private:
+  sycl::queue Queue; // Ensures context and device lifetime
+  ze_context_handle_t Context;
+  ze_device_handle_t Device;
+  std::vector<ze_module_handle_t> Modules;
+  std::vector<ze_kernel_handle_t> Kernels;
+};
+
+// Verify recording states of one or more command lists
+template <size_t N> class CommandListStateVerifier {
+  std::array<ze_command_list_handle_t, N> commandLists;
+  zeCommandListIsGraphCaptureEnabledExp_fn pfnIsGraphCaptureEnabled = nullptr;
+
+public:
+  template <typename... CommandLists>
+  CommandListStateVerifier(CommandLists... cmdLists)
+      : commandLists{cmdLists...} {
+    loadGraphIsCapturingExtension();
+  }
+
+  template <typename... States> void verify(States... expected_states) {
+    verifyImpl(std::index_sequence_for<States...>{}, expected_states...);
+  }
+
+private:
+  void loadGraphIsCapturingExtension() {
+    ze_driver_handle_t driver;
+    ASSERT_ZE_RESULT_SUCCESS(getDriver(driver));
+    ASSERT_ZE_RESULT_SUCCESS(
+        loadZeExtensionFunction(driver, "zeCommandListIsGraphCaptureEnabledExp",
+                                pfnIsGraphCaptureEnabled));
+  }
+
+  template <size_t... Is, typename... States>
+  void verifyImpl(std::index_sequence<Is...>, States... expected_states) {
+    (checkCommandList(Is, commandLists[Is], expected_states), ...);
+  }
+
+  void checkCommandList(size_t index, ze_command_list_handle_t cmdList,
+                        exp_ext::queue_state expected) {
+    exp_ext::queue_state actual = getCommandListState(cmdList);
+
+    if (actual != expected) {
+      std::cerr << "CommandList " << index << " L0 state mismatch: expected "
+                << stateToString(expected) << " but got "
+                << stateToString(actual) << std::endl;
+      assert(false);
+    }
+  }
+
+  exp_ext::queue_state
+  getCommandListState(ze_command_list_handle_t cmdList) const {
+    ze_result_t captureStatus = pfnIsGraphCaptureEnabled(cmdList);
+    return (captureStatus == ZE_RESULT_QUERY_TRUE)
+               ? exp_ext::queue_state::recording
+               : exp_ext::queue_state::executing;
+  }
+
+  const char *stateToString(exp_ext::queue_state state) {
+    return state == exp_ext::queue_state::recording ? "recording" : "executing";
+  }
+};
+
+template <typename... CommandLists>
+CommandListStateVerifier(CommandLists...)
+    -> CommandListStateVerifier<sizeof...(CommandLists)>;
diff --git a/sycl/test/basic_tests/property_traits.cpp b/sycl/test/basic_tests/property_traits.cpp
index 808ba179b36f1..5d276cef62001 100644
--- a/sycl/test/basic_tests/property_traits.cpp
+++ b/sycl/test/basic_tests/property_traits.cpp
@@ -82,6 +82,8 @@ int main() {
   CHECK_IS_PROPERTY(ext::oneapi::experimental::property::graph::updatable);
   CHECK_IS_PROPERTY(
       ext::oneapi::experimental::property::graph::enable_profiling);
+  CHECK_IS_PROPERTY(
+      ext::oneapi::experimental::property::graph::enable_native_recording);
 
   // Node is_property
   CHECK_IS_PROPERTY(
@@ -149,6 +151,10 @@ int main() {
       ext::oneapi::experimental::property::graph::enable_profiling,
       ext::oneapi::experimental::command_graph<
           ext::oneapi::experimental::graph_state::modifiable>);
+  CHECK_IS_PROPERTY_OF(
+      ext::oneapi::experimental::property::graph::enable_native_recording,
+      ext::oneapi::experimental::command_graph<
+          ext::oneapi::experimental::graph_state::modifiable>);
 
   // Node is_property_of
   CHECK_IS_PROPERTY_OF(
@@ -193,6 +199,9 @@ int main() {
   CHECK_IS_NOT_PROPERTY_OF(
       ext::oneapi::experimental::property::graph::enable_profiling,
       NotASYCLObject);
+  CHECK_IS_NOT_PROPERTY_OF(
+      ext::oneapi::experimental::property::graph::enable_native_recording,
+      NotASYCLObject);
 
   CHECK_IS_NOT_PROPERTY_OF(
       ext::oneapi::experimental::property::node::depends_on_all_leaves,
@@ -252,6 +261,8 @@ int main() {
   CHECK_IS_PROPERTY_V(ext::oneapi::experimental::property::graph::updatable);
   CHECK_IS_PROPERTY_V(
       ext::oneapi::experimental::property::graph::enable_profiling);
+  CHECK_IS_PROPERTY_V(
+      ext::oneapi::experimental::property::graph::enable_native_recording);
 
   // Node is_property_v
   CHECK_IS_PROPERTY_V(
@@ -340,6 +351,10 @@ int main() {
       ext::oneapi::experimental::property::graph::enable_profiling,
       ext::oneapi::experimental::command_graph<
           ext::oneapi::experimental::graph_state::modifiable>);
+  CHECK_IS_PROPERTY_OF_V(
+      ext::oneapi::experimental::property::graph::enable_native_recording,
+      ext::oneapi::experimental::command_graph<
+          ext::oneapi::experimental::graph_state::modifiable>);
 
   // Node is_property_of_v
   CHECK_IS_PROPERTY_OF_V(
@@ -384,6 +399,9 @@ int main() {
   CHECK_IS_NOT_PROPERTY_OF_V(
       ext::oneapi::experimental::property::graph::enable_profiling,
       NotASYCLObject);
+  CHECK_IS_NOT_PROPERTY_OF_V(
+      ext::oneapi::experimental::property::graph::enable_native_recording,
+      NotASYCLObject);
 
   CHECK_IS_NOT_PROPERTY_OF_V(
       ext::oneapi::experimental::property::node::depends_on_all_leaves,
diff --git a/sycl/tools/abi_check.py b/sycl/tools/abi_check.py
index 2b285e8917c98..9df8658619c67 100644
--- a/sycl/tools/abi_check.py
+++ b/sycl/tools/abi_check.py
@@ -205,7 +205,11 @@ def main():
         if args.output is None:
             print("Please specify --output option. Quiting.")
             sys.exit(-2)
-        dump_symbols(args.target_library, args.output)
+        try:
+            dump_symbols(args.target_library, args.output)
+        except FileNotFoundError as e:
+            print(f"Could not find file named: {e.filename}")
+            sys.exit(-2)
 
 
 if __name__ == "__main__":
diff --git a/unified-runtime/source/adapters/level_zero/v2/event_provider_counter.cpp b/unified-runtime/source/adapters/level_zero/v2/event_provider_counter.cpp
index 35de3c0efef94..ebff77b6248a9 100644
--- a/unified-runtime/source/adapters/level_zero/v2/event_provider_counter.cpp
+++ b/unified-runtime/source/adapters/level_zero/v2/event_provider_counter.cpp
@@ -52,9 +52,9 @@ static zex_counter_based_event_exp_flags_t createZeFlags(queue_type queueType,
 
   if (queueType == QUEUE_IMMEDIATE) {
     zeFlags |= ZEX_COUNTER_BASED_EVENT_FLAG_IMMEDIATE;
-  } else {
-    zeFlags |= ZEX_COUNTER_BASED_EVENT_FLAG_NON_IMMEDIATE;
   }
+  // Always set non immediate flag for compatibility with graph record & replay
+  zeFlags |= ZEX_COUNTER_BASED_EVENT_FLAG_NON_IMMEDIATE;
 
   return zeFlags;
 }