From cbd94505a7a51c46e833a2f00ff424da4a03d1bb Mon Sep 17 00:00:00 2001 From: Lukasz Dorau <lukasz.dorau@intel.com> Date: Tue, 31 Mar 2026 16:07:32 +0000 Subject: [PATCH 1/2] [UR][L0v2] Fix double zeCommandListClose in batched queue flush Root cause ---------- ur_queue_batched_t::queueFlushUnlocked is called after every enqueueUSMHostAllocExp / enqueueUSMSharedAllocExp / enqueueUSMDeviceAllocExp / enqueueUSMFreeExp to eagerly submit the current batch. The call sequence is: queueFlushUnlocked enqueueCurrentBatchUnlocked() <- (1) zeCommandListClose + zeCommandListImmediateAppendCommandListsExp renewBatchUnlocked() if runBatches.size() >= initialSlotsForBatches (10): queueFinishUnlocked() if !isActiveBatchEmpty(): <- true: enqueuedOperationsCounter > 0 enqueueCurrentBatchUnlocked() <- (2) BUG: double zeCommandListClose + double submit hostSynchronize() <- may hang on some driver versions enqueuedOperationsCounter is incremented by markIssuedCommandInBatch before queueFlushUnlocked is called, but it is not cleared by enqueueCurrentBatchUnlocked, so queueFinishUnlocked's !isActiveBatchEmpty() guard does not protect against the re-entry. With 2 queues and 256 iterations the bug fires ~92 times. On certain GPU driver versions the immediate command list enters a state from which zeCommandListHostSynchronize never returns, hanging the test: UR_ADAPTERS_FORCE_LOAD=lib/libur_adapter_level_zero_v2.so \ UR_L0_V2_FORCE_BATCHED=1 \ ./test/adapters/level_zero/enqueue_alloc-test \ --gtest_filter=*urL0EnqueueAllocMultiQueueSameDeviceTest.SuccessMt* Fix --- Remove the pre-close of the active batch from queueFlushUnlocked and move enqueueCurrentBatchUnlocked() into renewBatchUnlocked's else branch. 
This ensures that when the batch-slot limit is reached the active batch is still open, so the existing delegation to queueFinishUnlocked closes and submits it exactly once via its !isActiveBatchEmpty() guard: renewBatchUnlocked() if runBatches.size() >= initialSlotsForBatches (10): queueFinishUnlocked() if !isActiveBatchEmpty(): <- closes + submits exactly once enqueueCurrentBatchUnlocked() hostSynchronize() queueFinishPoolsUnlocked() batchFinish() else: enqueueCurrentBatchUnlocked() <- normal path renewRegularUnlocked() This keeps all finish logic inside queueFinishUnlocked, making the code easier to maintain and less prone to bugs if queueFinishUnlocked changes. With queueFlushUnlocked reduced to a single call to renewBatchUnlocked, the wrapper is no longer needed. Remove it and call renewBatchUnlocked directly at all former call sites. Tested-by: UR_ADAPTERS_FORCE_LOAD=lib/libur_adapter_level_zero_v2.so \ UR_L0_V2_FORCE_BATCHED=1 \ ./test/adapters/level_zero/enqueue_alloc-test \ (81 passed, 6 skipped, 0 failed) Signed-off-by: Lukasz Dorau <lukasz.dorau@intel.com> --- .../adapters/level_zero/v2/queue_batched.cpp | 30 ++++++++----------- .../adapters/level_zero/v2/queue_batched.hpp | 2 -- 2 files changed, 13 insertions(+), 19 deletions(-) diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_batched.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_batched.cpp index 3ed9132d007a8..4c98d749281cc 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_batched.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_batched.cpp @@ -125,6 +125,7 @@ ur_queue_batched_t::renewBatchUnlocked(locked<batch_manager> &batchLocked) { if (batchLocked->isLimitOfUsedCommandListsReached()) { return queueFinishUnlocked(batchLocked); } else { + UR_CALL(batchLocked->enqueueCurrentBatchUnlocked()); return batchLocked->renewRegularUnlocked(getNewRegularCmdList()); } } @@ -157,7 +158,7 @@ ur_queue_batched_t::onEventWaitListUse(ur_event_generation_t batch_generation) { auto batchLocked = 
currentCmdLists.lock(); if (batchLocked->isCurrentGeneration(batch_generation)) { - return queueFlushUnlocked(batchLocked); + return renewBatchUnlocked(batchLocked); } else { return UR_RESULT_SUCCESS; } @@ -228,7 +229,9 @@ ur_result_t batch_manager::batchFinish() { UR_CALL(activeBatch.releaseSubmittedKernels()); if (!isActiveBatchEmpty()) { - // Should have been enqueued as part of queueFinishUnlocked + // The active batch was already submitted to the immediate command list + // by queueFinishUnlocked. Reset it here so it is ready to record new + // commands. TRACK_SCOPE_LATENCY("ur_queue_batched_t::resetRegCmdlist"); ZE2UR_CALL(zeCommandListReset, (activeBatch.getZeCommandList())); @@ -432,7 +435,7 @@ ur_result_t ur_queue_batched_t::enqueueUSMFreeExp( createEventIfRequestedRegular(phEvent, lockedBatch->getCurrentGeneration()))); - return queueFlushUnlocked(lockedBatch); + return renewBatchUnlocked(lockedBatch); } ur_result_t ur_queue_batched_t::enqueueMemBufferMap( @@ -634,7 +637,7 @@ ur_result_t ur_queue_batched_t::enqueueEventsWaitWithBarrier( phEvent, lockedBatch->getCurrentGeneration()))); } - return queueFlushUnlocked(lockedBatch); + return renewBatchUnlocked(lockedBatch); } ur_result_t @@ -652,7 +655,7 @@ ur_queue_batched_t::enqueueEventsWait(uint32_t numEventsInWaitList, waitListView, createEventIfRequestedRegular( phEvent, lockedBatch->getCurrentGeneration()))); - return queueFlushUnlocked(lockedBatch); + return renewBatchUnlocked(lockedBatch); } ur_result_t ur_queue_batched_t::enqueueMemBufferCopy( @@ -818,7 +821,7 @@ ur_result_t ur_queue_batched_t::enqueueUSMDeviceAllocExp( lockedBatch->getCurrentGeneration()), UR_USM_TYPE_DEVICE)); - return queueFlushUnlocked(lockedBatch); + return renewBatchUnlocked(lockedBatch); } ur_result_t ur_queue_batched_t::enqueueUSMSharedAllocExp( @@ -840,7 +843,7 @@ ur_result_t ur_queue_batched_t::enqueueUSMSharedAllocExp( lockedBatch->getCurrentGeneration()), UR_USM_TYPE_SHARED)); - return queueFlushUnlocked(lockedBatch); + 
return renewBatchUnlocked(lockedBatch); } ur_result_t ur_queue_batched_t::enqueueUSMHostAllocExp( @@ -861,7 +864,7 @@ lockedBatch->getCurrentGeneration()), UR_USM_TYPE_HOST)); - return queueFlushUnlocked(lockedBatch); + return renewBatchUnlocked(lockedBatch); } ur_result_t ur_queue_batched_t::bindlessImagesImageCopyExp( @@ -969,7 +972,7 @@ ur_result_t ur_queue_batched_t::enqueueCommandBufferExp( // command buffer batch (also a regular list) to preserve the order of // operations if (!lockedBatch->isActiveBatchEmpty()) { - UR_CALL(queueFlushUnlocked(lockedBatch)); + UR_CALL(renewBatchUnlocked(lockedBatch)); } // Regular lists cannot be appended to other regular lists for execution, only @@ -1075,20 +1078,13 @@ ur_queue_batched_t::queueGetNativeHandle(ur_queue_native_desc_t * /*pDesc*/, return UR_RESULT_SUCCESS; } -ur_result_t -ur_queue_batched_t::queueFlushUnlocked(locked<batch_manager> &batchLocked) { - UR_CALL(batchLocked->enqueueCurrentBatchUnlocked()); - - return renewBatchUnlocked(batchLocked); -} - ur_result_t ur_queue_batched_t::queueFlush() { auto batchLocked = currentCmdLists.lock(); if (batchLocked->isActiveBatchEmpty()) { return UR_RESULT_SUCCESS; } else { - return queueFlushUnlocked(batchLocked); + return renewBatchUnlocked(batchLocked); } } diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_batched.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_batched.hpp index df1adab5bc838..0f1d0932f227b 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_batched.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_batched.hpp @@ -199,8 +199,6 @@ struct ur_queue_batched_t : ur_object, ur_queue_t_ { ur_result_t queueFinishUnlocked(locked<batch_manager> &batchLocked); - ur_result_t queueFlushUnlocked(locked<batch_manager> &batchLocked); - ur_result_t markIssuedCommandInBatch(locked<batch_manager> &batchLocked); public: From c7c9a6ffd113664ff27ffce435e48fafcbb6d79c Mon Sep 17 00:00:00 2001 From: Lukasz Dorau <lukasz.dorau@intel.com> Date: Tue, 24 Feb 
2026 10:54:31 +0000 Subject: [PATCH 2/2] [UR][L0v2] Parameterize urL0EnqueueAllocMultiQueueSameDeviceTest over all queue types Enable the urL0EnqueueAllocMultiQueueSameDeviceTest and parameterize it over all queue submission modes (UR_QUEUE_FLAG_SUBMISSION_BATCHED and UR_QUEUE_FLAG_SUBMISSION_IMMEDIATE) by: - Removing SKIP_IF_BATCHED_QUEUE to enable the test for batched queues. - Changing the base class template parameter from EnqueueAllocMultiQueueTestParam to uur::MultiQueueParam so that the queue mode becomes part of the test parameter. - Adding getMultiQueueParam(), getAllocParam() and getQueueFlags() helpers to the fixture for clean access to the two parts of the parameter tuple. getMultiQueueParam() calls the base class getter using the fully qualified name uur::urContextTestWithParam<...>::getParam() to avoid ambiguity. The names are specific enough to not be confused with GoogleTest's conventional GetParam(). - Creating queues with the parameterized flag via ur_queue_properties_t instead of a hardcoded UR_QUEUE_FLAG_SUBMISSION_BATCHED. - Switching the test suite macro from UUR_DEVICE_TEST_SUITE_WITH_PARAM to UUR_MULTI_QUEUE_TYPE_TEST_SUITE_WITH_PARAM and the printer to deviceTestWithParamPrinterMulti, which expands the suite to cover both queue modes automatically. - Updating all three test bodies (SuccessMt, SuccessReuse, SuccessDependantMt) to use getAllocParam() instead of std::get<1>(this->GetParam()), and restoring the numQueues parameter in SuccessMt to getAllocParam().numQueues. This ensures both batched and immediate queues are covered by default test runs without requiring UR_L0_V2_FORCE_BATCHED=1. 
Signed-off-by: Lukasz Dorau <lukasz.dorau@intel.com> --- .../adapters/level_zero/enqueue_alloc.cpp | 57 ++++++++++++------- 1 file changed, 35 insertions(+), 22 deletions(-) diff --git a/unified-runtime/test/adapters/level_zero/enqueue_alloc.cpp b/unified-runtime/test/adapters/level_zero/enqueue_alloc.cpp index c322a14768186..b2c1128b770ae 100644 --- a/unified-runtime/test/adapters/level_zero/enqueue_alloc.cpp +++ b/unified-runtime/test/adapters/level_zero/enqueue_alloc.cpp @@ -73,16 +73,19 @@ std::ostream &operator<<(std::ostream &os, } struct urL0EnqueueAllocMultiQueueSameDeviceTest - : uur::urContextTestWithParam<EnqueueAllocMultiQueueTestParam> { + : uur::urContextTestWithParam< + uur::MultiQueueParam<EnqueueAllocMultiQueueTestParam>> { void SetUp() override { UUR_RETURN_ON_FATAL_FAILURE(urContextTestWithParam::SetUp()); - auto param = std::get<1>(this->GetParam()); + const auto &param = getAllocParam(); + + ur_queue_properties_t props = {UR_STRUCTURE_TYPE_QUEUE_PROPERTIES, nullptr, + getQueueFlags()}; queues.reserve(param.numQueues); for (size_t i = 0; i < param.numQueues; i++) { ur_queue_handle_t queue = nullptr; - ASSERT_SUCCESS(urQueueCreate(context, device, 0, &queue)); - SKIP_IF_BATCHED_QUEUE(queue); + ASSERT_SUCCESS(urQueueCreate(context, device, &props, &queue)); queues.push_back(queue); } } @@ -95,6 +98,20 @@ struct urL0EnqueueAllocMultiQueueSameDeviceTest UUR_RETURN_ON_FATAL_FAILURE(urContextTestWithParam::TearDown()); } + const uur::MultiQueueParam<EnqueueAllocMultiQueueTestParam> & + getMultiQueueParam() const { + return uur::urContextTestWithParam< + uur::MultiQueueParam<EnqueueAllocMultiQueueTestParam>>::getParam(); + } + + const EnqueueAllocMultiQueueTestParam &getAllocParam() const { + return std::get<0>(this->getMultiQueueParam()); + } + + ur_queue_flag_t getQueueFlags() const { + return std::get<1>(this->getMultiQueueParam()); + } + + std::vector<ur_queue_handle_t> queues; }; @@ -322,7 +339,7 @@ TEST_P(urL0EnqueueAllocTest, SuccessWithKernelRepeat) { ValidateEnqueueFree(ptr2); } -UUR_DEVICE_TEST_SUITE_WITH_PARAM( +UUR_MULTI_QUEUE_TYPE_TEST_SUITE_WITH_PARAM( urL0EnqueueAllocMultiQueueSameDeviceTest, ::testing::ValuesIn({ 
EnqueueAllocMultiQueueTestParam{1024, 256, 8, urEnqueueUSMHostAllocExp, @@ -334,20 +351,16 @@ UUR_DEVICE_TEST_SUITE_WITH_PARAM( urEnqueueUSMDeviceAllocExp, uur::GetDeviceUSMDeviceSupport}, }), - uur::deviceTestWithParamPrinter<EnqueueAllocMultiQueueTestParam>); + uur::deviceTestWithParamPrinterMulti<EnqueueAllocMultiQueueTestParam>); TEST_P(urL0EnqueueAllocMultiQueueSameDeviceTest, SuccessMt) { - const size_t allocSize = std::get<1>(this->GetParam()).allocSize; - const size_t numQueues = std::get<1>(this->GetParam()).numQueues; - const size_t iterations = std::get<1>(this->GetParam()).iterations; + const size_t allocSize = getAllocParam().allocSize; + const size_t numQueues = getAllocParam().numQueues; + const size_t iterations = getAllocParam().iterations; const auto enqueueUSMAllocFunc = - std::get<1>(this->GetParam()).funcParams.enqueueUSMAllocFunc; + getAllocParam().funcParams.enqueueUSMAllocFunc; const auto checkUSMSupportFunc = - std::get<1>(this->GetParam()).funcParams.checkUSMSupportFunc; - - if (numQueues > 0) { - SKIP_IF_BATCHED_QUEUE(queues[0]); - } + getAllocParam().funcParams.checkUSMSupportFunc; ur_device_usm_access_capability_flags_t USMSupport = 0; ASSERT_SUCCESS(checkUSMSupportFunc(device, USMSupport)); @@ -394,11 +407,11 @@ TEST_P(urL0EnqueueAllocMultiQueueSameDeviceTest, SuccessMt) { TEST_P(urL0EnqueueAllocMultiQueueSameDeviceTest, SuccessReuse) { GTEST_SKIP() << "Multi queue reuse is not supported."; - const size_t allocSize = std::get<1>(this->GetParam()).allocSize; + const size_t allocSize = getAllocParam().allocSize; const auto enqueueUSMAllocFunc = - std::get<1>(this->GetParam()).funcParams.enqueueUSMAllocFunc; + getAllocParam().funcParams.enqueueUSMAllocFunc; const auto checkUSMSupportFunc = - std::get<1>(this->GetParam()).funcParams.checkUSMSupportFunc; + getAllocParam().funcParams.checkUSMSupportFunc; ur_device_usm_access_capability_flags_t USMSupport = 0; ASSERT_SUCCESS(checkUSMSupportFunc(device, USMSupport)); @@ -457,12 +470,12 @@ TEST_P(urL0EnqueueAllocMultiQueueSameDeviceTest, SuccessReuse) { } 
TEST_P(urL0EnqueueAllocMultiQueueSameDeviceTest, SuccessDependantMt) { - const size_t allocSize = std::get<1>(this->GetParam()).allocSize; - const size_t iterations = std::get<1>(this->GetParam()).iterations; + const size_t allocSize = getAllocParam().allocSize; + const size_t iterations = getAllocParam().iterations; const auto enqueueUSMAllocFunc = - std::get<1>(this->GetParam()).funcParams.enqueueUSMAllocFunc; + getAllocParam().funcParams.enqueueUSMAllocFunc; const auto checkUSMSupportFunc = - std::get<1>(this->GetParam()).funcParams.checkUSMSupportFunc; + getAllocParam().funcParams.checkUSMSupportFunc; ur_device_usm_access_capability_flags_t USMSupport = 0; ASSERT_SUCCESS(checkUSMSupportFunc(device, USMSupport));