From 19e0c2e7aed430bd3f29132cdad97e5149264748 Mon Sep 17 00:00:00 2001 From: Krzysztof Weronski Date: Mon, 30 Mar 2026 15:24:43 +0200 Subject: [PATCH 1/4] Fix stale device global initializers on RTC program reuse Prevent stale MDeviceGlobalInitializers entries from colliding with new entries when program handles are reused in RuntimeCompiled kernel loops. Implement explicit cleanup via removeDeviceGlobalInitializer() during program teardown. Update test case to run multiple iterations to catch this error in the future. Replace end line escape character with std::endl to force buffer flushing, as otherwise the error message during a reproduction may not be printed on the screen. --- sycl/source/detail/context_impl.cpp | 25 +++++++++++++++++++ sycl/source/detail/context_impl.hpp | 6 +++++ .../program_manager/program_manager.cpp | 1 + .../KernelCompiler/sycl_device_globals.cpp | 22 ++++++++++++++-- 4 files changed, 52 insertions(+), 2 deletions(-) diff --git a/sycl/source/detail/context_impl.cpp b/sycl/source/detail/context_impl.cpp index f85de29439dbf..b8d96f366d0fc 100644 --- a/sycl/source/detail/context_impl.cpp +++ b/sycl/source/detail/context_impl.cpp @@ -262,6 +262,31 @@ void context_impl::addDeviceGlobalInitializer( } } +void context_impl::removeDeviceGlobalInitializer( + ur_program_handle_t Program, const RTDeviceBinaryImage *BinImage) { + std::lock_guard Lock(MDeviceGlobalInitializersMutex); + + for (auto It = MDeviceGlobalInitializers.begin(); + It != MDeviceGlobalInitializers.end();) { + const bool ProgramMatches = It->first.first == Program; + const bool ImageMatches = !BinImage || It->second.MBinImage == BinImage; + + if (!ProgramMatches || !ImageMatches) { + ++It; + continue; + } + + { + std::lock_guard InitLock(It->second.MDeviceGlobalInitMutex); + It->second.ClearEvents(getAdapter()); + } + if (!It->second.MDeviceGlobalsFullyInitialized) + --MDeviceGlobalNotInitializedCnt; + + It = MDeviceGlobalInitializers.erase(It); + } +} + std::vector context_impl::initializeDeviceGlobals( ur_program_handle_t NativePrg, queue_impl &QueueImpl, detail::kernel_bundle_impl *KernelBundleImplPtr) { diff --git a/sycl/source/detail/context_impl.hpp b/sycl/source/detail/context_impl.hpp index f86627e95d56a..1130ddf64dbfb 100644 --- a/sycl/source/detail/context_impl.hpp +++ b/sycl/source/detail/context_impl.hpp @@ -194,6 +194,12 @@ class context_impl : public std::enable_shared_from_this { devices_range Devs, const RTDeviceBinaryImage *BinImage); + /// Removes device global initializers for a program. If BinImage is not + /// null, only initializers associated with that image are removed. + void + removeDeviceGlobalInitializer(ur_program_handle_t Program, + const RTDeviceBinaryImage *BinImage = nullptr); + /// Initializes device globals for a program on the associated queue. std::vector initializeDeviceGlobals(ur_program_handle_t NativePrg, queue_impl &QueueImpl, diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp index f2f060871b41b..e89c41a029d48 100644 --- a/sycl/source/detail/program_manager/program_manager.cpp +++ b/sycl/source/detail/program_manager/program_manager.cpp @@ -1826,6 +1826,7 @@ void ProgramManager::removeImages(sycl_device_binaries DeviceBinary) { auto CurIt = It++; if (CurIt->second.second == Img) { if (auto ContextImpl = CurIt->second.first.lock()) { + ContextImpl->removeDeviceGlobalInitializer(CurIt->first, Img); ContextImpl->getKernelProgramCache().removeAllRelatedEntries( Img->getImageID()); } diff --git a/sycl/test-e2e/KernelCompiler/sycl_device_globals.cpp b/sycl/test-e2e/KernelCompiler/sycl_device_globals.cpp index 7c14538dbe7e3..f611e9c51e821 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_device_globals.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_device_globals.cpp @@ -80,7 +80,7 @@ int test_device_global() { auto checkVal = [&](int32_t expected) { val = -1; q.memcpy(&val, dgAddr, dgSize).wait(); - std::cout << "val: " << val << " == " << expected << '\n'; + std::cout << "val: " << val << " == " << expected << std::endl; assert(val == expected); }; @@ -162,9 +162,27 @@ int test_error() { return 0; } +#ifndef MCR_TEST_COUNT +#define MCR_TEST_COUNT 25 +#endif + int main() { #ifdef SYCL_EXT_ONEAPI_KERNEL_COMPILER - return test_device_global() || test_error(); + constexpr std::size_t testCount{MCR_TEST_COUNT}; + std::size_t testIteration{1}, failCount{}; + int constexpr OK = 0; + + for (; testIteration <= testCount; ++testIteration) { + std::cout << "Test iteration: " << testIteration << " / " << testCount; + std::cout << ", Failed iterations: " << failCount << std::endl; + + if (test_device_global() != OK) { + ++failCount; + std::cout << "FAILED" << std::endl; + } + } + + return failCount || test_error(); #else static_assert(false, "Kernel Compiler feature test macro undefined"); #endif From 9d47b2555011284de34412e8ea13dc43073efe14 Mon Sep 17 00:00:00 2001 From: Krzysztof Weronski Date: Wed, 1 Apr 2026 14:50:32 +0200 Subject: [PATCH 2/4] Test in debug mode --- .github/workflows/sycl-linux-build.yml | 2 +- .github/workflows/sycl-linux-precommit.yml | 2 +- sycl/test-e2e/lit.cfg.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/sycl-linux-build.yml b/.github/workflows/sycl-linux-build.yml index 8bba9a4c08be0..996190e71dae0 100644 --- a/.github/workflows/sycl-linux-build.yml +++ b/.github/workflows/sycl-linux-build.yml @@ -203,7 +203,7 @@ jobs: python3 $GITHUB_WORKSPACE/src/buildbot/configure.py -w $GITHUB_WORKSPACE \ -s $GITHUB_WORKSPACE/src -o $GITHUB_WORKSPACE/build \ -DCMAKE_INSTALL_PREFIX=$GITHUB_WORKSPACE/toolchain \ - -t Release \ + -t Debug \ --ci-defaults --use-zstd ${{ inputs.build_configure_extra_args }} \ -DCMAKE_C_COMPILER_LAUNCHER=ccache \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ diff --git a/.github/workflows/sycl-linux-precommit.yml b/.github/workflows/sycl-linux-precommit.yml index f9ed0e8734f05..c30331bf24a6f 100644 --- a/.github/workflows/sycl-linux-precommit.yml +++ b/.github/workflows/sycl-linux-precommit.yml @@ -102,7 +102,7 @@ jobs: target_devices: native_cpu:cpu sycl_compiler: $GITHUB_WORKSPACE/toolchain/bin/clang++ extra_lit_opts: --param sycl_build_targets="native_cpu" - extra_cmake_args: -DSYCL_TEST_E2E_TARGETS="native_cpu:cpu" -DSYCL_TEST_E2E_STANDALONE=ON + extra_cmake_args: -DSYCL_TEST_E2E_TARGETS="native_cpu:cpu" -DSYCL_TEST_E2E_STANDALONE=ON -DCMAKE_BUILD_TYPE="Debug" # If a PR changes CUDA adapter, run the build on Ubuntu 22.04 as well. # Ubuntu 22.04 container has CUDA 12.1 installed while Ubuntu 24.0 image diff --git a/sycl/test-e2e/lit.cfg.py b/sycl/test-e2e/lit.cfg.py index b7aad444ea127..1201cbed45e04 100644 --- a/sycl/test-e2e/lit.cfg.py +++ b/sycl/test-e2e/lit.cfg.py @@ -1167,7 +1167,7 @@ def get_sycl_ls_verbose(sycl_device, env): config.substitutions.append(("%clangxx", " true ")) config.substitutions.append(("%clang", " true ")) else: - clangxx = " " + config.dpcpp_compiler + " -Werror " + clangxx = " " + config.dpcpp_compiler + " -Werror -O0 -g " if "preview-mode" in config.available_features: # Technically, `-fpreview-breaking-changes` is reported as unused option # if used without `-fsycl`. However, we have far less tests compiling From ebda68afdefb015d035d055fa6bebfbe731afb59 Mon Sep 17 00:00:00 2001 From: Krzysztof Weronski Date: Thu, 2 Apr 2026 15:11:50 +0200 Subject: [PATCH 3/4] Add a minimal reproducible example --- .../KernelCompiler/sycl_device_globals.cpp | 40 ++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/sycl/test-e2e/KernelCompiler/sycl_device_globals.cpp b/sycl/test-e2e/KernelCompiler/sycl_device_globals.cpp index f611e9c51e821..d3ee0678fe73f 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_device_globals.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_device_globals.cpp @@ -162,6 +162,44 @@ int test_error() { return 0; } +int test_device_global_min() { + namespace syclex = sycl::ext::oneapi::experimental; + using source_kb = sycl::kernel_bundle; + using exe_kb = sycl::kernel_bundle; + + sycl::queue q; + sycl::context ctx = q.get_context(); + sycl::device d = q.get_device(); + + source_kb kbSrc = syclex::create_kernel_bundle_from_source( + ctx, syclex::source_language::sycl, DGSource); + + exe_kb kbExe1 = syclex::build(kbSrc); + auto addK = kbExe1.ext_oneapi_get_kernel("ff_dg_adder"); + + void *dgAddr = kbExe1.ext_oneapi_get_device_global_address("DG", d); + size_t dgSize = kbExe1.ext_oneapi_get_device_global_size("DG"); + assert(dgSize == 4); + + int32_t val; + auto checkVal = [&](int32_t expected) { + val = -1; + q.memcpy(&val, dgAddr, dgSize).wait(); + std::cout << "val: " << val << " == " << expected << std::endl; + assert(val == expected); + }; + + // Device globals are zero-initialized. + checkVal(0); + + // Set the DG. + val = 123; + q.memcpy(dgAddr, &val, dgSize).wait(); + checkVal(123); + + return 0; +} + #ifndef MCR_TEST_COUNT #define MCR_TEST_COUNT 25 #endif @@ -176,7 +214,7 @@ int main() { std::cout << "Test iteration: " << testIteration << " / " << testCount; std::cout << ", Failed iterations: " << failCount << std::endl; - if (test_device_global() != OK) { + if (test_device_global_min() != OK) { ++failCount; std::cout << "FAILED" << std::endl; } From 95b1251e2fee51b811c70eade8c739c1f1468771 Mon Sep 17 00:00:00 2001 From: Krzysztof Weronski Date: Fri, 3 Apr 2026 10:23:38 +0200 Subject: [PATCH 4/4] Extend test timeout --- sycl/test-e2e/lit.cfg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test-e2e/lit.cfg.py b/sycl/test-e2e/lit.cfg.py index 1201cbed45e04..ffd15f0babcf9 100644 --- a/sycl/test-e2e/lit.cfg.py +++ b/sycl/test-e2e/lit.cfg.py @@ -1198,7 +1198,7 @@ def get_sycl_ls_verbose(sycl_device, env): import psutil if config.test_mode == "run-only": - lit_config.maxIndividualTestTime = 300 + lit_config.maxIndividualTestTime = 3000 else: lit_config.maxIndividualTestTime = 600