diff --git a/.github/workflows/sycl-linux-build.yml b/.github/workflows/sycl-linux-build.yml index 8bba9a4c08be0..996190e71dae0 100644 --- a/.github/workflows/sycl-linux-build.yml +++ b/.github/workflows/sycl-linux-build.yml @@ -203,7 +203,7 @@ jobs: python3 $GITHUB_WORKSPACE/src/buildbot/configure.py -w $GITHUB_WORKSPACE \ -s $GITHUB_WORKSPACE/src -o $GITHUB_WORKSPACE/build \ -DCMAKE_INSTALL_PREFIX=$GITHUB_WORKSPACE/toolchain \ - -t Release \ + -t Debug \ --ci-defaults --use-zstd ${{ inputs.build_configure_extra_args }} \ -DCMAKE_C_COMPILER_LAUNCHER=ccache \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ diff --git a/.github/workflows/sycl-linux-precommit.yml b/.github/workflows/sycl-linux-precommit.yml index f9ed0e8734f05..c30331bf24a6f 100644 --- a/.github/workflows/sycl-linux-precommit.yml +++ b/.github/workflows/sycl-linux-precommit.yml @@ -102,7 +102,7 @@ jobs: target_devices: native_cpu:cpu sycl_compiler: $GITHUB_WORKSPACE/toolchain/bin/clang++ extra_lit_opts: --param sycl_build_targets="native_cpu" - extra_cmake_args: -DSYCL_TEST_E2E_TARGETS="native_cpu:cpu" -DSYCL_TEST_E2E_STANDALONE=ON + extra_cmake_args: -DSYCL_TEST_E2E_TARGETS="native_cpu:cpu" -DSYCL_TEST_E2E_STANDALONE=ON -DCMAKE_BUILD_TYPE="Debug" # If a PR changes CUDA adapter, run the build on Ubuntu 22.04 as well. # Ubuntu 22.04 container has CUDA 12.1 installed while Ubuntu 24.0 image diff --git a/sycl/source/detail/context_impl.cpp b/sycl/source/detail/context_impl.cpp index f85de29439dbf..b8d96f366d0fc 100644 --- a/sycl/source/detail/context_impl.cpp +++ b/sycl/source/detail/context_impl.cpp @@ -262,6 +262,31 @@ void context_impl::addDeviceGlobalInitializer( } } +void context_impl::removeDeviceGlobalInitializer( + ur_program_handle_t Program, const RTDeviceBinaryImage *BinImage) { + std::lock_guard Lock(MDeviceGlobalInitializersMutex); + + for (auto It = MDeviceGlobalInitializers.begin(); + It != MDeviceGlobalInitializers.end();) { + const bool ProgramMatches = It->first.first == Program; + const bool ImageMatches = !BinImage || It->second.MBinImage == BinImage; + + if (!ProgramMatches || !ImageMatches) { + ++It; + continue; + } + + { + std::lock_guard InitLock(It->second.MDeviceGlobalInitMutex); + It->second.ClearEvents(getAdapter()); + } + if (!It->second.MDeviceGlobalsFullyInitialized) + --MDeviceGlobalNotInitializedCnt; + + It = MDeviceGlobalInitializers.erase(It); + } +} + std::vector context_impl::initializeDeviceGlobals( ur_program_handle_t NativePrg, queue_impl &QueueImpl, detail::kernel_bundle_impl *KernelBundleImplPtr) { diff --git a/sycl/source/detail/context_impl.hpp b/sycl/source/detail/context_impl.hpp index f86627e95d56a..1130ddf64dbfb 100644 --- a/sycl/source/detail/context_impl.hpp +++ b/sycl/source/detail/context_impl.hpp @@ -194,6 +194,12 @@ class context_impl : public std::enable_shared_from_this { devices_range Devs, const RTDeviceBinaryImage *BinImage); + /// Removes device global initializers for a program. If BinImage is not + /// null, only initializers associated with that image are removed. + void + removeDeviceGlobalInitializer(ur_program_handle_t Program, + const RTDeviceBinaryImage *BinImage = nullptr); + /// Initializes device globals for a program on the associated queue. std::vector initializeDeviceGlobals(ur_program_handle_t NativePrg, queue_impl &QueueImpl, diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp index f2f060871b41b..e89c41a029d48 100644 --- a/sycl/source/detail/program_manager/program_manager.cpp +++ b/sycl/source/detail/program_manager/program_manager.cpp @@ -1826,6 +1826,7 @@ void ProgramManager::removeImages(sycl_device_binaries DeviceBinary) { auto CurIt = It++; if (CurIt->second.second == Img) { if (auto ContextImpl = CurIt->second.first.lock()) { + ContextImpl->removeDeviceGlobalInitializer(CurIt->first, Img); ContextImpl->getKernelProgramCache().removeAllRelatedEntries( Img->getImageID()); } diff --git a/sycl/test-e2e/KernelCompiler/sycl_device_globals.cpp b/sycl/test-e2e/KernelCompiler/sycl_device_globals.cpp index 7c14538dbe7e3..d3ee0678fe73f 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_device_globals.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_device_globals.cpp @@ -80,7 +80,7 @@ int test_device_global() { auto checkVal = [&](int32_t expected) { val = -1; q.memcpy(&val, dgAddr, dgSize).wait(); - std::cout << "val: " << val << " == " << expected << '\n'; + std::cout << "val: " << val << " == " << expected << std::endl; assert(val == expected); }; @@ -162,9 +162,65 @@ int test_error() { return 0; } +int test_device_global_min() { + namespace syclex = sycl::ext::oneapi::experimental; + using source_kb = sycl::kernel_bundle; + using exe_kb = sycl::kernel_bundle; + + sycl::queue q; + sycl::context ctx = q.get_context(); + sycl::device d = q.get_device(); + + source_kb kbSrc = syclex::create_kernel_bundle_from_source( + ctx, syclex::source_language::sycl, DGSource); + + exe_kb kbExe1 = syclex::build(kbSrc); + auto addK = kbExe1.ext_oneapi_get_kernel("ff_dg_adder"); + + void *dgAddr = kbExe1.ext_oneapi_get_device_global_address("DG", d); + size_t dgSize = kbExe1.ext_oneapi_get_device_global_size("DG"); + assert(dgSize == 4); + + int32_t val; + auto checkVal = [&](int32_t expected) { + val = -1; + q.memcpy(&val, dgAddr, dgSize).wait(); + std::cout << "val: " << val << " == " << expected << std::endl; + assert(val == expected); + }; + + // Device globals are zero-initialized. + checkVal(0); + + // Set the DG. + val = 123; + q.memcpy(dgAddr, &val, dgSize).wait(); + checkVal(123); + + return 0; +} + +#ifndef MCR_TEST_COUNT +#define MCR_TEST_COUNT 25 +#endif + int main() { #ifdef SYCL_EXT_ONEAPI_KERNEL_COMPILER - return test_device_global() || test_error(); + constexpr std::size_t testCount{MCR_TEST_COUNT}; + std::size_t testIteration{1}, failCount{}; + int constexpr OK = 0; + + for (; testIteration <= testCount; ++testIteration) { + std::cout << "Test iteration: " << testIteration << " / " << testCount; + std::cout << ", Failed iterations: " << failCount << std::endl; + + if (test_device_global_min() != OK) { + ++failCount; + std::cout << "FAILED" << std::endl; + } + } + + return failCount || test_error(); #else static_assert(false, "Kernel Compiler feature test macro undefined"); #endif diff --git a/sycl/test-e2e/lit.cfg.py b/sycl/test-e2e/lit.cfg.py index b7aad444ea127..ffd15f0babcf9 100644 --- a/sycl/test-e2e/lit.cfg.py +++ b/sycl/test-e2e/lit.cfg.py @@ -1167,7 +1167,7 @@ def get_sycl_ls_verbose(sycl_device, env): config.substitutions.append(("%clangxx", " true ")) config.substitutions.append(("%clang", " true ")) else: - clangxx = " " + config.dpcpp_compiler + " -Werror " + clangxx = " " + config.dpcpp_compiler + " -Werror -O0 -g " if "preview-mode" in config.available_features: # Technically, `-fpreview-breaking-changes` is reported as unused option # if used without `-fsycl`. However, we have far less tests compiling @@ -1198,7 +1198,7 @@ def get_sycl_ls_verbose(sycl_device, env): import psutil if config.test_mode == "run-only": - lit_config.maxIndividualTestTime = 300 + lit_config.maxIndividualTestTime = 3000 else: lit_config.maxIndividualTestTime = 600