diff --git a/llvm/lib/SYCLPostLink/ComputeModuleRuntimeInfo.cpp b/llvm/lib/SYCLPostLink/ComputeModuleRuntimeInfo.cpp index 208525bb88663..7865e7f7e83a0 100644 --- a/llvm/lib/SYCLPostLink/ComputeModuleRuntimeInfo.cpp +++ b/llvm/lib/SYCLPostLink/ComputeModuleRuntimeInfo.cpp @@ -192,6 +192,19 @@ PropSetRegTy computeModuleProperties(const Module &M, /*PropVal=*/true); } } + + // Export device_global variables. + for (auto &GV : M.globals()) { + if (!isDeviceGlobalVariable(GV)) + continue; + if (GV.isDeclaration()) // Skip declarations. + continue; + if (hasDeviceImageScopeProperty(GV)) // Skip per-image globals. + continue; + if (GV.hasExternalLinkage()) { + PropSet.add(PropSetRegTy::SYCL_EXPORTED_SYMBOLS, GV.getName(), true); + } + } } if (GlobProps.EmitKernelNames) { for (const auto *F : EntryPoints) { @@ -226,6 +239,26 @@ PropSetRegTy computeModuleProperties(const Module &M, /*PropVal=*/true); } } + + // Check for imported device_global variables. + for (auto &GV : M.globals()) { + if (!GV.isDeclaration()) + continue; + if (!GV.hasExternalLinkage()) + continue; + + // Check if it's a device_global by type name (declarations don't have + // attributes). + std::string TypeName; + raw_string_ostream(TypeName) << *GV.getValueType(); + + if (TypeName.find("device_global") == std::string::npos) + continue; + + if (AllowDeviceImageDependencies) { + PropSet.add(PropSetRegTy::SYCL_IMPORTED_SYMBOLS, GV.getName(), true); + } + } } // Metadata names may be composite so we keep them alive until the diff --git a/sycl/source/detail/context_impl.cpp b/sycl/source/detail/context_impl.cpp index f85de29439dbf..60780c95524cf 100644 --- a/sycl/source/detail/context_impl.cpp +++ b/sycl/source/detail/context_impl.cpp @@ -265,17 +265,22 @@ void context_impl::addDeviceGlobalInitializer( std::vector context_impl::initializeDeviceGlobals( ur_program_handle_t NativePrg, queue_impl &QueueImpl, detail::kernel_bundle_impl *KernelBundleImplPtr) { + if (!MDeviceGlobalNotInitializedCnt.load(std::memory_order_acquire)) return {}; detail::adapter_impl &Adapter = getAdapter(); device_impl &DeviceImpl = QueueImpl.getDeviceImpl(); std::lock_guard NativeProgramLock(MDeviceGlobalInitializersMutex); + auto ImgIt = MDeviceGlobalInitializers.find( std::make_pair(NativePrg, DeviceImpl.getHandleRef())); - if (ImgIt == MDeviceGlobalInitializers.end() || - ImgIt->second.MDeviceGlobalsFullyInitialized) + if (ImgIt == MDeviceGlobalInitializers.end()) { return {}; + } + if (ImgIt->second.MDeviceGlobalsFullyInitialized) { + return {}; + } DeviceGlobalInitializer &InitRef = ImgIt->second; { diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp index f2f060871b41b..4c03ca9ac298e 100644 --- a/sycl/source/detail/program_manager/program_manager.cpp +++ b/sycl/source/detail/program_manager/program_manager.cpp @@ -924,6 +924,7 @@ ProgramManager::getBuiltURProgram(const BinImgWithDeps &ImgWithDeps, // Those extra programs won't be used anymore, just the final // linked result: + bool WasLinked = !ProgramsToLink.empty(); ProgramsToLink.clear(); emitBuiltProgramInfo(BuiltProgram, ContextImpl); @@ -940,7 +941,28 @@ ProgramManager::getBuiltURProgram(const BinImgWithDeps &ImgWithDeps, } } - ContextImpl.addDeviceGlobalInitializer(BuiltProgram, Devs, &MainImg); + // If we linked multiple images, we need to register device_globals from + // all of them, not just the main image. Create a merged binary image. + if (WasLinked && ImgWithDeps.getAll().size() > 1) { + auto MergedImg = + std::make_unique(ImgWithDeps.getAll()); + const RTDeviceBinaryImage *MergedImgPtr = MergedImg.get(); + + // Store the merged image to keep it alive. Use a static map since we need + // it to persist for the lifetime of the program. + static std::mutex MergedImagesMutex; + static std::map> + MergedImages; + { + std::lock_guard Lock(MergedImagesMutex); + MergedImages[BuiltProgram] = std::move(MergedImg); + } + + ContextImpl.addDeviceGlobalInitializer(BuiltProgram, Devs, MergedImgPtr); + } else { + ContextImpl.addDeviceGlobalInitializer(BuiltProgram, Devs, &MainImg); + } // Save program to persistent cache if it is not there if (!DeviceCodeWasInCache) { diff --git a/sycl/test-e2e/IntermediateLib/Inputs/device_global_device_image_lib.cpp b/sycl/test-e2e/IntermediateLib/Inputs/device_global_device_image_lib.cpp new file mode 100644 index 0000000000000..1d1fe58b55f87 --- /dev/null +++ b/sycl/test-e2e/IntermediateLib/Inputs/device_global_device_image_lib.cpp @@ -0,0 +1,43 @@ +#include +#include + +#if defined(_WIN32) +#define API_EXPORT __declspec(dllexport) +#else +#define API_EXPORT +#endif + +// Define and export the device_global +namespace syclex = sycl::ext::oneapi::experimental; +API_EXPORT SYCL_EXTERNAL syclex::device_global test_global; + +// Host function to set the device_global value +extern "C" API_EXPORT void set_test_global(int val) { + sycl::queue q; + q.copy(&val, test_global, 1, 0).wait(); +} + +// Host function to get the device_global value +extern "C" API_EXPORT int get_test_global() { + sycl::queue q; + int result = 0; + q.copy(test_global, &result, 1, 0).wait(); + return result; +} + +// Function that reads device_global in a kernel within the library +extern "C" API_EXPORT int read_global_in_lib() { + sycl::queue q; + int result = 0; + int *dev_result = sycl::malloc_device(1, q); + + q.submit([&](sycl::handler &h) { + h.single_task([=]() { + dev_result[0] = test_global; // Read in library's own kernel + }); + }).wait(); + + q.copy(dev_result, &result, 1).wait(); + sycl::free(dev_result, q); + return result; +} diff --git a/sycl/test-e2e/IntermediateLib/device_global_device_image_app.cpp b/sycl/test-e2e/IntermediateLib/device_global_device_image_app.cpp new file mode 100644 index 0000000000000..5443a093cb8b7 --- /dev/null +++ b/sycl/test-e2e/IntermediateLib/device_global_device_image_app.cpp @@ -0,0 +1,101 @@ +// This test verifies that device_global variables work correctly across +// shared library boundaries when multiple device images are linked together. +// The fix ensures that device globals from all linked images are properly +// registered, not just those from the main image. + +// UNSUPPORTED: opencl && gpu +// UNSUPPORTED-TRACKER: GSD-12574 + +// UNSUPPORTED: cuda, hip +// UNSUPPORTED-INTENDED: CUDA and HIP targets support AoT compilation only and +// cannot do runtime linking. + +// DEFINE: %{fPIC_flag} = %if windows %{%} %else %{-fPIC%} +// DEFINE: %{shared_lib_ext} = %if windows %{dll%} %else %{so%} +// DEFINE: %{cuda_target} = %if target-nvidia %{-fsycl-targets=nvptx64-nvidia-cuda%} +// DEFINE: %{amd_target} = %if target-amd %{-fsycl-targets=amdgcn-amd-amdhsa %amd_arch_options%} +// DEFINE: %{spir_target} = %if !target-nvidia && !target-amd %{-fsycl-targets=spir64%} +// DEFINE: %{lib_export_flags} = -ftarget-export-symbols -fsycl-allow-device-image-dependencies + +// RUN: rm -rf %t.dir && mkdir -p %t.dir + +// RUN: %{run-aux} %clangxx -fsycl %{cuda_target} %{amd_target} %{spir_target} \ +// RUN: %{fPIC_flag} %{lib_export_flags} %shared_lib \ +// RUN: -Wno-unused-command-line-argument \ +// RUN: -o %t.dir/libdevice_global_test.%{shared_lib_ext} \ +// RUN: %S/Inputs/device_global_device_image_lib.cpp + +// RUN: %{run-aux} %clangxx -fsycl %{cuda_target} %{amd_target} %{spir_target} \ +// RUN: %{fPIC_flag} %{lib_export_flags} \ +// RUN: -Wno-unused-command-line-argument \ +// RUN: -o %t.dir/test.exe %s \ +// RUN: %if windows %{ %t.dir/libdevice_global_test.lib%} \ +// RUN: %else %{-L%t.dir -ldevice_global_test -Wl,-rpath=%t.dir%} + +// RUN: %{run} %t.dir/test.exe + +// UNSUPPORTED: target-native_cpu +// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/20142 + +// clang-format off +/* +// build the shared library +clang++ -fsycl -fsycl-targets=spir64 -ftarget-export-symbols -fsycl-allow-device-image-dependencies -shared -o libdevice_global_test.dll ./Inputs/device_global_device_image_lib.cpp + +// build the app - Lin and Win +clang++ -fsycl -fsycl-targets=spir64 -ftarget-export-symbols -fsycl-allow-device-image-dependencies -o test.bin device_global_device_image_app.cpp -L. -ldevice_global_test -Wl,-rpath=. +clang++ -fsycl -fsycl-targets=spir64 -ftarget-export-symbols -fsycl-allow-device-image-dependencies -o test.exe device_global_device_image_app.cpp libdevice_global_test.lib + +// run +./test.bin +*/ +// clang-format on + +#include +#include +#include + +// Declare external symbols from library +namespace syclex = sycl::ext::oneapi::experimental; +extern __SYCL_EXPORT SYCL_EXTERNAL syclex::device_global test_global; + +extern "C" void set_test_global(int val); +extern "C" int get_test_global(); +extern "C" int read_global_in_lib(); + +int main() { + std::cout << "\n=== Test 1: Copy operations from host ===\n"; + set_test_global(42); + int val = get_test_global(); + std::cout << "After set_test_global(42): get_test_global() = " << val + << " (expected 42)\n"; + + std::cout << "\n=== Test 2: Read in library's kernel ===\n"; + int lib_read = read_global_in_lib(); + std::cout << "read_global_in_lib() = " << lib_read << " (expected 42)\n"; + + std::cout << "\n=== Test 3: Read in main's kernel ===\n"; + sycl::queue q; + int *dev_result = sycl::malloc_device(1, q); + + q.submit([&](sycl::handler &h) { + h.single_task([=]() { + dev_result[0] = + test_global; // Read in main's kernel - this tests the fix + }); + }).wait(); + + int main_read = 0; + q.copy(dev_result, &main_read, 1).wait(); + sycl::free(dev_result, q); + + std::cout << "main's kernel read: " << main_read << " (expected 42)\n"; + + if (val == 42 && lib_read == 42 && main_read == 42) { + std::cout << "\n✓ ALL TESTS PASSED\n"; + return 0; + } else { + std::cout << "\n✗ TESTS FAILED\n"; + return 1; + } +}