Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions llvm/lib/SYCLPostLink/ComputeModuleRuntimeInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,19 @@ PropSetRegTy computeModuleProperties(const Module &M,
/*PropVal=*/true);
}
}

// Export device_global variables.
for (auto &GV : M.globals()) {
if (!isDeviceGlobalVariable(GV))
continue;
if (GV.isDeclaration()) // Skip declarations.
continue;
if (hasDeviceImageScopeProperty(GV)) // Skip per-image globals.
continue;
if (GV.hasExternalLinkage()) {
PropSet.add(PropSetRegTy::SYCL_EXPORTED_SYMBOLS, GV.getName(), true);
}
}
}
if (GlobProps.EmitKernelNames) {
for (const auto *F : EntryPoints) {
Expand Down Expand Up @@ -226,6 +239,26 @@ PropSetRegTy computeModuleProperties(const Module &M,
/*PropVal=*/true);
}
}

// Check for imported device_global variables.
for (auto &GV : M.globals()) {
if (!GV.isDeclaration())
continue;
if (!GV.hasExternalLinkage())
continue;

// Check if it's a device_global by type name (declarations don't have
// attributes).
std::string TypeName;
raw_string_ostream(TypeName) << *GV.getValueType();

if (TypeName.find("device_global") == std::string::npos)
continue;

if (AllowDeviceImageDependencies) {
PropSet.add(PropSetRegTy::SYCL_IMPORTED_SYMBOLS, GV.getName(), true);
}
}
}

// Metadata names may be composite so we keep them alive until the
Expand Down
9 changes: 7 additions & 2 deletions sycl/source/detail/context_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -265,17 +265,22 @@ void context_impl::addDeviceGlobalInitializer(
std::vector<ur_event_handle_t> context_impl::initializeDeviceGlobals(
ur_program_handle_t NativePrg, queue_impl &QueueImpl,
detail::kernel_bundle_impl *KernelBundleImplPtr) {

if (!MDeviceGlobalNotInitializedCnt.load(std::memory_order_acquire))
return {};

detail::adapter_impl &Adapter = getAdapter();
device_impl &DeviceImpl = QueueImpl.getDeviceImpl();
std::lock_guard<std::mutex> NativeProgramLock(MDeviceGlobalInitializersMutex);

auto ImgIt = MDeviceGlobalInitializers.find(
std::make_pair(NativePrg, DeviceImpl.getHandleRef()));
if (ImgIt == MDeviceGlobalInitializers.end() ||
ImgIt->second.MDeviceGlobalsFullyInitialized)
if (ImgIt == MDeviceGlobalInitializers.end()) {
return {};
}
if (ImgIt->second.MDeviceGlobalsFullyInitialized) {
return {};
}

DeviceGlobalInitializer &InitRef = ImgIt->second;
{
Expand Down
24 changes: 23 additions & 1 deletion sycl/source/detail/program_manager/program_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -924,6 +924,7 @@ ProgramManager::getBuiltURProgram(const BinImgWithDeps &ImgWithDeps,

// Those extra programs won't be used anymore, just the final
// linked result:
bool WasLinked = !ProgramsToLink.empty();
ProgramsToLink.clear();
emitBuiltProgramInfo(BuiltProgram, ContextImpl);

Expand All @@ -940,7 +941,28 @@ ProgramManager::getBuiltURProgram(const BinImgWithDeps &ImgWithDeps,
}
}

ContextImpl.addDeviceGlobalInitializer(BuiltProgram, Devs, &MainImg);
// If we linked multiple images, we need to register device_globals from
// all of them, not just the main image. Create a merged binary image.
if (WasLinked && ImgWithDeps.getAll().size() > 1) {
auto MergedImg =
std::make_unique<DynRTDeviceBinaryImage>(ImgWithDeps.getAll());
const RTDeviceBinaryImage *MergedImgPtr = MergedImg.get();

// Store the merged image to keep it alive. Use a static map since we need
// it to persist for the lifetime of the program.
static std::mutex MergedImagesMutex;
static std::map<ur_program_handle_t,
std::unique_ptr<DynRTDeviceBinaryImage>>
MergedImages;
{
std::lock_guard<std::mutex> Lock(MergedImagesMutex);
MergedImages[BuiltProgram] = std::move(MergedImg);
}

ContextImpl.addDeviceGlobalInitializer(BuiltProgram, Devs, MergedImgPtr);
} else {
ContextImpl.addDeviceGlobalInitializer(BuiltProgram, Devs, &MainImg);
}

// Save program to persistent cache if it is not there
if (!DeviceCodeWasInCache) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#include <sycl/detail/core.hpp>
#include <sycl/usm.hpp>

#if defined(_WIN32)
#define API_EXPORT __declspec(dllexport)
#else
#define API_EXPORT
#endif

// Define and export the device_global
namespace syclex = sycl::ext::oneapi::experimental;
API_EXPORT SYCL_EXTERNAL syclex::device_global<int> test_global;

// Host function to set the device_global value
extern "C" API_EXPORT void set_test_global(int val) {
sycl::queue q;
q.copy(&val, test_global, 1, 0).wait();
}

// Host function to get the device_global value
extern "C" API_EXPORT int get_test_global() {
sycl::queue q;
int result = 0;
q.copy(test_global, &result, 1, 0).wait();
return result;
}

// Function that reads device_global in a kernel within the library
extern "C" API_EXPORT int read_global_in_lib() {
sycl::queue q;
int result = 0;
int *dev_result = sycl::malloc_device<int>(1, q);

q.submit([&](sycl::handler &h) {
h.single_task([=]() {
dev_result[0] = test_global; // Read in library's own kernel
});
}).wait();

q.copy(dev_result, &result, 1).wait();
sycl::free(dev_result, q);
return result;
}
101 changes: 101 additions & 0 deletions sycl/test-e2e/IntermediateLib/device_global_device_image_app.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
// This test verifies that device_global variables work correctly across
// shared library boundaries when multiple device images are linked together.
// The fix ensures that device globals from all linked images are properly
// registered, not just those from the main image.

// UNSUPPORTED: opencl && gpu
// UNSUPPORTED-TRACKER: GSD-12574

// UNSUPPORTED: cuda, hip
// UNSUPPORTED-INTENDED: CUDA and HIP targets support AoT compilation only and
// cannot do runtime linking.

// DEFINE: %{fPIC_flag} = %if windows %{%} %else %{-fPIC%}
// DEFINE: %{shared_lib_ext} = %if windows %{dll%} %else %{so%}
// DEFINE: %{cuda_target} = %if target-nvidia %{-fsycl-targets=nvptx64-nvidia-cuda%}
// DEFINE: %{amd_target} = %if target-amd %{-fsycl-targets=amdgcn-amd-amdhsa %amd_arch_options%}
// DEFINE: %{spir_target} = %if !target-nvidia && !target-amd %{-fsycl-targets=spir64%}
// DEFINE: %{lib_export_flags} = -ftarget-export-symbols -fsycl-allow-device-image-dependencies

// RUN: rm -rf %t.dir && mkdir -p %t.dir

// RUN: %{run-aux} %clangxx -fsycl %{cuda_target} %{amd_target} %{spir_target} \
// RUN: %{fPIC_flag} %{lib_export_flags} %shared_lib \
// RUN: -Wno-unused-command-line-argument \
// RUN: -o %t.dir/libdevice_global_test.%{shared_lib_ext} \
// RUN: %S/Inputs/device_global_device_image_lib.cpp

// RUN: %{run-aux} %clangxx -fsycl %{cuda_target} %{amd_target} %{spir_target} \
// RUN: %{fPIC_flag} %{lib_export_flags} \
// RUN: -Wno-unused-command-line-argument \
// RUN: -o %t.dir/test.exe %s \
// RUN: %if windows %{ %t.dir/libdevice_global_test.lib%} \
// RUN: %else %{-L%t.dir -ldevice_global_test -Wl,-rpath=%t.dir%}

// RUN: %{run} %t.dir/test.exe

// UNSUPPORTED: target-native_cpu
// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/20142

// clang-format off
/*
// build the shared library
clang++ -fsycl -fsycl-targets=spir64 -ftarget-export-symbols -fsycl-allow-device-image-dependencies -shared -o libdevice_global_test.dll ./Inputs/device_global_device_image_lib.cpp

// build the app - Lin and Win
clang++ -fsycl -fsycl-targets=spir64 -ftarget-export-symbols -fsycl-allow-device-image-dependencies -o test.bin device_global_device_image_app.cpp -L. -ldevice_global_test -Wl,-rpath=.
clang++ -fsycl -fsycl-targets=spir64 -ftarget-export-symbols -fsycl-allow-device-image-dependencies -o test.exe device_global_device_image_app.cpp libdevice_global_test.lib

// run
./test.bin
*/
// clang-format on

#include <iostream>
#include <sycl/detail/core.hpp>
#include <sycl/usm.hpp>

// Declare external symbols from library
namespace syclex = sycl::ext::oneapi::experimental;
extern __SYCL_EXPORT SYCL_EXTERNAL syclex::device_global<int> test_global;

extern "C" void set_test_global(int val);
extern "C" int get_test_global();
extern "C" int read_global_in_lib();

int main() {
std::cout << "\n=== Test 1: Copy operations from host ===\n";
set_test_global(42);
int val = get_test_global();
std::cout << "After set_test_global(42): get_test_global() = " << val
<< " (expected 42)\n";

std::cout << "\n=== Test 2: Read in library's kernel ===\n";
int lib_read = read_global_in_lib();
std::cout << "read_global_in_lib() = " << lib_read << " (expected 42)\n";

std::cout << "\n=== Test 3: Read in main's kernel ===\n";
sycl::queue q;
int *dev_result = sycl::malloc_device<int>(1, q);

q.submit([&](sycl::handler &h) {
h.single_task([=]() {
dev_result[0] =
test_global; // Read in main's kernel - this tests the fix
});
}).wait();

int main_read = 0;
q.copy(dev_result, &main_read, 1).wait();
sycl::free(dev_result, q);

std::cout << "main's kernel read: " << main_read << " (expected 42)\n";

if (val == 42 && lib_read == 42 && main_read == 42) {
std::cout << "\n✓ ALL TESTS PASSED\n";
return 0;
} else {
std::cout << "\n✗ TESTS FAILED\n";
return 1;
}
}
Loading