-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathsquare_array.cu
More file actions
80 lines (64 loc) · 2.46 KB
/
square_array.cu
File metadata and controls
80 lines (64 loc) · 2.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#include "square_array.h"
#include "square_op.h"
#include "debug.h"

#include <cuda_runtime.h>

#include <iostream>
#include <stdexcept>
#include <string>
/**
 * Kernel: invokes compute_and_accumulate once for every element index below n.
 *
 * Launch layout: 1D grid of 1D blocks, one thread per element; threads whose
 * global index is >= n do nothing. No shared memory is used by this kernel
 * itself.
 *
 * @param data  Device-accessible array of n floats, passed through to
 *              compute_and_accumulate.
 * @param n     Number of elements in data.
 * @param sum   Device-accessible accumulator passed through to
 *              compute_and_accumulate.
 *
 * NOTE(review): compute_and_accumulate is declared outside this file
 * (presumably square_op.h); it is assumed to square data[idx] in place and add
 * the result to *sum atomically -- confirm against its definition.
 */
__global__ void square_kernel(float* data, size_t n, float* sum) {
    // Widen before multiplying: blockIdx.x * blockDim.x is otherwise computed
    // in 32-bit unsigned arithmetic and wraps once the grid spans more than
    // 2^32 threads, silently aliasing low indices.
    const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (idx < n) {
        compute_and_accumulate(data, idx, sum);
    }
}
namespace {

// Throws std::runtime_error naming the failed operation when status is not cudaSuccess.
void square_array_check(cudaError_t status, const char* operation) {
    if (status != cudaSuccess) {
        throw std::runtime_error(std::string(operation) + " failed: " +
                                 cudaGetErrorString(status));
    }
}

// Owns a cudaMalloc'd float buffer and frees it on scope exit, so that the
// throwing error paths in square_array do not leak device memory.
struct SquareArrayDeviceBuffer {
    float* ptr = nullptr;

    SquareArrayDeviceBuffer() = default;
    SquareArrayDeviceBuffer(const SquareArrayDeviceBuffer&) = delete;
    SquareArrayDeviceBuffer& operator=(const SquareArrayDeviceBuffer&) = delete;

    ~SquareArrayDeviceBuffer() {
        if (ptr != nullptr) {
            cudaFree(ptr);  // best effort: nothing useful can be done on failure here
        }
    }
};

}  // namespace

/**
 * Runs square_kernel over `array` and copies the kernel's accumulator into
 * *result_sum.
 *
 * If `array` is reported by cudaPointerGetAttributes as device or managed
 * memory, the kernel operates on it directly on the device that owns it.
 * Otherwise `array` is treated as host memory: it is copied to a temporary
 * buffer on `device_id`, processed there, and copied back into `array`.
 *
 * Side effect: the calling thread's current CUDA device is changed (to the
 * pointer's device, or to `device_id`) and is not restored.
 *
 * @param array       Host, device, or managed pointer to `size` floats;
 *                    overwritten with whatever square_kernel leaves in it.
 * @param size        Number of float elements in `array`. With size == 0 no
 *                    kernel is launched and *result_sum receives 0.0f.
 * @param result_sum  Host pointer that receives the accumulated value.
 * @param device_id   CUDA device to use when `array` is host memory; ignored
 *                    when `array` is already device/managed memory.
 *
 * @throws std::runtime_error if `device_id` is out of range, if `size` needs
 *         more blocks than a 1D grid allows, or if any CUDA runtime call or
 *         the kernel launch/execution fails.
 */
void square_array(float* array, size_t size, float* result_sum, int device_id) {
    const size_t byte_count = size * sizeof(float);

    // Classify the pointer. attr.type requires CUDA 10+ (older toolkits exposed
    // attr.memoryType instead).
    cudaPointerAttributes attr;
    const cudaError_t query_status = cudaPointerGetAttributes(&attr, array);
    if (query_status != cudaSuccess) {
        // Runtimes before CUDA 11 report an error for ordinary host pointers.
        // Clear it so the launch check below does not pick up this stale error.
        cudaGetLastError();
    }

    bool is_device_ptr = false;
    if (query_status == cudaSuccess &&
        (attr.type == cudaMemoryTypeDevice || attr.type == cudaMemoryTypeManaged)) {
        is_device_ptr = true;
        DEBUG_PRINT("Array is on device: %d\n", attr.device);
    }

    SquareArrayDeviceBuffer staging;     // holds the device copy on the host-input path only
    SquareArrayDeviceBuffer sum_buffer;  // single-float accumulator on the device
    float* device_array = nullptr;

    if (is_device_ptr) {
        device_array = array;
        // Use the device that owns the pointer rather than the requested one.
        square_array_check(cudaSetDevice(attr.device), "cudaSetDevice");
    } else {
        int device_count = 0;
        square_array_check(cudaGetDeviceCount(&device_count), "cudaGetDeviceCount");
        if (device_id < 0 || device_id >= device_count) {
            throw std::runtime_error("Invalid CUDA device ID: " + std::to_string(device_id));
        }
        DEBUG_PRINT("Copying array from host to device\n");
        square_array_check(cudaSetDevice(device_id), "cudaSetDevice");
        square_array_check(cudaMalloc(&staging.ptr, byte_count), "cudaMalloc(array)");
        square_array_check(
            cudaMemcpy(staging.ptr, array, byte_count, cudaMemcpyHostToDevice),
            "cudaMemcpy(host to device)");
        device_array = staging.ptr;
    }

    // Report which device ended up selected.
    int current_device_id = -1;
    square_array_check(cudaGetDevice(&current_device_id), "cudaGetDevice");
    DEBUG_PRINT("Using CUDA device: %d\n", current_device_id);

    // Allocate the accumulator and zero it; all-zero bytes represent 0.0f.
    square_array_check(cudaMalloc(&sum_buffer.ptr, sizeof(float)), "cudaMalloc(sum)");
    square_array_check(cudaMemset(sum_buffer.ptr, 0, sizeof(float)), "cudaMemset(sum)");

    if (size > 0) {  // a zero-block grid is an invalid launch configuration
        const size_t threads = 256;
        const size_t max_grid_x = 2147483647u;  // 2^31 - 1, the 1D grid limit
        const size_t block_count = size / threads + (size % threads != 0 ? 1 : 0);
        if (block_count > max_grid_x) {
            throw std::runtime_error("Array too large for a single kernel launch: " +
                                     std::to_string(size) + " elements");
        }

        square_kernel<<<static_cast<unsigned int>(block_count),
                        static_cast<unsigned int>(threads)>>>(
            device_array, size, sum_buffer.ptr);
        square_array_check(cudaGetLastError(), "square_kernel launch");
        square_array_check(cudaDeviceSynchronize(), "square_kernel execution");
    }

    if (staging.ptr != nullptr || (!is_device_ptr && size == 0)) {
        // Host-input path: return the processed data to the caller's buffer.
        square_array_check(
            cudaMemcpy(array, device_array, byte_count, cudaMemcpyDeviceToHost),
            "cudaMemcpy(device to host)");
    }

    square_array_check(
        cudaMemcpy(result_sum, sum_buffer.ptr, sizeof(float), cudaMemcpyDeviceToHost),
        "cudaMemcpy(sum)");
    // staging and sum_buffer release their device memory on scope exit.
}