gpu_templated_ops.h
#ifndef HILA_GPU_TEMPLATED_OPS_H
#define HILA_GPU_TEMPLATED_OPS_H

#include "plumbing/defs.h"
#include "plumbing/backend_gpu/defs.h"

// this include has to be after the backend defs, because those define hila::random()
#include "plumbing/random.h"

#include "plumbing/type_tools.h"

// #if defined(__CUDACC__) || defined(__HIPCC__)

#if !defined(HILAPP)
#if defined(CUDA)
#include <cub/cub.cuh>
namespace gpucub = cub;
using gpuStream_t = cudaStream_t;

#endif

#if defined(HIP)
#include <hipcub/hipcub.hpp>
namespace gpucub = hipcub;
using gpuStream_t = hipStream_t;
#endif

/* Reduction */
/*
template<typename T>
T cuda_reduce_sum( T * vector, int N ){
    static bool initialized = false;
    static T * d_sum;
    static void *d_temp_storage;
    static size_t temp_storage_size;

    T sum;

    if( !initialized ) {
        cudaMalloc( &d_sum, sizeof(T) );
        d_temp_storage = NULL;
        temp_storage_size = 0;
        cub::DeviceReduce::Sum(d_temp_storage, temp_storage_size, vector, d_sum, N);
        initialized = true;
    }

    // Allocate temporary storage
    cudaMalloc(&d_temp_storage, temp_storage_size);

    // Run sum-reduction
    cub::DeviceReduce::Sum(d_temp_storage, temp_storage_size, vector, d_sum, N);

    cudaMemcpy( &sum, d_sum, sizeof(T), cudaMemcpyDeviceToHost );

    return sum;
}
*/

// Functions used by hilapp in reductions
// (cannot guarantee operator= and operator+= are marked __device__)

template <typename T, std::enable_if_t<hila::is_arithmetic<T>::value, int> = 0>
__device__ static void _hila_kernel_set_zero(T &v) {
    v = 0;
}

template <typename T, std::enable_if_t<!hila::is_arithmetic<T>::value, int> = 0>
__device__ static void _hila_kernel_set_zero(T &v) {
    using ntype = hila::arithmetic_type<T>;
    constexpr int N = sizeof(T) / sizeof(ntype);

    ntype *arr = reinterpret_cast<ntype *>(&v);
    for (int i = 0; i < N; i++)
        arr[i] = 0;
}

template <typename T, std::enable_if_t<hila::is_arithmetic<T>::value, int> = 0>
__device__ static void _hila_kernel_copy_var(T &a, const T &b) {
    a = b;
}

template <typename T, std::enable_if_t<!hila::is_arithmetic<T>::value, int> = 0>
__device__ static void _hila_kernel_copy_var(T &a, const T &b) {
    using ntype = hila::arithmetic_type<T>;
    constexpr int N = sizeof(T) / sizeof(ntype);

    ntype *ar = reinterpret_cast<ntype *>(&a);
    const ntype *br = reinterpret_cast<const ntype *>(&b);

    for (int i = 0; i < N; i++)
        ar[i] = br[i];
}

template <typename T, std::enable_if_t<hila::is_arithmetic<T>::value, int> = 0>
__device__ static void _hila_kernel_add_var(T &a, const T &b) {
    a += b;
}

template <typename T, std::enable_if_t<!hila::is_arithmetic<T>::value, int> = 0>
__device__ static void _hila_kernel_add_var(T &a, const T &b) {
    using ntype = hila::arithmetic_type<T>;
    constexpr int N = sizeof(T) / sizeof(ntype);

    ntype *ar = reinterpret_cast<ntype *>(&a);
    const ntype *br = reinterpret_cast<const ntype *>(&b);

    for (int i = 0; i < N; i++)
        ar[i] += br[i];
}

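// Illustrative sketch (not compiled into this header): how the non-arithmetic
// overloads treat a composed type as a flat array of its arithmetic elements.
// `MyVec2` and `example_add_kernel` are hypothetical names used only for this
// example, and it is assumed that hila::arithmetic_type<MyVec2> resolves to double.
#if 0
struct MyVec2 {
    double x, y; // assumed: hila::arithmetic_type<MyVec2> == double, so N == 2
};

__global__ void example_add_kernel(MyVec2 *a, const MyVec2 *b, int n) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < n) {
        // elementwise: a[i].x += b[i].x; a[i].y += b[i].y;
        _hila_kernel_add_var(a[i], b[i]);
    }
}
#endif
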
////////////////////////////////////////////////////////////

// A simple hand-written reduction that does not require a library
template <typename T>
__global__ void gpu_reduce_sum_kernel(T *vector, int vector_size, int new_size, int elems) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < new_size) {
        for (int i = 1; i < elems; i++) {
            int ind = Index + i * new_size;
            // vector[Index] += vector[ind];
            _hila_kernel_add_var(vector[Index], vector[ind]);
        }
    }
}

template <typename T>
T gpu_reduce_sum(T *vector, int N) {
    const int reduce_step = 32;
    T sum = 0;
    T *host_vector = (T *)memalloc(N * sizeof(T));
    int vector_size = N;

    while (vector_size > reduce_step) {
        // Take the last n elements that are divisible by reduce_step
        int first = vector_size % reduce_step;
        // Calculate the size of the reduced list
        int new_size = (vector_size - first) / reduce_step;
        // Find number of blocks and launch the kernel
        int blocks = (new_size - 1) / N_threads + 1;
        gpu_reduce_sum_kernel<<<blocks, N_threads>>>(vector + first, vector_size, new_size,
                                                     reduce_step);
        check_device_error("gpu_reduce_sum kernel");
        // Find the full size of the resulting array
        vector_size = new_size + first;
        gpuDeviceSynchronize();
    }
    gpuMemcpy(host_vector, vector, vector_size * sizeof(T), gpuMemcpyDeviceToHost);

    for (int i = 0; i < vector_size; i++) {
        sum += host_vector[i];
    }

    free(host_vector);

    return sum;
}

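// Usage sketch (not compiled into this header): summing a device array of
// doubles with the hand-written reduction above. Note that the reduction works
// in place, i.e. it overwrites the contents of the device buffer.
// `example_reduce_sum` is a hypothetical helper, not part of HILA.
#if 0
void example_reduce_sum() {
    constexpr int N = 10000;
    std::vector<double> host(N, 1.0);

    double *dev;
    gpuMalloc(&dev, N * sizeof(double));
    gpuMemcpy(dev, host.data(), N * sizeof(double), gpuMemcpyHostToDevice);

    double sum = gpu_reduce_sum(dev, N); // expect sum == 10000.0

    gpuFree(dev);
}
#endif
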
template <typename T>
__global__ void gpu_reduce_product_kernel(T *vector, int vector_size, int new_size, int elems) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < new_size) {
        for (int i = 1; i < elems; i++) {
            int ind = Index + i * new_size;
            vector[Index] *= vector[ind];
        }
    }
}

template <typename T>
T gpu_reduce_product(T *vector, int N) {
    const int reduce_step = 32;
    T prod;
    prod = 1;
    T *host_vector = (T *)malloc(N * sizeof(T));
    int vector_size = N;
    while (vector_size > reduce_step) {
        // Take the last n elements that are divisible by reduce_step
        int first = vector_size % reduce_step;
        // Calculate the size of the reduced list
        int new_size = (vector_size - first) / reduce_step;
        // Find number of blocks and launch the kernel
        int blocks = new_size / N_threads + 1;
        gpu_reduce_product_kernel<<<blocks, N_threads>>>(vector + first, vector_size, new_size,
                                                         reduce_step);

        // Find the full size of the resulting array
        vector_size = new_size + first;

        gpuDeviceSynchronize();
    }

    check_device_error("gpu_reduce_product kernel");

    gpuMemcpy(host_vector, vector, vector_size * sizeof(T), gpuMemcpyDeviceToHost);

    for (int i = 0; i < vector_size; i++) {
        prod *= host_vector[i];
    }

    free(host_vector);
    return prod;
}

#if 0

template <typename T>
void gpu_multireduce_sum(std::vector<T> &vector, T *d_array, int N) {
    for (int v = 0; v < vector.size(); v++) {
        vector[v] += gpu_reduce_sum(d_array + v * N, N);
    }
}

template <typename T>
void gpu_multireduce_product(std::vector<T> &vector, T *d_array, int N) {
    for (int v = 0; v < vector.size(); v++) {
        vector[v] *= gpu_reduce_product(d_array + v * N, N);
    }
}

#endif

/// Implement here atomicAdd for double precision for less than 6.0 capability,
/// motivated by the cuda toolkit documentation.
/// atomicCAS(unsigned long long *a, unsigned long long b, unsigned long long v)
/// performs the operation *a = (*a == b ? v : *a) atomically, i.e. assigns v to *a
/// if *a == b, and returns the old value of *a.

#if __CUDA_ARCH__ < 600

__device__ inline double atomic_Add(double *dp, double v) {

    unsigned long long int *dp_ull = (unsigned long long int *)dp;
    unsigned long long int old = *dp_ull;
    unsigned long long int av;

    do {
        av = old;
        old = atomicCAS(dp_ull, av, __double_as_longlong(v + __longlong_as_double(av)));

        // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
    } while (av != old);

    return __longlong_as_double(old);
}

#else
__device__ inline double atomic_Add(double *dp, double v) {
    return atomicAdd(dp, v);
}

#endif

/// Float atomicAdd should exist

__device__ inline float atomic_Add(float *dp, float v) {
    return atomicAdd(dp, v);
}


/// Do addition for "long long"-sized int types, which in cuda are 64 bits.
/// Cuda includes a ULL atomicAdd, which is used here. The signed
/// version can be obtained just by casting to ULL values and
/// using ULL atomicAdd - this gives the correct signed addition
/// with "wraparound" overflow behaviour.

template <typename T, typename B,
          std::enable_if_t<std::is_integral<T>::value && sizeof(T) == sizeof(unsigned long long) &&
                               std::is_convertible<B, T>::value,
                           int> = 0>
__device__ inline T atomicAdd(T *dp, B v) {

    T tv = v;
    return atomicAdd((unsigned long long int *)dp, (unsigned long long int)tv);
}

/// Atomic add for composed datatypes - do element-by-element;
/// requires that hila::arithmetic_type is defined
template <
    typename T, typename B,
    std::enable_if_t<!std::is_arithmetic<T>::value && std::is_assignable<T &, T>::value, int> = 0>
__device__ inline void atomicAdd(T *d, const B &bv) {

    T v;
    v = bv;
    hila::arithmetic_type<T> *dp;
    const hila::arithmetic_type<T> *dv;
    constexpr int N = sizeof(T) / sizeof(hila::arithmetic_type<T>);

    dp = (hila::arithmetic_type<T> *)(void *)d;
    dv = (hila::arithmetic_type<T> *)(void *)&v;

    for (int i = 0; i < N; ++i) {
        atomic_Add(dp + i, dv[i]);
    }
}

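// Usage sketch (not compiled into this header): the composed-type atomicAdd
// overload splits the value into its arithmetic elements and adds them one by
// one with atomic_Add. `Cmplx` is a hypothetical two-double struct standing in
// for a HILA type for which hila::arithmetic_type is defined.
#if 0
struct Cmplx {
    double re, im;
};

__global__ void example_accumulate(Cmplx *target, const Cmplx *contrib, int n) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < n) {
        // all threads add into the same target element without races
        atomicAdd(target, contrib[i]);
    }
}
#endif
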
/// Multiply 2 doubles atomically
__device__ inline double atomicMultiply(double *dp, double v) {

    unsigned long long int *dp_ull = (unsigned long long int *)dp;
    unsigned long long int old = *dp_ull;
    unsigned long long int av;

    do {
        av = old;
        old = atomicCAS(dp_ull, av, __double_as_longlong(v * __longlong_as_double(av)));

        // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
    } while (av != old);

    return __longlong_as_double(old);
}

/// Multiply 2 floats atomically
__device__ inline float atomicMultiply(float *dp, float v) {
    unsigned int *dp_ui = (unsigned int *)dp;
    unsigned int old = *dp_ui;
    unsigned int av;

    do {
        av = old;
        old = atomicCAS(dp_ui, av, __float_as_int(v * __int_as_float(av)));

        // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
    } while (av != old);

    return __int_as_float(old);
}

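// Usage sketch (not compiled into this header): accumulating a product over all
// threads into a single double with the CAS-based atomicMultiply above. It is
// assumed that *result has been initialized to 1.0 before the kernel launch;
// `example_product_reduce` is a hypothetical name.
#if 0
__global__ void example_product_reduce(double *result, const double *factors, int n) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < n) {
        atomicMultiply(result, factors[i]);
    }
}
#endif
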
///////////////////////////////////////////////////////////////////////////////

/**
 * @brief Vector reduction kernel using CUB block reduce
 * @details Reduces the input data in chunks of `threads` (8192) elements and outputs one
 * partial sum per chunk to the output array of length reduction_size
 *
 * @tparam T Lattice data type
 * @param D input data of size threads*reduction_size
 * @param output output array of size reduction_size
 * @param reduction_size reduction vector length
 * @param threads number of input elements summed into each output element (8192)
 */
template <typename T, int BLOCK_SIZE>
__global__ void sum_blocked_vectorreduction_kernel_cub(T *D, T *output, int reduction_size,
                                                       int threads) {
    using BlockReduce = gpucub::BlockReduce<T, BLOCK_SIZE>;
    __shared__ typename BlockReduce::TempStorage temp_storage;

    int global_thread_id = blockIdx.x * threads + threadIdx.x;
    int stride = BLOCK_SIZE;
    T local_sum;
    _hila_kernel_set_zero(local_sum);

    for (int i = 0; i < max(1, (threads + stride - 1) / stride); i++) {
        int index = global_thread_id + i * stride;
        if (index < reduction_size * threads) {
            _hila_kernel_add_var(local_sum, D[index]);
        }
    }

    T total_sum;
    total_sum = BlockReduce(temp_storage).Sum(local_sum);
    if (threadIdx.x == 0) {
        _hila_kernel_copy_var(output[blockIdx.x], total_sum);
    }
}

/**
 * @brief host function that calls the device kernel sum_blocked_vectorreduction_kernel_cub
 * @details If one runs out of shared memory or resources for the kernel launch, it is because the
 * type T requires too many resources. One can change GPU_BLOCK_REDUCTION_THREADS to a smaller
 * number at the cost of a slight performance loss.
 *
 * @tparam T Lattice data type
 * @param data input data of size threads*reduction_size
 * @param output output array of size reduction_size
 * @param reduction_size reduction vector length
 * @param threads number of input elements summed into each output element (8192)
 */
template <typename T>
void sum_blocked_vectorreduction(T *data, T *output, const int reduction_size, const int threads) {
    T *output_device;
    gpuMalloc(&output_device, reduction_size * sizeof(T));
    sum_blocked_vectorreduction_kernel_cub<T, GPU_BLOCK_REDUCTION_THREADS>
        <<<reduction_size, GPU_BLOCK_REDUCTION_THREADS>>>(data, output_device, reduction_size,
                                                          threads);

    hila::synchronize_threads();
    gpuMemcpy(output, output_device, reduction_size * sizeof(T), gpuMemcpyDeviceToHost);
    gpuFree(output_device);
    check_device_error("sum_blocked_vectorreduction");
}

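// Usage sketch (not compiled into this header): reducing a device buffer laid
// out as reduction_size consecutive chunks of `threads` elements each. The
// chunk length 8192 below follows the documentation above; the function name
// and sizes are hypothetical.
#if 0
void example_blocked_reduction(double *device_data) {
    constexpr int reduction_size = 64; // number of output elements
    constexpr int chunk = 8192;        // input elements per output element

    // device_data must hold reduction_size * chunk elements
    std::vector<double> result(reduction_size);
    sum_blocked_vectorreduction(device_data, result.data(), reduction_size, chunk);
}
#endif
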

///////////////////////

template <typename T>
__global__ void gpu_set_value_kernel(T *vector, T value, int elems) {
    int Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < elems) {
        vector[Index] = value;
    }
}

// passing a ptr instead of value directly
template <typename T>
__global__ void gpu_set_value_kernel_ptr(T *vector, const T *valptr, int elems) {
    int Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < elems) {
        vector[Index] = *valptr;
    }
}


// template <typename T>
// __global__ void gpu_set_zero_kernel(T *vector, int elems) {
//     unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
//     if (Index < elems) {
//         vector[Index] = 0;
//     }
// }


/// use memset to set value to zero - not useful for other values
template <typename T>
inline void gpu_set_zero(T *vec, size_t N) {
    gpuMemset(vec, 0, N * sizeof(T));
}

/// Setting to some value needs to be done elem-by-elem
template <typename T>
void gpu_set_value(T *vec, const T &val, size_t N) {
    int blocks = N / N_threads + 1;
    if constexpr (sizeof(T) <= GPU_GLOBAL_ARG_MAX_SIZE)
        // small T size, pass as arg to __global__
        gpu_set_value_kernel<<<blocks, N_threads>>>(vec, val, N);
    else {
        // bigger size, memcopy
        T *buf;
        gpuMalloc(&buf, sizeof(T));
        gpuMemcpy(buf, (char *)&val, sizeof(T), gpuMemcpyHostToDevice);

        // call the kernel to set correct indexes
        gpu_set_value_kernel_ptr<<<blocks, N_threads>>>(vec, buf, N);
        gpuFree(buf);
    }
}
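
// Usage sketch (not compiled into this header): filling a device buffer.
// gpu_set_zero uses gpuMemset, while gpu_set_value launches one of the kernels
// above depending on sizeof(T) relative to GPU_GLOBAL_ARG_MAX_SIZE.
// `example_fill` is a hypothetical name.
#if 0
void example_fill() {
    constexpr size_t N = 1024;
    double *dev;
    gpuMalloc(&dev, N * sizeof(double));

    gpu_set_zero(dev, N);       // all bytes set to zero
    gpu_set_value(dev, 3.5, N); // every element set to 3.5

    gpuFree(dev);
}
#endif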

// template <typename T>
// void gpu_set_zero(T *vec, size_t N) {
//     int blocks = N / N_threads + 1;
//     gpu_set_zero_kernel<<<blocks, N_threads>>>(vec, N);
// }

#endif // !HILAPP

/// Just copy the array to GPU memory, returning its device address
template <typename T>
T *copy_array_to_gpu(T *data, int n) {
    T *buf;
    gpuMalloc(&buf, n * sizeof(T));
    gpuMemcpy(buf, data, n * sizeof(T), gpuMemcpyHostToDevice);
    return buf;
}

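// Usage sketch (not compiled into this header): the caller owns the returned
// device buffer and releases it with gpuFree when done.
// `example_copy_to_gpu` is a hypothetical name.
#if 0
void example_copy_to_gpu() {
    int host_data[4] = {1, 2, 3, 4};
    int *dev = copy_array_to_gpu(host_data, 4);
    // ... use dev in kernels ...
    gpuFree(dev);
}
#endif
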

#endif // HILA_GPU_TEMPLATED_OPS_H