HILA
gpu_templated_ops.h
#ifndef GPU_TEMPLATED_OPS_H
#define GPU_TEMPLATED_OPS_H

#include "plumbing/defs.h"
#include "plumbing/backend_gpu/defs.h"

// this include has to be after the backend defs, because those define hila::random()
#include "plumbing/random.h"

#include "plumbing/type_tools.h"

// #if defined(__CUDACC__) || defined(__HIPCC__)

#if !defined(HILAPP)

/* Reduction */
/*
template <typename T>
T cuda_reduce_sum(T *vector, int N) {
    static bool initialized = false;
    static T *d_sum;
    static void *d_temp_storage;
    static size_t temp_storage_size;

    T sum;

    if (!initialized) {
        cudaMalloc(&d_sum, sizeof(T));
        d_temp_storage = NULL;
        temp_storage_size = 0;
        cub::DeviceReduce::Sum(d_temp_storage, temp_storage_size, vector, d_sum, N);
        initialized = true;
    }

    // Allocate temporary storage
    cudaMalloc(&d_temp_storage, temp_storage_size);

    // Run sum-reduction
    cub::DeviceReduce::Sum(d_temp_storage, temp_storage_size, vector, d_sum, N);

    cudaMemcpy(&sum, d_sum, sizeof(T), cudaMemcpyDeviceToHost);

    return sum;
}
*/

// Helper functions used by hilapp in reductions
// (we cannot guarantee that operator= and operator+= of user types are marked __device__)

template <typename T, std::enable_if_t<hila::is_arithmetic<T>::value, int> = 0>
__device__ static void _hila_kernel_set_zero(T &v) {
    v = 0;
}

template <typename T, std::enable_if_t<!hila::is_arithmetic<T>::value, int> = 0>
__device__ static void _hila_kernel_set_zero(T &v) {
    using ntype = hila::arithmetic_type<T>;
    constexpr int N = sizeof(T) / sizeof(ntype);

    ntype *arr = reinterpret_cast<ntype *>(&v);
    for (int i = 0; i < N; i++)
        arr[i] = 0;
}

template <typename T, std::enable_if_t<hila::is_arithmetic<T>::value, int> = 0>
__device__ static void _hila_kernel_copy_var(T &a, const T &b) {
    a = b;
}

template <typename T, std::enable_if_t<!hila::is_arithmetic<T>::value, int> = 0>
__device__ static void _hila_kernel_copy_var(T &a, const T &b) {
    using ntype = hila::arithmetic_type<T>;
    constexpr int N = sizeof(T) / sizeof(ntype);

    ntype *ar = reinterpret_cast<ntype *>(&a);
    const ntype *br = reinterpret_cast<const ntype *>(&b);

    for (int i = 0; i < N; i++)
        ar[i] = br[i];
}

template <typename T, std::enable_if_t<hila::is_arithmetic<T>::value, int> = 0>
__device__ static void _hila_kernel_add_var(T &a, const T &b) {
    a += b;
}

template <typename T, std::enable_if_t<!hila::is_arithmetic<T>::value, int> = 0>
__device__ static void _hila_kernel_add_var(T &a, const T &b) {
    using ntype = hila::arithmetic_type<T>;
    constexpr int N = sizeof(T) / sizeof(ntype);

    ntype *ar = reinterpret_cast<ntype *>(&a);
    const ntype *br = reinterpret_cast<const ntype *>(&b);

    for (int i = 0; i < N; i++)
        ar[i] += br[i];
}
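
// Illustrative sketch (not part of the interface): a hilapp-style kernel could use the helpers
// above to accumulate values of a type T whose operator= / operator+= are not guaranteed to be
// __device__. The kernel name, its arguments and the grid-stride loop are hypothetical; 'partial'
// is assumed to hold at least gridDim.x * blockDim.x elements.
#if 0
template <typename T>
__global__ void example_partial_sum_kernel(const T *in, T *partial, int n) {
    int Index = threadIdx.x + blockIdx.x * blockDim.x;
    int stride = blockDim.x * gridDim.x;

    T acc;
    _hila_kernel_set_zero(acc);                 // acc = 0, element by element
    for (int i = Index; i < n; i += stride)
        _hila_kernel_add_var(acc, in[i]);       // acc += in[i], element by element
    _hila_kernel_copy_var(partial[Index], acc); // partial[Index] = acc
}
#endif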

////////////////////////////////////////////////////////////

// A simple hand-written reduction that does not require a library
template <typename T>
__global__ void gpu_reduce_sum_kernel(T *vector, int vector_size, int new_size, int elems) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < new_size) {
        for (int i = 1; i < elems; i++) {
            int ind = Index + i * new_size;
            // vector[Index] += vector[ind];
            _hila_kernel_add_var(vector[Index], vector[ind]);
        }
    }
}

template <typename T>
T gpu_reduce_sum(T *vector, int N) {
    const int reduce_step = 32;
    T sum = 0;
    T *host_vector = (T *)memalloc(N * sizeof(T));
    int vector_size = N;

    while (vector_size > reduce_step) {
        // Reduce the tail of the array, whose length is divisible by reduce_step;
        // the first `first` elements are left untouched on this pass
        int first = vector_size % reduce_step;
        // Calculate the size of the reduced list
        int new_size = (vector_size - first) / reduce_step;
        // Find number of blocks and launch the kernel
        int blocks = (new_size - 1) / N_threads + 1;
        gpu_reduce_sum_kernel<<<blocks, N_threads>>>(vector + first, vector_size, new_size,
                                                     reduce_step);
        check_device_error("gpu_reduce_sum kernel");
        // Find the full size of the resulting array
        vector_size = new_size + first;
        gpuDeviceSynchronize();
    }
    gpuMemcpy(host_vector, vector, vector_size * sizeof(T), gpuMemcpyDeviceToHost);

    for (int i = 0; i < vector_size; i++) {
        sum += host_vector[i];
    }

    free(host_vector);

    return sum;
}
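
// Illustrative sketch (not part of the interface): reducing a device buffer with gpu_reduce_sum.
// The function name and the buffer are hypothetical; note that gpu_reduce_sum folds the partial
// sums in place, so the contents of the buffer are destroyed. gpu_set_value is defined later in
// this file.
#if 0
inline double example_device_sum(int n) {
    double *d_data;
    gpuMalloc(&d_data, n * sizeof(double));
    gpu_set_value(d_data, 1.0, n);        // fill with ones
    double s = gpu_reduce_sum(d_data, n); // expect s == n, up to rounding
    gpuFree(d_data);
    return s;
}
#endif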

template <typename T>
__global__ void gpu_reduce_product_kernel(T *vector, int vector_size, int new_size, int elems) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < new_size) {
        for (int i = 1; i < elems; i++) {
            int ind = Index + i * new_size;
            vector[Index] *= vector[ind];
        }
    }
}

template <typename T>
T gpu_reduce_product(T *vector, int N) {
    const int reduce_step = 32;
    T prod;
    prod = 1;
    T *host_vector = (T *)malloc(N * sizeof(T));
    int vector_size = N;
    while (vector_size > reduce_step) {
        // Reduce the tail of the array, whose length is divisible by reduce_step;
        // the first `first` elements are left untouched on this pass
        int first = vector_size % reduce_step;
        // Calculate the size of the reduced list
        int new_size = (vector_size - first) / reduce_step;
        // Find number of blocks and launch the kernel
        int blocks = new_size / N_threads + 1;
        gpu_reduce_product_kernel<<<blocks, N_threads>>>(vector + first, vector_size, new_size,
                                                         reduce_step);

        // Find the full size of the resulting array
        vector_size = new_size + first;

        gpuDeviceSynchronize();
    }

    check_device_error("gpu_reduce_product kernel");

    gpuMemcpy(host_vector, vector, vector_size * sizeof(T), gpuMemcpyDeviceToHost);

    for (int i = 0; i < vector_size; i++) {
        prod *= host_vector[i];
    }

    free(host_vector);
    return prod;
}

#if 0

template <typename T>
void gpu_multireduce_sum(std::vector<T> &vector, T *d_array, int N) {
    for (int v = 0; v < vector.size(); v++) {
        vector[v] += gpu_reduce_sum(d_array + v * N, N);
    }
}

template <typename T>
void gpu_multireduce_product(std::vector<T> &vector, T *d_array, int N) {
    for (int v = 0; v < vector.size(); v++) {
        vector[v] *= gpu_reduce_product(d_array + v * N, N);
    }
}

#endif

/// Implement here atomicAdd for double precision for devices of compute capability < 6.0,
/// following the example in the CUDA toolkit documentation.
/// atomicCAS(unsigned long long *a, unsigned long long b, unsigned long long v)
/// performs *a = (*a == b ? v : *a) atomically, i.e. assigns v to *a if *a == b,
/// and returns the old value of *a.

#if __CUDA_ARCH__ < 600

__device__ inline double atomic_Add(double *dp, double v) {

    unsigned long long int *dp_ull = (unsigned long long int *)dp;
    unsigned long long int old = *dp_ull;
    unsigned long long int av;

    do {
        av = old;
        old = atomicCAS(dp_ull, av, __double_as_longlong(v + __longlong_as_double(av)));

        // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
    } while (av != old);

    return __longlong_as_double(old);
}

#else
__device__ inline double atomic_Add(double *dp, double v) {
    return atomicAdd(dp, v);
}

#endif

/// Float atomicAdd should exist

__device__ inline float atomic_Add(float *dp, float v) {
    return atomicAdd(dp, v);
}
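
// Illustrative sketch (not part of the interface): many threads accumulating into a single
// device variable with atomic_Add. The kernel name and arguments are hypothetical.
#if 0
__global__ void example_atomic_sum_kernel(const double *in, double *result, int n) {
    int Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n)
        atomic_Add(result, in[Index]); // CAS loop on < 6.0 capability, native atomicAdd otherwise
}
#endif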

/// Atomic addition for integer types of the same size as "long long", which in CUDA is 64 bits.
/// CUDA provides an unsigned long long atomicAdd, which is used here: the signed version is
/// obtained simply by casting to unsigned long long values, since this gives the correct signed
/// addition with "wraparound" overflow behaviour.

template <typename T, typename B,
          std::enable_if_t<std::is_integral<T>::value && sizeof(T) == sizeof(unsigned long long) &&
                               std::is_convertible<B, T>::value,
                           int> = 0>
__device__ inline T atomicAdd(T *dp, B v) {

    T tv = v;
    return atomicAdd((unsigned long long int *)dp, (unsigned long long int)tv);
}
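
// Illustrative sketch (not part of the interface): with the overload above, a signed 64-bit
// counter can be updated with the usual atomicAdd syntax. The kernel name and arguments are
// hypothetical.
#if 0
__global__ void example_count_negative_kernel(const double *in, long long *count, int n) {
    int Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n && in[Index] < 0)
        atomicAdd(count, 1); // resolves to the template above, i.e. unsigned long long atomicAdd
}
#endif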

/// Atomic add for composed datatypes - done element by element;
/// requires that hila::arithmetic_type<T> is defined
template <
    typename T, typename B,
    std::enable_if_t<!std::is_arithmetic<T>::value && std::is_assignable<T &, T>::value, int> = 0>
__device__ inline void atomicAdd(T *d, const B &bv) {

    T v;
    v = bv;
    hila::arithmetic_type<T> *dp;
    const hila::arithmetic_type<T> *dv;
    constexpr int N = sizeof(T) / sizeof(hila::arithmetic_type<T>);

    dp = (hila::arithmetic_type<T> *)(void *)d;
    dv = (hila::arithmetic_type<T> *)(void *)&v;

    for (int i = 0; i < N; ++i) {
        atomic_Add(dp + i, dv[i]);
    }
}
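
// Illustrative sketch (not part of the interface): element-by-element atomic accumulation of a
// composed type. The struct example_vec2 and the kernel are hypothetical, and the sketch assumes
// hila::arithmetic_type<example_vec2> resolves to double; in practice this overload is used with
// the library's own composed types.
#if 0
struct example_vec2 {
    double x, y;
};

__global__ void example_accumulate_kernel(const example_vec2 *in, example_vec2 *result, int n) {
    int Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n)
        atomicAdd(result, in[Index]); // adds the x and y components separately with atomic_Add
}
#endif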

/// Multiply 2 doubles atomically
__device__ inline double atomicMultiply(double *dp, double v) {

    unsigned long long int *dp_ull = (unsigned long long int *)dp;
    unsigned long long int old = *dp_ull;
    unsigned long long int av;

    do {
        av = old;
        old = atomicCAS(dp_ull, av, __double_as_longlong(v * __longlong_as_double(av)));

        // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
    } while (av != old);

    return __longlong_as_double(old);
}

/// Multiply 2 floats atomically
__device__ inline float atomicMultiply(float *dp, float v) {
    unsigned int *dp_ui = (unsigned int *)dp;
    unsigned int old = *dp_ui;
    unsigned int av;

    do {
        av = old;
        old = atomicCAS(dp_ui, av, __float_as_int(v * __int_as_float(av)));

        // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
    } while (av != old);

    return __int_as_float(old);
}
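
// Illustrative sketch (not part of the interface): forming the product of many factors in a
// single device variable with atomicMultiply. The kernel name and arguments are hypothetical;
// *result is assumed to be initialized to 1 before the launch.
#if 0
__global__ void example_atomic_product_kernel(const float *factors, float *result, int n) {
    int Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n)
        atomicMultiply(result, factors[Index]); // CAS loop, retried until the update sticks
}
#endif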

///////////////////////////////////////////////////////////////////////////////

// NOTE: IF YOU DEFINE ALT_VECTOR_REDUCTION YOU NEED TO DEFINE THE SAME IN
// codegen_gpu.cpp!

// #define ALT_VECTOR_REDUCTION

template <typename T>
__global__ void sum_blocked_vectorreduction_kernel(T *D, const int reduction_size,
                                                   const int threads) {
    int id = threadIdx.x + blockIdx.x * blockDim.x;

    T sum;

#ifndef ALT_VECTOR_REDUCTION
    if (id < reduction_size) {
        // id is now the reduction coordinate
        sum = D[id];
        for (int i = 1; i < threads; i++) {
            // accumulate the contributions of all threads into the i == 0 element
            sum += D[id + i * reduction_size];
        }
        D[id] = sum;
    }

#else
    if (id < reduction_size) {
        // id is now the reduction coordinate
        sum = D[id * threads];
        for (int i = 1; i < threads; i++) {
            sum += D[id * threads + i];
        }
        D[id * threads] = sum;
    }
#endif
}

#ifdef ALT_VECTOR_REDUCTION

template <typename T>
__global__ void sum_blocked_vectorreduction_k2(T *D, const int reduction_size, const int threads) {

    int id = threadIdx.x + blockIdx.x * blockDim.x;
    if (id < threads) {
        for (; id < reduction_size; id += threads) {
            D[id] = D[id * threads];
        }
    }
}

#endif

template <typename T>
void sum_blocked_vectorreduction(T *data, const int reduction_size, const int threads) {

    // straightforward implementation, use as many threads as elements in reduction vector

    int blocks = (reduction_size + N_threads - 1) / N_threads;

    sum_blocked_vectorreduction_kernel<<<blocks, N_threads>>>(data, reduction_size, threads);

#ifdef ALT_VECTOR_REDUCTION

    sum_blocked_vectorreduction_k2<<<1, N_threads>>>(data, reduction_size, threads);
#endif

    check_device_error("sum_blocked_vectorreduction");
}
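
// Illustrative sketch (not part of the interface): the data layout assumed by the default
// (non-ALT_VECTOR_REDUCTION) branch. The function name and the buffer are hypothetical.
#if 0
inline void example_vectorreduction(double *d_buf, int reduction_size, int threads) {
    // d_buf holds reduction_size * threads doubles in device memory: the contribution of
    // "thread" t to reduction coordinate r is stored at d_buf[r + t * reduction_size].
    sum_blocked_vectorreduction(d_buf, reduction_size, threads);
    // Now d_buf[0 .. reduction_size-1] contain the sums over t; the tail is scratch.
}
#endif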

///////////////////////

template <typename T>
__global__ void gpu_set_value_kernel(T *vector, T value, int elems) {
    int Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < elems) {
        vector[Index] = value;
    }
}

// variant passing a pointer instead of the value directly
template <typename T>
__global__ void gpu_set_value_kernel_ptr(T *vector, const T *valptr, int elems) {
    int Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < elems) {
        vector[Index] = *valptr;
    }
}

// template <typename T>
// __global__ void gpu_set_zero_kernel(T *vector, int elems) {
//     unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
//     if (Index < elems) {
//         vector[Index] = 0;
//     }
// }

/// use memset to set the value to zero - not usable for other values
template <typename T>
inline void gpu_set_zero(T *vec, size_t N) {
    gpuMemset(vec, 0, N * sizeof(T));
}

/// Setting to some other value needs to be done element by element
template <typename T>
void gpu_set_value(T *vec, const T &val, size_t N) {
    int blocks = N / N_threads + 1;
    if constexpr (sizeof(T) <= GPU_GLOBAL_ARG_MAX_SIZE)
        // small T size, pass as an argument to the __global__ kernel
        gpu_set_value_kernel<<<blocks, N_threads>>>(vec, val, N);
    else {
        // bigger size: copy the value to device memory and pass a pointer
        T *buf;
        gpuMalloc(&buf, sizeof(T));
        gpuMemcpy(buf, (char *)&val, sizeof(T), gpuMemcpyHostToDevice);

        // launch the kernel, which reads the value through the pointer
        gpu_set_value_kernel_ptr<<<blocks, N_threads>>>(vec, buf, N);
        gpuFree(buf);
    }
}
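
// Illustrative sketch (not part of the interface): initializing a device buffer with the helpers
// above. The function name and the buffer are hypothetical.
#if 0
inline void example_init(double *d_buf, size_t n) {
    gpu_set_zero(d_buf, n);        // fast path: plain gpuMemset
    gpu_set_value(d_buf, 3.14, n); // element-by-element kernel; the value is passed as a kernel
                                   // argument or through device memory, depending on sizeof(T)
}
#endif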

// template <typename T>
// void gpu_set_zero(T *vec, size_t N) {
//     int blocks = N / N_threads + 1;
//     gpu_set_zero_kernel<<<blocks, N_threads>>>(vec, N);
// }

#endif // !HILAPP

#endif