gpu_templated_ops.h
#ifndef GPU_TEMPLATED_OPS_H
#define GPU_TEMPLATED_OPS_H

#include "plumbing/defs.h"
#include "plumbing/backend_gpu/defs.h"

// this include has to be after the backend defs, because those define hila::random()
#include "plumbing/random.h"

#include "plumbing/type_tools.h"

// #if defined(__CUDACC__) || defined(__HIPCC__)

#if !defined(HILAPP)

/* Reduction */
/*
template<typename T>
T cuda_reduce_sum( T * vector, int N ){
    static bool initialized = false;
    static T * d_sum;
    static void *d_temp_storage;
    static size_t temp_storage_size;

    T sum;

    if( !initialized ) {
        cudaMalloc( &d_sum, sizeof(T) );
        d_temp_storage = NULL;
        temp_storage_size = 0;
        cub::DeviceReduce::Sum(d_temp_storage, temp_storage_size, vector, d_sum, N);
        initialized = true;
    }

    // Allocate temporary storage
    cudaMalloc(&d_temp_storage, temp_storage_size);

    // Run sum-reduction
    cub::DeviceReduce::Sum(d_temp_storage, temp_storage_size, vector, d_sum, N);

    cudaMemcpy( &sum, d_sum, sizeof(T), cudaMemcpyDeviceToHost );

    return sum;
}
*/

// Functions used by hilapp in reductions
// (cannot guarantee operator= and operator+= are marked __device__)

template <typename T, std::enable_if_t<hila::is_arithmetic<T>::value, int> = 0>
__device__ static void _hila_kernel_set_zero(T &v) {
    v = 0;
}

template <typename T, std::enable_if_t<!hila::is_arithmetic<T>::value, int> = 0>
__device__ static void _hila_kernel_set_zero(T &v) {
    using ntype = hila::arithmetic_type<T>;
    constexpr int N = sizeof(T) / sizeof(ntype);

    ntype *arr = reinterpret_cast<ntype *>(&v);
    for (int i = 0; i < N; i++)
        arr[i] = 0;
}

template <typename T, std::enable_if_t<hila::is_arithmetic<T>::value, int> = 0>
__device__ static void _hila_kernel_copy_var(T &a, const T &b) {
    a = b;
}

template <typename T, std::enable_if_t<!hila::is_arithmetic<T>::value, int> = 0>
__device__ static void _hila_kernel_copy_var(T &a, const T &b) {
    using ntype = hila::arithmetic_type<T>;
    constexpr int N = sizeof(T) / sizeof(ntype);

    ntype *ar = reinterpret_cast<ntype *>(&a);
    const ntype *br = reinterpret_cast<const ntype *>(&b);

    for (int i = 0; i < N; i++)
        ar[i] = br[i];
}

template <typename T, std::enable_if_t<hila::is_arithmetic<T>::value, int> = 0>
__device__ static void _hila_kernel_add_var(T &a, const T &b) {
    a += b;
}

template <typename T, std::enable_if_t<!hila::is_arithmetic<T>::value, int> = 0>
__device__ static void _hila_kernel_add_var(T &a, const T &b) {
    using ntype = hila::arithmetic_type<T>;
    constexpr int N = sizeof(T) / sizeof(ntype);

    ntype *ar = reinterpret_cast<ntype *>(&a);
    const ntype *br = reinterpret_cast<const ntype *>(&b);

    for (int i = 0; i < N; i++)
        ar[i] += br[i];
}
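
// Illustrative sketch (comment only, not the code hilapp actually generates): how a
// reduction kernel might use the helpers above instead of operator= / operator+=,
// which are not guaranteed to be __device__. The kernel name, shared-memory layout
// and the power-of-two blockDim.x are assumptions made for this example.
//
//   template <typename T>
//   __global__ void example_block_sum(const T *in, T *block_out, int n) {
//       extern __shared__ char smem[];
//       T *buf = reinterpret_cast<T *>(smem);
//       int tid = threadIdx.x;
//       int idx = threadIdx.x + blockIdx.x * blockDim.x;
//
//       _hila_kernel_set_zero(buf[tid]);
//       if (idx < n)
//           _hila_kernel_copy_var(buf[tid], in[idx]);
//       __syncthreads();
//
//       // standard tree reduction within the block
//       for (int s = blockDim.x / 2; s > 0; s /= 2) {
//           if (tid < s)
//               _hila_kernel_add_var(buf[tid], buf[tid + s]);
//           __syncthreads();
//       }
//       if (tid == 0)
//           _hila_kernel_copy_var(block_out[blockIdx.x], buf[0]);
//   }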


////////////////////////////////////////////////////////////

// A simple hand-written reduction that does not require a library
template <typename T>
__global__ void gpu_reduce_sum_kernel(T *vector, int vector_size, int new_size, int elems) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < new_size) {
        for (int i = 1; i < elems; i++) {
            int ind = Index + i * new_size;
            // vector[Index] += vector[ind];
            _hila_kernel_add_var(vector[Index], vector[ind]);
        }
    }
}

template <typename T>
T gpu_reduce_sum(T *vector, int N) {
    const int reduce_step = 32;
    T sum = 0;
    T *host_vector = (T *)memalloc(N * sizeof(T));
    int vector_size = N;

    while (vector_size > reduce_step) {
        // Leave out the first (vector_size % reduce_step) elements so that the
        // remaining block is divisible by reduce_step
        int first = vector_size % reduce_step;
        // Calculate the size of the reduced list
        int new_size = (vector_size - first) / reduce_step;
        // Find number of blocks and launch the kernel
        int blocks = (new_size - 1) / N_threads + 1;
        gpu_reduce_sum_kernel<<<blocks, N_threads>>>(vector + first, vector_size, new_size,
                                                     reduce_step);
        check_device_error("gpu_reduce_sum kernel");
        // Find the full size of the resulting array
        vector_size = new_size + first;
        gpuDeviceSynchronize();
    }
    gpuMemcpy(host_vector, vector, vector_size * sizeof(T), gpuMemcpyDeviceToHost);

    for (int i = 0; i < vector_size; i++) {
        sum += host_vector[i];
    }

    free(host_vector);

    return sum;
}
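
// Usage sketch (comment only): reducing a device buffer of N doubles. d_data is an
// assumed device pointer holding N partial results (allocation and filling not
// shown); note that gpu_reduce_sum modifies the buffer while reducing it.
//
//   double *d_data = ...;                       // device pointer, N elements
//   double total = gpu_reduce_sum(d_data, N);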

template <typename T>
__global__ void gpu_reduce_product_kernel(T *vector, int vector_size, int new_size, int elems) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < new_size) {
        for (int i = 1; i < elems; i++) {
            int ind = Index + i * new_size;
            vector[Index] *= vector[ind];
        }
    }
}

template <typename T>
T gpu_reduce_product(T *vector, int N) {
    const int reduce_step = 32;
    T prod = 1;
    T *host_vector = (T *)malloc(N * sizeof(T));
    int vector_size = N;
    while (vector_size > reduce_step) {
        // Leave out the first (vector_size % reduce_step) elements so that the
        // remaining block is divisible by reduce_step
        int first = vector_size % reduce_step;
        // Calculate the size of the reduced list
        int new_size = (vector_size - first) / reduce_step;
        // Find number of blocks and launch the kernel
        int blocks = new_size / N_threads + 1;
        gpu_reduce_product_kernel<<<blocks, N_threads>>>(vector + first, vector_size, new_size,
                                                         reduce_step);

        // Find the full size of the resulting array
        vector_size = new_size + first;

        gpuDeviceSynchronize();
    }

    check_device_error("gpu_reduce_product kernel");

    gpuMemcpy(host_vector, vector, vector_size * sizeof(T), gpuMemcpyDeviceToHost);

    for (int i = 0; i < vector_size; i++) {
        prod *= host_vector[i];
    }

    free(host_vector);
    return prod;
}
192
193#if 0
194
195template <typename T>
196void gpu_multireduce_sum(std::vector<T> &vector, T *d_array, int N) {
197 for (int v = 0; v < vector.size(); v++) {
198 vector[v] += gpu_reduce_sum(d_array + v * N, N);
199 }
200}
201
202template <typename T>
203void gpu_multireduce_product(std::vector<T> vector, T *d_array, int N) {
204 for (int v = 0; v < vector.size(); v++) {
205 vector[v] += gpu_reduce_product(d_array + v * N, N);
206 }
207}
208
209#endif

/// Implement atomicAdd for double precision on devices with compute capability
/// below 6.0, following the CUDA toolkit documentation.
/// atomicCAS(unsigned long long *a, unsigned long long b, unsigned long long v)
/// does the operation *a = (*a == b ? v : *a), i.e. assigns v to *a if *a == b,
/// atomically. Returns the old value of *a.

#if __CUDA_ARCH__ < 600

__device__ inline double atomic_Add(double *dp, double v) {

    unsigned long long int *dp_ull = (unsigned long long int *)dp;
    unsigned long long int old = *dp_ull;
    unsigned long long int av;

    do {
        av = old;
        old = atomicCAS(dp_ull, av, __double_as_longlong(v + __longlong_as_double(av)));

        // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
    } while (av != old);

    return __longlong_as_double(old);
}

#else
__device__ inline double atomic_Add(double *dp, double v) {
    return atomicAdd(dp, v);
}

#endif

/// Float atomicAdd exists natively; wrap it here for a uniform interface

__device__ inline float atomic_Add(float *dp, float v) {
    return atomicAdd(dp, v);
}
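
// Usage sketch (comment only): accumulating per-thread contributions into a single
// device-resident double. The kernel and variable names are illustrative only.
//
//   __global__ void example_accumulate(const double *in, double *result, int n) {
//       int idx = threadIdx.x + blockIdx.x * blockDim.x;
//       if (idx < n)
//           atomic_Add(result, in[idx]);   // works also for __CUDA_ARCH__ < 600
//   }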


/// Do addition for "long long"-sized (64-bit) integer types.
/// CUDA provides an unsigned long long atomicAdd, which is used here. The signed
/// version is obtained simply by casting the values to unsigned long long and
/// using the unsigned atomicAdd - this gives the correct signed addition
/// with "wraparound" overflow behaviour.

template <typename T, typename B,
          std::enable_if_t<std::is_integral<T>::value && sizeof(T) == sizeof(unsigned long long) &&
                               std::is_convertible<B, T>::value,
                           int> = 0>
__device__ inline T atomicAdd(T *dp, B v) {

    T tv = v;
    return atomicAdd((unsigned long long int *)dp, (unsigned long long int)tv);
}
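
// Why the unsigned cast gives the correct signed result: two's complement addition
// wraps identically for signed and unsigned 64-bit values. Host-side illustration
// (comment only, the names are just for the example):
//
//   int64_t a = 5, b = -3;
//   uint64_t r = (uint64_t)a + (uint64_t)b;   // wraps modulo 2^64
//   // (int64_t)r == 2, the same result as a + b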

/// Atomic add for composed datatypes - do element-by-element
/// requires that hila::arithmetic_type is defined
template <
    typename T, typename B,
    std::enable_if_t<!std::is_arithmetic<T>::value && std::is_convertible<B, T>::value, int> = 0>
__device__ inline void atomicAdd(T *d, const B &bv) {

    T v = bv;
    hila::arithmetic_type<T> *dp;
    const hila::arithmetic_type<T> *dv;
    constexpr int N = sizeof(T) / sizeof(hila::arithmetic_type<T>);

    dp = (hila::arithmetic_type<T> *)(void *)d;
    dv = (hila::arithmetic_type<T> *)(void *)&v;

    for (int i = 0; i < N; ++i) {
        atomic_Add(dp + i, dv[i]);
    }
}
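
// Usage sketch (comment only): element-wise atomic accumulation of a composed type.
// Complex<double> stands in for any type whose hila::arithmetic_type is defined and
// whose memory layout is an array of that arithmetic type.
//
//   __global__ void example_sum_field(const Complex<double> *in, Complex<double> *result, int n) {
//       int idx = threadIdx.x + blockIdx.x * blockDim.x;
//       if (idx < n)
//           atomicAdd(result, in[idx]);   // resolves to the overload above
//   }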

/// Multiply 2 doubles atomically
__device__ inline double atomicMultiply(double *dp, double v) {

    unsigned long long int *dp_ull = (unsigned long long int *)dp;
    unsigned long long int old = *dp_ull;
    unsigned long long int av;

    do {
        av = old;
        old = atomicCAS(dp_ull, av, __double_as_longlong(v * __longlong_as_double(av)));

        // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
    } while (av != old);

    return __longlong_as_double(old);
}

/// Multiply 2 floats atomically
__device__ inline float atomicMultiply(float *dp, float v) {
    unsigned int *dp_ui = (unsigned int *)dp;
    unsigned int old = *dp_ui;
    unsigned int av;

    do {
        av = old;
        old = atomicCAS(dp_ui, av, __float_as_int(v * __int_as_float(av)));

        // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
    } while (av != old);

    return __int_as_float(old);
}

///////////////////////////////////////////////////////////////////////////////

// NOTE: IF YOU DEFINE ALT_VECTOR_REDUCTION YOU NEED TO DEFINE THE SAME IN
// codegen_gpu.cpp!

// #define ALT_VECTOR_REDUCTION

template <typename T>
__global__ void sum_blocked_vectorreduction_kernel(T *D, const int reduction_size,
                                                   const int threads) {
    int id = threadIdx.x + blockIdx.x * blockDim.x;

    T sum;

#ifndef ALT_VECTOR_REDUCTION
    if (id < reduction_size) {
        // id is now the reduction coordinate
        sum = D[id];
        for (int i = 1; i < threads; i++) {
            // accumulate the remaining partial sums into the first element
            sum += D[id + i * reduction_size];
        }
        D[id] = sum;
    }

#else
    if (id < reduction_size) {
        // id is now the reduction coordinate
        sum = D[id * threads];
        for (int i = 1; i < threads; i++) {
            sum += D[id * threads + i];
        }
        D[id * threads] = sum;
    }
#endif
}

#ifdef ALT_VECTOR_REDUCTION

template <typename T>
__global__ void sum_blocked_vectorreduction_k2(T *D, const int reduction_size, const int threads) {

    int id = threadIdx.x + blockIdx.x * blockDim.x;
    if (id < threads) {
        for (; id < reduction_size; id += threads) {
            D[id] = D[id * threads];
        }
    }
}

#endif

template <typename T>
void sum_blocked_vectorreduction(T *data, const int reduction_size, const int threads) {

    // straightforward implementation, use as many threads as elements in reduction vector

    int blocks = (reduction_size + N_threads - 1) / N_threads;

    sum_blocked_vectorreduction_kernel<<<blocks, N_threads>>>(data, reduction_size, threads);

#ifdef ALT_VECTOR_REDUCTION

    sum_blocked_vectorreduction_k2<<<1, N_threads>>>(data, reduction_size, threads);
#endif

    check_device_error("sum_blocked_vectorreduction");
}
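
// Layout sketch (comment only): with the default (non-ALT) layout the buffer holds
// `threads` consecutive blocks of `reduction_size` partial sums each,
//
//   data = [ b0_e0, b0_e1, ..., b0_e(R-1),  b1_e0, ..., b1_e(R-1),  ... ]
//
// and after the call element i of the reduced vector sits in data[i] for
// 0 <= i < reduction_size. A call might look like (buffer names are illustrative):
//
//   sum_blocked_vectorreduction(d_buf, reduction_size, n_partial_blocks);
//   gpuMemcpy(h_result, d_buf, reduction_size * sizeof(T), gpuMemcpyDeviceToHost);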


///////////////////////

template <typename T>
__global__ void gpu_set_value_kernel(T *vector, T value, int elems) {
    int Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < elems) {
        vector[Index] = value;
    }
}

// template <typename T>
// __global__ void gpu_set_zero_kernel(T *vector, int elems) {
//     unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
//     if (Index < elems) {
//         vector[Index] = 0;
//     }
// }


/// Use memset to set the value to zero - works only for zero, not for other values
template <typename T>
inline void gpu_set_zero(T *vec, size_t N) {
    gpuMemset(vec, 0, N * sizeof(T));
}

/// Setting to an arbitrary value needs to be done element-by-element
template <typename T>
void gpu_set_value(T *vec, const T &val, size_t N) {
    int blocks = N / N_threads + 1;
    gpu_set_value_kernel<<<blocks, N_threads>>>(vec, val, N);
}
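
// Usage sketch (comment only): d_buf is an assumed device pointer to N elements of
// type T (allocation not shown).
//
//   gpu_set_zero(d_buf, N);           // zero-fill via gpuMemset
//   gpu_set_value(d_buf, T(1), N);    // arbitrary value, one kernel launch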

// template <typename T>
// void gpu_set_zero(T *vec, size_t N) {
//     int blocks = N / N_threads + 1;
//     gpu_set_zero_kernel<<<blocks, N_threads>>>(vec, N);
// }


#endif // !HILAPP

#endif // GPU_TEMPLATED_OPS_H