#ifndef GPU_TEMPLATED_OPS_H
#define GPU_TEMPLATED_OPS_H

#include <type_traits>
#include <vector>

#include "plumbing/backend_gpu/defs.h"
#include "plumbing/random.h"
#include "plumbing/type_tools.h"
// Set a variable to zero; trivial assignment for arithmetic types
template <typename T, std::enable_if_t<hila::is_arithmetic<T>::value, int> = 0>
__device__ static void _hila_kernel_set_zero(T &v) {
    v = 0;
}
// For non-arithmetic types, treat the variable as an array of its underlying
// arithmetic type and zero each component
template <typename T, std::enable_if_t<!hila::is_arithmetic<T>::value, int> = 0>
__device__ static void _hila_kernel_set_zero(T &v) {
    using ntype = hila::arithmetic_type<T>;
    constexpr int N = sizeof(T) / sizeof(ntype);
    ntype *arr = reinterpret_cast<ntype *>(&v);
    for (int i = 0; i < N; i++)
        arr[i] = 0;
}
// Copy one variable to another; trivial assignment for arithmetic types
template <typename T, std::enable_if_t<hila::is_arithmetic<T>::value, int> = 0>
__device__ static void _hila_kernel_copy_var(T &a, const T &b) {
    a = b;
}
// Component-wise copy for non-arithmetic types
template <typename T, std::enable_if_t<!hila::is_arithmetic<T>::value, int> = 0>
__device__ static void _hila_kernel_copy_var(T &a, const T &b) {
    using ntype = hila::arithmetic_type<T>;
    constexpr int N = sizeof(T) / sizeof(ntype);
    ntype *ar = reinterpret_cast<ntype *>(&a);
    const ntype *br = reinterpret_cast<const ntype *>(&b);
    for (int i = 0; i < N; i++)
        ar[i] = br[i];
}
// Add one variable to another; trivial += for arithmetic types
template <typename T, std::enable_if_t<hila::is_arithmetic<T>::value, int> = 0>
__device__ static void _hila_kernel_add_var(T &a, const T &b) {
    a += b;
}
// Component-wise addition for non-arithmetic types
template <typename T, std::enable_if_t<!hila::is_arithmetic<T>::value, int> = 0>
__device__ static void _hila_kernel_add_var(T &a, const T &b) {
    using ntype = hila::arithmetic_type<T>;
    constexpr int N = sizeof(T) / sizeof(ntype);
    ntype *ar = reinterpret_cast<ntype *>(&a);
    const ntype *br = reinterpret_cast<const ntype *>(&b);
    for (int i = 0; i < N; i++)
        ar[i] += br[i];
}
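// Sketch of how these helpers are used inside device code (illustrative only;
// Complex<double> stands for any non-arithmetic HILA type whose storage is an
// array of its hila::arithmetic_type):
//
//   __global__ void example_kernel(Complex<double> *out, const Complex<double> *in, int n) {
//       int i = threadIdx.x + blockIdx.x * blockDim.x;
//       if (i < n) {
//           Complex<double> acc;
//           _hila_kernel_set_zero(acc);        // zero element by element
//           _hila_kernel_add_var(acc, in[i]);  // acc += in[i]
//           _hila_kernel_copy_var(out[i], acc);
//       }
//   }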
// One reduction step: each of the new_size threads sums elems strided elements
// of vector into vector[Index]
template <typename T>
__global__ void gpu_reduce_sum_kernel(T *vector, int vector_size, int new_size, int elems) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < new_size) {
        for (int i = 1; i < elems; i++) {
            int ind = Index + i * new_size;
            _hila_kernel_add_var(vector[Index], vector[ind]);
        }
    }
}
// Sum all N elements of the device array vector; the reduction is done in place,
// so the contents of vector are destroyed
template <typename T>
T gpu_reduce_sum(T *vector, int N) {
    const int reduce_step = 32;
    T sum = 0;
    T *host_vector = (T *)memalloc(N * sizeof(T));
    int vector_size = N;

    while (vector_size > reduce_step) {
        // leftover elements that do not divide evenly into reduce_step groups
        int first = vector_size % reduce_step;
        // each kernel thread sums reduce_step elements into one
        int new_size = (vector_size - first) / reduce_step;
        int blocks = (new_size - 1) / N_threads + 1;
        gpu_reduce_sum_kernel<<<blocks, N_threads>>>(vector + first, vector_size, new_size,
                                                     reduce_step);
        check_device_error("gpu_reduce_sum kernel");

        // the leftover elements remain at the front of the array
        vector_size = new_size + first;
        gpuDeviceSynchronize();
    }

    // sum the remaining (at most reduce_step) elements on the host
    gpuMemcpy(host_vector, vector, vector_size * sizeof(T), gpuMemcpyDeviceToHost);
    for (int i = 0; i < vector_size; i++) {
        sum += host_vector[i];
    }
    free(host_vector);

    return sum;
}
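// Usage sketch (illustrative; d_buf is a device array of n doubles allocated with
// gpuMalloc and filled by earlier kernels). The reduction works in place, so the
// contents of d_buf are destroyed:
//
//   double total = gpu_reduce_sum(d_buf, n);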
// One reduction step for the product: multiply elems strided elements into vector[Index]
template <typename T>
__global__ void gpu_reduce_product_kernel(T *vector, int vector_size, int new_size, int elems) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < new_size) {
        for (int i = 1; i < elems; i++) {
            int ind = Index + i * new_size;
            vector[Index] *= vector[ind];
        }
    }
}
// Multiply together all N elements of the device array vector; works in place,
// destroying the contents of vector
template <typename T>
T gpu_reduce_product(T *vector, int N) {
    const int reduce_step = 32;
    T prod = 1;
    T *host_vector = (T *)malloc(N * sizeof(T));
    int vector_size = N;

    while (vector_size > reduce_step) {
        // leftover elements that do not divide evenly into reduce_step groups
        int first = vector_size % reduce_step;
        // each kernel thread multiplies reduce_step elements into one
        int new_size = (vector_size - first) / reduce_step;
        int blocks = (new_size - 1) / N_threads + 1;
        gpu_reduce_product_kernel<<<blocks, N_threads>>>(vector + first, vector_size, new_size,
                                                         reduce_step);

        vector_size = new_size + first;
        gpuDeviceSynchronize();
    }
    check_device_error("gpu_reduce_product kernel");

    // multiply the remaining (at most reduce_step) elements on the host
    gpuMemcpy(host_vector, vector, vector_size * sizeof(T), gpuMemcpyDeviceToHost);
    for (int i = 0; i < vector_size; i++) {
        prod *= host_vector[i];
    }
    free(host_vector);

    return prod;
}
// Sum several consecutive reductions: d_array contains vector.size() blocks of N
// elements each, and block v is summed into vector[v]
template <typename T>
void gpu_multireduce_sum(std::vector<T> &vector, T *d_array, int N) {
    for (int v = 0; v < vector.size(); v++) {
        vector[v] += gpu_reduce_sum(d_array + v * N, N);
    }
}
// As above for products; the result vector is passed by reference so the
// accumulated values are visible to the caller
template <typename T>
void gpu_multireduce_product(std::vector<T> &vector, T *d_array, int N) {
    for (int v = 0; v < vector.size(); v++) {
        vector[v] += gpu_reduce_product(d_array + v * N, N);
    }
}
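// Usage sketch (illustrative; d_arrays holds n_red independent reductions of
// length N each, stored back to back in device memory):
//
//   std::vector<double> results(n_red, 0.0);
//   gpu_multireduce_sum(results, d_arrays, N);   // results[v] += sum of block v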
// Atomic add for double: on devices with compute capability < 6.0 there is no
// native atomicAdd(double *, double), so emulate it with an atomicCAS loop
#if __CUDA_ARCH__ < 600

__device__ inline double atomic_Add(double *dp, double v) {
    unsigned long long int *dp_ull = (unsigned long long int *)dp;
    unsigned long long int old = *dp_ull;
    unsigned long long int av;

    do {
        av = old;
        old = atomicCAS(dp_ull, av, __double_as_longlong(v + __longlong_as_double(av)));
        // integer comparison avoids hanging on NaN (NaN != NaN in floating point)
    } while (av != old);

    return __longlong_as_double(old);
}

#else
// Compute capability >= 6.0 has a native atomicAdd for double
__device__ inline double atomic_Add(double *dp, double v) {
    return atomicAdd(dp, v);
}

#endif
__device__ inline float atomic_Add(float *dp, float v) {
    return atomicAdd(dp, v);
}
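// Sketch of the intended use in a kernel (illustrative): many threads accumulate
// into one device-memory double without races:
//
//   __global__ void accumulate_norm(double *result, const double *vals, int n) {
//       int i = threadIdx.x + blockIdx.x * blockDim.x;
//       if (i < n)
//           atomic_Add(result, vals[i] * vals[i]);
//   }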
// Atomic add for 64-bit integral types, routed through the unsigned long long overload
template <typename T, typename B,
          std::enable_if_t<std::is_integral<T>::value &&
                               sizeof(T) == sizeof(unsigned long long) &&
                               std::is_convertible<B, T>::value,
                           int> = 0>
__device__ inline T atomicAdd(T *dp, B v) {
    T tv = v;
    return atomicAdd((unsigned long long int *)dp, (unsigned long long int)tv);
}
// Atomic add for non-arithmetic but assignable types (Complex, Vector, Matrix, ...):
// apply atomic_Add component by component on the underlying arithmetic type
template <typename T, typename B,
          std::enable_if_t<!std::is_arithmetic<T>::value && std::is_assignable<T &, T>::value,
                           int> = 0>
__device__ inline void atomicAdd(T *d, const B &bv) {

    T v = bv;
    hila::arithmetic_type<T> *dp;
    const hila::arithmetic_type<T> *dv;
    constexpr int N = sizeof(T) / sizeof(hila::arithmetic_type<T>);

    dp = (hila::arithmetic_type<T> *)(void *)d;
    dv = (hila::arithmetic_type<T> *)(void *)&v;

    for (int i = 0; i < N; ++i) {
        atomic_Add(dp + i, dv[i]);
    }
}
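// The composite-type overload lets kernels accumulate e.g. Complex or Matrix
// variables atomically, component by component (illustrative sketch):
//
//   __global__ void accumulate(Complex<double> *target, const Complex<double> *contrib, int n) {
//       int i = threadIdx.x + blockIdx.x * blockDim.x;
//       if (i < n)
//           atomicAdd(target, contrib[i]);   // dispatches to the element-wise overload
//   }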
// Multiply two doubles atomically; there is no native atomic multiply, so use an
// atomicCAS loop
__device__ inline double atomicMultiply(double *dp, double v) {

    unsigned long long int *dp_ull = (unsigned long long int *)dp;
    unsigned long long int old = *dp_ull;
    unsigned long long int av;

    do {
        av = old;
        old = atomicCAS(dp_ull, av, __double_as_longlong(v * __longlong_as_double(av)));
        // integer comparison avoids hanging on NaN (NaN != NaN in floating point)
    } while (av != old);

    return __longlong_as_double(old);
}
// Multiply two floats atomically with an atomicCAS loop
__device__ inline float atomicMultiply(float *dp, float v) {

    unsigned int *dp_ui = (unsigned int *)dp;
    unsigned int old = *dp_ui;
    unsigned int av;

    do {
        av = old;
        old = atomicCAS(dp_ui, av, __float_as_int(v * __int_as_float(av)));
    } while (av != old);

    return __int_as_float(old);
}
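// Usage sketch (illustrative): accumulating a product over threads into a single
// device variable that has been initialized to 1.0:
//
//   __global__ void product_kernel(double *result, const double *vals, int n) {
//       int i = threadIdx.x + blockIdx.x * blockDim.x;
//       if (i < n)
//           atomicMultiply(result, vals[i]);
//   }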
// Sum the `threads` per-thread copies of each of the reduction_size elements in
// the device array D; the layout of the copies depends on ALT_VECTOR_REDUCTION
template <typename T>
__global__ void sum_blocked_vectorreduction_kernel(T *D, const int reduction_size,
                                                   const int threads) {
    int id = threadIdx.x + blockIdx.x * blockDim.x;

#ifndef ALT_VECTOR_REDUCTION
    // copies are strided: copy i of element id is at D[id + i * reduction_size]
    if (id < reduction_size) {
        T sum = D[id];
        for (int i = 1; i < threads; i++) {
            sum += D[id + i * reduction_size];
        }
        D[id] = sum;
    }
#else
    // copies are consecutive: copy i of element id is at D[id * threads + i]
    if (id < reduction_size) {
        T sum = D[id * threads];
        for (int i = 1; i < threads; i++) {
            sum += D[id * threads + i];
        }
        D[id * threads] = sum;
    }
#endif
}
#ifdef ALT_VECTOR_REDUCTION
// Second pass for the alternative layout: move the per-element sums, stored at
// stride `threads`, to the first reduction_size slots of D
template <typename T>
__global__ void sum_blocked_vectorreduction_k2(T *D, const int reduction_size,
                                               const int threads) {
    int id = threadIdx.x + blockIdx.x * blockDim.x;

    for (; id < reduction_size; id += threads) {
        D[id] = D[id * threads];
    }
}
#endif
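// Usage sketch for the host entry point below (illustrative): with n_red reduction
// variables and n_copies per-thread accumulation copies stored in a device array
// d_red of n_red * n_copies elements, the per-variable sums end up in the first
// n_red slots:
//
//   sum_blocked_vectorreduction(d_red, n_red, n_copies);
//   gpuMemcpy(host_res, d_red, n_red * sizeof(double), gpuMemcpyDeviceToHost);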
// Host entry point: sum the `threads` copies of each element in place in `data`
template <typename T>
void sum_blocked_vectorreduction(T *data, const int reduction_size, const int threads) {

    int blocks = (reduction_size - 1) / N_threads + 1;

    sum_blocked_vectorreduction_kernel<<<blocks, N_threads>>>(data, reduction_size, threads);

#ifdef ALT_VECTOR_REDUCTION
    sum_blocked_vectorreduction_k2<<<1, N_threads>>>(data, reduction_size, threads);
#endif

    check_device_error("sum_blocked_vectorreduction");
}
// Set each of the elems entries of vector to value
template <typename T>
__global__ void gpu_set_value_kernel(T *vector, T value, int elems) {
    int Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < elems)
        vector[Index] = value;
}
// As above, but read the value through a device pointer (used when T is too large
// to be passed as a kernel argument)
template <typename T>
__global__ void gpu_set_value_kernel_ptr(T *vector, const T *valptr, int elems) {
    int Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < elems)
        vector[Index] = *valptr;
}
// Zero N elements of type T with a single memset
template <typename T>
inline void gpu_set_zero(T *vec, size_t N) {
    gpuMemset(vec, 0, N * sizeof(T));
}
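// Usage sketch for gpu_set_zero and gpu_set_value below (illustrative; d_vec is a
// device array of n Complex<double> elements):
//
//   gpu_set_zero(d_vec, n);                             // plain memset to zero
//   gpu_set_value(d_vec, Complex<double>(1, 0), n);     // element-by-element assignment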
// Set N elements of vec to val; pass val directly if it fits in a kernel argument,
// otherwise stage it through a temporary device buffer
template <typename T>
void gpu_set_value(T *vec, const T &val, size_t N) {
    int blocks = N / N_threads + 1;

    if constexpr (sizeof(T) <= GPU_GLOBAL_ARG_MAX_SIZE) {
        gpu_set_value_kernel<<<blocks, N_threads>>>(vec, val, N);
    } else {
        // value too large for a kernel argument, copy it to device memory first
        T *buf;
        gpuMalloc(&buf, sizeof(T));
        gpuMemcpy(buf, (char *)&val, sizeof(T), gpuMemcpyHostToDevice);
        gpu_set_value_kernel_ptr<<<blocks, N_threads>>>(vec, buf, N);
        gpuFree(buf);
    }
}

#endif