#ifndef HILA_GPU_TEMPLATED_OPS_H
#define HILA_GPU_TEMPLATED_OPS_H

#include "plumbing/backend_gpu/defs.h"
#include "plumbing/random.h"
#include "plumbing/type_tools.h"

// Select the CUB implementation and stream type for the active GPU backend
#if defined(CUDA)
#include <cub/cub.cuh>
namespace gpucub = cub;
using gpuStream_t = cudaStream_t;

#elif defined(HIP)
#include <hipcub/hipcub.hpp>
namespace gpucub = hipcub;
using gpuStream_t = hipStream_t;
#endif
// Set a variable to zero on the device. Arithmetic types are assigned directly;
// other types are cleared element by element over their arithmetic base type.
template <typename T, std::enable_if_t<hila::is_arithmetic<T>::value, int> = 0>
__device__ static void _hila_kernel_set_zero(T &v) {
    v = 0;
}

template <typename T, std::enable_if_t<!hila::is_arithmetic<T>::value, int> = 0>
__device__ static void _hila_kernel_set_zero(T &v) {
    using ntype = hila::arithmetic_type<T>;
    constexpr int N = sizeof(T) / sizeof(ntype);
    ntype *arr = reinterpret_cast<ntype *>(&v);
    for (int i = 0; i < N; i++)
        arr[i] = 0;
}
// Copy a variable on the device: a = b.
template <typename T, std::enable_if_t<hila::is_arithmetic<T>::value, int> = 0>
__device__ static void _hila_kernel_copy_var(T &a, const T &b) {
    a = b;
}

template <typename T, std::enable_if_t<!hila::is_arithmetic<T>::value, int> = 0>
__device__ static void _hila_kernel_copy_var(T &a, const T &b) {
    using ntype = hila::arithmetic_type<T>;
    constexpr int N = sizeof(T) / sizeof(ntype);
    ntype *ar = reinterpret_cast<ntype *>(&a);
    const ntype *br = reinterpret_cast<const ntype *>(&b);
    for (int i = 0; i < N; i++)
        ar[i] = br[i];
}
// Add a variable on the device: a += b.
template <typename T, std::enable_if_t<hila::is_arithmetic<T>::value, int> = 0>
__device__ static void _hila_kernel_add_var(T &a, const T &b) {
    a += b;
}

template <typename T, std::enable_if_t<!hila::is_arithmetic<T>::value, int> = 0>
__device__ static void _hila_kernel_add_var(T &a, const T &b) {
    using ntype = hila::arithmetic_type<T>;
    constexpr int N = sizeof(T) / sizeof(ntype);
    ntype *ar = reinterpret_cast<ntype *>(&a);
    const ntype *br = reinterpret_cast<const ntype *>(&b);
    for (int i = 0; i < N; i++)
        ar[i] += br[i];
}
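
// Example (illustrative sketch): for a non-arithmetic HILA type such as
// Complex<double>, the helpers above operate component by component on the
// underlying doubles. The function name helper_demo is only an illustration:
//
//   __device__ void helper_demo(Complex<double> &a, const Complex<double> &b) {
//       _hila_kernel_set_zero(a);    // real and imaginary parts set to 0
//       _hila_kernel_add_var(a, b);  // component-wise a += b
//       _hila_kernel_copy_var(a, b); // component-wise a = b
//   }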
// Reduction kernel: thread Index sums `elems` strided values of vector into vector[Index].
template <typename T>
__global__ void gpu_reduce_sum_kernel(T *vector, int vector_size, int new_size, int elems) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < new_size) {
        for (int i = 1; i < elems; i++) {
            int ind = Index + i * new_size;
            _hila_kernel_add_var(vector[Index], vector[ind]);
        }
    }
}
// Sum-reduce the device array vector of length N: repeatedly fold the array by
// reduce_step on the device, then sum the remaining tail on the host.
template <typename T>
T gpu_reduce_sum(T *vector, int N) {
    const int reduce_step = 32;
    T sum = 0;
    T *host_vector = (T *)memalloc(N * sizeof(T));
    int vector_size = N;

    while (vector_size > reduce_step) {
        // Leave the remainder (first elements) untouched on this pass
        int first = vector_size % reduce_step;
        // Each thread folds reduce_step elements into one
        int new_size = (vector_size - first) / reduce_step;
        int blocks = (new_size - 1) / N_threads + 1;
        gpu_reduce_sum_kernel<<<blocks, N_threads>>>(vector + first, vector_size, new_size,
                                                     reduce_step);
        check_device_error("gpu_reduce_sum kernel");
        vector_size = new_size + first;
        gpuDeviceSynchronize();
    }

    gpuMemcpy(host_vector, vector, vector_size * sizeof(T), gpuMemcpyDeviceToHost);

    for (int i = 0; i < vector_size; i++) {
        sum += host_vector[i];
    }

    free(host_vector);
    return sum;
}
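
// Usage sketch (illustrative; d_buf and n are example names): sum a device
// buffer of doubles. Note that gpu_reduce_sum folds the device buffer in place.
//
//   double *d_buf;
//   gpuMalloc(&d_buf, n * sizeof(double));
//   // ... fill d_buf with a kernel ...
//   double total = gpu_reduce_sum(d_buf, n);
//   gpuFree(d_buf);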
template <typename T>
__global__ void gpu_reduce_product_kernel(T *vector, int vector_size, int new_size, int elems) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < new_size) {
        for (int i = 1; i < elems; i++) {
            int ind = Index + i * new_size;
            vector[Index] *= vector[ind];
        }
    }
}
// Product-reduce the device array vector of length N, analogously to gpu_reduce_sum.
template <typename T>
T gpu_reduce_product(T *vector, int N) {
    const int reduce_step = 32;
    T prod = 1;
    T *host_vector = (T *)malloc(N * sizeof(T));
    int vector_size = N;

    while (vector_size > reduce_step) {
        int first = vector_size % reduce_step;
        int new_size = (vector_size - first) / reduce_step;
        int blocks = (new_size - 1) / N_threads + 1;
        gpu_reduce_product_kernel<<<blocks, N_threads>>>(vector + first, vector_size, new_size,
                                                         reduce_step);
        vector_size = new_size + first;
        gpuDeviceSynchronize();
    }

    check_device_error("gpu_reduce_product kernel");

    gpuMemcpy(host_vector, vector, vector_size * sizeof(T), gpuMemcpyDeviceToHost);

    for (int i = 0; i < vector_size; i++) {
        prod *= host_vector[i];
    }

    free(host_vector);
    return prod;
}
// Reduce several vectors stored back to back in d_array (each of length N),
// accumulating one result per vector into the host-side std::vector.
template <typename T>
void gpu_multireduce_sum(std::vector<T> &vector, T *d_array, int N) {
    for (int v = 0; v < vector.size(); v++) {
        vector[v] += gpu_reduce_sum(d_array + v * N, N);
    }
}

template <typename T>
void gpu_multireduce_product(std::vector<T> &vector, T *d_array, int N) {
    for (int v = 0; v < vector.size(); v++) {
        vector[v] += gpu_reduce_product(d_array + v * N, N);
    }
}
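
// Usage sketch (illustrative): m independent length-N vectors stored back to
// back in one device array, one reduced value accumulated per vector.
//
//   std::vector<double> res(m, 0.0);
//   gpu_multireduce_sum(res, d_array, N);   // d_array holds m * N doubles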
// Atomic add for double. Before compute capability 6.0 there is no native
// double-precision atomicAdd, so emulate it with an atomicCAS loop.
#if __CUDA_ARCH__ < 600

__device__ inline double atomic_Add(double *dp, double v) {

    unsigned long long int *dp_ull = (unsigned long long int *)dp;
    unsigned long long int old = *dp_ull;
    unsigned long long int av;

    do {
        av = old;
        old = atomicCAS(dp_ull, av, __double_as_longlong(v + __longlong_as_double(av)));

        // Integer comparison avoids an endless loop on NaN (since NaN != NaN)
    } while (av != old);

    return __longlong_as_double(old);
}

#else

__device__ inline double atomic_Add(double *dp, double v) {
    return atomicAdd(dp, v);
}

#endif

// Single-precision atomicAdd is always available
__device__ inline float atomic_Add(float *dp, float v) {
    return atomicAdd(dp, v);
}
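
// Illustrative kernel sketch: accumulate per-thread contributions into a single
// double result with the atomic add above (works also on devices without native
// double-precision atomicAdd). Names are examples only:
//
//   __global__ void accumulate_kernel(const double *in, int n, double *result) {
//       int i = threadIdx.x + blockIdx.x * blockDim.x;
//       if (i < n)
//           atomic_Add(result, in[i]);
//   }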
// atomicAdd for integral types of the same size as unsigned long long
template <typename T, typename B,
          std::enable_if_t<std::is_integral<T>::value && sizeof(T) == sizeof(unsigned long long) &&
                               std::is_convertible<B, T>::value,
                           int> = 0>
__device__ inline T atomicAdd(T *dp, B v) {
    T tv = v;
    return atomicAdd((unsigned long long int *)dp, (unsigned long long int)tv);
}
// atomicAdd for non-arithmetic (composite) types: add element by element
// over the underlying arithmetic type using atomic_Add.
template <typename T, typename B,
          std::enable_if_t<!std::is_arithmetic<T>::value && std::is_assignable<T &, T>::value,
                           int> = 0>
__device__ inline void atomicAdd(T *d, const B &bv) {

    T v = bv;
    hila::arithmetic_type<T> *dp;
    const hila::arithmetic_type<T> *dv;
    constexpr int N = sizeof(T) / sizeof(hila::arithmetic_type<T>);

    dp = (hila::arithmetic_type<T> *)(void *)d;
    dv = (hila::arithmetic_type<T> *)(void *)&v;

    for (int i = 0; i < N; ++i) {
        atomic_Add(dp + i, dv[i]);
    }
}
// Atomic multiply for double, implemented with an atomicCAS loop
__device__ inline double atomicMultiply(double *dp, double v) {

    unsigned long long int *dp_ull = (unsigned long long int *)dp;
    unsigned long long int old = *dp_ull;
    unsigned long long int av;

    do {
        av = old;
        old = atomicCAS(dp_ull, av, __double_as_longlong(v * __longlong_as_double(av)));
    } while (av != old);

    return __longlong_as_double(old);
}
// Atomic multiply for float, implemented with an atomicCAS loop
__device__ inline float atomicMultiply(float *dp, float v) {

    unsigned int *dp_ui = (unsigned int *)dp;
    unsigned int old = *dp_ui;
    unsigned int av;

    do {
        av = old;
        old = atomicCAS(dp_ui, av, __float_as_int(v * __int_as_float(av)));
    } while (av != old);

    return __int_as_float(old);
}
// Block-reduction kernel using cub/hipcub: each block sums the `threads`
// consecutive values belonging to one reduction element and writes the
// block total to output[blockIdx.x].
template <typename T, int BLOCK_SIZE>
__global__ void sum_blocked_vectorreduction_kernel_cub(T *D, T *output, int reduction_size,
                                                       int threads) {
    using BlockReduce = gpucub::BlockReduce<T, BLOCK_SIZE>;
    __shared__ typename BlockReduce::TempStorage temp_storage;

    int global_thread_id = blockIdx.x * threads + threadIdx.x;
    int stride = BLOCK_SIZE;
    T local_sum, total_sum;
    _hila_kernel_set_zero(local_sum);

    for (int i = 0; i < max(1, (threads + stride - 1) / stride); i++) {
        int index = global_thread_id + i * stride;
        if (index < reduction_size * threads) {
            _hila_kernel_add_var(local_sum, D[index]);
        }
    }

    total_sum = BlockReduce(temp_storage).Sum(local_sum);
    if (threadIdx.x == 0) {
        _hila_kernel_copy_var(output[blockIdx.x], total_sum);
    }
}
// Host wrapper: reduce `reduction_size` groups of `threads` values from `data`
// into the host array `output`, one block per group.
template <typename T>
void sum_blocked_vectorreduction(T *data, T *output, const int reduction_size,
                                 const int threads) {

    T *output_device;
    gpuMalloc(&output_device, reduction_size * sizeof(T));

    sum_blocked_vectorreduction_kernel_cub<T, GPU_BLOCK_REDUCTION_THREADS>
        <<<reduction_size, GPU_BLOCK_REDUCTION_THREADS>>>(data, output_device, reduction_size,
                                                          threads);
    hila::synchronize_threads();

    gpuMemcpy(output, output_device, reduction_size * sizeof(T), gpuMemcpyDeviceToHost);
    gpuFree(output_device);

    check_device_error("sum_blocked_vectorreduction");
}
// Set all elems entries of a device array to value, passed by value ...
template <typename T>
__global__ void gpu_set_value_kernel(T *vector, T value, int elems) {
    int Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < elems)
        vector[Index] = value;
}

// ... or via a device pointer, for types too large to pass as a kernel argument
template <typename T>
__global__ void gpu_set_value_kernel_ptr(T *vector, const T *valptr, int elems) {
    int Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < elems)
        vector[Index] = *valptr;
}
template <typename T>
inline void gpu_set_zero(T *vec, size_t N) {
    gpuMemset(vec, 0, N * sizeof(T));
}
// Set N elements of a device array to val. Small types are passed directly as a
// kernel argument; larger ones are staged through a temporary device buffer.
template <typename T>
void gpu_set_value(T *vec, const T &val, size_t N) {
    int blocks = N / N_threads + 1;

    if constexpr (sizeof(T) <= GPU_GLOBAL_ARG_MAX_SIZE) {
        gpu_set_value_kernel<<<blocks, N_threads>>>(vec, val, N);
    } else {
        T *buf;
        gpuMalloc(&buf, sizeof(T));
        gpuMemcpy(buf, (char *)&val, sizeof(T), gpuMemcpyHostToDevice);

        gpu_set_value_kernel_ptr<<<blocks, N_threads>>>(vec, buf, N);
        gpuFree(buf);
    }
}
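
// Usage sketch (illustrative): initializing device buffers with the helpers above.
//
//   float *d_v;
//   gpuMalloc(&d_v, N * sizeof(float));
//   gpu_set_zero(d_v, N);          // memset-based clear
//   gpu_set_value(d_v, 1.0f, N);   // kernel-based fill with an arbitrary value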
// Allocate a device buffer of n elements, copy the host data into it and return it
template <typename T>
T *copy_array_to_gpu(T *data, int n) {
    T *buf;
    gpuMalloc(&buf, n * sizeof(T));
    gpuMemcpy(buf, data, n * sizeof(T), gpuMemcpyHostToDevice);
    return buf;
}

#endif // HILA_GPU_TEMPLATED_OPS_H