HILA
backend_gpu/field_storage_backend.h
#ifndef GPU_BACKEND_H
#define GPU_BACKEND_H

#include "../defs.h"
#include "../field_storage.h"

/* CUDA / HIP implementations */
template <typename T>
void field_storage<T>::allocate_field(const Lattice lattice) {
    // Allocate space for the field on the device
    gpuMalloc(&fieldbuf, sizeof(T) * lattice->mynode.field_alloc_size);
    if (fieldbuf == nullptr) {
        std::cout << "Failure in field memory allocation\n";
    }
    assert(fieldbuf != nullptr);
}

template <typename T>
void field_storage<T>::free_field() {
    if (fieldbuf != nullptr) {
        gpuFree(fieldbuf);
    }
    fieldbuf = nullptr;
}

// Only attempt to compile with CUDA compiler.
// Hilapp will skip these.
// #if defined(__CUDACC__) || defined(__HIPCC__)
#if !defined(HILAPP)

// These are used in device code. Can be called directly in a kernel.
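// Layout note: the field is stored in "struct of arrays" order. Each arithmetic base
// element e of a field element of type T occupies its own array of length
// field_alloc_size, so component e of site i sits at fieldbuf[e * field_alloc_size + i].
// Consecutive site indices therefore access consecutive memory locations (coalesced access).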
template <typename T>
__device__ inline auto field_storage<T>::get(const unsigned i,
                                             const unsigned field_alloc_size) const {
    // assert(i < field_alloc_size);
    using base_t = hila::arithmetic_type<T>;
    constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
    union {
        T value;
        base_t arr[n_elements];
    } u;
    const base_t *fp = (base_t *)(fieldbuf);
    for (unsigned e = 0; e < n_elements; e++) {
        u.arr[e] = fp[e * field_alloc_size + i];
    }
    return u.value;

    // T value;
    // base_t *value_f = (base_t *)&value;
    // base_t *fp = (base_t *)(fieldbuf);
    // for (unsigned e = 0; e < n_elements; e++) {
    //     value_f[e] = fp[e * field_alloc_size + i];
    // }
    // return value;
}

template <typename T>
// template <typename A>
__device__ inline void field_storage<T>::set(const T &value, const unsigned i,
                                             const unsigned field_alloc_size) {
    // assert(i < field_alloc_size);
    using base_t = hila::arithmetic_type<T>;
    constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);

    const base_t *value_f = (base_t *)&value;
    base_t *fp = (base_t *)(fieldbuf);
    for (unsigned e = 0; e < n_elements; e++) {
        fp[e * field_alloc_size + i] = value_f[e];
    }
}

/// Get a single element from the field outside a loop. Slow, should only be used for
/// setup
template <typename T>
__global__ void get_element_kernel(field_storage<T> field, char *buffer, unsigned i,
                                   const unsigned field_alloc_size) {
    *((T *)buffer) = field.get(i, field_alloc_size);
}

template <typename T>
auto field_storage<T>::get_element(const unsigned i, const Lattice lattice) const {
    char *d_buffer;
    T value;

    // Call the kernel to collect the element
    gpuMalloc(&(d_buffer), sizeof(T));
    get_element_kernel<<<1, 1>>>(*this, d_buffer, i, lattice->mynode.field_alloc_size);

    // Copy the result to the host
    gpuMemcpy((char *)(&value), d_buffer, sizeof(T), gpuMemcpyDeviceToHost);
    gpuFree(d_buffer);
    return value;
}

/// Set a single element from outside a loop. Slow, should only be used for setup
template <typename T>
__global__ void set_element_kernel(field_storage<T> field, T value, unsigned i,
                                   const unsigned field_alloc_size) {
    field.set(value, i, field_alloc_size);
}

template <typename T>
__global__ void set_element_kernel_ptr(field_storage<T> field, const T *buf, unsigned i,
                                       const unsigned field_alloc_size) {
    field.set(*buf, i, field_alloc_size);
}

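// set_element: if T fits within GPU_GLOBAL_ARG_MAX_SIZE it is passed by value as a
// kernel argument; larger elements are first copied into a temporary device buffer
// and passed by pointer (see the two kernels above).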
template <typename T>
template <typename A>
void field_storage<T>::set_element(A &value, const unsigned i, const Lattice lattice) {
    T t_value = value;

    if constexpr (sizeof(T) <= GPU_GLOBAL_ARG_MAX_SIZE) {
        // pass element directly as arg
        set_element_kernel<<<1, 1>>>(*this, t_value, i, lattice->mynode.field_alloc_size);
    } else {
        // This branch is used for large variables:
        // allocate space and copy the buffer to the device
        T *d_buffer;
        gpuMalloc(&(d_buffer), sizeof(T));
        gpuMemcpy(d_buffer, (char *)&t_value, sizeof(T), gpuMemcpyHostToDevice);

        // call the kernel to set correct indexes
        set_element_kernel_ptr<<<1, 1>>>(*this, d_buffer, i, lattice->mynode.field_alloc_size);
        gpuFree(d_buffer);
    }
}

/// A kernel that gathers elements
template <typename T>
__global__ void gather_elements_kernel(field_storage<T> field, T *buffer, unsigned *site_index,
                                       const int n, const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        buffer[Index] = field.get(site_index[Index], field_alloc_size);
    }
}

/// CUDA implementation of gather_elements without CUDA aware MPI
template <typename T>
void field_storage<T>::gather_elements(T *RESTRICT buffer, const unsigned *RESTRICT index_list,
                                       int n, const Lattice lattice) const {
    unsigned *d_site_index;
    T *d_buffer;

    // Copy the list of boundary site indexes to the device
    gpuMalloc(&(d_site_index), n * sizeof(unsigned));
    gpuMemcpy(d_site_index, index_list, n * sizeof(unsigned), gpuMemcpyHostToDevice);

    // Call the kernel to build the list of elements
    gpuMalloc(&(d_buffer), n * sizeof(T));
    int N_blocks = n / N_threads + 1;
    gather_elements_kernel<<<N_blocks, N_threads>>>(*this, d_buffer, d_site_index, n,
                                                    lattice->mynode.field_alloc_size);

    // Copy the result to the host
    gpuMemcpy((char *)buffer, d_buffer, n * sizeof(T), gpuMemcpyDeviceToHost);

    gpuFree(d_site_index);
    gpuFree(d_buffer);
}

#ifdef SPECIAL_BOUNDARY_CONDITIONS

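// Antiperiodic boundaries flip the sign of the value fetched across the boundary,
// hence the negated gather variants below.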
/// A kernel that gathers elements negated
// requires unary -
template <typename T, std::enable_if_t<hila::has_unary_minus<T>::value, int> = 0>
__global__ void gather_elements_negated_kernel(field_storage<T> field, T *buffer,
                                               unsigned *site_index, const int n,
                                               const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        buffer[Index] = -field.get(site_index[Index], field_alloc_size);
    }
}

/// CUDA implementation of gather_elements_negated without CUDA aware MPI
// The only difference to the above is the kernel that is called
template <typename T>
void field_storage<T>::gather_elements_negated(T *RESTRICT buffer,
                                               const unsigned *RESTRICT index_list, int n,
                                               const Lattice lattice) const {
    unsigned *d_site_index;
    T *d_buffer;

    if constexpr (!hila::has_unary_minus<T>::value) {
        assert(sizeof(T) < 0 && "Unary 'operator-()' must be defined for Field variable "
                                "for antiperiodic b.c.");
    } else {

        // Copy the list of boundary site indexes to the device
        gpuMalloc(&(d_site_index), n * sizeof(unsigned));
        gpuMemcpy(d_site_index, index_list, n * sizeof(unsigned), gpuMemcpyHostToDevice);

        // Call the kernel to build the list of elements
        gpuMalloc(&(d_buffer), n * sizeof(T));
        int N_blocks = n / N_threads + 1;
        gather_elements_negated_kernel<<<N_blocks, N_threads>>>(*this, d_buffer, d_site_index, n,
                                                                lattice->mynode.field_alloc_size);

        // Copy the result to the host
        gpuMemcpy(buffer, d_buffer, n * sizeof(T), gpuMemcpyDeviceToHost);

        gpuFree(d_site_index);
        gpuFree(d_buffer);
    }
}

#endif

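// Gather kernels for MPI communication buffers: the output buffer is packed in the same
// struct-of-arrays order as the field itself, but with stride n (the number of gathered
// sites), so that place_comm_elements_kernel below can unpack it with identical indexing.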
template <typename T>
__global__ void gather_comm_elements_kernel(field_storage<T> field, T *buffer,
                                            const unsigned *site_index, const int n,
                                            const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        using base_t = hila::arithmetic_type<T>;
        constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
        T element = field.get(site_index[Index], field_alloc_size);
        base_t *ep = (base_t *)&element;
        base_t *fp = (base_t *)(buffer);
        for (unsigned e = 0; e < n_elements; e++) {
            fp[Index + n * e] = ep[e];
        }
    }
}

// gather negated (-) elements only if unary - exists
template <typename T, std::enable_if_t<hila::has_unary_minus<T>::value, int> = 0>
__global__ void gather_comm_elements_negated_kernel(field_storage<T> field, T *buffer,
                                                    const unsigned *site_index, const int n,
                                                    const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        using base_t = hila::arithmetic_type<T>;
        constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
        T element = -field.get(site_index[Index], field_alloc_size);
        base_t *ep = (base_t *)&element;
        base_t *fp = (base_t *)(buffer);
        for (unsigned e = 0; e < n_elements; e++) {
            fp[Index + n * e] = ep[e];
        }
    }
}


// MPI buffer on the device. Use the gather_comm_elements_kernel and
// gather_comm_elements_negated_kernel above to fill the buffer.
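// With GPU_AWARE_MPI the MPI buffer itself lives in device memory and is filled in place;
// otherwise a temporary device buffer is used and the result is copied back to the host.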
template <typename T>
void field_storage<T>::gather_comm_elements(T *RESTRICT buffer,
                                            const lattice_struct::comm_node_struct &to_node,
                                            Parity par, const Lattice lattice,
                                            bool antiperiodic) const {
    int n;
    const unsigned *d_site_index = to_node.get_sitelist(par, n);
    T *d_buffer;

#ifdef GPU_AWARE_MPI
    // The buffer is already on the device
    d_buffer = buffer;
#else
    // Allocate a buffer on the device
    gpuMalloc(&(d_buffer), n * sizeof(T));
#endif

    // Call the kernel to build the list of elements
    int N_blocks = n / N_threads + 1;
#ifdef SPECIAL_BOUNDARY_CONDITIONS
    if (antiperiodic) {

        if constexpr (hila::has_unary_minus<T>::value) {
            gather_comm_elements_negated_kernel<<<N_blocks, N_threads>>>(
                *this, d_buffer, d_site_index, n, lattice->mynode.field_alloc_size);
        }

    } else {
        gather_comm_elements_kernel<<<N_blocks, N_threads>>>(*this, d_buffer, d_site_index, n,
                                                             lattice->mynode.field_alloc_size);
    }
#else
    gather_comm_elements_kernel<<<N_blocks, N_threads>>>(*this, d_buffer, d_site_index, n,
                                                         lattice->mynode.field_alloc_size);
#endif

#ifndef GPU_AWARE_MPI
    // Copy the result to the host
    gpuMemcpy((char *)buffer, d_buffer, n * sizeof(T), gpuMemcpyDeviceToHost);
    gpuFree(d_buffer);
#endif
}

/// A kernel that scatters the elements
template <typename T>
__global__ void place_elements_kernel(field_storage<T> field, T *buffer, unsigned *site_index,
                                      const int n, const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        field.set(buffer[Index], site_index[Index], field_alloc_size);
    }
}

/// CUDA implementation of place_elements without CUDA aware MPI
template <typename T>
void field_storage<T>::place_elements(T *RESTRICT buffer, const unsigned *RESTRICT index_list,
                                      int n, const Lattice lattice) {
    unsigned *d_site_index;
    T *d_buffer;

    // Allocate space and copy the buffer to the device
    gpuMalloc(&(d_buffer), n * sizeof(T));
    gpuMemcpy(d_buffer, buffer, n * sizeof(T), gpuMemcpyHostToDevice);

    // Copy the list of boundary site indexes to the device
    gpuMalloc(&(d_site_index), n * sizeof(unsigned));
    gpuMemcpy(d_site_index, index_list, n * sizeof(unsigned), gpuMemcpyHostToDevice);

    // Call the kernel to place the elements
    int N_blocks = n / N_threads + 1;
    place_elements_kernel<<<N_blocks, N_threads>>>(*this, d_buffer, d_site_index, n,
                                                   lattice->mynode.field_alloc_size);

    gpuFree(d_buffer);
    gpuFree(d_site_index);
}

template <typename T, std::enable_if_t<hila::has_unary_minus<T>::value, int> = 0>
__global__ void set_local_boundary_elements_kernel(field_storage<T> field, unsigned offset,
                                                   unsigned *site_index, const int n,
                                                   const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        T value;
        value = -field.get(site_index[Index], field_alloc_size);
        field.set(value, offset + Index, field_alloc_size);
    }
}

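// set_local_boundary_elements: for antiperiodic boundaries the special boundary copies
// starting at 'offset' are filled with the negated values of the corresponding sites
// listed in lattice->special_boundaries[dir].move_index.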
template <typename T>
void field_storage<T>::set_local_boundary_elements(Direction dir, Parity par,
                                                   const Lattice lattice,
                                                   bool antiperiodic) {
    // Only need to do something for antiperiodic boundaries
#ifdef SPECIAL_BOUNDARY_CONDITIONS
    if (antiperiodic) {
        if constexpr (hila::has_unary_minus<T>::value) {
            unsigned n, start = 0;
            if (par == ODD) {
                n = lattice->special_boundaries[dir].n_odd;
                start = lattice->special_boundaries[dir].n_even;
            } else {
                if (par == EVEN)
                    n = lattice->special_boundaries[dir].n_even;
                else
                    n = lattice->special_boundaries[dir].n_total;
            }
            unsigned offset = lattice->special_boundaries[dir].offset + start;

            unsigned *d_site_index;
            check_device_error("earlier");
            gpuMalloc(&d_site_index, n * sizeof(unsigned));
            gpuMemcpy(d_site_index, lattice->special_boundaries[dir].move_index + start,
                      n * sizeof(unsigned), gpuMemcpyHostToDevice);

            unsigned N_blocks = n / N_threads + 1;
            set_local_boundary_elements_kernel<<<N_blocks, N_threads>>>(
                *this, offset, d_site_index, n, lattice->mynode.field_alloc_size);

            gpuFree(d_site_index);
        } else {
            // always fails if reached: antiperiodic b.c. requires unary minus
            assert(false && "Antiperiodic b.c. cannot be used with unsigned field elements");
        }
    }

#else
    assert(!antiperiodic && "antiperiodic only with SPECIAL_BOUNDARY_CONDITIONS defined");
#endif
}

// Place communicated elements to the field array
template <typename T>
__global__ void place_comm_elements_kernel(field_storage<T> field, T *buffer, unsigned offset,
                                           const int n, const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        using base_t = hila::arithmetic_type<T>;
        constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
        T element;
        base_t *ep = (base_t *)&element;
        base_t *fp = (base_t *)(buffer);
        for (unsigned e = 0; e < n_elements; e++) {
            ep[e] = fp[Index + n * e];
        }
        field.set(element, offset + Index, field_alloc_size);
    }
}

// Place received elements into the field array. With standard MPI the buffer is on the
// CPU and needs to be copied across; with GPU-aware MPI it is already on the device.
template <typename T>
void field_storage<T>::place_comm_elements(Direction d, Parity par, T *RESTRICT buffer,
                                           const lattice_struct::comm_node_struct &from_node,
                                           const Lattice lattice) {

    unsigned n = from_node.n_sites(par);
    T *d_buffer;

#ifdef GPU_AWARE_MPI
    // MPI buffer is on device
    d_buffer = buffer;
#else
    // Allocate space and copy the buffer to the device
    gpuMalloc(&(d_buffer), n * sizeof(T));
    gpuMemcpy(d_buffer, buffer, n * sizeof(T), gpuMemcpyHostToDevice);
#endif

    unsigned N_blocks = n / N_threads + 1;
    place_comm_elements_kernel<<<N_blocks, N_threads>>>((*this), d_buffer, from_node.offset(par), n,
                                                        lattice->mynode.field_alloc_size);

#ifndef GPU_AWARE_MPI
    gpuFree(d_buffer);
#endif
}

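// MPI buffer management: with GPU_AWARE_MPI the communication buffers are allocated in
// device memory and handed directly to MPI, otherwise they are ordinary host buffers.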
#ifdef GPU_AWARE_MPI

template <typename T>
void field_storage<T>::free_mpi_buffer(T *d_buffer) {
    gpuFree(d_buffer);
}

template <typename T>
T *field_storage<T>::allocate_mpi_buffer(unsigned n) {
    T *d_buffer;
    gpuMalloc(&(d_buffer), n * sizeof(T));
    return d_buffer;
}

#else

template <typename T>
void field_storage<T>::free_mpi_buffer(T *buffer) {
    std::free(buffer);
}

template <typename T>
T *field_storage<T>::allocate_mpi_buffer(unsigned n) {
    return (T *)memalloc(n * sizeof(T));
}

#endif

#else // now HILAPP

// Hilapp requires a dummy implementation for functions with auto return type
template <typename T>
auto field_storage<T>::get(const unsigned i, const unsigned field_alloc_size) const {
    T value;
    return value;
}
template <typename T>
auto field_storage<T>::get_element(const unsigned i, const Lattice lattice) const {
    T value;
    return value;
}

#endif // !HILAPP .. HILAPP

#endif
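To illustrate the strided struct-of-arrays layout that get() and set() above rely on, here is a minimal standalone CUDA sketch (all names hypothetical, not part of HILA): it stores 3-component elements component-by-component with stride alloc_size and reads one element back per thread, using the same indexing pattern as field_storage<T>::get().

#include <cuda_runtime.h>
#include <cstdio>

// Hypothetical 3-component element, standing in for a HILA field element type T.
struct Vec3 {
    double c[3];
};

// Component e of site i is stored at soa[e * alloc_size + i], the same
// struct-of-arrays indexing used by field_storage<T>::get() and set().
__global__ void read_soa(const double *soa, Vec3 *out, unsigned alloc_size, unsigned n) {
    unsigned i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < n) {
        Vec3 v;
        for (unsigned e = 0; e < 3; e++)
            v.c[e] = soa[e * alloc_size + i];
        out[i] = v; // consecutive threads read consecutive addresses per component
    }
}

int main() {
    const unsigned n = 256, alloc_size = n;
    double *d_soa;
    Vec3 *d_out;
    cudaMalloc(&d_soa, 3 * alloc_size * sizeof(double));
    cudaMalloc(&d_out, n * sizeof(Vec3));
    cudaMemset(d_soa, 0, 3 * alloc_size * sizeof(double));

    read_soa<<<(n + 127) / 128, 128>>>(d_soa, d_out, alloc_size, n);
    cudaDeviceSynchronize();
    printf("kernel done: %s\n", cudaGetErrorString(cudaGetLastError()));

    cudaFree(d_soa);
    cudaFree(d_out);
    return 0;
}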