HILA
backend_gpu/field_storage_backend.h
#ifndef GPU_BACKEND_H
#define GPU_BACKEND_H

#include "../defs.h"
#include "../field_storage.h"

/* CUDA / HIP implementations */
template <typename T>
void field_storage<T>::allocate_field(const lattice_struct &lattice) {
    // Allocate space for the field on the device
    gpuMalloc(&fieldbuf, sizeof(T) * lattice.field_alloc_size());
    if (fieldbuf == nullptr) {
        std::cout << "Failure in field memory allocation\n";
    }
    assert(fieldbuf != nullptr);
}

template <typename T>
void field_storage<T>::free_field() {
    if (fieldbuf != nullptr) {
        gpuFree(fieldbuf);
    }
    fieldbuf = nullptr;
}

// Only attempt to compile with CUDA compiler.
// Hilapp will skip these.
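// (hilapp only needs the declarations; the dummy definitions in the HILAPP branch
// at the end of this file are used instead of these GPU implementations.)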
// #if defined(__CUDACC__) || defined(__HIPCC__)
#if !defined(HILAPP)

// These are used in device code. Can be called directly in a kernel.
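// The storage uses a "structure of arrays" layout: the e:th scalar component of the
// element at site i is stored at fieldbuf[e * field_alloc_size + i], so consecutive
// threads in a kernel access consecutive memory locations (coalesced access).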
template <typename T>
__device__ inline auto field_storage<T>::get(const unsigned i,
                                             const unsigned field_alloc_size) const {
    // assert(i < field_alloc_size);
    using base_t = hila::arithmetic_type<T>;
    constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
    union {
        T value;
        base_t arr[n_elements];
    } u;
    const base_t *fp = (base_t *)(fieldbuf);
    for (unsigned e = 0; e < n_elements; e++) {
        u.arr[e] = fp[e * field_alloc_size + i];
    }
    return u.value;

    // T value;
    // base_t *value_f = (base_t *)&value;
    // base_t *fp = (base_t *)(fieldbuf);
    // for (unsigned e = 0; e < n_elements; e++) {
    //     value_f[e] = fp[e * field_alloc_size + i];
    // }
    // return value;
}
56
57template <typename T>
58// template <typename A>
59__device__ inline void field_storage<T>::set(const T &value, const unsigned i,
60 const unsigned field_alloc_size) {
61 // assert(i < field_alloc_size);
62 using base_t = hila::arithmetic_type<T>;
63 constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
64
65 const base_t *value_f = (base_t *)&value;
66 base_t *fp = (base_t *)(fieldbuf);
67 for (unsigned e = 0; e < n_elements; e++) {
68 fp[e * field_alloc_size + i] = value_f[e];
69 }
70}

/// Get a single element from the field outside a loop. Slow, should only be used for setup.
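/// (Each call launches a one-thread kernel and does a device-to-host copy, so the
/// per-element cost is dominated by launch and transfer latency.)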
template <typename T>
__global__ void get_element_kernel(field_storage<T> field, char *buffer, unsigned i,
                                   const unsigned field_alloc_size) {
    *((T *)buffer) = field.get(i, field_alloc_size);
}

template <typename T>
auto field_storage<T>::get_element(const unsigned i, const lattice_struct &lattice) const {
    char *d_buffer;
    T value;

    // Call the kernel to collect the element
    gpuMalloc(&(d_buffer), sizeof(T));
    get_element_kernel<<<1, 1>>>(*this, d_buffer, i, lattice.field_alloc_size());

    // Copy the result to the host
    gpuMemcpy((char *)(&value), d_buffer, sizeof(T), gpuMemcpyDeviceToHost);
    gpuFree(d_buffer);
    return value;
}

/// Set a single element from outside a loop. Slow, should only be used for setup.
template <typename T>
__global__ void set_element_kernel(field_storage<T> field, T value, unsigned i,
                                   const unsigned field_alloc_size) {
    field.set(value, i, field_alloc_size);
}

template <typename T>
__global__ void set_element_kernel_ptr(field_storage<T> field, const T *buf, unsigned i,
                                       const unsigned field_alloc_size) {
    field.set(*buf, i, field_alloc_size);
}

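// set_element: small element types are passed directly as a kernel argument; types
// larger than GPU_GLOBAL_ARG_MAX_SIZE are staged through a temporary device buffer,
// since kernel argument space is limited.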
template <typename T>
template <typename A>
void field_storage<T>::set_element(A &value, const unsigned i, const lattice_struct &lattice) {
    T t_value = value;

    if constexpr (sizeof(T) <= GPU_GLOBAL_ARG_MAX_SIZE) {
        // pass element directly as arg
        set_element_kernel<<<1, 1>>>(*this, t_value, i, lattice.field_alloc_size());
    } else {
        // This branch is used for large variables:
        // allocate space and copy the buffer to the device
        T *d_buffer;
        gpuMalloc(&(d_buffer), sizeof(T));
        gpuMemcpy(d_buffer, (char *)&t_value, sizeof(T), gpuMemcpyHostToDevice);

        // call the kernel to set correct indexes
        set_element_kernel_ptr<<<1, 1>>>(*this, d_buffer, i, lattice.field_alloc_size());
        gpuFree(d_buffer);
    }
}

/// A kernel that gathers elements
template <typename T>
__global__ void gather_elements_kernel(field_storage<T> field, T *buffer, unsigned *site_index,
                                       const int n, const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        buffer[Index] = field.get(site_index[Index], field_alloc_size);
    }
}

/// CUDA implementation of gather_elements without CUDA aware MPI
template <typename T>
void field_storage<T>::gather_elements(T *RESTRICT buffer, const unsigned *RESTRICT index_list,
                                       int n, const lattice_struct &lattice) const {
    unsigned *d_site_index;
    T *d_buffer;

    // Copy the list of boundary site indexes to the device
    gpuMalloc(&(d_site_index), n * sizeof(unsigned));
    gpuMemcpy(d_site_index, index_list, n * sizeof(unsigned), gpuMemcpyHostToDevice);

    // Call the kernel to build the list of elements
    gpuMalloc(&(d_buffer), n * sizeof(T));
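    // N_blocks rounds up; the (Index < n) guard in the kernel makes a partially
    // filled (or empty) last block safe.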
    int N_blocks = n / N_threads + 1;
    gather_elements_kernel<<<N_blocks, N_threads>>>(*this, d_buffer, d_site_index, n,
                                                    lattice.field_alloc_size());

    // Copy the result to the host
    gpuMemcpy((char *)buffer, d_buffer, n * sizeof(T), gpuMemcpyDeviceToHost);

    gpuFree(d_site_index);
    gpuFree(d_buffer);
}

#ifdef SPECIAL_BOUNDARY_CONDITIONS

/// A kernel that gathers negated elements
// requires unary -
template <typename T, std::enable_if_t<hila::has_unary_minus<T>::value, int> = 0>
__global__ void gather_elements_negated_kernel(field_storage<T> field, T *buffer,
                                               unsigned *site_index, const int n,
                                               const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        buffer[Index] = -field.get(site_index[Index], field_alloc_size);
    }
}

/// CUDA implementation of gather_elements_negated without CUDA aware MPI
// The only difference to the above is the kernel that is called
template <typename T>
void field_storage<T>::gather_elements_negated(T *RESTRICT buffer,
                                               const unsigned *RESTRICT index_list, int n,
                                               const lattice_struct &lattice) const {
    unsigned *d_site_index;
    T *d_buffer;

    if constexpr (!hila::has_unary_minus<T>::value) {
        assert(sizeof(T) < 0 && "Unary 'operator- ()' must be defined for Field variable "
                                "for antiperiodic b.c.");
    } else {

        // Copy the list of boundary site indexes to the device
        gpuMalloc(&(d_site_index), n * sizeof(unsigned));
        gpuMemcpy(d_site_index, index_list, n * sizeof(unsigned), gpuMemcpyHostToDevice);

        // Call the kernel to build the list of elements
        gpuMalloc(&(d_buffer), n * sizeof(T));
        int N_blocks = n / N_threads + 1;
        gather_elements_negated_kernel<<<N_blocks, N_threads>>>(*this, d_buffer, d_site_index, n,
                                                                lattice.field_alloc_size());

        // Copy the result to the host
        gpuMemcpy(buffer, d_buffer, n * sizeof(T), gpuMemcpyDeviceToHost);

        gpuFree(d_site_index);
        gpuFree(d_buffer);
    }
}

#endif

template <typename T>
__global__ void gather_comm_elements_kernel(field_storage<T> field, T *buffer, unsigned *site_index,
                                            const int n, const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        using base_t = hila::arithmetic_type<T>;
        constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
        T element = field.get(site_index[Index], field_alloc_size);
        base_t *ep = (base_t *)&element;
        base_t *fp = (base_t *)(buffer);
        for (unsigned e = 0; e < n_elements; e++) {
            fp[Index + n * e] = ep[e];
        }
    }
}
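// The send buffer above is packed in the same component-major layout, with stride n,
// so that place_comm_elements_kernel below can unpack it symmetrically.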

// gather negated elements only if unary - exists
template <typename T, std::enable_if_t<hila::has_unary_minus<T>::value, int> = 0>
__global__ void gather_comm_elements_negated_kernel(field_storage<T> field, T *buffer,
                                                    unsigned *site_index, const int n,
                                                    const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        using base_t = hila::arithmetic_type<T>;
        constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
        T element = -field.get(site_index[Index], field_alloc_size);
        base_t *ep = (base_t *)&element;
        base_t *fp = (base_t *)(buffer);
        for (unsigned e = 0; e < n_elements; e++) {
            fp[Index + n * e] = ep[e];
        }
    }
}

// The index list for each comm node is constant: map each CPU index pointer to a
// device copy and transfer it only once.
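// (The cached device copies are kept in a static vector for the duration of the
// program and are looked up by CPU pointer and element count.)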
struct cuda_comm_node_struct {
    const unsigned *cpu_index;
    unsigned *gpu_index;
    int n;
};

inline unsigned *get_site_index(const lattice_struct::comm_node_struct &to_node, Parity par,
                                int &n) {
    static std::vector<struct cuda_comm_node_struct> comm_nodes;

    const unsigned *cpu_index = to_node.get_sitelist(par, n);
    for (struct cuda_comm_node_struct comm_node : comm_nodes) {
        if (cpu_index == comm_node.cpu_index && n == comm_node.n) {
            return comm_node.gpu_index;
        }
    }
    struct cuda_comm_node_struct comm_node;
    comm_node.cpu_index = cpu_index;
    comm_node.n = n;
    gpuMalloc(&(comm_node.gpu_index), n * sizeof(unsigned));
    gpuMemcpy(comm_node.gpu_index, cpu_index, n * sizeof(unsigned), gpuMemcpyHostToDevice);
    comm_nodes.push_back(comm_node);
    return comm_node.gpu_index;
}

// Gather the MPI buffer on the device, using the gather_comm_elements and
// gather_comm_elements_negated kernels above to fill it.
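// With GPU_AWARE_MPI the MPI library reads the buffer directly from device memory, so
// no separate device buffer or host copy is needed; otherwise the data is staged
// through a temporary device buffer and copied back to the host buffer.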
template <typename T>
void field_storage<T>::gather_comm_elements(T *RESTRICT buffer,
                                            const lattice_struct::comm_node_struct &to_node,
                                            Parity par, const lattice_struct &lattice,
                                            bool antiperiodic) const {
    int n;
    unsigned *d_site_index = get_site_index(to_node, par, n);
    T *d_buffer;

#ifdef GPU_AWARE_MPI
    // The buffer is already on the device
    d_buffer = buffer;
#else
    // Allocate a buffer on the device
    gpuMalloc(&(d_buffer), n * sizeof(T));
#endif

    // Call the kernel to build the list of elements
    int N_blocks = n / N_threads + 1;
    if (antiperiodic) {

        if constexpr (hila::has_unary_minus<T>::value) {
            gather_comm_elements_negated_kernel<<<N_blocks, N_threads>>>(
                *this, d_buffer, d_site_index, n, lattice.field_alloc_size());
        }

    } else {
        gather_comm_elements_kernel<<<N_blocks, N_threads>>>(*this, d_buffer, d_site_index, n,
                                                             lattice.field_alloc_size());
    }

#ifndef GPU_AWARE_MPI
    // Copy the result to the host
    gpuMemcpy((char *)buffer, d_buffer, n * sizeof(T), gpuMemcpyDeviceToHost);
    gpuFree(d_buffer);
#endif
}

/// A kernel that scatters the elements
template <typename T>
__global__ void place_elements_kernel(field_storage<T> field, T *buffer, unsigned *site_index,
                                      const int n, const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        field.set(buffer[Index], site_index[Index], field_alloc_size);
    }
}

/// CUDA implementation of place_elements without CUDA aware MPI
template <typename T>
void field_storage<T>::place_elements(T *RESTRICT buffer, const unsigned *RESTRICT index_list,
                                      int n, const lattice_struct &lattice) {
    unsigned *d_site_index;
    T *d_buffer;

    // Allocate space and copy the buffer to the device
    gpuMalloc(&(d_buffer), n * sizeof(T));
    gpuMemcpy(d_buffer, buffer, n * sizeof(T), gpuMemcpyHostToDevice);

    // Copy the list of boundary site indexes to the device
    gpuMalloc(&(d_site_index), n * sizeof(unsigned));
    gpuMemcpy(d_site_index, index_list, n * sizeof(unsigned), gpuMemcpyHostToDevice);

    // Call the kernel to place the elements
    int N_blocks = n / N_threads + 1;
    place_elements_kernel<<<N_blocks, N_threads>>>(*this, d_buffer, d_site_index, n,
                                                   lattice.field_alloc_size());

    gpuFree(d_buffer);
    gpuFree(d_site_index);
}

template <typename T, std::enable_if_t<hila::has_unary_minus<T>::value, int> = 0>
__global__ void set_local_boundary_elements_kernel(field_storage<T> field, unsigned offset,
                                                   unsigned *site_index, const int n,
                                                   const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        T value;
        value = -field.get(site_index[Index], field_alloc_size);
        field.set(value, offset + Index, field_alloc_size);
    }
}

template <typename T>
void field_storage<T>::set_local_boundary_elements(Direction dir, Parity par,
                                                   const lattice_struct &lattice,
                                                   bool antiperiodic) {
    // Only need to do something for antiperiodic boundaries
#ifdef SPECIAL_BOUNDARY_CONDITIONS
    if (antiperiodic) {
        if constexpr (hila::has_unary_minus<T>::value) {
            unsigned n, start = 0;
            if (par == ODD) {
                n = lattice.special_boundaries[dir].n_odd;
                start = lattice.special_boundaries[dir].n_even;
            } else {
                if (par == EVEN)
                    n = lattice.special_boundaries[dir].n_even;
                else
                    n = lattice.special_boundaries[dir].n_total;
            }
            unsigned offset = lattice.special_boundaries[dir].offset + start;

            unsigned *d_site_index;
            check_device_error("earlier");
            gpuMalloc(&d_site_index, n * sizeof(unsigned));
            gpuMemcpy(d_site_index, lattice.special_boundaries[dir].move_index + start,
                      n * sizeof(unsigned), gpuMemcpyHostToDevice);

            unsigned N_blocks = n / N_threads + 1;
            set_local_boundary_elements_kernel<<<N_blocks, N_threads>>>(
                *this, offset, d_site_index, n, lattice.field_alloc_size());

            gpuFree(d_site_index);
        } else {
            assert(false && "Antiperiodic b.c. cannot be used with unsigned field elements");
        }
    }

#else
    assert(!antiperiodic && "antiperiodic only with SPECIAL_BOUNDARY_CONDITIONS defined");
#endif
}

// Place communicated elements to the field array
template <typename T>
__global__ void place_comm_elements_kernel(field_storage<T> field, T *buffer, unsigned offset,
                                           const int n, const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        using base_t = hila::arithmetic_type<T>;
        constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
        T element;
        base_t *ep = (base_t *)&element;
        base_t *fp = (base_t *)(buffer);
        for (unsigned e = 0; e < n_elements; e++) {
            ep[e] = fp[Index + n * e];
        }
        field.set(element, offset + Index, field_alloc_size);
    }
}

// Standard MPI, buffer is on the CPU and needs to be copied across
template <typename T>
void field_storage<T>::place_comm_elements(Direction d, Parity par, T *RESTRICT buffer,
                                           const lattice_struct::comm_node_struct &from_node,
                                           const lattice_struct &lattice) {

    unsigned n = from_node.n_sites(par);
    T *d_buffer;

#ifdef GPU_AWARE_MPI
    // MPI buffer is on device
    d_buffer = buffer;
#else
    // Allocate space and copy the buffer to the device
    gpuMalloc(&(d_buffer), n * sizeof(T));
    gpuMemcpy(d_buffer, buffer, n * sizeof(T), gpuMemcpyHostToDevice);
#endif

    unsigned N_blocks = n / N_threads + 1;
    place_comm_elements_kernel<<<N_blocks, N_threads>>>((*this), d_buffer, from_node.offset(par), n,
                                                        lattice.field_alloc_size());

#ifndef GPU_AWARE_MPI
    gpuFree(d_buffer);
#endif
}

#ifdef GPU_AWARE_MPI

template <typename T>
void field_storage<T>::free_mpi_buffer(T *d_buffer) {
    gpuFree(d_buffer);
}

template <typename T>
T *field_storage<T>::allocate_mpi_buffer(unsigned n) {
    T *d_buffer;
    gpuMalloc(&(d_buffer), n * sizeof(T));
    return d_buffer;
}

#else

template <typename T>
void field_storage<T>::free_mpi_buffer(T *buffer) {
    std::free(buffer);
}

template <typename T>
T *field_storage<T>::allocate_mpi_buffer(unsigned n) {
    return (T *)memalloc(n * sizeof(T));
}

#endif

#else // now HILAPP

// Hilapp requires a dummy implementation for functions with auto return type
template <typename T>
auto field_storage<T>::get(const unsigned i, const unsigned field_alloc_size) const {
    T value;
    return value;
}
template <typename T>
auto field_storage<T>::get_element(const unsigned i, const lattice_struct &lattice) const {
    T value;
    return value;
}

#endif // !HILAPP .. HILAPP

#endif
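
Usage sketch (not part of the header above): a minimal host-side example of how the setup-level accessors defined in this backend could be driven directly, assuming an already initialized lattice_struct named lattice and a CUDA/HIP build of HILA. The element type double, the site index 0, and the function name storage_setup_example are illustrative only.

// Illustrative sketch only; compiled with the GPU backend.
#include "field_storage.h" // pulls in this backend on a GPU build; exact path depends on the setup

void storage_setup_example(const lattice_struct &lattice) {
    field_storage<double> s;
    s.allocate_field(lattice);            // gpuMalloc of field_alloc_size() elements

    double v = 3.0;
    s.set_element(v, 0, lattice);         // slow per-element write (setup only)
    double r = s.get_element(0, lattice); // slow per-element read (kernel launch + memcpy)

    s.free_field();                       // release the device buffer
    (void)r;
}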