backend_gpu/field_storage_backend.h
#ifndef GPU_BACKEND_H
#define GPU_BACKEND_H

#include "../defs.h"
#include "../field_storage.h"

/* CUDA / HIP implementations */
template <typename T>
void field_storage<T>::allocate_field(const lattice_struct &lattice) {
    // Allocate space for the field on the device
    gpuMalloc(&fieldbuf, sizeof(T) * lattice.field_alloc_size());
    if (fieldbuf == nullptr) {
        std::cout << "Failure in field memory allocation\n";
    }
    assert(fieldbuf != nullptr);
}

template <typename T>
void field_storage<T>::free_field() {
    if (fieldbuf != nullptr) {
        gpuFree(fieldbuf);
    }
    fieldbuf = nullptr;
}

// Only attempt to compile with CUDA compiler.
// Hilapp will skip these.
// #if defined(__CUDACC__) || defined(__HIPCC__)
#if !defined(HILAPP)

// These are used in device code. Can be called directly in a kernel.
template <typename T>
__device__ inline auto field_storage<T>::get(const unsigned i,
                                             const unsigned field_alloc_size) const {
    // assert(i < field_alloc_size);
    using base_t = hila::arithmetic_type<T>;
    constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
    union {
        T value;
        base_t arr[n_elements];
    } u;
    const base_t *fp = (base_t *)(fieldbuf);
    for (unsigned e = 0; e < n_elements; e++) {
        u.arr[e] = fp[e * field_alloc_size + i];
    }
    return u.value;

    // T value;
    // base_t *value_f = (base_t *)&value;
    // base_t *fp = (base_t *)(fieldbuf);
    // for (unsigned e = 0; e < n_elements; e++) {
    //     value_f[e] = fp[e * field_alloc_size + i];
    // }
    // return value;
}

template <typename T>
// template <typename A>
__device__ inline void field_storage<T>::set(const T &value, const unsigned i,
                                             const unsigned field_alloc_size) {
    // assert(i < field_alloc_size);
    using base_t = hila::arithmetic_type<T>;
    constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);

    const base_t *value_f = (base_t *)&value;
    base_t *fp = (base_t *)(fieldbuf);
    for (unsigned e = 0; e < n_elements; e++) {
        fp[e * field_alloc_size + i] = value_f[e];
    }
}
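
// Note on the storage layout: the indexing above implements a structure-of-arrays
// layout. For a type T made of n_elements values of base_t, base element e of
// site i sits at fp[e * field_alloc_size + i], i.e.
//
//   fieldbuf: [ elem0(site 0) elem0(site 1) ... | elem1(site 0) elem1(site 1) ... ]
//
// so consecutive GPU threads (consecutive i) touch consecutive memory locations
// and the loads and stores coalesce.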

/// Get a single element from the field outside a loop. Slow, should only be used for
/// setup
template <typename T>
__global__ void get_element_kernel(field_storage<T> field, char *buffer, unsigned i,
                                   const unsigned field_alloc_size) {
    *((T *)buffer) = field.get(i, field_alloc_size);
}

template <typename T>
auto field_storage<T>::get_element(const unsigned i, const lattice_struct &lattice) const {
    char *d_buffer;
    T value;

    // Call the kernel to collect the element
    gpuMalloc(&(d_buffer), sizeof(T));
    get_element_kernel<<<1, 1>>>(*this, d_buffer, i, lattice.field_alloc_size());

    // Copy the result to the host
    gpuMemcpy((char *)(&value), d_buffer, sizeof(T), gpuMemcpyDeviceToHost);
    gpuFree(d_buffer);
    return value;
}
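
// Usage sketch (illustrative only): for an allocated field_storage<double> fs and a
// valid site index i,
//     double v = fs.get_element(i, lattice);
// launches a one-thread kernel and copies a single element back to the host, so it
// belongs in setup or debugging code only.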

/// Set a single element from outside a loop. Slow, should only be used for setup
template <typename T>
__global__ void set_element_kernel(field_storage<T> field, T value, unsigned i,
                                   const unsigned field_alloc_size) {
    field.set(value, i, field_alloc_size);
}

template <typename T>
__global__ void set_element_kernel_ptr(field_storage<T> field, const T *buf, unsigned i,
                                       const unsigned field_alloc_size) {
    field.set(*buf, i, field_alloc_size);
}


template <typename T>
template <typename A>
void field_storage<T>::set_element(A &value, const unsigned i, const lattice_struct &lattice) {
    T t_value = value;

    if constexpr (sizeof(T) <= GPU_GLOBAL_ARG_MAX_SIZE) {
        // pass element directly as arg
        set_element_kernel<<<1, 1>>>(*this, t_value, i, lattice.field_alloc_size());
    } else {
        // This branch is used for large variables:
        // allocate space and copy the buffer to the device
        T *d_buffer;
        gpuMalloc(&(d_buffer), sizeof(T));
        gpuMemcpy(d_buffer, (char *)&t_value, sizeof(T), gpuMemcpyHostToDevice);

        // call the kernel to set correct indexes
        set_element_kernel_ptr<<<1, 1>>>(*this, d_buffer, i, lattice.field_alloc_size());
        gpuFree(d_buffer);
    }
}
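
// Why two code paths: kernel launch arguments have a limited size on CUDA/HIP, so
// elements up to GPU_GLOBAL_ARG_MAX_SIZE (defined in params.h) are passed by value
// straight to set_element_kernel, while larger ones are staged through a temporary
// device buffer and written with set_element_kernel_ptr.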

/// A kernel that gathers elements
template <typename T>
__global__ void gather_elements_kernel(field_storage<T> field, T *buffer, unsigned *site_index,
                                       const int n, const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        buffer[Index] = field.get(site_index[Index], field_alloc_size);
    }
}

/// CUDA implementation of gather_elements without CUDA aware MPI
template <typename T>
void field_storage<T>::gather_elements(T *RESTRICT buffer, const unsigned *RESTRICT index_list,
                                       int n, const lattice_struct &lattice) const {
    unsigned *d_site_index;
    T *d_buffer;

    // Copy the list of boundary site indexes to the device
    gpuMalloc(&(d_site_index), n * sizeof(unsigned));
    gpuMemcpy(d_site_index, index_list, n * sizeof(unsigned), gpuMemcpyHostToDevice);

    // Call the kernel to build the list of elements
    gpuMalloc(&(d_buffer), n * sizeof(T));
    int N_blocks = n / N_threads + 1;
    gather_elements_kernel<<<N_blocks, N_threads>>>(*this, d_buffer, d_site_index, n,
                                                    lattice.field_alloc_size());

    // Copy the result to the host
    gpuMemcpy((char *)buffer, d_buffer, n * sizeof(T), gpuMemcpyDeviceToHost);

    gpuFree(d_site_index);
    gpuFree(d_buffer);
}

#ifdef SPECIAL_BOUNDARY_CONDITIONS

/// A kernel that gathers elements negated
// requires unary -
template <typename T, std::enable_if_t<hila::has_unary_minus<T>::value, int> = 0>
__global__ void gather_elements_negated_kernel(field_storage<T> field, T *buffer,
                                               unsigned *site_index, const int n,
                                               const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        buffer[Index] = -field.get(site_index[Index], field_alloc_size);
    }
}

/// CUDA implementation of gather_elements_negated without CUDA aware MPI
// The only difference to the above is the kernel that is called
template <typename T>
void field_storage<T>::gather_elements_negated(T *RESTRICT buffer,
                                               const unsigned *RESTRICT index_list, int n,
                                               const lattice_struct &lattice) const {
    unsigned *d_site_index;
    T *d_buffer;

    if constexpr (!hila::has_unary_minus<T>::value) {
        assert(sizeof(T) < 0 && "Unary 'operator-()' must be defined for Field variable "
                                "for antiperiodic b.c.");
    } else {

        // Copy the list of boundary site indexes to the device
        gpuMalloc(&(d_site_index), n * sizeof(unsigned));
        gpuMemcpy(d_site_index, index_list, n * sizeof(unsigned), gpuMemcpyHostToDevice);

        // Call the kernel to build the list of elements
        gpuMalloc(&(d_buffer), n * sizeof(T));
        int N_blocks = n / N_threads + 1;
        gather_elements_negated_kernel<<<N_blocks, N_threads>>>(*this, d_buffer, d_site_index, n,
                                                                lattice.field_alloc_size());

        // Copy the result to the host
        gpuMemcpy(buffer, d_buffer, n * sizeof(T), gpuMemcpyDeviceToHost);

        gpuFree(d_site_index);
        gpuFree(d_buffer);
    }
}

#endif

template <typename T>
__global__ void gather_comm_elements_kernel(field_storage<T> field, T *buffer, unsigned *site_index,
                                            const int n, const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        using base_t = hila::arithmetic_type<T>;
        constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
        T element = field.get(site_index[Index], field_alloc_size);
        base_t *ep = (base_t *)&element;
        base_t *fp = (base_t *)(buffer);
        for (unsigned e = 0; e < n_elements; e++) {
            fp[Index + n * e] = ep[e];
        }
    }
}

// gather negated elements only if unary - exists
template <typename T, std::enable_if_t<hila::has_unary_minus<T>::value, int> = 0>
__global__ void gather_comm_elements_negated_kernel(field_storage<T> field, T *buffer,
                                                    unsigned *site_index, const int n,
                                                    const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        using base_t = hila::arithmetic_type<T>;
        constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
        T element = -field.get(site_index[Index], field_alloc_size);
        base_t *ep = (base_t *)&element;
        base_t *fp = (base_t *)(buffer);
        for (unsigned e = 0; e < n_elements; e++) {
            fp[Index + n * e] = ep[e];
        }
    }
}

// Index list is constant? Map each cpu pointer to a device pointer and copy just once
struct cuda_comm_node_struct {
    const unsigned *cpu_index;
    unsigned *gpu_index;
    int n;
};

inline unsigned *get_site_index(const lattice_struct::comm_node_struct &to_node, Parity par,
                                int &n) {
    static std::vector<struct cuda_comm_node_struct> comm_nodes;

    const unsigned *cpu_index = to_node.get_sitelist(par, n);
    for (struct cuda_comm_node_struct comm_node : comm_nodes) {
        if (cpu_index == comm_node.cpu_index && n == comm_node.n) {
            return comm_node.gpu_index;
        }
    }
    struct cuda_comm_node_struct comm_node;
    comm_node.cpu_index = cpu_index;
    comm_node.n = n;
    gpuMalloc(&(comm_node.gpu_index), n * sizeof(unsigned));
    gpuMemcpy(comm_node.gpu_index, cpu_index, n * sizeof(unsigned), gpuMemcpyHostToDevice);
    comm_nodes.push_back(comm_node);
    return comm_node.gpu_index;
}
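
// Design note: the cache above is keyed on the (host pointer, length) of the site
// list and assumes those lists stay constant for the whole run.  Device copies are
// made once with gpuMalloc and never freed, which is acceptable because there is only
// a small, fixed set of comm node site lists.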

// MPI buffer on the device. Use the gather_elements and gather_elements_negated
// kernels to fill the buffer.
template <typename T>
void field_storage<T>::gather_comm_elements(T *RESTRICT buffer,
                                            const lattice_struct::comm_node_struct &to_node,
                                            Parity par, const lattice_struct &lattice,
                                            bool antiperiodic) const {
    int n;
    unsigned *d_site_index = get_site_index(to_node, par, n);
    T *d_buffer;

#ifdef GPU_AWARE_MPI
    // The buffer is already on the device
    d_buffer = buffer;
#else
    // Allocate a buffer on the device
    gpuMalloc(&(d_buffer), n * sizeof(T));
#endif

    // Call the kernel to build the list of elements
    int N_blocks = n / N_threads + 1;
#ifdef SPECIAL_BOUNDARY_CONDITIONS
    if (antiperiodic) {

        if constexpr (hila::has_unary_minus<T>::value) {
            gather_comm_elements_negated_kernel<<<N_blocks, N_threads>>>(
                *this, d_buffer, d_site_index, n, lattice.field_alloc_size());
        }

    } else {
        gather_comm_elements_kernel<<<N_blocks, N_threads>>>(*this, d_buffer, d_site_index, n,
                                                             lattice.field_alloc_size());
    }
#else
    gather_comm_elements_kernel<<<N_blocks, N_threads>>>(*this, d_buffer, d_site_index, n,
                                                         lattice.field_alloc_size());
#endif

#ifndef GPU_AWARE_MPI
    // Copy the result to the host
    gpuMemcpy((char *)buffer, d_buffer, n * sizeof(T), gpuMemcpyDeviceToHost);
    gpuFree(d_buffer);
#endif
}
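
// With GPU_AWARE_MPI the MPI buffer already lives in device memory, so the packing
// kernel writes straight into it; otherwise the elements are packed into a temporary
// device buffer and copied back to the host buffer before the MPI call.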

/// A kernel that scatters the elements
template <typename T>
__global__ void place_elements_kernel(field_storage<T> field, T *buffer, unsigned *site_index,
                                      const int n, const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        field.set(buffer[Index], site_index[Index], field_alloc_size);
    }
}

/// CUDA implementation of place_elements without CUDA aware MPI
template <typename T>
void field_storage<T>::place_elements(T *RESTRICT buffer, const unsigned *RESTRICT index_list,
                                      int n, const lattice_struct &lattice) {
    unsigned *d_site_index;
    T *d_buffer;

    // Allocate space and copy the buffer to the device
    gpuMalloc(&(d_buffer), n * sizeof(T));
    gpuMemcpy(d_buffer, buffer, n * sizeof(T), gpuMemcpyHostToDevice);

    // Copy the list of boundary site indexes to the device
    gpuMalloc(&(d_site_index), n * sizeof(unsigned));
    gpuMemcpy(d_site_index, index_list, n * sizeof(unsigned), gpuMemcpyHostToDevice);

    // Call the kernel to place the elements
    int N_blocks = n / N_threads + 1;
    place_elements_kernel<<<N_blocks, N_threads>>>(*this, d_buffer, d_site_index, n,
                                                   lattice.field_alloc_size());

    gpuFree(d_buffer);
    gpuFree(d_site_index);
}

template <typename T, std::enable_if_t<hila::has_unary_minus<T>::value, int> = 0>
__global__ void set_local_boundary_elements_kernel(field_storage<T> field, unsigned offset,
                                                   unsigned *site_index, const int n,
                                                   const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        T value;
        value = -field.get(site_index[Index], field_alloc_size);
        field.set(value, offset + Index, field_alloc_size);
    }
}

template <typename T>
void field_storage<T>::set_local_boundary_elements(Direction dir, Parity par,
                                                   const lattice_struct &lattice,
                                                   bool antiperiodic) {
    // Only need to do something for antiperiodic boundaries
#ifdef SPECIAL_BOUNDARY_CONDITIONS
    if (antiperiodic) {
        if constexpr (hila::has_unary_minus<T>::value) {
            unsigned n, start = 0;
            if (par == ODD) {
                n = lattice.special_boundaries[dir].n_odd;
                start = lattice.special_boundaries[dir].n_even;
            } else {
                if (par == EVEN)
                    n = lattice.special_boundaries[dir].n_even;
                else
                    n = lattice.special_boundaries[dir].n_total;
            }
            unsigned offset = lattice.special_boundaries[dir].offset + start;

            unsigned *d_site_index;
            check_device_error("earlier");
            gpuMalloc(&d_site_index, n * sizeof(unsigned));
            gpuMemcpy(d_site_index, lattice.special_boundaries[dir].move_index + start,
                      n * sizeof(unsigned), gpuMemcpyHostToDevice);

            unsigned N_blocks = n / N_threads + 1;
            set_local_boundary_elements_kernel<<<N_blocks, N_threads>>>(
                *this, offset, d_site_index, n, lattice.field_alloc_size());

            gpuFree(d_site_index);
        } else {
            assert(sizeof(T) < 0 &&
                   "Antiperiodic b.c. cannot be used with field elements without unary -");
        }
    }

#else
    assert(!antiperiodic && "antiperiodic only with SPECIAL_BOUNDARY_CONDITIONS defined");
#endif
}
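
// Layout note: in the special-boundary site tables even sites come first and odd
// sites after them (hence start = n_even when par == ODD above); the negated values
// are written to the halo slots starting at special_boundaries[dir].offset + start.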

// Place communicated elements to the field array
template <typename T>
__global__ void place_comm_elements_kernel(field_storage<T> field, T *buffer, unsigned offset,
                                           const int n, const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        using base_t = hila::arithmetic_type<T>;
        constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
        T element;
        base_t *ep = (base_t *)&element;
        base_t *fp = (base_t *)(buffer);
        for (unsigned e = 0; e < n_elements; e++) {
            ep[e] = fp[Index + n * e];
        }
        field.set(element, offset + Index, field_alloc_size);
    }
}

// Standard MPI, buffer is on the cpu and needs to be copied across
template <typename T>
void field_storage<T>::place_comm_elements(Direction d, Parity par, T *RESTRICT buffer,
                                           const lattice_struct::comm_node_struct &from_node,
                                           const lattice_struct &lattice) {

    unsigned n = from_node.n_sites(par);
    T *d_buffer;

#ifdef GPU_AWARE_MPI
    // MPI buffer is on device
    d_buffer = buffer;
#else
    // Allocate space and copy the buffer to the device
    gpuMalloc(&(d_buffer), n * sizeof(T));
    gpuMemcpy(d_buffer, buffer, n * sizeof(T), gpuMemcpyHostToDevice);
#endif

    unsigned N_blocks = n / N_threads + 1;
    place_comm_elements_kernel<<<N_blocks, N_threads>>>((*this), d_buffer, from_node.offset(par), n,
                                                        lattice.field_alloc_size());

#ifndef GPU_AWARE_MPI
    gpuFree(d_buffer);
#endif
}
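
// The unpacking loop above reads the buffer with the same stride (fp[Index + n * e])
// that gather_comm_elements_kernel used for packing, so the element layout in the MPI
// buffer matches on the sending and receiving sides.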

#ifdef GPU_AWARE_MPI

template <typename T>
void field_storage<T>::free_mpi_buffer(T *d_buffer) {
    gpuFree(d_buffer);
}

template <typename T>
T *field_storage<T>::allocate_mpi_buffer(unsigned n) {
    T *d_buffer;
    gpuMalloc(&(d_buffer), n * sizeof(T));
    return d_buffer;
}

#else

template <typename T>
void field_storage<T>::free_mpi_buffer(T *buffer) {
    std::free(buffer);
}

template <typename T>
T *field_storage<T>::allocate_mpi_buffer(unsigned n) {
    return (T *)memalloc(n * sizeof(T));
}

#endif
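
// Summary of the two variants above: with GPU_AWARE_MPI the communication buffers are
// device memory (gpuMalloc/gpuFree) handed directly to MPI; without it they are host
// buffers obtained from memalloc and released with std::free.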

#else // now HILAPP

// Hilapp requires a dummy implementation for functions with auto return type
template <typename T>
auto field_storage<T>::get(const unsigned i, const unsigned field_alloc_size) const {
    T value;
    return value;
}
template <typename T>
auto field_storage<T>::get_element(const unsigned i, const lattice_struct &lattice) const {
    T value;
    return value;
}

#endif // !HILAPP .. HILAPP

#endif