HILA
backend_gpu/field_storage_backend.h
#ifndef GPU_BACKEND_H
#define GPU_BACKEND_H

#include "../defs.h"
#include "../field_storage.h"

/* CUDA / HIP implementations */
template <typename T>
inline void field_storage<T>::allocate_field(const lattice_struct &lattice) {
    // Allocate space for the field on the device
    gpuMalloc(&fieldbuf, sizeof(T) * lattice.field_alloc_size());
    if (fieldbuf == nullptr) {
        std::cout << "Failure in field memory allocation\n";
    }
    assert(fieldbuf != nullptr);
}

template <typename T>
inline void field_storage<T>::free_field() {
    if (fieldbuf != nullptr) {
        gpuFree(fieldbuf);
    }
    fieldbuf = nullptr;
}

// Only attempt to compile with CUDA compiler.
// Hilapp will skip these.
// #if defined(__CUDACC__) || defined(__HIPCC__)
#if !defined(HILAPP)

// These are used in device code. Can be called directly in a kernel.
template <typename T>
__device__ inline auto field_storage<T>::get(const unsigned i,
                                             const unsigned field_alloc_size) const {
    assert(i < field_alloc_size);
    using base_t = hila::arithmetic_type<T>;
    constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
    T value;
    base_t *value_f = (base_t *)&value;
    base_t *fp = (base_t *)(fieldbuf);
    for (unsigned e = 0; e < n_elements; e++) {
        value_f[e] = fp[e * field_alloc_size + i];
    }
    return value;
}

template <typename T>
// template <typename A>
__device__ inline void field_storage<T>::set(const T &value, const unsigned i,
                                             const unsigned field_alloc_size) {
    assert(i < field_alloc_size);
    using base_t = hila::arithmetic_type<T>;
    constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
    const base_t *value_f = (base_t *)&value;
    base_t *fp = (base_t *)(fieldbuf);
    for (unsigned e = 0; e < n_elements; e++) {
        fp[e * field_alloc_size + i] = value_f[e];
    }
}

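// --- Illustration (not part of HILA): the element-major layout used by get()/set() ---
// The field buffer stores scalar component e of site i at fp[e * field_alloc_size + i],
// i.e. all sites' first components are contiguous, then all second components, and so on.
// Consecutive threads working on consecutive sites therefore touch consecutive addresses
// for each component (coalesced access). A hedged host-side sketch of the index mapping;
// the function name is hypothetical:
inline unsigned example_component_major_index(unsigned site, unsigned component,
                                              unsigned field_alloc_size) {
    // same mapping as get()/set() above
    return component * field_alloc_size + site;
}
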
/// Get a single element from the field outside a loop. Slow, should only be used for
/// setup
template <typename T>
__global__ void get_element_kernel(field_storage<T> field, char *buffer, unsigned i,
                                   const unsigned field_alloc_size) {
    *((T *)buffer) = field.get(i, field_alloc_size);
}

template <typename T>
auto field_storage<T>::get_element(const unsigned i, const lattice_struct &lattice) const {
    char *d_buffer;
    T value;

    // Call the kernel to collect the element
    gpuMalloc(&(d_buffer), sizeof(T));
    get_element_kernel<<<1, 1>>>(*this, d_buffer, i, lattice.field_alloc_size());

    // Copy the result to the host
    gpuMemcpy((char *)(&value), d_buffer, sizeof(T), gpuMemcpyDeviceToHost);
    gpuFree(d_buffer);
    return value;
}

/// Set a single element from outside a loop. Slow, should only be used for setup
template <typename T>
__global__ void set_element_kernel(field_storage<T> field, T value, unsigned i,
                                   const unsigned field_alloc_size) {
    field.set(value, i, field_alloc_size);
}

template <typename T>
__global__ void set_element_kernel_ptr(field_storage<T> field, const T *buf, unsigned i,
                                       const unsigned field_alloc_size) {
    field.set(*buf, i, field_alloc_size);
}


template <typename T>
template <typename A>
void field_storage<T>::set_element(A &value, const unsigned i, const lattice_struct &lattice) {
    T t_value = value;

    if constexpr (sizeof(T) <= GPU_GLOBAL_ARG_MAX_SIZE) {
        // pass element directly as arg
        set_element_kernel<<<1, 1>>>(*this, t_value, i, lattice.field_alloc_size());
    } else {
        // This branch is used for large variables:
        // allocate space and copy the buffer to the device
        T *d_buffer;
        gpuMalloc(&(d_buffer), sizeof(T));
        gpuMemcpy(d_buffer, (char *)&t_value, sizeof(T), gpuMemcpyHostToDevice);

        // call the kernel to set correct indexes
        set_element_kernel_ptr<<<1, 1>>>(*this, d_buffer, i, lattice.field_alloc_size());
        gpuFree(d_buffer);
    }
}

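// --- Illustration (not part of HILA): intended use of the slow element accessors ---
// A hedged sketch of a setup-time access pattern: read one site to the host, modify it,
// and write it back. Each call launches a one-thread kernel plus a host<->device copy,
// so this is only sensible outside performance-critical loops. The function name is
// hypothetical; the negation assumes T has a unary - operator.
template <typename T>
void example_negate_one_site(field_storage<T> &storage, unsigned site_index,
                             const lattice_struct &lattice) {
    T v = storage.get_element(site_index, lattice); // kernel launch + device-to-host copy
    v = -v;
    storage.set_element(v, site_index, lattice);    // copy/arg to device + kernel launch
}
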
/// A kernel that gathers elements
template <typename T>
__global__ void gather_elements_kernel(field_storage<T> field, T *buffer, unsigned *site_index,
                                       const int n, const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        buffer[Index] = field.get(site_index[Index], field_alloc_size);
    }
}

/// CUDA implementation of gather_elements without CUDA aware MPI
template <typename T>
void field_storage<T>::gather_elements(T *RESTRICT buffer, const unsigned *RESTRICT index_list,
                                       int n, const lattice_struct &lattice) const {
    unsigned *d_site_index;
    T *d_buffer;

    // Copy the list of boundary site indexes to the device
    gpuMalloc(&(d_site_index), n * sizeof(unsigned));
    gpuMemcpy(d_site_index, index_list, n * sizeof(unsigned), gpuMemcpyHostToDevice);

    // Call the kernel to build the list of elements
    gpuMalloc(&(d_buffer), n * sizeof(T));
    int N_blocks = n / N_threads + 1;
    gather_elements_kernel<<<N_blocks, N_threads>>>(*this, d_buffer, d_site_index, n,
                                                    lattice.field_alloc_size());

    // Copy the result to the host
    gpuMemcpy((char *)buffer, d_buffer, n * sizeof(T), gpuMemcpyDeviceToHost);

    gpuFree(d_site_index);
    gpuFree(d_buffer);
}

/// A kernel that gathers elements negated
// requires unary -
template <typename T, std::enable_if_t<hila::has_unary_minus<T>::value, int> = 0>
__global__ void gather_elements_negated_kernel(field_storage<T> field, T *buffer,
                                               unsigned *site_index, const int n,
                                               const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        buffer[Index] = -field.get(site_index[Index], field_alloc_size);
    }
}

/// CUDA implementation of gather_elements_negated without CUDA aware MPI
// The only difference to the above is the kernel that is called
template <typename T>
void field_storage<T>::gather_elements_negated(T *RESTRICT buffer,
                                               const unsigned *RESTRICT index_list, int n,
                                               const lattice_struct &lattice) const {
    unsigned *d_site_index;
    T *d_buffer;

    if constexpr (!hila::has_unary_minus<T>::value) {
        assert(sizeof(T) < 0 && "Unary 'operator-()' must be defined for Field variable "
                                "for antiperiodic b.c.");
    } else {

        // Copy the list of boundary site indexes to the device
        gpuMalloc(&(d_site_index), n * sizeof(unsigned));
        gpuMemcpy(d_site_index, index_list, n * sizeof(unsigned), gpuMemcpyHostToDevice);

        // Call the kernel to build the list of elements
        gpuMalloc(&(d_buffer), n * sizeof(T));
        int N_blocks = n / N_threads + 1;
        gather_elements_negated_kernel<<<N_blocks, N_threads>>>(*this, d_buffer, d_site_index, n,
                                                                lattice.field_alloc_size());

        // Copy the result to the host
        gpuMemcpy(buffer, d_buffer, n * sizeof(T), gpuMemcpyDeviceToHost);

        gpuFree(d_site_index);
        gpuFree(d_buffer);
    }
}

template <typename T>
__global__ void gather_comm_elements_kernel(field_storage<T> field, T *buffer, unsigned *site_index,
                                            const int n, const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        using base_t = hila::arithmetic_type<T>;
        constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
        T element = field.get(site_index[Index], field_alloc_size);
        base_t *ep = (base_t *)&element;
        base_t *fp = (base_t *)(buffer);
        for (unsigned e = 0; e < n_elements; e++) {
            fp[Index + n * e] = ep[e];
        }
    }
}

// gather negated elements; only if unary - exists
template <typename T, std::enable_if_t<hila::has_unary_minus<T>::value, int> = 0>
__global__ void gather_comm_elements_negated_kernel(field_storage<T> field, T *buffer,
                                                    unsigned *site_index, const int n,
                                                    const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        using base_t = hila::arithmetic_type<T>;
        constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
        T element = -field.get(site_index[Index], field_alloc_size);
        base_t *ep = (base_t *)&element;
        base_t *fp = (base_t *)(buffer);
        for (unsigned e = 0; e < n_elements; e++) {
            fp[Index + n * e] = ep[e];
        }
    }
}

// The index list is constant: map each CPU pointer to a device pointer and copy it only once
struct cuda_comm_node_struct {
    const unsigned *cpu_index;
    unsigned *gpu_index;
    int n;
};

inline unsigned *get_site_index(const lattice_struct::comm_node_struct &to_node, Parity par,
                                int &n) {
    static std::vector<struct cuda_comm_node_struct> comm_nodes;

    const unsigned *cpu_index = to_node.get_sitelist(par, n);
    for (struct cuda_comm_node_struct comm_node : comm_nodes) {
        if (cpu_index == comm_node.cpu_index && n == comm_node.n) {
            return comm_node.gpu_index;
        }
    }
    struct cuda_comm_node_struct comm_node;
    comm_node.cpu_index = cpu_index;
    comm_node.n = n;
    gpuMalloc(&(comm_node.gpu_index), n * sizeof(unsigned));
    gpuMemcpy(comm_node.gpu_index, cpu_index, n * sizeof(unsigned), gpuMemcpyHostToDevice);
    comm_nodes.push_back(comm_node);
    return comm_node.gpu_index;
}

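// --- Illustration (not part of HILA): what the get_site_index() cache buys ---
// A hedged sketch: the first call for a given comm node copies its site list to the
// device; later calls with the same (cpu_index, n) pair return the cached device copy
// without another gpuMemcpy. The function name is hypothetical and to_node is assumed
// to come from the lattice communication setup.
inline void example_site_index_is_cached(const lattice_struct::comm_node_struct &to_node,
                                         Parity par) {
    int n1, n2;
    unsigned *first = get_site_index(to_node, par, n1);  // allocates and copies host-to-device
    unsigned *again = get_site_index(to_node, par, n2);  // cache hit: same device pointer
    assert(first == again && n1 == n2);
}
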
// MPI buffer on the device. Use the gather_elements and gather_elements_negated
// kernels to fill the buffer.
template <typename T>
void field_storage<T>::gather_comm_elements(T *buffer,
                                            const lattice_struct::comm_node_struct &to_node,
                                            Parity par, const lattice_struct &lattice,
                                            bool antiperiodic) const {
    int n;
    unsigned *d_site_index = get_site_index(to_node, par, n);
    T *d_buffer;

#ifdef GPU_AWARE_MPI
    // The buffer is already on the device
    d_buffer = buffer;
#else
    // Allocate a buffer on the device
    gpuMalloc(&(d_buffer), n * sizeof(T));
#endif

    // Call the kernel to build the list of elements
    int N_blocks = n / N_threads + 1;
    if (antiperiodic) {

        if constexpr (hila::has_unary_minus<T>::value) {
            gather_comm_elements_negated_kernel<<<N_blocks, N_threads>>>(
                *this, d_buffer, d_site_index, n, lattice.field_alloc_size());
        }

    } else {
        gather_comm_elements_kernel<<<N_blocks, N_threads>>>(*this, d_buffer, d_site_index, n,
                                                             lattice.field_alloc_size());
    }

#ifndef GPU_AWARE_MPI
    // Copy the result to the host
    gpuMemcpy((char *)buffer, d_buffer, n * sizeof(T), gpuMemcpyDeviceToHost);
    gpuFree(d_buffer);
#endif
}

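// --- Illustration (not part of HILA): layout of the packed comm buffer ---
// gather_comm_elements_kernel() writes the buffer component-major: scalar component e of
// the j-th gathered site sits at offset e * n + j, which place_comm_elements_kernel()
// below inverts. A hedged host-side sketch that unpacks one element from a host copy of
// such a buffer; the function name is hypothetical.
template <typename T>
T example_unpack_comm_element(const T *buffer, int j, int n) {
    using base_t = hila::arithmetic_type<T>;
    constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
    T element;
    base_t *ep = (base_t *)&element;
    const base_t *fp = (const base_t *)buffer;
    for (unsigned e = 0; e < n_elements; e++) {
        ep[e] = fp[e * n + j];
    }
    return element;
}
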
/// A kernel that scatters the elements
template <typename T>
__global__ void place_elements_kernel(field_storage<T> field, T *buffer, unsigned *site_index,
                                      const int n, const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        field.set(buffer[Index], site_index[Index], field_alloc_size);
    }
}

/// CUDA implementation of place_elements without CUDA aware MPI
template <typename T>
void field_storage<T>::place_elements(T *RESTRICT buffer, const unsigned *RESTRICT index_list,
                                      int n, const lattice_struct &lattice) {
    unsigned *d_site_index;
    T *d_buffer;

    // Allocate space and copy the buffer to the device
    gpuMalloc(&(d_buffer), n * sizeof(T));
    gpuMemcpy(d_buffer, buffer, n * sizeof(T), gpuMemcpyHostToDevice);

    // Copy the list of boundary site indexes to the device
    gpuMalloc(&(d_site_index), n * sizeof(unsigned));
    gpuMemcpy(d_site_index, index_list, n * sizeof(unsigned), gpuMemcpyHostToDevice);

    // Call the kernel to place the elements
    int N_blocks = n / N_threads + 1;
    place_elements_kernel<<<N_blocks, N_threads>>>(*this, d_buffer, d_site_index, n,
                                                   lattice.field_alloc_size());

    gpuFree(d_buffer);
    gpuFree(d_site_index);
}

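// --- Illustration (not part of HILA): gather_elements()/place_elements() as a pair ---
// A hedged sketch: pull an arbitrary list of sites to the host, transform them there,
// and push them back to the device. Assumes <vector> is available and that T has a
// unary - operator; the function name is hypothetical.
template <typename T>
void example_transform_sites_on_host(field_storage<T> &storage, const unsigned *index_list,
                                     int n, const lattice_struct &lattice) {
    std::vector<T> buf(n);
    storage.gather_elements(buf.data(), index_list, n, lattice); // device -> host
    for (T &v : buf)
        v = -v;
    storage.place_elements(buf.data(), index_list, n, lattice);  // host -> device
}
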
template <typename T, std::enable_if_t<hila::has_unary_minus<T>::value, int> = 0>
__global__ void set_local_boundary_elements_kernel(field_storage<T> field, unsigned offset,
                                                   unsigned *site_index, const int n,
                                                   const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        T value;
        value = -field.get(site_index[Index], field_alloc_size);
        field.set(value, offset + Index, field_alloc_size);
    }
}

template <typename T>
void field_storage<T>::set_local_boundary_elements(Direction dir, Parity par,
                                                   const lattice_struct &lattice,
                                                   bool antiperiodic) {
    // Only need to do something for antiperiodic boundaries
#ifdef SPECIAL_BOUNDARY_CONDITIONS
    if (antiperiodic) {
        if constexpr (hila::has_unary_minus<T>::value) {
            unsigned n, start = 0;
            if (par == ODD) {
                n = lattice.special_boundaries[dir].n_odd;
                start = lattice.special_boundaries[dir].n_even;
            } else {
                if (par == EVEN)
                    n = lattice.special_boundaries[dir].n_even;
                else
                    n = lattice.special_boundaries[dir].n_total;
            }
            unsigned offset = lattice.special_boundaries[dir].offset + start;

            unsigned *d_site_index;
            check_device_error("earlier");
            gpuMalloc(&d_site_index, n * sizeof(unsigned));
            gpuMemcpy(d_site_index, lattice.special_boundaries[dir].move_index + start,
                      n * sizeof(unsigned), gpuMemcpyHostToDevice);

            unsigned N_blocks = n / N_threads + 1;
            set_local_boundary_elements_kernel<<<N_blocks, N_threads>>>(
                *this, offset, d_site_index, n, lattice.field_alloc_size());

            gpuFree(d_site_index);
        } else {
            assert(false && "Antiperiodic b.c. cannot be used with unsigned field elements");
        }
    }

#else
    assert(!antiperiodic && "antiperiodic only with SPECIAL_BOUNDARY_CONDITIONS defined");
#endif
}

// Place communicated elements to the field array
template <typename T>
__global__ void place_comm_elements_kernel(field_storage<T> field, T *buffer, unsigned offset,
                                           const int n, const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        using base_t = hila::arithmetic_type<T>;
        constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
        T element;
        base_t *ep = (base_t *)&element;
        base_t *fp = (base_t *)(buffer);
        for (unsigned e = 0; e < n_elements; e++) {
            ep[e] = fp[Index + n * e];
        }
        field.set(element, offset + Index, field_alloc_size);
    }
}

// Standard MPI, buffer is on the CPU and needs to be copied across
template <typename T>
void field_storage<T>::place_comm_elements(Direction d, Parity par, T *RESTRICT buffer,
                                           const lattice_struct::comm_node_struct &from_node,
                                           const lattice_struct &lattice) {

    unsigned n = from_node.n_sites(par);
    T *d_buffer;

#ifdef GPU_AWARE_MPI
    // MPI buffer is on device
    d_buffer = buffer;
#else
    // Allocate space and copy the buffer to the device
    gpuMalloc(&(d_buffer), n * sizeof(T));
    gpuMemcpy(d_buffer, buffer, n * sizeof(T), gpuMemcpyHostToDevice);
#endif

    unsigned N_blocks = n / N_threads + 1;
    place_comm_elements_kernel<<<N_blocks, N_threads>>>((*this), d_buffer, from_node.offset(par), n,
                                                        lattice.field_alloc_size());

#ifndef GPU_AWARE_MPI
    gpuFree(d_buffer);
#endif
}

#ifdef GPU_AWARE_MPI

template <typename T>
void field_storage<T>::free_mpi_buffer(T *d_buffer) {
    gpuFree(d_buffer);
}

template <typename T>
T *field_storage<T>::allocate_mpi_buffer(int n) {
    T *d_buffer;
    gpuMalloc(&(d_buffer), n * sizeof(T));
    return d_buffer;
}

#else

template <typename T>
void field_storage<T>::free_mpi_buffer(T *buffer) {
    std::free(buffer);
}

template <typename T>
T *field_storage<T>::allocate_mpi_buffer(int n) {
    return (T *)memalloc(n * sizeof(T));
}

#endif

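// --- Illustration (not part of HILA): a minimal halo-exchange sketch ---
// A hedged sketch of how the pieces above combine: pack the sites going to one neighbour
// with gather_comm_elements(), exchange buffers over MPI, and scatter the incoming sites
// with place_comm_elements(). With GPU_AWARE_MPI the buffers from allocate_mpi_buffer()
// live on the device and the MPI library must be CUDA/ROCm aware; otherwise they are host
// memory. The neighbour ranks, direction and comm_node objects are assumed to come from
// the lattice communication setup; <mpi.h> is assumed available, error handling and
// non-blocking overlap are omitted, and the function name is hypothetical.
template <typename T>
void example_halo_exchange(field_storage<T> &storage, Direction d, Parity par,
                           const lattice_struct::comm_node_struct &to_node,
                           const lattice_struct::comm_node_struct &from_node,
                           int send_rank, int recv_rank, const lattice_struct &lattice,
                           MPI_Comm comm) {
    int n_send = to_node.n_sites(par);
    int n_recv = from_node.n_sites(par);
    T *send_buf = storage.allocate_mpi_buffer(n_send);
    T *recv_buf = storage.allocate_mpi_buffer(n_recv);

    storage.gather_comm_elements(send_buf, to_node, par, lattice, false); // pack outgoing sites

    MPI_Sendrecv(send_buf, (int)(n_send * sizeof(T)), MPI_BYTE, send_rank, 0,
                 recv_buf, (int)(n_recv * sizeof(T)), MPI_BYTE, recv_rank, 0,
                 comm, MPI_STATUS_IGNORE);

    storage.place_comm_elements(d, par, recv_buf, from_node, lattice); // scatter incoming sites

    storage.free_mpi_buffer(send_buf);
    storage.free_mpi_buffer(recv_buf);
}
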
#else // now HILAPP

// Hilapp requires a dummy implementation for functions with auto return type
template <typename T>
auto field_storage<T>::get(const unsigned i, const unsigned field_alloc_size) const {
    T value;
    return value;
}
template <typename T>
auto field_storage<T>::get_element(const unsigned i, const lattice_struct &lattice) const {
    T value;
    return value;
}

#endif // !HILAPP .. HILAPP

#endif
The field_storage struct contains minimal information for using the field in a loop....
void place_elements(T *__restrict__ buffer, const unsigned *__restrict__ index_list, int n, const lattice_struct &lattice)
CUDA implementation of place_elements without CUDA aware MPI.
void gather_elements(T *__restrict__ buffer, const unsigned *__restrict__ index_list, int n, const lattice_struct &lattice) const
CUDA implementation of gather_elements without CUDA aware MPI.
void gather_elements_negated(T *__restrict__ buffer, const unsigned *__restrict__ index_list, int n, const lattice_struct &lattice) const
CUDA implementation of gather_elements_negated without CUDA aware MPI.
void set_local_boundary_elements(Direction dir, Parity par, const lattice_struct &lattice, bool antiperiodic)
Place boundary elements from local lattice (used in vectorized version)
auto get_element(const unsigned i, const lattice_struct &lattice) const
void place_comm_elements(Direction d, Parity par, T *__restrict__ buffer, const lattice_struct::comm_node_struct &from_node, const lattice_struct &lattice)
Place boundary elements from neighbour.
Conditionally returns bool false if type T doesn't have a unary - operator.
Parity
Parity enum with values EVEN, ODD, ALL; refers to parity of the site. Parity of site (x,...
constexpr Parity EVEN
bit pattern: 001
constexpr Parity ODD
bit pattern: 010
Direction
Enumerator for directions; assigns an integer to each direction, to be interpreted as a unit vector.
Definition coordinates.h:34
#define RESTRICT
Definition defs.h:51
Information necessary to communicate with a node.
Definition lattice.h:134