HILA
backend_gpu/field_storage_backend.h
#ifndef GPU_BACKEND_H
#define GPU_BACKEND_H

#include "../defs.h"
#include "../field_storage.h"

/* CUDA / HIP implementations */
template <typename T>
void field_storage<T>::allocate_field(const lattice_struct &lattice) {
    // Allocate space for the field on the device
    gpuMalloc(&fieldbuf, sizeof(T) * lattice.field_alloc_size());
    if (fieldbuf == nullptr) {
        std::cout << "Failure in field memory allocation\n";
    }
    assert(fieldbuf != nullptr);
}

template <typename T>
void field_storage<T>::free_field() {
    if (fieldbuf != nullptr) {
        gpuFree(fieldbuf);
    }
    fieldbuf = nullptr;
}
// Only attempt to compile with CUDA compiler.
// Hilapp will skip these.
// #if defined(__CUDACC__) || defined(__HIPCC__)
#if !defined(HILAPP)

// These are used in device code. Can be called directly in a kernel.
template <typename T>
__device__ inline auto field_storage<T>::get(const unsigned i,
                                             const unsigned field_alloc_size) const {
    assert(i < field_alloc_size);
    using base_t = hila::arithmetic_type<T>;
    constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
    T value;
    base_t *value_f = (base_t *)&value;
    base_t *fp = (base_t *)(fieldbuf);
    for (unsigned e = 0; e < n_elements; e++) {
        value_f[e] = fp[e * field_alloc_size + i];
    }
    return value;
}

template <typename T>
// template <typename A>
__device__ inline void field_storage<T>::set(const T &value, const unsigned i,
                                             const unsigned field_alloc_size) {
    assert(i < field_alloc_size);
    using base_t = hila::arithmetic_type<T>;
    constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
    const base_t *value_f = (base_t *)&value;
    base_t *fp = (base_t *)(fieldbuf);
    for (unsigned e = 0; e < n_elements; e++) {
        fp[e * field_alloc_size + i] = value_f[e];
    }
}
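
// --- Illustrative sketch (not part of the original header) ---
// A minimal example kernel, assuming a field_storage<T> passed by value, a caller-supplied
// site count, and a T that defines operator+. It shows how the get()/set() accessors above
// are meant to be used directly in device code: each thread loads one site, operates on it,
// and stores it back with the same strided (element-major) layout.
template <typename T>
__global__ void example_scale_kernel(field_storage<T> field, const int n,
                                     const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        T value = field.get(Index, field_alloc_size);      // strided load of one site
        field.set(value + value, Index, field_alloc_size); // write 2*value back to the site
    }
}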

/// Get a single element from the field outside a loop. Slow, should only be used for
/// setup
template <typename T>
__global__ void get_element_kernel(field_storage<T> field, char *buffer, unsigned i,
                                   const unsigned field_alloc_size) {
    *((T *)buffer) = field.get(i, field_alloc_size);
}

template <typename T>
auto field_storage<T>::get_element(const unsigned i, const lattice_struct &lattice) const {
    char *d_buffer;
    T value;

    // Call the kernel to collect the element
    gpuMalloc(&(d_buffer), sizeof(T));
    get_element_kernel<<<1, 1>>>(*this, d_buffer, i, lattice.field_alloc_size());

    // Copy the result to the host
    gpuMemcpy((char *)(&value), d_buffer, sizeof(T), gpuMemcpyDeviceToHost);
    gpuFree(d_buffer);
    return value;
}

/// Set a single element from outside a loop. Slow, should only be used for setup
template <typename T>
__global__ void set_element_kernel(field_storage<T> field, T value, unsigned i,
                                   const unsigned field_alloc_size) {
    field.set(value, i, field_alloc_size);
}

template <typename T>
template <typename A>
void field_storage<T>::set_element(A &value, const unsigned i, const lattice_struct &lattice) {
    char *d_buffer;
    T t_value = value;

    // Allocate space and copy the buffer to the device
    // gpuMalloc(&(d_buffer), sizeof(T));
    // gpuMemcpy(d_buffer, (char *)&t_value, sizeof(T), gpuMemcpyHostToDevice);

    // call the kernel to set correct indexes
    // set_element_kernel<<<1, 1>>>(*this, d_buffer, i, lattice.field_alloc_size());
    // gpuFree(d_buffer);

    set_element_kernel<<<1, 1>>>(*this, t_value, i, lattice.field_alloc_size());
}
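
// --- Illustrative sketch (not part of the original header) ---
// A hypothetical host-side helper (example_roundtrip_element is not a HILA function),
// assuming an initialized lattice_struct. It demonstrates the intended setup-time use of
// get_element()/set_element(): each call launches a 1-thread kernel and does one
// host<->device copy, so these are far too slow for per-site access inside loops.
template <typename T>
inline T example_roundtrip_element(field_storage<T> &storage, const unsigned i,
                                   const lattice_struct &lattice) {
    T value = storage.get_element(i, lattice); // copy one site from the device
    storage.set_element(value, i, lattice);    // write it back unchanged
    return value;
}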

/// A kernel that gathers elements
template <typename T>
__global__ void gather_elements_kernel(field_storage<T> field, T *buffer, unsigned *site_index,
                                       const int n, const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        buffer[Index] = field.get(site_index[Index], field_alloc_size);
    }
}

/// CUDA implementation of gather_elements without CUDA aware MPI
template <typename T>
void field_storage<T>::gather_elements(T *RESTRICT buffer, const unsigned *RESTRICT index_list,
                                       int n, const lattice_struct &lattice) const {
    unsigned *d_site_index;
    T *d_buffer;

    // Copy the list of boundary site indexes to the device
    gpuMalloc(&(d_site_index), n * sizeof(unsigned));
    gpuMemcpy(d_site_index, index_list, n * sizeof(unsigned), gpuMemcpyHostToDevice);

    // Call the kernel to build the list of elements
    gpuMalloc(&(d_buffer), n * sizeof(T));
    int N_blocks = n / N_threads + 1;
    gather_elements_kernel<<<N_blocks, N_threads>>>(*this, d_buffer, d_site_index, n,
                                                    lattice.field_alloc_size());

    // Copy the result to the host
    gpuMemcpy((char *)buffer, d_buffer, n * sizeof(T), gpuMemcpyDeviceToHost);

    gpuFree(d_site_index);
    gpuFree(d_buffer);
}
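
// --- Illustrative sketch (not part of the original header) ---
// A hypothetical helper (example_gather_to_host is not a HILA function), assuming that
// index_list holds n valid site indices and that std::vector is available through the
// headers included above (it is also used by get_site_index below). It shows the
// non-GPU-aware-MPI gather pattern implemented above: stage the indices on the device,
// gather into a device buffer, and copy the result to the host.
template <typename T>
inline std::vector<T> example_gather_to_host(const field_storage<T> &storage,
                                             const unsigned *index_list, int n,
                                             const lattice_struct &lattice) {
    std::vector<T> host_buffer(n);
    storage.gather_elements(host_buffer.data(), index_list, n, lattice);
    return host_buffer;
}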

/// A kernel that gathers elements negated
// requires unary -
template <typename T, std::enable_if_t<hila::has_unary_minus<T>::value, int> = 0>
__global__ void gather_elements_negated_kernel(field_storage<T> field, T *buffer,
                                               unsigned *site_index, const int n,
                                               const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        buffer[Index] = -field.get(site_index[Index], field_alloc_size);
    }
}

/// CUDA implementation of gather_elements_negated without CUDA aware MPI
// The only difference to the above is the kernel that is called
template <typename T>
void field_storage<T>::gather_elements_negated(T *RESTRICT buffer,
                                               const unsigned *RESTRICT index_list, int n,
                                               const lattice_struct &lattice) const {
    unsigned *d_site_index;
    T *d_buffer;

    if constexpr (!hila::has_unary_minus<T>::value) {
        assert(sizeof(T) < 0 && "Unary 'operator- ()' must be defined for Field variable "
                                "for antiperiodic b.c.");
    } else {

        // Copy the list of boundary site indexes to the device
        gpuMalloc(&(d_site_index), n * sizeof(unsigned));
        gpuMemcpy(d_site_index, index_list, n * sizeof(unsigned), gpuMemcpyHostToDevice);

        // Call the kernel to build the list of elements
        gpuMalloc(&(d_buffer), n * sizeof(T));
        int N_blocks = n / N_threads + 1;
        gather_elements_negated_kernel<<<N_blocks, N_threads>>>(*this, d_buffer, d_site_index, n,
                                                                lattice.field_alloc_size());

        // Copy the result to the host
        gpuMemcpy(buffer, d_buffer, n * sizeof(T), gpuMemcpyDeviceToHost);

        gpuFree(d_site_index);
        gpuFree(d_buffer);
    }
}

template <typename T>
__global__ void gather_comm_elements_kernel(field_storage<T> field, T *buffer, unsigned *site_index,
                                            const int n, const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        using base_t = hila::arithmetic_type<T>;
        constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
        T element = field.get(site_index[Index], field_alloc_size);
        base_t *ep = (base_t *)&element;
        base_t *fp = (base_t *)(buffer);
        for (unsigned e = 0; e < n_elements; e++) {
            fp[Index + n * e] = ep[e];
        }
    }
}

// gather -elements only if unary - exists
template <typename T, std::enable_if_t<hila::has_unary_minus<T>::value, int> = 0>
__global__ void gather_comm_elements_negated_kernel(field_storage<T> field, T *buffer,
                                                    unsigned *site_index, const int n,
                                                    const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        using base_t = hila::arithmetic_type<T>;
        constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
        T element = -field.get(site_index[Index], field_alloc_size);
        base_t *ep = (base_t *)&element;
        base_t *fp = (base_t *)(buffer);
        for (unsigned e = 0; e < n_elements; e++) {
            fp[Index + n * e] = ep[e];
        }
    }
}

// Index list is constant? Map each cpu pointer to a device pointer and copy just once
struct cuda_comm_node_struct {
    const unsigned *cpu_index;
    unsigned *gpu_index;
    int n;
};

inline unsigned *get_site_index(const lattice_struct::comm_node_struct &to_node, Parity par,
                                int &n) {
    static std::vector<struct cuda_comm_node_struct> comm_nodes;

    const unsigned *cpu_index = to_node.get_sitelist(par, n);
    for (struct cuda_comm_node_struct comm_node : comm_nodes) {
        if (cpu_index == comm_node.cpu_index && n == comm_node.n) {
            return comm_node.gpu_index;
        }
    }
    struct cuda_comm_node_struct comm_node;
    comm_node.cpu_index = cpu_index;
    comm_node.n = n;
    gpuMalloc(&(comm_node.gpu_index), n * sizeof(unsigned));
    gpuMemcpy(comm_node.gpu_index, cpu_index, n * sizeof(unsigned), gpuMemcpyHostToDevice);
    comm_nodes.push_back(comm_node);
    return comm_node.gpu_index;
}

// MPI buffer on the device. Use the gather_elements and gather_elements_negated
// kernels to fill the buffer.
template <typename T>
void field_storage<T>::gather_comm_elements(T *RESTRICT buffer,
                                            const lattice_struct::comm_node_struct &to_node,
                                            Parity par, const lattice_struct &lattice,
                                            bool antiperiodic) const {
    int n;
    unsigned *d_site_index = get_site_index(to_node, par, n);
    T *d_buffer;

#ifdef GPU_AWARE_MPI
    // The buffer is already on the device
    d_buffer = buffer;
#else
    // Allocate a buffer on the device
    gpuMalloc(&(d_buffer), n * sizeof(T));
#endif

    // Call the kernel to build the list of elements
    int N_blocks = n / N_threads + 1;
    if (antiperiodic) {

        if constexpr (hila::has_unary_minus<T>::value) {
            gather_comm_elements_negated_kernel<<<N_blocks, N_threads>>>(
                *this, d_buffer, d_site_index, n, lattice.field_alloc_size());
        }

    } else {
        gather_comm_elements_kernel<<<N_blocks, N_threads>>>(*this, d_buffer, d_site_index, n,
                                                             lattice.field_alloc_size());
    }

#ifndef GPU_AWARE_MPI
    // Copy the result to the host
    gpuMemcpy((char *)buffer, d_buffer, n * sizeof(T), gpuMemcpyDeviceToHost);
    gpuFree(d_buffer);
#endif
}

/// A kernel that scatters the elements
template <typename T>
__global__ void place_elements_kernel(field_storage<T> field, T *buffer, unsigned *site_index,
                                      const int n, const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        field.set(buffer[Index], site_index[Index], field_alloc_size);
    }
}

/// CUDA implementation of place_elements without CUDA aware MPI
template <typename T>
void field_storage<T>::place_elements(T *RESTRICT buffer, const unsigned *RESTRICT index_list,
                                      int n, const lattice_struct &lattice) {
    unsigned *d_site_index;
    T *d_buffer;

    // Allocate space and copy the buffer to the device
    gpuMalloc(&(d_buffer), n * sizeof(T));
    gpuMemcpy(d_buffer, buffer, n * sizeof(T), gpuMemcpyHostToDevice);

    // Copy the list of boundary site indexes to the device
    gpuMalloc(&(d_site_index), n * sizeof(unsigned));
    gpuMemcpy(d_site_index, index_list, n * sizeof(unsigned), gpuMemcpyHostToDevice);

    // Call the kernel to place the elements
    int N_blocks = n / N_threads + 1;
    place_elements_kernel<<<N_blocks, N_threads>>>(*this, d_buffer, d_site_index, n,
                                                   lattice.field_alloc_size());

    gpuFree(d_buffer);
    gpuFree(d_site_index);
}
319
320template <typename T, std::enable_if_t<hila::has_unary_minus<T>::value, int> = 0>
321__global__ void set_local_boundary_elements_kernel(field_storage<T> field, unsigned offset,
322 unsigned *site_index, const int n,
323 const unsigned field_alloc_size) {
324 unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
325 if (Index < n) {
326 T value;
327 value = -field.get(site_index[Index], field_alloc_size);
328 field.set(value, offset + Index, field_alloc_size);
329 }
330}
331
332template <typename T>
334 const lattice_struct &lattice,
335 bool antiperiodic) {
336 // Only need to do something for antiperiodic boundaries
337#ifdef SPECIAL_BOUNDARY_CONDITIONS
338 if (antiperiodic) {
339 if constexpr (hila::has_unary_minus<T>::value) {
340 unsigned n, start = 0;
341 if (par == ODD) {
342 n = lattice.special_boundaries[dir].n_odd;
343 start = lattice.special_boundaries[dir].n_even;
344 } else {
345 if (par == EVEN)
346 n = lattice.special_boundaries[dir].n_even;
347 else
348 n = lattice.special_boundaries[dir].n_total;
349 }
350 unsigned offset = lattice.special_boundaries[dir].offset + start;
351
352 unsigned *d_site_index;
353 check_device_error("earlier");
354 gpuMalloc(&d_site_index, n * sizeof(unsigned));
355 gpuMemcpy(d_site_index, lattice.special_boundaries[dir].move_index + start,
356 n * sizeof(unsigned), gpuMemcpyHostToDevice);
357
358 unsigned N_blocks = n / N_threads + 1;
359 set_local_boundary_elements_kernel<<<N_blocks, N_threads>>>(
360 *this, offset, d_site_index, n, lattice.field_alloc_size());
361
362 gpuFree(d_site_index);
363 } else {
364 assert("Antiperiodic b.c. cannot be used with unsigned field elements");
365 }
366 }
367
368#else
369 assert(!antiperiodic && "antiperiodic only with SPECIAL_BOUNDARY_CONDITIONS defined");
370#endif
371}

// Place communicated elements to the field array
template <typename T>
__global__ void place_comm_elements_kernel(field_storage<T> field, T *buffer, unsigned offset,
                                           const int n, const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        using base_t = hila::arithmetic_type<T>;
        constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
        T element;
        base_t *ep = (base_t *)&element;
        base_t *fp = (base_t *)(buffer);
        for (unsigned e = 0; e < n_elements; e++) {
            ep[e] = fp[Index + n * e];
        }
        field.set(element, offset + Index, field_alloc_size);
    }
}

// Standard MPI, buffer is on the cpu and needs to be copied across
template <typename T>
void field_storage<T>::place_comm_elements(Direction d, Parity par, T *RESTRICT buffer,
                                           const lattice_struct::comm_node_struct &from_node,
                                           const lattice_struct &lattice) {

    unsigned n = from_node.n_sites(par);
    T *d_buffer;

#ifdef GPU_AWARE_MPI
    // MPI buffer is on device
    d_buffer = buffer;
#else
    // Allocate space and copy the buffer to the device
    gpuMalloc(&(d_buffer), n * sizeof(T));
    gpuMemcpy(d_buffer, buffer, n * sizeof(T), gpuMemcpyHostToDevice);
#endif

    unsigned N_blocks = n / N_threads + 1;
    place_comm_elements_kernel<<<N_blocks, N_threads>>>((*this), d_buffer, from_node.offset(par), n,
                                                        lattice.field_alloc_size());

#ifndef GPU_AWARE_MPI
    gpuFree(d_buffer);
#endif
}

#ifdef GPU_AWARE_MPI

template <typename T>
void field_storage<T>::free_mpi_buffer(T *d_buffer) {
    gpuFree(d_buffer);
}

template <typename T>
T *field_storage<T>::allocate_mpi_buffer(unsigned n) {
    T *d_buffer;
    gpuMalloc(&(d_buffer), n * sizeof(T));
    return d_buffer;
}

#else

template <typename T>
void field_storage<T>::free_mpi_buffer(T *buffer) {
    std::free(buffer);
}

template <typename T>
T *field_storage<T>::allocate_mpi_buffer(unsigned n) {
    return (T *)memalloc(n * sizeof(T));
}

#endif

#else // now HILAPP

// Hilapp requires a dummy implementation for functions with auto return type
template <typename T>
auto field_storage<T>::get(const unsigned i, const unsigned field_alloc_size) const {
    T value;
    return value;
}
template <typename T>
auto field_storage<T>::get_element(const unsigned i, const lattice_struct &lattice) const {
    T value;
    return value;
}

#endif // !HILAPP .. HILAPP

#endif
Referenced symbols:

field_storage
    The field_storage struct contains minimal information for using the field in a loop. ...
void place_elements(T *__restrict__ buffer, const unsigned *__restrict__ index_list, int n, const lattice_struct &lattice)
    CUDA implementation of place_elements without CUDA aware MPI.
void gather_elements(T *__restrict__ buffer, const unsigned *__restrict__ index_list, int n, const lattice_struct &lattice) const
    CUDA implementation of gather_elements without CUDA aware MPI.
void gather_elements_negated(T *__restrict__ buffer, const unsigned *__restrict__ index_list, int n, const lattice_struct &lattice) const
    CUDA implementation of gather_elements_negated without CUDA aware MPI.
void set_local_boundary_elements(Direction dir, Parity par, const lattice_struct &lattice, bool antiperiodic)
    Place boundary elements from the local lattice (used in the vectorized version).
auto get_element(const unsigned i, const lattice_struct &lattice) const
    Get a single element from the field outside a loop; slow, should only be used for setup.
void place_comm_elements(Direction d, Parity par, T *__restrict__ buffer, const lattice_struct::comm_node_struct &from_node, const lattice_struct &lattice)
    Place boundary elements from a neighbour.
Parity
    Parity enum with values EVEN, ODD, ALL; refers to the parity of a site. Parity of site (x, ...
constexpr Parity EVEN
    bit pattern: 001
constexpr Parity ODD
    bit pattern: 010
Direction
    Enumerator for direction that assigns an integer to each direction, to be interpreted as a unit vector. (coordinates.h:34)
#define RESTRICT
    (defs.h:50)
lattice_struct::comm_node_struct
    Information necessary to communicate with a node. (lattice.h:134)
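
A minimal host-side usage sketch (not part of the generated documentation), assuming an initialized lattice_struct, a valid list of n site indices, and a field element type T that can be assigned from an integer. The helper name example_fill_and_gather is hypothetical; buffer handling follows the non-GPU-aware-MPI path shown in the listing above.

template <typename T>
void example_fill_and_gather(const lattice_struct &lattice, const unsigned *index_list, int n) {
    field_storage<T> storage;
    storage.allocate_field(lattice);           // gpuMalloc of field_alloc_size() * sizeof(T)

    T unit = 1;                                // assumes T supports assignment from an integer
    for (unsigned i = 0; i < 4; i++)
        storage.set_element(unit, i, lattice); // slow per-site writes, setup only

    std::vector<T> boundary(n);
    storage.gather_elements(boundary.data(), index_list, n, lattice); // device -> host gather

    storage.free_field();                      // release the device allocation
}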