#include "../field_storage.h"
// Allocate the device buffer for the field
gpuMalloc(&fieldbuf, sizeof(T) * lattice.field_alloc_size());
if (fieldbuf == nullptr) {
    std::cout << "Failure in field memory allocation\n";
}
assert(fieldbuf != nullptr);

// Release the device buffer when the field is freed
if (fieldbuf != nullptr) {
    gpuFree(fieldbuf);
}
// Device-side element access: the field is stored as a structure of arrays, so
// basic component e of site i sits at fieldbuf[e * field_alloc_size + i]
template <typename T>
__device__ inline auto field_storage<T>::get(const unsigned i,
                                             const unsigned field_alloc_size) const {
    assert(i < field_alloc_size);
    using base_t = hila::arithmetic_type<T>;
    constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
    T value;
    base_t *value_f = (base_t *)&value;
    base_t *fp = (base_t *)(fieldbuf);
    for (unsigned e = 0; e < n_elements; e++) {
        value_f[e] = fp[e * field_alloc_size + i];
    }
    return value;
}
// Device-side element assignment, using the same strided layout as get()
template <typename T>
__device__ inline void field_storage<T>::set(const T &value, const unsigned i,
                                             const unsigned field_alloc_size) {
    assert(i < field_alloc_size);
    using base_t = hila::arithmetic_type<T>;
    constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
    const base_t *value_f = (base_t *)&value;
    base_t *fp = (base_t *)(fieldbuf);
    for (unsigned e = 0; e < n_elements; e++) {
        fp[e * field_alloc_size + i] = value_f[e];
    }
}
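// Illustration (added, not part of the header): get()/set() above store the field as a
// "structure of arrays", so consecutive threads (consecutive site indexes i) touch
// consecutive addresses and the accesses coalesce. A minimal host-only sketch of the
// same index arithmetic; MyElem, soa_get and soa_set are hypothetical names used only
// for this example.
#include <cassert>
#include <vector>

struct MyElem {
    double c[3]; // stands in for a field element T with three arithmetic components
};

inline void soa_set(std::vector<double> &buf, unsigned alloc_size, unsigned i, const MyElem &v) {
    assert(i < alloc_size);
    for (unsigned e = 0; e < 3; e++)
        buf[e * alloc_size + i] = v.c[e]; // component e of site i
}

inline MyElem soa_get(const std::vector<double> &buf, unsigned alloc_size, unsigned i) {
    assert(i < alloc_size);
    MyElem v;
    for (unsigned e = 0; e < 3; e++)
        v.c[e] = buf[e * alloc_size + i];
    return v;
}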
// Kernel: fetch a single element; used only by the slow host-side get_element()
template <typename T>
__global__ void get_element_kernel(field_storage<T> field, char *buffer, unsigned i,
                                   const unsigned field_alloc_size) {
    *((T *)buffer) = field.get(i, field_alloc_size);
}
// Body of get_element(): allocate a device scratch buffer, run the one-thread kernel
// above, and copy the single element back to the host
gpuMalloc(&(d_buffer), sizeof(T));
get_element_kernel<<<1, 1>>>(*this, d_buffer, i, lattice.field_alloc_size());

gpuMemcpy((char *)(&value), d_buffer, sizeof(T), gpuMemcpyDeviceToHost);
// Kernel: store a single element; used only by the slow host-side set_element()
template <typename T>
__global__ void set_element_kernel(field_storage<T> field, T value, unsigned i,
                                   const unsigned field_alloc_size) {
    field.set(value, i, field_alloc_size);
}
// Body of set_element(): launch the one-thread kernel with the new value
set_element_kernel<<<1, 1>>>(*this, t_value, i, lattice.field_alloc_size());
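// Usage note (added): get_element() and set_element() move a single element with a
// one-thread kernel launch plus a host<->device copy, so they are meant only for setup,
// I/O and debugging. Bulk access should go through the site loops or the gather/place
// routines below, which move a whole index list per kernel launch.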
// Kernel: gather the elements listed in site_index into a contiguous buffer
template <typename T>
__global__ void gather_elements_kernel(field_storage<T> field, T *buffer, unsigned *site_index,
                                       const int n, const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        buffer[Index] = field.get(site_index[Index], field_alloc_size);
    }
}
// Body of gather_elements() (CUDA implementation without CUDA-aware MPI): stage the
// index list on the device, gather into a device buffer, then copy to the host
unsigned *d_site_index;
T *d_buffer;

// Copy the list of site indexes to the device
gpuMalloc(&(d_site_index), n * sizeof(unsigned));
gpuMemcpy(d_site_index, index_list, n * sizeof(unsigned), gpuMemcpyHostToDevice);

// Gather the elements into a device buffer
gpuMalloc(&(d_buffer), n * sizeof(T));
int N_blocks = n / N_threads + 1;
gather_elements_kernel<<<N_blocks, N_threads>>>(*this, d_buffer, d_site_index, n,
                                                lattice.field_alloc_size());

// Copy the result to the host buffer
gpuMemcpy((char *)buffer, d_buffer, n * sizeof(T), gpuMemcpyDeviceToHost);

gpuFree(d_site_index);
// Kernel: gather negated elements (for antiperiodic boundary conditions);
// compiled only for element types that define unary minus
template <typename T, std::enable_if_t<hila::has_unary_minus<T>::value, int> = 0>
__global__ void gather_elements_negated_kernel(field_storage<T> field, T *buffer,
                                               unsigned *site_index, const int n,
                                               const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        buffer[Index] = -field.get(site_index[Index], field_alloc_size);
    }
}
// CUDA implementation of gather_elements_negated without CUDA-aware MPI; fails at
// runtime if the element type has no unary minus
template <typename T>
void field_storage<T>::gather_elements_negated(T *RESTRICT buffer,
                                               const unsigned *RESTRICT index_list, int n,
                                               const lattice_struct &lattice) const {
    unsigned *d_site_index;
    T *d_buffer;

    if constexpr (!hila::has_unary_minus<T>::value) {
        assert(sizeof(T) < 0 && "Unary 'operator- ()' must be defined for Field variable "
                                "for antiperiodic b.c.");
    } else {
        // Copy the list of site indexes to the device
        gpuMalloc(&(d_site_index), n * sizeof(unsigned));
        gpuMemcpy(d_site_index, index_list, n * sizeof(unsigned), gpuMemcpyHostToDevice);

        // Gather the negated elements into a device buffer
        gpuMalloc(&(d_buffer), n * sizeof(T));
        int N_blocks = n / N_threads + 1;
        gather_elements_negated_kernel<<<N_blocks, N_threads>>>(*this, d_buffer, d_site_index, n,
                                                                lattice.field_alloc_size());

        // Copy the result to the host buffer
        gpuMemcpy(buffer, d_buffer, n * sizeof(T), gpuMemcpyDeviceToHost);

        gpuFree(d_site_index);
    }
}
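// Sketch (added assumption): hila::has_unary_minus<T> is the library's trait, used above both
// to SFINAE-disable the negated kernels and to give a clear runtime error when an antiperiodic
// boundary is requested for a type without a usable operator-(). A generic detection trait of
// the same flavour can be written with std::void_t; my_has_unary_minus below is a hypothetical
// stand-in, not the library's definition.
#include <type_traits>
#include <utility>

template <typename T, typename = void>
struct my_has_unary_minus : std::false_type {};

// valid "-t" expression and not an unsigned type -> true
template <typename T>
struct my_has_unary_minus<T, std::void_t<decltype(-std::declval<T>())>>
    : std::bool_constant<!std::is_unsigned<T>::value> {};

static_assert(my_has_unary_minus<double>::value, "double can be negated");
static_assert(!my_has_unary_minus<unsigned>::value, "unsigned is excluded");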
// Kernel: gather elements for communication; the buffer is filled in the same
// structure-of-arrays order as the field, but with stride n
template <typename T>
__global__ void gather_comm_elements_kernel(field_storage<T> field, T *buffer,
                                            unsigned *site_index, const int n,
                                            const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        using base_t = hila::arithmetic_type<T>;
        constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
        T element = field.get(site_index[Index], field_alloc_size);
        base_t *ep = (base_t *)&element;
        base_t *fp = (base_t *)(buffer);
        for (unsigned e = 0; e < n_elements; e++) {
            fp[Index + n * e] = ep[e];
        }
    }
}
// Kernel: as above, but gathers -element (antiperiodic boundary conditions)
template <typename T, std::enable_if_t<hila::has_unary_minus<T>::value, int> = 0>
__global__ void gather_comm_elements_negated_kernel(field_storage<T> field, T *buffer,
                                                    unsigned *site_index, const int n,
                                                    const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        using base_t = hila::arithmetic_type<T>;
        constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
        T element = -field.get(site_index[Index], field_alloc_size);
        base_t *ep = (base_t *)&element;
        base_t *fp = (base_t *)(buffer);
        for (unsigned e = 0; e < n_elements; e++) {
            fp[Index + n * e] = ep[e];
        }
    }
}
// Helper caching device copies of the site index lists used in communication
struct cuda_comm_node_struct {
    const unsigned *cpu_index;
    unsigned *gpu_index;
    int n;
};

// Body of the lookup helper get_site_index(): return the cached device copy of the
// index list for this communication node and parity, creating it on first use
static std::vector<struct cuda_comm_node_struct> comm_nodes;

const unsigned *cpu_index = to_node.get_sitelist(par, n);
for (struct cuda_comm_node_struct comm_node : comm_nodes) {
    if (cpu_index == comm_node.cpu_index && n == comm_node.n) {
        return comm_node.gpu_index;
    }
}

// Not found: create a new entry and copy the index list to the device
struct cuda_comm_node_struct comm_node;
comm_node.cpu_index = cpu_index;
comm_node.n = n;
gpuMalloc(&(comm_node.gpu_index), n * sizeof(unsigned));
gpuMemcpy(comm_node.gpu_index, cpu_index, n * sizeof(unsigned), gpuMemcpyHostToDevice);
comm_nodes.push_back(comm_node);
return comm_node.gpu_index;
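// Design note (added): the host index lists returned by to_node.get_sitelist() are persistent,
// so get_site_index() keeps one device copy per (pointer, length) pair in the static comm_nodes
// vector and reuses it on subsequent calls; repeated halo exchanges therefore do not re-upload
// the same index list.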
// Body of gather_comm_elements(): gather the boundary elements going to one neighbour node
// into the host buffer for MPI; if the boundary is antiperiodic and T has a unary minus,
// the elements are negated
int n;
unsigned *d_site_index = get_site_index(to_node, par, n);
T *d_buffer;

gpuMalloc(&(d_buffer), n * sizeof(T));

int N_blocks = n / N_threads + 1;
if (antiperiodic) {
    if constexpr (hila::has_unary_minus<T>::value) {
        gather_comm_elements_negated_kernel<<<N_blocks, N_threads>>>(
            *this, d_buffer, d_site_index, n, lattice.field_alloc_size());
    }
} else {
    gather_comm_elements_kernel<<<N_blocks, N_threads>>>(*this, d_buffer, d_site_index, n,
                                                         lattice.field_alloc_size());
}

gpuMemcpy((char *)buffer, d_buffer, n * sizeof(T), gpuMemcpyDeviceToHost);
// Kernel: place elements from a contiguous buffer into the listed sites
template <typename T>
__global__ void place_elements_kernel(field_storage<T> field, T *buffer, unsigned *site_index,
                                      const int n, const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        field.set(buffer[Index], site_index[Index], field_alloc_size);
    }
}
// Body of place_elements() (CUDA implementation without CUDA-aware MPI): copy the host
// buffer and the index list to the device, then scatter into the field
unsigned *d_site_index;
T *d_buffer;

gpuMalloc(&(d_buffer), n * sizeof(T));
gpuMemcpy(d_buffer, buffer, n * sizeof(T), gpuMemcpyHostToDevice);

gpuMalloc(&(d_site_index), n * sizeof(unsigned));
gpuMemcpy(d_site_index, index_list, n * sizeof(unsigned), gpuMemcpyHostToDevice);

int N_blocks = n / N_threads + 1;
place_elements_kernel<<<N_blocks, N_threads>>>(*this, d_buffer, d_site_index, n,
                                               lattice.field_alloc_size());

gpuFree(d_site_index);
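// Sketch (added assumption): how gather_elements() and place_elements() can be combined into a
// host-staged halo exchange when CUDA-aware MPI is not available. exchange_halo and its
// rank/index-list arguments are hypothetical; only the two field_storage calls and plain MPI
// are taken as given.
#include <mpi.h>
#include <vector>

template <typename T>
void exchange_halo(field_storage<T> &fs, const lattice_struct &lattice,
                   const std::vector<unsigned> &send_sites, const std::vector<unsigned> &recv_sites,
                   int rank_to, int rank_from) {
    std::vector<T> sendbuf(send_sites.size()), recvbuf(recv_sites.size());

    // device -> host staging buffer
    fs.gather_elements(sendbuf.data(), send_sites.data(), (int)send_sites.size(), lattice);

    // ordinary host-to-host MPI; the buffers are sent as raw bytes of T
    MPI_Sendrecv(sendbuf.data(), (int)(sendbuf.size() * sizeof(T)), MPI_BYTE, rank_to, 0,
                 recvbuf.data(), (int)(recvbuf.size() * sizeof(T)), MPI_BYTE, rank_from, 0,
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);

    // host staging buffer -> device
    fs.place_elements(recvbuf.data(), recv_sites.data(), (int)recv_sites.size(), lattice);
}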
// Kernel: copy negated elements from the listed boundary sites into the local halo
// region starting at offset (antiperiodic boundary within the node)
template <typename T, std::enable_if_t<hila::has_unary_minus<T>::value, int> = 0>
__global__ void set_local_boundary_elements_kernel(field_storage<T> field, unsigned offset,
                                                   unsigned *site_index, const int n,
                                                   const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        T value;
        value = -field.get(site_index[Index], field_alloc_size);
        field.set(value, offset + Index, field_alloc_size);
    }
}
// Body of set_local_boundary_elements(): for an antiperiodic boundary the elements on the
// local boundary are copied, negated, into the special halo sites of the same node
#ifdef SPECIAL_BOUNDARY_CONDITIONS
if (antiperiodic) {
    if constexpr (hila::has_unary_minus<T>::value) {
        unsigned n, start = 0;
        if (par == ODD) {
            n = lattice.special_boundaries[dir].n_odd;
            start = lattice.special_boundaries[dir].n_even;
        } else if (par == EVEN) {
            n = lattice.special_boundaries[dir].n_even;
        } else {
            n = lattice.special_boundaries[dir].n_total;
        }
        unsigned offset = lattice.special_boundaries[dir].offset + start;

        unsigned *d_site_index;
        check_device_error("earlier");
        gpuMalloc(&d_site_index, n * sizeof(unsigned));
        gpuMemcpy(d_site_index, lattice.special_boundaries[dir].move_index + start,
                  n * sizeof(unsigned), gpuMemcpyHostToDevice);

        unsigned N_blocks = n / N_threads + 1;
        set_local_boundary_elements_kernel<<<N_blocks, N_threads>>>(
            *this, offset, d_site_index, n, lattice.field_alloc_size());

        gpuFree(d_site_index);
    } else {
        assert(sizeof(T) < 0 && "Antiperiodic b.c. cannot be used with unsigned field elements");
    }
}
#else
assert(!antiperiodic && "antiperiodic only with SPECIAL_BOUNDARY_CONDITIONS defined");
#endif
// Kernel: unpack elements received from a neighbour node into the halo starting at offset;
// the buffer uses the same strided layout written by the gather_comm kernels
template <typename T>
__global__ void place_comm_elements_kernel(field_storage<T> field, T *buffer, unsigned offset,
                                           const int n, const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        using base_t = hila::arithmetic_type<T>;
        constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
        T element;
        base_t *ep = (base_t *)&element;
        base_t *fp = (base_t *)(buffer);
        for (unsigned e = 0; e < n_elements; e++) {
            ep[e] = fp[Index + n * e];
        }
        field.set(element, offset + Index, field_alloc_size);
    }
}
// Body of place_comm_elements(): copy the received host buffer to the device and unpack it
// into the halo region of this field
unsigned n = from_node.n_sites(par);
T *d_buffer;

gpuMalloc(&(d_buffer), n * sizeof(T));
gpuMemcpy(d_buffer, buffer, n * sizeof(T), gpuMemcpyHostToDevice);

unsigned N_blocks = n / N_threads + 1;
place_comm_elements_kernel<<<N_blocks, N_threads>>>((*this), d_buffer, from_node.offset(par), n,
                                                    lattice.field_alloc_size());
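// Sketch (added, hypothetical): the gather_comm_*_kernel writers and place_comm_elements_kernel
// above use the same buffer layout, component e of gathered site Index at buffer[Index + n * e].
// A tiny host-side round trip showing that the two index formulas invert each other for plain
// double components; pack_comm/unpack_comm are illustration-only names.
#include <cassert>
#include <vector>

constexpr unsigned NCOMP = 4; // components per element in this toy example

inline void pack_comm(std::vector<double> &buf, int n, unsigned Index, const double (&ep)[NCOMP]) {
    for (unsigned e = 0; e < NCOMP; e++)
        buf[Index + n * e] = ep[e];
}

inline void unpack_comm(const std::vector<double> &buf, int n, unsigned Index, double (&ep)[NCOMP]) {
    for (unsigned e = 0; e < NCOMP; e++)
        ep[e] = buf[Index + n * e];
}

inline bool comm_layout_roundtrip_ok() {
    const int n = 8; // number of gathered sites
    std::vector<double> buf(n * NCOMP);
    double in[NCOMP] = {1, 2, 3, 4}, out[NCOMP] = {0, 0, 0, 0};
    pack_comm(buf, n, 5, in);   // pack site 5
    unpack_comm(buf, n, 5, out); // unpack it again
    for (unsigned e = 0; e < NCOMP; e++)
        if (in[e] != out[e])
            return false;
    return true;
}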
// Allocation of a communication buffer on the device
gpuMalloc(&(d_buffer), n * sizeof(T));

// Allocation of a communication buffer in host memory
return (T *)memalloc(n * sizeof(T));
// Host-side interface implemented in this backend (CUDA implementations without CUDA-aware MPI):
//   auto get_element(const unsigned i, const lattice_struct &lattice) const
//   void gather_elements(T *__restrict__ buffer, const unsigned *__restrict__ index_list,
//                        int n, const lattice_struct &lattice) const
//   void gather_elements_negated(T *__restrict__ buffer, const unsigned *__restrict__ index_list,
//                                int n, const lattice_struct &lattice) const
//   void place_elements(T *__restrict__ buffer, const unsigned *__restrict__ index_list,
//                       int n, const lattice_struct &lattice)
//   void place_comm_elements(Direction d, Parity par, T *__restrict__ buffer,
//                            const lattice_struct::comm_node_struct &from_node,
//                            const lattice_struct &lattice)    // place boundary elements from a neighbour
//   void set_local_boundary_elements(Direction dir, Parity par, const lattice_struct &lattice,
//                                    bool antiperiodic)         // place boundary elements from the local lattice
//
// Parity is an enum referring to the parity of a site, with values EVEN (bit pattern 001),
// ODD (bit pattern 010) and ALL; Direction enumerates the lattice directions, interpreted as
// unit vectors. lattice_struct::comm_node_struct holds the information needed to communicate
// with one node, and field_storage<T> itself contains the minimal information needed to use
// the field inside a site loop.