#include "../field_storage.h"

    // field allocation: reserve device memory for the field data
    gpuMalloc(&fieldbuf, sizeof(T) * lattice.field_alloc_size());
    if (fieldbuf == nullptr) {
        std::cout << "Failure in field memory allocation\n";
    }
    assert(fieldbuf != nullptr);

    // field deallocation: release the device buffer if it has been allocated
    if (fieldbuf != nullptr) {
    // get(): read one element; components are stored in "structure of arrays" order
                                 const unsigned field_alloc_size) const {
    assert(i < field_alloc_size);
    using base_t = hila::arithmetic_type<T>;
    constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
    T value;
    base_t *value_f = (base_t *)&value;
    base_t *fp = (base_t *)(fieldbuf);
    for (unsigned e = 0; e < n_elements; e++) {
        value_f[e] = fp[e * field_alloc_size + i];
    }
    // set(): write one element, the mirror image of get() above
                                 const unsigned field_alloc_size) {
    assert(i < field_alloc_size);
    using base_t = hila::arithmetic_type<T>;
    constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
    const base_t *value_f = (base_t *)&value;
    base_t *fp = (base_t *)(fieldbuf);
    for (unsigned e = 0; e < n_elements; e++) {
        fp[e * field_alloc_size + i] = value_f[e];
    }
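The get/set pair above addresses the field in structure-of-arrays form: base component e of site i sits at flat offset e * field_alloc_size + i, so each component stays contiguous across neighbouring sites and device accesses can coalesce. A minimal sketch of the arithmetic (the helper name and the example numbers are illustrative only):

inline unsigned soa_offset(unsigned e, unsigned i, unsigned field_alloc_size) {
    // component e of site i in the device buffer of base_t values
    return e * field_alloc_size + i;
}
// e.g. with 6 base components and field_alloc_size = 1024, site 7 occupies
// offsets 7, 1031, 2055, 3079, 4103 and 5127, one entry in each component block.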
template <typename T>
__global__ void get_element_kernel(field_storage<T> field, char *buffer, unsigned i,
                                   const unsigned field_alloc_size) {
    *((T *)buffer) = field.get(i, field_alloc_size);
}
    gpuMalloc(&(d_buffer), sizeof(T));
    get_element_kernel<<<1, 1>>>(*this, d_buffer, i, lattice.field_alloc_size());

    gpuMemcpy((char *)(&value), d_buffer, sizeof(T), gpuMemcpyDeviceToHost);
template <typename T>
__global__ void set_element_kernel(field_storage<T> field, T value, unsigned i,
                                   const unsigned field_alloc_size) {
    field.set(value, i, field_alloc_size);
}
template <typename T>
__global__ void set_element_kernel_ptr(field_storage<T> field, const T *buf, unsigned i,
                                       const unsigned field_alloc_size) {
    field.set(*buf, i, field_alloc_size);
}
    if constexpr (sizeof(T) <= GPU_GLOBAL_ARG_MAX_SIZE) {
        // small enough to pass directly as a kernel argument
        set_element_kernel<<<1, 1>>>(*this, t_value, i, lattice.field_alloc_size());
    } else {
        // too large for a kernel argument: stage the value through device memory
        gpuMalloc(&(d_buffer), sizeof(T));
        gpuMemcpy(d_buffer, (char *)&t_value, sizeof(T), gpuMemcpyHostToDevice);
        set_element_kernel_ptr<<<1, 1>>>(*this, d_buffer, i, lattice.field_alloc_size());
    }
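The same size-based dispatch can be written generically. The sketch below only restates the pattern used above (pass small payloads by value, stage large ones through a temporary device buffer); the demo_kernel / demo_kernel_ptr names are hypothetical, while GPU_GLOBAL_ARG_MAX_SIZE is the by-value argument-size limit the code above already relies on.

template <typename T> __global__ void demo_kernel(T v) { /* use v on the device */ }
template <typename T> __global__ void demo_kernel_ptr(const T *p) { /* use *p on the device */ }

template <typename T>
void launch_demo(T t_value) {
    if constexpr (sizeof(T) <= GPU_GLOBAL_ARG_MAX_SIZE) {
        demo_kernel<<<1, 1>>>(t_value);
    } else {
        T *d_tmp;
        gpuMalloc(&d_tmp, sizeof(T));
        gpuMemcpy(d_tmp, (char *)&t_value, sizeof(T), gpuMemcpyHostToDevice);
        demo_kernel_ptr<<<1, 1>>>(d_tmp);
        gpuFree(d_tmp);
    }
}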
template <typename T>
__global__ void gather_elements_kernel(field_storage<T> field, T *buffer, unsigned *site_index,
                                       const int n, const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n)
        buffer[Index] = field.get(site_index[Index], field_alloc_size);
}
    unsigned *d_site_index;

    // copy the list of site indices to the device
    gpuMalloc(&(d_site_index), n * sizeof(unsigned));
    gpuMemcpy(d_site_index, index_list, n * sizeof(unsigned), gpuMemcpyHostToDevice);

    // gather on the device into a temporary buffer
    gpuMalloc(&(d_buffer), n * sizeof(T));
    int N_blocks = n / N_threads + 1;
    gather_elements_kernel<<<N_blocks, N_threads>>>(*this, d_buffer, d_site_index, n,
                                                    lattice.field_alloc_size());

    // copy the result to the host buffer
    gpuMemcpy((char *)buffer, d_buffer, n * sizeof(T), gpuMemcpyDeviceToHost);

    gpuFree(d_site_index);
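A hedged usage sketch of the documented interface gather_elements(buffer, index_list, n, lattice): copy a handful of sites from the device field into a host array. The storage and lattice objects, the element type double, and the site numbers are placeholders.

unsigned idx[3] = {12, 77, 203};   // illustrative site indices
double host_buf[3];                // assuming a field_storage<double> called 'storage'
storage.gather_elements(host_buf, idx, 3, lattice);
// host_buf[k] now holds the field value stored at site idx[k]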
template <typename T, std::enable_if_t<hila::has_unary_minus<T>::value, int> = 0>
__global__ void gather_elements_negated_kernel(field_storage<T> field, T *buffer,
                                               unsigned *site_index, const int n,
                                               const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n)
        buffer[Index] = -field.get(site_index[Index], field_alloc_size);
}
                                               const unsigned *RESTRICT index_list, int n,

    unsigned *d_site_index;

    // always-false condition: reaching this assert means T lacks the required unary operator-()
    assert(sizeof(T) < 0 && "Unary 'operator-()' must be defined for Field variable "
                            "for antiperiodic b.c.");

    // copy the list of site indices to the device
    gpuMalloc(&(d_site_index), n * sizeof(unsigned));
    gpuMemcpy(d_site_index, index_list, n * sizeof(unsigned), gpuMemcpyHostToDevice);

    // gather the negated elements into a temporary device buffer
    gpuMalloc(&(d_buffer), n * sizeof(T));
    int N_blocks = n / N_threads + 1;
    gather_elements_negated_kernel<<<N_blocks, N_threads>>>(*this, d_buffer, d_site_index, n,
                                                            lattice.field_alloc_size());

    gpuMemcpy(buffer, d_buffer, n * sizeof(T), gpuMemcpyDeviceToHost);

    gpuFree(d_site_index);
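The negated-gather path is guarded by hila::has_unary_minus. As an illustration only, a trait of this kind can be built with the detection idiom; this sketch is an assumption about its general shape, not the framework's actual definition. Plain unsigned types are rejected on purpose, matching the boundary-condition restriction mentioned further below.

#include <type_traits>
#include <utility>

template <typename T, typename = void>
struct has_unary_minus_sketch : std::false_type {};

// chosen when -t is well-formed; unsigned arithmetic types are still rejected
template <typename T>
struct has_unary_minus_sketch<T, std::void_t<decltype(-std::declval<T>())>>
    : std::bool_constant<!std::is_unsigned<T>::value> {};

static_assert(has_unary_minus_sketch<double>::value, "");
static_assert(!has_unary_minus_sketch<unsigned>::value, "");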
template <typename T>
__global__ void gather_comm_elements_kernel(field_storage<T> field, T *buffer,
                                            unsigned *site_index, const int n,
                                            const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        using base_t = hila::arithmetic_type<T>;
        constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
        T element = field.get(site_index[Index], field_alloc_size);
        base_t *ep = (base_t *)&element;
        base_t *fp = (base_t *)(buffer);
        for (unsigned e = 0; e < n_elements; e++) {
            fp[Index + n * e] = ep[e];
        }
    }
}
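The communication buffer written above is component-major over the n gathered sites: base component e of buffer slot s lands at fp[s + n * e], so consecutive threads write consecutive addresses. A one-line index helper makes the layout explicit (the name is illustrative):

inline unsigned comm_buf_offset(unsigned s, unsigned e, unsigned n) {
    // buffer layout: [ all slots, component 0 | all slots, component 1 | ... ]
    return s + n * e;
}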
template <typename T, std::enable_if_t<hila::has_unary_minus<T>::value, int> = 0>
__global__ void gather_comm_elements_negated_kernel(field_storage<T> field, T *buffer,
                                                    unsigned *site_index, const int n,
                                                    const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        using base_t = hila::arithmetic_type<T>;
        constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
        T element = -field.get(site_index[Index], field_alloc_size);
        base_t *ep = (base_t *)&element;
        base_t *fp = (base_t *)(buffer);
        for (unsigned e = 0; e < n_elements; e++) {
            fp[Index + n * e] = ep[e];
        }
    }
}
// cache of site-index lists that have already been copied to the device
struct cuda_comm_node_struct {
    const unsigned *cpu_index;
    unsigned *gpu_index;
    int n;
};

    static std::vector<struct cuda_comm_node_struct> comm_nodes;

    const unsigned *cpu_index = to_node.get_sitelist(par, n);
    for (struct cuda_comm_node_struct comm_node : comm_nodes) {
        if (cpu_index == comm_node.cpu_index && n == comm_node.n) {
            return comm_node.gpu_index;
        }
    }

    // not found: upload the list and remember it
    struct cuda_comm_node_struct comm_node;
    comm_node.cpu_index = cpu_index;
    comm_node.n = n;
    gpuMalloc(&(comm_node.gpu_index), n * sizeof(unsigned));
    gpuMemcpy(comm_node.gpu_index, cpu_index, n * sizeof(unsigned), gpuMemcpyHostToDevice);
    comm_nodes.push_back(comm_node);
    return comm_node.gpu_index;
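Repeated communications with the same node and parity therefore reuse the device copy: the cache keys on the host pointer returned by to_node.get_sitelist(par, n) together with the length. A hedged call-pattern sketch (variable names are placeholders; n is filled in by the call):

int n;
unsigned *d_idx_first  = get_site_index(to_node, par, n);  // first call uploads and caches the list
unsigned *d_idx_second = get_site_index(to_node, par, n);  // later call returns the cached pointer
// d_idx_first == d_idx_second as long as the node's site list has not changed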
                            bool antiperiodic) const {

    unsigned *d_site_index = get_site_index(to_node, par, n);

    gpuMalloc(&(d_buffer), n * sizeof(T));

    int N_blocks = n / N_threads + 1;
    if (antiperiodic) {
        gather_comm_elements_negated_kernel<<<N_blocks, N_threads>>>(
            *this, d_buffer, d_site_index, n, lattice.field_alloc_size());
    } else {
        gather_comm_elements_kernel<<<N_blocks, N_threads>>>(*this, d_buffer, d_site_index, n,
                                                             lattice.field_alloc_size());
    }

    gpuMemcpy((char *)buffer, d_buffer, n * sizeof(T), gpuMemcpyDeviceToHost);
template <typename T>
__global__ void place_elements_kernel(field_storage<T> field, T *buffer, unsigned *site_index,
                                      const int n, const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n)
        field.set(buffer[Index], site_index[Index], field_alloc_size);
}
    unsigned *d_site_index;

    // copy the values and the site indices to the device
    gpuMalloc(&(d_buffer), n * sizeof(T));
    gpuMemcpy(d_buffer, buffer, n * sizeof(T), gpuMemcpyHostToDevice);

    gpuMalloc(&(d_site_index), n * sizeof(unsigned));
    gpuMemcpy(d_site_index, index_list, n * sizeof(unsigned), gpuMemcpyHostToDevice);

    // scatter into the field on the device
    int N_blocks = n / N_threads + 1;
    place_elements_kernel<<<N_blocks, N_threads>>>(*this, d_buffer, d_site_index, n,
                                                   lattice.field_alloc_size());

    gpuFree(d_site_index);
template <typename T, std::enable_if_t<hila::has_unary_minus<T>::value, int> = 0>
__global__ void set_local_boundary_elements_kernel(field_storage<T> field, unsigned offset,
                                                   unsigned *site_index, const int n,
                                                   const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        T value;
        value = -field.get(site_index[Index], field_alloc_size);
        field.set(value, offset + Index, field_alloc_size);
    }
}
#ifdef SPECIAL_BOUNDARY_CONDITIONS
    // choose the number of boundary sites and the starting offset by parity
    unsigned n, start = 0;
    if (par == ODD) {
        n = lattice.special_boundaries[dir].n_odd;
        start = lattice.special_boundaries[dir].n_even;
    } else if (par == EVEN) {
        n = lattice.special_boundaries[dir].n_even;
    } else {
        n = lattice.special_boundaries[dir].n_total;
    }
    unsigned offset = lattice.special_boundaries[dir].offset + start;

    unsigned *d_site_index;
    check_device_error("earlier");
    gpuMalloc(&d_site_index, n * sizeof(unsigned));
    gpuMemcpy(d_site_index, lattice.special_boundaries[dir].move_index + start,
              n * sizeof(unsigned), gpuMemcpyHostToDevice);

    unsigned N_blocks = n / N_threads + 1;
    set_local_boundary_elements_kernel<<<N_blocks, N_threads>>>(
        *this, offset, d_site_index, n, lattice.field_alloc_size());

    gpuFree(d_site_index);

    // antiperiodic b.c. is not possible for unsigned field elements
    assert(false && "Antiperiodic b.c. cannot be used with unsigned field elements");
#else
    assert(!antiperiodic && "antiperiodic only with SPECIAL_BOUNDARY_CONDITIONS defined");
#endif
template <typename T>
__global__ void place_comm_elements_kernel(field_storage<T> field, T *buffer, unsigned offset,
                                           const int n, const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        using base_t = hila::arithmetic_type<T>;
        constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
        T element;
        base_t *ep = (base_t *)&element;
        base_t *fp = (base_t *)(buffer);
        for (unsigned e = 0; e < n_elements; e++) {
            ep[e] = fp[Index + n * e];
        }
        field.set(element, offset + Index, field_alloc_size);
    }
}
    unsigned n = from_node.n_sites(par);

    // copy the received buffer to the device and scatter it into the halo region
    gpuMalloc(&(d_buffer), n * sizeof(T));
    gpuMemcpy(d_buffer, buffer, n * sizeof(T), gpuMemcpyHostToDevice);

    unsigned N_blocks = n / N_threads + 1;
    place_comm_elements_kernel<<<N_blocks, N_threads>>>((*this), d_buffer, from_node.offset(par), n,
                                                        lattice.field_alloc_size());
    // communication buffer allocated in device memory
    gpuMalloc(&(d_buffer), n * sizeof(T));

    // communication buffer allocated in host memory
    return (T *)memalloc(n * sizeof(T));
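The two buffer-allocation fragments above correspond to the two communication modes mentioned in the documentation below: with GPU-aware MPI the send/receive buffer can live in device memory, otherwise a host buffer is used and the data is staged with gpuMemcpy. A hedged sketch of that selection (the macro name GPU_AWARE_MPI and the helper name are assumptions for illustration):

template <typename T>
T *alloc_comm_buffer_sketch(int n) {
#if defined(GPU_AWARE_MPI)
    T *d_buffer;
    gpuMalloc(&d_buffer, n * sizeof(T));   // MPI reads/writes device memory directly
    return d_buffer;
#else
    return (T *)memalloc(n * sizeof(T));   // host buffer; contents staged with gpuMemcpy
#endif
}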
field_storage
The field_storage struct contains minimal information for using the field in a loop....
void place_elements(T *__restrict__ buffer, const unsigned *__restrict__ index_list, int n, const lattice_struct &lattice)
CUDA implementation of place_elements without CUDA aware MPI.
void gather_elements(T *__restrict__ buffer, const unsigned *__restrict__ index_list, int n, const lattice_struct &lattice) const
CUDA implementation of gather_elements without CUDA aware MPI.
void gather_elements_negated(T *__restrict__ buffer, const unsigned *__restrict__ index_list, int n, const lattice_struct &lattice) const
CUDA implementation of gather_elements_negated without CUDA aware MPI.
void set_local_boundary_elements(Direction dir, Parity par, const lattice_struct &lattice, bool antiperiodic)
Place boundary elements from local lattice (used in vectorized version)
auto get_element(const unsigned i, const lattice_struct &lattice) const
void place_comm_elements(Direction d, Parity par, T *__restrict__ buffer, const lattice_struct::comm_node_struct &from_node, const lattice_struct &lattice)
Place boundary elements from neighbour.
hila::has_unary_minus
Conditionally returns bool type false if type T doesn't have a unary - operator.
Parity
Parity enum with values EVEN, ODD, ALL; refers to parity of the site. Parity of site (x,...
constexpr Parity EVEN
bit pattern: 001
constexpr Parity ODD
bit pattern: 010
Direction
Enumerator for directions; assigns an integer to each direction, to be interpreted as a unit vector.
lattice_struct::comm_node_struct
Information necessary to communicate with a node.