#include "../field_storage.h"
// Allocate the field buffer on the device
    gpuMalloc(&fieldbuf, sizeof(T) * lattice->mynode.field_alloc_size);
    if (fieldbuf == nullptr) {
        std::cout << "Failure in field memory allocation\n";
    }
    assert(fieldbuf != nullptr);

// Release the buffer only if it was actually allocated
    if (fieldbuf != nullptr) {
// field_storage<T>::get reads the value at site i; each of the n_elements scalar
// components is stored with stride field_alloc_size ("structure of arrays" layout)
auto get(const unsigned i, const unsigned field_alloc_size) const {
    using base_t = hila::arithmetic_type<T>;
    constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
    union {
        T value;
        base_t arr[n_elements];
    } u;
    const base_t *fp = (base_t *)(fieldbuf);
    for (unsigned e = 0; e < n_elements; e++) {
        u.arr[e] = fp[e * field_alloc_size + i];
// field_storage<T>::set writes value to site i using the same strided layout
void set(const T &value, const unsigned i, const unsigned field_alloc_size) {
    using base_t = hila::arithmetic_type<T>;
    constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
    const base_t *value_f = (base_t *)&value;
    base_t *fp = (base_t *)(fieldbuf);
    for (unsigned e = 0; e < n_elements; e++) {
        fp[e * field_alloc_size + i] = value_f[e];
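// Illustration (not part of the original listing): the element-strided indexing used by
// get() and set() above is a structure-of-arrays layout. Assuming a type T with two
// scalar components (n_elements == 2) and field_alloc_size == 1024, site i == 7 lives at
//     fp[0 * 1024 + 7]   // first component
//     fp[1 * 1024 + 7]   // second component
// so each component of consecutive sites is stored contiguously, and consecutive GPU
// threads reading consecutive sites produce coalesced memory accesses.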
template <typename T>
__global__ void get_element_kernel(field_storage<T> field, char *buffer, unsigned i,
                                   const unsigned field_alloc_size) {
    *((T *)buffer) = field.get(i, field_alloc_size);
}
// get_element: fetch a single site value to the host through a one-thread kernel
    char *d_buffer;
    gpuMalloc(&(d_buffer), sizeof(T));
    get_element_kernel<<<1, 1>>>(*this, d_buffer, i, lattice->mynode.field_alloc_size);
    gpuMemcpy((char *)(&value), d_buffer, sizeof(T), gpuMemcpyDeviceToHost);
template <typename T>
__global__ void set_element_kernel(field_storage<T> field, T value, unsigned i,
                                   const unsigned field_alloc_size) {
    field.set(value, i, field_alloc_size);
}
template <typename T>
__global__ void set_element_kernel_ptr(field_storage<T> field, const T *buf, unsigned i,
                                       const unsigned field_alloc_size) {
    field.set(*buf, i, field_alloc_size);
}
// set_element: when sizeof(T) is small enough (at most GPU_GLOBAL_ARG_MAX_SIZE) the
// value is passed directly as a kernel argument ...
    set_element_kernel<<<1, 1>>>(*this, t_value, i, lattice->mynode.field_alloc_size);

// ... otherwise it is staged through a small device buffer
    T *d_buffer;
    gpuMalloc(&(d_buffer), sizeof(T));
    gpuMemcpy(d_buffer, (char *)&t_value, sizeof(T), gpuMemcpyHostToDevice);
    set_element_kernel_ptr<<<1, 1>>>(*this, d_buffer, i, lattice->mynode.field_alloc_size);
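// Usage sketch (assumed calling code, based on the get_element / set_element wrappers
// above and the signatures in the reference list at the end; s is a field_storage<T>
// and lattice the node's Lattice handle):
//
//     T v = s.get_element(i, lattice);   // <<<1,1>>> kernel + device-to-host copy
//     s.set_element(v, i, lattice);      // host-to-device transfer + <<<1,1>>> kernel
//
// Each call launches its own kernel, so these accessors are meant for occasional
// single-site access; bulk transfers use the gather/place routines below.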
template <typename T>
__global__ void gather_elements_kernel(field_storage<T> field, T *buffer, unsigned *site_index,
                                       const int n, const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n)
        buffer[Index] = field.get(site_index[Index], field_alloc_size);
}
// field_storage<T>::gather_elements copies the values at the sites listed in
// index_list into a host buffer (CUDA path without GPU aware MPI)
void gather_elements(T *RESTRICT buffer, const unsigned *RESTRICT index_list, int n,
                     const Lattice lattice) const {
    unsigned *d_site_index;
    T *d_buffer;

    gpuMalloc(&(d_site_index), n * sizeof(unsigned));
    gpuMemcpy(d_site_index, index_list, n * sizeof(unsigned), gpuMemcpyHostToDevice);

    gpuMalloc(&(d_buffer), n * sizeof(T));
    gather_elements_kernel<<<N_blocks, N_threads>>>(*this, d_buffer, d_site_index, n,
                                                    lattice->mynode.field_alloc_size);

    gpuMemcpy((char *)buffer, d_buffer, n * sizeof(T), gpuMemcpyDeviceToHost);
    gpuFree(d_site_index);
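// The listing does not show how N_blocks is chosen for these launches; a sufficient
// choice (an assumption, not taken from the original) is a ceiling division over the
// block size N_threads, relying on the `if (Index < n)` guard inside the kernels:
//
//     int N_blocks = (n + N_threads - 1) / N_threads;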
#ifdef SPECIAL_BOUNDARY_CONDITIONS
template <typename T, std::enable_if_t<hila::has_unary_minus<T>::value, int> = 0>
__global__ void gather_elements_negated_kernel(field_storage<T> field, T *buffer,
                                               unsigned *site_index, const int n,
                                               const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n)
        buffer[Index] = -field.get(site_index[Index], field_alloc_size);
}
// field_storage<T>::gather_elements_negated works as gather_elements but gathers -value
// (needed for antiperiodic boundary conditions)
void gather_elements_negated(T *RESTRICT buffer, const unsigned *RESTRICT index_list, int n,
                             const Lattice lattice) const {
    unsigned *d_site_index;

    if constexpr (!hila::has_unary_minus<T>::value) {
        assert(sizeof(T) < 0 && "Unary 'operator- ()' must be defined for Field variable "
                                "for antiperiodic b.c.");
    } else {
        gpuMalloc(&(d_site_index), n * sizeof(unsigned));
        gpuMemcpy(d_site_index, index_list, n * sizeof(unsigned), gpuMemcpyHostToDevice);

        T *d_buffer;
        gpuMalloc(&(d_buffer), n * sizeof(T));
        gather_elements_negated_kernel<<<N_blocks, N_threads>>>(*this, d_buffer, d_site_index, n,
                                                                lattice->mynode.field_alloc_size);

        gpuMemcpy(buffer, d_buffer, n * sizeof(T), gpuMemcpyDeviceToHost);
        gpuFree(d_site_index);
template <typename T>
__global__ void gather_comm_elements_kernel(field_storage<T> field, T *buffer,
                                            const unsigned *site_index, const int n,
                                            const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        using base_t = hila::arithmetic_type<T>;
        constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
        T element = field.get(site_index[Index], field_alloc_size);
        base_t *ep = (base_t *)&element;
        base_t *fp = (base_t *)(buffer);
        for (unsigned e = 0; e < n_elements; e++) {
            fp[Index + n * e] = ep[e];
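// Illustration (not part of the original listing): the pack loop above transposes the
// data for the send buffer. With n == 4 gathered sites and n_elements == 2, the thread
// with Index == 1 writes its two components to fp[1 + 4*0] and fp[1 + 4*1]: all first
// components of the gathered sites are stored contiguously, then all second components.
// place_comm_elements_kernel below assumes the same strided-by-n layout when unpacking.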
template <typename T, std::enable_if_t<hila::has_unary_minus<T>::value, int> = 0>
__global__ void gather_comm_elements_negated_kernel(field_storage<T> field, T *buffer,
                                                    const unsigned *site_index, const int n,
                                                    const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        using base_t = hila::arithmetic_type<T>;
        constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
        T element = -field.get(site_index[Index], field_alloc_size);
        base_t *ep = (base_t *)&element;
        base_t *fp = (base_t *)(buffer);
        for (unsigned e = 0; e < n_elements; e++) {
            fp[Index + n * e] = ep[e];
// field_storage<T>::gather_comm_elements(..., bool antiperiodic) const: pack the sites
// to be sent to a neighbour node into a contiguous buffer
    int n;
    const unsigned *d_site_index = to_node.get_sitelist(par, n);
    T *d_buffer;

    gpuMalloc(&(d_buffer), n * sizeof(T));
#ifdef SPECIAL_BOUNDARY_CONDITIONS
    if (antiperiodic) {
        gather_comm_elements_negated_kernel<<<N_blocks, N_threads>>>(
            *this, d_buffer, d_site_index, n, lattice->mynode.field_alloc_size);
    } else {
        gather_comm_elements_kernel<<<N_blocks, N_threads>>>(*this, d_buffer, d_site_index, n,
                                                             lattice->mynode.field_alloc_size);
    }
#else
    gather_comm_elements_kernel<<<N_blocks, N_threads>>>(*this, d_buffer, d_site_index, n,
                                                         lattice->mynode.field_alloc_size);
#endif

    gpuMemcpy((char *)buffer, d_buffer, n * sizeof(T), gpuMemcpyDeviceToHost);
template <typename T>
__global__ void place_elements_kernel(field_storage<T> field, T *buffer, unsigned *site_index,
                                      const int n, const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n)
        field.set(buffer[Index], site_index[Index], field_alloc_size);
}
// field_storage<T>::place_elements scatters values from a host buffer back to the
// listed sites (CUDA path without GPU aware MPI)
void place_elements(T *RESTRICT buffer, const unsigned *RESTRICT index_list, int n,
                    const Lattice lattice) {
    unsigned *d_site_index;
    T *d_buffer;

    gpuMalloc(&(d_buffer), n * sizeof(T));
    gpuMemcpy(d_buffer, buffer, n * sizeof(T), gpuMemcpyHostToDevice);

    gpuMalloc(&(d_site_index), n * sizeof(unsigned));
    gpuMemcpy(d_site_index, index_list, n * sizeof(unsigned), gpuMemcpyHostToDevice);

    place_elements_kernel<<<N_blocks, N_threads>>>(*this, d_buffer, d_site_index, n,
                                                   lattice->mynode.field_alloc_size);

    gpuFree(d_site_index);
template <typename T, std::enable_if_t<hila::has_unary_minus<T>::value, int> = 0>
__global__ void set_local_boundary_elements_kernel(field_storage<T> field, unsigned offset,
                                                   unsigned *site_index, const int n,
                                                   const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        T value;
        value = -field.get(site_index[Index], field_alloc_size);
        field.set(value, offset + Index, field_alloc_size);
    }
}
// field_storage<T>::set_local_boundary_elements fills the special boundary (halo) sites
// of the local lattice; with antiperiodic b.c. the copied values are negated
void set_local_boundary_elements(Direction dir, Parity par, const Lattice lattice,
                                 bool antiperiodic) {
#ifdef SPECIAL_BOUNDARY_CONDITIONS
    if (antiperiodic) {
        unsigned n, start = 0;
        if (par == ODD) {
            n = lattice->special_boundaries[dir].n_odd;
            start = lattice->special_boundaries[dir].n_even;
        } else if (par == EVEN) {
            n = lattice->special_boundaries[dir].n_even;
        } else {
            n = lattice->special_boundaries[dir].n_total;
        }
        unsigned offset = lattice->special_boundaries[dir].offset + start;

        if constexpr (hila::has_unary_minus<T>::value) {
            unsigned *d_site_index;
            check_device_error("earlier");
            gpuMalloc(&d_site_index, n * sizeof(unsigned));
            gpuMemcpy(d_site_index, lattice->special_boundaries[dir].move_index + start,
                      n * sizeof(unsigned), gpuMemcpyHostToDevice);

            set_local_boundary_elements_kernel<<<N_blocks, N_threads>>>(
                *this, offset, d_site_index, n, lattice->mynode.field_alloc_size);

            gpuFree(d_site_index);
        } else {
            assert(sizeof(T) < 0 &&
                   "Antiperiodic b.c. cannot be used with unsigned field elements");
        }
    }
#else
    assert(!antiperiodic && "antiperiodic only with SPECIAL_BOUNDARY_CONDITIONS defined");
#endif
template <typename T>
__global__ void place_comm_elements_kernel(field_storage<T> field, T *buffer, unsigned offset,
                                           const int n, const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        using base_t = hila::arithmetic_type<T>;
        constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
        T element;
        base_t *ep = (base_t *)&element;
        base_t *fp = (base_t *)(buffer);
        for (unsigned e = 0; e < n_elements; e++) {
            ep[e] = fp[Index + n * e];
        }
        field.set(element, offset + Index, field_alloc_size);
    }
}
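// Note (not part of the original listing): this kernel is the inverse of the
// gather_comm_elements packing above: component e of halo site (offset + Index) is read
// from fp[Index + n * e] of the received buffer and written back into the strided field
// storage via field.set().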
// field_storage<T>::place_comm_elements unpacks a received buffer into the halo region
// of this node, starting at from_node.offset(par)
    unsigned n = from_node.n_sites(par);
    T *d_buffer;

    gpuMalloc(&(d_buffer), n * sizeof(T));
    gpuMemcpy(d_buffer, buffer, n * sizeof(T), gpuMemcpyHostToDevice);

    place_comm_elements_kernel<<<N_blocks, N_threads>>>((*this), d_buffer, from_node.offset(par), n,
                                                        lattice->mynode.field_alloc_size);
// Communication buffer allocation: on the device ...
    gpuMalloc(&(d_buffer), n * sizeof(T));

// ... or in ordinary host memory
    return (T *)memalloc(n * sizeof(T));
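// Interpretation (following the "without CUDA aware MPI" notes in the reference list
// below): communication buffers are placed on the device with gpuMalloc when they can
// be handed to the GPU directly, and in ordinary host memory with memalloc when the MPI
// transfers are staged through the CPU.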
Referenced symbols:

lattice_struct
Main interface class to lattices.
field_storage
The field_storage struct contains minimal information for using the field in a loop....
void place_elements(T *__restrict__ buffer, const unsigned *__restrict__ index_list, int n, const Lattice lattice)
CUDA implementation of place_elements without CUDA aware MPI.
void gather_elements(T *__restrict__ buffer, const unsigned *__restrict__ index_list, int n, const Lattice lattice) const
CUDA implementation of gather_elements without CUDA aware MPI.
auto get_element(const unsigned i, const Lattice lattice) const
void gather_elements_negated(T *__restrict__ buffer, const unsigned *__restrict__ index_list, int n, const Lattice lattice) const
void place_comm_elements(Direction d, Parity par, T *__restrict__ buffer, const lattice_struct::comm_node_struct &from_node, const Lattice lattice)
Place boundary elements from neighbour.
void set_local_boundary_elements(Direction dir, Parity par, const Lattice lattice, bool antiperiodic)
Place boundary elements from local lattice (used in vectorized version).
hila::has_unary_minus
Conditionally returns bool type false if type T doesn't have a unary - operator.
Parity
Parity enum with values EVEN, ODD, ALL; refers to parity of the site. Parity of site (x,...
constexpr Parity EVEN
bit pattern: 001
constexpr Parity ODD
bit pattern: 010
Direction
Enumerator for direction that assigns an integer to a direction, to be interpreted as a unit vector.
#define GPU_GLOBAL_ARG_MAX_SIZE
GPU_SYNCHRONIZE_TIMERS: if set and != 0, synchronize GPU on timer calls, in order to obtain meaningful...
#define N_threads
General number of threads in a thread block.
comm_node_struct
Information necessary to communicate with a node.
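A minimal usage sketch of the gather/place pair documented above (assumed calling code,
not taken from the listing; s is a field_storage<T> and lattice the node's Lattice handle):

    // Pull selected sites to the host, update them there, and push them back.
    std::vector<unsigned> index_list = {0, 5, 17};      // example site indices on this node
    int n = static_cast<int>(index_list.size());
    std::vector<T> buf(n);

    s.gather_elements(buf.data(), index_list.data(), n, lattice);   // device -> host
    for (auto &v : buf)
        v = -v;                                                     // any host-side update
    s.place_elements(buf.data(), index_list.data(), n, lattice);    // host -> device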