// GPU (CUDA/HIP) backend implementation of field_storage

#include "../field_storage.h"
// field_storage<T>::allocate_field() -- allocate the field buffer on the device
gpuMalloc(&fieldbuf, sizeof(T) * lattice.field_alloc_size());
if (fieldbuf == nullptr) {
    std::cout << "Failure in field memory allocation\n";
}
assert(fieldbuf != nullptr);

// field_storage<T>::free_field() -- release the buffer if it has been allocated
if (fieldbuf != nullptr) {
    gpuFree(fieldbuf);
}
fieldbuf = nullptr;

// field_storage<T>::get() -- read the element at site index i from the strided device buffer
template <typename T>
__device__ __host__ inline auto field_storage<T>::get(const unsigned i,
                                                      const unsigned field_alloc_size) const {
    using base_t = hila::arithmetic_type<T>;
    constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
    // reinterpret T as an array of its arithmetic base type
    union {
        T value;
        base_t arr[n_elements];
    } u;
    const base_t *fp = (base_t *)(fieldbuf);
    for (unsigned e = 0; e < n_elements; e++) {
        u.arr[e] = fp[e * field_alloc_size + i];
    }
    return u.value;
}

// field_storage<T>::set() -- write the element at site index i into the strided device buffer
template <typename T>
__device__ __host__ inline void field_storage<T>::set(const T &value, const unsigned i,
                                                      const unsigned field_alloc_size) {
    using base_t = hila::arithmetic_type<T>;
    constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
    const base_t *value_f = (base_t *)&value;
    base_t *fp = (base_t *)(fieldbuf);
    for (unsigned e = 0; e < n_elements; e++) {
        fp[e * field_alloc_size + i] = value_f[e];
    }
}

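// A minimal, self-contained host-side sketch (an illustration, not part of this header) of the
// element-major layout that get() and set() assume: base-type component e of site i sits at
// fp[e * field_alloc_size + i], so the same component of consecutive sites is contiguous and
// accesses by consecutive GPU threads coalesce. The 'Cplx' type, buffer size and site index
// below are assumptions for the example only.
#if 0 // standalone illustration, not compiled as part of the header
#include <cassert>
#include <vector>

struct Cplx {
    double re, im;
};

int main() {
    const unsigned field_alloc_size = 8;                           // allocated sites (assumed)
    constexpr unsigned n_elements = sizeof(Cplx) / sizeof(double); // = 2 components
    std::vector<double> buf(n_elements * field_alloc_size, 0.0);

    unsigned i = 3;    // site index
    Cplx v{1.5, -2.0}; // value to store

    // "set": scatter the components of v with stride field_alloc_size
    const double *vp = (const double *)&v;
    for (unsigned e = 0; e < n_elements; e++)
        buf[e * field_alloc_size + i] = vp[e];

    // "get": gather the components back into w
    Cplx w;
    double *wp = (double *)&w;
    for (unsigned e = 0; e < n_elements; e++)
        wp[e] = buf[e * field_alloc_size + i];

    assert(w.re == 1.5 && w.im == -2.0);
    return 0;
}
#endif
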
// Kernel: read a single element into a device buffer (1-thread launch; setup use only)
template <typename T>
__global__ void get_element_kernel(field_storage<T> field, char *buffer, unsigned i,
                                   const unsigned field_alloc_size) {
    *((T *)buffer) = field.get(i, field_alloc_size);
}

// field_storage<T>::get_element() -- fetch a single element to the host (slow; setup use only)
template <typename T>
auto field_storage<T>::get_element(const unsigned i, const lattice_struct &lattice) const {
    char *d_buffer;
    T value;
    gpuMalloc(&(d_buffer), sizeof(T));
    get_element_kernel<<<1, 1>>>(*this, d_buffer, i, lattice.field_alloc_size());
    gpuMemcpy((char *)(&value), d_buffer, sizeof(T), gpuMemcpyDeviceToHost);
    gpuFree(d_buffer);
    return value;
}

// Kernel: set a single element, value passed as a kernel argument
template <typename T>
__global__ void set_element_kernel(field_storage<T> field, T value, unsigned i,
                                   const unsigned field_alloc_size) {
    field.set(value, i, field_alloc_size);
}

// Kernel: set a single element, value read from a device buffer (for types too large to be
// passed as kernel arguments)
template <typename T>
__global__ void set_element_kernel_ptr(field_storage<T> field, const T *buf, unsigned i,
                                       const unsigned field_alloc_size) {
    field.set(*buf, i, field_alloc_size);
}

// field_storage<T>::set_element() -- set a single element from the host (slow; setup use only).
// 't_value' is the value to be written, converted to T. Small types are passed to the kernel
// by value, larger ones through a temporary device buffer, since kernel arguments have a
// limited size (see GPU_GLOBAL_ARG_MAX_SIZE).
if constexpr (sizeof(T) <= GPU_GLOBAL_ARG_MAX_SIZE) {
    set_element_kernel<<<1, 1>>>(*this, t_value, i, lattice.field_alloc_size());
} else {
    T *d_buffer;
    gpuMalloc(&(d_buffer), sizeof(T));
    gpuMemcpy(d_buffer, (char *)&t_value, sizeof(T), gpuMemcpyHostToDevice);
    set_element_kernel_ptr<<<1, 1>>>(*this, d_buffer, i, lattice.field_alloc_size());
    gpuFree(d_buffer);
}

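// Hypothetical usage sketch (the storage object, element type and site index are assumptions,
// not taken from this header): each of these accessors launches a one-thread kernel, so they
// are meant for setup and debugging only, never for loops over the lattice.
//
//   field_storage<double> s;
//   s.allocate_field(lattice);
//   s.set_element(3.14, 42, lattice);       // write one element at site index 42
//   double x = s.get_element(42, lattice);  // read it back on the host
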
// Kernel: gather the elements at the listed sites into a contiguous device buffer
template <typename T>
__global__ void gather_elements_kernel(field_storage<T> field, T *buffer, unsigned *site_index,
                                       const int n, const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        buffer[Index] = field.get(site_index[Index], field_alloc_size);
    }
}

// field_storage<T>::gather_elements() -- gather the elements at the sites given in index_list
// into a host buffer (GPU implementation without CUDA-aware MPI)
template <typename T>
void field_storage<T>::gather_elements(T *RESTRICT buffer, const unsigned *RESTRICT index_list,
                                       int n, const lattice_struct &lattice) const {
    unsigned *d_site_index;
    T *d_buffer;
    int N_blocks = n / N_threads + 1; // enough blocks to cover all n sites

    // copy the list of site indices to the device
    gpuMalloc(&(d_site_index), n * sizeof(unsigned));
    gpuMemcpy(d_site_index, index_list, n * sizeof(unsigned), gpuMemcpyHostToDevice);

    // gather on the device into a temporary buffer
    gpuMalloc(&(d_buffer), n * sizeof(T));
    gather_elements_kernel<<<N_blocks, N_threads>>>(*this, d_buffer, d_site_index, n,
                                                    lattice.field_alloc_size());

    // copy the result to the host and release the temporaries
    gpuMemcpy((char *)buffer, d_buffer, n * sizeof(T), gpuMemcpyDeviceToHost);
    gpuFree(d_buffer);
    gpuFree(d_site_index);
}

#ifdef SPECIAL_BOUNDARY_CONDITIONS
// Kernel: as above, but gather the negated elements (for antiperiodic boundaries).
// Enabled only for types that define a unary minus.
template <typename T, std::enable_if_t<hila::has_unary_minus<T>::value, int> = 0>
__global__ void gather_elements_negated_kernel(field_storage<T> field, T *buffer,
                                               unsigned *site_index, const int n,
                                               const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        buffer[Index] = -field.get(site_index[Index], field_alloc_size);
    }
}

// field_storage<T>::gather_elements_negated() -- as gather_elements(), but with the sign of
// each element flipped; used for antiperiodic boundary conditions
template <typename T>
void field_storage<T>::gather_elements_negated(T *RESTRICT buffer,
                                               const unsigned *RESTRICT index_list, int n,
                                               const lattice_struct &lattice) const {
    unsigned *d_site_index;
    T *d_buffer;
    int N_blocks = n / N_threads + 1;

    if constexpr (!hila::has_unary_minus<T>::value) {
        assert(sizeof(T) < 0 && "Unary 'operator- ()' must be defined for Field variable "
                                "for antiperiodic b.c.");
    } else {
        // copy the list of site indices to the device
        gpuMalloc(&(d_site_index), n * sizeof(unsigned));
        gpuMemcpy(d_site_index, index_list, n * sizeof(unsigned), gpuMemcpyHostToDevice);

        // gather the negated elements on the device, then copy them to the host buffer
        gpuMalloc(&(d_buffer), n * sizeof(T));
        gather_elements_negated_kernel<<<N_blocks, N_threads>>>(*this, d_buffer, d_site_index, n,
                                                                lattice.field_alloc_size());
        gpuMemcpy(buffer, d_buffer, n * sizeof(T), gpuMemcpyDeviceToHost);
        gpuFree(d_buffer);
        gpuFree(d_site_index);
    }
}

#endif // SPECIAL_BOUNDARY_CONDITIONS

// Kernel: gather boundary elements for communication. The send buffer is packed
// component-blocked: base-type component e of gathered site Index goes to fp[Index + n * e],
// so neighbouring threads write neighbouring addresses (coalesced stores).
template <typename T>
__global__ void gather_comm_elements_kernel(field_storage<T> field, T *buffer,
                                            unsigned *site_index, const int n,
                                            const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        using base_t = hila::arithmetic_type<T>;
        constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
        T element = field.get(site_index[Index], field_alloc_size);
        base_t *ep = (base_t *)&element;
        base_t *fp = (base_t *)(buffer);
        for (unsigned e = 0; e < n_elements; e++) {
            fp[Index + n * e] = ep[e];
        }
    }
}

// Kernel: as above, but with the sign of each element flipped (antiperiodic boundaries)
template <typename T, std::enable_if_t<hila::has_unary_minus<T>::value, int> = 0>
__global__ void gather_comm_elements_negated_kernel(field_storage<T> field, T *buffer,
                                                    unsigned *site_index, const int n,
                                                    const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        using base_t = hila::arithmetic_type<T>;
        constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
        T element = -field.get(site_index[Index], field_alloc_size);
        base_t *ep = (base_t *)&element;
        base_t *fp = (base_t *)(buffer);
        for (unsigned e = 0; e < n_elements; e++) {
            fp[Index + n * e] = ep[e];
        }
    }
}

// Cache of device-side copies of the communication site index lists. Each list is copied to
// the device only once and reused for all later communications with the same node.
struct cuda_comm_node_struct {
    const unsigned *cpu_index;
    unsigned *gpu_index;
    int n;
};

inline unsigned *get_site_index(const lattice_struct::comm_node_struct &to_node, Parity par,
                                int &n) {
    static std::vector<struct cuda_comm_node_struct> comm_nodes;

    const unsigned *cpu_index = to_node.get_sitelist(par, n);
    // return the cached device copy if this list has already been transferred
    for (struct cuda_comm_node_struct comm_node : comm_nodes) {
        if (cpu_index == comm_node.cpu_index && n == comm_node.n) {
            return comm_node.gpu_index;
        }
    }
    // not found: copy the index list to the device and cache it
    struct cuda_comm_node_struct comm_node;
    comm_node.cpu_index = cpu_index;
    comm_node.n = n;
    gpuMalloc(&(comm_node.gpu_index), n * sizeof(unsigned));
    gpuMemcpy(comm_node.gpu_index, cpu_index, n * sizeof(unsigned), gpuMemcpyHostToDevice);
    comm_nodes.push_back(comm_node);
    return comm_node.gpu_index;
}

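// Hypothetical illustration of the caching above (variable names are placeholders): repeated
// communications with the same node and parity reuse the device copy of the site index list,
// so only the first call performs a host-to-device transfer.
//
//   int n1, n2;
//   unsigned *d1 = get_site_index(to_node, EVEN, n1); // first call: gpuMalloc + gpuMemcpy
//   unsigned *d2 = get_site_index(to_node, EVEN, n2); // cache hit: d2 == d1, no transfer
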
// field_storage<T>::gather_comm_elements() -- gather the boundary elements to be sent to a
// neighbour node into the MPI send buffer, negating them if the boundary is antiperiodic
template <typename T>
void field_storage<T>::gather_comm_elements(Direction d, Parity par, T *RESTRICT buffer,
                                            const lattice_struct::comm_node_struct &to_node,
                                            const lattice_struct &lattice,
                                            bool antiperiodic) const {
    int n;
    unsigned *d_site_index = get_site_index(to_node, par, n);
    T *d_buffer;
    int N_blocks = n / N_threads + 1;

    // gather on the device into a temporary buffer
    gpuMalloc(&(d_buffer), n * sizeof(T));
    if (antiperiodic) {
        // negated gather, only instantiated for types with a unary minus
        if constexpr (hila::has_unary_minus<T>::value)
            gather_comm_elements_negated_kernel<<<N_blocks, N_threads>>>(
                *this, d_buffer, d_site_index, n, lattice.field_alloc_size());
    } else {
        gather_comm_elements_kernel<<<N_blocks, N_threads>>>(*this, d_buffer, d_site_index, n,
                                                             lattice.field_alloc_size());
    }

    // copy the gathered elements to the host-side send buffer
    gpuMemcpy((char *)buffer, d_buffer, n * sizeof(T), gpuMemcpyDeviceToHost);
    gpuFree(d_buffer);
}

// Kernel: place buffer elements at the listed sites of the field
template <typename T>
__global__ void place_elements_kernel(field_storage<T> field, T *buffer, unsigned *site_index,
                                      const int n, const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        field.set(buffer[Index], site_index[Index], field_alloc_size);
    }
}

// field_storage<T>::place_elements() -- place the elements of a host buffer at the sites given
// in index_list (GPU implementation without CUDA-aware MPI)
template <typename T>
void field_storage<T>::place_elements(T *RESTRICT buffer, const unsigned *RESTRICT index_list,
                                      int n, const lattice_struct &lattice) {
    unsigned *d_site_index;
    T *d_buffer;
    int N_blocks = n / N_threads + 1;

    // copy the elements and the site index list to the device
    gpuMalloc(&(d_buffer), n * sizeof(T));
    gpuMemcpy(d_buffer, buffer, n * sizeof(T), gpuMemcpyHostToDevice);
    gpuMalloc(&(d_site_index), n * sizeof(unsigned));
    gpuMemcpy(d_site_index, index_list, n * sizeof(unsigned), gpuMemcpyHostToDevice);

    // scatter into the field and release the temporaries
    place_elements_kernel<<<N_blocks, N_threads>>>(*this, d_buffer, d_site_index, n,
                                                   lattice.field_alloc_size());
    gpuFree(d_buffer);
    gpuFree(d_site_index);
}

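// Hypothetical round-trip sketch (buffer and object names are illustrative): gather_elements()
// and place_elements() move an arbitrary list of sites between the device field and a host
// buffer, e.g. for I/O or for MPI communication without CUDA-aware MPI.
//
//   std::vector<T> tmp(n);
//   s.gather_elements(tmp.data(), index_list, n, lattice); // device field -> host buffer
//   // ... use or modify tmp on the host ...
//   s.place_elements(tmp.data(), index_list, n, lattice);  // host buffer -> device field
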
// Kernel: copy the listed boundary sites into the special-boundary halo region starting at
// 'offset', flipping the sign (antiperiodic b.c.). Enabled only for types with a unary minus.
template <typename T, std::enable_if_t<hila::has_unary_minus<T>::value, int> = 0>
__global__ void set_local_boundary_elements_kernel(field_storage<T> field, unsigned offset,
                                                   unsigned *site_index, const int n,
                                                   const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        T value;
        value = -field.get(site_index[Index], field_alloc_size);
        field.set(value, offset + Index, field_alloc_size);
    }
}

// field_storage<T>::set_local_boundary_elements() -- for antiperiodic boundaries, copy the
// boundary sites into the special-boundary halo with the sign flipped
template <typename T>
void field_storage<T>::set_local_boundary_elements(Direction dir, Parity par,
                                                   const lattice_struct &lattice,
                                                   bool antiperiodic) {
#ifdef SPECIAL_BOUNDARY_CONDITIONS
    if (antiperiodic) {
        // select the range of boundary sites of the requested parity
        unsigned n, start = 0;
        if (par == ODD) {
            n = lattice.special_boundaries[dir].n_odd;
            start = lattice.special_boundaries[dir].n_even;
        } else if (par == EVEN) {
            n = lattice.special_boundaries[dir].n_even;
        } else {
            n = lattice.special_boundaries[dir].n_total;
        }
        unsigned offset = lattice.special_boundaries[dir].offset + start;

        if constexpr (hila::has_unary_minus<T>::value) {
            unsigned *d_site_index;
            check_device_error("earlier");
            gpuMalloc(&d_site_index, n * sizeof(unsigned));
            gpuMemcpy(d_site_index, lattice.special_boundaries[dir].move_index + start,
                      n * sizeof(unsigned), gpuMemcpyHostToDevice);

            unsigned N_blocks = n / N_threads + 1;
            set_local_boundary_elements_kernel<<<N_blocks, N_threads>>>(
                *this, offset, d_site_index, n, lattice.field_alloc_size());
            gpuFree(d_site_index);
        } else {
            assert(sizeof(T) < 0 &&
                   "Antiperiodic b.c. cannot be used with unsigned field elements");
        }
    }
#else
    assert(!antiperiodic && "antiperiodic only with SPECIAL_BOUNDARY_CONDITIONS defined");
#endif
}

// Kernel: unpack the received communication buffer (component-blocked layout, see the gather
// kernels above) and place the elements into the halo region starting at 'offset'
template <typename T>
__global__ void place_comm_elements_kernel(field_storage<T> field, T *buffer, unsigned offset,
                                           const int n, const unsigned field_alloc_size) {
    unsigned Index = threadIdx.x + blockIdx.x * blockDim.x;
    if (Index < n) {
        using base_t = hila::arithmetic_type<T>;
        constexpr unsigned n_elements = sizeof(T) / sizeof(base_t);
        T element;
        base_t *ep = (base_t *)&element;
        base_t *fp = (base_t *)(buffer);
        for (unsigned e = 0; e < n_elements; e++) {
            ep[e] = fp[Index + n * e];
        }
        field.set(element, offset + Index, field_alloc_size);
    }
}

// field_storage<T>::place_comm_elements() -- place the elements received from a neighbour
// node into the halo region of the field (path without CUDA-aware MPI)
template <typename T>
void field_storage<T>::place_comm_elements(Direction d, Parity par, T *RESTRICT buffer,
                                           const lattice_struct::comm_node_struct &from_node,
                                           const lattice_struct &lattice) {
    unsigned n = from_node.n_sites(par);
    T *d_buffer;
    int N_blocks = n / N_threads + 1;
    gpuMalloc(&(d_buffer), n * sizeof(T));
    gpuMemcpy(d_buffer, buffer, n * sizeof(T), gpuMemcpyHostToDevice);
    place_comm_elements_kernel<<<N_blocks, N_threads>>>((*this), d_buffer, from_node.offset(par), n,
                                                        lattice.field_alloc_size());
    gpuFree(d_buffer);
}

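// Hypothetical sketch of how the comm routines fit into a halo update (this is not the
// library's actual communication driver; the MPI calls, ranks and buffer names are
// illustrative only):
//
//   // sending side
//   s.gather_comm_elements(d, par, send_buf, to_node, lattice, antiperiodic);
//   MPI_Send(send_buf, n * sizeof(T), MPI_BYTE, to_node.rank, tag, MPI_COMM_WORLD);
//
//   // receiving side
//   MPI_Recv(recv_buf, n * sizeof(T), MPI_BYTE, from_node.rank, tag, MPI_COMM_WORLD,
//            MPI_STATUS_IGNORE);
//   s.place_comm_elements(d, par, recv_buf, from_node, lattice);
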
// Allocation of the MPI communication buffer: a device-side temporary is obtained with
// gpuMalloc(), while the host-side buffer uses an ordinary allocation.
gpuMalloc(&(d_buffer), n * sizeof(T));
// ...
return (T *)memalloc(n * sizeof(T));