#ifdef SPECIAL_BOUNDARY_CONDITIONS
    if (boundary_condition[d] == hila::bc::ANTIPERIODIC && lattice.mynode.is_on_edge(-d)) {
        payload.gather_comm_elements(buffer, to_node, par, lattice, true);
        payload.gather_comm_elements(buffer, to_node, par, lattice, false);
    payload.gather_comm_elements(buffer, to_node, par, lattice, false);
    bool antiperiodic = false;
#ifdef SPECIAL_BOUNDARY_CONDITIONS
    if (boundary_condition[d] == hila::bc::ANTIPERIODIC && lattice.mynode.is_on_edge(-d)) {
        if (vector_lattice->is_boundary_permutation[abs(d)]) {
            const unsigned *index_list = to_node.get_sitelist(par, n);
            payload.gather_elements(buffer, index_list, n, lattice);
            payload.gather_elements_negated(buffer, index_list, n, lattice);
        payload.gather_comm_vectors(buffer, to_node, par, vector_lattice, antiperiodic);
        const unsigned *index_list = to_node.get_sitelist(par, n);
        payload.gather_elements(buffer, index_list, n, lattice);
        payload.gather_elements_negated(buffer, index_list, n, lattice);
    if (vector_lattice->is_boundary_permutation[abs(d)]) {
        payload.place_recv_elements(buffer, d, par, vector_lattice);
    payload.place_comm_elements(d, par, buffer, from_node, lattice);
#ifdef SPECIAL_BOUNDARY_CONDITIONS
        (boundary_condition[dir] == hila::bc::ANTIPERIODIC && lattice.mynode.is_on_edge(dir));
    bool antiperiodic = false;
    payload.set_local_boundary_elements(dir, par, lattice, antiperiodic);
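The branches above negate the elements that cross the lattice edge when the boundary condition is antiperiodic and SPECIAL_BOUNDARY_CONDITIONS is enabled. A minimal sketch of how a field gets such a boundary in user code, assuming the usual HILA-style set_boundary_condition() call; the function and field names here are illustrative, not from this file:

#include "hila.h"

// Sketch only: give a field an antiperiodic boundary before any gathers,
// assuming the set_boundary_condition() API (illustrative, not from this file).
void setup_antiperiodic(Field<double> &psi) {
    psi.set_boundary_condition(e_t, hila::bc::ANTIPERIODIC); // antiperiodic across the t edge
    // Gathers crossing that edge then use gather_elements_negated(), as above.
}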
        return (T *)payload.get_buffer() + from_node.offset(par);
#elif defined(CUDA) || defined(HIP)
            offs = from_node.sites / 2;
        if (receive_buffer[d] == nullptr) {
            receive_buffer[d] = payload.allocate_mpi_buffer(from_node.sites);
        return receive_buffer[d] + offs;
#elif defined(VECTORIZED)
            return (T *)payload.get_buffer() + from_node.offset(par);
                offs = from_node.sites / 2;
            if (vector_lattice->is_boundary_permutation[abs(d)]) {
                if (receive_buffer[d] == nullptr) {
                    receive_buffer[d] = payload.allocate_mpi_buffer(from_node.sites);
                return receive_buffer[d] + offs;
                return ((T *)payload.get_buffer() +
                        (vector_lattice->halo_offset[d] * vector_size + offs));
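The offs = from_node.sites / 2 lines above reflect the buffer layout used for parity gathers: even-parity boundary sites appear to be stored first, so an odd-parity gather starts halfway through the buffer. A small standalone illustration of that arithmetic in plain C++ (names hypothetical):

#include <cassert>
#include <cstddef>

// Illustration only (not part of this file): mirrors the offset choice above,
// assuming the buffer stores even-parity boundary sites first, then odd ones.
std::size_t parity_offset(std::size_t total_sites, bool odd_parity) {
    assert(total_sites % 2 == 0); // equal even/odd halves assumed for the illustration
    return odd_parity ? total_sites / 2 : 0;
}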
#if defined(NAIVE_SHIFT)
    par_s = opp_parity(par);
    bool found_dir = false;
        if (rem[d] > 0 && gather_status(par_s, d) != gather_status_t::NOT_DONE) {
        } else if (rem[d] < 0 && gather_status(par_s, -d) != gather_status_t::NOT_DONE) {
        } else if (rem[d] < 0) {
        res[par_s] = (*this)[X + mdir];
        (*from)[par_s] = (*this)[X + mdir];
        par_s = opp_parity(par_s);
        mdir = (rem[d] > 0) ? d : -d;
        while (rem[d] != 0) {
            (*to)[par_s] = (*from)[X + mdir];
            par_s = opp_parity(par_s);
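When NAIVE_SHIFT is defined, the loop above builds the shift as a chain of single-step nearest-neighbour moves, flipping parity after each step. From user code the whole operation is one call; a minimal usage sketch based on the shift() declaration listed at the end of this page (assuming NDIM == 4; field and vector values are illustrative):

#include "hila.h"

// Sketch only: periodically shifted copy of a field via Field<T>::shift().
void shift_example(const Field<double> &f) {
    Field<double> shifted;
    CoordinateVector v = {2, 0, -1, 0}; // +2 sites in x, -1 in z
    f.shift(v, shifted, ALL);           // shifted becomes the moved copy of f
}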
    int tag = get_next_msg_tag();
    if (is_gathered(d, p)) {
        lattice.n_gather_avoided++;
        fs->set_local_boundary_elements(d, p);
    if (!gather_not_done(d, p) || !gather_not_done(d, ALL)) {
        lattice.n_gather_avoided++;
        return get_dir_mask(d);
        if (!gather_not_done(d, EVEN) && !gather_not_done(d, ODD)) {
            lattice.n_gather_avoided++;
            return get_dir_mask(d);
        if (!gather_not_done(d, EVEN))
        else if (!gather_not_done(d, ODD))
    mark_gather_started(d, par);
    int par_i = static_cast<int>(par) - 1;
    constexpr size_t size = sizeof(T);
    MPI_Datatype mpi_type = get_MPI_number_type<T>(size_type);
    if (from_node.rank != hila::myrank() && boundary_need_to_communicate(d)) {
        receive_buffer = fs->get_receive_buffer(d, par, from_node);
        size_t n = from_node.n_sites(par) * size;
        if (n >= (1ULL << 31)) {
            hila::out << "Too large MPI message! Size " << n << '\n';
        post_receive_timer.start();
        MPI_Irecv(receive_buffer, (int)n, MPI_BYTE, from_node.rank, tag, lattice.mpi_comm_lat,
                  &fs->receive_request[par_i][d]);
        post_receive_timer.stop();
    if (to_node.rank != hila::myrank() && boundary_need_to_communicate(-d)) {
        unsigned sites = to_node.n_sites(par);
        if (fs->send_buffer[d] == nullptr)
            fs->send_buffer[d] = fs->payload.allocate_mpi_buffer(to_node.sites);
        send_buffer = fs->send_buffer[d] + to_node.offset(par);
#ifndef MPI_BENCHMARK_TEST
        fs->gather_comm_elements(d, par, send_buffer, to_node);
        size_t n = sites * size;
        gpuStreamSynchronize(0);
        start_send_timer.start();
        MPI_Isend(send_buffer, (int)n, MPI_BYTE, to_node.rank, tag, lattice.mpi_comm_lat,
                  &fs->send_request[par_i][d]);
        start_send_timer.stop();
#ifndef MPI_BENCHMARK_TEST
    fs->set_local_boundary_elements(d, par);
    return get_dir_mask(d);
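start_gather() above only posts the non-blocking MPI receives and sends and marks the gather as started; the matching wait happens when the halo is actually needed. A hedged sketch of the usual overlap pattern, using the start_gather() declaration listed at the end of this page (the update inside onsites is only an illustration):

#include "hila.h"

// Sketch only: overlap the +x halo exchange with local work (names illustrative).
void overlap_example(const Field<double> &src, Field<double> &dest) {
    src.start_gather(e_x, ALL);   // posts the non-blocking receives/sends shown above
    // ... purely local computation could run here while MPI progresses ...
    onsites(ALL) {
        dest[X] = src[X + e_x];   // the first neighbour access waits for the gather
    }
}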
    if (is_gathered(d, p))
    assert(!(p != ALL && is_gather_started(d, p) && is_gather_started(d, ALL)));
    if (is_gather_started(d, p))
    if (is_gather_started(d, ALL))
        if (is_gathered(d, EVEN) && is_gather_started(d, ODD))
        else if (is_gathered(d, ODD) && is_gather_started(d, EVEN))
        else if (is_gather_started(d, EVEN) && is_gather_started(d, ODD)) {
    for (int wait_i = 0; wait_i < n_wait; ++wait_i) {
        int par_i = (int)par - 1;
        if (from_node.rank != hila::myrank() && boundary_need_to_communicate(d)) {
            wait_receive_timer.start();
            MPI_Wait(&fs->receive_request[par_i][d], &status);
            wait_receive_timer.stop();
#if !defined(VANILLA) && !defined(MPI_BENCHMARK_TEST)
            fs->place_comm_elements(d, par, fs->get_receive_buffer(d, par, from_node), from_node);
        if (to_node.rank != hila::myrank() && boundary_need_to_communicate(-d)) {
            wait_send_timer.start();
            MPI_Wait(&fs->send_request[par_i][d], &status);
            wait_send_timer.stop();
        mark_gathered(d, par);
        lattice.n_gather_done += 1;
        par = opp_parity(par);
                                     const std::vector<CoordinateVector> &coord_list,
    std::vector<unsigned> index_list;
    std::vector<int> sites_on_rank(lattice.n_nodes());
    std::vector<int> reshuffle_list(coord_list.size());
    std::fill(sites_on_rank.begin(), sites_on_rank.end(), 0);
        int rank = lattice.node_rank(c);
            if (sites_on_rank[rank] == 0 && rank != root)
            sites_on_rank[rank]++;
            reshuffle_list[i++] = rank;
        std::vector<T> send_buffer(index_list.size());
        payload.gather_elements((T *)send_buffer.data(), index_list.data(), send_buffer.size(),
        MPI_Send((char *)send_buffer.data(), sites_on_rank[hila::myrank()] * sizeof(T), MPI_BYTE,
        std::vector<T> pb(coord_list.size() - sites_on_rank[root]);
        std::vector<T *> nptr(lattice.n_nodes());
        std::vector<MPI_Request> mpi_req(nranks);
        for (int n = 0; n < sites_on_rank.size(); n++) {
            if (sites_on_rank[n] > 0) {
                MPI_Irecv(b, (int)(sites_on_rank[n] * sizeof(T)), MPI_BYTE, n, n,
                          lattice.mpi_comm_lat, &mpi_req[nreqs++]);
                b += sites_on_rank[n];
                nptr[n] = send_buffer.data();
        std::vector<MPI_Status> stat_arr(nreqs);
        MPI_Waitall(nreqs, mpi_req.data(), stat_arr.data());
        for (int i = 0; i < coord_list.size(); i++) {
            buffer[i] = *nptr[reshuffle_list[i]];
            nptr[reshuffle_list[i]]++;
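The reshuffle_list bookkeeping above restores the caller's original coordinate order after the per-rank blocks arrive: nptr[r] walks through rank r's contiguous block, and reshuffle_list[i] records which rank owns element i. A small standalone illustration of the same idea in plain C++ (the data is invented for the example):

#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    // Invented data: two ranks; the caller asked for sites owned by ranks 1, 0, 1.
    std::vector<int> block_rank0 = {10};         // contiguous block received from rank 0
    std::vector<int> block_rank1 = {20, 21};     // contiguous block received from rank 1
    std::vector<int> reshuffle_list = {1, 0, 1}; // owning rank of each requested site

    std::vector<int *> nptr = {block_rank0.data(), block_rank1.data()};
    std::vector<int> buffer(reshuffle_list.size());
    for (std::size_t i = 0; i < reshuffle_list.size(); i++) {
        buffer[i] = *nptr[reshuffle_list[i]]; // next unused element of that rank's block
        nptr[reshuffle_list[i]]++;
    }
    std::printf("%d %d %d\n", buffer[0], buffer[1], buffer[2]); // prints: 20 10 21
}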
                                   const std::vector<CoordinateVector> &coord_list,
    std::vector<unsigned> index_list;
    std::vector<int> sites_on_rank(lattice.n_nodes());
    std::vector<int> reshuffle_list(coord_list.size());
    std::fill(sites_on_rank.begin(), sites_on_rank.end(), 0);
        int rank = lattice.node_rank(c);
            if (sites_on_rank[rank] == 0 && rank != root)
            sites_on_rank[rank]++;
            reshuffle_list[i++] = rank;
        std::vector<T> recv_buffer(index_list.size());
        MPI_Recv((char *)recv_buffer.data(), sites_on_rank[hila::myrank()] * sizeof(T), MPI_BYTE,
        payload.place_elements((T *)recv_buffer.data(), index_list.data(), recv_buffer.size(),
        std::vector<T> pb(coord_list.size());
        std::vector<unsigned> nloc(lattice.n_nodes());
        std::vector<unsigned> ncount(lattice.n_nodes());
        nloc[0] = ncount[0] = 0;
        for (int n = 1; n < lattice.n_nodes(); n++) {
            nloc[n] = nloc[n - 1] + sites_on_rank[n - 1];
        for (int i = 0; i < coord_list.size(); i++) {
            int node = reshuffle_list[i];
            pb[nloc[node] + ncount[node]] = buffer[i];
        std::vector<MPI_Request> mpi_req(nranks);
        for (int n = 0; n < sites_on_rank.size(); n++) {
            if (sites_on_rank[n] > 0) {
                MPI_Isend(pb.data() + nloc[n], (int)(sites_on_rank[n] * sizeof(T)), MPI_BYTE, n,
                          n, lattice.mpi_comm_lat, &mpi_req[nreqs++]);
        payload.place_elements(pb.data() + nloc[root], index_list.data(), index_list.size(),
        std::vector<MPI_Status> stat_arr(nreqs);
        MPI_Waitall(nreqs, mpi_req.data(), stat_arr.data());
                            const std::vector<CoordinateVector> &coord_list) {
    assert(elements.size() == coord_list.size() && "vector size mismatch in set_elements");
    std::vector<unsigned> my_indexes;
    std::vector<T> my_elements;
    for (int i = 0; i < coord_list.size(); i++) {
            my_elements.push_back(elements[i]);
    fs->payload.place_elements(my_elements.data(), my_indexes.data(), my_indexes.size(), lattice);
    res.resize(coord_list.size());
    fs->gather_elements(res.data(), coord_list);
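set_elements() and the get_elements() wrapper above give random access to scattered sites across ranks. A short usage sketch based on the declarations listed at the end of this page; the coordinates are invented, NDIM == 4 is assumed, and the argument order follows the assert above:

#include "hila.h"

// Sketch only: write two values into the field and read them back on all ranks.
void element_access_example(Field<double> &f) {
    CoordinateVector c1 = {0, 0, 0, 0};
    CoordinateVector c2 = {1, 2, 3, 0};
    std::vector<CoordinateVector> sites = {c1, c2};
    std::vector<double> values = {1.5, -2.0};

    f.set_elements(values, sites);           // place the values on their owning ranks
    auto back = f.get_elements(sites, true); // gather to root and broadcast to all ranks
}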
        vol *= cmax[d] - cmin[d] + 1;
        assert(cmax[d] >= cmin[d] && cmin[d] >= 0 && cmax[d] < lattice.size(d));
    std::vector<CoordinateVector> clist(vol);
    forcoordinaterange(c, cmin, cmax) {
    return get_elements(clist, bcast);
        cmax[d] = lattice.size(d) - 1;
            cmin[d] = cmax[d] = c[d];
    return get_subvolume(cmin, cmax, bcast);
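get_slice() above is a thin wrapper over get_subvolume(); the fragments suggest that a negative coordinate keeps the full extent of that direction while non-negative ones pin it. A usage sketch based on the declarations listed at the end of this page (assuming NDIM == 4; values illustrative):

#include "hila.h"

// Sketch only: extract a plane and a sub-box of the field to std::vectors.
void slice_example(const Field<double> &f) {
    CoordinateVector c = {-1, -1, 3, 0};              // full xy-plane at z = 3, t = 0
    std::vector<double> plane = f.get_slice(c, true); // broadcast to every rank

    CoordinateVector cmin = {0, 0, 0, 0}, cmax = {7, 7, 0, 0};
    std::vector<double> box = f.get_subvolume(cmin, cmax, true);
}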
    buffer.resize(lattice.mynode.volume());
#if defined(CUDA) || defined(HIP)
    T *data = (T *)d_malloc(sizeof(T) * lattice.mynode.volume());
    T *data = buffer.data();
#pragma hila novector direct_access(data)
        nodec = X.coordinates() - nmin;
        unsigned i = nodec.dot(nmul);
        data[i] = (*this)[X];
#if defined(CUDA) || defined(HIP)
    gpuMemcpy(buffer.data(), data, sizeof(T) * lattice.mynode.volume(), gpuMemcpyDeviceToHost);
    assert(buffer.size() >= lattice.mynode.volume());
#if defined(CUDA) || defined(HIP)
    T *data = (T *)d_malloc(sizeof(T) * lattice.mynode.volume());
    gpuMemcpy(data, buffer.data(), sizeof(T) * lattice.mynode.volume(), gpuMemcpyHostToDevice);
    T *data = buffer.data();
#pragma hila novector direct_access(data)
        nodec = X.coordinates() - nmin;
        unsigned i = nodec.dot(nmul);
        (*this)[X] = data[i];
#if defined(CUDA) || defined(HIP)
    this->mark_changed(ALL);
    for (int i = 1; i < NDIM; i++)
        nmul.e(i) = nmul.e(i - 1) * (lattice.mynode.size[i - 1] + 2);
        node_min[d] = lattice.mynode.min[d];
        node_max[d] = lattice.mynode.min[d] + lattice.mynode.size[d] - 1;
#pragma hila novector direct_access(data)
        for (int i = 0; i < ndir; i++) {
            if (c.e(d) == node_min[d]) {
            } else if (c.e(d) == node_max[d]) {
            if (gotit && c.e(d) == node_min[d]) {
                data[nodec.dot(nmul)] = src[X - d];
                dest[X] = src[X - d];
            } else if (gotit && c.e(d) == node_max[d]) {
                data[nodec.dot(nmul)] = src[X + d];
                dest[X] = src[X + d];
    for (int i = 1; i < NDIM; i++)
        nmul.e(i) = nmul.e(i - 1) * (lattice.mynode.size[i - 1] + 2);
    foralldir(d) siz *= (lattice.mynode.size[d] + 2);
#if defined(CUDA) || defined(HIP)
    T *data = (T *)d_malloc(sizeof(T) * siz);
    T *data = buffer.data();
#pragma hila novector direct_access(data)
        nodec = X.coordinates() - nmin;
        unsigned i = nodec.dot(nmul);
        data[i] = (*this)[X];
        for (int d2 = d1 + 1; d2 < NDIM; ++d2) {
            for (int d3 = d2 + 1; d3 < NDIM; ++d3) {
                for (int d4 = d3 + 1; d4 < NDIM; ++d4) {
#if defined(CUDA) || defined(HIP)
    gpuMemcpy(buffer.data(), data, sizeof(T) * siz, gpuMemcpyDeviceToHost);
The field class implements the standard methods for accessing Fields. Hilapp replaces the parity acce...
std::vector< T > get_subvolume(const CoordinateVector &cmin, const CoordinateVector &cmax, bool broadcast=false) const
get a subvolume of the field elements to all nodes
Field< T > & shift(const CoordinateVector &v, Field< T > &r, Parity par) const
Create a periodically shifted copy of the field.
dir_mask_t start_gather(Direction d, Parity p=ALL) const
std::vector< T > get_elements(const std::vector< CoordinateVector > &coord_list, bool broadcast=false) const
Get a list of elements and store them into a vector.
std::vector< T > get_slice(const CoordinateVector &c, bool broadcast=false) const
Get a slice (subvolume) of the field elements to all nodes.
T e(const int i, const int j) const
Standard array indexing operation for matrices and vectors.
R dot(const Matrix< p, q, S > &rhs) const
Dot product.
const Array< n, m, T > & asArray() const
Cast Matrix to Array.
Matrix class which defines matrix operations.
bool is_on_mynode(const CoordinateVector &c) const
Is the coordinate on THIS node.
std::array< nn_comminfo_struct, NDIRS > nn_comminfo
nearest neighbour comminfo struct
unsigned site_index(const CoordinateVector &c) const
T abs(const Complex< T > &a)
Return absolute value of Complex number.
Parity
Parity enum with values EVEN, ODD, ALL; refers to parity of the site. Parity of site (x,...
constexpr Parity EVEN
bit pattern: 001
#define foralldir(d)
Macro to loop over (all) Direction(s)
constexpr Parity ODD
bit pattern: 010
Direction
Enumerator for direction that assigns integer to direction to be interpreted as unit vector.
constexpr Parity ALL
bit pattern: 011
This file contains definitions for the Field class and the classes required to define it such as fi...
void collect_field_halo_data_(T *data, const Field< T > &src, Field< T > &dest, const Vector< 4, int > &dirs, int ndir)
Copy local data with halo - useful for visualization.
int myrank()
rank of this node
std::ostream out
this is our default output file stream
T broadcast(T &var, int rank=0)
Broadcast the value of var to all MPI ranks from rank (default=0).
void terminate(int status)
is_vectorizable_type<T>::value is always false if the target is not vectorizable
Information necessary to communicate with a node.
nn-communication has only 1 node to talk to
bool is_on_edge(Direction d) const
true if this node is on the edge of the lattice to dir d