#ifdef SPECIAL_BOUNDARY_CONDITIONS

    if (boundary_condition[d] == hila::bc::ANTIPERIODIC && lattice.mynode.is_on_edge(-d)) {
        payload.gather_comm_elements(buffer, to_node, par, lattice, true);

        payload.gather_comm_elements(buffer, to_node, par, lattice, false);

    payload.gather_comm_elements(buffer, to_node, par, lattice, false);

    bool antiperiodic = false;
#ifdef SPECIAL_BOUNDARY_CONDITIONS
    if (boundary_condition[d] == hila::bc::ANTIPERIODIC && lattice.mynode.is_on_edge(-d)) {

        if (vector_lattice->is_boundary_permutation[abs(d)]) {

            const unsigned *index_list = to_node.get_sitelist(par, n);

            payload.gather_elements(buffer, index_list, n, lattice);

            payload.gather_elements_negated(buffer, index_list, n, lattice);

        payload.gather_comm_vectors(buffer, to_node, par, vector_lattice, antiperiodic);

        const unsigned *index_list = to_node.get_sitelist(par, n);

        payload.gather_elements(buffer, index_list, n, lattice);

        payload.gather_elements_negated(buffer, index_list, n, lattice);

    if (vector_lattice->is_boundary_permutation[abs(d)]) {
        payload.place_recv_elements(buffer, d, par, vector_lattice);

    payload.place_comm_elements(d, par, buffer, from_node, lattice);

#ifdef SPECIAL_BOUNDARY_CONDITIONS
    bool antiperiodic =
        (boundary_condition[dir] == hila::bc::ANTIPERIODIC && lattice.mynode.is_on_edge(dir));
#else
    bool antiperiodic = false;
#endif

    payload.set_local_boundary_elements(dir, par, lattice, antiperiodic);
    return (T *)payload.get_buffer() + from_node.offset(par);

#elif defined(CUDA) || defined(HIP)

    offs = from_node.sites / 2;
    if (receive_buffer[d] == nullptr) {
        receive_buffer[d] = payload.allocate_mpi_buffer(from_node.sites);

    return receive_buffer[d] + offs;

#elif defined(VECTORIZED)

    return (T *)payload.get_buffer() + from_node.offset(par);

    offs = from_node.sites / 2;

    if (vector_lattice->is_boundary_permutation[abs(d)]) {

        if (receive_buffer[d] == nullptr) {
            receive_buffer[d] = payload.allocate_mpi_buffer(from_node.sites);

        return receive_buffer[d] + offs;

    return ((T *)payload.get_buffer() + (vector_lattice->halo_offset[d] * vector_size + offs));
#if defined(NAIVE_SHIFT)

    par_s = opp_parity(par);

    bool found_dir = false;

    if (rem[d] > 0 && gather_status(par_s, d) != gather_status_t::NOT_DONE) {

    } else if (rem[d] < 0 && gather_status(par_s, -d) != gather_status_t::NOT_DONE) {

    } else if (rem[d] < 0) {

    res[par_s] = (*this)[X + mdir];

    (*from)[par_s] = (*this)[X + mdir];

    par_s = opp_parity(par_s);

    mdir = (rem[d] > 0) ? d : -d;

    while (rem[d] != 0) {

        (*to)[par_s] = (*from)[X + mdir];

        par_s = opp_parity(par_s);
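The NAIVE_SHIFT fragment above comes from the shift() implementation; its public signature, Field<T> &shift(const CoordinateVector &v, Field<T> &r, Parity par) const, is listed among the referenced symbols at the end of this section. A minimal usage sketch, assuming NDIM == 4, an initialized HILA lattice, and brace initialization of CoordinateVector; the function and field names are placeholders:

    #include "hila.h"

    // Make r a periodically shifted copy of f, moved by one lattice unit in x.
    void shift_example(const Field<double> &f, Field<double> &r) {
        CoordinateVector v = {1, 0, 0, 0};   // assumes NDIM == 4
        f.shift(v, r, ALL);                  // r receives f shifted by v (periodic boundaries)
    }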
    int tag = get_next_msg_tag();

    if (is_gathered(d, p)) {
        lattice.n_gather_avoided++;

    fs->set_local_boundary_elements(d, p);

    if (!gather_not_done(d, p) || !gather_not_done(d, ALL)) {
        lattice.n_gather_avoided++;
        return get_dir_mask(d);

    if (!gather_not_done(d, EVEN) && !gather_not_done(d, ODD)) {
        lattice.n_gather_avoided++;
        return get_dir_mask(d);

    if (!gather_not_done(d, EVEN))
    else if (!gather_not_done(d, ODD))

    mark_gather_started(d, par);

    int par_i = static_cast<int>(par) - 1;

    constexpr size_t size = sizeof(T);

    MPI_Datatype mpi_type = get_MPI_number_type<T>(size_type);

    if (from_node.rank != hila::myrank() && boundary_need_to_communicate(d)) {

        receive_buffer = fs->get_receive_buffer(d, par, from_node);

        size_t n = from_node.n_sites(par) * size / size_type;

        if (n >= (1ULL << 31)) {
            hila::out << "Too large MPI message! Size " << n << '\n';

        post_receive_timer.start();

        MPI_Irecv(receive_buffer, (int)n, mpi_type, from_node.rank, tag, lattice.mpi_comm_lat,
                  &fs->receive_request[par_i][d]);

        post_receive_timer.stop();

    if (to_node.rank != hila::myrank() && boundary_need_to_communicate(-d)) {

        unsigned sites = to_node.n_sites(par);

        if (fs->send_buffer[d] == nullptr)
            fs->send_buffer[d] = fs->payload.allocate_mpi_buffer(to_node.sites);

        send_buffer = fs->send_buffer[d] + to_node.offset(par);

        fs->gather_comm_elements(d, par, send_buffer, to_node);

        size_t n = sites * size / size_type;

        gpuStreamSynchronize(0);

        start_send_timer.start();

        MPI_Isend(send_buffer, (int)n, mpi_type, to_node.rank, tag, lattice.mpi_comm_lat,
                  &fs->send_request[par_i][d]);

        start_send_timer.stop();

    fs->set_local_boundary_elements(d, par);

    return get_dir_mask(d);
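The fragment above posts MPI_Irecv before the send buffer is packed and MPI_Isend is issued, so the transfer can overlap with other work until the matching MPI_Wait calls in the next fragment. A self-contained sketch of the same non-blocking pattern in plain MPI, outside HILA; the ranks, sizes and names here are hypothetical:

    #include <mpi.h>
    #include <cstdio>
    #include <vector>

    int main(int argc, char **argv) {
        MPI_Init(&argc, &argv);
        int rank, nranks;
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_size(MPI_COMM_WORLD, &nranks);

        const int n = 1024;                          // halo size in elements
        std::vector<double> send(n, rank), recv(n);
        int up = (rank + 1) % nranks;                // neighbour to send to
        int down = (rank - 1 + nranks) % nranks;     // neighbour to receive from
        const int tag = 0;

        MPI_Request recv_req, send_req;
        // Post the receive first, as the gather code above does.
        MPI_Irecv(recv.data(), n, MPI_DOUBLE, down, tag, MPI_COMM_WORLD, &recv_req);
        // ... the real code packs the send buffer here (gather_comm_elements) ...
        MPI_Isend(send.data(), n, MPI_DOUBLE, up, tag, MPI_COMM_WORLD, &send_req);

        // Independent local work could overlap with the transfer here.

        MPI_Wait(&recv_req, MPI_STATUS_IGNORE);      // corresponds to the wait fragment below
        MPI_Wait(&send_req, MPI_STATUS_IGNORE);
        std::printf("rank %d received halo from rank %d\n", rank, down);
        MPI_Finalize();
        return 0;
    }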
    if (is_gathered(d, p))

    if (p != ALL && is_gather_started(d, p) && is_gather_started(d, ALL)) {

    if (is_gather_started(d, p))

    if (is_gather_started(d, ALL))

    if (is_gathered(d, EVEN) && is_gather_started(d, ODD))
    else if (is_gathered(d, ODD) && is_gather_started(d, EVEN))
    else if (is_gather_started(d, EVEN) && is_gather_started(d, ODD)) {

    for (int wait_i = 0; wait_i < n_wait; ++wait_i) {

        int par_i = (int)par - 1;

        if (from_node.rank != hila::myrank() && boundary_need_to_communicate(d)) {
            wait_receive_timer.start();

            MPI_Wait(&fs->receive_request[par_i][d], &status);

            wait_receive_timer.stop();

            fs->place_comm_elements(d, par, fs->get_receive_buffer(d, par, from_node), from_node);

        if (to_node.rank != hila::myrank() && boundary_need_to_communicate(-d)) {
            wait_send_timer.start();
            MPI_Wait(&fs->send_request[par_i][d], &status);
            wait_send_timer.stop();

        mark_gathered(d, par);

        lattice.n_gather_done += 1;

        par = opp_parity(par);
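At the user level these gathers are normally driven through start_gather() (signature listed at the end of this section): the gather can be started early, and the pending communication is expected to complete when the neighbour data is actually accessed. A hedged usage sketch in HILA's own loop syntax; the function and field names are placeholders:

    // Overlap the +x halo exchange with unrelated work.
    void gather_overlap_example(const Field<double> &f, Field<double> &g) {
        f.start_gather(e_x, ALL);    // post the exchange for +x neighbours
        // ... independent local work can run here ...
        onsites(ALL) {
            g[X] = f[X + e_x];       // neighbour access uses the gathered data
        }
    }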
                         const std::vector<CoordinateVector> &coord_list,

    std::vector<unsigned> index_list;
    std::vector<int> sites_on_rank(lattice.n_nodes());
    std::vector<int> reshuffle_list(coord_list.size());

    std::fill(sites_on_rank.begin(), sites_on_rank.end(), 0);

        int rank = lattice.node_rank(c);

        if (sites_on_rank[rank] == 0 && rank != root)

        sites_on_rank[rank]++;
        reshuffle_list[i++] = rank;

    std::vector<T> send_buffer(index_list.size());
    payload.gather_elements((T *)send_buffer.data(), index_list.data(), send_buffer.size(),

    MPI_Send((char *)send_buffer.data(), sites_on_rank[hila::myrank()] * sizeof(T), MPI_BYTE,

    std::vector<T> pb(coord_list.size() - sites_on_rank[root]);

    std::vector<T *> nptr(lattice.n_nodes());

    std::vector<MPI_Request> mpi_req(nranks);

    for (int n = 0; n < sites_on_rank.size(); n++) {
        if (sites_on_rank[n] > 0) {

            MPI_Irecv(b, (int)(sites_on_rank[n] * sizeof(T)), MPI_BYTE, n, n,
                      lattice.mpi_comm_lat, &mpi_req[nreqs++]);

            b += sites_on_rank[n];

            nptr[n] = send_buffer.data();

    std::vector<MPI_Status> stat_arr(nreqs);
    MPI_Waitall(nreqs, mpi_req.data(), stat_arr.data());

    for (int i = 0; i < coord_list.size(); i++) {
        buffer[i] = *nptr[reshuffle_list[i]];
        nptr[reshuffle_list[i]]++;
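In the gather-to-root fragment above the data arrives grouped rank by rank, while reshuffle_list remembers which rank owns each requested coordinate; the final loop then walks a per-rank cursor (nptr) to put the values back into the caller's original order. A small stand-alone illustration of that reordering step in plain C++; all names are placeholders:

    #include <vector>

    // 'grouped' holds the values ordered by owning rank; 'owner[i]' is the rank that
    // produced request i; 'group_start[r]' is the first index of rank r's block.
    std::vector<double> restore_request_order(const std::vector<double> &grouped,
                                              const std::vector<int> &owner,
                                              const std::vector<int> &group_start) {
        std::vector<int> cursor = group_start;        // per-rank read position
        std::vector<double> out(owner.size());
        for (size_t i = 0; i < owner.size(); i++)
            out[i] = grouped[cursor[owner[i]]++];     // same idea as *nptr[reshuffle_list[i]]++
        return out;
    }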
                         const std::vector<CoordinateVector> &coord_list,

    std::vector<unsigned> index_list;
    std::vector<int> sites_on_rank(lattice.n_nodes());
    std::vector<int> reshuffle_list(coord_list.size());
    std::fill(sites_on_rank.begin(), sites_on_rank.end(), 0);

        int rank = lattice.node_rank(c);

        if (sites_on_rank[rank] == 0 && rank != root)

        sites_on_rank[rank]++;
        reshuffle_list[i++] = rank;

    std::vector<T> recv_buffer(index_list.size());

    MPI_Recv((char *)recv_buffer.data(), sites_on_rank[hila::myrank()] * sizeof(T), MPI_BYTE,

    payload.place_elements((T *)recv_buffer.data(), index_list.data(), recv_buffer.size(),

    std::vector<T> pb(coord_list.size());

    std::vector<unsigned> nloc(lattice.n_nodes());
    std::vector<unsigned> ncount(lattice.n_nodes());
    nloc[0] = ncount[0] = 0;

    for (int n = 1; n < lattice.n_nodes(); n++) {
        nloc[n] = nloc[n - 1] + sites_on_rank[n - 1];

    for (int i = 0; i < coord_list.size(); i++) {
        int node = reshuffle_list[i];
        pb[nloc[node] + ncount[node]] = buffer[i];

    std::vector<MPI_Request> mpi_req(nranks);

    for (int n = 0; n < sites_on_rank.size(); n++) {
        if (sites_on_rank[n] > 0) {

            MPI_Isend(pb.data() + nloc[n], (int)(sites_on_rank[n] * sizeof(T)), MPI_BYTE, n,
                      n, lattice.mpi_comm_lat, &mpi_req[nreqs++]);

    payload.place_elements(pb.data() + nloc[root], index_list.data(), index_list.size(),

    std::vector<MPI_Status> stat_arr(nreqs);
    MPI_Waitall(nreqs, mpi_req.data(), stat_arr.data());
                         const std::vector<CoordinateVector> &coord_list) {
    assert(elements.size() == coord_list.size() && "vector size mismatch in set_elements");
    std::vector<unsigned> my_indexes;
    std::vector<T> my_elements;
    for (int i = 0; i < coord_list.size(); i++) {

        my_elements.push_back(elements[i]);

    fs->payload.place_elements(my_elements.data(), my_indexes.data(), my_indexes.size(), lattice);

    res.resize(coord_list.size());

    fs->gather_elements(res.data(), coord_list);
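These fragments belong to the per-site element interface; the signature std::vector<T> get_elements(const std::vector<CoordinateVector> &coord_list, bool broadcast=false) const is listed among the referenced symbols at the end of this section. A hedged usage sketch, assuming NDIM == 4 and that broadcast = true makes the result available on every rank:

    #include "hila.h"
    #include <vector>

    // Fetch two individual field values.
    void get_elements_example(const Field<double> &f) {
        std::vector<CoordinateVector> pts;
        pts.push_back(CoordinateVector({0, 0, 0, 0}));   // assumes NDIM == 4
        pts.push_back(CoordinateVector({1, 2, 3, 0}));
        std::vector<double> vals = f.get_elements(pts, true);   // broadcast to all ranks
    }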
        vol *= cmax[d] - cmin[d] + 1;
        assert(cmax[d] >= cmin[d] && cmin[d] >= 0 && cmax[d] < lattice.size(d));

    std::vector<CoordinateVector> clist(vol);

    forcoordinaterange(c, cmin, cmax) {

    return get_elements(clist, bcast);

        cmax[d] = lattice.size(d) - 1;

        cmin[d] = cmax[d] = c[d];

    return get_subvolume(cmin, cmax, bcast);
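get_subvolume() and get_slice() (signatures listed at the end of this section) build the coordinate list with forcoordinaterange and delegate to get_elements(). A hedged usage sketch, assuming NDIM == 4; the convention that a negative entry in the slice coordinate selects the full extent of that direction is an assumption suggested by the fragment above, where cmax is set to lattice.size(d) - 1 for unrestricted directions:

    // A small block and a full time slice of the field, broadcast to every rank.
    void subvolume_example(const Field<double> &f) {
        CoordinateVector cmin({0, 0, 0, 0});
        CoordinateVector cmax({3, 3, 3, 0});
        std::vector<double> block = f.get_subvolume(cmin, cmax, true);   // 4x4x4x1 block

        std::vector<double> slice = f.get_slice(CoordinateVector({-1, -1, -1, 0}), true);
    }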
    buffer.resize(lattice.mynode.volume());

#if defined(CUDA) || defined(HIP)
    T *data = (T *)d_malloc(sizeof(T) * lattice.mynode.volume());

    T *data = buffer.data();

#pragma hila novector direct_access(data)

        nodec = X.coordinates() - nmin;

        unsigned i = nodec.dot(nmul);
        data[i] = (*this)[X];

#if defined(CUDA) || defined(HIP)
    gpuMemcpy(buffer.data(), data, sizeof(T) * lattice.mynode.volume(), gpuMemcpyDeviceToHost);

    assert(buffer.size() >= lattice.mynode.volume());

#if defined(CUDA) || defined(HIP)
    T *data = (T *)d_malloc(sizeof(T) * lattice.mynode.volume());
    gpuMemcpy(data, buffer.data(), sizeof(T) * lattice.mynode.volume(), gpuMemcpyHostToDevice);

    T *data = buffer.data();

#pragma hila novector direct_access(data)

        nodec = X.coordinates() - nmin;

        unsigned i = nodec.dot(nmul);
        (*this)[X] = data[i];

#if defined(CUDA) || defined(HIP)

    this->mark_changed(ALL);
    for (int i = 1; i < NDIM; i++)
        nmul.e(i) = nmul.e(i - 1) * (lattice.mynode.size[i - 1] + 2);

    node_min[d] = lattice.mynode.min[d];
    node_max[d] = lattice.mynode.min[d] + lattice.mynode.size[d] - 1;

#pragma hila novector direct_access(data)

        for (int i = 0; i < ndir; i++) {

            if (c.e(d) == node_min[d]) {

            } else if (c.e(d) == node_max[d]) {

        if (gotit && c.e(d) == node_min[d]) {

            data[nodec.dot(nmul)] = src[X - d];
            dest[X] = src[X - d];

        } else if (gotit && c.e(d) == node_max[d]) {

            data[nodec.dot(nmul)] = src[X + d];
            dest[X] = src[X + d];

    for (int i = 1; i < NDIM; i++)
        nmul.e(i) = nmul.e(i - 1) * (lattice.mynode.size[i - 1] + 2);

    foralldir(d) siz *= (lattice.mynode.size[d] + 2);

#if defined(CUDA) || defined(HIP)
    T *data = (T *)d_malloc(sizeof(T) * siz);

    T *data = buffer.data();

#pragma hila novector direct_access(data)

        nodec = X.coordinates() - nmin;

        unsigned i = nodec.dot(nmul);
        data[i] = (*this)[X];

    for (int d2 = d1 + 1; d2 < NDIM; ++d2) {

        for (int d3 = d2 + 1; d3 < NDIM; ++d3) {

            for (int d4 = d3 + 1; d4 < NDIM; ++d4) {

#if defined(CUDA) || defined(HIP)
    gpuMemcpy(buffer.data(), data, sizeof(T) * siz, gpuMemcpyDeviceToHost);
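The halo-collection fragments above store the local data in an array padded by one site in every direction, so the stride of direction i is built from (size[i - 1] + 2) and a site's linear index is nodec.dot(nmul). A small stand-alone illustration of that index arithmetic in plain C++; the sizes are hypothetical:

    #include <array>
    #include <cstdio>

    int main() {
        constexpr int NDIM = 4;
        std::array<int, NDIM> size = {8, 8, 8, 16};   // local node size (hypothetical)

        // strides of the halo-padded array, as in the fragment above
        std::array<long, NDIM> nmul;
        nmul[0] = 1;
        for (int i = 1; i < NDIM; i++)
            nmul[i] = nmul[i - 1] * (size[i - 1] + 2);

        // linear index of the node-local coordinate (1, 2, 3, 4)
        std::array<int, NDIM> nodec = {1, 2, 3, 4};
        long idx = 0;
        for (int i = 0; i < NDIM; i++)
            idx += nodec[i] * nmul[i];                // equivalent of nodec.dot(nmul)

        std::printf("halo-padded linear index = %ld\n", idx);
        return 0;
    }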
Brief descriptions of the symbols referenced in the fragments above:

The field class implements the standard methods for accessing Fields. Hilapp replaces the parity acce...
std::vector< T > get_subvolume(const CoordinateVector &cmin, const CoordinateVector &cmax, bool broadcast=false) const
get a subvolume of the field elements to all nodes
Field< T > & shift(const CoordinateVector &v, Field< T > &r, Parity par) const
Create a periodically shifted copy of the field.
dir_mask_t start_gather(Direction d, Parity p=ALL) const
std::vector< T > get_elements(const std::vector< CoordinateVector > &coord_list, bool broadcast=false) const
Get a list of elements and store them into a vector.
std::vector< T > get_slice(const CoordinateVector &c, bool broadcast=false) const
Get a slice (subvolume) of the field elements.
T e(const int i, const int j) const
Standard array indexing operation for matrices and vectors.
R dot(const Matrix< p, q, S > &rhs) const
Dot product.
const Array< n, m, T > & asArray() const
Cast Matrix to Array.
Matrix class which defines matrix operations.
bool is_on_mynode(const CoordinateVector &c) const
Is the coordinate on THIS node.
std::array< nn_comminfo_struct, NDIRS > nn_comminfo
nearest neighbour comminfo struct
unsigned site_index(const CoordinateVector &c) const
T abs(const Complex< T > &a)
Return absolute value of Complex number.
Parity
Parity enum with values EVEN, ODD, ALL; refers to parity of the site. Parity of site (x,...
constexpr Parity EVEN
bit pattern: 001
#define foralldir(d)
Macro to loop over (all) Direction(s)
constexpr Parity ODD
bit pattern: 010
Direction
Enumerator for direction that assigns integer to direction to be interpreted as unit vector.
constexpr Parity ALL
bit pattern: 011
This file contains definitions for the Field class and the classes required to define it such as fi...
void collect_field_halo_data_(T *data, const Field< T > &src, Field< T > &dest, const Vector< 4, int > &dirs, int ndir)
Copy local data with halo - useful for visualization.
int myrank()
rank of this node
std::ostream out
this is our default output file stream
T broadcast(T &var, int rank=0)
Broadcast the value of var to all MPI ranks from rank (default=0).
void terminate(int status)
is_vectorizable_type<T>::value is always false if the target is not vectorizable
Information necessary to communicate with a node.
nn-communication has only 1 node to talk to
bool is_on_edge(Direction d) const
true if this node is on the edge of the lattice to dir d