// field_struct::gather_comm_elements(): pack the elements going to the
// neighbour node into the MPI send buffer (standard, non-vectorized targets)
#ifdef SPECIAL_BOUNDARY_CONDITIONS
    if (boundary_condition[d] == hila::bc::ANTIPERIODIC && lattice->mynode.is_on_edge(-d)) {
        // crossing an antiperiodic boundary: the gathered elements change sign
        payload.gather_comm_elements(buffer, to_node, par, lattice, true);
    } else {
        payload.gather_comm_elements(buffer, to_node, par, lattice, false);
    }
#else
    payload.gather_comm_elements(buffer, to_node, par, lattice, false);
#endif
    // the VECTORIZED variant of the same gather
    bool antiperiodic = false;
#ifdef SPECIAL_BOUNDARY_CONDITIONS
    if (boundary_condition[d] == hila::bc::ANTIPERIODIC && lattice->mynode.is_on_edge(-d))
        antiperiodic = true;
#endif

    if (vector_lattice->is_boundary_permutation[abs(d)]) {
        // boundary permutation: gather the sites one by one through an index list
        int n;
        const unsigned *index_list = to_node.get_sitelist(par, n);
        if (!antiperiodic)
            payload.gather_elements(buffer, index_list, n, lattice);
        else
            payload.gather_elements_negated(buffer, index_list, n, lattice);
    } else {
        // no permutation: gather whole vectors directly into the buffer
        payload.gather_comm_vectors(buffer, to_node, par, vector_lattice, antiperiodic);
    }

    // non-vectorizable element types fall back to the site-list gather
    int n;
    const unsigned *index_list = to_node.get_sitelist(par, n);
    if (!antiperiodic)
        payload.gather_elements(buffer, index_list, n, lattice);
    else
        payload.gather_elements_negated(buffer, index_list, n, lattice);

    // place_comm_elements(): copy the received buffer into the field storage
    if (vector_lattice->is_boundary_permutation[abs(d)]) {
        payload.place_recv_elements(buffer, d, par, vector_lattice);
    }
    // ...
    payload.place_comm_elements(d, par, buffer, from_node, lattice);
// set_local_boundary_elements(): handle the boundary (halo) sites that stay on this node
#ifdef SPECIAL_BOUNDARY_CONDITIONS
    bool antiperiodic =
        (boundary_condition[dir] == hila::bc::ANTIPERIODIC && lattice->mynode.is_on_edge(dir));
#else
    bool antiperiodic = false;
#endif
    payload.set_local_boundary_elements(dir, par, lattice, antiperiodic);
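The antiperiodic branches above only trigger when the Field has been given a non-default boundary condition. As a usage sketch, assuming hila's onsites() loop, the e_t direction alias, and a Field::set_boundary_condition() setter, all of which live outside this section and are assumptions here:

    Field<double> psi, chi;
    psi = 1.0;
    psi.set_boundary_condition(e_t, hila::bc::ANTIPERIODIC);   // assumed setter name
    // a neighbour access crossing the time boundary now picks up a minus sign,
    // which on the communication path is what gather_elements_negated() implements
    onsites(ALL) chi[X] = psi[X + e_t];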
// get_receive_buffer(d, par, from_node): return the pointer MPI should receive into
#if defined(VANILLA)
    // standard layout: receive directly into the field's own buffer
    return (T *)payload.get_buffer() + from_node.offset(par);

#elif defined(CUDA) || defined(HIP)
    // GPU targets: receive into a separate staging buffer
    unsigned offs = 0;
    if (par == ODD)
        offs = from_node.evensites;
    if (receive_buffer[d] == nullptr) {
        receive_buffer[d] = payload.allocate_mpi_buffer(from_node.sites);
    }
    return receive_buffer[d] + offs;

#elif defined(VECTORIZED)
    if constexpr (!hila::is_vectorizable_type<T>::value) {
        // non-vectorizable element type: same as the standard layout
        return (T *)payload.get_buffer() + from_node.offset(par);
    } else {
        unsigned offs = 0;
        if (par == ODD)
            offs = from_node.evensites;
        if (vector_lattice->is_boundary_permutation[abs(d)]) {
            // boundary permutation: an intermediate receive buffer is needed
            if (receive_buffer[d] == nullptr) {
                receive_buffer[d] = payload.allocate_mpi_buffer(from_node.sites);
            }
            return receive_buffer[d] + offs;
        } else {
            // otherwise receive straight into the vectorized halo area
            return ((T *)payload.get_buffer() +
                    (vector_lattice->halo_offset[d] * vector_size + offs));
        }
    }
#endif
// Field<T>::shift() - NAIVE_SHIFT implementation: move the field one lattice
// step at a time, reusing gathers that are already in flight when possible
#if defined(NAIVE_SHIFT)
    // track the parity needed by the next intermediate step
    par_s = opp_parity(par);
    // ...
    // prefer a direction whose gather has already been started or completed
    bool found_dir = false;
    if (rem[d] > 0 && gather_status(par_s, d) != gather_status_t::NOT_DONE) {
        // ...
    } else if (rem[d] < 0 && gather_status(par_s, -d) != gather_status_t::NOT_DONE) {
        // ...
    } else if (rem[d] < 0) {
        // ...
    }
    // ...
    // a single step writes straight into the result; otherwise use temporaries
    res[par_s] = (*this)[X + mdir];
    // ...
    (*from)[par_s] = (*this)[X + mdir];
    // ...
    par_s = opp_parity(par_s);
    // ...
    // remaining steps: ping-pong between the temporaries until rem[d] == 0
    mdir = (rem[d] > 0) ? d : -d;
    while (rem[d] != 0) {
        (*to)[par_s] = (*from)[X + mdir];
        // ...
        par_s = opp_parity(par_s);
    }
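The public interface implemented here is Field<T>::shift(v, r, par), listed in the reference entries at the end of this section. A minimal usage sketch; brace initialization of CoordinateVector and NDIM == 3 are illustrative assumptions:

    Field<double> f, g;
    f = 0.0;
    CoordinateVector v = {1, 0, 0};   // shift by one site in the x direction
    f.shift(v, g, ALL);               // g becomes a periodically shifted copy of f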
// Field<T>::start_gather(d, p): post the non-blocking MPI communication that
// fetches the nearest-neighbour halo in direction d for parity p
int tag = get_next_msg_tag();
// ...
// if the halo is already up to date, only count the avoided gather
if (is_gathered(d, p)) {
    hila::n_gather_avoided++;
    // ... nothing to communicate
}
// ...
// (re)set the boundary elements that stay within this node
fs->set_local_boundary_elements(d, p);
// ...

// a gather of parity p or of ALL is already started or done: nothing to post
if (!gather_not_done(d, p) || !gather_not_done(d, ALL)) {
    hila::n_gather_avoided++;
    return get_dir_mask(d);
}
// p == ALL, but EVEN and ODD have already been gathered separately
if (!gather_not_done(d, EVEN) && !gather_not_done(d, ODD)) {
    hila::n_gather_avoided++;
    return get_dir_mask(d);
}
// p == ALL and one parity is done already: narrow par to the remaining one
if (!gather_not_done(d, EVEN)) {
    // ... only ODD remains
} else if (!gather_not_done(d, ODD)) {
    // ... only EVEN remains
}

mark_gather_started(d, par);

int par_i = static_cast<int>(par) - 1;   // request index 0/1/2 for EVEN/ODD/ALL
constexpr size_t size = sizeof(T);

// post the receive for the data coming from the neighbour node in direction d
if (from_node.rank != hila::myrank() && boundary_need_to_communicate(d)) {
    T *receive_buffer = fs->get_receive_buffer(d, par, from_node);
    size_t n = from_node.n_sites(par) * size;
    if (n >= (1ULL << 31)) {
        hila::out << "Too large MPI message! Size " << n << '\n';
        // ...
    }
    post_receive_timer.start();
    MPI_Irecv(receive_buffer, (int)n, MPI_BYTE, from_node.rank, tag, lattice->mpi_comm_lat,
              &fs->receive_request[par_i][d]);
    post_receive_timer.stop();
}

// pack the send buffer and post the send to the neighbour in direction -d
if (to_node.rank != hila::myrank() && boundary_need_to_communicate(-d)) {
    unsigned sites = to_node.n_sites(par);
    if (fs->send_buffer[d] == nullptr)
        fs->send_buffer[d] = fs->payload.allocate_mpi_buffer(to_node.sites);
    T *send_buffer = fs->send_buffer[d] + to_node.offset(par);
#ifndef MPI_BENCHMARK_TEST
    fs->gather_comm_elements(d, par, send_buffer, to_node);
#endif
    size_t n = sites * size;
    // on GPU targets make sure the packing kernel has finished before MPI_Isend
    gpuStreamSynchronize(0);
    start_send_timer.start();
    MPI_Isend(send_buffer, (int)n, MPI_BYTE, to_node.rank, tag, lattice->mpi_comm_lat,
              &fs->send_request[par_i][d]);
    start_send_timer.stop();
}

// finally copy the boundary elements that do not leave this node
#ifndef MPI_BENCHMARK_TEST
fs->set_local_boundary_elements(d, par);
#endif
return get_dir_mask(d);
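In user code the gather is normally triggered implicitly by a neighbour access inside an onsites() loop; start_gather() can also be called by hand to overlap the MPI traffic with other work. A sketch, assuming the onsites() loop construct and the e_x direction alias that hila defines outside this section:

    Field<double> f, g;
    f = 1.0;

    f.start_gather(e_x, ALL);          // post the halo exchange early
    // ... other work while the MPI messages are in flight ...

    // the neighbour access below waits for the gather to finish if necessary
    onsites(ALL) g[X] = f[X + e_x] - f[X];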
// the wait counterpart of start_gather(): block until the halo in direction d,
// parity p is ready to use
if (is_gathered(d, p)) {
    // ... nothing to wait for
}
// ...
// a single-parity gather and an ALL gather must not be in flight at the same time
assert(!(p != ALL && is_gather_started(d, p) && is_gather_started(d, ALL)));
// ...
// decide which pending gathers (and how many, n_wait) must be waited for
if (is_gather_started(d, p)) {
    // ...
}
if (is_gather_started(d, ALL)) {
    // ...
}
// p == ALL: EVEN and ODD may have been gathered separately
if (is_gathered(d, EVEN) && is_gather_started(d, ODD)) {
    // ...
} else if (is_gathered(d, ODD) && is_gather_started(d, EVEN)) {
    // ...
} else if (is_gather_started(d, EVEN) && is_gather_started(d, ODD)) {
    // ...
}
// ...
// n_wait and par are set by the logic above
for (int wait_i = 0; wait_i < n_wait; ++wait_i) {
    int par_i = (int)par - 1;
    MPI_Status status;

    if (from_node.rank != hila::myrank() && boundary_need_to_communicate(d)) {
        wait_receive_timer.start();
        MPI_Wait(&fs->receive_request[par_i][d], &status);
        wait_receive_timer.stop();

#if !defined(VANILLA) && !defined(MPI_BENCHMARK_TEST)
        // buffered targets copy the received data into the field storage here
        fs->place_comm_elements(d, par, fs->get_receive_buffer(d, par, from_node), from_node);
#endif
    }

    if (to_node.rank != hila::myrank() && boundary_need_to_communicate(-d)) {
        wait_send_timer.start();
        MPI_Wait(&fs->send_request[par_i][d], &status);
        wait_send_timer.stop();
    }

    mark_gathered(d, par);
    // ...
    hila::n_gather_done += 1;
    // if both parities are waited for, the second pass handles the other one
    par = opp_parity(par);
}
// field_struct::gather_elements(buffer, coord_list, root): collect the listed
// field elements from all MPI ranks to the root rank (used by get_elements())
std::vector<unsigned> index_list;
std::vector<int> sites_on_rank(lattice->nodes.number);
std::vector<int> reshuffle_list(coord_list.size());
std::fill(sites_on_rank.begin(), sites_on_rank.end(), 0);

// count how many of the requested sites live on each rank
// ...
if (sites_on_rank[rank] == 0 && rank != root) {
    // ... a message from this rank will be needed
}
sites_on_rank[rank]++;
reshuffle_list[i++] = rank;
// ...

// every rank packs its own sites and sends them to root
std::vector<T> send_buffer(index_list.size());
payload.gather_elements((T *)send_buffer.data(), index_list.data(), send_buffer.size(),
                        /* ... */);
// ...
MPI_Send((char *)send_buffer.data(), sites_on_rank[hila::myrank()] * sizeof(T), MPI_BYTE,
         /* ... */);
// ...

// root posts one receive per contributing rank
std::vector<T> pb(coord_list.size() - sites_on_rank[root]);
// ...
std::vector<T *> nptr(lattice->nodes.number);
// ...
std::vector<MPI_Request> mpi_req(nranks);
// ...
for (int n = 0; n < sites_on_rank.size(); n++) {
    if (sites_on_rank[n] > 0) {
        // ... remote ranks: receive into the next slot of pb
        MPI_Irecv(b, (int)(sites_on_rank[n] * sizeof(T)), MPI_BYTE, n, n,
                  lattice->mpi_comm_lat, &mpi_req[nreqs++]);
        // ...
        b += sites_on_rank[n];
        // ...
        // ... root's own sites are read straight from its local send_buffer
        nptr[n] = send_buffer.data();
    }
}
// ...
std::vector<MPI_Status> stat_arr(nreqs);
MPI_Waitall(nreqs, mpi_req.data(), stat_arr.data());

// reorder the received data back into the original order of coord_list
for (int i = 0; i < coord_list.size(); i++) {
    buffer[i] = *nptr[reshuffle_list[i]];
    nptr[reshuffle_list[i]]++;
}
// the scatter counterpart of gather_elements(): root distributes the listed
// elements (buffer, in coord_list order) and each rank places its own share
std::vector<unsigned> index_list;
std::vector<int> sites_on_rank(lattice->nodes.number);
std::vector<int> reshuffle_list(coord_list.size());
std::fill(sites_on_rank.begin(), sites_on_rank.end(), 0);

// count how many of the listed sites live on each rank
// ...
if (sites_on_rank[rank] == 0 && rank != root) {
    // ...
}
sites_on_rank[rank]++;
reshuffle_list[i++] = rank;
// ...

// non-root ranks receive their share and place it into local storage
std::vector<T> recv_buffer(sites_on_rank[hila::myrank()]);
// ...
MPI_Recv((char *)recv_buffer.data(), sites_on_rank[hila::myrank()] * sizeof(T), MPI_BYTE,
         /* ... */);
// ...
payload.place_elements((T *)recv_buffer.data(), index_list.data(), recv_buffer.size(),
                       /* ... */);
// ...

// root reorders the elements rank by rank and sends each block out
std::vector<T> pb(coord_list.size());
// ...
std::vector<unsigned> nloc(lattice->nodes.number);
std::vector<unsigned> ncount(lattice->nodes.number);
nloc[0] = ncount[0] = 0;
// ...
for (int n = 1; n < lattice->nodes.number; n++) {
    nloc[n] = nloc[n - 1] + sites_on_rank[n - 1];
    // ...
}
// ...
for (size_t i = 0; i < coord_list.size(); i++) {
    int node = reshuffle_list[i];
    pb[nloc[node] + ncount[node]] = buffer[i];
    // ...
}
// ...
std::vector<MPI_Request> mpi_req(nranks);
// ...
for (int n = 0; n < sites_on_rank.size(); n++) {
    if (sites_on_rank[n] > 0) {
        // ...
        MPI_Isend(pb.data() + nloc[n], (int)(sites_on_rank[n] * sizeof(T)), MPI_BYTE, n,
                  n, lattice->mpi_comm_lat, &mpi_req[nreqs++]);
        // ...
    }
}
// ...
// root places its own share directly, without MPI
if (sites_on_rank[root] > 0) {
    payload.place_elements(pb.data() + nloc[root], index_list.data(), index_list.size(),
                           /* ... */);
}
// ...
std::vector<MPI_Status> stat_arr(nreqs);
MPI_Waitall(nreqs, mpi_req.data(), stat_arr.data());
// Field<T>::set_elements(elements, coord_list): write the given values to the
// listed coordinates, each rank keeping only the sites it owns
assert(elements.size() == coord_list.size() && "vector size mismatch in set_elements");
std::vector<unsigned> my_indexes;
std::vector<T> my_elements;
for (int i = 0; i < coord_list.size(); i++) {
    // ... keep only the coordinates on this rank (cf. is_on_mynode(), site_index())
    my_elements.push_back(elements[i]);
    // ...
}
fs->payload.place_elements(my_elements.data(), my_indexes.data(), my_indexes.size(), lattice);
// Field<T>::get_elements(coord_list, broadcast): gather the listed elements to rank 0
res.resize(coord_list.size());
fs->gather_elements(res.data(), coord_list);
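A usage sketch of the element interface: set_elements(values, coords) writes the listed sites and get_elements(coords, broadcast) reads them back, to rank 0 only or to every rank when broadcast is true. Brace initialization of CoordinateVector and NDIM == 3 are illustrative assumptions:

    Field<double> f;
    f = 0.0;

    CoordinateVector c1 = {0, 0, 0};
    CoordinateVector c2 = {1, 2, 3};
    std::vector<CoordinateVector> sites = {c1, c2};
    std::vector<double> vals = {1.5, -2.5};

    f.set_elements(vals, sites);              // write the two listed sites
    auto back = f.get_elements(sites, true);  // broadcast: all ranks get {1.5, -2.5}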
// Field<T>::get_subvolume(cmin, cmax, bcast): collect the rectangular block
// cmin..cmax (inclusive) into a flat std::vector via get_elements()
size_t vol = 1;
foralldir(d) {
    vol *= cmax[d] - cmin[d] + 1;
    assert(cmax[d] >= cmin[d] && cmin[d] >= 0 && cmax[d] < lattice.size(d));
}
std::vector<CoordinateVector> clist(vol);
forcoordinaterange(c, cmin, cmax) {
    // ... build the coordinate list
}
return get_elements(clist, bcast);

// Field<T>::get_slice(c, bcast): a slice is a subvolume where some coordinates are fixed
foralldir(d) {
    // ... direction taken in full:
    cmax[d] = lattice.size(d) - 1;
    // ... direction fixed to c[d]:
    cmin[d] = cmax[d] = c[d];
}
return get_subvolume(cmin, cmax, bcast);
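A usage sketch of the subvolume interface, with signatures as in the reference entries below. The convention that a negative entry of the slice coordinate selects the full range of that direction is inferred from the code above; the lattice size and NDIM == 3 are illustrative:

    Field<float> f;
    f = 2.0f;

    CoordinateVector cmin = {0, 0, 0};
    CoordinateVector cmax = {7, 7, 0};
    std::vector<float> corner = f.get_subvolume(cmin, cmax, true);   // broadcast to all ranks

    CoordinateVector s = {-1, -1, 0};                  // full x and y range, z fixed to 0
    std::vector<float> plane = f.get_slice(s, false);  // only rank 0 receives the data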
// copy this node's locally stored elements into a contiguous buffer,
// indexed in node-coordinate order
// (nmin and nmul, the node origin and index strides, are set up above)
buffer.resize(lattice->mynode.volume);
#if defined(CUDA) || defined(HIP)
// GPU targets stage through device memory
T *data = (T *)d_malloc(sizeof(T) * lattice->mynode.volume);
#else
T *data = buffer.data();
#endif

#pragma hila novector direct_access(data)
onsites(ALL) {
    CoordinateVector nodec = X.coordinates() - nmin;
    unsigned i = nodec.dot(nmul);
    data[i] = (*this)[X];
}

#if defined(CUDA) || defined(HIP)
gpuMemcpy(buffer.data(), data, sizeof(T) * lattice->mynode.volume, gpuMemcpyDeviceToHost);
// ... free the device staging buffer
#endif
// the inverse operation: set the locally stored elements from a contiguous buffer
assert(buffer.size() >= lattice->mynode.volume);
#if defined(CUDA) || defined(HIP)
T *data = (T *)d_malloc(sizeof(T) * lattice->mynode.volume);
gpuMemcpy(data, buffer.data(), sizeof(T) * lattice->mynode.volume, gpuMemcpyHostToDevice);
#else
T *data = buffer.data();
#endif

#pragma hila novector direct_access(data)
onsites(ALL) {
    CoordinateVector nodec = X.coordinates() - nmin;
    unsigned i = nodec.dot(nmul);
    (*this)[X] = data[i];
}

#if defined(CUDA) || defined(HIP)
// ... free the device staging buffer
#endif

this->mark_changed(ALL);
// collect_field_halo_data_(data, src, dest, dirs, ndir): copy boundary data of
// src into the halo-padded buffer and into dest - useful for visualization
// strides of the halo-padded node array: node size + 2 in every direction
for (int i = 1; i < NDIM; i++)
    nmul.e(i) = nmul.e(i - 1) * (lattice->mynode.size[i - 1] + 2);
// ...
foralldir(d) {
    node_min[d] = lattice->mynode.min[d];
    node_max[d] = lattice->mynode.min[d] + lattice->mynode.size[d] - 1;
}
// ...
#pragma hila novector direct_access(data)
onsites(ALL) {
    // ... c = X.coordinates(); check whether the site sits on the node boundary
    for (int i = 0; i < ndir; i++) {
        // ...
        if (c.e(d) == node_min[d]) {
            // ...
        } else if (c.e(d) == node_max[d]) {
            // ...
        }
    }
    // ...
    if (gotit && c.e(d) == node_min[d]) {
        // ...
        data[nodec.dot(nmul)] = src[X - d];
        dest[X] = src[X - d];
        // ...
    } else if (gotit && c.e(d) == node_max[d]) {
        // ...
        data[nodec.dot(nmul)] = src[X + d];
        dest[X] = src[X + d];
    }
}
// the local copy again, but into a buffer padded with a one-site halo layer
for (int i = 1; i < NDIM; i++)
    nmul.e(i) = nmul.e(i - 1) * (lattice->mynode.size[i - 1] + 2);
// ...
size_t siz = 1;
foralldir(d) siz *= (lattice->mynode.size[d] + 2);
// ...
#if defined(CUDA) || defined(HIP)
T *data = (T *)d_malloc(sizeof(T) * siz);
#else
T *data = buffer.data();
#endif

// copy the bulk sites into the padded array
#pragma hila novector direct_access(data)
onsites(ALL) {
    CoordinateVector nodec = X.coordinates() - nmin;
    unsigned i = nodec.dot(nmul);
    data[i] = (*this)[X];
}

// fill the halo: loop over combinations of up to four directions
// (faces, edges, corners) and collect the corresponding boundary data
// ...
for (int d2 = d1 + 1; d2 < NDIM; ++d2) {
    // ...
    for (int d3 = d2 + 1; d3 < NDIM; ++d3) {
        // ...
        for (int d4 = d3 + 1; d4 < NDIM; ++d4) {
            // ...
        }
    }
}
// ...
#if defined(CUDA) || defined(HIP)
gpuMemcpy(buffer.data(), data, sizeof(T) * siz, gpuMemcpyDeviceToHost);
// ...
#endif
// Field<T>::block_from(orig): fill this Field, living on a blocked (coarser)
// lattice, from the parent-lattice Field orig
assert(orig.is_initialized(ALL) && "block_from()-method field is not initialized");
// ...
assert(blocklat->parent == parentlat && "blocking must happen from parent lattice Field");
// ...
if (blocklat->mynode.volume == 0) {
    // ... no blocked sites on this rank
}
// ...
size_t bufsize = blocklat->mynode.volume;
T *buf = (T *)d_malloc(bufsize * sizeof(T));
// ...
auto size_factor = blocklat->mynode.size_factor;

// on the parent lattice, pick out the sites divisible by the block factor
#pragma hila direct_access(buf)
onsites(ALL) {
    if (X.coordinates().is_divisible(blockfactor)) {
        // ...
        buf[cv.dot(size_factor)] = orig[X];
    }
}

// on the blocked lattice, read the buffer back into this Field
#pragma hila direct_access(buf)
onsites(ALL) {
    // ...
    (*this)[X] = buf[cv.dot(size_factor)];
}
// Field<T>::unblock_to(target): the inverse of block_from() - write this
// blocked-lattice Field into the corresponding sparse sites of target
assert(this->is_initialized(ALL) && "unblock_to()-method field is not initialized");
// ...
assert(blocklat->parent == parentlat && "unblocking must happen to parent lattice Field");
// ...
if (blocklat->mynode.volume == 0) {
    // ... no blocked sites on this rank
}
// ...
size_t bufsize = blocklat->mynode.volume;
T *buf = (T *)d_malloc(bufsize * sizeof(T));
// ...
auto size_factor = blocklat->mynode.size_factor;

// pack this Field (on the blocked lattice) into the buffer
#pragma hila direct_access(buf)
onsites(ALL) {
    // ...
    buf[cv.dot(size_factor)] = (*this)[X];
}

// on the parent lattice, place the buffer onto the divisible sites only
#pragma hila direct_access(buf)
onsites(ALL) {
    if (X.coordinates().is_divisible(blockfactor)) {
        // ...
        target[X] = buf[cv.dot(size_factor)];
    }
}
CoordinateVector_t element_div(const CoordinateVector_t &m) const
Element-by-element division. See also .mod(), which does element-by-element positive mod.
The field class implements the standard methods for accessing Fields. Hilapp replaces the parity acce...
std::vector< T > get_subvolume(const CoordinateVector &cmin, const CoordinateVector &cmax, bool broadcast=false) const
Get a subvolume of the field elements; with broadcast=true the result is sent to all nodes.
Field< T > & shift(const CoordinateVector &v, Field< T > &r, Parity par) const
Create a periodically shifted copy of the field.
bool is_initialized(Parity p) const
Returns true if the Field has been assigned to.
field_struct *__restrict__ fs
Field::fs holds all of the field content in Field::field_struct.
void check_alloc()
Allocate Field if it is not already allocated.
dir_mask_t start_gather(Direction d, Parity p=ALL) const
std::vector< T > get_elements(const std::vector< CoordinateVector > &coord_list, bool broadcast=false) const
Get a list of elements and store them into a vector.
std::vector< T > get_slice(const CoordinateVector &c, bool broadcast=false) const
Get a slice (subvolume)
void unblock_to(Field< T > &target) const
Copy content to the argument Field on blocked (sparse) sites. a.unblock_to(b) is the inverse of a....
int size(Direction d) const
lattice.size() -> CoordinateVector or lattice.size(d) -> int returns the dimensions of the lattice,...
lattice_struct * ptr() const
get non-const pointer to lattice_struct (cf. operator ->)
Lattice switch_to(lattice_struct *lat)
With a valid lattice_struct pointer, switch this lattice to be active.
T e(const int i, const int j) const
Standard array indexing operation for matrices.
R dot(const Matrix< p, q, S > &rhs) const
Dot product.
const Array< n, m, T > & asArray() const
Cast Matrix to Array.
Matrix class which defines matrix operations.
bool is_on_mynode(const CoordinateVector &c) const
Is the coordinate on THIS node ?
std::array< nn_comminfo_struct, NDIRS > nn_comminfo
nearest neighbour comminfo struct
unsigned site_index(const CoordinateVector &c) const
int node_rank(const CoordinateVector &c) const
T abs(const Complex< T > &a)
Return absolute value of Complex number.
Parity
Parity enum with values EVEN, ODD, ALL; refers to parity of the site. Parity of site (x,...
constexpr Parity EVEN
bit pattern: 001
#define foralldir(d)
Macro to loop over (all) Direction(s)
constexpr Parity ODD
bit pattern: 010
Direction
Enumerator for direction that assigns integer to direction to be interpreted as unit vector.
constexpr Parity ALL
bit pattern: 011
This file contains definitions for the Field class and the classes required to define it, such as fi...
void collect_field_halo_data_(T *data, const Field< T > &src, Field< T > &dest, const Vector< 4, int > &dirs, int ndir)
Copy local data with halo - useful for visualization.
int myrank()
rank of this node
std::ostream out
this is our default output file stream
T broadcast(T &var, int rank=0)
Broadcast the value of var to all MPI ranks from rank (default=0).
void terminate(int status)
is_vectorizable_type<T>::value is always false if the target is not vectorizable
Information necessary to communicate with a node.
nn-communication has only 1 node to talk to
bool is_on_edge(Direction d) const
true if this node is on the edge of the lattice to dir d
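A small usage sketch of hila::broadcast() listed above (the parameter value is illustrative):

    double beta = 0.0;
    if (hila::myrank() == 0)
        beta = 5.2;              // e.g. read on rank 0 from an input file
    hila::broadcast(beta);       // afterwards every MPI rank holds beta == 5.2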