#ifdef SPECIAL_BOUNDARY_CONDITIONS

    if (boundary_condition[d] == hila::bc::ANTIPERIODIC && lattice->mynode.is_on_edge(-d)) {
        payload.gather_comm_elements(buffer, to_node, par, lattice, true);

        payload.gather_comm_elements(buffer, to_node, par, lattice, false);

    payload.gather_comm_elements(buffer, to_node, par, lattice, false);
 
    bool antiperiodic = false;
#ifdef SPECIAL_BOUNDARY_CONDITIONS
    if (boundary_condition[d] == hila::bc::ANTIPERIODIC && lattice->mynode.is_on_edge(-d)) {

        if (vector_lattice->is_boundary_permutation[abs(d)]) {

            const unsigned *index_list = to_node.get_sitelist(par, n);

                payload.gather_elements(buffer, index_list, n, lattice);

                payload.gather_elements_negated(buffer, index_list, n, lattice);

            payload.gather_comm_vectors(buffer, to_node, par, vector_lattice, antiperiodic);

        const unsigned *index_list = to_node.get_sitelist(par, n);

            payload.gather_elements(buffer, index_list, n, lattice);

            payload.gather_elements_negated(buffer, index_list, n, lattice);

        if (vector_lattice->is_boundary_permutation[abs(d)]) {
            payload.place_recv_elements(buffer, d, par, vector_lattice);

    payload.place_comm_elements(d, par, buffer, from_node, lattice);
 
#ifdef SPECIAL_BOUNDARY_CONDITIONS

        (boundary_condition[dir] == hila::bc::ANTIPERIODIC && lattice->mynode.is_on_edge(dir));

    bool antiperiodic = false;

    payload.set_local_boundary_elements(dir, par, lattice, antiperiodic);

    return (T *)payload.get_buffer() + from_node.offset(par);
 
#elif defined(CUDA) || defined(HIP)

        offs = from_node.evensites;
    if (receive_buffer[d] == nullptr) {
        receive_buffer[d] = payload.allocate_mpi_buffer(from_node.sites);

    return receive_buffer[d] + offs;

#elif defined(VECTORIZED)

        return (T *)payload.get_buffer() + from_node.offset(par);

            offs = from_node.evensites;

        if (vector_lattice->is_boundary_permutation[abs(d)]) {

            if (receive_buffer[d] == nullptr) {
                receive_buffer[d] = payload.allocate_mpi_buffer(from_node.sites);

            return receive_buffer[d] + offs;

            return ((T *)payload.get_buffer() +
                    (vector_lattice->halo_offset[d] * vector_size + offs));
 
#if defined(NAIVE_SHIFT)

        par_s = opp_parity(par);

    bool found_dir = false;

        if (rem[d] > 0 && gather_status(par_s, d) != gather_status_t::NOT_DONE) {

        } else if (rem[d] < 0 && gather_status(par_s, -d) != gather_status_t::NOT_DONE) {

            } else if (rem[d] < 0) {

        res[par_s] = (*this)[X + mdir];

    (*from)[par_s] = (*this)[X + mdir];

    par_s = opp_parity(par_s);

            mdir = (rem[d] > 0) ? d : -d;

            while (rem[d] != 0) {

                (*to)[par_s] = (*from)[X + mdir];

                par_s = opp_parity(par_s);
 
    int tag = get_next_msg_tag();

    if (is_gathered(d, p)) {
        hila::n_gather_avoided++;

        fs->set_local_boundary_elements(d, p);

    if (!gather_not_done(d, p) || !gather_not_done(d, ALL)) {
        hila::n_gather_avoided++;
        return get_dir_mask(d);

        if (!gather_not_done(d, EVEN) && !gather_not_done(d, ODD)) {

            hila::n_gather_avoided++;
            return get_dir_mask(d);

        if (!gather_not_done(d, EVEN))

        else if (!gather_not_done(d, ODD))

    mark_gather_started(d, par);

    int par_i = static_cast<int>(par) - 1;

    constexpr size_t size = sizeof(T);

    if (from_node.rank != hila::myrank() && boundary_need_to_communicate(d)) {

        receive_buffer = fs->get_receive_buffer(d, par, from_node);

        size_t n = from_node.n_sites(par) * size;

        if (n >= (1ULL << 31)) {
            hila::out << "Too large MPI message!  Size " << n << '\n';

        post_receive_timer.start();

        MPI_Irecv(receive_buffer, (int)n, MPI_BYTE, from_node.rank, tag, lattice->mpi_comm_lat,
                  &fs->receive_request[par_i][d]);

        post_receive_timer.stop();

    if (to_node.rank != hila::myrank() && boundary_need_to_communicate(-d)) {

        unsigned sites = to_node.n_sites(par);

        if (fs->send_buffer[d] == nullptr)
            fs->send_buffer[d] = fs->payload.allocate_mpi_buffer(to_node.sites);

        send_buffer = fs->send_buffer[d] + to_node.offset(par);

#ifndef MPI_BENCHMARK_TEST
        fs->gather_comm_elements(d, par, send_buffer, to_node);

        size_t n = sites * size;

        gpuStreamSynchronize(0);

        start_send_timer.start();

        MPI_Isend(send_buffer, (int)n, MPI_BYTE, to_node.rank, tag, lattice->mpi_comm_lat,
                  &fs->send_request[par_i][d]);

        start_send_timer.stop();

#ifndef MPI_BENCHMARK_TEST
    fs->set_local_boundary_elements(d, par);

    return get_dir_mask(d);
 
    if (is_gathered(d, p))

    assert(!(p != ALL && is_gather_started(d, p) && is_gather_started(d, ALL)));

    if (is_gather_started(d, p))

        if (is_gather_started(d, ALL))

        if (is_gathered(d, EVEN) && is_gather_started(d, ODD))

        else if (is_gathered(d, ODD) && is_gather_started(d, EVEN))

        else if (is_gather_started(d, EVEN) && is_gather_started(d, ODD)) {

    for (int wait_i = 0; wait_i < n_wait; ++wait_i) {

        int par_i = (int)par - 1;

        if (from_node.rank != hila::myrank() && boundary_need_to_communicate(d)) {
            wait_receive_timer.start();

            MPI_Wait(&fs->receive_request[par_i][d], &status);

            wait_receive_timer.stop();

#if !defined(VANILLA) && !defined(MPI_BENCHMARK_TEST)
            fs->place_comm_elements(d, par, fs->get_receive_buffer(d, par, from_node), from_node);

        if (to_node.rank != hila::myrank() && boundary_need_to_communicate(-d)) {
            wait_send_timer.start();

            MPI_Wait(&fs->send_request[par_i][d], &status);
            wait_send_timer.stop();

        mark_gathered(d, par);

        hila::n_gather_done += 1;

        par = opp_parity(par);
 
                                             const std::vector<CoordinateVector> &coord_list,

    std::vector<unsigned> index_list;
    std::vector<int> sites_on_rank(lattice->nodes.number);
    std::vector<int> reshuffle_list(coord_list.size());

    std::fill(sites_on_rank.begin(), sites_on_rank.end(), 0);

        if (sites_on_rank[rank] == 0 && rank != root)

        sites_on_rank[rank]++;
        reshuffle_list[i++] = rank;

    std::vector<T> send_buffer(index_list.size());
    payload.gather_elements((T *)send_buffer.data(), index_list.data(), send_buffer.size(),

        MPI_Send((char *)send_buffer.data(), sites_on_rank[hila::myrank()] * sizeof(T), MPI_BYTE,

        std::vector<T> pb(coord_list.size() - sites_on_rank[root]);

        std::vector<T *> nptr(lattice->nodes.number);

        std::vector<MPI_Request> mpi_req(nranks);

        for (int n = 0; n < sites_on_rank.size(); n++) {
            if (sites_on_rank[n] > 0) {

                    MPI_Irecv(b, (int)(sites_on_rank[n] * sizeof(T)), MPI_BYTE, n, n,
                              lattice->mpi_comm_lat, &mpi_req[nreqs++]);

                    b += sites_on_rank[n];

                    nptr[n] = send_buffer.data();

            std::vector<MPI_Status> stat_arr(nreqs);
            MPI_Waitall(nreqs, mpi_req.data(), stat_arr.data());

        for (int i = 0; i < coord_list.size(); i++) {
            buffer[i] = *nptr[reshuffle_list[i]];
            nptr[reshuffle_list[i]]++;
 
                                              const std::vector<CoordinateVector> &coord_list,

    std::vector<unsigned> index_list;
    std::vector<int> sites_on_rank(lattice->nodes.number);
    std::vector<int> reshuffle_list(coord_list.size());
    std::fill(sites_on_rank.begin(), sites_on_rank.end(), 0);

        if (sites_on_rank[rank] == 0 && rank != root)

        sites_on_rank[rank]++;
        reshuffle_list[i++] = rank;

        std::vector<T> recv_buffer(sites_on_rank[hila::myrank()]);

        MPI_Recv((char *)recv_buffer.data(), sites_on_rank[hila::myrank()] * sizeof(T), MPI_BYTE,

        payload.place_elements((T *)recv_buffer.data(), index_list.data(), recv_buffer.size(),

        std::vector<T> pb(coord_list.size());

        std::vector<unsigned> nloc(lattice->nodes.number);
        std::vector<unsigned> ncount(lattice->nodes.number);
        nloc[0] = ncount[0] = 0;

        for (int n = 1; n < lattice->nodes.number; n++) {
            nloc[n] = nloc[n - 1] + sites_on_rank[n - 1];

        for (size_t i = 0; i < coord_list.size(); i++) {
            int node = reshuffle_list[i];
            pb[nloc[node] + ncount[node]] = buffer[i];

        std::vector<MPI_Request> mpi_req(nranks);

        for (int n = 0; n < sites_on_rank.size(); n++) {
            if (sites_on_rank[n] > 0) {

                    MPI_Isend(pb.data() + nloc[n], (int)(sites_on_rank[n] * sizeof(T)), MPI_BYTE, n,
                              n, lattice->mpi_comm_lat, &mpi_req[nreqs++]);

        if (sites_on_rank[root] > 0) {
            payload.place_elements(pb.data() + nloc[root], index_list.data(), index_list.size(),

            std::vector<MPI_Status> stat_arr(nreqs);
            MPI_Waitall(nreqs, mpi_req.data(), stat_arr.data());
 
                            const std::vector<CoordinateVector> &coord_list) {
    assert(elements.size() == coord_list.size() && "vector size mismatch in set_elements");
    std::vector<unsigned> my_indexes;
    std::vector<T> my_elements;
    for (int i = 0; i < coord_list.size(); i++) {

            my_elements.push_back(elements[i]);

    fs->payload.place_elements(my_elements.data(), my_indexes.data(), my_indexes.size(), lattice);

        res.resize(coord_list.size());

    fs->gather_elements(res.data(), coord_list);

        vol *= cmax[d] - cmin[d] + 1;
        assert(cmax[d] >= cmin[d] && cmin[d] >= 0 && cmax[d] < lattice.size(d));

    std::vector<CoordinateVector> clist(vol);

    forcoordinaterange(c, cmin, cmax) {

    return get_elements(clist, bcast);

        cmax[d] = lattice.size(d) - 1;

        cmin[d] = cmax[d] = c[d];

    return get_subvolume(cmin, cmax, bcast);
 
    buffer.resize(lattice->mynode.volume);
#if defined(CUDA) || defined(HIP)

    T *data = (T *)d_malloc(sizeof(T) * lattice->mynode.volume);

    T *data = buffer.data();

#pragma hila novector direct_access(data)

        nodec = X.coordinates() - nmin;

        unsigned i = nodec.dot(nmul);
        data[i] = (*this)[X];

#if defined(CUDA) || defined(HIP)
    gpuMemcpy(buffer.data(), data, sizeof(T) * lattice->mynode.volume, gpuMemcpyDeviceToHost);

    assert(buffer.size() >= lattice->mynode.volume);

#if defined(CUDA) || defined(HIP)

    T *data = (T *)d_malloc(sizeof(T) * lattice->mynode.volume);
    gpuMemcpy(data, buffer.data(), sizeof(T) * lattice->mynode.volume, gpuMemcpyHostToDevice);

    T *data = buffer.data();

#pragma hila novector direct_access(data)

        nodec = X.coordinates() - nmin;

        unsigned i = nodec.dot(nmul);
        (*this)[X] = data[i];

#if defined(CUDA) || defined(HIP)

    this->mark_changed(ALL);
 
    for (int i = 1; i < NDIM; i++)
        nmul.e(i) = nmul.e(i - 1) * (lattice->mynode.size[i - 1] + 2);

        node_min[d] = lattice->mynode.min[d];
        node_max[d] = lattice->mynode.min[d] + lattice->mynode.size[d] - 1;

#pragma hila novector direct_access(data)

        for (int i = 0; i < ndir; i++) {

            if (c.e(d) == node_min[d]) {

            } else if (c.e(d) == node_max[d]) {

        if (gotit && c.e(d) == node_min[d]) {

            data[nodec.dot(nmul)] = src[X - d];
            dest[X] = src[X - d];

        } else if (gotit && c.e(d) == node_max[d]) {

            data[nodec.dot(nmul)] = src[X + d];
            dest[X] = src[X + d];

    for (int i = 1; i < NDIM; i++)
        nmul.e(i) = nmul.e(i - 1) * (lattice->mynode.size[i - 1] + 2);

    foralldir(d) siz *= (lattice->mynode.size[d] + 2);

#if defined(CUDA) || defined(HIP)

    T *data = (T *)d_malloc(sizeof(T) * siz);

    T *data = buffer.data();

#pragma hila novector direct_access(data)

        nodec = X.coordinates() - nmin;

        unsigned i = nodec.dot(nmul);
        data[i] = (*this)[X];

        for (int d2 = d1 + 1; d2 < NDIM; ++d2) {

            for (int d3 = d2 + 1; d3 < NDIM; ++d3) {

                for (int d4 = d3 + 1; d4 < NDIM; ++d4) {

#if defined(CUDA) || defined(HIP)
    gpuMemcpy(buffer.data(), data, sizeof(T) * siz, gpuMemcpyDeviceToHost);
 
    assert(orig.is_initialized(ALL) && "block_from()-method field is not initialized");

    assert(blocklat->parent == parentlat && "blocking must happen from parent lattice Field");

    if (blocklat->mynode.volume == 0)

    size_t bufsize = blocklat->mynode.volume;
    T *buf = (T *)d_malloc(bufsize * sizeof(T));

    auto size_factor = blocklat->mynode.size_factor;

#pragma hila direct_access(buf)

        if (X.coordinates().is_divisible(blockfactor)) {

            buf[cv.dot(size_factor)] = orig[X];

#pragma hila direct_access(buf)

        (*this)[X] = buf[cv.dot(size_factor)];

    assert(this->is_initialized(ALL) && "unblock_to()-method field is not initialized");

    assert(blocklat->parent == parentlat && "unblocking must happen to parent lattice Field");

    if (blocklat->mynode.volume == 0)

    size_t bufsize = blocklat->mynode.volume;
    T *buf = (T *)d_malloc(bufsize * sizeof(T));

    auto size_factor = blocklat->mynode.size_factor;

#pragma hila direct_access(buf)

        buf[cv.dot(size_factor)] = (*this)[X];

#pragma hila direct_access(buf)

        if (X.coordinates().is_divisible(blockfactor)) {

            target[X] = buf[cv.dot(size_factor)];
 
CoordinateVector_t element_div(const CoordinateVector_t &m) const
Element-by-element division. See also .mod(), which does element-by-element positive mod.
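A minimal sketch of these element-wise helpers, assuming NDIM == 4 and initializer-list construction of CoordinateVector; the concrete values are illustrative only:

    CoordinateVector a = {8, 9, 8, 8};
    CoordinateVector b = {2, 4, 2, 8};
    CoordinateVector q = a.element_div(b);  // each component: a.e(i) / b.e(i)
    CoordinateVector r = a.mod(b);          // each component: positive a.e(i) mod b.e(i)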
 
The field class implements the standard methods for accessing Fields. Hilapp replaces the parity acce...
 
std::vector< T > get_subvolume(const CoordinateVector &cmin, const CoordinateVector &cmax, bool broadcast=false) const
Get a subvolume of the field elements to all nodes.
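A hedged usage sketch, assuming an initialized HILA program; the field name f and the extents are illustrative:

    Field<double> f;
    f = 1.0;                                 // assign so the field is initialized
    CoordinateVector cmin = {0, 0, 0, 0};
    CoordinateVector cmax = {3, 3, 0, 0};    // a 4x4 slab in the x-y plane
    // with broadcast=true every MPI rank receives the same data
    std::vector<double> block = f.get_subvolume(cmin, cmax, true);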
 
Field< T > & shift(const CoordinateVector &v, Field< T > &r, Parity par) const
Create a periodically shifted copy of the field.
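A minimal sketch of a periodic shift; the shift vector and field names are illustrative, and the comment only paraphrases the declaration:

    Field<double> f, shifted;
    f = 1.0;
    CoordinateVector v = {1, 0, 0, 0};   // one site in the x direction
    f.shift(v, shifted, ALL);            // shifted is f translated by v, with periodic wrap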
 
bool is_initialized(Parity p) const
Returns true if the Field has been assigned to.
 
field_struct *__restrict__ fs
Field::fs holds all of the field content in Field::field_struct.
 
void check_alloc()
Allocate Field if it is not already allocated.
 
dir_mask_t start_gather(Direction d, Parity p=ALL) const
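As a hedged sketch, start_gather() can be used to overlap the halo exchange with local work; the later neighbour access waits for completion (e_x is assumed to be the x-direction unit vector constant):

    Field<double> f, g;
    f = 1.0;
    f.start_gather(e_x, ALL);      // post the halo exchange to direction e_x
    // ... unrelated local work can be done here ...
    onsites(ALL) {
        g[X] = f[X + e_x];         // first neighbour access completes the gather
    }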
 
std::vector< T > get_elements(const std::vector< CoordinateVector > &coord_list, bool broadcast=false) const
Get a list of elements and store them into a vector.
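A brief sketch of fetching individual sites; the coordinates and the field are illustrative, and broadcast=true returns the values on every rank:

    Field<double> f;
    f = 1.0;
    std::vector<CoordinateVector> sites = {{0, 0, 0, 0}, {1, 2, 3, 0}};
    std::vector<double> vals = f.get_elements(sites, true);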
 
std::vector< T > get_slice(const CoordinateVector &c, bool broadcast=false) const
Get a slice (subvolume)
 
void unblock_to(Field< T > &target) const
Copy content to the argument Field on blocked (sparse) sites. a.unblock_to(b) is the inverse of a....
 
int size(Direction d) const
lattice.size() -> CoordinateVector or lattice.size(d) -> int returns the dimensions of the lattice,...
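For example, the total lattice volume can be accumulated from the per-direction sizes (a minimal sketch):

    int64_t volume = 1;
    foralldir(d)
        volume *= lattice.size(d);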
 
lattice_struct * ptr() const
get non-const pointer to lattice_struct (cf. operator ->)
 
Lattice switch_to(lattice_struct *lat)
With a valid lattice_struct pointer, switch this lattice to be active.
 
T e(const int i, const int j) const
Standard array indexing operation for matrices.
 
R dot(const Matrix< p, q, S > &rhs) const
Dot product.
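A small sketch of the dot product for real vectors, assuming initializer-list construction of Vector; for complex element types the left argument is conventionally conjugated, which is an assumption here:

    Vector<3, double> a = {1, 2, 3};
    Vector<3, double> b = {4, 5, 6};
    double s = a.dot(b);   // 1*4 + 2*5 + 3*6 = 32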
 
const Array< n, m, T > & asArray() const
Cast Matrix to Array.
 
Matrix class which defines matrix operations.
 
bool is_on_mynode(const CoordinateVector &c) const
Is the coordinate on THIS node ?
 
std::array< nn_comminfo_struct, NDIRS > nn_comminfo
nearest neighbour comminfo struct
 
unsigned site_index(const CoordinateVector &c) const
 
int node_rank(const CoordinateVector &c) const
 
T abs(const Complex< T > &a)
Return absolute value of Complex number.
 
Parity
Parity enum with values EVEN, ODD, ALL; refers to parity of the site. Parity of site (x,...
 
constexpr Parity EVEN
bit pattern: 001
 
#define foralldir(d)
Macro to loop over (all) Direction(s)
 
constexpr Parity ODD
bit pattern: 010
 
Direction
Enumerator for direction that assigns integer to direction to be interpreted as unit vector.
 
constexpr Parity ALL
bit pattern: 011
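These pieces combine in the parity-restricted site loops; a minimal sketch of a checkerboard-style update, with illustrative field names:

    Field<double> f, g;
    f = 0.0;
    g = 1.0;
    onsites(EVEN) {
        f[X] = g[X + e_x] + g[X - e_x];   // neighbour access triggers the needed gathers
    }
    onsites(ODD) {
        f[X] = 0.5 * (g[X + e_y] + g[X - e_y]);
    }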
 
This file contains definitions for the Field class and the classes required to define it such as fi...
 
void collect_field_halo_data_(T *data, const Field< T > &src, Field< T > &dest, const Vector< 4, int > &dirs, int ndir)
Copy local data with halo - useful for visualization.
 
int myrank()
rank of this node
 
std::ostream out
this is our default output file stream
 
T broadcast(T &var, int rank=0)
Broadcast the value of var to all MPI ranks from rank (default=0).
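A short sketch: rank 0 chooses a value and all ranks receive it (the variable name is illustrative):

    int seed = 0;
    if (hila::myrank() == 0)
        seed = 12345;
    hila::broadcast(seed);   // default source rank is 0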
 
void terminate(int status)
 
is_vectorizable_type<T>::value is always false if the target is not vectorizable
 
Information necessary to communicate with a node.
 
nn-communication has only 1 node to talk to
 
bool is_on_edge(Direction d) const
true if this node is on the edge of the lattice to dir d