21 bool comm_is_on =
false;
24 bool is_allreduce_ =
true;
25 bool is_nonblocking_ =
false;
26 bool is_delayed_ =
false;
28 bool delay_is_on =
false;
29 bool is_delayed_sum =
true;
35 void do_reduce_operation(MPI_Op operation) {
45 dtype = get_MPI_number_type<T>();
47 assert(dtype != MPI_BYTE &&
"Unknown number_type in reduction");
51 reduction_timer.start();
53 if (is_nonblocking()) {
54 MPI_Iallreduce(MPI_IN_PLACE, ptr,
55 sizeof(T) /
sizeof(hila::arithmetic_type<T>), dtype,
56 operation, lattice.mpi_comm_lat, &request);
58 MPI_Allreduce(MPI_IN_PLACE, ptr,
59 sizeof(T) /
sizeof(hila::arithmetic_type<T>), dtype,
60 operation, lattice.mpi_comm_lat);
64 if (is_nonblocking()) {
65 MPI_Ireduce(MPI_IN_PLACE, ptr,
66 sizeof(T) /
sizeof(hila::arithmetic_type<T>), dtype,
67 operation, 0, lattice.mpi_comm_lat, &request);
69 MPI_Reduce(MPI_IN_PLACE, ptr,
70 sizeof(T) /
sizeof(hila::arithmetic_type<T>), dtype,
71 operation, 0, lattice.mpi_comm_lat);
74 if (is_nonblocking()) {
75 MPI_Ireduce(ptr, ptr,
sizeof(T) /
sizeof(hila::arithmetic_type<T>),
76 dtype, operation, 0, lattice.mpi_comm_lat, &request);
78 MPI_Reduce(ptr, ptr,
sizeof(T) /
sizeof(hila::arithmetic_type<T>),
79 dtype, operation, 0, lattice.mpi_comm_lat);
83 reduction_timer.stop();
91 reduction_wait_timer.start();
93 MPI_Wait(&request, &status);
94 reduction_wait_timer.stop();
108 template <typename S, std::enable_if_t<hila::is_assignable<T &, S>::value,
int> = 0>
132 bool is_allreduce() {
133 return is_allreduce_;
141 bool is_nonblocking() {
142 return is_nonblocking_;
162 template <typename S, std::enable_if_t<hila::is_assignable<T &, S>::value,
int> = 0>
168 template <typename S, std::enable_if_t<hila::is_assignable<T &, S>::value,
int> = 0>
187 template <
typename S,
188 std::enable_if_t<hila::is_assignable<T &, hila::type_plus<T, S>>
::value,
210 void reduce_sum_node(
const T &v) {
221 if (delay_is_on && is_delayed_sum ==
false) {
222 assert(0 &&
"Cannot mix sum and product reductions!");
225 is_delayed_sum =
true;
227 do_reduce_operation(MPI_SUM);
263 do_reduce_operation(MPI_SUM);
265 do_reduce_operation(MPI_PROD);
298 result += (*this)[X];
299 return result.
value();
304 static_assert(std::is_arithmetic<T>::value,
305 ".product() reduction only for integer or floating point types");
310 result *= (*this)[X];
311 return result.
value();
321#if defined(CUDA) || defined(HIP)
322#include "backend_gpu/gpu_reduction.h"
329 std::is_same<T, int>::value || std::is_same<T, long>::value ||
330 std::is_same<T, float>::value || std::is_same<T, double>::value ||
331 std::is_same<T, long double>::value,
332 "In Field .min() and .max() methods the Field element type must be one of "
333 "(int/long/float/double/long double)");
335#if defined(CUDA) || defined(HIP)
336 T val = gpu_minmax(is_min, par, loc);
338 int sgn = is_min ? 1 : -1;
340 T val = is_min ? std::numeric_limits<T>::max() : std::numeric_limits<T>::min();
344#pragma omp parallel shared(val, loc, sgn, is_min)
348 is_min ? std::numeric_limits<T>::max() : std::numeric_limits<T>::min();
352#pragma hila novector omp_parallel_region direct_access(loc_th, val_th)
354 if (sgn * (*
this)[X] < sgn * val_th) {
356 loc_th = X.coordinates();
361 if (sgn * val_th < sgn * val) {
370 MPI_Datatype dtype = get_MPI_number_type<T>(size,
true);
382 MPI_Allreduce(MPI_IN_PLACE, &rdata, 1, dtype, MPI_MINLOC,
383 lattice.mpi_comm_lat);
385 MPI_Allreduce(MPI_IN_PLACE, &rdata, 1, dtype, MPI_MAXLOC,
386 lattice.mpi_comm_lat);
392 lattice.mpi_comm_lat);
403 return minmax(
true, par, loc);
409 return minmax(
true,
ALL, loc);
415 return minmax(
true, par, loc);
423 return minmax(
false, par, loc);
429 return minmax(
false,
ALL, loc);
435 return minmax(
false, par, loc);
T max(Parity par=ALL) const
Find maximum value from Field.
T product(Parity par=Parity::all, bool allreduce=true) const
Product reduction of Field.
T minmax(bool is_min, Parity par, CoordinateVector &loc) const
Function to perform min or max operations.
T min(Parity par=ALL) const
Find minimum value from Field.
T sum(Parity par=Parity::all, bool allreduce=true) const
Sum reduction of Field.
Special reduction class: enables delayed and non-blocking reductions, which are not possible with the...
const T value()
Return value of the reduction variable. Wait for the comms if needed.
~Reduction()
Destructor cleans up communications if they are in progress.
T operator=(const S &rhs)
Assignment is used only outside site loops - drop comms if on, no need to wait.
void operator+=(const S &rhs)
Reduction(const Reduction< T > &r)=delete
Reduction & allreduce(bool b=true)
allreduce(bool) turns allreduce on or off. By default on.
Reduction & nonblocking(bool b=true)
nonblocking(bool) turns non-blocking reduction on or off. By default turns it on.
Reduction & delayed(bool b=true)
delayed(bool) turns delayed reduction on or off. By default turns it on.
void set(const S &rhs)
Method set is the same as assignment, but without return value.
void reduce()
Complete the reduction - start if not done, and wait if ongoing.
void start_reduce()
For delayed reduction, start_reduce starts or completes the reduction operation.
Parity
Parity enum with values EVEN, ODD, ALL; refers to parity of the site. Parity of site (x,...
constexpr Parity ALL
bit pattern: 011
int myrank()
rank of this node
int number_of_nodes()
how many nodes there are