#ifndef HILA_REDUCTIONVECTOR_H_
#define HILA_REDUCTIONVECTOR_H_

inline void _hila_init_gpu_ops_vectorreduction() {
bool comm_is_on = false;       // true while an MPI reduction is in flight

bool is_allreduce_ = true;     // reduce to all ranks (allreduce) or to rank 0 only
bool is_nonblocking_ = false;  // use non-blocking MPI reduction calls
bool is_delayed_ = false;      // postpone the reduction until reduce() is called

bool delay_is_on = false;      // a delayed reduction has been accumulated
bool is_delayed_sum = true;    // the delayed reduction is a sum (true) or a product (false)
void reduce_operation(MPI_Op operation) {

    dtype = get_MPI_number_type<T>();

    if (dtype == MPI_BYTE) {
        assert(sizeof(T) < 0 && "Unknown number_type in vector reduction");
    }

    reduction_timer.start();
    if (is_allreduce_) {
        // allreduce: every rank ends up with the reduced vector
        if (is_nonblocking_) {
            MPI_Iallreduce(MPI_IN_PLACE, (void *)val.data(),
                           sizeof(T) * val.size() / sizeof(hila::arithmetic_type<T>), dtype,
                           operation, lattice->mpi_comm_lat, &request);
        } else {
            MPI_Allreduce(MPI_IN_PLACE, (void *)val.data(),
                          sizeof(T) * val.size() / sizeof(hila::arithmetic_type<T>), dtype,
                          operation, lattice->mpi_comm_lat);
        }
    } else if (hila::myrank() == 0) {
        // reduce to rank 0: the root reduces in place into its own buffer
        if (is_nonblocking_) {
            MPI_Ireduce(MPI_IN_PLACE, (void *)val.data(),
                        sizeof(T) * val.size() / sizeof(hila::arithmetic_type<T>), dtype,
                        operation, 0, lattice->mpi_comm_lat, &request);
        } else {
            MPI_Reduce(MPI_IN_PLACE, (void *)val.data(),
                       sizeof(T) * val.size() / sizeof(hila::arithmetic_type<T>), dtype,
                       operation, 0, lattice->mpi_comm_lat);
        }
    } else {
        // non-root ranks: send the local data, the receive buffer is ignored
        if (is_nonblocking_) {
            MPI_Ireduce((void *)val.data(), (void *)val.data(),
                        sizeof(T) * val.size() / sizeof(hila::arithmetic_type<T>), dtype,
                        operation, 0, lattice->mpi_comm_lat, &request);
        } else {
            MPI_Reduce((void *)val.data(), (void *)val.data(),
                       sizeof(T) * val.size() / sizeof(hila::arithmetic_type<T>), dtype,
                       operation, 0, lattice->mpi_comm_lat);
        }
    }
    reduction_timer.stop();
}
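The element count passed to MPI above is sizeof(T) * val.size() / sizeof(hila::arithmetic_type<T>), i.e. the vector is reduced component by component over its underlying arithmetic type. A minimal standalone sketch of the same idea with plain MPI follows; the function name and the choice of std::complex<double> to stand in for T are illustrative assumptions, not part of HILA.

#include <complex>
#include <mpi.h>
#include <vector>

// Sketch: sum a vector of complex numbers elementwise across ranks by
// treating it as a flat array of doubles, as reduce_operation() does above.
inline void sum_complex_vector(std::vector<std::complex<double>> &buf, MPI_Comm comm) {
    // two doubles per complex element
    int count = static_cast<int>(sizeof(std::complex<double>) * buf.size() / sizeof(double));
    MPI_Allreduce(MPI_IN_PLACE, (void *)buf.data(), count, MPI_DOUBLE, MPI_SUM, comm);
}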
using iterator = typename std::vector<T>::iterator;
using const_iterator = typename std::vector<T>::const_iterator;

const_iterator begin() const {
    return val.begin();
}
const_iterator end() const {
    return val.end();
}
_hila_init_gpu_ops_vectorreduction<T>();
bool is_allreduce() {
    return is_allreduce_;
}
bool is_nonblocking() {
    return is_nonblocking_;
}
template <typename S, std::enable_if_t<std::is_assignable<T &, S>::value, int> = 0>
if (delay_is_on && is_delayed_sum == false) {
    assert(0 && "Cannot mix sum and product reductions!");
}
reduce_operation(MPI_SUM);
static_assert(std::is_same<T, int>::value || std::is_same<T, long>::value ||
                  std::is_same<T, float>::value || std::is_same<T, double>::value ||
                  std::is_same<T, long double>::value,
              "Type not implemented for product reduction");
if (delay_is_on && is_delayed_sum == true) {
    assert(0 && "Cannot mix sum and product reductions!");
}
reduce_operation(MPI_PROD);
reduction_wait_timer.start();
MPI_Wait(&request, &status);
reduction_wait_timer.stop();

reduce_operation(MPI_SUM);

reduce_operation(MPI_PROD);
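The MPI_Wait call above backs the user-facing wait(), and the two reduce_operation calls are the completion path of a delayed reduction: the is_delayed_sum flag recorded earlier selects MPI_SUM or MPI_PROD. A rough sketch of how that dispatch plausibly fits together, reconstructed from these fragments and not the verbatim implementation:

// Sketch only: plausible shape of the delayed-reduction completion,
// inferred from the fragments above.
void reduce() {
    if (delay_is_on) {
        delay_is_on = false;
        if (is_delayed_sum)
            reduce_operation(MPI_SUM);   // delayed sum accumulation
        else
            reduce_operation(MPI_PROD);  // delayed product accumulation
    }
    wait();   // also completes a pending non-blocking request
}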
std::vector<T> vector() {
    return val;
}

void resize(size_t count) {
    val.resize(count);
}
void resize(size_t count, const T &v) {
    val.resize(count, v);
}

void push_back(const T &v) {
    val.push_back(v);
}
T & operator[](const int i)
Access operators - in practice these already do everything needed.
size_t size() const
Number of elements; one of the methods forwarded from std::vector.
ReductionVector & delayed(bool b=true)
delayed(bool) turns delayed reduction on or off. By default turns on.
void start_reduce()
For delayed reduction, reduce() starts or completes the reduction operation.
void reduce()
Complete non-blocking or delayed reduction.
void init_product()
Initialize a product reduction; to be called before every site loop.
~ReductionVector()
Destructor cleans up communications if they are in progress.
void operator=(const S &rhs)
Assignment from any type S assignable to T.
ReductionVector & nonblocking(bool b=true)
nonblocking(bool) turns non-blocking reduction on or off. By default turns on.
void operator=(std::nullptr_t np)
Assignment from 0.
void init_sum()
Initialize a sum reduction; to be called before every site loop.
ReductionVector & allreduce(bool b=true)
allreduce(bool) turns allreduce on or off. By default on.
void wait()
Wait for MPI to complete, if it is currently going on.
T * data()
data() returns a pointer to the raw storage.
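Taken together, these members suggest the typical usage pattern: construct the vector, configure it with the chained setters, accumulate inside a site loop, and complete the reduction with reduce(). A hedged sketch assuming the HILA framework's onsites/X machinery; the names f (a Field<double>) and nt (number of time slices) are placeholders.

// Sketch: per-timeslice sums of a Field<double> f into a ReductionVector.
ReductionVector<double> slices(nt);       // one element per time slice
slices.allreduce(true).delayed(true);     // chained setters documented above

onsites(ALL) {
    slices[X.coordinate(e_t)] += f[X];    // accumulate locally on each rank
}

slices.reduce();                          // complete the delayed MPI reduction
// slices[t] now holds the full sums on every rank (allreduce enabled)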