#ifndef HILA_CLUSTERS_H_
#define HILA_CLUSTERS_H_
// Type value reserved for sites which do not belong to any cluster
#define CLUSTER_BACKGROUND_ 0xFF
// Pack a site and its cluster type into one 64-bit label:
// bits 0-53 hold the SiteIndex of the site, bits 54-61 the integral type.
template <typename inttype, std::enable_if_t<std::is_integral<inttype>::value, int> = 0>
inline uint64_t set_cl_label(const CoordinateVector &cv, const inttype type) {
    uint64_t r;
    if (type == CLUSTER_BACKGROUND_)
        r = static_cast<uint64_t>(CLUSTER_BACKGROUND_) << 54;
    else
        r = SiteIndex(cv).value | (static_cast<uint64_t>(type) << 54);
    return r;
}
// Extract the type bits (54-61) from a packed cluster label
inline uint8_t get_cl_label_type(const uint64_t v) {
    return 0xff & (v >> 54);
}
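// Round-trip sanity check of the packing (illustrative values): a site index
// 1234 with type 3 packs to 1234 | (3 << 54), and the same top-bit extraction
// that get_cl_label_type() performs recovers the type.
static_assert((0xffULL & ((1234ULL | (3ULL << 54)) >> 54)) == 3,
              "cluster type bits must survive the label round-trip");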
namespace hila {

class clusters {
  private:
    // one entry per cluster: packed label, site count, cached surface area
    struct cl_struct {
        uint64_t label;
        int64_t size, area;
    };

    Field<uint64_t> labels;        // packed cluster label on each site
    std::vector<cl_struct> clist;  // sorted list of clusters on this node

    // build clist from the label field of this node
    inline void make_local_clist();

    void assert_cl_index(size_t i) const {
        if (i >= clist.size()) {
            hila::out0 << "Too large cluster index " << i << ", there are only "
                       << clist.size() << " clusters\n";
        }
    }
  public:
    // type value that marks sites belonging to no cluster
    static constexpr uint8_t background = CLUSTER_BACKGROUND_;

    clusters() = default;
    ~clusters() = default;
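    // Minimal usage sketch (illustrative; 'tf' is an assumed Field<int> where
    // sites outside any cluster are set to hila::clusters::background):
    //
    //   hila::clusters cl;
    //   cl.make_labels(tf);   // connected same-type regions share a label
    //   cl.classify();        // merge the per-node cluster lists
    //   for (size_t i = 0; i < cl.number(); i++)
    //       hila::out0 << "cluster " << i << ": type " << (int)cl.type(i)
    //                  << "  size " << cl.size(i) << '\n';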
    template <typename inttype, std::enable_if_t<std::is_integral<inttype>::value, int> = 0>

    template <typename inttype, std::enable_if_t<std::is_integral<inttype>::value, int> = 0>
    // Number of clusters on the list (meaningful after classify())
    size_t number() const {
        return clist.size();
    }
    // Number of sites in cluster i
    int64_t size(size_t i) const {
        assert_cl_index(i);
        return clist[i].size;
    }
    // Type of the sites of cluster i
    uint8_t type(size_t i) const {
        assert_cl_index(i);
        return get_cl_label_type(clist[i].label);
    }
    // Raw 64-bit packed label of cluster i
    uint64_t label(size_t i) const {
        assert_cl_index(i);
        return clist[i].label;
    }
    // Surface area of cluster i: the number of links from sites of the
    // cluster to sites with a different label. Cached after the first call.
    int64_t area(size_t i) {
        assert_cl_index(i);
        if (clist[i].area > 0)
            return clist[i].area;

        uint64_t lbl = label(i);
        uint64_t cl_area = 0;
        onsites(ALL) {
            if (labels[X] == lbl) {
                for (Direction d = e_x; d < NDIRS; ++d) {
                    if (labels[X + d] != lbl)
                        cl_area += 1;
                }
            }
        }
        clist[i].area = cl_area;
        return clist[i].area;
    }
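    // Example (illustrative): with NDIM = 4 a single-site cluster differs from
    // its neighbour in every one of the NDIRS = 8 directions, so its area is 8.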
    // Coordinates (as SiteIndex) of all sites belonging to cluster i
    std::vector<SiteIndex> sites(size_t i) const {
        uint64_t la = label(i);
        // collect the matching sites; sketch assuming hila's site-selection
        // interface, which hands over the result with move_sites()
        SiteSelect sites;
        onsites(ALL) {
            if (labels[X] == la)
                sites.select(X);
        }
        return sites.move_sites();
    }
    // Label the clusters: neighbouring sites of the same integral type get a
    // common label; background sites get the background label.
    template <typename inttype, std::enable_if_t<std::is_integral<inttype>::value, int> = 0>
    void make_labels(const Field<inttype> &type) {
        // start from a unique label per site, packing site index and type
        labels[ALL] = set_cl_label(X.coordinates(), type[X]);
        // Sweep until nothing changes: every non-background site adopts the
        // smallest label among its equal-type neighbours.
        Reduction<int64_t> changed;
        changed.delayed(); // non-blocking reduction; value() waits when needed
        do {
            changed = 0;
#pragma hila safe_access(labels)
            onsites(ALL) {
                auto type_0 = get_cl_label_type(labels[X]);
                if (type_0 != background) {
                    for (Direction d = e_x; d < NDIRS; ++d) {
                        auto label_1 = labels[X + d];
                        auto type_1 = get_cl_label_type(label_1);
                        if (type_0 == type_1 && labels[X] > label_1) {
                            labels[X] = label_1;
                            changed += 1;
                        }
                    }
                }
            }
        } while (changed.value() > 0);
    }
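    // The sweep above is plain minimum-label diffusion: each site repeatedly
    // adopts the smallest label among its same-type neighbours until a full
    // pass changes nothing. The same fixed-point iteration in one dimension,
    // as a plain-C++ sketch ('lab' and 'same_type' are hypothetical stand-ins
    // for the label field and a get_cl_label_type() comparison):
    //
    //   bool changed = true;
    //   while (changed) {
    //       changed = false;
    //       for (size_t x = 0; x + 1 < lab.size(); x++)
    //           if (same_type(lab[x], lab[x + 1]) && lab[x] != lab[x + 1]) {
    //               lab[x] = lab[x + 1] = std::min(lab[x], lab[x + 1]);
    //               changed = true;
    //           }
    //   }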
    // Access the label field itself
    const auto &get_labels() const {
        return labels;
    }

    // Find the clusters: build the per-node list, then merge the sorted lists
    // pairwise along a binary tree across the nodes.
    void classify() {
        if (!labels.is_initialized(ALL))
            hila::error("hila::clusters::classify() called without preceding "
                        "hila::clusters::make_labels()");

        make_local_clist();

        int nn = hila::number_of_nodes();
        int myrank = hila::myrank();
        for (int step = 1; step < nn; step *= 2) {
            if (myrank % (2 * step) == step) {
                // sender: ship the local list to the partner node and drop out
                hila::send_to(myrank - step, clist);
                clist.clear();
            } else if (myrank % (2 * step) == 0 && myrank + step < nn) {
                // receiver: merge the incoming sorted list with the local one
                std::vector<cl_struct> clist_n, merged;
                hila::receive_from(myrank + step, clist_n);

                size_t i = 0, n = 0;
                while (i < clist.size() && n < clist_n.size()) {
                    if (clist[i].label < clist_n[n].label) {
                        merged.push_back(clist[i++]);
                    } else if (clist[i].label == clist_n[n].label) {
                        // same cluster seen from both nodes: sizes add up
                        merged.push_back(clist[i++]);
                        merged.back().size += clist_n[n++].size;
                    } else {
                        merged.push_back(clist_n[n++]);
                    }
                }
                // append whichever tail remains
                while (i < clist.size())
                    merged.push_back(clist[i++]);
                while (n < clist_n.size())
                    merged.push_back(clist_n[n++]);

                clist = std::move(merged);
            }
        }

        // rank 0 now holds the complete list; share it with all nodes
        hila::broadcast(clist);
    }
};
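// One classify() merge step, illustratively: a local list {(label 5, size 3),
// (label 9, size 2)} merged with a received {(label 5, size 4), (label 7,
// size 1)} gives {(5, 7), (7, 1), (9, 2)}: equal labels are the same cluster
// seen from two nodes, so their sizes add; the rest stays in label order.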
#if (defined(CUDA) || defined(HIP)) && !defined(HILAPP)

// GPU version: radix-sort the label buffer with cub/hipcub, then run-length
// encode the sorted labels into (label, count) pairs.
inline void clusters::make_local_clist() {

    void *d_temp_storage = nullptr;
    size_t temp_storage_bytes = 0;

    // sort the labels into a scratch buffer, leaving the field intact
    uint64_t *buf;
    gpuMalloc(&buf, lattice->mynode.volume * sizeof(uint64_t));
    GPU_CHECK(gpucub::DeviceRadixSort::SortKeys(
        d_temp_storage, temp_storage_bytes, labels.field_buffer(), buf, lattice->mynode.volume));
    gpuMalloc(&d_temp_storage, temp_storage_bytes);
    GPU_CHECK(gpucub::DeviceRadixSort::SortKeys(
        d_temp_storage, temp_storage_bytes, labels.field_buffer(), buf, lattice->mynode.volume));
    gpuFree(d_temp_storage);
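    // The paired calls follow cub's size-query convention: when d_temp_storage
    // is nullptr, the algorithm only writes the scratch size it needs into
    // temp_storage_bytes. The generic pattern (sketch):
    //
    //   Algo(nullptr, bytes, args...);  // query: sets bytes
    //   gpuMalloc(&tmp, bytes);
    //   Algo(tmp, bytes, args...);      // run with scratch space
    //   gpuFree(tmp);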
    // run-length encode: unique labels and the length of each run
    d_temp_storage = nullptr;
    temp_storage_bytes = 0;

    uint64_t *d_labels;
    int64_t *d_sizes, *d_num_clust;
    gpuMalloc(&d_labels, lattice->mynode.volume * sizeof(uint64_t));
    gpuMalloc(&d_sizes, (lattice->mynode.volume + 1) * sizeof(int64_t));
    d_num_clust = d_sizes + lattice->mynode.volume; // last slot: number of runs

    GPU_CHECK(gpucub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, buf,
                                                    d_labels, d_sizes, d_num_clust,
                                                    lattice->mynode.volume));
    gpuMalloc(&d_temp_storage, temp_storage_bytes);
    GPU_CHECK(gpucub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, buf,
                                                    d_labels, d_sizes, d_num_clust,
                                                    lattice->mynode.volume));
    gpuFree(d_temp_storage);
    gpuFree(buf);
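    // Encode produces (unique label, run length) pairs. Illustratively, a
    // sorted input {a, a, a, b, c, c} yields d_labels = {a, b, c},
    // d_sizes = {3, 1, 2} and *d_num_clust = 3.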
    int64_t nclusters;
    gpuMemcpy(&nclusters, d_num_clust, sizeof(int64_t), gpuMemcpyDeviceToHost);

    std::vector<uint64_t> labels(nclusters);
    std::vector<int64_t> sizes(nclusters);

    gpuMemcpy(labels.data(), d_labels, sizeof(uint64_t) * nclusters, gpuMemcpyDeviceToHost);
    gpuMemcpy(sizes.data(), d_sizes, sizeof(int64_t) * nclusters, gpuMemcpyDeviceToHost);

    gpuFree(d_labels);
    gpuFree(d_sizes);

    clist.resize(nclusters);
    for (int i = 0; i < nclusters; i++) {
        clist[i] = {labels[i], sizes[i], 0};
    }

    // background has the largest type bits, so those sites sort to the end:
    // if the last run is the background label, it is not a cluster and is dropped
    if (clist.size() > 0 && get_cl_label_type(clist.back().label) == hila::clusters::background)
        clist.pop_back();
}

#else
// CPU version: sort a copy of the label buffer and scan it for runs
inline void clusters::make_local_clist() {

    // work on a copy so the label field itself stays usable
    Field<uint64_t> lb = labels;
    auto *buf = lb.field_buffer();
    std::sort(buf, buf + lattice->mynode.volume);

    clist.clear();
    size_t i = 0;
    // background sorts last (its type bits are 0xFF), so stop at the first one
    while (i < lattice->mynode.volume && get_cl_label_type(buf[i]) != background) {
        uint64_t label = buf[i];
        size_t istart = i;
        for (++i; i < lattice->mynode.volume && buf[i] == label; ++i)
            ;
        clist.push_back({label, static_cast<int64_t>(i - istart), 0});
    }
}

#endif

} // namespace hila

#endif // HILA_CLUSTERS_H_