1#ifndef GPU_REDUCTION_H_
2#define GPU_REDUCTION_H_
/**
 * Final reduction step: collapse the per-block partial min/max results
 * produced by minmax_kernel into a single value and its coordinate index.
 *
 * Launch configuration: a single block; host clamps blockDim.x so that
 * blockDim.x <= num_blocks (the unguarded initial loads below rely on it).
 * blockDim.x need NOT be a power of two — the tree reduction uses a
 * ceil-half step so every partial is folded in.
 *
 * \param i_data                     per-block partial min/max values (num_blocks entries)
 * \param minmax_array_out           element [0] receives the final min/max value
 * \param coordinate_index_array     coordinate index of each partial result
 * \param coordinate_index_array_out element [0] receives the winning coordinate index
 * \param num_blocks                 number of partial results to reduce
 * \param sign_min_or_max            +1 selects minimum search, -1 maximum search
 */
template <class T, typename index_type>
__global__ void minmax_kernel_final(const T *i_data, T *minmax_array_out,
                                    index_type *coordinate_index_array,
                                    index_type *coordinate_index_array_out, int num_blocks,
                                    const int sign_min_or_max) {
    int thIdx = threadIdx.x;
    // Safe only because the host guarantees blockDim.x <= num_blocks.
    T buffer_value, minmax_value = i_data[thIdx];
    index_type coordinate_idx = coordinate_index_array[thIdx];
    __shared__ T minmax_values[N_threads];
    __shared__ index_type coordinates[N_threads];
    // Serial fold: each thread accumulates every N_threads-th partial into a
    // private running extremum (multiplying by sign turns max into min).
    for (index_type i = thIdx; i < num_blocks; i += N_threads) {
        buffer_value = i_data[i];
        if (minmax_value * sign_min_or_max > buffer_value * sign_min_or_max) {
            minmax_value = buffer_value;
            coordinate_idx = coordinate_index_array[i];
        }
    }
    minmax_values[thIdx] = minmax_value;
    coordinates[thIdx] = coordinate_idx;
    __syncthreads();
    // Tree reduction over shared memory. The ceil-half step (rather than
    // blockDim.x/2 halving) stays correct for non-power-of-two block sizes,
    // which the host produces when num_blocks < N_threads.
    for (int active = blockDim.x; active > 1;) {
        int half = (active + 1) / 2;
        if (thIdx + half < active) {
            if (minmax_values[thIdx] * sign_min_or_max >
                minmax_values[thIdx + half] * sign_min_or_max) {
                minmax_values[thIdx] = minmax_values[thIdx + half];
                coordinates[thIdx] = coordinates[thIdx + half];
            }
        }
        // Barrier deliberately outside the divergent branch: all threads reach it.
        __syncthreads();
        active = half;
    }
    // Single writer avoids a redundant racy broadcast store.
    if (thIdx == 0) {
        minmax_array_out[blockIdx.x] = minmax_values[0];
        coordinate_index_array_out[blockIdx.x] = coordinates[0];
    }
}
/**
 * First reduction step: each block scans its grid-stride slice of the field
 * data and writes one partial min/max value plus the coordinate index where
 * it occurred.
 *
 * Launch configuration: blockDim.x == N_threads (the host always launches
 * with block_size = N_threads); N_threads must be a power of two for the
 * halving tree reduction below.
 *
 * \param i_data                     field data on device
 * \param minmax_array_out           per-block partial extremum, indexed by blockIdx.x
 * \param coordinate_index_array_out per-block coordinate index of the extremum
 * \param loop_info                  site-loop bounds (loop_begin / loop_end)
 *                                   NOTE(review): parameter type taken from the
 *                                   project's backend lattice struct — confirm.
 * \param sign_min_or_max            +1 selects minimum search, -1 maximum search
 * \param initial_value              sentinel the running extremum starts from
 */
template <class T, typename index_type>
__global__ void minmax_kernel(const T *i_data, T *minmax_array_out,
                              index_type *coordinate_index_array_out,
                              backend_lattice_struct loop_info, const int sign_min_or_max,
                              T const initial_value) {
    index_type thIdx = threadIdx.x;
    index_type gthIdx = thIdx + blockIdx.x * N_threads + loop_info.loop_begin;
    const index_type grid_size = N_threads * gridDim.x;
    T buffer_value;
    T minmax_value = initial_value;
    index_type coordinate_idx = gthIdx;
    __shared__ T minmax_values[N_threads];
    __shared__ index_type coordinates[N_threads];
    // Grid-stride loop over [loop_begin, loop_end): each thread keeps a
    // private running extremum and the index it was found at.
    for (index_type i = gthIdx; i < loop_info.loop_end; i += grid_size) {
        buffer_value = i_data[i];
        if (minmax_value * sign_min_or_max > buffer_value * sign_min_or_max) {
            minmax_value = buffer_value;
            // Record where the new extremum lives, not just its value.
            coordinate_idx = i;
        }
    }
    minmax_values[thIdx] = minmax_value;
    coordinates[thIdx] = coordinate_idx;
    __syncthreads();
    // Power-of-two tree reduction across the block's shared arrays.
    for (index_type size = N_threads / 2; size > 0; size /= 2) {
        if (thIdx < size) {
            if (minmax_values[thIdx] * sign_min_or_max >
                minmax_values[thIdx + size] * sign_min_or_max) {
                minmax_values[thIdx] = minmax_values[thIdx + size];
                coordinates[thIdx] = coordinates[thIdx + size];
            }
        }
        // Barrier outside the divergent branch so every thread reaches it.
        __syncthreads();
    }
    if (thIdx == 0) {
        minmax_array_out[blockIdx.x] = minmax_values[0];
        coordinate_index_array_out[blockIdx.x] = coordinates[0];
    }
}
/**
 * Host wrapper: find the min or max of field_data over node_system_size
 * consecutive sites, returning (value, coordinate index).
 *
 * Two-pass scheme: minmax_kernel produces one partial per block, then
 * minmax_kernel_final reduces the partials with a single block.
 *
 * \param field_data       device pointer to the field buffer
 * \param node_system_size number of sites to scan on this node
 * \param min_or_max       true = minimum search, false = maximum search
 * \param lattice_info     loop bounds passed through to the kernel
 * \return pair of (extremal value, coordinate index where it occurs)
 */
template <typename T>
std::pair<T, unsigned> gpu_launch_minmax_kernel(T *field_data, int node_system_size,
                                                bool min_or_max,
                                                backend_lattice_struct lattice_info) {
    using index_type = unsigned long;
    int num_blocks = (node_system_size + N_threads - 1) / N_threads; // ceil-div
    // Cap the grid; the grid-stride loop in minmax_kernel covers the rest.
    if (num_blocks > 2048)
        num_blocks = 2048;
    int block_size = N_threads;
    int const sign_min_or_max = min_or_max ? 1 : -1;
    // lowest(), not min(): for floating T, min() is the smallest POSITIVE
    // normal value, which would make a maximum search over all-negative data
    // return the sentinel. lowest() == min() for integer T, so this is a
    // backward-compatible fix.
    T const initial_value =
        min_or_max ? std::numeric_limits<T>::max() : std::numeric_limits<T>::lowest();
    T *minmax_array;
    index_type *coordinate_index_array;
    T return_value_host;
    index_type coordinate_index;
    gpuMalloc(&minmax_array, sizeof(T) * num_blocks);
    gpuMalloc(&coordinate_index_array, sizeof(index_type) * num_blocks);
    minmax_kernel<<<num_blocks, block_size>>>(field_data, minmax_array, coordinate_index_array,
                                              lattice_info, sign_min_or_max, initial_value);
    check_device_error("minmax_kernel");
    // The final kernel loads i_data[threadIdx.x] unguarded, so its block must
    // not be wider than the number of partials it reduces.
    if (num_blocks < block_size)
        block_size = num_blocks;
    // Reduce in place: outputs alias the inputs, only element [0] is consumed.
    minmax_kernel_final<<<1, block_size>>>(minmax_array, minmax_array, coordinate_index_array,
                                           coordinate_index_array, num_blocks, sign_min_or_max);
    check_device_error("minmax_kernel_final");
    // Blocking copies double as synchronization before reading the result.
    gpuMemcpy(&return_value_host, minmax_array, sizeof(T), gpuMemcpyDeviceToHost);
    gpuMemcpy(&coordinate_index, coordinate_index_array, sizeof(index_type),
              gpuMemcpyDeviceToHost);
    gpuFree(minmax_array);
    gpuFree(coordinate_index_array);
    return std::make_pair(return_value_host, coordinate_index);
}
962std::pair<T, unsigned> gpu_launch_minmax_kernel(T *field_data,
int node_system_size,
/**
 * Find the min or max of this field over sites of parity par on the GPU.
 * On return, loc holds the coordinates of the extremal site.
 *
 * \param min_or_max true = minimum, false = maximum
 * \param par        site parity to scan (EVEN, ODD or ALL)
 * \param loc        out-parameter: coordinates of the extremum
 * \return the extremal field value
 */
template <typename T>
T Field<T>::gpu_minmax(bool min_or_max, Parity par, CoordinateVector &loc) const {
    // NOTE(review): the signature line and this initialization were lost in
    // extraction; reconstructed from the declaration residue and the body's
    // use of lattice_info — confirm against the repository.
    backend_lattice_struct lattice_info = *(lattice.backend_lattice);
    // Restrict the site loop to the requested parity range.
    lattice_info.loop_begin = lattice.loop_begin(par);
    lattice_info.loop_end = lattice.loop_end(par);
    unsigned const node_system_size = lattice_info.loop_end - lattice_info.loop_begin;
    std::pair<T, unsigned> value_and_coordinate =
        gpu_launch_minmax_kernel(this->field_buffer(), node_system_size, min_or_max, lattice_info);
    // Translate the flat site index back into lattice coordinates.
    loc = lattice.coordinates(value_and_coordinate.second);
    return value_and_coordinate.first;
}
// T gpu_minmax(bool min_or_max, Parity par, CoordinateVector &loc) const
// Declare gpu_reduce here, defined only for GPU targets.
// Parity: enum with values EVEN, ODD, ALL; refers to the parity of a lattice
// site (the parity of site (x, y, ...) is determined by the coordinate sum).
// Helper class for loading the vectorized lattice.