HILA
gpu_reduction.h
1#ifndef GPU_REDUCTION_H_
2#define GPU_REDUCTION_H_
3
4// Slightly modified from the NVIDIA reduction code
5// - Make the kernel number a fixed constant
6// - Make the number of threads a template parameter
7// - Change the main interface
8
9/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * * Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * * Neither the name of NVIDIA CORPORATION nor the names of its
20 * contributors may be used to endorse or promote products derived
21 * from this software without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
24 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
26 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
27 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
28 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
29 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
31 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 */
35
36/*
37 Parallel reduction kernels
38*/
39
40#include "hila.h"
41
42#if !defined(HILAPP)
43
44// static constexpr int whichKernel = GPU_REDUCE_KERNEL;
45// static constexpr int numThreads = N_GPU_REDUCE_THREADS;
46
47// Define what reduction kernel to use - a local variable
48// Number from 0 to 9, can benchmark ...
49// TODO: make this makefile-define!
50
51// Utility class used to avoid linker errors with extern
52// unsized shared memory arrays with templated type
53// template <class T>
54// struct SharedMemory {
55// __device__ inline operator T *() {
56// extern __shared__ int __smem[];
57// return (T *)__smem;
58// }
59
60// __device__ inline operator const T *() const {
61// extern __shared__ int __smem[];
62// return (T *)__smem;
63// }
64// };
65
66// specialize for double to avoid unaligned memory
67// access compile errors
68// template <>
69// struct SharedMemory<double> {
70// __device__ inline operator double *() {
71// extern __shared__ double __smem_d[];
72// return (double *)__smem_d;
73// }
74
75// __device__ inline operator const double *() const {
76// extern __shared__ double __smem_d[];
77// return (double *)__smem_d;
78// }
79// };
80
81// template <class T>
82// __device__ __forceinline__ T warpReduceSum(unsigned int mask, T mySum) {
83// for (int offset = warpSize / 2; offset > 0; offset /= 2) {
84// mySum += __shfl_down_sync(mask, mySum, offset);
85// }
86// return mySum;
87// }
88
89// #if __CUDA_ARCH__ >= 800
90// // Specialize warpReduceFunc for int inputs to use __reduce_add_sync intrinsic
91// // when on SM 8.0 or higher
92// template <>
93// __device__ __forceinline__ int warpReduceSum<int>(unsigned int mask, int mySum) {
94// mySum = __reduce_add_sync(mask, mySum);
95// return mySum;
96// }
97// #endif
98
99/*
100 Parallel sum reduction using shared memory
101 - takes log(n) steps for n input elements
102 - uses n threads
103 - only works for power-of-2 arrays
104*/
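/*
    For illustration: a power-of-2 block of n = 1024 elements is reduced in
    log2(1024) = 10 tree steps, with the number of working threads halving at
    every step (512, 256, ..., 1).
*/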
105
106/* This reduction interleaves which threads are active by using the modulo
107 operator. This operator is very expensive on GPUs, and the interleaved
108 inactivity means that no whole warps are active, which is also very
109 inefficient */
110// template <class T>
111// __global__ void reduce0(T *g_idata, T *g_odata, unsigned int n) {
112// // Handle to thread block group
113// cooperative_groups::thread_block cta = cooperative_groups::this_thread_block();
114// T *sdata = SharedMemory<T>();
115
116// // load shared mem
117// unsigned int tid = threadIdx.x;
118// unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
119
120// sdata[tid] = (i < n) ? g_idata[i] : 0;
121
122// cooperative_groups::sync(cta);
123
124// // do reduction in shared mem
125// for (unsigned int s = 1; s < blockDim.x; s *= 2) {
126// // modulo arithmetic is slow!
127// if ((tid % (2 * s)) == 0) {
128// sdata[tid] += sdata[tid + s];
129// }
130
131// cooperative_groups::sync(cta);
132// }
133
134// // write result for this block to global mem
135// if (tid == 0)
136// g_odata[blockIdx.x] = sdata[0];
137// }
138
139// /* This version uses contiguous threads, but its interleaved
140// addressing results in many shared memory bank conflicts.
141// */
142// template <class T>
143// __global__ void reduce1(T *g_idata, T *g_odata, unsigned int n) {
144// // Handle to thread block group
145// cooperative_groups::thread_block cta = cooperative_groups::this_thread_block();
146// T *sdata = SharedMemory<T>();
147
148// // load shared mem
149// unsigned int tid = threadIdx.x;
150// unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
151
152// sdata[tid] = (i < n) ? g_idata[i] : 0;
153
154// cooperative_groups::sync(cta);
155
156// // do reduction in shared mem
157// for (unsigned int s = 1; s < blockDim.x; s *= 2) {
158// int index = 2 * s * tid;
159
160// if (index < blockDim.x) {
161// sdata[index] += sdata[index + s];
162// }
163
164// cooperative_groups::sync(cta);
165// }
166
167// // write result for this block to global mem
168// if (tid == 0)
169// g_odata[blockIdx.x] = sdata[0];
170// }
171
172// /*
173// This version uses sequential addressing -- no divergence or bank conflicts.
174// */
175// template <class T>
176// __global__ void reduce2(T *g_idata, T *g_odata, unsigned int n) {
177// // Handle to thread block group
178// cooperative_groups::thread_block cta = cooperative_groups::this_thread_block();
179// T *sdata = SharedMemory<T>();
180
181// // load shared mem
182// unsigned int tid = threadIdx.x;
183// unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
184
185// sdata[tid] = (i < n) ? g_idata[i] : 0;
186
187// cooperative_groups::sync(cta);
188
189// // do reduction in shared mem
190// for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
191// if (tid < s) {
192// sdata[tid] += sdata[tid + s];
193// }
194
195// cooperative_groups::sync(cta);
196// }
197
198// // write result for this block to global mem
199// if (tid == 0)
200// g_odata[blockIdx.x] = sdata[0];
201// }
202
203// /*
204// This version uses n/2 threads --
205// it performs the first level of reduction when reading from global memory.
206// */
207// template <class T>
208// __global__ void reduce3(T *g_idata, T *g_odata, unsigned int n) {
209// // Handle to thread block group
210// cooperative_groups::thread_block cta = cooperative_groups::this_thread_block();
211// T *sdata = SharedMemory<T>();
212
213// // perform first level of reduction,
214// // reading from global memory, writing to shared memory
215// unsigned int tid = threadIdx.x;
216// unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
217
218// T mySum = (i < n) ? g_idata[i] : 0;
219
220// if (i + blockDim.x < n)
221// mySum += g_idata[i + blockDim.x];
222
223// sdata[tid] = mySum;
224// cooperative_groups::sync(cta);
225
226// // do reduction in shared mem
227// for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
228// if (tid < s) {
229// sdata[tid] = mySum = mySum + sdata[tid + s];
230// }
231
232// cooperative_groups::sync(cta);
233// }
234
235// // write result for this block to global mem
236// if (tid == 0)
237// g_odata[blockIdx.x] = mySum;
238// }
239
240// /*
241// This version uses the warp shuffle operation if available to reduce
242// warp synchronization. When shuffle is not available the final warp's
243// worth of work is unrolled to reduce looping overhead.
244
245// See
246// http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
247// for additional information about using shuffle to perform a reduction
248// within a warp.
249
250// Note, this kernel needs a minimum of 64*sizeof(T) bytes of shared memory.
251// In other words if blockSize <= 32, allocate 64*sizeof(T) bytes.
252// If blockSize > 32, allocate blockSize*sizeof(T) bytes.
253// */
254// template <class T, unsigned int blockSize>
255// __global__ void reduce4(T *g_idata, T *g_odata, unsigned int n) {
256// // Handle to thread block group
257// cooperative_groups::thread_block cta = cooperative_groups::this_thread_block();
258// T *sdata = SharedMemory<T>();
259
260// // perform first level of reduction,
261// // reading from global memory, writing to shared memory
262// unsigned int tid = threadIdx.x;
263// unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
264
265// T mySum = (i < n) ? g_idata[i] : 0;
266
267// if (i + blockSize < n)
268// mySum += g_idata[i + blockSize];
269
270// sdata[tid] = mySum;
271// cooperative_groups::sync(cta);
272
273// // do reduction in shared mem
274// for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1) {
275// if (tid < s) {
276// sdata[tid] = mySum = mySum + sdata[tid + s];
277// }
278
279// cooperative_groups::sync(cta);
280// }
281
282// cooperative_groups::thread_block_tile<32> tile32 =
283// cooperative_groups::tiled_partition<32>(cta);
284
285// if (cta.thread_rank() < 32) {
286// // Fetch final intermediate sum from 2nd warp
287// if (blockSize >= 64)
288// mySum += sdata[tid + 32];
289// // Reduce final warp using shuffle
290// for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
291// mySum += tile32.shfl_down(mySum, offset);
292// }
293// }
294
295// // write result for this block to global mem
296// if (cta.thread_rank() == 0)
297// g_odata[blockIdx.x] = mySum;
298// }
299
300// /*
301// This version is completely unrolled, unless warp shuffle is available, then
302// shuffle is used within a loop. It uses a template parameter to achieve
303// optimal code for any (power of 2) number of threads. This requires a switch
304// statement in the host code to handle all the different thread block sizes at
305// compile time. When shuffle is available, it is used to reduce warp
306// synchronization.
307
308// Note, this kernel needs a minimum of 64*sizeof(T) bytes of shared memory.
309// In other words if blockSize <= 32, allocate 64*sizeof(T) bytes.
310// If blockSize > 32, allocate blockSize*sizeof(T) bytes.
311// */
312// template <class T, unsigned int blockSize>
313// __global__ void reduce5(T *g_idata, T *g_odata, unsigned int n) {
314// // Handle to thread block group
315// cooperative_groups::thread_block cta = cooperative_groups::this_thread_block();
316// T *sdata = SharedMemory<T>();
317
318// // perform first level of reduction,
319// // reading from global memory, writing to shared memory
320// unsigned int tid = threadIdx.x;
321// unsigned int i = blockIdx.x * (blockSize * 2) + threadIdx.x;
322
323// T mySum = (i < n) ? g_idata[i] : 0;
324
325// if (i + blockSize < n)
326// mySum += g_idata[i + blockSize];
327
328// sdata[tid] = mySum;
329// cooperative_groups::sync(cta);
330
331// // do reduction in shared mem
332// if ((blockSize >= 512) && (tid < 256)) {
333// sdata[tid] = mySum = mySum + sdata[tid + 256];
334// }
335
336// cooperative_groups::sync(cta);
337
338// if ((blockSize >= 256) && (tid < 128)) {
339// sdata[tid] = mySum = mySum + sdata[tid + 128];
340// }
341
342// cooperative_groups::sync(cta);
343
344// if ((blockSize >= 128) && (tid < 64)) {
345// sdata[tid] = mySum = mySum + sdata[tid + 64];
346// }
347
348// cooperative_groups::sync(cta);
349
350// cooperative_groups::thread_block_tile<32> tile32 =
351// cooperative_groups::tiled_partition<32>(cta);
352
353// if (cta.thread_rank() < 32) {
354// // Fetch final intermediate sum from 2nd warp
355// if (blockSize >= 64)
356// mySum += sdata[tid + 32];
357// // Reduce final warp using shuffle
358// for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
359// mySum += tile32.shfl_down(mySum, offset);
360// }
361// }
362
363// // write result for this block to global mem
364// if (cta.thread_rank() == 0)
365// g_odata[blockIdx.x] = mySum;
366// }
367
368// /*
369// This version adds multiple elements per thread sequentially. This reduces
370// the overall cost of the algorithm while keeping the work complexity O(n) and
371// the step complexity O(log n). (Brent's Theorem optimization)
372
373// Note, this kernel needs a minimum of 64*sizeof(T) bytes of shared memory.
374// In other words if blockSize <= 32, allocate 64*sizeof(T) bytes.
375// If blockSize > 32, allocate blockSize*sizeof(T) bytes.
376// */
377// template <class T, unsigned int blockSize, bool nIsPow2>
378// __global__ void reduce6(T *g_idata, T *g_odata, unsigned int n) {
379// // Handle to thread block group
380// cooperative_groups::thread_block cta = cooperative_groups::this_thread_block();
381// T *sdata = SharedMemory<T>();
382
383// // perform first level of reduction,
384// // reading from global memory, writing to shared memory
385// unsigned int tid = threadIdx.x;
386// unsigned int gridSize = blockSize * gridDim.x;
387
388// T mySum = 0;
389
390// // we reduce multiple elements per thread. The number is determined by the
391// // number of active thread blocks (via gridDim). More blocks will result
392// // in a larger gridSize and therefore fewer elements per thread
393// if (nIsPow2) {
394// unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
395// gridSize = gridSize << 1;
396
397// while (i < n) {
398// mySum += g_idata[i];
399// // ensure we don't read out of bounds -- this is optimized away for
400// // powerOf2 sized arrays
401// if ((i + blockSize) < n) {
402// mySum += g_idata[i + blockSize];
403// }
404// i += gridSize;
405// }
406// } else {
407// unsigned int i = blockIdx.x * blockSize + threadIdx.x;
408// while (i < n) {
409// mySum += g_idata[i];
410// i += gridSize;
411// }
412// }
413
414// // each thread puts its local sum into shared memory
415// sdata[tid] = mySum;
416// cooperative_groups::sync(cta);
417
418// // do reduction in shared mem
419// if ((blockSize >= 512) && (tid < 256)) {
420// sdata[tid] = mySum = mySum + sdata[tid + 256];
421// }
422
423// cooperative_groups::sync(cta);
424
425// if ((blockSize >= 256) && (tid < 128)) {
426// sdata[tid] = mySum = mySum + sdata[tid + 128];
427// }
428
429// cooperative_groups::sync(cta);
430
431// if ((blockSize >= 128) && (tid < 64)) {
432// sdata[tid] = mySum = mySum + sdata[tid + 64];
433// }
434
435// cooperative_groups::sync(cta);
436
437// cooperative_groups::thread_block_tile<32> tile32 =
438// cooperative_groups::tiled_partition<32>(cta);
439
440// if (cta.thread_rank() < 32) {
441// // Fetch final intermediate sum from 2nd warp
442// if (blockSize >= 64)
443// mySum += sdata[tid + 32];
444// // Reduce final warp using shuffle
445// for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
446// mySum += tile32.shfl_down(mySum, offset);
447// }
448// }
449
450// // write result for this block to global mem
451// if (cta.thread_rank() == 0)
452// g_odata[blockIdx.x] = mySum;
453// }
454
455// /*
456// Kernel 7
457// */
458
459// template <typename T, unsigned int blockSize, bool nIsPow2>
460// __global__ void reduce7(const T *__restrict__ g_idata, T *__restrict__ g_odata,
461// unsigned int n) {
462// T *sdata = SharedMemory<T>();
463
464// // perform first level of reduction,
465// // reading from global memory, writing to shared memory
466// unsigned int tid = threadIdx.x;
467// unsigned int gridSize = blockSize * gridDim.x;
468// unsigned int maskLength = (blockSize & 31); // 31 = warpSize-1
469// maskLength = (maskLength > 0) ? (32 - maskLength) : maskLength;
470// const unsigned int mask = (0xffffffff) >> maskLength;
471
472// T mySum = 0;
473
474// // we reduce multiple elements per thread. The number is determined by the
475// // number of active thread blocks (via gridDim). More blocks will result
476// // in a larger gridSize and therefore fewer elements per thread
477// if (nIsPow2) {
478// unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
479// gridSize = gridSize << 1;
480
481// while (i < n) {
482// mySum += g_idata[i];
483// // ensure we don't read out of bounds -- this is optimized away for
484// // powerOf2 sized arrays
485// if ((i + blockSize) < n) {
486// mySum += g_idata[i + blockSize];
487// }
488// i += gridSize;
489// }
490// } else {
491// unsigned int i = blockIdx.x * blockSize + threadIdx.x;
492// while (i < n) {
493// mySum += g_idata[i];
494// i += gridSize;
495// }
496// }
497
498// // Reduce within warp using shuffle or reduce_add if T==int & CUDA_ARCH ==
499// // SM 8.0
500// mySum = warpReduceSum<T>(mask, mySum);
501
502// // each thread puts its local sum into shared memory
503// if ((tid % warpSize) == 0) {
504// sdata[tid / warpSize] = mySum;
505// }
506
507// __syncthreads();
508
509// const unsigned int shmem_extent =
510// (blockSize / warpSize) > 0 ? (blockSize / warpSize) : 1;
511// const unsigned int ballot_result = __ballot_sync(mask, tid < shmem_extent);
512// if (tid < shmem_extent) {
513// mySum = sdata[tid];
514// // Reduce final warp using shuffle or reduce_add if T==int & CUDA_ARCH ==
515// // SM 8.0
516// mySum = warpReduceSum<T>(ballot_result, mySum);
517// }
518
519// // write result for this block to global mem
520// if (tid == 0) {
521// g_odata[blockIdx.x] = mySum;
522// }
523// }
524
525// /*
526// Kernel 8 gc_reduce
527// */
528
529// // Performs a reduction step and updates numTotal with how many are remaining
530// template <typename T, typename Group>
531// __device__ T cg_reduce_n(T in, Group &threads) {
532// return cooperative_groups::reduce(threads, in, cooperative_groups::plus<T>());
533// }
534
535// template <class T>
536// __global__ void cg_reduce(T *g_idata, T *g_odata, unsigned int n) {
537// // Shared memory for intermediate steps
538// T *sdata = SharedMemory<T>();
539// // Handle to thread block group
540// cooperative_groups::thread_block cta = cooperative_groups::this_thread_block();
541// // Handle to tile in thread block
542// cooperative_groups::thread_block_tile<32> tile =
543// cooperative_groups::tiled_partition<32>(cta);
544
545// unsigned int ctaSize = cta.size();
546// unsigned int numCtas = gridDim.x;
547// unsigned int threadRank = cta.thread_rank();
548// unsigned int threadIndex = (blockIdx.x * ctaSize) + threadRank;
549
550// T threadVal = 0;
551// {
552// unsigned int i = threadIndex;
553// unsigned int indexStride = (numCtas * ctaSize);
554// while (i < n) {
555// threadVal += g_idata[i];
556// i += indexStride;
557// }
558// sdata[threadRank] = threadVal;
559// }
560
561// // Wait for all tiles to finish and reduce within CTA
562// {
563// unsigned int ctaSteps = tile.meta_group_size();
564// unsigned int ctaIndex = ctaSize >> 1;
565// while (ctaIndex >= 32) {
566// cta.sync();
567// if (threadRank < ctaIndex) {
568// threadVal += sdata[threadRank + ctaIndex];
569// sdata[threadRank] = threadVal;
570// }
571// ctaSteps >>= 1;
572// ctaIndex >>= 1;
573// }
574// }
575
576// // Shuffle redux instead of smem redux
577// {
578// cta.sync();
579// if (tile.meta_group_rank() == 0) {
580// threadVal = cg_reduce_n(threadVal, tile);
581// }
582// }
583
584// if (threadRank == 0)
585// g_odata[blockIdx.x] = threadVal;
586// }
587
588// /*
589// Kernel 9
590// */
591
592// template <class T, size_t BlockSize, size_t MultiWarpGroupSize>
593// __global__ void multi_warp_cg_reduce(T *g_idata, T *g_odata, unsigned int n) {
594// // Shared memory for intermediate steps
595// T *sdata = SharedMemory<T>();
596// __shared__ cooperative_groups::experimental::block_tile_memory<sizeof(T), BlockSize>
597// scratch;
598
599// // Handle to thread block group
600// auto cta = cooperative_groups::experimental::this_thread_block(scratch);
601// // Handle to multiWarpTile in thread block
602// auto multiWarpTile =
603// cooperative_groups::experimental::tiled_partition<MultiWarpGroupSize>(cta);
604
605// unsigned int gridSize = BlockSize * gridDim.x;
606// T threadVal = 0;
607
608// // we reduce multiple elements per thread. The number is determined by the
609// // number of active thread blocks (via gridDim). More blocks will result
610// // in a larger gridSize and therefore fewer elements per thread
611// int nIsPow2 = !(n & n - 1);
612// if (nIsPow2) {
613// unsigned int i = blockIdx.x * BlockSize * 2 + threadIdx.x;
614// gridSize = gridSize << 1;
615
616// while (i < n) {
617// threadVal += g_idata[i];
618// // ensure we don't read out of bounds -- this is optimized away for
619// // powerOf2 sized arrays
620// if ((i + BlockSize) < n) {
621// threadVal += g_idata[i + blockDim.x];
622// }
623// i += gridSize;
624// }
625// } else {
626// unsigned int i = blockIdx.x * BlockSize + threadIdx.x;
627// while (i < n) {
628// threadVal += g_idata[i];
629// i += gridSize;
630// }
631// }
632// threadVal = cg_reduce_n(threadVal, multiWarpTile);
633
634// if (multiWarpTile.thread_rank() == 0) {
635// sdata[multiWarpTile.meta_group_rank()] = threadVal;
636// }
637// cooperative_groups::sync(cta);
638
639// if (threadIdx.x == 0) {
640// threadVal = 0;
641// for (int i = 0; i < multiWarpTile.meta_group_size(); i++) {
642// threadVal += sdata[i];
643// }
644// g_odata[blockIdx.x] = threadVal;
645// }
646// }
647
648////////////////////////////////////////////////////////////////////////////////
649
650// inline bool isPow2(unsigned int x) {
651// return ((x & (x - 1)) == 0);
652// }
653
654////////////////////////////////////////////////////////////////////////////////
655// Wrapper function for kernel launch
656// Now make kernel number a template parameter, also number of threads
657////////////////////////////////////////////////////////////////////////////////
658
659// template <class T, int threads>
660// void reduce_kernel(int size, int blocks, T *d_idata, T *d_odata) {
661// dim3 dimBlock(threads, 1, 1);
662// dim3 dimGrid(blocks, 1, 1);
663
664// // when there is only one warp per block, we need to allocate two warps
665// // worth of shared memory so that we don't index shared memory out of bounds
666// int smemSize = (threads <= 32) ? 2 * threads * sizeof(T) : threads * sizeof(T);
667
668// // choose which of the optimized versions of reduction to launch
669// switch (whichKernel) {
670// case 0:
671// reduce0<T><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
672// break;
673
674// case 1:
675// reduce1<T><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
676// break;
677
678// case 2:
679// reduce2<T><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
680// break;
681
682// case 3:
683// reduce3<T><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
684// break;
685
686// case 4:
687// reduce4<T, threads><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
688// break;
689
690// case 5:
691// reduce5<T, threads><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
692// break;
693
694// case 6:
695// if (isPow2(size)) {
696// reduce6<T, threads, true>
697// <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
698// } else {
699// reduce6<T, threads, false>
700// <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
701// }
702
703// break;
704
705// case 7:
706// // For reduce7 kernel we require only blockSize/warpSize
707// // number of elements in shared memory
708// smemSize = ((threads / 32) + 1) * sizeof(T);
709// if (isPow2(size)) {
710// reduce7<T, threads, true>
711// <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
712// } else {
713// reduce7<T, threads, false>
714// <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
715// }
716// break;
717
718// case 8:
719// cg_reduce<T><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
720// break;
721
722// case 9:
723// constexpr int numOfMultiWarpGroups = 2;
724// smemSize = numOfMultiWarpGroups * sizeof(T);
725
726// static_assert(threads >= 64,
727// "thread block size of < 64 is not supported for this kernel");
728// multi_warp_cg_reduce<T, threads, threads / numOfMultiWarpGroups>
729// <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
730// break;
731// }
732// }
733
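/*
   Final reduction stage, launched as a single block: each thread strides over the
   num_blocks per-block candidates produced by minmax_kernel, and the surviving
   values are combined with a shared-memory tree reduction. The coordinate index is
   carried along with each candidate value. sign_min_or_max = +1 selects the
   minimum, -1 the maximum.
*/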
734template <class T, typename index_type>
735__global__ void minmax_kernel_final(const T *i_data, T *minmax_array_out,
736 index_type *coordinate_index_array,
737 index_type *coordinate_index_array_out,
738 int num_blocks, const int sign_min_or_max) {
739 int thIdx = threadIdx.x;
740 T buffer_value, minmax_value = i_data[thIdx];
741 index_type coordinate_idx = coordinate_index_array[thIdx];
742
743 __shared__ T minmax_values[N_threads];
744 __shared__ index_type coordinates[N_threads];
745
746 for (index_type i = thIdx; i < num_blocks; i += N_threads) {
747 buffer_value = i_data[i];
748 if (minmax_value * sign_min_or_max > buffer_value * sign_min_or_max) {
749 minmax_value = buffer_value;
750 coordinate_idx = coordinate_index_array[i];
751 }
752 }
753
754 minmax_values[thIdx] = minmax_value;
755 coordinates[thIdx] = coordinate_idx;
756 __syncthreads();
757
758 for (int size = blockDim.x / 2; size > 0; size /= 2) {
759 if (thIdx < size) {
760 if (minmax_values[thIdx] * sign_min_or_max >
761 minmax_values[thIdx + size] * sign_min_or_max) {
762 minmax_values[thIdx] = minmax_values[thIdx + size];
763 coordinates[thIdx] = coordinates[thIdx + size];
764            }
765        }
766        __syncthreads(); // all threads in the block must reach the barrier together
767    }
768 if (thIdx == 0) {
769 minmax_array_out[blockIdx.x] = minmax_values[0];
770 coordinate_index_array_out[blockIdx.x] = coordinates[0];
771 }
772}
773
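/*
   First reduction stage: each block grid-strides over the site range
   [loop_info.loop_begin, loop_info.loop_end) of i_data, keeps its block-local
   extremum (and the index where it was found) in shared memory, and writes the
   block result to element blockIdx.x of minmax_array_out and
   coordinate_index_array_out.
*/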
774template <class T, typename index_type>
775__global__ void minmax_kernel(const T *i_data, T *minmax_array_out,
776 index_type *coordinate_index_array_out,
777 backend_lattice_struct loop_info,
778 const int sign_min_or_max, T const initial_value) {
779 index_type thIdx = threadIdx.x;
780 index_type gthIdx = thIdx + blockIdx.x * N_threads + loop_info.loop_begin;
781 const index_type grid_size = N_threads * gridDim.x;
782 T minmax_value = initial_value;
783 index_type coordinate_idx = gthIdx;
784 T buffer_value;
785
786 __shared__ T minmax_values[N_threads];
787 __shared__ index_type coordinates[N_threads];
788
789    for (index_type i = gthIdx; i < loop_info.loop_end; i += grid_size) {
790 buffer_value = i_data[i];
791 if (minmax_value * sign_min_or_max > buffer_value * sign_min_or_max) {
792 minmax_value = buffer_value;
793 coordinate_idx = i;
794 }
795 }
796
797 minmax_values[thIdx] = minmax_value;
798 coordinates[thIdx] = coordinate_idx;
799 __syncthreads();
800
801 for (index_type size = N_threads / 2; size > 0; size /= 2) {
802 if (thIdx < size) {
803 if (minmax_values[thIdx] * sign_min_or_max >
804 minmax_values[thIdx + size] * sign_min_or_max) {
805 minmax_values[thIdx] = minmax_values[thIdx + size];
806 coordinates[thIdx] = coordinates[thIdx + size];
807 }
808
809 }
810 __syncthreads();
811 }
812 if (thIdx == 0) {
813 minmax_array_out[blockIdx.x] = minmax_values[0];
814 coordinate_index_array_out[blockIdx.x] = coordinates[0];
815 }
816}
817
818////////////////////////////////////////////////////////////////////////////////
819
820////////////////////////////////////////////////////////////////////////////////
821// Compute the number of (threads and) blocks to use for the given reduction
822// kernel. For the kernels >= 3, we set threads / block to the minimum of
823// maxThreads and n/2. For kernels < 3, we set to the minimum of maxThreads and
824// n. For kernel 6, we observe the maximum specified number of blocks, because
825// each thread in that kernel can process a variable number of elements.
826////////////////////////////////////////////////////////////////////////////////
827
828////////////////////////////////////////////////////////////////////////////////
829// This function performs a reduction of the input data
830// d_odata is the device-side buffer holding the per-block partial results
831////////////////////////////////////////////////////////////////////////////////
832// template <class T>
833// T gpu_reduce(int size, T *d_idata, bool keep_buffers) {
834
835// // Reasonable defaults? TODO: make adjustable
836// int maxBlocks = 64; // only for kernels >= 6
837
838// int numBlocks = 0;
839
840// if (whichKernel < 3) {
841// numBlocks = (size + numThreads - 1) / numThreads;
842// } else {
843// numBlocks = (size + (numThreads * 2 - 1)) / (numThreads * 2);
844// if (whichKernel >= 6) {
845// if (numBlocks < maxBlocks)
846// numBlocks = maxBlocks;
847// }
848// }
849
850// static T *d_odata = nullptr;
851// static T *h_odata = nullptr;
852
853// // allocate buffers if not done before
854// if (d_odata == nullptr) {
855// gpuMalloc(&d_odata, numBlocks * sizeof(T));
856// }
857// if (h_odata == nullptr) {
858// h_odata = (T *)memalloc(numBlocks * sizeof(T));
859// }
860
861// // execute the kernel
862// reduce_kernel<T, numThreads>(size, numBlocks, d_idata, d_odata);
863// check_device_error("reduction");
864
865// // sum partial sums from each block on CPU
866// // copy result from device to host
867// gpuMemcpy(h_odata, d_odata, numBlocks * sizeof(T), gpuMemcpyDeviceToHost);
868
869// T gpu_result = 0;
870// for (int i = 0; i < numBlocks; i++) {
871// gpu_result += h_odata[i];
872// }
873
874// // release buffers if not needed again
875// if (!keep_buffers) {
876// free(h_odata);
877// gpuFree(d_odata);
878// h_odata = nullptr;
879// d_odata = nullptr;
880// }
881
882// return gpu_result;
883// }
884
885// The commented-out "for testing" parts below make it possible to inspect the return arrays between the per-block reduction and the final single-block reduction.
886template <class T>
887std::pair<T, unsigned> gpu_launch_minmax_kernel(T *field_data, int node_system_size,
888 bool min_or_max, backend_lattice_struct lattice_info) {
889
890 using index_type = unsigned long;
891 int num_blocks = (node_system_size + N_threads - 1) / N_threads;
892    // CUDA takes a performance hit if num_blocks grows too large at very large system sizes, so cap it
893 if (num_blocks > 2048) num_blocks = 2048;
894 int block_size = N_threads;
895 int const sign_min_or_max = min_or_max ? 1 : -1;
896 T const initial_value =
897 min_or_max ? std::numeric_limits<T>::max() : std::numeric_limits<T>::min();
898
899 T *minmax_array;
900 index_type *coordinate_index_array;
901 T return_value_host;
902 index_type coordinate_index;
903
904 // for testing
905 // T *return_value_list = new T[num_blocks];
906 // index_type *coordinate_list = new index_type[num_blocks];
907
908 gpuMalloc(&minmax_array, sizeof(T) * num_blocks);
909 gpuMalloc(&coordinate_index_array, sizeof(index_type) * num_blocks);
910
911    // First stage: find num_blocks block-local max or min values,
912    // together with the coordinate index of each
913 minmax_kernel<<<num_blocks, block_size>>>(field_data, minmax_array,
914 coordinate_index_array, lattice_info,
915 sign_min_or_max, initial_value);
916 check_device_error("minmax_kernel");
917 // For testing
918 // cudaMemcpy(return_value_list, minmax_array, sizeof(T) * num_blocks,
919 // cudaMemcpyDeviceToHost);
920 // cudaMemcpy(coordinate_list, coordinate_index_array, sizeof(index_type) * num_blocks,
921 // cudaMemcpyDeviceToHost);
922
923    // If there are fewer than N_threads blocks, the final kernel is launched with only num_blocks threads
924 if (num_blocks < block_size) block_size = num_blocks;
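    // Example (illustrative numbers): node_system_size = 10000 with N_threads = 256
    // gives num_blocks = 40, so the final kernel below runs with 40 threads.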
925
926 // Find global max or min from num_blocks amount of values
927 minmax_kernel_final<<<1, block_size>>>(
928 minmax_array, minmax_array, coordinate_index_array, coordinate_index_array,
929 num_blocks, sign_min_or_max);
930 check_device_error("minmax_kernel_final");
931
932    // The value and location of the max or min will be in the first element of the return arrays
933 gpuMemcpy(&return_value_host, minmax_array, sizeof(T), gpuMemcpyDeviceToHost);
934 gpuMemcpy(&coordinate_index, coordinate_index_array, sizeof(index_type),
935 gpuMemcpyDeviceToHost);
936
937    // for testing
938 // cudaMemcpy(return_value_list, minmax_array, sizeof(T) * num_blocks,
939 // cudaMemcpyDeviceToHost);
940 // cudaMemcpy(coordinate_list, coordinate_index_array, sizeof(index_type) * num_blocks,
941 // cudaMemcpyDeviceToHost);
942
943 gpuFree(minmax_array);
944 gpuFree(coordinate_index_array);
945 // for testing
946 // for (auto i = 0; i < 1; i++) {
947 // std::cout << return_value_list[0] << ": value, " << coordinate_list[0]
948 // << ": index, \n";
949 // }
950 return std::make_pair(return_value_host, coordinate_index);
951}
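/*
   Data flow of gpu_launch_minmax_kernel above:
     field_data (node_system_size values)
       -> minmax_kernel over num_blocks blocks: one candidate value + site index per block
       -> minmax_kernel_final in one block:     global extremum placed in element 0
       -> gpuMemcpy of element 0 to the host:   returned as std::pair<value, coordinate index>
*/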
952
953/////////////////////////////////////////////////////////////////////////////////////////
954
955#else // if !defined(HILAPP)
956
957// just declare the name
958// template <class T>
959// T gpu_reduce(int size, T *d_idata, bool keep_buffers);
960
961template <class T>
962std::pair<T, unsigned> gpu_launch_minmax_kernel(T *field_data, int node_system_size,
963 bool min_or_max, backend_lattice_struct lattice_info);
964
965#endif // ifndef HILAPP
966
967#include "com_mpi.h"
968
969/////////////////////////////////////////////////////////////////////////////////////////
970// Reduce field var over the lattice
971
972// template <typename T>
973// T Field<T>::gpu_reduce_sum(bool allreduce, Parity par, bool do_mpi) const {
974
975// #ifndef EVEN_SITES_FIRST
976// assert(par == Parity::all &&
977//            "EVEN_SITES_FIRST needed for gpu reduction with parity");
978// #endif
979
980// using base_t = hila::arithmetic_type<T>;
981// constexpr int n = sizeof(T) / sizeof(base_t);
982
983// // fields are stored 1 after another on gpu fields
984// // -- this really relies on the data layout!
985// T *fb = this->field_buffer();
986// // address data with bptr
987// base_t *bptr = (base_t *)(fb);
988// const lattice_struct *lat = this->fs->lattice;
989
990// // use this union to set the value element by element
991// union {
992// T value;
993// base_t element[n];
994// } result;
995
996// unsigned fsize = lat->field_alloc_size();
997// unsigned size = lat->mynode.volume();
998
999// if (par == Parity::even) {
1000// size = lat->mynode.evensites;
1001// } else if (par == Parity::odd) {
1002// size = lat->mynode.oddsites;
1003// bptr += lat->mynode.evensites;
1004// }
1005
1006// for (int i = 0; i < n; i++) {
1007// // keep buffers until last element
1008// result.element[i] = gpu_reduce<base_t>(size, bptr, i < (n - 1));
1009// bptr += fsize;
1010// }
1011
1012// // we don't always do MPI - not in generated loops
1013// if (do_mpi) {
1014// MPI_Datatype dtype;
1015// int s;
1016
1017// dtype = _type<T>(s);
1018// assert(dtype != MPI_BYTE && "Unknown number_type in gpu_reduce_sum");
1019
1020// if (allreduce) {
1021// MPI_Allreduce(MPI_IN_PLACE, &result.value, size, dtype, MPI_SUM,
1022// this->fs->lattice.mpi_comm_lat);
1023// } else {
1024// MPI_Reduce(MPI_IN_PLACE, &result.value, size, dtype, MPI_SUM, 0,
1025// this->fs->lattice.mpi_comm_lat);
1026// }
1027// }
1028
1029// return result.value;
1030// }
1031
1032template <typename T>
1033T Field<T>::gpu_minmax(bool min_or_max, Parity par, CoordinateVector &loc) const {
1034 backend_lattice_struct lattice_info = *(lattice.backend_lattice);
1035 lattice_info.loop_begin = lattice.loop_begin(par);
1036 lattice_info.loop_end = lattice.loop_end(par);
1037 unsigned const node_system_size = lattice_info.loop_end - lattice_info.loop_begin;
1038
1039 std::pair<T, unsigned> value_and_coordinate =
1040 gpu_launch_minmax_kernel(this->field_buffer(), node_system_size, min_or_max, lattice_info);
1041 loc = lattice.coordinates(value_and_coordinate.second);
1042 return value_and_coordinate.first;
1043}
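/*
   Illustrative sketch only (hypothetical user code, not part of this header):
   calling gpu_minmax directly on a GPU build. The field f and its contents are
   made up for the example; as the launcher above implies, min_or_max = true
   returns the minimum and false the maximum, with loc set to its location.

     Field<double> f;
     f = 1.0;                                           // fill the field somehow
     CoordinateVector loc;
     double mn = f.gpu_minmax(true, Parity::all, loc);  // minimum and its coordinates
     double mx = f.gpu_minmax(false, Parity::all, loc); // maximum and its coordinates
*/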
1044
1045#endif
Referenced symbols:
  T gpu_minmax(bool min_or_max, Parity par, CoordinateVector &loc) const: "Declare gpu_reduce here, defined only for GPU targets."
  Parity: enum with values EVEN, ODD, ALL; refers to the parity of a site.
  N_threads: general number of threads in a thread block (definition: params.h:189).