HILA — params.h (generated documentation page for this file)
1#ifndef HILA_PARAMS_H_
2#define HILA_PARAMS_H_
/**
 * @file params.h
 * @brief This file contains #defined constants
 * @details
 * These can be overruled in the application Makefile, with APP_OPTS := -DPARAMETER=value.
 *
 * There are two types of #define variables: True/False switches and parameter variables.
 *
 * True/False switches can be set with either 0 (False) or 1 (True), as -DPARAMETER=0.
 *
 * Parameter variables are set similarly with -DPARAMETER=var, where var is the chosen value.
 */
15
#ifdef RELEASE
/**
 * @brief Disable asserts (they are enabled by default).
 * @details Defining either RELEASE or NDEBUG (no debug) turns run-time asserts off.
 * Static asserts naturally remain active.
 */
#ifndef NDEBUG
#define NDEBUG
#endif
#endif
26
#ifndef NDIM
/**
 * @brief HILA system dimensionality
 * @details Sets the HILA dimensionality, for which 4 is the default. Options are 2, 3, 4.
 */
#define NDIM 4
#endif
34
#ifndef DEFAULT_OUTPUT_NAME
/**
 * @def DEFAULT_OUTPUT_NAME
 * @brief Default output file name
 */
#define DEFAULT_OUTPUT_NAME "output"
#endif
42
#ifndef EVEN_SITES_FIRST
/**
 * @brief EVEN_SITES_FIRST is the default. To traverse sites in natural (parity invariant)
 * order use -DEVEN_SITES_FIRST=0
 */
#define EVEN_SITES_FIRST
#elif EVEN_SITES_FIRST == 0
// The user asked for natural site order: remove the switch entirely so that
// code can simply test it with #ifdef.
#undef EVEN_SITES_FIRST
#endif
52
/// NODE_LAYOUT_TRIVIAL or NODE_LAYOUT_BLOCK determine how MPI ranks are laid out on the logical
/// lattice. TRIVIAL lays out the lattice in logical order, where the x-direction runs fastest etc.
/// If NODE_LAYOUT_BLOCK is defined, NODE_LAYOUT_BLOCK consecutive MPI ranks are laid out so that
/// these form a compact "block" of ranks logically close together.
/// Define NODE_LAYOUT_BLOCK to be the number of
/// MPI processes within one compute node - tries to maximize the use of fast local communications.
/// Either one of these must be defined.

#ifndef NODE_LAYOUT_TRIVIAL
#ifndef NODE_LAYOUT_BLOCK
#define NODE_LAYOUT_BLOCK 4
#endif
#endif
66
/// WRITE_BUFFER_SIZE
/// Size of the write buffer in field writes, in bytes.
/// A larger buffer means fewer MPI calls when writing, but more memory use.
#ifndef WRITE_BUFFER_SIZE
#define WRITE_BUFFER_SIZE 2000000
#endif
73
74
75// boundary conditions are "off" by default -- no need to do anything here
76// #ifndef SPECIAL_BOUNDARY_CONDITIONS
77
78///////////////////////////////////////////////////////////////////////////
79// Special defines for GPU targets
80#if defined(CUDA) || defined(HIP)
81
/// The GPU memory pool is used by default;
/// disable it with -DGPU_MEMORY_POOL=0 in the Makefile
#ifndef GPU_MEMORY_POOL
#define GPU_MEMORY_POOL
#elif GPU_MEMORY_POOL == 0
#undef GPU_MEMORY_POOL
#endif
89
/// GPU_AWARE_MPI
/// GPU-aware MPI is on by default. Disable it in the Makefile with -DGPU_AWARE_MPI=0
#ifndef GPU_AWARE_MPI
#define GPU_AWARE_MPI 1
#elif GPU_AWARE_MPI == 0
#undef GPU_AWARE_MPI
#endif
97
/// GPU_RNG_THREAD_BLOCKS
/// Number of thread blocks (of N_threads threads) to use in onsites()-loops containing random
/// numbers. GPU_RNG_THREAD_BLOCKS=0 or undefined means use one RNG on each lattice site, and the
/// thread block number is not restricted. RNG takes about 48 B/generator (with XORWOW). When
/// GPU_RNG_THREAD_BLOCKS > 0 only (N_threads * GPU_RNG_THREAD_BLOCKS) generators are in use,
/// which reduces the memory footprint substantially (and bandwidth demand). Too small a number
/// slows down onsites()-loops containing RNGs, because fewer threads are active. Example:
///     Field<Vector<4,double>> vfield;
///     onsites(ALL) {
///         vfield[X].gaussian_random(); // there's an RNG here, so this onsites() is handled by
///                                      // GPU_RNG_THREAD_BLOCKS thread blocks
///     }
/// GPU_RNG_THREAD_BLOCKS < 0 disables GPU random numbers entirely, and loops like the above will
/// crash if executed. hilapp will emit a warning, but the program is compiled.
///
/// Default: 32 seems to be an OK compromise. Can be set to 0 if memory is not a problem.

#ifndef GPU_RNG_THREAD_BLOCKS
#define GPU_RNG_THREAD_BLOCKS 32
#endif
118
/// GPU_VECTOR_REDUCTION_THREAD_BLOCKS
/// # of thread blocks (of N_threads threads) used in ReductionVector (weighted histogram) ops.
/// A value > 0 for GPU_VECTOR_REDUCTION_THREAD_BLOCKS means that the onsites-loop where the
/// reduction is done is handled by GPU_VECTOR_REDUCTION_THREAD_BLOCKS thread blocks of N_threads
/// threads. Each thread handles its own histogram, thus there are
/// (GPU_VECTOR_REDUCTION_THREAD_BLOCKS*N_threads) working copies of the histogram which are then
/// combined. Too small a value slows down the loop where this computation happens, too large
/// uses (temporarily) more memory. Example:
///     ReductionVector<double> rv(100);
///     Field<int> index;
///     ... (set index to values 0 .. 99)
///     onsites(ALL) {
///         rv[index[X]] += ..
///         ..
///     }
///
/// GPU_VECTOR_REDUCTION_THREAD_BLOCKS = 0 or undefined means that the thread block number is not
/// restricted and only a single histogram is used with atomic operations (atomicAdd). This
/// can be slower, but the performance is GPU hardware/driver dependent. In some
/// cases GPU_VECTOR_REDUCTION_THREAD_BLOCKS = 0 turns out to be faster.
///
/// Default: 32 is currently an OK compromise (32 thread blocks)

#ifndef GPU_VECTOR_REDUCTION_THREAD_BLOCKS
#define GPU_VECTOR_REDUCTION_THREAD_BLOCKS 32
#endif
145
146
/// GPU_BLOCK_REDUCTION_THREADS defines the max threads per block for block reduction.
/// ReductionVector uses cub::BlockReduce with this many threads per block. Too large a
/// value can exhaust the resources on GPUs, which will give a runtime error.
/// NOTE(review): the comment previously named this "GPU_VECTOR_REDUCTION_THREADS", but the
/// macro actually defined and used is GPU_BLOCK_REDUCTION_THREADS.

#ifndef GPU_BLOCK_REDUCTION_THREADS
#define GPU_BLOCK_REDUCTION_THREADS 128
#endif
154
155
/// GPU_VECTOR_REDUCTION_SIZE_THRESHOLD is an optimization parameter. If the reduction size
/// is larger than the threshold, we use a "single pass" kernel; if smaller, a hierarchical
/// kernel launch. Both use the same amount of memory. The algorithms become
/// correspondingly better at extreme ends, but around 500-1000 there is
/// a slow changeover, depending on computing hardware.
/// NOT USED IN PRESENT ReductionVector implementation

// #ifndef GPU_VECTOR_REDUCTION_SIZE_THRESHOLD
// #define GPU_VECTOR_REDUCTION_SIZE_THRESHOLD 700
// #endif
166
167
/// GPUFFT_BATCH_SIZE
/// How many complex FFTs are done in parallel - a large value can be faster, a small one
/// uses less memory. Performance is reduced if the value is too small, but levels off to
/// a ~constant when sufficiently large.
#ifndef GPUFFT_BATCH_SIZE
#define GPUFFT_BATCH_SIZE 256
#endif
175
/** @brief GPU_SYNCHRONIZE_TIMERS : if set and !=0 synchronize GPU on timer calls, in order to
 * obtain meaningful timer values
 *
 * @details Because GPU kernel launch is an asynchronous process, the timers by default may not
 * measure the actual time used in GPU kernel execution. Defining GPU_SYNCHRONIZE_TIMERS inserts
 * GPU synchronization calls to timers. This is off by default, because this may slow down GPU
 * code. Turn on in order to measure more accurately the time spent in different parts of the
 * code.
 */

#ifdef GPU_SYNCHRONIZE_TIMERS
#if GPU_SYNCHRONIZE_TIMERS == 0
// BUGFIX: previously #undef'd the misspelled name GPU_SYNCHRNONIZE_TIMERS, so
// -DGPU_SYNCHRONIZE_TIMERS=0 failed to actually turn the switch off.
#undef GPU_SYNCHRONIZE_TIMERS
#endif
#endif
190
191
/** @brief GPU_GLOBAL_ARG_MAX_SIZE : in some __global__ functions gives the max size of a
 * variable passed directly as an argument of the function call. Larger sizes are passed with
 * gpuMemcopy() and a pointer. CUDA < 12.1 limits the total parameter size to 4K, >= 12.1 it is
 * 32K. We set the default to 2K. In HIP/ROCm the corresponding limit has not been determined.
 * Passing as an arg is faster, but the size limit is relevant only for big "matrices".
 */

#ifndef GPU_GLOBAL_ARG_MAX_SIZE
#define GPU_GLOBAL_ARG_MAX_SIZE 2048
#endif
202
203#endif // CUDA || HIP
204
205///////////////////////////////////////////////////////////////////////////
206// Special defines for CUDA target
207
#if defined(CUDA)

/// General number of threads in a thread block
#ifndef N_threads
#define N_threads 256
#endif


#ifndef GPU_MEMORY_POOL

// CUDA_MALLOC_ASYNC
#ifndef CUDA_MALLOC_ASYNC
// Async malloc would be used only when the CUDA runtime is recent enough,
// but it does not seem to work with OpenMPI, so it is disabled here.
#if 0 && CUDART_VERSION >= 11020
#define CUDA_MALLOC_ASYNC
#endif

#elif CUDA_MALLOC_ASYNC == 0
#undef CUDA_MALLOC_ASYNC
#endif

#endif // if not GPU_MEMORY_POOL

#endif // CUDA
233
234///////////////////////////////////////////////////////////////////////////
235// Same for HIP
236
#if defined(HIP)

/// General number of threads in a thread block
#ifndef N_threads
#define N_threads 256
#endif


// End of GPU defines
#endif // HIP
247
248#endif