HILA
params.h
#ifndef PARAMS_H_
#define PARAMS_H_
/**
 * @file params.h
 * @brief This file contains #defined constants
 * @details
 * These can be overridden in the application Makefile with APP_OPTS := -DPARAMETER=value.
 *
 * There are two types of #define variables: True/False switches and parameter variables.
 *
 * True/False switches are set with either 0 (False) or 1 (True), e.g. -DPARAMETER=0.
 *
 * Parameter variables are set similarly with -DPARAMETER=var, where var is the chosen value.
 */
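/*
 * Illustrative example (not part of the original header): a typical override in an
 * application Makefile could look like the line below. The variables overridden here,
 * NDIM and EVEN_SITES_FIRST, are the defines documented further down in this file.
 *
 *     APP_OPTS := -DNDIM=3 -DEVEN_SITES_FIRST=0
 */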

#ifdef RELEASE
/**
 * @brief Turn off asserts, which are on by default.
 * @details By defining either RELEASE or NDEBUG (no debug), asserts will be turned off.
 * Static asserts naturally remain active.
 */
#ifndef NDEBUG
#define NDEBUG
#endif
#endif

#ifndef NDIM
/**
 * @brief HILA system dimensionality
 * @details Sets the HILA dimensionality, for which 4 is the default. Options are 2, 3, 4.
 */
#define NDIM 4
#endif

#ifndef DEFAULT_OUTPUT_NAME
/**
 * @def DEFAULT_OUTPUT_NAME
 * @brief Default output file name
 */
#define DEFAULT_OUTPUT_NAME "output"
#endif

#ifndef EVEN_SITES_FIRST
/**
 * @brief EVEN_SITES_FIRST is the default. To traverse sites in natural (parity invariant) order,
 * use -DEVEN_SITES_FIRST=0
 */
#define EVEN_SITES_FIRST
#elif EVEN_SITES_FIRST == 0
#undef EVEN_SITES_FIRST
#endif
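
/*
 * Illustrative sketch (not part of the original header): because the switch is either
 * #defined or #undef'd above, code can branch on it with the preprocessor:
 *
 *     #ifdef EVEN_SITES_FIRST
 *     // even-parity sites are traversed first
 *     #else
 *     // sites are traversed in natural order
 *     #endif
 */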
52
53/// NODE_LAYOUT_TRIVIAL or NODE_LAYOUT_BLOCK determine how MPI ranks are laid out on logical
54/// lattice. TRIVIAL lays out the lattice on logical order where x-direction runs fastest etc.
55/// if NODE_LAYOUT_BLOCK is defined, NODE_LAYOUT_BLOCK consecutive MPI ranks are laid out so that
56/// these form a compact "block" of ranks logically close togeter.
57/// Define NODE_LAYOUT_BLOCK to be the number of
58/// MPI processes within one compute node - tries to maximize the use of fast local communications.
59/// Either one of these must be defined.
60
61#ifndef NODE_LAYOUT_TRIVIAL
62#ifndef NODE_LAYOUT_BLOCK
63#define NODE_LAYOUT_BLOCK 4
64#endif
65#endif
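
/*
 * Illustrative example (values are hypothetical): on a machine with 8 MPI ranks per
 * compute node, the application Makefile could set
 *
 *     APP_OPTS += -DNODE_LAYOUT_BLOCK=8
 *
 * so that each block of 8 consecutive ranks maps onto one node.
 */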

/// WRITE_BUFFER_SIZE
/// Size of the write buffer in field writes, in bytes.
/// Larger buffer -> fewer MPI calls in writing, but more memory.
#ifndef WRITE_BUFFER_SIZE
#define WRITE_BUFFER_SIZE 2000000
#endif


// boundary conditions are "off" by default -- no need to do anything here
// #ifndef SPECIAL_BOUNDARY_CONDITIONS

///////////////////////////////////////////////////////////////////////////
// Special defines for GPU targets
#if defined(CUDA) || defined(HIP)

/// Use GPU memory pool by default;
/// turn off by using -DGPU_MEMORY_POOL=0 in the Makefile
#ifndef GPU_MEMORY_POOL
#define GPU_MEMORY_POOL
#elif GPU_MEMORY_POOL == 0
#undef GPU_MEMORY_POOL
#endif

/// GPU_AWARE_MPI
/// By default GPU-aware MPI is on. Turn it off in the Makefile with -DGPU_AWARE_MPI=0
#ifndef GPU_AWARE_MPI
#define GPU_AWARE_MPI 1
#elif GPU_AWARE_MPI == 0
#undef GPU_AWARE_MPI
#endif
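
/*
 * Illustrative example (not part of the original header): both switches can be turned
 * off from the application Makefile, e.g.
 *
 *     APP_OPTS += -DGPU_MEMORY_POOL=0 -DGPU_AWARE_MPI=0
 */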

/// GPU_RNG_THREAD_BLOCKS
/// Number of thread blocks (of N_threads threads) to use in onsites()-loops containing random
/// numbers. GPU_RNG_THREAD_BLOCKS=0 or undefined means use one RNG on each lattice site, and the
/// thread block number is not restricted. RNG takes about 48 B/generator (with XORWOW). When
/// GPU_RNG_THREAD_BLOCKS > 0, only (N_threads * GPU_RNG_THREAD_BLOCKS) generators are in use, which
/// reduces the memory footprint (and bandwidth demand) substantially. Too small a number slows down
/// onsites()-loops containing RNGs, because fewer threads are active. Example:
///     Field<Vector<4,double>> vfield;
///     onsites(ALL) {
///         vfield[X].gaussian_random();   // there's an RNG here, so this onsites() is handled by
///                                        // GPU_RNG_THREAD_BLOCKS thread blocks
///     }
/// GPU_RNG_THREAD_BLOCKS < 0 disables GPU random numbers entirely, and loops like the above will
/// crash if executed. hilapp will emit a warning, but the program is compiled.
///
/// Default: 32 seems to be an OK compromise. Can be set to 0 if memory is not a problem.

#ifndef GPU_RNG_THREAD_BLOCKS
#define GPU_RNG_THREAD_BLOCKS 32
#endif
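
/*
 * Illustrative estimate (the lattice size is an assumption for a concrete case): on a
 * 32^4 lattice with one XORWOW generator per site, RNG state takes about
 * 32^4 * 48 B ~ 50 MB. With GPU_RNG_THREAD_BLOCKS=32 and N_threads=256 only
 * 32 * 256 = 8192 generators are used, i.e. roughly 0.4 MB.
 */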

/// GPU_VECTOR_REDUCTION_THREAD_BLOCKS
/// # of thread blocks (of N_threads threads) used in ReductionVector (weighted histogram) ops.
/// A value > 0 for GPU_VECTOR_REDUCTION_THREAD_BLOCKS means that the onsites-loop where the
/// reduction is done is handled by GPU_VECTOR_REDUCTION_THREAD_BLOCKS thread blocks of N_threads
/// threads. Each thread handles its own histogram, thus there are
/// (GPU_VECTOR_REDUCTION_THREAD_BLOCKS * N_threads) working copies of the histogram, which are then
/// combined. Too small a value slows down the loop where the reduction happens, too large a value
/// uses (temporarily) more memory. Example:
///     ReductionVector<double> rv(100);
///     Field<int> index;
///     ... (set index to values 0 .. 99)
///     onsites(ALL) {
///         rv[index[X]] += ..
///         ..
///     }
///
/// GPU_VECTOR_REDUCTION_THREAD_BLOCKS = 0 or undefined means that the thread block number is not
/// restricted and only a single histogram is used, with atomic operations (atomicAdd). This
/// can be slower, but the performance is GPU hardware/driver dependent. In some
/// cases GPU_VECTOR_REDUCTION_THREAD_BLOCKS = 0 turns out to be faster.
///
/// Default: 32 is currently an OK compromise (32 thread blocks)

#ifndef GPU_VECTOR_REDUCTION_THREAD_BLOCKS
#define GPU_VECTOR_REDUCTION_THREAD_BLOCKS 32
#endif
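
/*
 * Illustrative estimate (numbers follow the example above): with the 100-bin
 * ReductionVector<double>, GPU_VECTOR_REDUCTION_THREAD_BLOCKS=32 and N_threads=256,
 * there are 32 * 256 = 8192 working copies of the histogram, i.e.
 * 8192 * 100 * 8 B ~ 6.5 MB of temporary memory.
 */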

/// GPUFFT_BATCH_SIZE
/// How many complex FFTs are done in parallel - a large value can be faster, a small one uses
/// less memory. Performance is reduced if the value is too small, but levels off to a ~constant
/// when sufficiently large.
#ifndef GPUFFT_BATCH_SIZE
#define GPUFFT_BATCH_SIZE 256
#endif

/** @brief GPU_SYNCHRONIZE_TIMERS : if set and != 0, synchronize the GPU on timer calls, in order
 * to obtain meaningful timer values
 *
 * @details Because GPU kernel launch is an asynchronous process, the timers by default may not
 * measure the actual time used in GPU kernel execution. Defining GPU_SYNCHRONIZE_TIMERS inserts GPU
 * synchronization calls into the timers. This is off by default, because it may slow down GPU code.
 * Turn it on in order to measure more accurately the time spent in different parts of the code.
 */

#ifdef GPU_SYNCHRONIZE_TIMERS
#if GPU_SYNCHRONIZE_TIMERS == 0
#undef GPU_SYNCHRONIZE_TIMERS
#endif
#endif
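
/*
 * Illustrative sketch (generic pseudocode, not HILA API): why synchronization matters
 * when timing GPU code. Kernel launches return immediately, so without a device
 * synchronization the stop call measures only the launch overhead:
 *
 *     timer.start();
 *     my_kernel<<<blocks, N_threads>>>(...);  // returns before the kernel finishes
 *     // without a device synchronization here, timer.stop() would not include
 *     // the kernel execution time
 *     timer.stop();
 */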


/** @brief GPU_GLOBAL_ARG_MAX_SIZE : in some __global__ functions, gives the maximum size of a
 * variable passed directly as an argument of the kernel call. Larger values are passed with
 * gpuMemcopy() and a pointer. CUDA < 12.1 limits the total parameter size to 4K, >= 12.1 it is 32K.
 * We set the default to 2K; for HIP/ROCm the corresponding limit has not been found. Passing as an
 * argument is faster, but the size limit is relevant only for big "matrices".
 */

#ifndef GPU_GLOBAL_ARG_MAX_SIZE
#define GPU_GLOBAL_ARG_MAX_SIZE 2048
#endif
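
/*
 * Illustrative estimate (the matrix type is chosen as an assumption): a 16x16 complex
 * double matrix takes 16 * 16 * 16 B = 4096 B, which exceeds the 2048 B default, so it
 * would be copied with gpuMemcopy() and passed by pointer; a 4x4 complex double matrix
 * (256 B) would be passed directly as a kernel argument.
 */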

#endif // CUDA || HIP

///////////////////////////////////////////////////////////////////////////
// Special defines for CUDA target

#if defined(CUDA)

/// General number of threads in a thread block
#ifndef N_threads
#define N_threads 256
#endif


#ifndef GPU_MEMORY_POOL

// CUDA_MALLOC_ASYNC
#ifndef CUDA_MALLOC_ASYNC
// Use async malloc only if the CUDA version is recent enough
// NOTE: does not seem to work with OpenMPI, disable
#if 0 && CUDART_VERSION >= 11020
#define CUDA_MALLOC_ASYNC
#endif

#elif CUDA_MALLOC_ASYNC == 0
#undef CUDA_MALLOC_ASYNC
#endif

#endif // if not GPU_MEMORY_POOL

#endif // CUDA

///////////////////////////////////////////////////////////////////////////
// Same for HIP

#if defined(HIP)

// General number of threads in a thread block
#ifndef N_threads
#define N_threads 256
#endif


// End of GPU defines
#endif // HIP

#endif // PARAMS_H_