params.h
#ifndef PARAMS_H_
#define PARAMS_H_
/**
 * @file params.h
 * @brief This file contains #defined constants
 * @details
 * These can be overridden in the application Makefile with APP_OPTS := -DPARAMETER=value.
 *
 * There are two types of #define variables: True/False switches and parameter variables.
 *
 * True/False switches are set to either 0 (False) or 1 (True), e.g. -DPARAMETER=0.
 *
 * Parameter variables are set similarly with -DPARAMETER=value, where value is the chosen value.
 */
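// As an example of the mechanism above, a hypothetical application Makefile line could set
// several of the parameters defined in this file at once (the macro names are real, the
// values below are purely illustrative):
//
//     APP_OPTS := -DNDIM=3 -DEVEN_SITES_FIRST=0 -DWRITE_BUFFER_SIZE=4000000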

#ifdef RELEASE
/**
 * @brief Turn off asserts, which are on by default.
 * @details Defining either RELEASE or NDEBUG (no debug) turns asserts off.
 * Static asserts naturally remain active.
 */
#ifndef NDEBUG
#define NDEBUG
#endif
#endif

#ifndef NDIM
/**
 * @brief HILA system dimensionality
 * @details Sets the HILA lattice dimensionality; the default is 4. Options are 2, 3 and 4.
 */
#define NDIM 4
#endif
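// As a sketch (not part of HILA itself), an application that relies on a particular
// dimensionality can guard against an unexpected override at compile time:
//
//     static_assert(NDIM == 2 || NDIM == 3 || NDIM == 4, "unsupported NDIM");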

#ifndef DEFAULT_OUTPUT_NAME
/**
 * @def DEFAULT_OUTPUT_NAME
 * @brief Default output file name
 */
#define DEFAULT_OUTPUT_NAME "output"
#endif

#ifndef EVEN_SITES_FIRST
/**
 * @brief EVEN_SITES_FIRST is on by default. To traverse odd sites first, set -DEVEN_SITES_FIRST=0.
 */
#define EVEN_SITES_FIRST
#elif EVEN_SITES_FIRST == 0
#undef EVEN_SITES_FIRST
#endif
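// The True/False switches in this file follow the pattern above: after this point the macro is
// either defined or undefined. Code can then branch on it with the usual preprocessor test, as
// in this illustrative sketch (not a HILA internal):
//
//     #ifdef EVEN_SITES_FIRST
//         // even sites are traversed/stored before odd sites
//     #else
//         // odd sites first
//     #endif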

/// NODE_LAYOUT_TRIVIAL or NODE_LAYOUT_BLOCK determines how MPI ranks are laid out on the logical
/// lattice. TRIVIAL lays out the lattice in logical order, where the x-direction runs fastest etc.
/// If NODE_LAYOUT_BLOCK is defined, NODE_LAYOUT_BLOCK consecutive MPI ranks are laid out so that
/// they form a compact "block" of ranks logically close together.
/// Define NODE_LAYOUT_BLOCK to be the number of MPI processes within one compute node -
/// this tries to maximize the use of fast local communications.
/// One of these two must be defined.

#ifndef NODE_LAYOUT_TRIVIAL
#ifndef NODE_LAYOUT_BLOCK
#define NODE_LAYOUT_BLOCK 4
#endif
#endif
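// For example, on a hypothetical machine with 8 MPI ranks per compute node (say, one rank per
// GPU), the block size could be matched to the node in the application Makefile:
//
//     APP_OPTS += -DNODE_LAYOUT_BLOCK=8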

/// WRITE_BUFFER_SIZE
/// Size of the write buffer in field writes, in bytes.
/// A larger buffer means fewer MPI calls in writing, but more memory.
#ifndef WRITE_BUFFER_SIZE
#define WRITE_BUFFER_SIZE 2000000
#endif
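// Rough order-of-magnitude illustration (assumed numbers, not a measurement): writing a
// Field<double> on a 64^4 lattice moves 64^4 * 8 B ~ 134 MB in total, so with the default
// 2 MB buffer the data is shipped in roughly 134 MB / 2 MB ~ 70 buffer-sized pieces.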


// boundary conditions are "off" by default -- no need to do anything here
// #ifndef SPECIAL_BOUNDARY_CONDITIONS

///////////////////////////////////////////////////////////////////////////
// Special defines for GPU targets
#if defined(CUDA) || defined(HIP)

/// Use gpu memory pool by default;
/// turn off by using -DGPU_MEMORY_POOL=0 in Makefile
#ifndef GPU_MEMORY_POOL
#define GPU_MEMORY_POOL
#elif GPU_MEMORY_POOL == 0
#undef GPU_MEMORY_POOL
#endif

/// GPU_AWARE_MPI
/// By default GPU aware MPI is on. Turn it off in Makefile with -DGPU_AWARE_MPI=0
#ifndef GPU_AWARE_MPI
#define GPU_AWARE_MPI 1
#elif GPU_AWARE_MPI == 0
#undef GPU_AWARE_MPI
#endif
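// For instance, when the MPI library is not GPU aware, both conveniences above could be
// switched off in the application Makefile (hypothetical line):
//
//     APP_OPTS += -DGPU_MEMORY_POOL=0 -DGPU_AWARE_MPI=0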

/// GPU_RNG_THREAD_BLOCKS
/// Number of thread blocks (of N_threads threads) to use in onsites()-loops containing random
/// numbers. GPU_RNG_THREAD_BLOCKS=0 or undefined means use one RNG on each lattice site, and the
/// thread block number is not restricted. The RNG takes about 48 B/generator (with XORWOW). When
/// GPU_RNG_THREAD_BLOCKS > 0, only (N_threads * GPU_RNG_THREAD_BLOCKS) generators are in use,
/// which reduces the memory footprint (and bandwidth demand) substantially. Too small a number
/// slows down onsites()-loops containing RNGs, because fewer threads are active. Example:
///     Field<Vector<4,double>> vfield;
///     onsites(ALL) {
///         vfield[X].gaussian_random(); // there's RNG here, so this onsites() is handled by
///                                      // GPU_RNG_THREAD_BLOCKS thread blocks
///     }
/// GPU_RNG_THREAD_BLOCKS < 0 disables GPU random numbers entirely, and loops like the one above
/// will crash if executed. hilapp will emit a warning, but the program is still compiled.
///
/// Default: 32 seems to be an OK compromise. Can be set to 0 if memory is not a problem.

#ifndef GPU_RNG_THREAD_BLOCKS
#define GPU_RNG_THREAD_BLOCKS 32
#endif
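// Rough footprint illustration with assumed numbers: with the defaults N_threads = 256 and
// GPU_RNG_THREAD_BLOCKS = 32 there are 256 * 32 = 8192 generators, i.e. about
// 8192 * 48 B ~ 0.4 MB of RNG state. One generator per site on a 64^4 lattice would instead
// need 64^4 * 48 B ~ 0.8 GB.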

/// GPU_VECTOR_REDUCTION_THREAD_BLOCKS
/// Number of thread blocks (of N_threads threads) used in ReductionVector (weighted histogram) ops.
/// A value > 0 for GPU_VECTOR_REDUCTION_THREAD_BLOCKS means that the onsites-loop where the
/// reduction is done is handled by GPU_VECTOR_REDUCTION_THREAD_BLOCKS thread blocks of N_threads
/// threads. Each thread handles its own histogram, thus there are
/// (GPU_VECTOR_REDUCTION_THREAD_BLOCKS*N_threads) working copies of the histogram, which are then
/// combined. Too small a value slows down the loop where the reduction happens; too large a value
/// uses (temporarily) more memory. Example:
///     ReductionVector<double> rv(100);
///     Field<int> index;
///     ... (set index to values 0 .. 99)
///     onsites(ALL) {
///         rv[index[X]] += ..
///         ..
///     }
///
/// GPU_VECTOR_REDUCTION_THREAD_BLOCKS = 0 or undefined means that the thread block number is not
/// restricted and only a single histogram is used, with atomic operations (atomicAdd). This
/// can be slower, but the performance is GPU hardware/driver dependent. In some
/// cases GPU_VECTOR_REDUCTION_THREAD_BLOCKS = 0 turns out to be faster.
///
/// Default: 32 is currently an OK compromise (32 thread blocks).

#ifndef GPU_VECTOR_REDUCTION_THREAD_BLOCKS
#define GPU_VECTOR_REDUCTION_THREAD_BLOCKS 32
#endif
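// Rough illustration with assumed numbers: for the 100-bin double histogram above, the default
// 32 blocks of N_threads = 256 threads keep 32 * 256 = 8192 working copies, i.e. about
// 8192 * 100 * 8 B ~ 6.6 MB of temporary memory while the reduction loop runs.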

/// GPUFFT_BATCH_SIZE
/// How many complex FFTs are done in parallel - a large value can be faster, a small one uses
/// less memory. Performance is reduced if the value is too small, but levels off to a ~constant
/// when it is sufficiently large.
#ifndef GPUFFT_BATCH_SIZE
#define GPUFFT_BATCH_SIZE 256
#endif

/** @brief GPU_SYNCHRONIZE_TIMERS : if set and != 0, synchronize the GPU on timer calls, in order
 * to obtain meaningful timer values
 *
 * @details Because GPU kernel launch is an asynchronous process, the timers by default may not
 * measure the actual time used in GPU kernel execution. Defining GPU_SYNCHRONIZE_TIMERS inserts
 * GPU synchronization calls into the timers. This is off by default, because it may slow down
 * GPU code. Turn it on in order to measure more accurately the time spent in different parts of
 * the code.
 */

#ifdef GPU_SYNCHRONIZE_TIMERS
#if GPU_SYNCHRONIZE_TIMERS == 0
#undef GPU_SYNCHRONIZE_TIMERS
#endif
#endif
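// To get meaningful per-section GPU times one would thus build with (hypothetical Makefile
// line):
//
//     APP_OPTS += -DGPU_SYNCHRONIZE_TIMERS=1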


/** @brief GPU_GLOBAL_ARG_MAX_SIZE : in some __global__ functions, gives the maximum size of a
 * variable passed directly as an argument of the function call. Larger values are passed with
 * gpuMemcopy() and a pointer. CUDA < 12.1 limits the total parameter size to 4K; for >= 12.1 it
 * is 32K. We set the default to 2K. For HIP/ROCm we have not found the corresponding limit.
 * Passing as an argument is faster, but the size limit is relevant only for big "matrices".
 */

#ifndef GPU_GLOBAL_ARG_MAX_SIZE
#define GPU_GLOBAL_ARG_MAX_SIZE 2048
#endif
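// Size illustration (assumed element type, purely for scale): a 16x16 matrix of complex doubles
// occupies 16 * 16 * 16 B = 4096 B, exceeding the 2048 B default, so it would be passed via
// gpuMemcopy() and a pointer; an 8x8 one (1024 B) still fits as a direct kernel argument.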

#endif // CUDA || HIP

///////////////////////////////////////////////////////////////////////////
// Special defines for CUDA target

#if defined(CUDA)

/// General number of threads in a thread block
#ifndef N_threads
#define N_threads 256
#endif


#ifndef GPU_MEMORY_POOL

// CUDA_MALLOC_ASYNC
#ifndef CUDA_MALLOC_ASYNC
// Use async malloc only if version is large enough
// NOTE: does not seem to work with OpenMPI, disable
#if 0 && CUDART_VERSION >= 11020
#define CUDA_MALLOC_ASYNC
#endif

#elif CUDA_MALLOC_ASYNC == 0
#undef CUDA_MALLOC_ASYNC
#endif

#endif // if not GPU_MEMORY_POOL

#endif // CUDA

///////////////////////////////////////////////////////////////////////////
// Same for HIP

#if defined(HIP)

// General number of threads in a thread block
#ifndef N_threads
#define N_threads 256
#endif


// End of GPU defines
#endif // HIP

#endif