HILA
Loading...
Searching...
No Matches
params.h
Go to the documentation of this file.
#ifndef PARAMS_H_
#define PARAMS_H_
/**
 * @file params.h
 * @brief This file contains #defined constants
 * @details
 * These can be overruled in application Makefile, with APP_OPTS := -DPARAMETER=value.
 *
 * There are two types of #define variables, True/False switches or parameter variables.
 *
 * True/False statements can be set with either 0 (False) or 1 (True) as -DPARAMETER=0.
 *
 * Parameter variables are set similarly with -DPARAMETER=var where var is the chosen variable
 */
15
#ifdef RELEASE
/**
 * @brief Turn off asserts which are on by default.
 * @details By defining either RELEASE or NDEBUG (No debug) asserts will be turned off.
 * Static asserts naturally remain active.
 */
#ifndef NDEBUG
// NDEBUG is the standard macro that disables <assert.h> asserts
#define NDEBUG
#endif
#endif
26
#ifndef NDIM
/**
 * @brief HILA system dimensionality
 * @details Sets HILA dimensionality, for which 4 is default. Options are 2,3,4
 */
#define NDIM 4
#endif
34
#ifndef DEFAULT_OUTPUT_NAME
/**
 * @def DEFAULT_OUTPUT_NAME
 * @brief Default output file name
 * @details Override in the application Makefile with -DDEFAULT_OUTPUT_NAME=\"name\"
 */
#define DEFAULT_OUTPUT_NAME "output"
#endif
42
#ifndef EVEN_SITES_FIRST
/**
 * @brief EVEN_SITES_FIRST is default. To traverse odd sites first set -DEVEN_SITES_FIRST=0
 */
#define EVEN_SITES_FIRST
#elif EVEN_SITES_FIRST == 0
// -DEVEN_SITES_FIRST=0 was given: remove the macro entirely, so code can
// simply use #ifdef EVEN_SITES_FIRST
#undef EVEN_SITES_FIRST
#endif
51
/// NODE_LAYOUT_TRIVIAL or NODE_LAYOUT_BLOCK determine how MPI ranks are laid out on logical
/// lattice. TRIVIAL lays out the lattice on logical order where x-direction runs fastest etc.
/// if NODE_LAYOUT_BLOCK is defined, NODE_LAYOUT_BLOCK consecutive MPI ranks are laid out so that
/// these form a compact "block" of ranks logically close together.
/// Define NODE_LAYOUT_BLOCK to be the number of
/// MPI processes within one compute node - tries to maximize the use of fast local communications.
/// Either one of these must be defined.

// Default: block layout with blocks of 4 ranks, unless NODE_LAYOUT_TRIVIAL was requested
#ifndef NODE_LAYOUT_TRIVIAL
#ifndef NODE_LAYOUT_BLOCK
#define NODE_LAYOUT_BLOCK 4
#endif
#endif
65
// Size of the write buffer in field writes, in bytes
// Larger buffer -> fewer MPI calls in writing, but more memory
#ifndef WRITE_BUFFER_SIZE
#define WRITE_BUFFER_SIZE 2000000
#endif
71
72
// boundary conditions are "off" by default -- no need to do anything here
// #ifndef SPECIAL_BOUNDARY_CONDITIONS

///////////////////////////////////////////////////////////////////////////
// Special defines for GPU targets
#if defined(CUDA) || defined(HIP)

// Use gpu memory pool by default
// set off by using -DGPU_MEMORY_POOL=0 in Makefile
#ifndef GPU_MEMORY_POOL
#define GPU_MEMORY_POOL
#elif GPU_MEMORY_POOL == 0
// -DGPU_MEMORY_POOL=0 was given: remove the macro so the pool stays disabled
#undef GPU_MEMORY_POOL
#endif
87
// GPU (cuda/hip) -aware MPI is on by default; turn off in Makefile with -DGPU_AWARE_MPI=0
#ifndef GPU_AWARE_MPI
#define GPU_AWARE_MPI 1
#elif GPU_AWARE_MPI == 0
// -DGPU_AWARE_MPI=0 was given: remove the macro entirely
#undef GPU_AWARE_MPI
#endif
94
// GPU_RNG_THREAD_BLOCKS
// Number of thread blocks (of N_threads threads) to use in onsites()-loops containing random
// numbers. GPU_RNG_THREAD_BLOCKS=0 or undefined means use one RNG on each lattice site, and the
// thread block number is not restricted. RNG takes about 48 B/generator (with XORWOW). When
// GPU_RNG_THREAD_BLOCKS > 0 only (N_threads * GPU_RNG_THREAD_BLOCKS) generators are in use, which
// reduces the memory footprint substantially (and bandwidth demand). Too small a number slows down
// onsites()-loops containing RNGs, because fewer threads are active. Example:
//     Field<Vector<4,double>> vfield;
//     onsites(ALL) {
//         vfield[X].gaussian_random();   // there's RNG here, so this onsites() is handled by
//                                        // GPU_RNG_THREAD_BLOCKS thread blocks
//     }
// GPU_RNG_THREAD_BLOCKS < 0 disables GPU random numbers entirely, and loops like above will crash
// if executed. hilapp will emit a warning, but the program is compiled.

#ifndef GPU_RNG_THREAD_BLOCKS
#define GPU_RNG_THREAD_BLOCKS 32
#endif
113
// GPU_VECTOR_REDUCTION_THREAD_BLOCKS
// # of thread blocks (of N_threads threads) used in ReductionVector (weighted histogram) ops.
// A value > 0 for GPU_VECTOR_REDUCTION_THREAD_BLOCKS means that the onsites-loop where the
// reduction is done is handled by GPU_VECTOR_REDUCTION_THREAD_BLOCKS thread blocks of N_threads
// threads. Each thread handles its own histogram, thus there are
// (GPU_VECTOR_REDUCTION_THREAD_BLOCKS*N_threads) working copies of the histogram which are then
// combined. Too small a value slows down the loop where the reduction happens; too large a value
// uses (temporarily) more memory. Example:
//     ReductionVector<double> rv(100);
//     Field<int> index;
//     ... (set index to values 0 .. 99)
//     onsites(ALL) {
//         rv[index[X]] += ..
//         ..
//     }
//
// GPU_VECTOR_REDUCTION_THREAD_BLOCKS = 0 or undefined means that the thread block number is not
// restricted and only a single histogram is used with atomic operations (atomicAdd). This
// can be slower, but the performance is GPU hardware/driver dependent. In some
// cases GPU_VECTOR_REDUCTION_THREAD_BLOCKS = 0 turns out to be faster.
//
// Default: 32 is currently OK compromise (32 thread blocks)

#ifndef GPU_VECTOR_REDUCTION_THREAD_BLOCKS
#define GPU_VECTOR_REDUCTION_THREAD_BLOCKS 32
#endif
140
// GPUFFT_BATCH_SIZE:
// How many complex fft's in parallel - large value faster, small less memory.
// Performance is reduced if the value is too small, but levels to a ~constant
// when sufficiently large.
#ifndef GPUFFT_BATCH_SIZE
#define GPUFFT_BATCH_SIZE 256
#endif
148
/** @brief GPU_SYNCHRONIZE_TIMERS : if set and !=0 synchronize GPU on timer calls, in order to
 * obtain meaningful timer values
 *
 * @details Because GPU kernel launch is an asynchronous process, the timers by default may not
 * measure the actual time used in GPU kernel execution. Defining GPU_SYNCHRONIZE_TIMERS inserts
 * GPU synchronization calls to timers. This is off by default, because this may slow down GPU
 * code. Turn on in order to measure more accurately the time spent in different parts of the code.
 */

#ifdef GPU_SYNCHRONIZE_TIMERS
#if GPU_SYNCHRONIZE_TIMERS == 0
// -DGPU_SYNCHRONIZE_TIMERS=0 was given: remove the macro so the feature is off.
// Bug fix: this previously #undef'd the misspelled name GPU_SYNCHRNONIZE_TIMERS,
// so the =0 override left GPU_SYNCHRONIZE_TIMERS defined (as 0) and #ifdef-style
// checks would still treat the feature as enabled.
#undef GPU_SYNCHRONIZE_TIMERS
#endif
#endif
163
164
/** @brief GPU_GLOBAL_ARG_MAX_SIZE : in some __global__ functions gives the max size of variable
 * passed directly as an argument of the function call. Larger value sizes are passed with
 * gpuMemcopy() and a pointer. CUDA < 12.1 limits the total parameter size to 4K, >= 12.1 it is 32K.
 * We set the default to 2K. In HIP/rocm the limit has not been determined. Passing as an arg is
 * faster, but the size limit is relevant only for big "matrices"
 */

#ifndef GPU_GLOBAL_ARG_MAX_SIZE
#define GPU_GLOBAL_ARG_MAX_SIZE 2048
#endif
175
176#endif // CUDA || HIP
177
///////////////////////////////////////////////////////////////////////////
// Special defines for CUDA target

#if defined(CUDA)

// General # of threads per thread block (see thread-block comments above)
#ifndef N_threads
#define N_threads 256
#endif


// CUDA_MALLOC_ASYNC is only relevant when the GPU memory pool is not in use
#ifndef GPU_MEMORY_POOL

// CUDA_MALLOC_ASYNC
#ifndef CUDA_MALLOC_ASYNC
// Use async malloc only if version is large enough
// NOTE: does not seem to work with OpenMPI, disable
// (the "0 &&" keeps this branch permanently off; remove it to re-enable)
#if 0 && CUDART_VERSION >= 11020
#define CUDA_MALLOC_ASYNC
#endif

#elif CUDA_MALLOC_ASYNC == 0
// -DCUDA_MALLOC_ASYNC=0 was given: remove the macro entirely
#undef CUDA_MALLOC_ASYNC
#endif

#endif // if not GPU_MEMORY_POOL

#endif // CUDA
206
///////////////////////////////////////////////////////////////////////////
// Same for HIP

#if defined(HIP)

// General # of threads per thread block (see thread-block comments above)
#ifndef N_threads
#define N_threads 256
#endif


// End of GPU defines
#endif // HIP
220
221#endif