hila_gpu.cpp
#include "plumbing/defs.h"
#include "plumbing/lattice.h"
#include "plumbing/field.h"
#include "plumbing/backend_gpu/defs.h"

// hilapp needs to transform the include files above, to make them __device__
// callable...

#ifndef HILAPP

#if defined(CUDA)

#include <curand_kernel.h>

using gpurandState = curandState_t;
#define gpurand_init curand_init
#define gpurand_uniform curand_uniform
#define gpuMemcpyToSymbol(a, b, size, c, dir) GPU_CHECK(cudaMemcpyToSymbol(a, b, size, c, dir))
#define gpuGetDeviceCount(a) GPU_CHECK(cudaGetDeviceCount(a))
#define gpuSetDevice(dev) GPU_CHECK(cudaSetDevice(dev))
#define gpuGetLastError cudaGetLastError
#define gpuGetErrorString cudaGetErrorString

#elif defined(HIP)

#include <hip/hip_runtime.h>
#include <hiprand_kernel.h>

using gpurandState = hiprandState_t;
#define gpurand_init hiprand_init
#define gpurand_uniform hiprand_uniform
#define gpuMemcpyToSymbol(a, b, size, c, dir) \
    GPU_CHECK(hipMemcpyToSymbol(HIP_SYMBOL(a), b, size, c, dir))
#define gpuGetDeviceCount(a) GPU_CHECK(hipGetDeviceCount(a))
#define gpuSetDevice(dev) GPU_CHECK(hipSetDevice(dev))
#define gpuGetLastError hipGetLastError
#define gpuGetErrorString hipGetErrorString

#endif
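
// The gpurand*/gpu* aliases above give the rest of this file a single,
// backend-independent API: the same code below compiles against either the
// CUDA or the HIP runtime.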

// Save "constants" lattice size and volume here
__constant__ int64_t _d_volume;
__constant__ int _d_size[NDIM];
#ifndef EVEN_SITES_FIRST
__constant__ int _d_nodesize[NDIM];
__constant__ int _d_nodemin[NDIM];
__constant__ int _d_nodefactor[NDIM];
#endif
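
// The constants above are filled in backend_lattice_struct::setup() below via
// gpuMemcpyToSymbol(), so device code can read the lattice geometry without
// passing it as kernel arguments.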

/* Random number generator */
static gpurandState *gpurandstateptr;
__constant__ gpurandState *d_gpurandstateptr;

// check if rng on device is OK

bool hila::is_device_rng_on() {
    return gpurandstateptr != nullptr;
}

/* Set seed on device */
__global__ void seed_random_kernel(unsigned long long seed) {
    unsigned x = threadIdx.x + blockIdx.x * blockDim.x;
    // d_gpurandstateptr set now using memcpyToSymbol
    // d_gpurandstateptr = state;
    gpurand_init(seed + x, 0, 0, &d_gpurandstateptr[x]);
}

/* Set seed on device and host */
void hila::initialize_device_rng(uint64_t seed) {
    unsigned long n_blocks = (lattice.mynode.volume() + N_threads - 1) / N_threads;

#if defined(GPU_RNG_THREAD_BLOCKS) && GPU_RNG_THREAD_BLOCKS > 0
    // If the number of rng thread blocks is limited
    if (GPU_RNG_THREAD_BLOCKS < n_blocks) {
        n_blocks = GPU_RNG_THREAD_BLOCKS;
    }

    hila::out0 << "GPU random number generator initialized\n";
    hila::out0 << "GPU random number thread blocks: " << n_blocks << " of size " << N_threads
               << " threads\n";
#elif defined(GPU_RNG_THREAD_BLOCKS) && GPU_RNG_THREAD_BLOCKS < 0
    hila::out0 << "GPU RANDOM NUMBERS DISABLED, GPU_RNG_THREAD_BLOCKS < 0\n";
#else
    hila::out0 << "GPU random number generator initialized\n";
    hila::out0
        << "GPU random numbers: using one generator/site (GPU_RNG_THREAD_BLOCKS = 0 or undefined)\n";
#endif

    unsigned long long n_sites = n_blocks * N_threads;
    unsigned long long myseed = seed + hila::myrank() * n_sites;

    // allocate random state and copy the ptr to d_gpurandstateptr
    gpuMalloc(&gpurandstateptr, n_sites * sizeof(gpurandState));
    gpuMemcpyToSymbol(d_gpurandstateptr, &gpurandstateptr, sizeof(gpurandState *), 0,
                      gpuMemcpyHostToDevice);

#ifdef CUDA
    seed_random_kernel<<<n_blocks, N_threads>>>(myseed);
#else
    hipLaunchKernelGGL(seed_random_kernel, dim3(n_blocks), dim3(N_threads), 0, 0, myseed);
#endif
    check_device_error("seed_random kernel");
}

void hila::free_device_rng() {
    if (gpurandstateptr != nullptr) {
        gpuFree(gpurandstateptr);
        gpurandstateptr = nullptr;
        // set d_gpurandstateptr <- nullptr.
        gpuMemcpyToSymbol(d_gpurandstateptr, &gpurandstateptr, sizeof(gpurandState *), 0,
                          gpuMemcpyHostToDevice);

        // good to purge the memory pool after releasing a large chunk
        gpu_memory_pool_purge();
    }
}

/* Generate random numbers on device or host */
__device__ __host__ double hila::random() {
#ifdef __GPU_DEVICE_COMPILE__
    unsigned x = threadIdx.x + blockIdx.x * blockDim.x;
    return gpurand_uniform(&d_gpurandstateptr[x]);
#else
    return hila::host_random();
#endif
}
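
// Example use (sketch only; Field f, onsites(), ALL and X are HILA names that
// are not defined in this file):
//
//   hila::initialize_device_rng(42);
//   onsites(ALL) f[X] = hila::random();  // on GPU, each thread draws from its own state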

// Then, define global functions loop_lattice_size() and _volume()
__device__ __host__ int loop_lattice_size(Direction dir) {
#ifdef __GPU_DEVICE_COMPILE__
    return _d_size[dir];
#else
    return lattice.size(dir);
#endif
}
__device__ __host__ CoordinateVector loop_lattice_size(void) {
#ifdef __GPU_DEVICE_COMPILE__
    CoordinateVector v;
    foralldir(d) v[d] = _d_size[d];
    return v;
#else
    return lattice.size();
#endif
}
__device__ __host__ int64_t loop_lattice_volume(void) {
#ifdef __GPU_DEVICE_COMPILE__
    return _d_volume;
#else
    return lattice.volume();
#endif
}

#ifndef EVEN_SITES_FIRST

__device__ const CoordinateVector backend_lattice_struct::coordinates(unsigned idx) const {
    CoordinateVector c;
    unsigned vdiv, ndiv;

    vdiv = idx;
    for (int d = 0; d < NDIM - 1; ++d) {
        ndiv = vdiv / _d_nodesize[d];
        c[d] = vdiv - ndiv * _d_nodesize[d] + _d_nodemin[d];
        vdiv = ndiv;
    }
    c[NDIM - 1] = vdiv + _d_nodemin[NDIM - 1];

    return c;
}

__device__ int backend_lattice_struct::coordinate(unsigned idx, Direction dir) const {
    return (idx / _d_nodefactor[dir]) % _d_nodesize[dir] + _d_nodemin[dir];
}
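
// Illustration with hypothetical numbers (not taken from the code above): for a
// 4x4x4 node block starting at the origin, _d_nodesize = {4,4,4},
// _d_nodefactor = {1,4,16} and idx = x + 4*y + 16*z, so e.g.
// coordinate(idx, /*dir=*/1) = (idx / 4) % 4 = y.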

#endif


void backend_lattice_struct::setup(lattice_struct &lattice) {
    CoordinateVector *tmp;

    /* Setup neighbour fields in all directions */
    for (int d = 0; d < NDIRS; d++) {
        // For normal boundaries
        gpuMalloc(&(d_neighb[d]), lattice.mynode.volume() * sizeof(unsigned));
        gpuMemcpy(d_neighb[d], lattice.neighb[d], lattice.mynode.volume() * sizeof(unsigned),
                  gpuMemcpyHostToDevice);

#ifdef SPECIAL_BOUNDARY_CONDITIONS
        // For special boundaries
        // TODO: check this really works now!
        const unsigned *special_neighb =
            lattice.get_neighbour_array((Direction)d, hila::bc::ANTIPERIODIC);

        if (special_neighb != lattice.neighb[d]) {
            gpuMalloc(&(d_neighb_special[d]), lattice.mynode.volume() * sizeof(unsigned));
            gpuMemcpy(d_neighb_special[d], special_neighb,
                      lattice.mynode.volume() * sizeof(unsigned), gpuMemcpyHostToDevice);
        } else {
            d_neighb_special[d] = d_neighb[d];
        }
#endif
    }

#ifdef EVEN_SITES_FIRST
    /* Setup the location field */
    gpuMalloc(&(d_coordinates), lattice.mynode.volume() * sizeof(CoordinateVector));
    tmp = (CoordinateVector *)memalloc(lattice.mynode.volume() * sizeof(CoordinateVector));
    for (unsigned i = 0; i < lattice.mynode.volume(); i++)
        tmp[i] = lattice.coordinates(i);

    gpuMemcpy(d_coordinates, tmp, lattice.mynode.volume() * sizeof(CoordinateVector),
              gpuMemcpyHostToDevice);
    free(tmp);
#endif

    // Other backend_lattice parameters
    field_alloc_size = lattice.field_alloc_size();

    int64_t v = lattice.volume();
    gpuMemcpyToSymbol(_d_volume, &v, sizeof(int64_t), 0, gpuMemcpyHostToDevice);
    int s[NDIM];
    foralldir(d) s[d] = lattice.size(d);
    gpuMemcpyToSymbol(_d_size, s, sizeof(int) * NDIM, 0, gpuMemcpyHostToDevice);

#ifndef EVEN_SITES_FIRST
    foralldir(d) s[d] = lattice.mynode.size[d];
    gpuMemcpyToSymbol(_d_nodesize, s, sizeof(int) * NDIM, 0, gpuMemcpyHostToDevice);

    foralldir(d) s[d] = lattice.mynode.min[d];
    gpuMemcpyToSymbol(_d_nodemin, s, sizeof(int) * NDIM, 0, gpuMemcpyHostToDevice);

    foralldir(d) s[d] = lattice.mynode.size_factor[d];
    gpuMemcpyToSymbol(_d_nodefactor, s, sizeof(int) * NDIM, 0, gpuMemcpyHostToDevice);

#endif
}

void initialize_gpu(int rank, int device) {
    int n_devices, my_device;

    gpuGetDeviceCount(&n_devices);
    check_device_error("Could not get device count");
    // This assumes that each node has the same number of mpi ranks and GPUs
    // TODO: generalize (if needed)
    if (device > 0 && hila::number_of_nodes() == 1) {
        if (device >= n_devices) {
            hila::out0 << "-device " << device << ": too large device number, maximum "
                       << n_devices - 1 << " on this machine\n";
            hila::terminate(0);
        }

        my_device = device;
    } else {
        my_device = rank % n_devices;
    }

    hila::out0 << "GPU devices accessible from node 0: " << n_devices << '\n';

    // TODO: this only for node 0?
    if (n_devices > 1 && rank < 6) {
        hila::out << "GPU: MPI rank " << rank << " choosing device " << my_device << std::endl;
        if (hila::number_of_nodes() > 6) {
            hila::out0 << " + " << hila::number_of_nodes() - 6 << " more nodes\n";
        }
    }

    gpuSetDevice(my_device);

    // set gpu rng state to "off", to prevent accidental use
    gpurandstateptr = nullptr;
    // set d_gpurandstateptr <- nullptr.
    gpuMemcpyToSymbol(d_gpurandstateptr, &gpurandstateptr, sizeof(gpurandState *), 0,
                      gpuMemcpyHostToDevice);

#if defined(CUDA_MALLOC_ASYNC)
    // set memory pool
    cudaMemPool_t mempool;
    cudaDeviceGetDefaultMemPool(&mempool, my_device);
    uint64_t threshold = UINT64_MAX;
    cudaMemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &threshold);

#endif
}

#ifdef CUDA

#ifdef OPEN_MPI
// mpi-ext.h provides the functions used to query CUDA-aware MPI support
#include "mpi-ext.h"
#endif

void gpu_device_info() {
    if (hila::myrank() == 0) {
        const int kb = 1024;
        const int mb = kb * kb;

        int driverVersion, rtVersion;
        GPU_CHECK(cudaDriverGetVersion(&driverVersion));
        GPU_CHECK(cudaRuntimeGetVersion(&rtVersion));
        hila::out << "CUDA driver version: " << driverVersion << ", runtime " << rtVersion << '\n';
        hila::out << "CUDART_VERSION " << CUDART_VERSION << '\n';
#if defined(CUDA_MALLOC_ASYNC)
        if (CUDART_VERSION >= 11020) {
            hila::out << "Using cudaMallocAsync() to allocate memory\n";
        }
#endif

        cudaDeviceProp props;
        int my_device;
        GPU_CHECK(cudaGetDevice(&my_device));
        GPU_CHECK(cudaGetDeviceProperties(&props, my_device));
        hila::out << "Device on node rank 0 device " << my_device << ":\n";
        hila::out << " " << props.name << " capability: " << props.major << "." << props.minor
                  << '\n';
        hila::out << " Global memory: " << props.totalGlobalMem / mb << "MB" << '\n';
        hila::out << " Shared memory: " << props.sharedMemPerBlock / kb << "kB" << '\n';
        hila::out << " Constant memory: " << props.totalConstMem / kb << "kB" << '\n';
        hila::out << " Block registers: " << props.regsPerBlock << '\n';

        hila::out << " Warp size: " << props.warpSize << '\n';
        hila::out << " Threads per block: " << props.maxThreadsPerBlock << '\n';
        hila::out << " Max block dimensions: [ " << props.maxThreadsDim[0] << ", "
                  << props.maxThreadsDim[1] << ", " << props.maxThreadsDim[2] << " ]" << '\n';
        hila::out << " Max grid dimensions: [ " << props.maxGridSize[0] << ", "
                  << props.maxGridSize[1] << ", " << props.maxGridSize[2] << " ]" << '\n';

        hila::out << "Threads in use: " << N_threads << '\n';

// The following should be OK in OpenMPI
#ifdef OPEN_MPI
#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
        hila::out << "OpenMPI library supports CUDA-Aware MPI\n";
        if (MPIX_Query_cuda_support() == 1)
            hila::out << " Runtime library supports CUDA-Aware MPI\n";
        else {
            hila::out << " Runtime library does not support CUDA-Aware MPI!\n";
#if defined(GPU_AWARE_MPI)
            hila::out << "GPU_AWARE_MPI is defined -- THIS MAY CRASH IN MPI\n";
#endif
        }
#else
        hila::out << "OpenMPI library does not support CUDA-Aware MPI\n";
#if defined(GPU_AWARE_MPI)
        hila::out << "GPU_AWARE_MPI is defined -- THIS MAY CRASH IN MPI\n";
#endif
#endif // MPIX
#endif // OPEN_MPI
    }
}
#endif

#ifdef HIP

void gpu_device_info() {
    if (hila::myrank() == 0) {
        const int kb = 1024;
        const int mb = kb * kb;

        int driverVersion, rtVersion;
        GPU_CHECK(hipDriverGetVersion(&driverVersion));
        GPU_CHECK(hipRuntimeGetVersion(&rtVersion));
        hila::out << "HIP driver version: " << driverVersion << ", runtime " << rtVersion << '\n';

        hipDeviceProp_t props;
        int my_device;
        GPU_CHECK(hipGetDevice(&my_device));
        GPU_CHECK(hipGetDeviceProperties(&props, my_device));
        hila::out << "Device on node rank 0 device " << my_device << ":\n";
        hila::out << " " << props.name << " capability: " << props.major << "." << props.minor
                  << '\n';
        hila::out << " Global memory: " << props.totalGlobalMem / mb << "MB" << '\n';
        hila::out << " Shared memory: " << props.sharedMemPerBlock / kb << "kB" << '\n';
        hila::out << " Constant memory: " << props.totalConstMem / kb << "kB" << '\n';
        hila::out << " Block registers: " << props.regsPerBlock << '\n';

        hila::out << " Warp size: " << props.warpSize << '\n';
        hila::out << " Threads per block: " << props.maxThreadsPerBlock << '\n';
        hila::out << " Max block dimensions: [ " << props.maxThreadsDim[0] << ", "
                  << props.maxThreadsDim[1] << ", " << props.maxThreadsDim[2] << " ]" << '\n';
        hila::out << " Max grid dimensions: [ " << props.maxGridSize[0] << ", "
                  << props.maxGridSize[1] << ", " << props.maxGridSize[2] << " ]" << '\n';
        hila::out << "Threads in use: " << N_threads << '\n';
    }
}

#endif

void gpu_exit_on_error(const char *msg, const char *file, int line) {
    gpuError code = gpuGetLastError();
    if (gpuSuccess != code) {
        hila::out << GPUTYPESTR << " error: " << msg << " in file " << file << " line " << line
                  << '\n';
        hila::out << GPUTYPESTR << " error string: " << gpuGetErrorString(code) << "\n";

        hila::terminate(0);
    }
}

void gpu_exit_on_error(gpuError code, const char *msg, const char *file, int line) {
    if (gpuSuccess != code) {
        hila::out << GPUTYPESTR << " error in command: " << msg << " in file " << file << " line "
                  << line << '\n';
        hila::out << GPUTYPESTR << " error string: " << gpuGetErrorString(code) << "\n";

        hila::terminate(0);
    }
}

#endif
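
A minimal usage sketch (not part of hila_gpu.cpp): the snippet below indicates how the device RNG defined above might be used from application code. hila.h, hila::initialize(), lattice.setup(), hila::seed_random(), Field, onsites(ALL), X, Field::sum() and hila::finishrun() belong to the wider HILA API and are assumptions here (a 4-dimensional lattice, NDIM = 4, is also assumed); only hila::random() and hila::out0 come from this file.

#include "hila.h"

int main(int argc, char **argv) {
    hila::initialize(argc, argv);
    lattice.setup({32, 32, 32, 32});

    hila::seed_random(42); // seeds the host RNG and, on GPU builds, the device RNG above

    Field<double> f;
    onsites(ALL) f[X] = hila::random(); // on the device, each thread draws from its own state

    hila::out0 << "site average: " << f.sum() / lattice.volume() << '\n';

    hila::finishrun();
    return 0;
}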