hila_gpu.cpp

#include "plumbing/defs.h"
#include "plumbing/lattice.h"
#include "plumbing/field.h"
#include "plumbing/backend_gpu/defs.h"

// hilapp needs to transform the include files above, to make them __device__
// callable...

#ifndef HILAPP

#if defined(CUDA)

#include <curand_kernel.h>

using gpurandState = curandState_t;
#define gpurand_init curand_init
#define gpurand_uniform curand_uniform
#define gpuGetDeviceCount(a) GPU_CHECK(cudaGetDeviceCount(a))
#define gpuSetDevice(dev) GPU_CHECK(cudaSetDevice(dev))
#define gpuGetLastError cudaGetLastError
#define gpuGetErrorString cudaGetErrorString

#elif defined(HIP)

#include <hip/hip_runtime.h>
#include <hiprand/hiprand_kernel.h>

using gpurandState = hiprandState_t;
#define gpurand_init hiprand_init
#define gpurand_uniform hiprand_uniform
#define gpuGetDeviceCount(a) GPU_CHECK(hipGetDeviceCount(a))
#define gpuSetDevice(dev) GPU_CHECK(hipSetDevice(dev))
#define gpuGetLastError hipGetLastError
#define gpuGetErrorString hipGetErrorString

#endif

// Save "constants" lattice size and volume here
__constant__ int64_t _d_volume;
// __constant__ int _d_size[NDIM];
__constant__ CoordinateVector _d_size;
#ifndef EVEN_SITES_FIRST
__constant__ int _d_nodesize[NDIM];
__constant__ int _d_nodemin[NDIM];
__constant__ int _d_nodefactor[NDIM];
#endif

/* Random number generator */
static gpurandState *gpurandstateptr;
__constant__ gpurandState *d_gpurandstateptr;

// Check if the RNG state on the device is allocated and ready to use
bool hila::is_device_rng_on() {
    return gpurandstateptr != nullptr;
}

/* Set seed on device */
__global__ void seed_random_kernel(unsigned long long seed) {
    unsigned x = threadIdx.x + blockIdx.x * blockDim.x;
    // d_gpurandstateptr set now using memcpyToSymbol
    // d_gpurandstateptr = state;
    gpurand_init(seed + x, 0, 0, &d_gpurandstateptr[x]);
}

/* Set seed on device and host */
void hila::initialize_device_rng(uint64_t seed) {
    unsigned long n_blocks = (lattice.mynode.volume() + N_threads - 1) / N_threads;

#if defined(GPU_RNG_THREAD_BLOCKS) && GPU_RNG_THREAD_BLOCKS > 0
    // If the number of rng thread blocks is limited
    if (GPU_RNG_THREAD_BLOCKS < n_blocks) {
        n_blocks = GPU_RNG_THREAD_BLOCKS;
    }

    hila::out0 << "GPU random number generator initialized\n";
    hila::out0 << "GPU random number thread blocks: " << n_blocks << " of size " << N_threads
               << " threads\n";
#elif defined(GPU_RNG_THREAD_BLOCKS) && GPU_RNG_THREAD_BLOCKS < 0
    hila::out0 << "GPU RANDOM NUMBERS DISABLED, GPU_RNG_THREAD_BLOCKS < 0\n";
#else
    hila::out0 << "GPU random number generator initialized\n";
    hila::out0
        << "GPU random numbers: using one generator/site (GPU_RNG_THREAD_BLOCKS = 0 or undefined)\n";
#endif

    unsigned long long n_sites = n_blocks * N_threads;
    unsigned long long myseed = seed + hila::myrank() * n_sites;

    // allocate random state and copy the ptr to d_gpurandstateptr
    gpuMalloc(&gpurandstateptr, n_sites * sizeof(gpurandState));
    gpuMemcpyToSymbol(d_gpurandstateptr, &gpurandstateptr, sizeof(gpurandState *), 0,
                      gpuMemcpyHostToDevice);

#ifdef CUDA
    seed_random_kernel<<<n_blocks, N_threads>>>(myseed);
#else
    hipLaunchKernelGGL(seed_random_kernel, dim3(n_blocks), dim3(N_threads), 0, 0, myseed);
#endif
    check_device_error("seed_random kernel");
}

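// Sketch of typical use of the device RNG set up above (illustration only; in a
// HILA application the site loop below is transformed by hilapp):
//
//   hila::initialize_device_rng(seed);    // once, after lattice setup
//   Field<double> f;
//   onsites(ALL) f[X] = hila::random();   // each GPU thread draws from its own
//                                         // gpurandState in d_gpurandstateptr
//   hila::free_device_rng();              // release the device state when done
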
void hila::free_device_rng() {
    if (gpurandstateptr != nullptr) {
        gpuFree(gpurandstateptr);
        gpurandstateptr = nullptr;
        // set d_gpurandstateptr <- nullptr.
        gpuMemcpyToSymbol(d_gpurandstateptr, &gpurandstateptr, sizeof(gpurandState *), 0,
                          gpuMemcpyHostToDevice);

        // good to purge the memory pool after releasing a large chunk
        gpu_memory_pool_purge();
    }
}

/* Generate random numbers on device or host */
__device__ __host__ double hila::random() {
#ifdef __GPU_DEVICE_COMPILE__
    unsigned x = threadIdx.x + blockIdx.x * blockDim.x;
    return gpurand_uniform(&d_gpurandstateptr[x]);
#else
    return hila::host_random();
#endif
}

// Then, define global functions loop_lattice_size() and _volume()
__device__ __host__ int loop_lattice_size(Direction dir) {
#ifdef __GPU_DEVICE_COMPILE__
    return _d_size[dir];
#else
    return lattice.size(dir);
#endif
}

__device__ __host__ CoordinateVector loop_lattice_size(void) {
#ifdef __GPU_DEVICE_COMPILE__
    // CoordinateVector v;
    // foralldir(d) v[d] = _d_size[d];
    // return v;
    return _d_size;
#else
    return lattice.size();
#endif
}

__device__ __host__ int64_t loop_lattice_volume(void) {
#ifdef __GPU_DEVICE_COMPILE__
    return _d_volume;
#else
    return lattice.volume();
#endif
}

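// Sketch of how these helpers are meant to be used (assumption for illustration;
// in practice hilapp generates such calls when the lattice size or volume is
// referenced inside a site loop): the same expression compiles in both passes,
//
//   Field<double> f;
//   onsites(ALL) f[X] = 1.0 / loop_lattice_volume();
//
// which reads the __constant__ _d_volume in the __device__ compilation and
// calls lattice.volume() on the host.
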
#ifndef EVEN_SITES_FIRST

__device__ const CoordinateVector backend_lattice_struct::coordinates(unsigned idx) const {
    CoordinateVector c;
    unsigned vdiv, ndiv;

    vdiv = idx;
    for (int d = 0; d < NDIM - 1; ++d) {
        ndiv = vdiv / _d_nodesize[d];
        c[d] = vdiv - ndiv * _d_nodesize[d] + _d_nodemin[d];
        vdiv = ndiv;
    }
    c[NDIM - 1] = vdiv + _d_nodemin[NDIM - 1];

    return c;
}

__device__ int backend_lattice_struct::coordinate(unsigned idx, Direction dir) const {
    return (idx / _d_nodefactor[dir]) % _d_nodesize[dir] + _d_nodemin[dir];
}

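// Worked example of the index decoding above (illustration only, assuming
// NDIM = 4 and a node-local size of 4 in every direction, so the strides
// _d_nodefactor are {1, 4, 16, 64}): site index idx = 37 decodes to
//   coordinate(37, e_x) = (37 / 1)  % 4 = 1
//   coordinate(37, e_y) = (37 / 4)  % 4 = 1
//   coordinate(37, e_z) = (37 / 16) % 4 = 2
//   coordinate(37, e_t) = (37 / 64) % 4 = 0
// plus the node offset _d_nodemin[d] in each direction. coordinates(37) walks
// the same division chain and returns {1, 1, 2, 0} shifted by the node minimum.
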
#endif


void backend_lattice_struct::setup(lattice_struct &lattice) {
    CoordinateVector *tmp;

    /* Setup neighbour fields in all directions */
    for (int d = 0; d < NDIRS; d++) {
        // For normal boundaries
        gpuMalloc(&(d_neighb[d]), lattice.mynode.volume() * sizeof(unsigned));
        gpuMemcpy(d_neighb[d], lattice.neighb[d], lattice.mynode.volume() * sizeof(unsigned),
                  gpuMemcpyHostToDevice);

#ifdef SPECIAL_BOUNDARY_CONDITIONS
        // For special boundaries
        // TODO: check this really works now!
        const unsigned *special_neighb =
            lattice.get_neighbour_array((Direction)d, hila::bc::ANTIPERIODIC);

        if (special_neighb != lattice.neighb[d]) {
            gpuMalloc(&(d_neighb_special[d]), lattice.mynode.volume() * sizeof(unsigned));
            gpuMemcpy(d_neighb_special[d], special_neighb,
                      lattice.mynode.volume() * sizeof(unsigned), gpuMemcpyHostToDevice);
        } else {
            d_neighb_special[d] = d_neighb[d];
        }
#endif
    }

#ifdef EVEN_SITES_FIRST
    /* Setup the location field */
    gpuMalloc(&(d_coordinates), lattice.mynode.volume() * sizeof(CoordinateVector));
    tmp = (CoordinateVector *)memalloc(lattice.mynode.volume() * sizeof(CoordinateVector));
    for (unsigned i = 0; i < lattice.mynode.volume(); i++)
        tmp[i] = lattice.coordinates(i);

    gpuMemcpy(d_coordinates, tmp, lattice.mynode.volume() * sizeof(CoordinateVector),
              gpuMemcpyHostToDevice);
    free(tmp);
#endif

    // Other backend_lattice parameters
    field_alloc_size = lattice.field_alloc_size();

    int64_t v = lattice.volume();
    gpuMemcpyToSymbol(_d_volume, &v, sizeof(int64_t), 0, gpuMemcpyHostToDevice);
    int s[NDIM];
    foralldir(d) s[d] = lattice.size(d);
    gpuMemcpyToSymbol(_d_size, s, sizeof(int) * NDIM, 0, gpuMemcpyHostToDevice);

#ifndef EVEN_SITES_FIRST
    foralldir(d) s[d] = lattice.mynode.size[d];
    gpuMemcpyToSymbol(_d_nodesize, s, sizeof(int) * NDIM, 0, gpuMemcpyHostToDevice);

    foralldir(d) s[d] = lattice.mynode.min[d];
    gpuMemcpyToSymbol(_d_nodemin, s, sizeof(int) * NDIM, 0, gpuMemcpyHostToDevice);

    foralldir(d) s[d] = lattice.mynode.size_factor[d];
    gpuMemcpyToSymbol(_d_nodefactor, s, sizeof(int) * NDIM, 0, gpuMemcpyHostToDevice);

#endif
}

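// The host -> __constant__ mirroring used above is the standard CUDA/HIP idiom;
// a minimal standalone sketch with hypothetical names (not part of this file's
// interface):
//
//   __constant__ int d_example;      // device-side "constant"
//   int h_example = 42;              // host-side value
//   gpuMemcpyToSymbol(d_example, &h_example, sizeof(int), 0, gpuMemcpyHostToDevice);
//   // kernels compiled in this translation unit can now read d_example directly
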
void initialize_gpu(int rank, int device) {
    int n_devices, my_device;

    gpuGetDeviceCount(&n_devices);
    check_device_error("Could not get device count");
    // This assumes that each node has the same number of mpi ranks and GPUs
    // TODO: generalize (if needed)
    if (device > 0 && hila::number_of_nodes() == 1) {
        if (device >= n_devices) {
            hila::out0 << "-device " << device << ": too large device number, maximum "
                       << n_devices - 1 << " on this machine\n";
            hila::terminate(0);
        }

        my_device = device;
    } else {
        my_device = rank % n_devices;
    }


    hila::out0 << "GPU devices accessible from node 0: " << n_devices << '\n';

    // TODO: this only for node 0?
    if (n_devices > 1 && rank < 6) {
        hila::out << "GPU: MPI rank " << rank << " choosing device " << my_device << std::endl;
        if (hila::number_of_nodes() > 6) {
            hila::out0 << " + " << hila::number_of_nodes() - 6 << " more nodes\n";
        }
    }

    gpuSetDevice(my_device);

    // set gpu rng state to "off", to prevent accidental use
    gpurandstateptr = nullptr;
    // set d_gpurandstateptr <- nullptr.
    gpuMemcpyToSymbol(d_gpurandstateptr, &gpurandstateptr, sizeof(gpurandState *), 0,
                      gpuMemcpyHostToDevice);


#if defined(CUDA_MALLOC_ASYNC)
    // set memory pool
    cudaMemPool_t mempool;
    cudaDeviceGetDefaultMemPool(&mempool, my_device);
    uint64_t threshold = UINT64_MAX;
    cudaMemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &threshold);

#endif
}

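// Worked example of the default rank-to-device mapping above (illustration
// only): with n_devices = 4 GPUs visible on a node hosting 8 MPI ranks,
// my_device = rank % n_devices gives
//   rank:   0 1 2 3 4 5 6 7
//   device: 0 1 2 3 0 1 2 3
// i.e. ranks are distributed round-robin over the local GPUs.
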
#ifdef CUDA

#ifdef OPEN_MPI
// functions to inquire about CUDA-aware MPI are defined here
#include "mpi-ext.h"
#endif

void gpu_device_info() {
    if (hila::myrank() == 0) {
        const int kb = 1024;
        const int mb = kb * kb;

        int driverVersion, rtVersion;
        GPU_CHECK(cudaDriverGetVersion(&driverVersion));
        GPU_CHECK(cudaRuntimeGetVersion(&rtVersion));
        hila::out << "CUDA driver version: " << driverVersion << ", runtime " << rtVersion << '\n';
        hila::out << "CUDART_VERSION " << CUDART_VERSION << '\n';
#if defined(CUDA_MALLOC_ASYNC)
        if (CUDART_VERSION >= 11020) {
            hila::out << "Using cudaMallocAsync() to allocate memory\n";
        }
#endif

        cudaDeviceProp props;
        int my_device;
        GPU_CHECK(cudaGetDevice(&my_device));
        GPU_CHECK(cudaGetDeviceProperties(&props, my_device));
        hila::out << "Device on node rank 0 device " << my_device << ":\n";
        hila::out << " " << props.name << " capability: " << props.major << "." << props.minor
                  << '\n';
        hila::out << " Global memory: " << props.totalGlobalMem / mb << "MB" << '\n';
        hila::out << " Shared memory: " << props.sharedMemPerBlock / kb << "kB" << '\n';
        hila::out << " Constant memory: " << props.totalConstMem / kb << "kB" << '\n';
        hila::out << " Block registers: " << props.regsPerBlock << '\n';

        hila::out << " Warp size: " << props.warpSize << '\n';
        hila::out << " Threads per block: " << props.maxThreadsPerBlock << '\n';
        hila::out << " Max block dimensions: [ " << props.maxThreadsDim[0] << ", "
                  << props.maxThreadsDim[1] << ", " << props.maxThreadsDim[2] << " ]" << '\n';
        hila::out << " Max grid dimensions: [ " << props.maxGridSize[0] << ", "
                  << props.maxGridSize[1] << ", " << props.maxGridSize[2] << " ]" << '\n';

        hila::out << "Threads in use: " << N_threads << '\n';

// The following should be OK in OpenMPI
#ifdef OPEN_MPI
#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
        hila::out << "OpenMPI library supports CUDA-Aware MPI\n";
        if (MPIX_Query_cuda_support() == 1)
            hila::out << " Runtime library supports CUDA-Aware MPI\n";
        else {
            hila::out << " Runtime library does not support CUDA-Aware MPI!\n";
#if defined(GPU_AWARE_MPI)
            hila::out << "GPU_AWARE_MPI is defined -- THIS MAY CRASH IN MPI\n";
#endif
        }
#else
        hila::out << "OpenMPI library does not support CUDA-Aware MPI\n";
#if defined(GPU_AWARE_MPI)
        hila::out << "GPU_AWARE_MPI is defined -- THIS MAY CRASH IN MPI\n";
#endif
#endif // MPIX
#endif // OPEN_MPI
    }
}
#endif

#ifdef HIP

void gpu_device_info() {
    if (hila::myrank() == 0) {
        const int kb = 1024;
        const int mb = kb * kb;

        int driverVersion, rtVersion;
        GPU_CHECK(hipDriverGetVersion(&driverVersion));
        GPU_CHECK(hipRuntimeGetVersion(&rtVersion));
        hila::out << "HIP driver version: " << driverVersion << ", runtime " << rtVersion << '\n';

        hipDeviceProp_t props;
        int my_device;
        GPU_CHECK(hipGetDevice(&my_device));
        GPU_CHECK(hipGetDeviceProperties(&props, my_device));
        hila::out << "Device on node rank 0 device " << my_device << ":\n";
        hila::out << " " << props.name << " capability: " << props.major << "." << props.minor
                  << '\n';
        hila::out << " Global memory: " << props.totalGlobalMem / mb << "MB" << '\n';
        hila::out << " Shared memory: " << props.sharedMemPerBlock / kb << "kB" << '\n';
        hila::out << " Constant memory: " << props.totalConstMem / kb << "kB" << '\n';
        hila::out << " Block registers: " << props.regsPerBlock << '\n';

        hila::out << " Warp size: " << props.warpSize << '\n';
        hila::out << " Threads per block: " << props.maxThreadsPerBlock << '\n';
        hila::out << " Max block dimensions: [ " << props.maxThreadsDim[0] << ", "
                  << props.maxThreadsDim[1] << ", " << props.maxThreadsDim[2] << " ]" << '\n';
        hila::out << " Max grid dimensions: [ " << props.maxGridSize[0] << ", "
                  << props.maxGridSize[1] << ", " << props.maxGridSize[2] << " ]" << '\n';
        hila::out << "Threads in use: " << N_threads << '\n';
    }
}

#endif

void gpu_exit_on_error(const char *msg, const char *file, int line) {
    gpuError code = gpuGetLastError();
    if (gpuSuccess != code) {
        hila::out << GPUTYPESTR << " error: " << msg << " in file " << file << " line " << line
                  << '\n';
        hila::out << GPUTYPESTR << " error string: " << gpuGetErrorString(code) << "\n";

        hila::terminate(0);
    }
}

void gpu_exit_on_error(gpuError code, const char *msg, const char *file, int line) {
    if (gpuSuccess != code) {
        hila::out << GPUTYPESTR << " error in command: " << msg << " in file " << file << " line "
                  << line << '\n';
        hila::out << GPUTYPESTR << " error string: " << gpuGetErrorString(code) << "\n";

        hila::terminate(0);
    }
}
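
// Note: the check_device_error() and GPU_CHECK() calls used throughout this
// file are macros from the backend headers that route errors to the functions
// above; a plausible sketch of their shape (an assumption for illustration,
// not the verbatim definitions):
//
//   #define check_device_error(msg) gpu_exit_on_error(msg, __FILE__, __LINE__)
//   #define GPU_CHECK(cmd) gpu_exit_on_error(cmd, #cmd, __FILE__, __LINE__)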

#endif