HILA
hila_gpu.cpp
1
2// Define below to deactivate "extern" in global var defs
3#define IN_HILA_GPU
4
5#include "plumbing/defs.h"
6#include "plumbing/lattice.h"
7#include "plumbing/field.h"
8#include "plumbing/backend_gpu/defs.h"
9
10// hilapp needs to transform the include files above, to make them __device__
11// callable...
12
13#ifndef HILAPP
14
15#if defined(CUDA)
16
17#include <curand_kernel.h>
18
19using gpurandState = curandState_t;
20#define gpurand_init curand_init
21#define gpurand_uniform curand_uniform
22#define gpuGetDeviceCount(a) GPU_CHECK(cudaGetDeviceCount(a))
23#define gpuSetDevice(dev) GPU_CHECK(cudaSetDevice(dev))
24#define gpuGetLastError cudaGetLastError
25#define gpuGetErrorString cudaGetErrorString
26
27#elif defined(HIP)
28
29#include <hip/hip_runtime.h>
30#include <hiprand/hiprand_kernel.h>
31
32using gpurandState = hiprandState_t;
33#define gpurand_init hiprand_init
34#define gpurand_uniform hiprand_uniform
35#define gpuGetDeviceCount(a) GPU_CHECK(hipGetDeviceCount(a))
36#define gpuSetDevice(dev) GPU_CHECK(hipSetDevice(dev))
37#define gpuGetLastError hipGetLastError
38#define gpuGetErrorString hipGetErrorString
39
40#endif
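
// Usage sketch (illustration only, not part of hila_gpu.cpp): the aliases above give the rest of
// this file a single gpu*/gpurand* spelling that works for both the CUDA and the HIP backend, so
// the device-management code below is written only once. A backend-neutral call then looks like
// this hypothetical snippet:
#if 0
    int count = 0;
    gpuGetDeviceCount(&count); // cudaGetDeviceCount() or hipGetDeviceCount(), wrapped in GPU_CHECK
    gpuSetDevice(0);           // likewise cudaSetDevice() / hipSetDevice()
#endif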
41
42// // Save "constants" lattice size and volume here
43// __constant__ int64_t _d_volume;
44// // __constant__ int _d_size[NDIM];
45// __constant__ CoordinateVector _d_size;
46// #ifndef EVEN_SITES_FIRST
47// __constant__ int _d_nodesize[NDIM];
48// __constant__ int _d_nodemin[NDIM];
49// __constant__ int _d_nodefactor[NDIM];
50// #endif
51
52/* Random number generator */
53static gpurandState *gpurandstateptr;
54__constant__ gpurandState *d_gpurandstateptr;
55
56// check if rng on device is OK
57
58bool hila::is_device_rng_on() {
59 return gpurandstateptr != nullptr;
60}
61
62/* Set seed on device */
63__global__ void seed_random_kernel(unsigned long long seed) {
64 unsigned x = threadIdx.x + blockIdx.x * blockDim.x;
65 // d_gpurandstateptr set now using memcpyToSymbol
66 // d_gpurandstateptr = state;
67 gpurand_init(seed + x, 0, 0, &d_gpurandstateptr[x]);
68}
69
70/* Set seed on device and host */
71void hila::initialize_device_rng(uint64_t seed) {
72 unsigned long n_blocks = (lattice->mynode.volume + N_threads - 1) / N_threads;
73
74#if defined(GPU_RNG_THREAD_BLOCKS) && GPU_RNG_THREAD_BLOCKS > 0
75 // If the number of rng thread blocks is limited, use at most GPU_RNG_THREAD_BLOCKS blocks
76 if (GPU_RNG_THREAD_BLOCKS < n_blocks) {
77 n_blocks = GPU_RNG_THREAD_BLOCKS;
78 }
79
80 hila::out0 << "GPU random number generator initialized\n";
81 hila::out0 << "GPU random number thread blocks: " << n_blocks << " of size " << N_threads
82 << " threads\n";
83#elif defined(GPU_RNG_THREAD_BLOCKS) && GPU_RNG_THREAD_BLOCKS < 0
84 hila::out0 << "GPU RANDOM NUMBERS DISABLED, GPU_RNG_THREAD_BLOCKS < 0\n";
85#else
86 hila::out0 << "GPU random number generator initialized\n";
87 hila::out0
88 << "GPU random numbers: using one generator per site (GPU_RNG_THREAD_BLOCKS = 0 or undefined)\n";
89#endif
90
91 unsigned long long n_sites = n_blocks * N_threads;
92 unsigned long long myseed = seed + hila::myrank() * n_sites;
93
94 // allocate random state and copy the ptr to d_gpurandstateptr
95 gpuMalloc(&gpurandstateptr, n_sites * sizeof(gpurandState));
96 gpuMemcpyToSymbol(d_gpurandstateptr, &gpurandstateptr, sizeof(gpurandState *), 0,
97 gpuMemcpyHostToDevice);
98
99#ifdef CUDA
100 seed_random_kernel<<<n_blocks, N_threads>>>(myseed);
101#else
102 hipLaunchKernelGGL(seed_random_kernel, dim3(n_blocks), dim3(N_threads), 0, 0, myseed);
103#endif
104 check_device_error("seed_random kernel");
105}
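
// Usage sketch (illustration only, not part of hila_gpu.cpp): initialize_device_rng() is meant to
// be called once per rank after the lattice has been set up; in typical HILA applications the
// seeding goes through hila::seed_random(), which on GPU builds is expected to end up here.
#if 0
    // after hila::initialize() and lattice setup; the seed value is arbitrary here
    if (!hila::is_device_rng_on())
        hila::initialize_device_rng(42);
#endif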
106
107void hila::free_device_rng() {
108 if (gpurandstateptr != nullptr) {
109 gpuFree(gpurandstateptr);
110 gpurandstateptr = nullptr;
111 // set d_gpurandstateptr <- nullptr.
112 gpuMemcpyToSymbol(d_gpurandstateptr, &gpurandstateptr, sizeof(gpurandState *), 0,
113 gpuMemcpyHostToDevice);
114
115 // good to purge the memory pool after releasing a large chunk
116 gpu_memory_pool_purge();
117 }
118}
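
// Pattern sketch (illustration only, not part of hila_gpu.cpp): the two functions above use the
// same idiom for making a dynamically allocated buffer visible to kernels without passing it as a
// kernel argument: keep a host-side pointer and mirror it into a __constant__ device symbol.
// Here T stands for an arbitrary element type and n for the element count.
#if 0
    __constant__ T *d_ptr;  // device-visible copy of the pointer
    static T *ptr;          // host-side handle

    gpuMalloc(&ptr, n * sizeof(T));
    gpuMemcpyToSymbol(d_ptr, &ptr, sizeof(T *), 0, gpuMemcpyHostToDevice);
    // ... kernels dereference d_ptr ...
    gpuFree(ptr);
    ptr = nullptr;
    gpuMemcpyToSymbol(d_ptr, &ptr, sizeof(T *), 0, gpuMemcpyHostToDevice);
#endif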
119
120/* Generate random numbers on device or host */
121__device__ __host__ double hila::random() {
122#ifdef _GPU_DEVICE_COMPILE_
123 unsigned x = threadIdx.x + blockIdx.x * blockDim.x;
124 return gpurand_uniform(&d_gpurandstateptr[x]);
125#else
126 return hila::host_random();
127#endif
128}
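
// Usage sketch (illustration only, not part of hila_gpu.cpp): hila::random() can be called
// transparently from site loops and from host code. Inside a GPU kernel each thread draws from
// its own generator state through d_gpurandstateptr; on the host the call falls back to
// hila::host_random(). Assuming a Field<double> f and an initialized device RNG:
#if 0
    Field<double> f;
    onsites(ALL) f[X] = hila::random();  // device-side draws inside the generated kernel
    double r = hila::random();           // host-side draw
#endif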
129
130
131///////////////////////////////////////////////////////////////////////////////////////
132// Setup the lattice struct on GPUs:
133// allocate neighbour and coordinate arrays
134// setup global variables in __constant__ memory
135
136void backend_lattice_struct::setup(const lattice_struct &lat) {
137 CoordinateVector *tmp;
138
139 /* Setup neighbour fields in all directions */
140 for (int d = 0; d < NDIRS; d++) {
141 // For normal boundaries
142 gpuMalloc(&(d_neighb[d]), lat.mynode.volume * sizeof(unsigned));
143 gpuMemcpy(d_neighb[d], lat.neighb[d], lat.mynode.volume * sizeof(unsigned),
144 gpuMemcpyHostToDevice);
145
146#ifdef SPECIAL_BOUNDARY_CONDITIONS
147 // For special boundaries
148 // TODO: check this really works now!
149 const unsigned *special_neighb =
150 lat.get_neighbour_array((Direction)d, hila::bc::ANTIPERIODIC);
151
152 if (special_neighb != lat.neighb[d]) {
153 gpuMalloc(&(d_neighb_special[d]), lat.mynode.volume * sizeof(unsigned));
154 gpuMemcpy(d_neighb_special[d], special_neighb,
155 lat.mynode.volume * sizeof(unsigned), gpuMemcpyHostToDevice);
156 } else {
157 d_neighb_special[d] = d_neighb[d];
158 }
159#endif
160 }
161
162#ifdef EVEN_SITES_FIRST
163 /* Setup the location field */
164 gpuMalloc(&(d_coordinates), lat.mynode.volume * sizeof(CoordinateVector));
165 tmp = (CoordinateVector *)memalloc(lat.mynode.volume * sizeof(CoordinateVector));
166 for (unsigned i = 0; i < lat.mynode.volume; i++)
167 tmp[i] = lat.coordinates(i);
168
169 gpuMemcpy(d_coordinates, tmp, lat.mynode.volume * sizeof(CoordinateVector),
170 gpuMemcpyHostToDevice);
171 free(tmp);
172
173
174#endif
175
176 // Other backend_lattice parameters
177 field_alloc_size = lat.mynode.field_alloc_size;
178
179 set_device_globals(lat);
180}
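
// Note (illustration only, not part of hila_gpu.cpp): the arrays copied above are what the
// generated kernels use for gathers; a neighbour fetch is, schematically,
//     neighbour_value = field_data[d_neighb[dir][site_index]];
// where field_data and dir are hypothetical names standing for a Field's device buffer and a
// Direction index.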
181
182#endif // not HILAPP
183
184// set some global variables, visible on GPUs
185// thus, hilapp needs to see this definition
186
187void backend_lattice_struct::set_device_globals(const lattice_struct &lat) {
188
189
190#ifdef EVEN_SITES_FIRST
191
192 gpuMemcpyToSymbol(_dev_coordinates, &d_coordinates, sizeof(CoordinateVector *), 0,
193 gpuMemcpyHostToDevice);
194#endif
195
196 gpuMemcpyToSymbol(_dev_field_alloc_size, &field_alloc_size, sizeof(unsigned), 0,
197 gpuMemcpyHostToDevice);
198
199 _d_volume = lat.l_volume;
200 _d_size = lat.l_size;
201
202#ifndef EVEN_SITES_FIRST
203
204 _d_nodesize = lat.mynode.size;
205 _d_nodemin = lat.mynode.min;
206 _d_nodefactor = lat.mynode.size_factor;
207
208 // foralldir(d) s[d] = lat.mynode.size[d];
209 // gpuMemcpyToSymbol(_d_nodesize, s, sizeof(int) * NDIM, 0, gpuMemcpyHostToDevice);
210
211 // foralldir(d) s[d] = lat.mynode.min[d];
212 // gpuMemcpyToSymbol(_d_nodemin, s, sizeof(int) * NDIM, 0, gpuMemcpyHostToDevice);
213
214 // foralldir(d) s[d] = lat.mynode.size_factor[d];
215 // gpuMemcpyToSymbol(_d_nodefactor, s, sizeof(int) * NDIM, 0, gpuMemcpyHostToDevice);
216
217#endif
218}
219
220#ifndef HILAPP
221// again, hilapp can skip this part
222
223void initialize_gpu(int rank, int device) {
224 int n_devices, my_device;
225
226 gpuGetDeviceCount(&n_devices);
227 check_device_error("Could not get device count");
228 // This assumes that each node has the same number of mpi ranks and GPUs
229 // TODO:generalize (if needed)
230 if (device > 0 && hila::number_of_nodes() == 1) {
231 if (device >= n_devices) {
232 hila::out0 << "-device " << device << ": too large device number, maximum "
233 << n_devices - 1 << " on this machine\n";
234 hila::terminate(1);
235 }
236
237 my_device = device;
238 } else {
239 my_device = rank % n_devices;
240 }
241
242
243 hila::out0 << "GPU devices accessible from node 0: " << n_devices << '\n';
244
245 // TODO: this only for node 0?
246 if (n_devices > 1 && rank < 6) {
247 hila::out << "GPU: MPI rank " << rank << " choosing device " << my_device << std::endl;
248 if (hila::number_of_nodes() > 6) {
249 hila::out0 << " + " << hila::number_of_nodes() - 6 << " more nodes\n";
250 }
251 }
252
253 gpuSetDevice(my_device);
254
255 // set gpu rng state to "off", to prevent accidental use
256 gpurandstateptr = nullptr;
257 // set d_gpurandstateptr <- nullptr.
258 gpuMemcpyToSymbol(d_gpurandstateptr, &gpurandstateptr, sizeof(gpurandState *), 0,
259 gpuMemcpyHostToDevice);
260
261
262#if defined(CUDA_MALLOC_ASYNC)
263 // set memory pool
264 cudaMemPool_t mempool;
265 cudaDeviceGetDefaultMemPool(&mempool, my_device);
266 uint64_t threshold = UINT64_MAX;
267 cudaMemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &threshold);
268
269#endif
270}
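
// Note (illustration only, not part of hila_gpu.cpp): in the default branch above MPI ranks are
// mapped to devices round-robin, my_device = rank % n_devices. For example, with 4 visible GPUs,
// local ranks 0..7 would use devices 0,1,2,3,0,1,2,3; this assumes consecutive ranks are placed on
// the same node so that they share its GPUs.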
271
272#ifdef CUDA
273
274#ifdef OPEN_MPI
275// functions for querying CUDA-aware MPI support are defined here
276#include "mpi-ext.h"
277#endif
278
279void gpu_device_info() {
280 if (hila::myrank() == 0) {
281 const int kb = 1024;
282 const int mb = kb * kb;
283
284 int driverVersion, rtVersion;
285 GPU_CHECK(cudaDriverGetVersion(&driverVersion));
286 GPU_CHECK(cudaRuntimeGetVersion(&rtVersion));
287 hila::out << "CUDA driver version: " << driverVersion << ", runtime " << rtVersion << '\n';
288 hila::out << "CUDART_VERSION " << CUDART_VERSION << '\n';
289#if defined(CUDA_MALLOC_ASYNC)
290 if (CUDART_VERSION >= 11020) {
291 hila::out << "Using cudaMallocAsync() to allocate memory\n";
292 }
293#endif
294
295 cudaDeviceProp props;
296 int my_device;
297 GPU_CHECK(cudaGetDevice(&my_device));
298 GPU_CHECK(cudaGetDeviceProperties(&props, my_device));
299 hila::out << "Device on node rank 0 device " << my_device << ":\n";
300 hila::out << " " << props.name << " capability: " << props.major << "." << props.minor
301 << '\n';
302 hila::out << " Global memory: " << props.totalGlobalMem / mb << "MB" << '\n';
303 hila::out << " Shared memory: " << props.sharedMemPerBlock / kb << "kB" << '\n';
304 hila::out << " Constant memory: " << props.totalConstMem / kb << "kB" << '\n';
305 hila::out << " Block registers: " << props.regsPerBlock << '\n';
306
307 hila::out << " Warp size: " << props.warpSize << '\n';
308 hila::out << " Threads per block: " << props.maxThreadsPerBlock << '\n';
309 hila::out << " Max block dimensions: [ " << props.maxThreadsDim[0] << ", "
310 << props.maxThreadsDim[1] << ", " << props.maxThreadsDim[2] << " ]" << '\n';
311 hila::out << " Max grid dimensions: [ " << props.maxGridSize[0] << ", "
312 << props.maxGridSize[1] << ", " << props.maxGridSize[2] << " ]" << '\n';
313
314 hila::out << "Thread block size used: " << N_threads << '\n';
315
316// Following should be OK in open MPI
317#ifdef OPEN_MPI
318#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
319 hila::out << "OpenMPI library supports CUDA-Aware MPI\n";
320 if (MPIX_Query_cuda_support() == 1)
321 hila::out << " Runtime library supports CUDA-Aware MPI\n";
322 else {
323 hila::out << " Runtime library does not support CUDA-Aware MPI!\n";
324#if defined(GPU_AWARE_MPI)
325 hila::out << "GPU_AWARE_MPI is defined -- THIS MAY CRASH IN MPI\n";
326#endif
327 }
328#else
329 hila::out << "OpenMPI library does not support CUDA-Aware MPI\n";
330#if defined(GPU_AWARE_MPI)
331 hila::out << "GPU_AWARE_MPI is defined -- THIS MAY CRASH IN MPI\n";
332#endif
333#endif // MPIX
334#endif // OPEN_MPI
335 }
336}
337#endif
338
339#ifdef HIP
340
341void gpu_device_info() {
342 if (hila::myrank() == 0) {
343 const int kb = 1024;
344 const int mb = kb * kb;
345
346 int driverVersion, rtVersion;
347 GPU_CHECK(hipDriverGetVersion(&driverVersion));
348 GPU_CHECK(hipRuntimeGetVersion(&rtVersion));
349 hila::out << "HIP driver version: " << driverVersion << ", runtime " << rtVersion << '\n';
350
351 hipDeviceProp_t props;
352 int my_device;
353 GPU_CHECK(hipGetDevice(&my_device));
354 GPU_CHECK(hipGetDeviceProperties(&props, my_device));
355 hila::out << "Device on node rank 0 device " << my_device << ":\n";
356 hila::out << " " << props.name << " capability: " << props.major << "." << props.minor
357 << '\n';
358 hila::out << " Global memory: " << props.totalGlobalMem / mb << "MB" << '\n';
359 hila::out << " Shared memory: " << props.sharedMemPerBlock / kb << "kB" << '\n';
360 hila::out << " Constant memory: " << props.totalConstMem / kb << "kB" << '\n';
361 hila::out << " Block registers: " << props.regsPerBlock << '\n';
362
363 hila::out << " Warp size: " << props.warpSize << '\n';
364 hila::out << " Threads per block: " << props.maxThreadsPerBlock << '\n';
365 hila::out << " Max block dimensions: [ " << props.maxThreadsDim[0] << ", "
366 << props.maxThreadsDim[1] << ", " << props.maxThreadsDim[2] << " ]" << '\n';
367 hila::out << " Max grid dimensions: [ " << props.maxGridSize[0] << ", "
368 << props.maxGridSize[1] << ", " << props.maxGridSize[2] << " ]" << '\n';
369 hila::out << "Thread block size used: " << N_threads << '\n';
370 }
371}
372
373#endif
374
375void gpu_exit_on_error(const char *msg, const char *file, int line) {
376 gpuError code = gpuGetLastError();
377 if (gpuSuccess != code) {
378 hila::out << GPUTYPESTR << " error: " << msg << " in file " << file << " line " << line
379 << '\n';
380 hila::out << GPUTYPESTR << " error string: " << gpuGetErrorString(code) << "\n";
381
382 hila::terminate(1);
383 }
384}
385
386void gpu_exit_on_error(gpuError code, const char *msg, const char *file, int line) {
387 if (gpuSuccess != code) {
388 hila::out << GPUTYPESTR << " error in command: " << msg << " in file " << file << " line "
389 << line << '\n';
390 hila::out << GPUTYPESTR << " error string: " << gpuGetErrorString(code) << "\n";
391
392 hila::terminate(1);
393 }
394}
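
// Sketch (illustration only, not part of hila_gpu.cpp): these two overloads back the error-check
// macros used throughout this file. The real macros live in plumbing/backend_gpu/defs.h; a
// plausible shape, shown only to illustrate the call pattern, is:
#if 0
    #define check_device_error(msg) gpu_exit_on_error(msg, __FILE__, __LINE__)
    #define GPU_CHECK(cmd) gpu_exit_on_error(cmd, #cmd, __FILE__, __LINE__)
#endif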
395
396#endif // not HILAPP
unsigned *__restrict__ neighb[NDIRS]
Main neighbour index array.
Definition lattice.h:213
constexpr unsigned NDIRS
Number of directions.
Definition coordinates.h:57
Direction
Enumerator for directions; assigns an integer to each direction, to be interpreted as a unit vector.
Definition coordinates.h:34
This file defines all includes for HILA.
This file contains definitions for the Field class and the classes required to define it, such as fi...
void initialize_device_rng(uint64_t seed)
Initialize the device random number generator on GPUs, if the application runs on a GPU platform....
Definition hila_gpu.cpp:71
void free_device_rng()
Free GPU RNG state, does nothing on non-GPU archs.
Definition hila_gpu.cpp:107
double random()
Real valued uniform random number generator.
Definition hila_gpu.cpp:121
int myrank()
rank of this node
Definition com_mpi.cpp:237
int number_of_nodes()
how many nodes there are
Definition com_mpi.cpp:248
std::ostream out
this is our default output file stream
bool is_device_rng_on()
Check if the RNG on GPU is allocated and ready to use.
Definition hila_gpu.cpp:58
std::ostream out0
This writes output only from main process (node 0)
void terminate(int status)
#define GPU_RNG_THREAD_BLOCKS
Definition params.h:116
#define N_threads
General number of threads in a thread block.
Definition params.h:212
void setup(lattice_struct *lattice)
unsigned * d_neighb[NDIRS]
Storage for the neighbour indexes. Stored on device.