HILA
hila_gpu.cpp
// Define below to deactivate "extern" in global var defs
#define IN_HILA_GPU

#include "plumbing/defs.h"
#include "plumbing/lattice.h"
#include "plumbing/field.h"
#include "plumbing/backend_gpu/defs.h"

// hilapp needs to transform the include files above, to make them __device__
// callable...

#ifndef HILAPP

#if defined(CUDA)

#include <curand_kernel.h>

using gpurandState = curandState_t;
#define gpurand_init curand_init
#define gpurand_uniform curand_uniform
#define gpuGetDeviceCount(a) GPU_CHECK(cudaGetDeviceCount(a))
#define gpuSetDevice(dev) GPU_CHECK(cudaSetDevice(dev))
#define gpuGetLastError cudaGetLastError
#define gpuGetErrorString cudaGetErrorString

#elif defined(HIP)

#include <hip/hip_runtime.h>
#include <hiprand/hiprand_kernel.h>

using gpurandState = hiprandState_t;
#define gpurand_init hiprand_init
#define gpurand_uniform hiprand_uniform
#define gpuGetDeviceCount(a) GPU_CHECK(hipGetDeviceCount(a))
#define gpuSetDevice(dev) GPU_CHECK(hipSetDevice(dev))
#define gpuGetLastError hipGetLastError
#define gpuGetErrorString hipGetErrorString

#endif
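// --- Illustrative sketch (not part of the original file) ---
// The aliases above let the RNG code below be written once for both CUDA and HIP;
// e.g. a hypothetical device helper like this compiles unchanged on either backend.
static __device__ float example_uniform(gpurandState *state) {
    return gpurand_uniform(state); // maps to curand_uniform() or hiprand_uniform()
}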

// // Save "constants" lattice size and volume here
// __constant__ int64_t _d_volume;
// // __constant__ int _d_size[NDIM];
// __constant__ CoordinateVector _d_size;
// #ifndef EVEN_SITES_FIRST
// __constant__ int _d_nodesize[NDIM];
// __constant__ int _d_nodemin[NDIM];
// __constant__ int _d_nodefactor[NDIM];
// #endif

/* Random number generator */
static gpurandState *gpurandstateptr;
__constant__ gpurandState *d_gpurandstateptr;

// check if rng on device is OK

bool hila::is_device_rng_on() {
    return gpurandstateptr != nullptr;
}

/* Set seed on device */
__global__ void seed_random_kernel(unsigned long long seed) {
    unsigned x = threadIdx.x + blockIdx.x * blockDim.x;
    // d_gpurandstateptr set now using memcpyToSymbol
    // d_gpurandstateptr = state;
    gpurand_init(seed + x, 0, 0, &d_gpurandstateptr[x]);
}

/* Set seed on device and host */
void hila::initialize_device_rng(uint64_t seed) {
    unsigned long n_blocks = (lattice.mynode.volume() + N_threads - 1) / N_threads;

#if defined(GPU_RNG_THREAD_BLOCKS) && GPU_RNG_THREAD_BLOCKS > 0
    // If we have limited rng block number
    if (GPU_RNG_THREAD_BLOCKS < n_blocks) {
        n_blocks = GPU_RNG_THREAD_BLOCKS;
    }

    hila::out0 << "GPU random number generator initialized\n";
    hila::out0 << "GPU random number thread blocks: " << n_blocks << " of size " << N_threads
               << " threads\n";
#elif defined(GPU_RNG_THREAD_BLOCKS) && GPU_RNG_THREAD_BLOCKS < 0
    hila::out0 << "GPU RANDOM NUMBERS DISABLED, GPU_RNG_THREAD_BLOCKS < 0\n";
#else
    hila::out0 << "GPU random number generator initialized\n";
    hila::out0
        << "GPU random numbers: using one generator/site (GPU_RNG_THREAD_BLOCKS = 0 or undefined)\n";
#endif

    unsigned long long n_sites = n_blocks * N_threads;
    unsigned long long myseed = seed + hila::myrank() * n_sites;

    // allocate random state and copy the ptr to d_gpurandstateptr
    gpuMalloc(&gpurandstateptr, n_sites * sizeof(gpurandState));
    gpuMemcpyToSymbol(d_gpurandstateptr, &gpurandstateptr, sizeof(gpurandState *), 0,
                      gpuMemcpyHostToDevice);

#ifdef CUDA
    seed_random_kernel<<<n_blocks, N_threads>>>(myseed);
#else
    hipLaunchKernelGGL(seed_random_kernel, dim3(n_blocks), dim3(N_threads), 0, 0, myseed);
#endif
    check_device_error("seed_random kernel");
}
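// --- Illustrative usage sketch (not part of the original file) ---
// A direct call sequence plus a sanity check; the per-rank offset above
// (seed + myrank() * n_sites) keeps the per-thread sequences distinct across MPI ranks.
// "example_rng_setup" is a hypothetical helper introduced only for illustration.
static void example_rng_setup() {
    hila::initialize_device_rng(0xdeadbeefULL); // allocate the states and seed one per thread
    if (!hila::is_device_rng_on())
        hila::out0 << "Device RNG was not enabled\n";
}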

void hila::free_device_rng() {
    if (gpurandstateptr != nullptr) {
        gpuFree(gpurandstateptr);
        gpurandstateptr = nullptr;
        // set d_gpurandstateptr <- nullptr.
        gpuMemcpyToSymbol(d_gpurandstateptr, &gpurandstateptr, sizeof(gpurandState *), 0,
                          gpuMemcpyHostToDevice);

        // good to purge the memory pool after releasing a large chunk
        gpu_memory_pool_purge();
    }
}
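// --- Illustrative sketch (not part of the original file) ---
// Re-seeding by releasing the old states first; is_device_rng_on() reports whether states
// are currently allocated. "example_reseed" is a hypothetical helper for illustration only.
static void example_reseed(uint64_t new_seed) {
    if (hila::is_device_rng_on())
        hila::free_device_rng(); // frees the states and nulls d_gpurandstateptr on the device
    hila::initialize_device_rng(new_seed);
}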

/* Generate random numbers on device or host */
__device__ __host__ double hila::random() {
#ifdef _GPU_DEVICE_COMPILE_
    unsigned x = threadIdx.x + blockIdx.x * blockDim.x;
    return gpurand_uniform(&d_gpurandstateptr[x]);
#else
    return hila::host_random();
#endif
}
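// --- Illustrative sketch (not part of the original file) ---
// Calling hila::random() from device code: each thread draws from its own gpurandState,
// indexed with the same thread id used in seed_random_kernel, so a launch should not use
// more threads than states were allocated in initialize_device_rng().
// "example_fill_random" is a hypothetical kernel added only to show the call pattern.
static __global__ void example_fill_random(double *buf, int n) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < n)
        buf[i] = hila::random(); // device branch: gpurand_uniform(&d_gpurandstateptr[thread id])
}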

///////////////////////////////////////////////////////////////////////////////////////
// Setup the lattice struct on GPUs:
// allocate neighbour and coordinate arrays
// setup global variables in __constant__ memory

void backend_lattice_struct::setup(lattice_struct &lattice) {
    CoordinateVector *tmp;

    /* Setup neighbour fields in all directions */
    for (int d = 0; d < NDIRS; d++) {
        // For normal boundaries
        gpuMalloc(&(d_neighb[d]), lattice.mynode.volume() * sizeof(unsigned));
        gpuMemcpy(d_neighb[d], lattice.neighb[d], lattice.mynode.volume() * sizeof(unsigned),
                  gpuMemcpyHostToDevice);

#ifdef SPECIAL_BOUNDARY_CONDITIONS
        // For special boundaries
        // TODO: check this really works now!
        const unsigned *special_neighb =
            lattice.get_neighbour_array((Direction)d, hila::bc::ANTIPERIODIC);

        if (special_neighb != lattice.neighb[d]) {
            gpuMalloc(&(d_neighb_special[d]), lattice.mynode.volume() * sizeof(unsigned));
            gpuMemcpy(d_neighb_special[d], special_neighb,
                      lattice.mynode.volume() * sizeof(unsigned), gpuMemcpyHostToDevice);
        } else {
            d_neighb_special[d] = d_neighb[d];
        }
#endif
    }

#ifdef EVEN_SITES_FIRST
    /* Setup the location field */
    gpuMalloc(&(d_coordinates), lattice.mynode.volume() * sizeof(CoordinateVector));
    tmp = (CoordinateVector *)memalloc(lattice.mynode.volume() * sizeof(CoordinateVector));
    for (unsigned i = 0; i < lattice.mynode.volume(); i++)
        tmp[i] = lattice.coordinates(i);

    gpuMemcpy(d_coordinates, tmp, lattice.mynode.volume() * sizeof(CoordinateVector),
              gpuMemcpyHostToDevice);
    free(tmp);
#endif

    // Other backend_lattice parameters
    field_alloc_size = lattice.field_alloc_size();

    set_lattice_globals(lattice);
}
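// --- Illustrative sketch (not part of the original file) ---
// How the device neighbour arrays set up above are typically consumed: a site-loop kernel
// receives d_neighb[d] for the Direction it needs and uses it to fetch the neighbouring
// site's value. "example_neighbour_gather" is a hypothetical kernel for illustration only.
static __global__ void example_neighbour_gather(const double *in, double *out,
                                                const unsigned *neighb_d, int n_sites) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < n_sites)
        out[i] = in[neighb_d[i]]; // neighb_d points to d_neighb[d] copied above
}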

#endif // not HILAPP

// set some global variables, visible on GPUs
// thus, hilapp needs to see this definition

void backend_lattice_struct::set_lattice_globals(lattice_struct &lattice) {

    _d_volume = lattice.volume();
    _d_size = lattice.size();

#ifndef EVEN_SITES_FIRST

    _d_nodesize = lattice.mynode.size;
    _d_nodemin = lattice.mynode.min;
    _d_nodefactor = lattice.mynode.size_factor;

    // foralldir(d) s[d] = lattice.mynode.size[d];
    // gpuMemcpyToSymbol(_d_nodesize, s, sizeof(int) * NDIM, 0, gpuMemcpyHostToDevice);

    // foralldir(d) s[d] = lattice.mynode.min[d];
    // gpuMemcpyToSymbol(_d_nodemin, s, sizeof(int) * NDIM, 0, gpuMemcpyHostToDevice);

    // foralldir(d) s[d] = lattice.mynode.size_factor[d];
    // gpuMemcpyToSymbol(_d_nodefactor, s, sizeof(int) * NDIM, 0, gpuMemcpyHostToDevice);

#endif
}

#ifndef HILAPP
// again, hilapp can skip this part

void initialize_gpu(int rank, int device) {
    int n_devices, my_device;

    gpuGetDeviceCount(&n_devices);
    check_device_error("Could not get device count");
    // This assumes that each node has the same number of mpi ranks and GPUs
    // TODO: generalize (if needed)
    if (device > 0 && hila::number_of_nodes() == 1) {
        if (device >= n_devices) {
            hila::out0 << "-device " << device << ": too large device number, maximum "
                       << n_devices - 1 << " on this machine\n";
            hila::terminate(0);
        }

        my_device = device;
    } else {
        my_device = rank % n_devices;
    }

    hila::out0 << "GPU devices accessible from node 0: " << n_devices << '\n';

    // TODO: this only for node 0?
    if (n_devices > 1 && rank < 6) {
        hila::out << "GPU: MPI rank " << rank << " choosing device " << my_device << std::endl;
        if (hila::number_of_nodes() > 6) {
            hila::out0 << " + " << hila::number_of_nodes() - 6 << " more nodes\n";
        }
    }

    gpuSetDevice(my_device);

    // set gpu rng state to "off", to prevent accidental use
    gpurandstateptr = nullptr;
    // set d_gpurandstateptr <- nullptr.
    gpuMemcpyToSymbol(d_gpurandstateptr, &gpurandstateptr, sizeof(gpurandState *), 0,
                      gpuMemcpyHostToDevice);

#if defined(CUDA_MALLOC_ASYNC)
    // set memory pool
    cudaMemPool_t mempool;
    cudaDeviceGetDefaultMemPool(&mempool, my_device);
    uint64_t threshold = UINT64_MAX;
    cudaMemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &threshold);

#endif
}
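// --- Illustrative example (not part of the original file) ---
// The round-robin device choice used above when several ranks share a node: with 4 visible
// devices, ranks 0..7 map to devices 0,1,2,3,0,1,2,3. This is why the comment notes the
// assumption that every node has the same rank/GPU layout. "example_device_for_rank" is
// a hypothetical helper for illustration only.
static int example_device_for_rank(int rank, int n_devices) {
    return (n_devices > 0) ? rank % n_devices : 0; // same rule as initialize_gpu()
}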

#ifdef CUDA

#ifdef OPEN_MPI
// here the functions to inquire about CUDA-aware MPI are defined
#include "mpi-ext.h"
#endif

void gpu_device_info() {
    if (hila::myrank() == 0) {
        const int kb = 1024;
        const int mb = kb * kb;

        int driverVersion, rtVersion;
        GPU_CHECK(cudaDriverGetVersion(&driverVersion));
        GPU_CHECK(cudaRuntimeGetVersion(&rtVersion));
        hila::out << "CUDA driver version: " << driverVersion << ", runtime " << rtVersion << '\n';
        hila::out << "CUDART_VERSION " << CUDART_VERSION << '\n';
#if defined(CUDA_MALLOC_ASYNC)
        if (CUDART_VERSION >= 11020) {
            hila::out << "Using cudaMallocAsync() to allocate memory\n";
        }
#endif

        cudaDeviceProp props;
        int my_device;
        GPU_CHECK(cudaGetDevice(&my_device));
        GPU_CHECK(cudaGetDeviceProperties(&props, my_device));
        hila::out << "Device on node rank 0 device " << my_device << ":\n";
        hila::out << " " << props.name << " capability: " << props.major << "." << props.minor
                  << '\n';
        hila::out << " Global memory: " << props.totalGlobalMem / mb << "MB" << '\n';
        hila::out << " Shared memory: " << props.sharedMemPerBlock / kb << "kB" << '\n';
        hila::out << " Constant memory: " << props.totalConstMem / kb << "kB" << '\n';
        hila::out << " Block registers: " << props.regsPerBlock << '\n';

        hila::out << " Warp size: " << props.warpSize << '\n';
        hila::out << " Threads per block: " << props.maxThreadsPerBlock << '\n';
        hila::out << " Max block dimensions: [ " << props.maxThreadsDim[0] << ", "
                  << props.maxThreadsDim[1] << ", " << props.maxThreadsDim[2] << " ]" << '\n';
        hila::out << " Max grid dimensions: [ " << props.maxGridSize[0] << ", "
                  << props.maxGridSize[1] << ", " << props.maxGridSize[2] << " ]" << '\n';

        hila::out << "Thread block size used: " << N_threads << '\n';

// Following should be OK in open MPI
#ifdef OPEN_MPI
#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
        hila::out << "OpenMPI library supports CUDA-Aware MPI\n";
        if (MPIX_Query_cuda_support() == 1)
            hila::out << " Runtime library supports CUDA-Aware MPI\n";
        else {
            hila::out << " Runtime library does not support CUDA-Aware MPI!\n";
#if defined(GPU_AWARE_MPI)
            hila::out << "GPU_AWARE_MPI is defined -- THIS MAY CRASH IN MPI\n";
#endif
        }
#else
        hila::out << "OpenMPI library does not support CUDA-Aware MPI\n";
#if defined(GPU_AWARE_MPI)
        hila::out << "GPU_AWARE_MPI is defined -- THIS MAY CRASH IN MPI\n";
#endif
#endif // MPIX
#endif // OPEN_MPI
    }
}
#endif

#ifdef HIP

void gpu_device_info() {
    if (hila::myrank() == 0) {
        const int kb = 1024;
        const int mb = kb * kb;

        int driverVersion, rtVersion;
        GPU_CHECK(hipDriverGetVersion(&driverVersion));
        GPU_CHECK(hipRuntimeGetVersion(&rtVersion));
        hila::out << "HIP driver version: " << driverVersion << ", runtime " << rtVersion << '\n';

        hipDeviceProp_t props;
        int my_device;
        GPU_CHECK(hipGetDevice(&my_device));
        GPU_CHECK(hipGetDeviceProperties(&props, my_device));
        hila::out << "Device on node rank 0 device " << my_device << ":\n";
        hila::out << " " << props.name << " capability: " << props.major << "." << props.minor
                  << '\n';
        hila::out << " Global memory: " << props.totalGlobalMem / mb << "MB" << '\n';
        hila::out << " Shared memory: " << props.sharedMemPerBlock / kb << "kB" << '\n';
        hila::out << " Constant memory: " << props.totalConstMem / kb << "kB" << '\n';
        hila::out << " Block registers: " << props.regsPerBlock << '\n';

        hila::out << " Warp size: " << props.warpSize << '\n';
        hila::out << " Threads per block: " << props.maxThreadsPerBlock << '\n';
        hila::out << " Max block dimensions: [ " << props.maxThreadsDim[0] << ", "
                  << props.maxThreadsDim[1] << ", " << props.maxThreadsDim[2] << " ]" << '\n';
        hila::out << " Max grid dimensions: [ " << props.maxGridSize[0] << ", "
                  << props.maxGridSize[1] << ", " << props.maxGridSize[2] << " ]" << '\n';
        hila::out << "Thread block size used: " << N_threads << '\n';
    }
}

#endif

void gpu_exit_on_error(const char *msg, const char *file, int line) {
    gpuError code = gpuGetLastError();
    if (gpuSuccess != code) {
        hila::out << GPUTYPESTR << " error: " << msg << " in file " << file << " line " << line
                  << '\n';
        hila::out << GPUTYPESTR << " error string: " << gpuGetErrorString(code) << "\n";

        hila::terminate(0);
    }
}

void gpu_exit_on_error(gpuError code, const char *msg, const char *file, int line) {
    if (gpuSuccess != code) {
        hila::out << GPUTYPESTR << " error in command: " << msg << " in file " << file << " line "
                  << line << '\n';
        hila::out << GPUTYPESTR << " error string: " << gpuGetErrorString(code) << "\n";

        hila::terminate(0);
    }
}
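// --- Illustrative usage sketch (not part of the original file) ---
// Wrapping a raw runtime call so a failure is routed through the error-code overload above;
// the GPU_CHECK macro used throughout this file presumably does something very similar
// (its actual definition lives in backend_gpu/defs.h). "example_checked_malloc" is a
// hypothetical helper for illustration only.
static void example_checked_malloc(void **ptr, size_t bytes) {
#if defined(CUDA)
    gpu_exit_on_error(cudaMalloc(ptr, bytes), "cudaMalloc", __FILE__, __LINE__);
#elif defined(HIP)
    gpu_exit_on_error(hipMalloc(ptr, bytes), "hipMalloc", __FILE__, __LINE__);
#endif
}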

#endif // not HILAPP