#include "plumbing/lattice.h"
#include "plumbing/backend_gpu/defs.h"

// Platform-specific aliases: the gpurand_* and gpu* names map to the
// corresponding cuRAND/CUDA or hipRAND/HIP calls, so the rest of the file
// stays platform independent.

#if defined(CUDA)

#include <curand_kernel.h>

using gpurandState = curandState_t;
#define gpurand_init curand_init
#define gpurand_uniform curand_uniform
#define gpuGetDeviceCount(a) GPU_CHECK(cudaGetDeviceCount(a))
#define gpuSetDevice(dev) GPU_CHECK(cudaSetDevice(dev))
#define gpuGetLastError cudaGetLastError
#define gpuGetErrorString cudaGetErrorString

#elif defined(HIP)

#include <hip/hip_runtime.h>
#include <hiprand/hiprand_kernel.h>

using gpurandState = hiprandState_t;
#define gpurand_init hiprand_init
#define gpurand_uniform hiprand_uniform
#define gpuGetDeviceCount(a) GPU_CHECK(hipGetDeviceCount(a))
#define gpuSetDevice(dev) GPU_CHECK(hipSetDevice(dev))
#define gpuGetLastError hipGetLastError
#define gpuGetErrorString hipGetErrorString

#endif
// Device RNG state: the host-side pointer owns the allocation, d_gpurandstateptr
// is the copy visible to device code (in __constant__ memory).
static gpurandState *gpurandstateptr;
__constant__ gpurandState *d_gpurandstateptr;

// Check whether the device RNG has been allocated and seeded.
bool hila::is_device_rng_on() {
    return gpurandstateptr != nullptr;
}

// Seed one generator state per device thread.
__global__ void seed_random_kernel(unsigned long long seed) {
    unsigned x = threadIdx.x + blockIdx.x * blockDim.x;
    gpurand_init(seed + x, 0, 0, &d_gpurandstateptr[x]);
}
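// Note (illustrative, not from the original file): gpurand_init maps to
// curand_init/hiprand_init, whose arguments are (seed, subsequence, offset,
// state).  The kernel above gives every thread a distinct seed with
// subsequence 0.  An alternative with the same per-thread independence would
// be a common seed and a per-thread subsequence, e.g.
//
//     gpurand_init(seed, x, 0, &d_gpurandstateptr[x]);
//
// which is the layout the cuRAND documentation recommends, typically at the
// cost of a slower initialization.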
// Initialize the device RNG: allocate and seed one generator state per thread.
void hila::initialize_device_rng(uint64_t seed) {

    // ... (n_blocks, the number of thread blocks used for the RNG, is set here) ...

#if defined(GPU_RNG_THREAD_BLOCKS) && GPU_RNG_THREAD_BLOCKS > 0
    // ...
    hila::out0 << "GPU random number generator initialized\n";
    // ...
#elif defined(GPU_RNG_THREAD_BLOCKS) && GPU_RNG_THREAD_BLOCKS < 0
    hila::out0 << "GPU RANDOM NUMBERS DISABLED, GPU_RNG_THREAD_BLOCKS < 0\n";
#else
    hila::out0 << "GPU random number generator initialized\n";
    hila::out0
        << "GPU random numbers: using one generator/site (GPU_RNG_THREAD_BLOCKS = 0 or undefined)\n";
#endif

    unsigned long long n_sites = n_blocks * N_threads;
    unsigned long long myseed = seed + hila::myrank() * n_sites;

    // Allocate the state array and publish its address to device code.
    gpuMalloc(&gpurandstateptr, n_sites * sizeof(gpurandState));
    gpuMemcpyToSymbol(d_gpurandstateptr, &gpurandstateptr, sizeof(gpurandState *), 0,
                      gpuMemcpyHostToDevice);

#if defined(CUDA)
    seed_random_kernel<<<n_blocks, N_threads>>>(myseed);
#else
    hipLaunchKernelGGL(seed_random_kernel, dim3(n_blocks), dim3(N_threads), 0, 0, myseed);
#endif
    check_device_error("seed_random kernel");
}
// Free the device RNG state; safe to call also when the RNG is not allocated.
void hila::free_device_rng() {
    if (gpurandstateptr != nullptr) {
        gpuFree(gpurandstateptr);
        gpurandstateptr = nullptr;
        // Clear the device-side pointer as well.
        gpuMemcpyToSymbol(d_gpurandstateptr, &gpurandstateptr, sizeof(gpurandState *), 0,
                          gpuMemcpyHostToDevice);
    }
    // ...
    gpu_memory_pool_purge();
}
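// Illustration (hypothetical, not in the original file): the expected lifecycle
// of the device RNG using the functions above.  The function name
// demo_rng_lifecycle and the seed value are made up for this sketch.
static void demo_rng_lifecycle() {
    hila::initialize_device_rng(42); // allocate and seed one state per thread

    if (hila::is_device_rng_on()) {
        // site loops / kernels may now call hila::random() in device code
    }

    hila::free_device_rng(); // release the state array, RNG is off again
}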
double hila::random() {
#ifdef _GPU_DEVICE_COMPILE_
    unsigned x = threadIdx.x + blockIdx.x * blockDim.x;
    return gpurand_uniform(&d_gpurandstateptr[x]);
#else
    return hila::host_random();
#endif
}
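// Illustration (hypothetical, not in the original file): how the per-thread RNG
// states set up above could be consumed from a hand-written kernel.  The kernel
// name fill_uniform and the output buffer are assumptions; the essential point
// is that each thread indexes d_gpurandstateptr with its own global thread id,
// so the launch geometry must not exceed the n_sites states allocated in
// hila::initialize_device_rng().
__global__ void fill_uniform(double *out, unsigned long long n) {
    unsigned x = threadIdx.x + blockIdx.x * blockDim.x;
    if (x < n)
        out[x] = gpurand_uniform(&d_gpurandstateptr[x]);
}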
// Copy the lattice geometry (neighbour tables, coordinates) to the device.
void backend_lattice_struct::setup(lattice_struct &lattice) {

    // Neighbour index arrays, one per direction.
    for (int d = 0; d < NDIRS; d++) {
        gpuMalloc(&(d_neighb[d]), lattice.mynode.volume() * sizeof(unsigned));
        gpuMemcpy(d_neighb[d], lattice.neighb[d], lattice.mynode.volume() * sizeof(unsigned),
                  gpuMemcpyHostToDevice);

#ifdef SPECIAL_BOUNDARY_CONDITIONS
        // Separate table for special (here antiperiodic) boundaries, copied
        // only if it differs from the normal neighbour array.
        const unsigned *special_neighb =
            lattice.get_neighbour_array((Direction)d, hila::bc::ANTIPERIODIC);

        if (special_neighb != lattice.neighb[d]) {
            gpuMalloc(&(d_neighb_special[d]), lattice.mynode.volume() * sizeof(unsigned));
            gpuMemcpy(d_neighb_special[d], special_neighb,
                      lattice.mynode.volume() * sizeof(unsigned), gpuMemcpyHostToDevice);
        }
        // ...
#endif
    }

#ifdef EVEN_SITES_FIRST
    // Site coordinate array, needed on the device with even-first site ordering.
    gpuMalloc(&(d_coordinates), lattice.mynode.volume() * sizeof(CoordinateVector));

    // ... (tmp is a host-side staging buffer of CoordinateVectors) ...
    for (unsigned i = 0; i < lattice.mynode.volume(); i++)
        tmp[i] = lattice.coordinates(i);

    gpuMemcpy(d_coordinates, tmp, lattice.mynode.volume() * sizeof(CoordinateVector),
              gpuMemcpyHostToDevice);
    // ...
#endif

    // ...
    set_lattice_globals(lattice);
}
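// Illustration (hypothetical, not in the original file): how device code can
// use the tables copied in setup().  d_neighb[d][i] gives the index of the
// neighbour of site i in direction d, so a gather of a field value from the
// neighbouring site is a simple indirect load.  The function name and the raw
// double* field layout are assumptions made for this sketch.
__device__ inline double fetch_up_neighbour(const double *field_data, const unsigned *neighb_d,
                                            unsigned site) {
    return field_data[neighb_d[site]]; // neighb_d = d_neighb[d] for some Direction d
}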
// Copy the lattice constants needed in device code.
void backend_lattice_struct::set_lattice_globals(lattice_struct &lattice) {

    _d_volume = lattice.volume();
    _d_size = lattice.size();

#ifndef EVEN_SITES_FIRST
    // Without EVEN_SITES_FIRST the site coordinates are computed from the node
    // layout, so the node size, origin and index factors are needed as well.
    _d_nodesize = lattice.mynode.size;
    _d_nodemin = lattice.mynode.min;
    _d_nodefactor = lattice.mynode.size_factor;
#endif
}
// Select the GPU for this MPI rank and reset the device RNG state.
void initialize_gpu(int rank, int device) {
    int n_devices, my_device;

    gpuGetDeviceCount(&n_devices);
    check_device_error("Could not get device count");

    // ...
    if (device >= n_devices) {
        hila::out0 << "-device " << device << ": too large device number, maximum "
                   << n_devices - 1 << " on this machine\n";
        // ...
    }
    // ...
    my_device = rank % n_devices;
    // ...

    hila::out0 << "GPU devices accessible from node 0: " << n_devices << '\n';

    // Report the device choice only from the first few ranks.
    if (n_devices > 1 && rank < 6) {
        hila::out << "GPU: MPI rank " << rank << " choosing device " << my_device << std::endl;
        // ...
    }

    gpuSetDevice(my_device);

    // Mark the device RNG as uninitialized on both host and device.
    gpurandstateptr = nullptr;
    gpuMemcpyToSymbol(d_gpurandstateptr, &gpurandstateptr, sizeof(gpurandState *), 0,
                      gpuMemcpyHostToDevice);

#if defined(CUDA_MALLOC_ASYNC)
    // With cudaMallocAsync, keep freed memory cached in the default pool instead
    // of returning it to the system (release threshold = unlimited).
    cudaMemPool_t mempool;
    cudaDeviceGetDefaultMemPool(&mempool, my_device);
    uint64_t threshold = UINT64_MAX;
    cudaMemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &threshold);
#endif
}
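// Illustration (hypothetical, not in the original file): what the
// CUDA_MALLOC_ASYNC pool setting above enables.  With the release threshold at
// UINT64_MAX, memory freed with cudaFreeAsync() stays cached in the default
// pool, so later cudaMallocAsync() calls can reuse it without going back to
// the OS allocator.  The buffer size and the default stream (0) are arbitrary.
#if defined(CUDA_MALLOC_ASYNC)
static void demo_async_alloc() {
    double *buf;
    GPU_CHECK(cudaMallocAsync((void **)&buf, 1024 * sizeof(double), 0)); // allocate on default stream
    // ... use buf in kernels launched on the same stream ...
    GPU_CHECK(cudaFreeAsync(buf, 0)); // returned to the pool, not to the system
}
#endif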
#if defined(CUDA)

// Print the properties of the GPU used by rank 0 (CUDA version).
void gpu_device_info() {
    if (hila::myrank() == 0) {
        const int kb = 1024;
        const int mb = kb * kb;

        int driverVersion, rtVersion;
        GPU_CHECK(cudaDriverGetVersion(&driverVersion));
        GPU_CHECK(cudaRuntimeGetVersion(&rtVersion));
        hila::out << "CUDA driver version: " << driverVersion << ", runtime " << rtVersion
                  << '\n';
        hila::out << "CUDART_VERSION " << CUDART_VERSION << '\n';
#if defined(CUDA_MALLOC_ASYNC)
        if (CUDART_VERSION >= 11020) {
            hila::out << "Using cudaMallocAsync() to allocate memory\n";
        }
#endif

        cudaDeviceProp props;
        int my_device;
        GPU_CHECK(cudaGetDevice(&my_device));
        GPU_CHECK(cudaGetDeviceProperties(&props, my_device));
        hila::out << "Device on node rank 0 device " << my_device << ":\n";
        hila::out << "  " << props.name << " capability: " << props.major << "." << props.minor
                  << '\n';
        hila::out << "  Global memory:   " << props.totalGlobalMem / mb << "MB" << '\n';
        hila::out << "  Shared memory:   " << props.sharedMemPerBlock / kb << "kB" << '\n';
        hila::out << "  Constant memory: " << props.totalConstMem / kb << "kB" << '\n';
        hila::out << "  Block registers: " << props.regsPerBlock << '\n';

        hila::out << "  Warp size:         " << props.warpSize << '\n';
        hila::out << "  Threads per block: " << props.maxThreadsPerBlock << '\n';
        hila::out << "  Max block dimensions: [ " << props.maxThreadsDim[0] << ", "
                  << props.maxThreadsDim[1] << ", " << props.maxThreadsDim[2] << " ]" << '\n';
        hila::out << "  Max grid dimensions:  [ " << props.maxGridSize[0] << ", "
                  << props.maxGridSize[1] << ", " << props.maxGridSize[2] << " ]" << '\n';
        // ...

        // Report whether the MPI library can communicate directly from GPU memory.
#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
        hila::out << "OpenMPI library supports CUDA-Aware MPI\n";
        if (MPIX_Query_cuda_support() == 1)
            hila::out << "  Runtime library supports CUDA-Aware MPI\n";
        else {
            hila::out << "  Runtime library does not support CUDA-Aware MPI!\n";
#if defined(GPU_AWARE_MPI)
            hila::out << "GPU_AWARE_MPI is defined -- THIS MAY CRASH IN MPI\n";
#endif
        }
#else
        hila::out << "OpenMPI library does not support CUDA-Aware MPI\n";
#if defined(GPU_AWARE_MPI)
        hila::out << "GPU_AWARE_MPI is defined -- THIS MAY CRASH IN MPI\n";
#endif
#endif
    }
}
#elif defined(HIP)

// Print the properties of the GPU used by rank 0 (HIP version).
void gpu_device_info() {
    if (hila::myrank() == 0) {
        const int kb = 1024;
        const int mb = kb * kb;

        int driverVersion, rtVersion;
        GPU_CHECK(hipDriverGetVersion(&driverVersion));
        GPU_CHECK(hipRuntimeGetVersion(&rtVersion));
        hila::out << "HIP driver version: " << driverVersion << ", runtime " << rtVersion
                  << '\n';

        hipDeviceProp_t props;
        int my_device;
        GPU_CHECK(hipGetDevice(&my_device));
        GPU_CHECK(hipGetDeviceProperties(&props, my_device));
        hila::out << "Device on node rank 0 device " << my_device << ":\n";
        hila::out << "  " << props.name << " capability: " << props.major << "." << props.minor
                  << '\n';
        hila::out << "  Global memory:   " << props.totalGlobalMem / mb << "MB" << '\n';
        hila::out << "  Shared memory:   " << props.sharedMemPerBlock / kb << "kB" << '\n';
        hila::out << "  Constant memory: " << props.totalConstMem / kb << "kB" << '\n';
        hila::out << "  Block registers: " << props.regsPerBlock << '\n';

        hila::out << "  Warp size:         " << props.warpSize << '\n';
        hila::out << "  Threads per block: " << props.maxThreadsPerBlock << '\n';
        hila::out << "  Max block dimensions: [ " << props.maxThreadsDim[0] << ", "
                  << props.maxThreadsDim[1] << ", " << props.maxThreadsDim[2] << " ]" << '\n';
        hila::out << "  Max grid dimensions:  [ " << props.maxGridSize[0] << ", "
                  << props.maxGridSize[1] << ", " << props.maxGridSize[2] << " ]" << '\n';
        // ...
    }
}

#endif
// Check the most recently recorded GPU error and abort the run if one occurred.
void gpu_exit_on_error(const char *msg, const char *file, int line) {
    gpuError code = gpuGetLastError();
    if (gpuSuccess != code) {
        hila::out << GPUTYPESTR << " error: " << msg << " in file " << file << " line " << line
                  << '\n';
        hila::out << GPUTYPESTR << " error string: " << gpuGetErrorString(code) << "\n";

        hila::terminate(0);
    }
}

// Same, for an error code returned directly by a GPU API call.
void gpu_exit_on_error(gpuError code, const char *msg, const char *file, int line) {
    if (gpuSuccess != code) {
        hila::out << GPUTYPESTR << " error in command: " << msg << " in file " << file
                  << " line " << line << '\n';
        hila::out << GPUTYPESTR << " error string: " << gpuGetErrorString(code) << "\n";

        hila::terminate(0);
    }
}