#include "plumbing/defs.h"
#include "plumbing/lattice.h"
#include "plumbing/field.h"
#include "plumbing/backend_gpu/defs.h"

#if defined(CUDA)

#include <curand_kernel.h>

using gpurandState = curandState_t;
#define gpurand_init curand_init
#define gpurand_uniform curand_uniform
#define gpuGetDeviceCount(a) GPU_CHECK(cudaGetDeviceCount(a))
#define gpuSetDevice(dev) GPU_CHECK(cudaSetDevice(dev))
#define gpuGetLastError cudaGetLastError
#define gpuGetErrorString cudaGetErrorString
#elif defined(HIP)

#include <hip/hip_runtime.h>
#include <hiprand/hiprand_kernel.h>

using gpurandState = hiprandState_t;
#define gpurand_init hiprand_init
#define gpurand_uniform hiprand_uniform
#define gpuGetDeviceCount(a) GPU_CHECK(hipGetDeviceCount(a))
#define gpuSetDevice(dev) GPU_CHECK(hipSetDevice(dev))
#define gpuGetLastError hipGetLastError
#define gpuGetErrorString hipGetErrorString

#endif
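// The gpu*/gpurand_* aliases above map the same names onto either the CUDA or the
// HIP API, so the rest of this file can be written once for both backends.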
// Device random number generator state: host-side pointer and the corresponding
// __constant__ pointer visible from device code
static gpurandState *gpurandstateptr;
__constant__ gpurandState *d_gpurandstateptr;

// Check if the device RNG is allocated and ready to use
bool hila::is_device_rng_on() {
    return gpurandstateptr != nullptr;
}
// Kernel: initialize one generator state per thread
__global__ void seed_random_kernel(unsigned long long seed) {
    unsigned x = threadIdx.x + blockIdx.x * blockDim.x;
    gpurand_init(seed + x, 0, 0, &d_gpurandstateptr[x]);
}
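// Each thread seeds its generator with (seed + x), so every site gets its own
// random sequence from the cuRAND/hipRAND generator.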
// Allocate and seed the device random number generators
void hila::initialize_device_rng(uint64_t seed) {

    // n_blocks: number of RNG thread blocks; by default one generator is created per
    // lattice site, unless GPU_RNG_THREAD_BLOCKS caps the number of blocks

#if defined(GPU_RNG_THREAD_BLOCKS) && GPU_RNG_THREAD_BLOCKS > 0
    hila::out0 << "GPU random number generator initialized\n";
#elif defined(GPU_RNG_THREAD_BLOCKS) && GPU_RNG_THREAD_BLOCKS < 0
    hila::out0 << "GPU RANDOM NUMBERS DISABLED, GPU_RNG_THREAD_BLOCKS < 0\n";
#else
    hila::out0 << "GPU random number generator initialized\n";
    hila::out0 << "GPU random numbers: using one generator/site (GPU_RNG_THREAD_BLOCKS = 0 or undefined)\n";
#endif

    unsigned long long n_sites = n_blocks * N_threads;
    unsigned long long myseed = seed + hila::myrank() * n_sites;

    // Allocate the generator states and copy the pointer to the device __constant__ symbol
    gpuMalloc(&gpurandstateptr, n_sites * sizeof(gpurandState));
    gpuMemcpyToSymbol(d_gpurandstateptr, &gpurandstateptr, sizeof(gpurandState *), 0,
                      gpuMemcpyHostToDevice);

#if defined(CUDA)
    seed_random_kernel<<<n_blocks, N_threads>>>(myseed);
#else
    hipLaunchKernelGGL(seed_random_kernel, dim3(n_blocks), dim3(N_threads), 0, 0, myseed);
#endif
    check_device_error("seed_random kernel");
}
// Release the device RNG state
void hila::free_device_rng() {
    if (gpurandstateptr != nullptr) {
        gpuFree(gpurandstateptr);
        gpurandstateptr = nullptr;

        // clear the device-side pointer too
        gpuMemcpyToSymbol(d_gpurandstateptr, &gpurandstateptr, sizeof(gpurandState *), 0,
                          gpuMemcpyHostToDevice);

        // release now-unused blocks from the memory pool
        gpu_memory_pool_purge();
    }
}
// Real-valued uniform random number: on the device each thread uses its own
// generator state, on the host the host generator is used
double hila::random() {
#ifdef _GPU_DEVICE_COMPILE_
    unsigned x = threadIdx.x + blockIdx.x * blockDim.x;
    return gpurand_uniform(&d_gpurandstateptr[x]);
#else
    return hila::host_random();
#endif
}
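// Illustrative sketch of a call site (assumed application code, not part of this
// file): inside a site loop hila::random() is compiled for the device, and the
// branch above picks the per-thread generator state:
//
//     Field<double> f;
//     onsites(ALL) f[X] = hila::random();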
// Set up the device copies of the lattice information
void backend_lattice_struct::setup(const lattice_struct &lat) {

    // Neighbour index arrays, one per direction
    for (int d = 0; d < NDIRS; d++) {
        gpuMalloc(&(d_neighb[d]), lat.mynode.volume * sizeof(unsigned));
        gpuMemcpy(d_neighb[d], lat.neighb[d], lat.mynode.volume * sizeof(unsigned),
                  gpuMemcpyHostToDevice);

#ifdef SPECIAL_BOUNDARY_CONDITIONS
        // Separate neighbour array for special (antiperiodic) boundaries, if it differs
        const unsigned *special_neighb =
            lat.get_neighbour_array((Direction)d, hila::bc::ANTIPERIODIC);

        if (special_neighb != lat.neighb[d]) {
            gpuMalloc(&(d_neighb_special[d]), lat.mynode.volume * sizeof(unsigned));
            gpuMemcpy(d_neighb_special[d], special_neighb,
                      lat.mynode.volume * sizeof(unsigned), gpuMemcpyHostToDevice);
        }
#endif
    }

#ifdef EVEN_SITES_FIRST
    // Copy the coordinates of each site to the device via the host buffer tmp
    for (unsigned i = 0; i < lat.mynode.volume; i++)
        tmp[i] = lat.coordinates(i);

    gpuMemcpy(d_coordinates, tmp, lat.mynode.volume * sizeof(CoordinateVector),
              gpuMemcpyHostToDevice);
#endif

    set_device_globals(lat);
}
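// The d_neighb arrays live in device memory and are indexed by site index: for each
// Direction they give the index of the neighbouring site, which is what site loops
// use to fetch neighbour data.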
// Copy per-lattice "global" values into device-visible variables
void backend_lattice_struct::set_device_globals(const lattice_struct &lat) {

#ifdef EVEN_SITES_FIRST
    gpuMemcpyToSymbol(_dev_coordinates, &d_coordinates, sizeof(CoordinateVector *), 0,
                      gpuMemcpyHostToDevice);
#endif

    gpuMemcpyToSymbol(_dev_field_alloc_size, &field_alloc_size, sizeof(unsigned), 0,
                      gpuMemcpyHostToDevice);

    _d_volume = lat.l_volume;
    _d_size = lat.l_size;

#ifndef EVEN_SITES_FIRST
    _d_nodesize = lat.mynode.size;
    _d_nodemin = lat.mynode.min;
    _d_nodefactor = lat.mynode.size_factor;
#endif
}
// Select the GPU for this MPI rank and do basic device initialization
void initialize_gpu(int rank, int device) {
    int n_devices, my_device;

    gpuGetDeviceCount(&n_devices);
    check_device_error("Could not get device count");

    if (device >= n_devices) {
        hila::out0 << "-device " << device << ": too large device number, maximum "
                   << n_devices - 1 << " on this machine\n";
        hila::terminate(0);
    }

    my_device = rank % n_devices;

    hila::out0 << "GPU devices accessible from node 0: " << n_devices << '\n';

    // report the device choice only for the first few ranks to keep output short
    if (n_devices > 1 && rank < 6) {
        hila::out << "GPU: MPI rank " << rank << " choosing device " << my_device << std::endl;
    }

    gpuSetDevice(my_device);

    // No device RNG yet: make sure both the host and the device pointer are nullptr
    gpurandstateptr = nullptr;
    gpuMemcpyToSymbol(d_gpurandstateptr, &gpurandstateptr, sizeof(gpurandState *), 0,
                      gpuMemcpyHostToDevice);

#if defined(CUDA_MALLOC_ASYNC)
    // Set the default memory pool release threshold to the maximum, so freed memory
    // is kept in the pool instead of being returned to the system
    cudaMemPool_t mempool;
    cudaDeviceGetDefaultMemPool(&mempool, my_device);
    uint64_t threshold = UINT64_MAX;
    cudaMemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &threshold);
#endif
}
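// Note: with CUDA_MALLOC_ASYNC defined, allocations presumably go through the
// stream-ordered cudaMallocAsync()/cudaFreeAsync() allocator (see gpu_device_info()
// below); raising cudaMemPoolAttrReleaseThreshold keeps freed memory cached in the
// pool between allocations instead of handing it back to the driver.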
#if defined(CUDA)

// Print information about the GPU visible to rank 0
void gpu_device_info() {
    const int kb = 1024;
    const int mb = kb * kb;

    int driverVersion, rtVersion;
    GPU_CHECK(cudaDriverGetVersion(&driverVersion));
    GPU_CHECK(cudaRuntimeGetVersion(&rtVersion));
    hila::out << "CUDA driver version: " << driverVersion << ", runtime " << rtVersion << '\n';
    hila::out << "CUDART_VERSION " << CUDART_VERSION << '\n';

#if defined(CUDA_MALLOC_ASYNC)
    if (CUDART_VERSION >= 11020) {
        hila::out << "Using cudaMallocAsync() to allocate memory\n";
    }
#endif

    cudaDeviceProp props;
    int my_device;
    GPU_CHECK(cudaGetDevice(&my_device));
    GPU_CHECK(cudaGetDeviceProperties(&props, my_device));
    hila::out << "Device on node rank 0 device " << my_device << ":\n";
    hila::out << " " << props.name << " capability: " << props.major << "." << props.minor << '\n';
    hila::out << " Global memory: " << props.totalGlobalMem / mb << "MB" << '\n';
    hila::out << " Shared memory: " << props.sharedMemPerBlock / kb << "kB" << '\n';
    hila::out << " Constant memory: " << props.totalConstMem / kb << "kB" << '\n';
    hila::out << " Block registers: " << props.regsPerBlock << '\n';

    hila::out << " Warp size: " << props.warpSize << '\n';
    hila::out << " Threads per block: " << props.maxThreadsPerBlock << '\n';
    hila::out << " Max block dimensions: [ " << props.maxThreadsDim[0] << ", "
              << props.maxThreadsDim[1] << ", " << props.maxThreadsDim[2] << " ]" << '\n';
    hila::out << " Max grid dimensions: [ " << props.maxGridSize[0] << ", "
              << props.maxGridSize[1] << ", " << props.maxGridSize[2] << " ]" << '\n';

#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
    hila::out << "OpenMPI library supports CUDA-Aware MPI\n";
    if (MPIX_Query_cuda_support() == 1)
        hila::out << " Runtime library supports CUDA-Aware MPI\n";
    else {
        hila::out << " Runtime library does not support CUDA-Aware MPI!\n";
#if defined(GPU_AWARE_MPI)
        hila::out << "GPU_AWARE_MPI is defined -- THIS MAY CRASH IN MPI\n";
#endif
    }
#else
    hila::out << "OpenMPI library does not support CUDA-Aware MPI\n";
#if defined(GPU_AWARE_MPI)
    hila::out << "GPU_AWARE_MPI is defined -- THIS MAY CRASH IN MPI\n";
#endif
#endif
}
#elif defined(HIP)

// Print information about the GPU visible to rank 0
void gpu_device_info() {
    const int kb = 1024;
    const int mb = kb * kb;

    int driverVersion, rtVersion;
    GPU_CHECK(hipDriverGetVersion(&driverVersion));
    GPU_CHECK(hipRuntimeGetVersion(&rtVersion));
    hila::out << "HIP driver version: " << driverVersion << ", runtime " << rtVersion << '\n';

    hipDeviceProp_t props;
    int my_device;
    GPU_CHECK(hipGetDevice(&my_device));
    GPU_CHECK(hipGetDeviceProperties(&props, my_device));
    hila::out << "Device on node rank 0 device " << my_device << ":\n";
    hila::out << " " << props.name << " capability: " << props.major << "." << props.minor << '\n';
    hila::out << " Global memory: " << props.totalGlobalMem / mb << "MB" << '\n';
    hila::out << " Shared memory: " << props.sharedMemPerBlock / kb << "kB" << '\n';
    hila::out << " Constant memory: " << props.totalConstMem / kb << "kB" << '\n';
    hila::out << " Block registers: " << props.regsPerBlock << '\n';

    hila::out << " Warp size: " << props.warpSize << '\n';
    hila::out << " Threads per block: " << props.maxThreadsPerBlock << '\n';
    hila::out << " Max block dimensions: [ " << props.maxThreadsDim[0] << ", "
              << props.maxThreadsDim[1] << ", " << props.maxThreadsDim[2] << " ]" << '\n';
    hila::out << " Max grid dimensions: [ " << props.maxGridSize[0] << ", "
              << props.maxGridSize[1] << ", " << props.maxGridSize[2] << " ]" << '\n';
}

#endif
// Check the most recent GPU error and terminate if one has occurred
void gpu_exit_on_error(const char *msg, const char *file, int line) {
    gpuError code = gpuGetLastError();
    if (gpuSuccess != code) {
        hila::out << GPUTYPESTR << " error: " << msg << " in file " << file << " line " << line
                  << '\n';
        hila::out << GPUTYPESTR << " error string: " << gpuGetErrorString(code) << "\n";

        hila::terminate(0);
    }
}

// Check an explicit error code returned by a GPU API call and terminate on failure
void gpu_exit_on_error(gpuError code, const char *msg, const char *file, int line) {
    if (gpuSuccess != code) {
        hila::out << GPUTYPESTR << " error in command: " << msg << " in file " << file << " line "
                  << line << '\n';
        hila::out << GPUTYPESTR << " error string: " << gpuGetErrorString(code) << "\n";

        hila::terminate(0);
    }
}