3#include "plumbing/lattice.h"
5#include "plumbing/backend_gpu/defs.h"
14#include <curand_kernel.h>
16using gpurandState = curandState_t;
17#define gpurand_init curand_init
18#define gpurand_uniform curand_uniform
19#define gpuGetDeviceCount(a) GPU_CHECK(cudaGetDeviceCount(a))
20#define gpuSetDevice(dev) GPU_CHECK(cudaSetDevice(dev))
21#define gpuGetLastError cudaGetLastError
22#define gpuGetErrorString cudaGetErrorString
#elif defined(HIP)

#include <hip/hip_runtime.h>
#include <hiprand/hiprand_kernel.h>

using gpurandState = hiprandState_t;
#define gpurand_init hiprand_init
#define gpurand_uniform hiprand_uniform
#define gpuGetDeviceCount(a) GPU_CHECK(hipGetDeviceCount(a))
#define gpuSetDevice(dev) GPU_CHECK(hipSetDevice(dev))
#define gpuGetLastError hipGetLastError
#define gpuGetErrorString hipGetErrorString

#endif
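
// The gpurand*/gpu* aliases above let the rest of this file be written once for
// both backends: the CUDA (curand) and HIP (hiprand) APIs resolve to the same names.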

// Lattice size and volume, kept in device __constant__ memory
__constant__ int64_t _d_volume;
__constant__ int _d_size[NDIM];

#ifndef EVEN_SITES_FIRST
__constant__ int _d_nodesize[NDIM];
__constant__ int _d_nodemin[NDIM];
__constant__ int _d_nodefactor[NDIM];
#endif

// Random number generator state: one generator per GPU thread.
static gpurandState *gpurandstateptr;
__constant__ gpurandState *d_gpurandstateptr;

// Check if the RNG on GPU is allocated and ready to use
bool hila::is_device_rng_on() {
    return gpurandstateptr != nullptr;
}

// Kernel: seed the per-thread generator states
__global__ void seed_random_kernel(unsigned long long seed) {
    unsigned x = threadIdx.x + blockIdx.x * blockDim.x;
    gpurand_init(seed + x, 0, 0, &d_gpurandstateptr[x]);
}
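
// hila::initialize_device_rng(seed): initialize the device random number generator.
// One generator state is allocated per GPU thread; the number of thread blocks
// can be capped with GPU_RNG_THREAD_BLOCKS.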
void hila::initialize_device_rng(uint64_t seed) {
    unsigned long n_blocks = (lattice.mynode.volume() + N_threads - 1) / N_threads;

#if defined(GPU_RNG_THREAD_BLOCKS) && GPU_RNG_THREAD_BLOCKS > 0
    // Cap the number of RNG thread blocks if requested
    if (GPU_RNG_THREAD_BLOCKS < n_blocks) {
        n_blocks = GPU_RNG_THREAD_BLOCKS;
    }

    hila::out0 << "GPU random number generator initialized\n";
    hila::out0 << "GPU random number thread blocks: " << n_blocks << " of size " << N_threads
               << '\n';
#elif defined(GPU_RNG_THREAD_BLOCKS) && GPU_RNG_THREAD_BLOCKS < 0
    hila::out0 << "GPU RANDOM NUMBERS DISABLED, GPU_RNG_THREAD_BLOCKS < 0\n";
#else
    hila::out0 << "GPU random number generator initialized\n";
    hila::out0
        << "GPU random numbers: using one generator/site (GPU_RNG_THREAD_BLOCKS = 0 or undefined)\n";
#endif

    unsigned long long n_sites = n_blocks * N_threads;
    unsigned long long myseed = seed + hila::myrank() * n_sites;

    // Allocate the state array and copy the pointer to the device symbol
    gpuMalloc(&gpurandstateptr, n_sites * sizeof(gpurandState));
    gpuMemcpyToSymbol(d_gpurandstateptr, &gpurandstateptr, sizeof(gpurandState *), 0,
                      gpuMemcpyHostToDevice);

#if defined(CUDA)
    seed_random_kernel<<<n_blocks, N_threads>>>(myseed);
#else
    hipLaunchKernelGGL(seed_random_kernel, dim3(n_blocks), dim3(N_threads), 0, 0, myseed);
#endif
    check_device_error("seed_random kernel");
}
void hila::free_device_rng() {
    if (gpurandstateptr != nullptr) {
        gpuFree(gpurandstateptr);
        gpurandstateptr = nullptr;
        // Also clear the device-side copy of the pointer
        gpuMemcpyToSymbol(d_gpurandstateptr, &gpurandstateptr, sizeof(gpurandState *), 0,
                          gpuMemcpyHostToDevice);

        // A large allocation was released, purge the memory pool
        gpu_memory_pool_purge();
    }
}
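
// Typical use of the device RNG from application code (a sketch, not part of this
// file; assumes a Field<double> f and the usual HILA site-loop syntax):
//
//     hila::initialize_device_rng(seed);
//     onsites(ALL) { f[X] = hila::random(); }
//     hila::free_device_rng();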

// hila::random(): uniform random value; device threads use their own generator state
__device__ __host__ double hila::random() {
#ifdef __GPU_DEVICE_COMPILE__
    unsigned x = threadIdx.x + blockIdx.x * blockDim.x;
    return gpurand_uniform(&d_gpurandstateptr[x]);
#else
    return hila::host_random();
#endif
}
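
// The loop_lattice_* helpers below give the lattice dimensions both in host and
// device code: on the device they read the __constant__ copies filled in
// backend_lattice_struct::setup() further down, on the host they query the lattice object.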
__device__ __host__ int loop_lattice_size(Direction dir) {
#ifdef __GPU_DEVICE_COMPILE__
    return _d_size[dir];
#else
    return lattice.size(dir);
#endif
}
__device__ __host__ CoordinateVector loop_lattice_size(void) {
#ifdef __GPU_DEVICE_COMPILE__
    CoordinateVector v;
    foralldir(d) v[d] = _d_size[d];
    return v;
#else
    return lattice.size();
#endif
}
__device__ __host__ int64_t loop_lattice_volume(void) {
#ifdef __GPU_DEVICE_COMPILE__
    return _d_volume;
#else
    return lattice.volume();
#endif
}

#ifndef EVEN_SITES_FIRST

// Map a site index on this node to its coordinates: mixed-radix decoding over
// the node-local dimensions, offset by the node minimum.
__device__ const CoordinateVector backend_lattice_struct::coordinates(unsigned idx) const {
    CoordinateVector c;
    unsigned vdiv = idx, ndiv;

    for (int d = 0; d < NDIM - 1; ++d) {
        ndiv = vdiv / _d_nodesize[d];
        c[d] = vdiv - ndiv * _d_nodesize[d] + _d_nodemin[d];
        vdiv = ndiv;
    }
    c[NDIM - 1] = vdiv + _d_nodemin[NDIM - 1];

    return c;
}

// Single coordinate of a site index in direction dir
__device__ int backend_lattice_struct::coordinate(unsigned idx, Direction dir) const {
    return (idx / _d_nodefactor[dir]) % _d_nodesize[dir] + _d_nodemin[dir];
}

#endif // EVEN_SITES_FIRST
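
// Illustration (assumed numbers, not from the code): on a node with local size
// {4,4,4,4}, min {0,0,0,0} and size_factor {1,4,16,64}, site index idx = 5
// decodes to coordinates {1,1,0,0}; coordinate(5, d) with d the second direction
// likewise gives (5 / 4) % 4 + 0 = 1.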

// Set up the device copies of the lattice structure: neighbour index arrays,
// (optionally) site coordinates, and the lattice constants in __constant__ memory.
void backend_lattice_struct::setup(lattice_struct &lattice) {

    // Neighbour index arrays for all directions, copied to the device
    for (int d = 0; d < NDIRS; d++) {
        gpuMalloc(&(d_neighb[d]), lattice.mynode.volume() * sizeof(unsigned));
        gpuMemcpy(d_neighb[d], lattice.neighb[d], lattice.mynode.volume() * sizeof(unsigned),
                  gpuMemcpyHostToDevice);

#ifdef SPECIAL_BOUNDARY_CONDITIONS
        // Neighbour arrays for special (antiperiodic) boundaries, copied only
        // when they differ from the ordinary ones
        const unsigned *special_neighb =
            lattice.get_neighbour_array((Direction)d, hila::bc::ANTIPERIODIC);

        if (special_neighb != lattice.neighb[d]) {
            gpuMalloc(&(d_neighb_special[d]), lattice.mynode.volume() * sizeof(unsigned));
            gpuMemcpy(d_neighb_special[d], special_neighb,
                      lattice.mynode.volume() * sizeof(unsigned), gpuMemcpyHostToDevice);
        } else {
            d_neighb_special[d] = d_neighb[d];
        }
#endif
    }

#ifdef EVEN_SITES_FIRST
    // Copy the precomputed site coordinates to the device via a host staging buffer
    gpuMalloc(&(d_coordinates), lattice.mynode.volume() * sizeof(CoordinateVector));

    CoordinateVector *tmp = new CoordinateVector[lattice.mynode.volume()];
    for (unsigned i = 0; i < lattice.mynode.volume(); i++)
        tmp[i] = lattice.coordinates(i);

    gpuMemcpy(d_coordinates, tmp, lattice.mynode.volume() * sizeof(CoordinateVector),
              gpuMemcpyHostToDevice);
    delete[] tmp;
#endif
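
    // Finally copy the lattice constants declared at the top of this file into
    // device __constant__ memory, so device code can read them directly.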
    int64_t v = lattice.volume();
    gpuMemcpyToSymbol(_d_volume, &v, sizeof(int64_t), 0, gpuMemcpyHostToDevice);

    int s[NDIM];
    foralldir(d) s[d] = lattice.size(d);
    gpuMemcpyToSymbol(_d_size, s, sizeof(int) * NDIM, 0, gpuMemcpyHostToDevice);

#ifndef EVEN_SITES_FIRST
    foralldir(d) s[d] = lattice.mynode.size[d];
    gpuMemcpyToSymbol(_d_nodesize, s, sizeof(int) * NDIM, 0, gpuMemcpyHostToDevice);

    foralldir(d) s[d] = lattice.mynode.min[d];
    gpuMemcpyToSymbol(_d_nodemin, s, sizeof(int) * NDIM, 0, gpuMemcpyHostToDevice);

    foralldir(d) s[d] = lattice.mynode.size_factor[d];
    gpuMemcpyToSymbol(_d_nodefactor, s, sizeof(int) * NDIM, 0, gpuMemcpyHostToDevice);
#endif
}
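
// initialize_gpu(): select and initialize the GPU for this MPI rank. Ranks are
// mapped to devices round-robin (rank % n_devices); the -device argument is
// checked against the number of visible devices.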
void initialize_gpu(int rank, int device) {
    int n_devices, my_device;

    gpuGetDeviceCount(&n_devices);
    check_device_error("Could not get device count");

    if (device >= n_devices) {
        hila::out0 << "-device " << device << ": too large device number, maximum "
                   << n_devices - 1 << " on this machine\n";
        hila::terminate(0);
    }

    my_device = rank % n_devices;

    hila::out0 << "GPU devices accessible from node 0: " << n_devices << '\n';

    if (n_devices > 1 && rank < 6) {
        hila::out << "GPU: MPI rank " << rank << " choosing device " << my_device << std::endl;
    }

    gpuSetDevice(my_device);

    // Set the GPU RNG state to "off" until hila::initialize_device_rng() is called
    gpurandstateptr = nullptr;
    gpuMemcpyToSymbol(d_gpurandstateptr, &gpurandstateptr, sizeof(gpurandState *), 0,
                      gpuMemcpyHostToDevice);

#if defined(CUDA_MALLOC_ASYNC)
    // Keep freed memory cached in the default memory pool (never release below threshold)
    cudaMemPool_t mempool;
    cudaDeviceGetDefaultMemPool(&mempool, my_device);
    uint64_t threshold = UINT64_MAX;
    cudaMemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &threshold);
#endif
}
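
// gpu_device_info(): print the properties of the device used by rank 0 and related
// build/runtime information. Separate implementations for the CUDA and HIP backends.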

#if defined(CUDA)

#if defined(OPEN_MPI)
// mpi-ext.h provides MPIX_CUDA_AWARE_SUPPORT / MPIX_Query_cuda_support() in OpenMPI
#include "mpi-ext.h"
#endif

void gpu_device_info() {
    const int kb = 1024;
    const int mb = kb * kb;

    int driverVersion, rtVersion;
    GPU_CHECK(cudaDriverGetVersion(&driverVersion));
    GPU_CHECK(cudaRuntimeGetVersion(&rtVersion));
    hila::out << "CUDA driver version: " << driverVersion << ", runtime " << rtVersion << '\n';
    hila::out << "CUDART_VERSION " << CUDART_VERSION << '\n';

#if defined(CUDA_MALLOC_ASYNC)
    if (CUDART_VERSION >= 11020) {
        hila::out << "Using cudaMallocAsync() to allocate memory\n";
    }
#endif

    cudaDeviceProp props;
    int my_device;
    GPU_CHECK(cudaGetDevice(&my_device));
    GPU_CHECK(cudaGetDeviceProperties(&props, my_device));
    hila::out << "Device on node rank 0 device " << my_device << ":\n";
    hila::out << "  " << props.name << " capability: " << props.major << "." << props.minor
              << '\n';
    hila::out << "  Global memory:   " << props.totalGlobalMem / mb << "MB" << '\n';
    hila::out << "  Shared memory:   " << props.sharedMemPerBlock / kb << "kB" << '\n';
    hila::out << "  Constant memory: " << props.totalConstMem / kb << "kB" << '\n';
    hila::out << "  Block registers: " << props.regsPerBlock << '\n';

    hila::out << "  Warp size:         " << props.warpSize << '\n';
    hila::out << "  Threads per block: " << props.maxThreadsPerBlock << '\n';
    hila::out << "  Max block dimensions: [ " << props.maxThreadsDim[0] << ", "
              << props.maxThreadsDim[1] << ", " << props.maxThreadsDim[2] << " ]" << '\n';
    hila::out << "  Max grid dimensions:  [ " << props.maxGridSize[0] << ", "
              << props.maxGridSize[1] << ", " << props.maxGridSize[2] << " ]" << '\n';

    hila::out << "Threads in use: " << N_threads << '\n';

    // Report whether the MPI library is CUDA-aware
#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
    hila::out << "OpenMPI library supports CUDA-Aware MPI\n";
    if (MPIX_Query_cuda_support() == 1)
        hila::out << "  Runtime library supports CUDA-Aware MPI\n";
    else {
        hila::out << "  Runtime library does not support CUDA-Aware MPI!\n";
#if defined(GPU_AWARE_MPI)
        hila::out << "GPU_AWARE_MPI is defined -- THIS MAY CRASH IN MPI\n";
#endif
    }
#else
    hila::out << "OpenMPI library does not support CUDA-Aware MPI\n";
#if defined(GPU_AWARE_MPI)
    hila::out << "GPU_AWARE_MPI is defined -- THIS MAY CRASH IN MPI\n";
#endif
#endif
}

#endif // CUDA

#if defined(HIP)

void gpu_device_info() {
    const int kb = 1024;
    const int mb = kb * kb;

    int driverVersion, rtVersion;
    GPU_CHECK(hipDriverGetVersion(&driverVersion));
    GPU_CHECK(hipRuntimeGetVersion(&rtVersion));
    hila::out << "HIP driver version: " << driverVersion << ", runtime " << rtVersion << '\n';

    hipDeviceProp_t props;
    int my_device;
    GPU_CHECK(hipGetDevice(&my_device));
    GPU_CHECK(hipGetDeviceProperties(&props, my_device));
    hila::out << "Device on node rank 0 device " << my_device << ":\n";
    hila::out << "  " << props.name << " capability: " << props.major << "." << props.minor
              << '\n';
    hila::out << "  Global memory:   " << props.totalGlobalMem / mb << "MB" << '\n';
    hila::out << "  Shared memory:   " << props.sharedMemPerBlock / kb << "kB" << '\n';
    hila::out << "  Constant memory: " << props.totalConstMem / kb << "kB" << '\n';
    hila::out << "  Block registers: " << props.regsPerBlock << '\n';

    hila::out << "  Warp size:         " << props.warpSize << '\n';
    hila::out << "  Threads per block: " << props.maxThreadsPerBlock << '\n';
    hila::out << "  Max block dimensions: [ " << props.maxThreadsDim[0] << ", "
              << props.maxThreadsDim[1] << ", " << props.maxThreadsDim[2] << " ]" << '\n';
    hila::out << "  Max grid dimensions:  [ " << props.maxGridSize[0] << ", "
              << props.maxGridSize[1] << ", " << props.maxGridSize[2] << " ]" << '\n';

    hila::out << "Threads in use: " << N_threads << '\n';
}

#endif // HIP
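
// GPU error handlers: report the error together with its source location and stop
// the program (see the check_device_error()/GPU_CHECK() calls above).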
void gpu_exit_on_error(const char *msg, const char *file, int line) {
    gpuError code = gpuGetLastError();
    if (gpuSuccess != code) {
        hila::out << GPUTYPESTR << " error: " << msg << " in file " << file << " line " << line
                  << '\n';
        hila::out << GPUTYPESTR << " error string: " << gpuGetErrorString(code) << "\n";

        hila::terminate(0);
    }
}

void gpu_exit_on_error(gpuError code, const char *msg, const char *file, int line) {
    if (gpuSuccess != code) {
        hila::out << GPUTYPESTR << " error in command: " << msg << " in file " << file << " line "
                  << line << '\n';
        hila::out << GPUTYPESTR << " error string: " << gpuGetErrorString(code) << "\n";

        hila::terminate(0);
    }
}