#include "plumbing/defs.h"
#include "plumbing/lattice.h"
#include "plumbing/field.h"
#include "plumbing/backend_gpu/defs.h"

#if defined(CUDA)

#include <curand_kernel.h>

using gpurandState = curandState_t;
#define gpurand_init curand_init
#define gpurand_uniform curand_uniform
#define gpuMemcpyToSymbol(a, b, size, c, dir) GPU_CHECK(cudaMemcpyToSymbol(a, b, size, c, dir))
#define gpuGetDeviceCount(a) GPU_CHECK(cudaGetDeviceCount(a))
#define gpuSetDevice(dev) GPU_CHECK(cudaSetDevice(dev))
#define gpuGetLastError cudaGetLastError
#define gpuGetErrorString cudaGetErrorString
#elif defined(HIP)

#include <hip/hip_runtime.h>
#include <hiprand_kernel.h>

using gpurandState = hiprandState_t;
#define gpurand_init hiprand_init
#define gpurand_uniform hiprand_uniform
#define gpuMemcpyToSymbol(a, b, size, c, dir) \
    GPU_CHECK(hipMemcpyToSymbol(HIP_SYMBOL(a), b, size, c, dir))
#define gpuGetDeviceCount(a) GPU_CHECK(hipGetDeviceCount(a))
#define gpuSetDevice(dev) GPU_CHECK(hipSetDevice(dev))
#define gpuGetLastError hipGetLastError
#define gpuGetErrorString hipGetErrorString

#endif
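// Note (illustrative): with the aliases above the rest of this file is
// platform-agnostic. For example
//     gpuMemcpyToSymbol(_d_volume, &v, sizeof(int64_t), 0, gpuMemcpyHostToDevice);
// expands to the corresponding cuda*/hip* call wrapped in GPU_CHECK, which is
// assumed (from plumbing/backend_gpu/defs.h) to route failures to the
// gpu_exit_on_error() functions defined at the end of this file.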
// Lattice size and node division data, kept in device constant memory
__constant__ int64_t _d_volume;
__constant__ int _d_size[NDIM];
#ifndef EVEN_SITES_FIRST
__constant__ int _d_nodesize[NDIM];
__constant__ int _d_nodemin[NDIM];
__constant__ int _d_nodefactor[NDIM];
#endif
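// These live in __constant__ memory: every thread reads the same values, and
// the constant-cache broadcast makes such reads effectively register-speed.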
/* Random number generator state, kept on the device */
static gpurandState *gpurandstateptr;
__constant__ gpurandState *d_gpurandstateptr;

// Check whether the device RNG is allocated and ready to use
bool hila::is_device_rng_on() {
    return gpurandstateptr != nullptr;
}
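// Illustrative use (hypothetical call site, my_seed chosen by the application):
//     if (!hila::is_device_rng_on())
//         hila::initialize_device_rng(my_seed);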
/* Set the RNG seed on the device */
__global__ void seed_random_kernel(unsigned long long seed) {
    unsigned x = threadIdx.x + blockIdx.x * blockDim.x;
    // d_gpurandstateptr has been set before this kernel is launched
    gpurand_init(seed + x, 0, 0, &d_gpurandstateptr[x]);
}
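// Each thread x initializes its own generator state with seed + x, so every
// generator starts from a distinct seed; combined with the per-rank offset
// computed in initialize_device_rng() below, all generators across the run
// are seeded differently.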
/* Seed the device RNG: allocate one generator state per site index */
void hila::initialize_device_rng(uint64_t seed) {
    unsigned long n_blocks = (lattice.mynode.volume() + N_threads - 1) / N_threads;

#if defined(GPU_RNG_THREAD_BLOCKS) && GPU_RNG_THREAD_BLOCKS > 0
    // a fixed number of rng thread blocks has been requested
    if (GPU_RNG_THREAD_BLOCKS < n_blocks) {
        n_blocks = GPU_RNG_THREAD_BLOCKS;
    }

    hila::out0 << "GPU random number generator initialized\n";
    hila::out0 << "GPU random number thread blocks: " << n_blocks << " of size " << N_threads
               << " threads\n";
#elif defined(GPU_RNG_THREAD_BLOCKS) && GPU_RNG_THREAD_BLOCKS < 0
    hila::out0 << "GPU RANDOM NUMBERS DISABLED, GPU_RNG_THREAD_BLOCKS < 0\n";
#else
    hila::out0 << "GPU random number generator initialized\n";
    hila::out0
        << "GPU random numbers: using one generator/site (GPU_RNG_THREAD_BLOCKS = 0 or undefined)\n";
#endif

    unsigned long long n_sites = n_blocks * N_threads;
    unsigned long long myseed = seed + hila::myrank() * n_sites;

    // allocate the state array and copy its address to d_gpurandstateptr
    gpuMalloc(&gpurandstateptr, n_sites * sizeof(gpurandState));
    gpuMemcpyToSymbol(d_gpurandstateptr, &gpurandstateptr, sizeof(gpurandState *), 0,
                      gpuMemcpyHostToDevice);

#if defined(CUDA)
    seed_random_kernel<<<n_blocks, N_threads>>>(myseed);
#else
    hipLaunchKernelGGL(seed_random_kernel, dim3(n_blocks), dim3(N_threads), 0, 0, myseed);
#endif
    check_device_error("seed_random kernel");
}
/* Release the device RNG state */
void hila::free_device_rng() {
    if (gpurandstateptr != nullptr) {
        gpuFree(gpurandstateptr);
        gpurandstateptr = nullptr;
        // set d_gpurandstateptr <- nullptr on the device too
        gpuMemcpyToSymbol(d_gpurandstateptr, &gpurandstateptr, sizeof(gpurandState *), 0,
                          gpuMemcpyHostToDevice);

        // good to purge the memory pool after releasing a large chunk
        gpu_memory_pool_purge();
    }
}
/* Generate a uniform random number, on the device or on the host */
__device__ __host__ double hila::random() {
#ifdef __GPU_DEVICE_COMPILE__
    unsigned x = threadIdx.x + blockIdx.x * blockDim.x;
    return gpurand_uniform(&d_gpurandstateptr[x]);
#else
    return hila::host_random();
#endif
}
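// Illustrative use in hila site-loop code (sketch): inside an onsites() loop
// this compiles to the device branch above, one generator state per thread:
//     Field<double> f;
//     onsites(ALL) f[X] = hila::random();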
// loop_lattice_size() and loop_lattice_volume(): hilapp-generated loop code
// calls these, since lattice.size()/volume() are not usable in device code
__device__ __host__ int loop_lattice_size(Direction dir) {
#ifdef __GPU_DEVICE_COMPILE__
    return _d_size[dir];
#else
    return lattice.size(dir);
#endif
}

__device__ __host__ CoordinateVector loop_lattice_size(void) {
#ifdef __GPU_DEVICE_COMPILE__
    CoordinateVector v;
    foralldir(d) v[d] = _d_size[d];
    return v;
#else
    return lattice.size();
#endif
}

__device__ __host__ int64_t loop_lattice_volume(void) {
#ifdef __GPU_DEVICE_COMPILE__
    return _d_volume;
#else
    return lattice.volume();
#endif
}
#ifndef EVEN_SITES_FIRST

/* Recover the coordinates of site idx on this node, using the node division
   data in constant memory */
__device__ const CoordinateVector backend_lattice_struct::coordinates(unsigned idx) const {
    CoordinateVector c;
    unsigned vdiv, ndiv;

    vdiv = idx;
    for (int d = 0; d < NDIM - 1; ++d) {
        ndiv = vdiv / _d_nodesize[d];
        c[d] = vdiv - ndiv * _d_nodesize[d] + _d_nodemin[d];
        vdiv = ndiv;
    }
    c[NDIM - 1] = vdiv + _d_nodemin[NDIM - 1];

    return c;
}

/* Single coordinate of site idx */
__device__ int backend_lattice_struct::coordinate(unsigned idx, Direction dir) const {
    return (idx / _d_nodefactor[dir]) % _d_nodesize[dir] + _d_nodemin[dir];
}

#endif
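// Worked example for the two functions above (illustrative numbers): with
// NDIM = 4, node size {4,4,4,4} and node minimum {0,0,0,0}, idx = 37 decomposes as
//     d=0: 37 = 9*4 + 1  -> c[0] = 1
//     d=1:  9 = 2*4 + 1  -> c[1] = 1
//     d=2:  2 = 0*4 + 2  -> c[2] = 2
//     c[3] = 0
// i.e. idx = c[0] + 4*c[1] + 16*c[2] + 64*c[3]. coordinate(idx, dir) extracts a
// single digit directly, with _d_nodefactor = {1, 4, 16, 64} in this example.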
void backend_lattice_struct::setup(lattice_struct &lattice) {
    CoordinateVector *tmp;

    /* Copy the neighbour index arrays to the device, for all directions */
    for (int d = 0; d < NDIRS; d++) {
        // normal boundaries
        gpuMalloc(&(d_neighb[d]), lattice.mynode.volume() * sizeof(unsigned));
        gpuMemcpy(d_neighb[d], lattice.neighb[d], lattice.mynode.volume() * sizeof(unsigned),
                  gpuMemcpyHostToDevice);

#ifdef SPECIAL_BOUNDARY_CONDITIONS
        // special boundaries: copy a separate array only when it differs from
        // the normal one
        const unsigned *special_neighb =
            lattice.get_neighbour_array((Direction)d, hila::bc::ANTIPERIODIC);

        if (special_neighb != lattice.neighb[d]) {
            gpuMalloc(&(d_neighb_special[d]), lattice.mynode.volume() * sizeof(unsigned));
            gpuMemcpy(d_neighb_special[d], special_neighb,
                      lattice.mynode.volume() * sizeof(unsigned), gpuMemcpyHostToDevice);
        } else {
            d_neighb_special[d] = d_neighb[d];
        }
#endif
    }

#ifdef EVEN_SITES_FIRST
    /* Set up the device coordinate lookup table */
    gpuMalloc(&(d_coordinates), lattice.mynode.volume() * sizeof(CoordinateVector));
    tmp = (CoordinateVector *)memalloc(lattice.mynode.volume() * sizeof(CoordinateVector));
    for (unsigned i = 0; i < lattice.mynode.volume(); i++)
        tmp[i] = lattice.coordinates(i);

    gpuMemcpy(d_coordinates, tmp, lattice.mynode.volume() * sizeof(CoordinateVector),
              gpuMemcpyHostToDevice);
    free(tmp);
#endif
    // remaining backend_lattice parameters
    field_alloc_size = lattice.field_alloc_size();

    int64_t v = lattice.volume();
    gpuMemcpyToSymbol(_d_volume, &v, sizeof(int64_t), 0, gpuMemcpyHostToDevice);

    int s[NDIM];
    foralldir(d) s[d] = lattice.size(d);
    gpuMemcpyToSymbol(_d_size, s, sizeof(int) * NDIM, 0, gpuMemcpyHostToDevice);

#ifndef EVEN_SITES_FIRST
    foralldir(d) s[d] = lattice.mynode.size[d];
    gpuMemcpyToSymbol(_d_nodesize, s, sizeof(int) * NDIM, 0, gpuMemcpyHostToDevice);

    foralldir(d) s[d] = lattice.mynode.min[d];
    gpuMemcpyToSymbol(_d_nodemin, s, sizeof(int) * NDIM, 0, gpuMemcpyHostToDevice);

    foralldir(d) s[d] = lattice.mynode.size_factor[d];
    gpuMemcpyToSymbol(_d_nodefactor, s, sizeof(int) * NDIM, 0, gpuMemcpyHostToDevice);
#endif
}
void initialize_gpu(int rank, int device) {
    int n_devices, my_device;

    gpuGetDeviceCount(&n_devices);
    check_device_error("Could not get device count");

    // a device number given on the command line overrides the default
    // round-robin assignment (meaningful on single-node runs)
    if (device > 0 && hila::number_of_nodes() == 1) {
        if (device >= n_devices) {
            hila::out0 << "-device " << device << ": too large device number, maximum "
                       << n_devices - 1 << " on this machine\n";
            hila::terminate(0);
        }
        my_device = device;
    } else {
        my_device = rank % n_devices;
    }

    hila::out0 << "GPU devices accessible from node 0: " << n_devices << '\n';

    // report the device choice from the first few ranks only
    if (n_devices > 1 && rank < 6) {
        hila::out << "GPU: MPI rank " << rank << " choosing device " << my_device << std::endl;
    }

    gpuSetDevice(my_device);

    // set the gpu rng state to "off", to prevent accidental use before seeding
    gpurandstateptr = nullptr;
    gpuMemcpyToSymbol(d_gpurandstateptr, &gpurandstateptr, sizeof(gpurandState *), 0,
                      gpuMemcpyHostToDevice);

#if defined(CUDA_MALLOC_ASYNC)
    // raise the release threshold of the default memory pool
    cudaMemPool_t mempool;
    cudaDeviceGetDefaultMemPool(&mempool, my_device);
    uint64_t threshold = UINT64_MAX;
    cudaMemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &threshold);
#endif
}
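// Design note: cudaMemPoolAttrReleaseThreshold = UINT64_MAX tells the default
// memory pool to keep freed allocations cached rather than return them to the
// OS, so repeated alloc/free cycles of field buffers avoid expensive
// cudaMalloc/cudaFree round trips.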
#if defined(CUDA)

#ifdef OPEN_MPI
// mpi-ext.h provides the MPIX_CUDA_AWARE_SUPPORT macro tested below
#include "mpi-ext.h"
#endif

void gpu_device_info() {
    if (hila::myrank() == 0) {
        const int kb = 1024;
        const int mb = kb * kb;

        int driverVersion, rtVersion;
        GPU_CHECK(cudaDriverGetVersion(&driverVersion));
        GPU_CHECK(cudaRuntimeGetVersion(&rtVersion));
        hila::out << "CUDA driver version: " << driverVersion << ", runtime " << rtVersion << '\n';
        hila::out << "CUDART_VERSION " << CUDART_VERSION << '\n';
#if defined(CUDA_MALLOC_ASYNC)
        if (CUDART_VERSION >= 11020) {
            hila::out << "Using cudaMallocAsync() to allocate memory\n";
        }
#endif

        cudaDeviceProp props;
        int my_device;
        GPU_CHECK(cudaGetDevice(&my_device));
        GPU_CHECK(cudaGetDeviceProperties(&props, my_device));
        hila::out << "Device on node rank 0 device " << my_device << ":\n";
        hila::out << "  " << props.name << " capability: " << props.major << "." << props.minor
                  << '\n';
        hila::out << "  Global memory:   " << props.totalGlobalMem / mb << "MB" << '\n';
        hila::out << "  Shared memory:   " << props.sharedMemPerBlock / kb << "kB" << '\n';
        hila::out << "  Constant memory: " << props.totalConstMem / kb << "kB" << '\n';
        hila::out << "  Block registers: " << props.regsPerBlock << '\n';

        hila::out << "  Warp size:         " << props.warpSize << '\n';
        hila::out << "  Threads per block: " << props.maxThreadsPerBlock << '\n';
        hila::out << "  Max block dimensions: [ " << props.maxThreadsDim[0] << ", "
                  << props.maxThreadsDim[1] << ", " << props.maxThreadsDim[2] << " ]" << '\n';
        hila::out << "  Max grid dimensions:  [ " << props.maxGridSize[0] << ", "
                  << props.maxGridSize[1] << ", " << props.maxGridSize[2] << " ]" << '\n';

        hila::out << "Threads in use: " << N_threads << '\n';

#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
        hila::out << "OpenMPI library supports CUDA-Aware MPI\n";
        if (MPIX_Query_cuda_support() == 1)
            hila::out << "  Runtime library supports CUDA-Aware MPI\n";
        else {
            hila::out << "  Runtime library does not support CUDA-Aware MPI!\n";
#if defined(GPU_AWARE_MPI)
            hila::out << "GPU_AWARE_MPI is defined -- THIS MAY CRASH IN MPI\n";
#endif
        }
#else
        hila::out << "OpenMPI library does not support CUDA-Aware MPI\n";
#if defined(GPU_AWARE_MPI)
        hila::out << "GPU_AWARE_MPI is defined -- THIS MAY CRASH IN MPI\n";
#endif
#endif
    }
}

#endif // CUDA
#if defined(HIP)

void gpu_device_info() {
    if (hila::myrank() == 0) {
        const int kb = 1024;
        const int mb = kb * kb;

        int driverVersion, rtVersion;
        GPU_CHECK(hipDriverGetVersion(&driverVersion));
        GPU_CHECK(hipRuntimeGetVersion(&rtVersion));
        hila::out << "HIP driver version: " << driverVersion << ", runtime " << rtVersion << '\n';

        hipDeviceProp_t props;
        int my_device;
        GPU_CHECK(hipGetDevice(&my_device));
        GPU_CHECK(hipGetDeviceProperties(&props, my_device));
        hila::out << "Device on node rank 0 device " << my_device << ":\n";
        hila::out << "  " << props.name << " capability: " << props.major << "." << props.minor
                  << '\n';
        hila::out << "  Global memory:   " << props.totalGlobalMem / mb << "MB" << '\n';
        hila::out << "  Shared memory:   " << props.sharedMemPerBlock / kb << "kB" << '\n';
        hila::out << "  Constant memory: " << props.totalConstMem / kb << "kB" << '\n';
        hila::out << "  Block registers: " << props.regsPerBlock << '\n';

        hila::out << "  Warp size:         " << props.warpSize << '\n';
        hila::out << "  Threads per block: " << props.maxThreadsPerBlock << '\n';
        hila::out << "  Max block dimensions: [ " << props.maxThreadsDim[0] << ", "
                  << props.maxThreadsDim[1] << ", " << props.maxThreadsDim[2] << " ]" << '\n';
        hila::out << "  Max grid dimensions:  [ " << props.maxGridSize[0] << ", "
                  << props.maxGridSize[1] << ", " << props.maxGridSize[2] << " ]" << '\n';

        hila::out << "Threads in use: " << N_threads << '\n';
    }
}

#endif // HIP
/* Report the latest device error and terminate */
void gpu_exit_on_error(const char *msg, const char *file, int line) {
    gpuError code = gpuGetLastError();
    if (gpuSuccess != code) {
        hila::out << GPUTYPESTR << " error: " << msg << " in file " << file << " line " << line
                  << '\n';
        hila::out << GPUTYPESTR << " error string: " << gpuGetErrorString(code) << "\n";

        hila::terminate(0);
    }
}

/* Report an error code returned by a device API call and terminate */
void gpu_exit_on_error(gpuError code, const char *msg, const char *file, int line) {
    if (gpuSuccess != code) {
        hila::out << GPUTYPESTR << " error in command: " << msg << " in file " << file
                  << " line " << line << '\n';
        hila::out << GPUTYPESTR << " error string: " << gpuGetErrorString(code) << "\n";

        hila::terminate(0);
    }
}