#include "plumbing/defs.h"
#include "plumbing/lattice.h"
#include "plumbing/field.h"
#include "plumbing/backend_gpu/defs.h"

#if defined(CUDA)

#include <curand_kernel.h>

using gpurandState = curandState_t;
#define gpurand_init curand_init
#define gpurand_uniform curand_uniform
#define gpuMemcpyToSymbol(a, b, size, c, dir) GPU_CHECK(cudaMemcpyToSymbol(a, b, size, c, dir))
#define gpuGetDeviceCount(a) GPU_CHECK(cudaGetDeviceCount(a))
#define gpuSetDevice(dev) GPU_CHECK(cudaSetDevice(dev))
#define gpuGetLastError cudaGetLastError
#define gpuGetErrorString cudaGetErrorString
#elif defined(HIP)

#include <hip/hip_runtime.h>
#include <hiprand_kernel.h>

using gpurandState = hiprandState_t;
#define gpurand_init hiprand_init
#define gpurand_uniform hiprand_uniform
#define gpuMemcpyToSymbol(a, b, size, c, dir) \
    GPU_CHECK(hipMemcpyToSymbol(HIP_SYMBOL(a), b, size, c, dir))
#define gpuGetDeviceCount(a) GPU_CHECK(hipGetDeviceCount(a))
#define gpuSetDevice(dev) GPU_CHECK(hipSetDevice(dev))
#define gpuGetLastError hipGetLastError
#define gpuGetErrorString hipGetErrorString

#endif
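// Note (illustrative): with the aliases above the rest of this file is
// platform-agnostic. For example
//     gpuMemcpyToSymbol(_d_volume, &v, sizeof(int64_t), 0, gpuMemcpyHostToDevice);
// expands to the corresponding cuda*/hip* call wrapped in GPU_CHECK, which is
// assumed (from plumbing/backend_gpu/defs.h) to route failures to the
// gpu_exit_on_error() functions defined at the end of this file.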
// Lattice size and node division data, kept in device constant memory
__constant__ int64_t _d_volume;
__constant__ int _d_size[NDIM];
#ifndef EVEN_SITES_FIRST
__constant__ int _d_nodesize[NDIM];
__constant__ int _d_nodemin[NDIM];
__constant__ int _d_nodefactor[NDIM];
#endif
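// These live in __constant__ memory: every thread reads the same values, and
// the constant-cache broadcast makes such reads effectively register-speed.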
/* Random number generator state, kept on the device */
static gpurandState *gpurandstateptr;
__constant__ gpurandState *d_gpurandstateptr;

// Check whether the device RNG is allocated and ready to use
bool hila::is_device_rng_on() {
    return gpurandstateptr != nullptr;
}
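// Illustrative use (hypothetical call site, my_seed chosen by the application):
//     if (!hila::is_device_rng_on())
//         hila::initialize_device_rng(my_seed);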
/* Set the RNG seed on the device */
__global__ void seed_random_kernel(unsigned long long seed) {
    unsigned x = threadIdx.x + blockIdx.x * blockDim.x;
    // d_gpurandstateptr has been set before this kernel is launched
    gpurand_init(seed + x, 0, 0, &d_gpurandstateptr[x]);
}
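// Each thread x initializes its own generator state with seed + x, so every
// generator starts from a distinct seed; combined with the per-rank offset
// computed in initialize_device_rng() below, all generators across the run
// are seeded differently.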
/* Seed the device RNG: allocate one generator state per site index */
void hila::initialize_device_rng(uint64_t seed) {
    unsigned long n_blocks = (lattice.mynode.volume() + N_threads - 1) / N_threads;

#if defined(GPU_RNG_THREAD_BLOCKS) && GPU_RNG_THREAD_BLOCKS > 0
    // a fixed number of rng thread blocks has been requested
    if (GPU_RNG_THREAD_BLOCKS < n_blocks) {
        n_blocks = GPU_RNG_THREAD_BLOCKS;
    }

    hila::out0 << "GPU random number generator initialized\n";
    hila::out0 << "GPU random number thread blocks: " << n_blocks << " of size " << N_threads
               << " threads\n";
#elif defined(GPU_RNG_THREAD_BLOCKS) && GPU_RNG_THREAD_BLOCKS < 0
    hila::out0 << "GPU RANDOM NUMBERS DISABLED, GPU_RNG_THREAD_BLOCKS < 0\n";
#else
    hila::out0 << "GPU random number generator initialized\n";
    hila::out0
        << "GPU random numbers: using one generator/site (GPU_RNG_THREAD_BLOCKS = 0 or undefined)\n";
#endif

    unsigned long long n_sites = n_blocks * N_threads;
    unsigned long long myseed = seed + hila::myrank() * n_sites;

    // allocate the state array and copy its address to d_gpurandstateptr
    gpuMalloc(&gpurandstateptr, n_sites * sizeof(gpurandState));
    gpuMemcpyToSymbol(d_gpurandstateptr, &gpurandstateptr, sizeof(gpurandState *), 0,
                      gpuMemcpyHostToDevice);

#if defined(CUDA)
    seed_random_kernel<<<n_blocks, N_threads>>>(myseed);
#else
    hipLaunchKernelGGL(seed_random_kernel, dim3(n_blocks), dim3(N_threads), 0, 0, myseed);
#endif
    check_device_error("seed_random kernel");
}
/* Release the device RNG state */
void hila::free_device_rng() {
    if (gpurandstateptr != nullptr) {
        gpuFree(gpurandstateptr);
        gpurandstateptr = nullptr;
        // set d_gpurandstateptr <- nullptr on the device too
        gpuMemcpyToSymbol(d_gpurandstateptr, &gpurandstateptr, sizeof(gpurandState *), 0,
                          gpuMemcpyHostToDevice);

        // good to purge the memory pool after releasing a large chunk
        gpu_memory_pool_purge();
    }
}
/* Generate a uniform random number, on the device or on the host */
__device__ __host__ double hila::random() {
#ifdef __GPU_DEVICE_COMPILE__
    unsigned x = threadIdx.x + blockIdx.x * blockDim.x;
    return gpurand_uniform(&d_gpurandstateptr[x]);
#else
    return hila::host_random();
#endif
}
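// Illustrative use in hila site-loop code (sketch): inside an onsites() loop
// this compiles to the device branch above, one generator state per thread:
//     Field<double> f;
//     onsites(ALL) f[X] = hila::random();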
// loop_lattice_size() and loop_lattice_volume(): hilapp-generated loop code
// calls these, since lattice.size()/volume() are not usable in device code
__device__ __host__ int loop_lattice_size(Direction dir) {
#ifdef __GPU_DEVICE_COMPILE__
    return _d_size[dir];
#else
    return lattice.size(dir);
#endif
}

__device__ __host__ CoordinateVector loop_lattice_size(void) {
#ifdef __GPU_DEVICE_COMPILE__
    CoordinateVector v;
    foralldir(d) v[d] = _d_size[d];
    return v;
#else
    return lattice.size();
#endif
}

__device__ __host__ int64_t loop_lattice_volume(void) {
#ifdef __GPU_DEVICE_COMPILE__
    return _d_volume;
#else
    return lattice.volume();
#endif
}
#ifndef EVEN_SITES_FIRST

/* Recover the coordinates of site idx on this node, using the node division
   data in constant memory */
__device__ const CoordinateVector backend_lattice_struct::coordinates(unsigned idx) const {
    CoordinateVector c;
    unsigned vdiv, ndiv;

    vdiv = idx;
    for (int d = 0; d < NDIM - 1; ++d) {
        ndiv = vdiv / _d_nodesize[d];
        c[d] = vdiv - ndiv * _d_nodesize[d] + _d_nodemin[d];
        vdiv = ndiv;
    }
    c[NDIM - 1] = vdiv + _d_nodemin[NDIM - 1];

    return c;
}

/* Single coordinate of site idx */
__device__ int backend_lattice_struct::coordinate(unsigned idx, Direction dir) const {
    return (idx / _d_nodefactor[dir]) % _d_nodesize[dir] + _d_nodemin[dir];
}

#endif
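// Worked example for the two functions above (illustrative numbers): with
// NDIM = 4, node size {4,4,4,4} and node minimum {0,0,0,0}, idx = 37 decomposes as
//     d=0: 37 = 9*4 + 1  -> c[0] = 1
//     d=1:  9 = 2*4 + 1  -> c[1] = 1
//     d=2:  2 = 0*4 + 2  -> c[2] = 2
//     c[3] = 0
// i.e. idx = c[0] + 4*c[1] + 16*c[2] + 64*c[3]. coordinate(idx, dir) extracts a
// single digit directly, with _d_nodefactor = {1, 4, 16, 64} in this example.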
void backend_lattice_struct::setup(lattice_struct &lattice) {
    CoordinateVector *tmp;

    /* Copy the neighbour index arrays to the device, for all directions */
    for (int d = 0; d < NDIRS; d++) {
        // normal boundaries
        gpuMalloc(&(d_neighb[d]), lattice.mynode.volume() * sizeof(unsigned));
        gpuMemcpy(d_neighb[d], lattice.neighb[d], lattice.mynode.volume() * sizeof(unsigned),
                  gpuMemcpyHostToDevice);

#ifdef SPECIAL_BOUNDARY_CONDITIONS
        // special boundaries: copy a separate array only when it differs from
        // the normal one
        const unsigned *special_neighb =
            lattice.get_neighbour_array((Direction)d, hila::bc::ANTIPERIODIC);

        if (special_neighb != lattice.neighb[d]) {
            gpuMalloc(&(d_neighb_special[d]), lattice.mynode.volume() * sizeof(unsigned));
            gpuMemcpy(d_neighb_special[d], special_neighb,
                      lattice.mynode.volume() * sizeof(unsigned), gpuMemcpyHostToDevice);
        } else {
            d_neighb_special[d] = d_neighb[d];
        }
#endif
    }

#ifdef EVEN_SITES_FIRST
    /* Set up the device coordinate lookup table */
    gpuMalloc(&(d_coordinates), lattice.mynode.volume() * sizeof(CoordinateVector));
    tmp = (CoordinateVector *)memalloc(lattice.mynode.volume() * sizeof(CoordinateVector));
    for (unsigned i = 0; i < lattice.mynode.volume(); i++)
        tmp[i] = lattice.coordinates(i);

    gpuMemcpy(d_coordinates, tmp, lattice.mynode.volume() * sizeof(CoordinateVector),
              gpuMemcpyHostToDevice);
    free(tmp);
#endif
    // remaining backend_lattice parameters
    field_alloc_size = lattice.field_alloc_size();

    int64_t v = lattice.volume();
    gpuMemcpyToSymbol(_d_volume, &v, sizeof(int64_t), 0, gpuMemcpyHostToDevice);

    int s[NDIM];
    foralldir(d) s[d] = lattice.size(d);
    gpuMemcpyToSymbol(_d_size, s, sizeof(int) * NDIM, 0, gpuMemcpyHostToDevice);

#ifndef EVEN_SITES_FIRST
    foralldir(d) s[d] = lattice.mynode.size[d];
    gpuMemcpyToSymbol(_d_nodesize, s, sizeof(int) * NDIM, 0, gpuMemcpyHostToDevice);

    foralldir(d) s[d] = lattice.mynode.min[d];
    gpuMemcpyToSymbol(_d_nodemin, s, sizeof(int) * NDIM, 0, gpuMemcpyHostToDevice);

    foralldir(d) s[d] = lattice.mynode.size_factor[d];
    gpuMemcpyToSymbol(_d_nodefactor, s, sizeof(int) * NDIM, 0, gpuMemcpyHostToDevice);
#endif
}
void initialize_gpu(int rank, int device) {
    int n_devices, my_device;

    gpuGetDeviceCount(&n_devices);
    check_device_error("Could not get device count");

    // a device number given on the command line overrides the default
    // round-robin assignment (meaningful on single-node runs)
    if (device > 0 && hila::number_of_nodes() == 1) {
        if (device >= n_devices) {
            hila::out0 << "-device " << device << ": too large device number, maximum "
                       << n_devices - 1 << " on this machine\n";
            hila::terminate(0);
        }
        my_device = device;
    } else {
        my_device = rank % n_devices;
    }

    hila::out0 << "GPU devices accessible from node 0: " << n_devices << '\n';

    // report the device choice from the first few ranks only
    if (n_devices > 1 && rank < 6) {
        hila::out << "GPU: MPI rank " << rank << " choosing device " << my_device << std::endl;
    }

    gpuSetDevice(my_device);

    // set the gpu rng state to "off", to prevent accidental use before seeding
    gpurandstateptr = nullptr;
    gpuMemcpyToSymbol(d_gpurandstateptr, &gpurandstateptr, sizeof(gpurandState *), 0,
                      gpuMemcpyHostToDevice);

#if defined(CUDA_MALLOC_ASYNC)
    // raise the release threshold of the default memory pool
    cudaMemPool_t mempool;
    cudaDeviceGetDefaultMemPool(&mempool, my_device);
    uint64_t threshold = UINT64_MAX;
    cudaMemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &threshold);
#endif
}
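// Design note: cudaMemPoolAttrReleaseThreshold = UINT64_MAX tells the default
// memory pool to keep freed allocations cached rather than return them to the
// OS, so repeated alloc/free cycles of field buffers avoid expensive
// cudaMalloc/cudaFree round trips.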
#if defined(CUDA)

#ifdef OPEN_MPI
// mpi-ext.h provides the MPIX_CUDA_AWARE_SUPPORT macro tested below
#include "mpi-ext.h"
#endif

void gpu_device_info() {
    if (hila::myrank() == 0) {
        const int kb = 1024;
        const int mb = kb * kb;

        int driverVersion, rtVersion;
        GPU_CHECK(cudaDriverGetVersion(&driverVersion));
        GPU_CHECK(cudaRuntimeGetVersion(&rtVersion));
        hila::out << "CUDA driver version: " << driverVersion << ", runtime " << rtVersion << '\n';
        hila::out << "CUDART_VERSION " << CUDART_VERSION << '\n';
#if defined(CUDA_MALLOC_ASYNC)
        if (CUDART_VERSION >= 11020) {
            hila::out << "Using cudaMallocAsync() to allocate memory\n";
        }
#endif

        cudaDeviceProp props;
        int my_device;
        GPU_CHECK(cudaGetDevice(&my_device));
        GPU_CHECK(cudaGetDeviceProperties(&props, my_device));
        hila::out << "Device on node rank 0 device " << my_device << ":\n";
        hila::out << "  " << props.name << " capability: " << props.major << "." << props.minor
                  << '\n';
        hila::out << "  Global memory:   " << props.totalGlobalMem / mb << "MB" << '\n';
        hila::out << "  Shared memory:   " << props.sharedMemPerBlock / kb << "kB" << '\n';
        hila::out << "  Constant memory: " << props.totalConstMem / kb << "kB" << '\n';
        hila::out << "  Block registers: " << props.regsPerBlock << '\n';

        hila::out << "  Warp size:         " << props.warpSize << '\n';
        hila::out << "  Threads per block: " << props.maxThreadsPerBlock << '\n';
        hila::out << "  Max block dimensions: [ " << props.maxThreadsDim[0] << ", "
                  << props.maxThreadsDim[1] << ", " << props.maxThreadsDim[2] << " ]" << '\n';
        hila::out << "  Max grid dimensions:  [ " << props.maxGridSize[0] << ", "
                  << props.maxGridSize[1] << ", " << props.maxGridSize[2] << " ]" << '\n';

        hila::out << "Threads in use: " << N_threads << '\n';

#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
        hila::out << "OpenMPI library supports CUDA-Aware MPI\n";
        if (MPIX_Query_cuda_support() == 1)
            hila::out << "  Runtime library supports CUDA-Aware MPI\n";
        else {
            hila::out << "  Runtime library does not support CUDA-Aware MPI!\n";
#if defined(GPU_AWARE_MPI)
            hila::out << "GPU_AWARE_MPI is defined -- THIS MAY CRASH IN MPI\n";
#endif
        }
#else
        hila::out << "OpenMPI library does not support CUDA-Aware MPI\n";
#if defined(GPU_AWARE_MPI)
        hila::out << "GPU_AWARE_MPI is defined -- THIS MAY CRASH IN MPI\n";
#endif
#endif
    }
}

#endif // CUDA
#if defined(HIP)

void gpu_device_info() {
    if (hila::myrank() == 0) {
        const int kb = 1024;
        const int mb = kb * kb;

        int driverVersion, rtVersion;
        GPU_CHECK(hipDriverGetVersion(&driverVersion));
        GPU_CHECK(hipRuntimeGetVersion(&rtVersion));
        hila::out << "HIP driver version: " << driverVersion << ", runtime " << rtVersion << '\n';

        hipDeviceProp_t props;
        int my_device;
        GPU_CHECK(hipGetDevice(&my_device));
        GPU_CHECK(hipGetDeviceProperties(&props, my_device));
        hila::out << "Device on node rank 0 device " << my_device << ":\n";
        hila::out << "  " << props.name << " capability: " << props.major << "." << props.minor
                  << '\n';
        hila::out << "  Global memory:   " << props.totalGlobalMem / mb << "MB" << '\n';
        hila::out << "  Shared memory:   " << props.sharedMemPerBlock / kb << "kB" << '\n';
        hila::out << "  Constant memory: " << props.totalConstMem / kb << "kB" << '\n';
        hila::out << "  Block registers: " << props.regsPerBlock << '\n';

        hila::out << "  Warp size:         " << props.warpSize << '\n';
        hila::out << "  Threads per block: " << props.maxThreadsPerBlock << '\n';
        hila::out << "  Max block dimensions: [ " << props.maxThreadsDim[0] << ", "
                  << props.maxThreadsDim[1] << ", " << props.maxThreadsDim[2] << " ]" << '\n';
        hila::out << "  Max grid dimensions:  [ " << props.maxGridSize[0] << ", "
                  << props.maxGridSize[1] << ", " << props.maxGridSize[2] << " ]" << '\n';

        hila::out << "Threads in use: " << N_threads << '\n';
    }
}

#endif // HIP
/* Report the latest device error and terminate */
void gpu_exit_on_error(const char *msg, const char *file, int line) {
    gpuError code = gpuGetLastError();
    if (gpuSuccess != code) {
        hila::out << GPUTYPESTR << " error: " << msg << " in file " << file << " line " << line
                  << '\n';
        hila::out << GPUTYPESTR << " error string: " << gpuGetErrorString(code) << "\n";

        hila::terminate(0);
    }
}

/* Report an error code returned by a device API call and terminate */
void gpu_exit_on_error(gpuError code, const char *msg, const char *file, int line) {
    if (gpuSuccess != code) {
        hila::out << GPUTYPESTR << " error in command: " << msg << " in file " << file
                  << " line " << line << '\n';
        hila::out << GPUTYPESTR << " error string: " << gpuGetErrorString(code) << "\n";

        hila::terminate(0);
    }
}