#ifndef MXNET_COMMON_CUDA_UTILS_H_
#define MXNET_COMMON_CUDA_UTILS_H_

#include <dmlc/logging.h>
#include <dmlc/parameter.h>
#include <dmlc/optional.h>
#include <mshadow/base.h>

/*!
 * Stubs that let JetBrains IDEs parse CUDA builtins without nvcc.
 * These exist purely for the IDE's static analysis; they are never
 * compiled into a real build (__JETBRAINS_IDE__ is IDE-defined only).
 * NOTE(review): the extraction dropped a few adjacent stub #defines
 * (original lines 35-38) — confirm against the pristine header.
 */
#ifdef __JETBRAINS_IDE__
#define __forceinline__
inline void __syncthreads() {}
inline void __threadfence_block() {}
// Parse-only placeholder: the real __clz counts leading zeros; the identity
// function is sufficient for the IDE's type checking.
template<class T>
inline T __clz(const T val) { return val; }
// Fake 3-component struct standing in for the built-in dim3-like variables.
struct __cuda_fake_struct { int x; int y; int z; };
extern __cuda_fake_struct blockDim;
extern __cuda_fake_struct threadIdx;
extern __cuda_fake_struct blockIdx;
#endif  // __JETBRAINS_IDE__
    52 #include <cuda_runtime.h>    53 #include <cublas_v2.h>    67   case CUBLAS_STATUS_SUCCESS:
    68     return "CUBLAS_STATUS_SUCCESS";
    69   case CUBLAS_STATUS_NOT_INITIALIZED:
    70     return "CUBLAS_STATUS_NOT_INITIALIZED";
    71   case CUBLAS_STATUS_ALLOC_FAILED:
    72     return "CUBLAS_STATUS_ALLOC_FAILED";
    73   case CUBLAS_STATUS_INVALID_VALUE:
    74     return "CUBLAS_STATUS_INVALID_VALUE";
    75   case CUBLAS_STATUS_ARCH_MISMATCH:
    76     return "CUBLAS_STATUS_ARCH_MISMATCH";
    77   case CUBLAS_STATUS_MAPPING_ERROR:
    78     return "CUBLAS_STATUS_MAPPING_ERROR";
    79   case CUBLAS_STATUS_EXECUTION_FAILED:
    80     return "CUBLAS_STATUS_EXECUTION_FAILED";
    81   case CUBLAS_STATUS_INTERNAL_ERROR:
    82     return "CUBLAS_STATUS_INTERNAL_ERROR";
    83   case CUBLAS_STATUS_NOT_SUPPORTED:
    84     return "CUBLAS_STATUS_NOT_SUPPORTED";
    88   return "Unknown cuBLAS status";
    98   case CUSOLVER_STATUS_SUCCESS:
    99     return "CUSOLVER_STATUS_SUCCESS";
   100   case CUSOLVER_STATUS_NOT_INITIALIZED:
   101     return "CUSOLVER_STATUS_NOT_INITIALIZED";
   102   case CUSOLVER_STATUS_ALLOC_FAILED:
   103     return "CUSOLVER_STATUS_ALLOC_FAILED";
   104   case CUSOLVER_STATUS_INVALID_VALUE:
   105     return "CUSOLVER_STATUS_INVALID_VALUE";
   106   case CUSOLVER_STATUS_ARCH_MISMATCH:
   107     return "CUSOLVER_STATUS_ARCH_MISMATCH";
   108   case CUSOLVER_STATUS_EXECUTION_FAILED:
   109     return "CUSOLVER_STATUS_EXECUTION_FAILED";
   110   case CUSOLVER_STATUS_INTERNAL_ERROR:
   111     return "CUSOLVER_STATUS_INTERNAL_ERROR";
   112   case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
   113     return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
   117   return "Unknown cuSOLVER status";
   127   case CURAND_STATUS_SUCCESS:
   128     return "CURAND_STATUS_SUCCESS";
   129   case CURAND_STATUS_VERSION_MISMATCH:
   130     return "CURAND_STATUS_VERSION_MISMATCH";
   131   case CURAND_STATUS_NOT_INITIALIZED:
   132     return "CURAND_STATUS_NOT_INITIALIZED";
   133   case CURAND_STATUS_ALLOCATION_FAILED:
   134     return "CURAND_STATUS_ALLOCATION_FAILED";
   135   case CURAND_STATUS_TYPE_ERROR:
   136     return "CURAND_STATUS_TYPE_ERROR";
   137   case CURAND_STATUS_OUT_OF_RANGE:
   138     return "CURAND_STATUS_OUT_OF_RANGE";
   139   case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
   140     return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
   141   case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
   142     return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
   143   case CURAND_STATUS_LAUNCH_FAILURE:
   144     return "CURAND_STATUS_LAUNCH_FAILURE";
   145   case CURAND_STATUS_PREEXISTING_FAILURE:
   146     return "CURAND_STATUS_PREEXISTING_FAILURE";
   147   case CURAND_STATUS_INITIALIZATION_FAILED:
   148     return "CURAND_STATUS_INITIALIZATION_FAILED";
   149   case CURAND_STATUS_ARCH_MISMATCH:
   150     return "CURAND_STATUS_ARCH_MISMATCH";
   151   case CURAND_STATUS_INTERNAL_ERROR:
   152     return "CURAND_STATUS_INTERNAL_ERROR";
   154   return "Unknown cuRAND status";
   157 template <
typename DType>
   158 inline DType __device__ 
CudaMax(DType a, DType b) {
   159     return a > b ? a : b;
   162 template <
typename DType>
   163 inline DType __device__ 
CudaMin(DType a, DType b) {
   164     return a < b ? a : b;
   175 #define CHECK_CUDA_ERROR(msg)                                                \   177     cudaError_t e = cudaGetLastError();                                      \   178     CHECK_EQ(e, cudaSuccess) << (msg) << " CUDA: " << cudaGetErrorString(e); \   187 #define CUDA_CALL(func)                                            \   189     cudaError_t e = (func);                                        \   190     CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading)       \   191         << "CUDA: " << cudaGetErrorString(e);                      \   200 #define CUBLAS_CALL(func)                                       \   202     cublasStatus_t e = (func);                                  \   203     CHECK_EQ(e, CUBLAS_STATUS_SUCCESS)                          \   204         << "cuBLAS: " << mxnet::common::cuda::CublasGetErrorString(e); \   213 #define CUSOLVER_CALL(func)                                         \   215     cusolverStatus_t e = (func);                                    \   216     CHECK_EQ(e, CUSOLVER_STATUS_SUCCESS)                            \   217         << "cuSolver: " << mxnet::common::cuda::CusolverGetErrorString(e); \   226 #define CURAND_CALL(func)                                       \   228     curandStatus_t e = (func);                                  \   229     CHECK_EQ(e, CURAND_STATUS_SUCCESS)                          \   230         << "cuRAND: " << mxnet::common::cuda::CurandGetErrorString(e); \   239 #define NVRTC_CALL(x)                                   \   241     nvrtcResult result = x;                             \   242     CHECK_EQ(result, NVRTC_SUCCESS)                     \   243       << #x " failed with error "                       \   244       << nvrtcGetErrorString(result);                   \   253 #define CUDA_DRIVER_CALL(func)                                          \   255     CUresult e = (func);                                                \   256     if (e != CUDA_SUCCESS) {                             
               \   257       char const * err_msg = nullptr;                                         \   258       if (cuGetErrorString(e, &err_msg) == CUDA_ERROR_INVALID_VALUE) {  \   259         LOG(FATAL) << "CUDA Driver: Unknown error " << e;               \   261         LOG(FATAL) << "CUDA Driver: " << err_msg;                       \   267 #if !defined(_MSC_VER)   268 #define CUDA_UNROLL _Pragma("unroll")   269 #define CUDA_NOUNROLL _Pragma("nounroll")   272 #define CUDA_NOUNROLL   283                                    cudaDevAttrComputeCapabilityMajor, device_id));
   295                                    cudaDevAttrComputeCapabilityMinor, device_id));
   307   return 10 * major + minor;
   322     return (computeCapabilityMajor > 5) ||
   335   return device_id >= 0 &&
   340 #define MXNET_CUDA_ALLOW_TENSOR_CORE_DEFAULT true   350   static bool allow_tensor_core = 
false;
   351   static bool is_set = 
false;
   355     allow_tensor_core = dmlc::GetEnv(
"MXNET_CUDA_ALLOW_TENSOR_CORE",
   356                                      dmlc::optional<bool>(default_value)).value();
   359   return allow_tensor_core;
   362 #if CUDA_VERSION >= 9000   364 inline cublasMath_t SetCublasMathMode(cublasHandle_t blas_handle, cublasMath_t new_math_type) {
   365   auto handle_math_mode = CUBLAS_DEFAULT_MATH;
   366   CUBLAS_CALL(cublasGetMathMode(blas_handle, &handle_math_mode));
   367   CUBLAS_CALL(cublasSetMathMode(blas_handle, new_math_type));
   368   return handle_math_mode;
   372 #endif  // MXNET_USE_CUDA   378 #define CUDNN_CALL(func)                                                      \   380     cudnnStatus_t e = (func);                                                 \   381     CHECK_EQ(e, CUDNN_STATUS_SUCCESS) << "cuDNN: " << cudnnGetErrorString(e); \   391 inline int MaxForwardAlgos(cudnnHandle_t cudnn_handle) {
   394   CUDNN_CALL(cudnnGetConvolutionForwardAlgorithmMaxCount(cudnn_handle, &max_algos));
   408 inline int MaxBackwardFilterAlgos(cudnnHandle_t cudnn_handle) {
   411   CUDNN_CALL(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(cudnn_handle, &max_algos));
   425 inline int MaxBackwardDataAlgos(cudnnHandle_t cudnn_handle) {
   428   CUDNN_CALL(cudnnGetConvolutionBackwardDataAlgorithmMaxCount(cudnn_handle, &max_algos));
   435 #endif  // MXNET_USE_CUDNN   438 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600   440 static inline  __device__  
void atomicAdd(
double *address, 
double val) {
   441   unsigned long long* address_as_ull =                  
   442     reinterpret_cast<unsigned long long*
>(address);     
   443   unsigned long long old = *address_as_ull;             
   444   unsigned long long assumed;                           
   448     old = atomicCAS(address_as_ull, assumed,
   449                     __double_as_longlong(val +
   450                     __longlong_as_double(assumed)));
   453   } 
while (assumed != old);
   460 #if defined(__CUDA_ARCH__)   461 static inline __device__ 
void atomicAdd(mshadow::half::half_t *address,
   462                                         mshadow::half::half_t val) {
   463   unsigned int *address_as_ui =
   464       reinterpret_cast<unsigned int *
>(
reinterpret_cast<char *
>(address) -
   465                                    (reinterpret_cast<size_t>(address) & 2));
   466   unsigned int old = *address_as_ui;
   467   unsigned int assumed;
   471     mshadow::half::half_t hsum;
   473         reinterpret_cast<size_t>(address) & 2 ? (old >> 16) : (old & 0xffff);
   475     old = 
reinterpret_cast<size_t>(address) & 2
   476               ? (old & 0xffff) | (hsum.half_ << 16)
   477               : (old & 0xffff0000) | hsum.half_;
   478     old = atomicCAS(address_as_ui, assumed, old);
   479   } 
while (assumed != old);
   483 static inline  __device__  
void atomicAdd(int64_t *address, int64_t val) {
   484   atomicAdd(reinterpret_cast<unsigned long long*>(address), static_cast<unsigned long long>(val)); 
   487 template <
typename DType>
   488 __device__ 
inline DType ldg(
const DType* address) {
   489 #if __CUDA_ARCH__ >= 350   490     return __ldg(address);
   497 #endif  // MXNET_COMMON_CUDA_UTILS_H_ #define CUBLAS_CALL(func)
Protected cuBLAS call. 
Definition: cuda_utils.h:200
 
int ComputeCapabilityMajor(int device_id)
Determine major version number of the gpu's cuda compute architecture. 
Definition: cuda_utils.h:280
 
namespace of mxnet 
Definition: base.h:127
 
bool GetEnvAllowTensorCore()
Returns global policy for TensorCore algo use. 
Definition: cuda_utils.h:346
 
int SMArch(int device_id)
Return the integer SM architecture (e.g. Volta = 70). 
Definition: cuda_utils.h:304
 
DType __device__ CudaMin(DType a, DType b)
Definition: cuda_utils.h:163
 
bool SupportsFloat16Compute(int device_id)
Determine whether a cuda-capable gpu's architecture supports float16 math. Assume not if device_id is...
Definition: cuda_utils.h:316
 
DType __device__ CudaMax(DType a, DType b)
Definition: cuda_utils.h:158
 
bool SupportsTensorCore(int device_id)
Determine whether a cuda-capable gpu's architecture supports Tensor Core math. Assume not if device_i...
Definition: cuda_utils.h:333
 
const char * CusolverGetErrorString(cusolverStatus_t error)
Get string representation of cuSOLVER errors. 
Definition: cuda_utils.h:96
 
#define MXNET_CUDA_ALLOW_TENSOR_CORE_DEFAULT
Definition: cuda_utils.h:340
 
const char * CurandGetErrorString(curandStatus_t status)
Get string representation of cuRAND errors. 
Definition: cuda_utils.h:125
 
int ComputeCapabilityMinor(int device_id)
Determine minor version number of the gpu's cuda compute architecture. 
Definition: cuda_utils.h:292
 
#define CUDA_CALL(func)
Protected CUDA call. 
Definition: cuda_utils.h:187
 
const char * CublasGetErrorString(cublasStatus_t error)
Get string representation of cuBLAS errors. 
Definition: cuda_utils.h:65