#ifndef MXNET_COMMON_CUDA_UTILS_H_
#define MXNET_COMMON_CUDA_UTILS_H_

#include <dmlc/logging.h>
#include <dmlc/parameter.h>
#include <dmlc/optional.h>
#include <mshadow/base.h>

/*! \brief Macros/inlines to assist CLion to parse CUDA files (*.cu, *.cuh) */
#ifdef __JETBRAINS_IDE__
#define __CUDACC__ 1
#define __host__
#define __device__
#define __global__
#define __forceinline__
#define __shared__
inline void __syncthreads() {}
inline void __threadfence_block() {}
template <class T>
inline T __clz(const T val) { return val; }
struct __cuda_fake_struct { int x; int y; int z; };
extern __cuda_fake_struct blockDim;
extern __cuda_fake_struct threadIdx;
extern __cuda_fake_struct blockIdx;
#endif  // __JETBRAINS_IDE__
#if MXNET_USE_CUDA

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <curand.h>
#include <cusolverDn.h>

namespace mxnet {
namespace common {
/*! \brief common utils for cuda */
namespace cuda {

/*!
 * \brief Get string representation of cuBLAS errors.
 * \param error The error.
 * \return String representation.
 */
inline const char* CublasGetErrorString(cublasStatus_t error) {
  switch (error) {
  case CUBLAS_STATUS_SUCCESS:
    return "CUBLAS_STATUS_SUCCESS";
  case CUBLAS_STATUS_NOT_INITIALIZED:
    return "CUBLAS_STATUS_NOT_INITIALIZED";
  case CUBLAS_STATUS_ALLOC_FAILED:
    return "CUBLAS_STATUS_ALLOC_FAILED";
  case CUBLAS_STATUS_INVALID_VALUE:
    return "CUBLAS_STATUS_INVALID_VALUE";
  case CUBLAS_STATUS_ARCH_MISMATCH:
    return "CUBLAS_STATUS_ARCH_MISMATCH";
  case CUBLAS_STATUS_MAPPING_ERROR:
    return "CUBLAS_STATUS_MAPPING_ERROR";
  case CUBLAS_STATUS_EXECUTION_FAILED:
    return "CUBLAS_STATUS_EXECUTION_FAILED";
  case CUBLAS_STATUS_INTERNAL_ERROR:
    return "CUBLAS_STATUS_INTERNAL_ERROR";
  case CUBLAS_STATUS_NOT_SUPPORTED:
    return "CUBLAS_STATUS_NOT_SUPPORTED";
  default:
    break;
  }
  return "Unknown cuBLAS status";
}
/*!
 * \brief Get string representation of cuSOLVER errors.
 * \param error The error.
 * \return String representation.
 */
inline const char* CusolverGetErrorString(cusolverStatus_t error) {
  switch (error) {
  case CUSOLVER_STATUS_SUCCESS:
    return "CUSOLVER_STATUS_SUCCESS";
  case CUSOLVER_STATUS_NOT_INITIALIZED:
    return "CUSOLVER_STATUS_NOT_INITIALIZED";
  case CUSOLVER_STATUS_ALLOC_FAILED:
    return "CUSOLVER_STATUS_ALLOC_FAILED";
  case CUSOLVER_STATUS_INVALID_VALUE:
    return "CUSOLVER_STATUS_INVALID_VALUE";
  case CUSOLVER_STATUS_ARCH_MISMATCH:
    return "CUSOLVER_STATUS_ARCH_MISMATCH";
  case CUSOLVER_STATUS_EXECUTION_FAILED:
    return "CUSOLVER_STATUS_EXECUTION_FAILED";
  case CUSOLVER_STATUS_INTERNAL_ERROR:
    return "CUSOLVER_STATUS_INTERNAL_ERROR";
  case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
    return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
  default:
    break;
  }
  return "Unknown cuSOLVER status";
}
/*!
 * \brief Get string representation of cuRAND errors.
 * \param status The status.
 * \return String representation.
 */
inline const char* CurandGetErrorString(curandStatus_t status) {
  switch (status) {
  case CURAND_STATUS_SUCCESS:
    return "CURAND_STATUS_SUCCESS";
  case CURAND_STATUS_VERSION_MISMATCH:
    return "CURAND_STATUS_VERSION_MISMATCH";
  case CURAND_STATUS_NOT_INITIALIZED:
    return "CURAND_STATUS_NOT_INITIALIZED";
  case CURAND_STATUS_ALLOCATION_FAILED:
    return "CURAND_STATUS_ALLOCATION_FAILED";
  case CURAND_STATUS_TYPE_ERROR:
    return "CURAND_STATUS_TYPE_ERROR";
  case CURAND_STATUS_OUT_OF_RANGE:
    return "CURAND_STATUS_OUT_OF_RANGE";
  case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
    return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
  case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
    return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
  case CURAND_STATUS_LAUNCH_FAILURE:
    return "CURAND_STATUS_LAUNCH_FAILURE";
  case CURAND_STATUS_PREEXISTING_FAILURE:
    return "CURAND_STATUS_PREEXISTING_FAILURE";
  case CURAND_STATUS_INITIALIZATION_FAILED:
    return "CURAND_STATUS_INITIALIZATION_FAILED";
  case CURAND_STATUS_ARCH_MISMATCH:
    return "CURAND_STATUS_ARCH_MISMATCH";
  case CURAND_STATUS_INTERNAL_ERROR:
    return "CURAND_STATUS_INTERNAL_ERROR";
  }
  return "Unknown cuRAND status";
}
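// Usage sketch (illustrative, not part of the original header): checking a raw
// cuBLAS status by hand with CublasGetErrorString.  The CUBLAS_CALL macro
// defined below wraps this same pattern; the handle creation here is only an
// example.
//
//   cublasHandle_t handle;
//   cublasStatus_t status = cublasCreate(&handle);
//   if (status != CUBLAS_STATUS_SUCCESS) {
//     LOG(FATAL) << "cuBLAS: " << CublasGetErrorString(status);
//   }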
/*! \brief Maximum of two values, usable in device code. */
template <typename DType>
inline DType __device__ CudaMax(DType a, DType b) {
  return a > b ? a : b;
}
/*! \brief Minimum of two values, usable in device code. */
template <typename DType>
inline DType __device__ CudaMin(DType a, DType b) {
  return a < b ? a : b;
}
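// Usage sketch (illustrative): clamping values inside a hypothetical kernel
// with CudaMax/CudaMin, which avoid pulling in <algorithm> on the device side.
//
//   __global__ void ClampKernel(float* data, float lo, float hi, int n) {
//     int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < n) data[i] = CudaMin(CudaMax(data[i], lo), hi);
//   }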
}  // namespace cuda
}  // namespace common
}  // namespace mxnet

/*! \brief Check CUDA error after a kernel launch, prefixing the message with msg. */
#define CHECK_CUDA_ERROR(msg)                                                \
  {                                                                          \
    cudaError_t e = cudaGetLastError();                                      \
    CHECK_EQ(e, cudaSuccess) << (msg) << " CUDA: " << cudaGetErrorString(e); \
  }

/*!
 * \brief Protected CUDA call.
 * \param func Expression to call.
 *
 * It checks for CUDA errors after invocation of the expression.
 */
#define CUDA_CALL(func)                                      \
  {                                                          \
    cudaError_t e = (func);                                  \
    CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \
        << "CUDA: " << cudaGetErrorString(e);                \
  }

/*! \brief Protected cuBLAS call. */
#define CUBLAS_CALL(func)                                              \
  {                                                                    \
    cublasStatus_t e = (func);                                         \
    CHECK_EQ(e, CUBLAS_STATUS_SUCCESS)                                 \
        << "cuBLAS: " << mxnet::common::cuda::CublasGetErrorString(e); \
  }

/*! \brief Protected cuSolver call. */
#define CUSOLVER_CALL(func)                                                \
  {                                                                        \
    cusolverStatus_t e = (func);                                           \
    CHECK_EQ(e, CUSOLVER_STATUS_SUCCESS)                                   \
        << "cuSolver: " << mxnet::common::cuda::CusolverGetErrorString(e); \
  }

/*! \brief Protected cuRAND call. */
#define CURAND_CALL(func)                                              \
  {                                                                    \
    curandStatus_t e = (func);                                         \
    CHECK_EQ(e, CURAND_STATUS_SUCCESS)                                 \
        << "cuRAND: " << mxnet::common::cuda::CurandGetErrorString(e); \
  }

/*! \brief Protected NVRTC call. */
#define NVRTC_CALL(x)                                                   \
  {                                                                     \
    nvrtcResult result = x;                                             \
    CHECK_EQ(result, NVRTC_SUCCESS)                                     \
        << #x " failed with error " << nvrtcGetErrorString(result);     \
  }

/*! \brief Protected CUDA driver call. */
#define CUDA_DRIVER_CALL(func)                                          \
  {                                                                     \
    CUresult e = (func);                                                \
    if (e != CUDA_SUCCESS) {                                            \
      char const* err_msg = nullptr;                                    \
      if (cuGetErrorString(e, &err_msg) == CUDA_ERROR_INVALID_VALUE) {  \
        LOG(FATAL) << "CUDA Driver: Unknown error " << e;               \
      } else {                                                          \
        LOG(FATAL) << "CUDA Driver: " << err_msg;                       \
      }                                                                 \
    }                                                                   \
  }

#if !defined(_MSC_VER)
#define CUDA_UNROLL _Pragma("unroll")
#define CUDA_NOUNROLL _Pragma("nounroll")
#else
#define CUDA_UNROLL
#define CUDA_NOUNROLL
#endif

/*!
 * \brief Determine major version number of the gpu's cuda compute architecture.
 * \param device_id The device index of the cuda-capable gpu of interest.
 * \return the major version number of the gpu's cuda compute architecture.
 */
inline int ComputeCapabilityMajor(int device_id) {
  int major = 0;
  CUDA_CALL(cudaDeviceGetAttribute(&major,
                                   cudaDevAttrComputeCapabilityMajor, device_id));
  return major;
}

/*!
 * \brief Determine minor version number of the gpu's cuda compute architecture.
 * \param device_id The device index of the cuda-capable gpu of interest.
 * \return the minor version number of the gpu's cuda compute architecture.
 */
inline int ComputeCapabilityMinor(int device_id) {
  int minor = 0;
  CUDA_CALL(cudaDeviceGetAttribute(&minor,
                                   cudaDevAttrComputeCapabilityMinor, device_id));
  return minor;
}
/*!
 * \brief Return the integer SM architecture (e.g. Volta = 70).
 * \param device_id The device index of the cuda-capable gpu of interest.
 * \return the gpu's cuda compute architecture as an int.
 */
inline int SMArch(int device_id) {
  auto major = ComputeCapabilityMajor(device_id);
  auto minor = ComputeCapabilityMinor(device_id);
  return 10 * major + minor;
}
/*!
 * \brief Determine whether a cuda-capable gpu's architecture supports float16 math.
 *        Assume not if device_id is negative.
 * \param device_id The device index of the cuda-capable gpu of interest.
 * \return whether the gpu's architecture supports float16 math.
 */
inline bool SupportsFloat16Compute(int device_id) {
  if (device_id < 0) {
    return false;
  } else {
    // Kepler and most Maxwell GPUs do not support fp16 compute
    int computeCapabilityMajor = ComputeCapabilityMajor(device_id);
    return (computeCapabilityMajor > 5) ||
           (computeCapabilityMajor == 5 && ComputeCapabilityMinor(device_id) >= 3);
  }
}
/*!
 * \brief Determine whether a cuda-capable gpu's architecture supports Tensor Core math.
 *        Assume not if device_id is negative.
 * \param device_id The device index of the cuda-capable gpu of interest.
 * \return whether the gpu's architecture supports Tensor Core math.
 */
inline bool SupportsTensorCore(int device_id) {
  // Volta (sm_70) supports TensorCore algos
  return device_id >= 0 &&
         ComputeCapabilityMajor(device_id) >= 7;
}
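// Usage sketch (illustrative, not part of the original header): dispatching on
// device capability.  'RunFloat16Kernel' and 'RunFloat32Kernel' are hypothetical.
//
//   int device_id = 0;
//   if (SupportsFloat16Compute(device_id)) {
//     RunFloat16Kernel();   // true fp16 arithmetic, CC >= 5.3
//   } else {
//     RunFloat32Kernel();   // fall back to fp32 compute
//   }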
/*! \brief The policy if the user hasn't set the environment variable MXNET_CUDA_ALLOW_TENSOR_CORE */
#define MXNET_CUDA_ALLOW_TENSOR_CORE_DEFAULT true

/*!
 * \brief Returns global policy for TensorCore algo use.
 * \return whether to allow TensorCore algo (if not overridden by op).
 */
inline bool GetEnvAllowTensorCore() {
  // Since these statics are in the '.h' file, they will exist and will be set
  // separately in each compilation unit.  Not ideal, but cheap and causes no problems.
  static bool allow_tensor_core = false;
  static bool is_set = false;
  if (!is_set) {
    // Use of optional<bool> here permits: "0", "1", "true" and "false" to all be legal.
    bool default_value = MXNET_CUDA_ALLOW_TENSOR_CORE_DEFAULT;
    allow_tensor_core = dmlc::GetEnv("MXNET_CUDA_ALLOW_TENSOR_CORE",
                                     dmlc::optional<bool>(default_value)).value();
    is_set = true;
  }
  return allow_tensor_core;
}
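// Usage sketch (illustrative): combining the env-var policy with the hardware
// check before enabling TensorCore math on a handle.  'device_id' is assumed
// to be a valid device ordinal.
//
//   bool use_tensor_ops = SupportsTensorCore(device_id) && GetEnvAllowTensorCore();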
#if CUDA_VERSION >= 9000
// Sets the cuBLAS math mode that determines the 'algo' used, returning the handle's previous mode
inline cublasMath_t SetCublasMathMode(cublasHandle_t blas_handle, cublasMath_t new_math_type) {
  auto handle_math_mode = CUBLAS_DEFAULT_MATH;
  CUBLAS_CALL(cublasGetMathMode(blas_handle, &handle_math_mode));
  CUBLAS_CALL(cublasSetMathMode(blas_handle, new_math_type));
  return handle_math_mode;
}
#endif  // CUDA_VERSION >= 9000
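// Usage sketch (illustrative): temporarily enabling TensorCore math around a
// GEMM, then restoring the handle's previous mode.  'blas_handle' is assumed
// to be a valid cublasHandle_t.
//
//   #if CUDA_VERSION >= 9000
//   cublasMath_t saved = SetCublasMathMode(blas_handle, CUBLAS_TENSOR_OP_MATH);
//   // ... launch the cuBLAS GEMM here ...
//   SetCublasMathMode(blas_handle, saved);
//   #endif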
#endif  // MXNET_USE_CUDA

#if MXNET_USE_CUDNN

#include <cudnn.h>

/*! \brief Protected cuDNN call. */
#define CUDNN_CALL(func)                                                      \
  {                                                                           \
    cudnnStatus_t e = (func);                                                 \
    CHECK_EQ(e, CUDNN_STATUS_SUCCESS) << "cuDNN: " << cudnnGetErrorString(e); \
  }

/*!
 * \brief Return max number of perf structs that cudnnFindConvolutionForwardAlgorithm()
 *        may want to populate.
 * \param cudnn_handle cudnn handle needed to perform the inquiry.
 * \return max number of perf structs cudnnFindConvolutionForwardAlgorithm() may want to populate.
 */
inline int MaxForwardAlgos(cudnnHandle_t cudnn_handle) {
#if CUDNN_MAJOR >= 7
  int max_algos = 0;
  CUDNN_CALL(cudnnGetConvolutionForwardAlgorithmMaxCount(cudnn_handle, &max_algos));
  return max_algos;
#else
  return 10;
#endif
}
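// Usage sketch (illustrative): sizing the perf-result buffer before asking
// cuDNN to benchmark forward algorithms.  The descriptors (x_desc, w_desc,
// conv_desc, y_desc) are assumed to be already created and configured.
//
//   int max_algos = MaxForwardAlgos(cudnn_handle);
//   std::vector<cudnnConvolutionFwdAlgoPerf_t> results(max_algos);
//   int returned = 0;
//   CUDNN_CALL(cudnnFindConvolutionForwardAlgorithm(cudnn_handle,
//       x_desc, w_desc, conv_desc, y_desc, max_algos, &returned, results.data()));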
/*!
 * \brief Return max number of perf structs that cudnnFindConvolutionBackwardFilterAlgorithm()
 *        may want to populate.
 * \param cudnn_handle cudnn handle needed to perform the inquiry.
 * \return max number of perf structs cudnnFindConvolutionBackwardFilterAlgorithm() may want
 *         to populate.
 */
inline int MaxBackwardFilterAlgos(cudnnHandle_t cudnn_handle) {
#if CUDNN_MAJOR >= 7
  int max_algos = 0;
  CUDNN_CALL(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(cudnn_handle, &max_algos));
  return max_algos;
#else
  return 10;
#endif
}
/*!
 * \brief Return max number of perf structs that cudnnFindConvolutionBackwardDataAlgorithm()
 *        may want to populate.
 * \param cudnn_handle cudnn handle needed to perform the inquiry.
 * \return max number of perf structs cudnnFindConvolutionBackwardDataAlgorithm() may want
 *         to populate.
 */
inline int MaxBackwardDataAlgos(cudnnHandle_t cudnn_handle) {
#if CUDNN_MAJOR >= 7
  int max_algos = 0;
  CUDNN_CALL(cudnnGetConvolutionBackwardDataAlgorithmMaxCount(cudnn_handle, &max_algos));
  return max_algos;
#else
  return 10;
#endif
}
#endif  // MXNET_USE_CUDNN

// Overload atomicAdd to work for doubles on architectures that lack it natively
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
static inline __device__ void atomicAdd(double *address, double val) {
  unsigned long long* address_as_ull =
      reinterpret_cast<unsigned long long*>(address);
  unsigned long long old = *address_as_ull;
  unsigned long long assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val +
                                         __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
  } while (assumed != old);
}
#endif
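// Usage sketch (illustrative): a hypothetical reduction kernel that relies on
// the atomicAdd(double*, double) overload above on pre-Pascal devices.
//
//   __global__ void SumKernel(const double* in, double* out, int n) {
//     int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < n) atomicAdd(out, in[i]);
//   }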
// Overload atomicAdd for half precision: CAS loop on the containing 32-bit word
#if defined(__CUDA_ARCH__)
static inline __device__ void atomicAdd(mshadow::half::half_t *address,
                                        mshadow::half::half_t val) {
  // Round the address down to a 4-byte boundary; the half lives in either the
  // low or the high 16 bits of that word.
  unsigned int *address_as_ui =
      reinterpret_cast<unsigned int *>(reinterpret_cast<char *>(address) -
                                       (reinterpret_cast<size_t>(address) & 2));
  unsigned int old = *address_as_ui;
  unsigned int assumed;
  do {
    assumed = old;
    // Extract the current half from the word, add, and splice the sum back in.
    mshadow::half::half_t hsum;
    hsum.half_ =
        reinterpret_cast<size_t>(address) & 2 ? (old >> 16) : (old & 0xffff);
    hsum += val;
    old = reinterpret_cast<size_t>(address) & 2
              ? (old & 0xffff) | (hsum.half_ << 16)
              : (old & 0xffff0000) | hsum.half_;
    old = atomicCAS(address_as_ui, assumed, old);
  } while (assumed != old);
}
// Overload atomicAdd to work for signed int64 on all architectures
static inline __device__ void atomicAdd(int64_t *address, int64_t val) {
  // Two's-complement arithmetic makes the unsigned add bit-identical to the
  // signed add, so reinterpreting the pointer is safe here.
  atomicAdd(reinterpret_cast<unsigned long long*>(address),
            static_cast<unsigned long long>(val));
}
/*! \brief Read-only load through the texture/read-only cache where available. */
template <typename DType>
__device__ inline DType ldg(const DType* address) {
#if __CUDA_ARCH__ >= 350
  return __ldg(address);
#else
  return *address;
#endif
}
#endif  // defined(__CUDA_ARCH__)
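// Usage sketch (illustrative): reading immutable kernel inputs through ldg()
// so that sm_35+ devices route the load through the read-only cache.
//
//   __global__ void ScaleKernel(const float* in, float* out, float alpha, int n) {
//     int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < n) out[i] = alpha * ldg(in + i);
//   }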
#endif  // MXNET_COMMON_CUDA_UTILS_H_