#ifndef MXNET_COMMON_CUDA_UTILS_H_
#define MXNET_COMMON_CUDA_UTILS_H_

#include <dmlc/logging.h>
#include <dmlc/parameter.h>
#include <dmlc/optional.h>
#include <mshadow/base.h>

// Stub out CUDA built-ins so that JetBrains IDEs can parse device code without nvcc.
#ifdef __JETBRAINS_IDE__
#define __forceinline__
inline void __syncthreads() {}
inline void __threadfence_block() {}
template <class T>
inline T __clz(const T val) { return val; }
struct __cuda_fake_struct {
  int x;
  int y;
  int z;
};
extern __cuda_fake_struct blockDim;
extern __cuda_fake_struct threadIdx;
extern __cuda_fake_struct blockIdx;
#endif  // __JETBRAINS_IDE__
#if MXNET_USE_CUDA

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusolverDn.h>
#include <curand.h>

/*!
 * \brief When compiling a __device__ function, check that the architecture is >= Kepler (3.0).
 */
inline __device__ bool __is_supported_cuda_architecture() {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
#error "Fermi and earlier GPU architectures are not supported (architecture versions less than 3.0)"
  return false;
#else
  return true;
#endif  // __CUDA_ARCH__ < 300
}

namespace mxnet {
namespace common {
/*! \brief common utils for cuda */
namespace cuda {

/*!
 * \brief Get string representation of cuBLAS errors.
 * \param error The error.
 * \return String representation.
 */
inline const char* CublasGetErrorString(cublasStatus_t error) {
  switch (error) {
    case CUBLAS_STATUS_SUCCESS:
      return "CUBLAS_STATUS_SUCCESS";
    case CUBLAS_STATUS_NOT_INITIALIZED:
      return "CUBLAS_STATUS_NOT_INITIALIZED";
    case CUBLAS_STATUS_ALLOC_FAILED:
      return "CUBLAS_STATUS_ALLOC_FAILED";
    case CUBLAS_STATUS_INVALID_VALUE:
      return "CUBLAS_STATUS_INVALID_VALUE";
    case CUBLAS_STATUS_ARCH_MISMATCH:
      return "CUBLAS_STATUS_ARCH_MISMATCH";
    case CUBLAS_STATUS_MAPPING_ERROR:
      return "CUBLAS_STATUS_MAPPING_ERROR";
    case CUBLAS_STATUS_EXECUTION_FAILED:
      return "CUBLAS_STATUS_EXECUTION_FAILED";
    case CUBLAS_STATUS_INTERNAL_ERROR:
      return "CUBLAS_STATUS_INTERNAL_ERROR";
    case CUBLAS_STATUS_NOT_SUPPORTED:
      return "CUBLAS_STATUS_NOT_SUPPORTED";
    default:
      break;
  }
  return "Unknown cuBLAS status";
}
/*!
 * \brief Get string representation of cuSOLVER errors.
 * \param error The error.
 * \return String representation.
 */
inline const char* CusolverGetErrorString(cusolverStatus_t error) {
  switch (error) {
    case CUSOLVER_STATUS_SUCCESS:
      return "CUSOLVER_STATUS_SUCCESS";
    case CUSOLVER_STATUS_NOT_INITIALIZED:
      return "CUSOLVER_STATUS_NOT_INITIALIZED";
    case CUSOLVER_STATUS_ALLOC_FAILED:
      return "CUSOLVER_STATUS_ALLOC_FAILED";
    case CUSOLVER_STATUS_INVALID_VALUE:
      return "CUSOLVER_STATUS_INVALID_VALUE";
    case CUSOLVER_STATUS_ARCH_MISMATCH:
      return "CUSOLVER_STATUS_ARCH_MISMATCH";
    case CUSOLVER_STATUS_EXECUTION_FAILED:
      return "CUSOLVER_STATUS_EXECUTION_FAILED";
    case CUSOLVER_STATUS_INTERNAL_ERROR:
      return "CUSOLVER_STATUS_INTERNAL_ERROR";
    case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
      return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
    default:
      break;
  }
  return "Unknown cuSOLVER status";
}
/*!
 * \brief Get string representation of cuRAND errors.
 * \param status The status.
 * \return String representation.
 */
inline const char* CurandGetErrorString(curandStatus_t status) {
  switch (status) {
    case CURAND_STATUS_SUCCESS:
      return "CURAND_STATUS_SUCCESS";
    case CURAND_STATUS_VERSION_MISMATCH:
      return "CURAND_STATUS_VERSION_MISMATCH";
    case CURAND_STATUS_NOT_INITIALIZED:
      return "CURAND_STATUS_NOT_INITIALIZED";
    case CURAND_STATUS_ALLOCATION_FAILED:
      return "CURAND_STATUS_ALLOCATION_FAILED";
    case CURAND_STATUS_TYPE_ERROR:
      return "CURAND_STATUS_TYPE_ERROR";
    case CURAND_STATUS_OUT_OF_RANGE:
      return "CURAND_STATUS_OUT_OF_RANGE";
    case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
      return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
    case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
      return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
    case CURAND_STATUS_LAUNCH_FAILURE:
      return "CURAND_STATUS_LAUNCH_FAILURE";
    case CURAND_STATUS_PREEXISTING_FAILURE:
      return "CURAND_STATUS_PREEXISTING_FAILURE";
    case CURAND_STATUS_INITIALIZATION_FAILED:
      return "CURAND_STATUS_INITIALIZATION_FAILED";
    case CURAND_STATUS_ARCH_MISMATCH:
      return "CURAND_STATUS_ARCH_MISMATCH";
    case CURAND_STATUS_INTERNAL_ERROR:
      return "CURAND_STATUS_INTERNAL_ERROR";
    default:
      break;
  }
  return "Unknown cuRAND status";
}
template <typename DType>
inline DType __device__ CudaMax(DType a, DType b) {
  return a > b ? a : b;
}

template <typename DType>
inline DType __device__ CudaMin(DType a, DType b) {
  return a < b ? a : b;
}

}  // namespace cuda
}  // namespace common
}  // namespace mxnet
/*! \brief Check for CUDA errors raised by earlier calls, prefixing the failure message with msg. */
#define CHECK_CUDA_ERROR(msg)                                                \
  {                                                                          \
    cudaError_t e = cudaGetLastError();                                      \
    CHECK_EQ(e, cudaSuccess) << (msg) << " CUDA: " << cudaGetErrorString(e); \
  }

/*! \brief Protected CUDA call. */
#define CUDA_CALL(func)                                       \
  {                                                           \
    cudaError_t e = (func);                                   \
    CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading)  \
        << "CUDA: " << cudaGetErrorString(e);                 \
  }

/*! \brief Protected cuBLAS call. */
#define CUBLAS_CALL(func)                                               \
  {                                                                     \
    cublasStatus_t e = (func);                                          \
    CHECK_EQ(e, CUBLAS_STATUS_SUCCESS)                                  \
        << "cuBLAS: " << mxnet::common::cuda::CublasGetErrorString(e);  \
  }

/*! \brief Protected cuSOLVER call. */
#define CUSOLVER_CALL(func)                                                 \
  {                                                                         \
    cusolverStatus_t e = (func);                                            \
    CHECK_EQ(e, CUSOLVER_STATUS_SUCCESS)                                    \
        << "cuSolver: " << mxnet::common::cuda::CusolverGetErrorString(e);  \
  }

/*! \brief Protected cuRAND call. */
#define CURAND_CALL(func)                                               \
  {                                                                     \
    curandStatus_t e = (func);                                          \
    CHECK_EQ(e, CURAND_STATUS_SUCCESS)                                  \
        << "cuRAND: " << mxnet::common::cuda::CurandGetErrorString(e);  \
  }

/*! \brief Protected NVRTC call. */
#define NVRTC_CALL(x)                      \
  {                                        \
    nvrtcResult result = x;                \
    CHECK_EQ(result, NVRTC_SUCCESS)        \
        << #x " failed with error "        \
        << nvrtcGetErrorString(result);    \
  }

/*! \brief Protected CUDA driver API call. */
#define CUDA_DRIVER_CALL(func)                                          \
  {                                                                     \
    CUresult e = (func);                                                \
    if (e != CUDA_SUCCESS) {                                            \
      char const* err_msg = nullptr;                                    \
      if (cuGetErrorString(e, &err_msg) == CUDA_ERROR_INVALID_VALUE) {  \
        LOG(FATAL) << "CUDA Driver: Unknown error " << e;               \
      } else {                                                          \
        LOG(FATAL) << "CUDA Driver: " << err_msg;                       \
      }                                                                 \
    }                                                                   \
  }

#if !defined(_MSC_VER)
#define CUDA_UNROLL _Pragma("unroll")
#define CUDA_NOUNROLL _Pragma("nounroll")
#else
#define CUDA_UNROLL
#define CUDA_NOUNROLL
#endif

/*! \brief Determine major version number of the gpu's cuda compute architecture. */
inline int ComputeCapabilityMajor(int device_id) {
  int major = 0;
  CUDA_CALL(cudaDeviceGetAttribute(&major,
                                   cudaDevAttrComputeCapabilityMajor, device_id));
  return major;
}
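/*!
 * Illustrative usage sketch (not part of the original header; names and sizes are
 * hypothetical): the protected-call macros above wrap CUDA/cuBLAS API calls and abort
 * with a readable error string on failure.
 * \code
 * float* device_buf = nullptr;
 * CUDA_CALL(cudaMalloc(&device_buf, 256 * sizeof(float)));
 * CUDA_CALL(cudaMemset(device_buf, 0, 256 * sizeof(float)));
 *
 * cublasHandle_t blas_handle;
 * CUBLAS_CALL(cublasCreate(&blas_handle));
 * // ... use blas_handle ...
 * CUBLAS_CALL(cublasDestroy(blas_handle));
 * CUDA_CALL(cudaFree(device_buf));
 * \endcode
 */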
/*! \brief Determine minor version number of the gpu's cuda compute architecture. */
inline int ComputeCapabilityMinor(int device_id) {
  int minor = 0;
  CUDA_CALL(cudaDeviceGetAttribute(&minor,
                                   cudaDevAttrComputeCapabilityMinor, device_id));
  return minor;
}
/*! \brief Return the integer SM architecture (e.g. Volta = 70). */
inline int SMArch(int device_id) {
  auto major = ComputeCapabilityMajor(device_id);
  auto minor = ComputeCapabilityMinor(device_id);
  return 10 * major + minor;
}
/*!
 * \brief Determine whether a cuda-capable gpu's architecture supports float16 math.
 *        Assume not if device_id is negative.
 */
inline bool SupportsFloat16Compute(int device_id) {
  if (device_id < 0) {
    return false;
  } else {
    // Kepler and most Maxwell GPUs do not support fp16 compute
    int computeCapabilityMajor = ComputeCapabilityMajor(device_id);
    return (computeCapabilityMajor > 5) ||
           (computeCapabilityMajor == 5 && ComputeCapabilityMinor(device_id) >= 3);
  }
}
/*!
 * \brief Determine whether a cuda-capable gpu's architecture supports Tensor Core math.
 *        Assume not if device_id is negative.
 */
inline bool SupportsTensorCore(int device_id) {
  // Volta (sm_70) and later support TensorCore algos
  return device_id >= 0 &&
         ComputeCapabilityMajor(device_id) >= 7;
}
/*! \brief Whether TensorCore algos are allowed by default. */
#define MXNET_CUDA_ALLOW_TENSOR_CORE_DEFAULT true

/*! \brief Returns global policy for TensorCore algo use. */
inline bool GetEnvAllowTensorCore() {
  // Since these statics live in a header, each compilation unit gets its own copy;
  // this is cheap and race-free, if not ideal.
  static bool allow_tensor_core = false;
  static bool is_set = false;
  if (!is_set) {
    // dmlc::optional<bool> permits "0", "1", "true" and "false" as legal settings.
    bool default_value = MXNET_CUDA_ALLOW_TENSOR_CORE_DEFAULT;
    allow_tensor_core = dmlc::GetEnv("MXNET_CUDA_ALLOW_TENSOR_CORE",
                                     dmlc::optional<bool>(default_value)).value();
    is_set = true;
  }
  return allow_tensor_core;
}
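/*!
 * Illustrative sketch (not part of the original header): combining the capability queries
 * with the environment policy to decide whether TensorCore algos may be used.
 * `device_id` is a hypothetical variable.
 * \code
 * int device_id = 0;
 * bool use_tensor_core = SupportsTensorCore(device_id) && GetEnvAllowTensorCore();
 * LOG(INFO) << "SM arch " << SMArch(device_id)
 *           << ", TensorCore enabled: " << use_tensor_core;
 * \endcode
 */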
#if CUDA_VERSION >= 9000
// Sets the cuBLAS math mode of the handle and returns the previous mode, so the caller
// can restore it once the TensorCore-eligible operation is complete.
inline cublasMath_t SetCublasMathMode(cublasHandle_t blas_handle, cublasMath_t new_math_type) {
  auto handle_math_mode = CUBLAS_DEFAULT_MATH;
  CUBLAS_CALL(cublasGetMathMode(blas_handle, &handle_math_mode));
  CUBLAS_CALL(cublasSetMathMode(blas_handle, new_math_type));
  return handle_math_mode;
}
#endif  // CUDA_VERSION >= 9000
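/*!
 * Illustrative sketch (not part of the original header): temporarily switching a cuBLAS
 * handle to TensorOp math around a GEMM and then restoring the previous mode. The handle
 * and the GEMM call are assumed to exist elsewhere.
 * \code
 * #if CUDA_VERSION >= 9000
 * cublasMath_t saved_mode = SetCublasMathMode(blas_handle, CUBLAS_TENSOR_OP_MATH);
 * // ... launch the TensorCore-eligible cuBLAS operation on blas_handle ...
 * SetCublasMathMode(blas_handle, saved_mode);
 * #endif
 * \endcode
 */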
#endif  // MXNET_USE_CUDA

#if MXNET_USE_CUDNN == 1

#include <cudnn.h>

/*! \brief Protected cuDNN call. */
#define CUDNN_CALL(func)                                                      \
  {                                                                           \
    cudnnStatus_t e = (func);                                                 \
    CHECK_EQ(e, CUDNN_STATUS_SUCCESS) << "cuDNN: " << cudnnGetErrorString(e); \
  }

/*!
 * \brief Return the maximum number of perf structs that
 *        cudnnFindConvolutionForwardAlgorithm() may want to populate.
 */
inline int MaxForwardAlgos(cudnnHandle_t cudnn_handle) {
  int max_algos = 0;
  CUDNN_CALL(cudnnGetConvolutionForwardAlgorithmMaxCount(cudnn_handle, &max_algos));
  return max_algos;
}
/*!
 * \brief Return the maximum number of perf structs that
 *        cudnnFindConvolutionBackwardFilterAlgorithm() may want to populate.
 */
inline int MaxBackwardFilterAlgos(cudnnHandle_t cudnn_handle) {
  int max_algos = 0;
  CUDNN_CALL(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(cudnn_handle, &max_algos));
  return max_algos;
}
/*!
 * \brief Return the maximum number of perf structs that
 *        cudnnFindConvolutionBackwardDataAlgorithm() may want to populate.
 */
inline int MaxBackwardDataAlgos(cudnnHandle_t cudnn_handle) {
  int max_algos = 0;
  CUDNN_CALL(cudnnGetConvolutionBackwardDataAlgorithmMaxCount(cudnn_handle, &max_algos));
  return max_algos;
}
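/*!
 * Illustrative sketch (not part of the original header): using the max-algo count to size
 * the result buffer for cuDNN's algorithm enumeration (requires <vector>; descriptor setup
 * omitted, names hypothetical).
 * \code
 * int max_algos = MaxForwardAlgos(cudnn_handle);
 * std::vector<cudnnConvolutionFwdAlgoPerf_t> results(max_algos);
 * // pass results.data() and max_algos to cudnnFindConvolutionForwardAlgorithm(...)
 * \endcode
 */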
#endif  // MXNET_USE_CUDNN

// Overload atomicAdd to work for doubles on architectures without native support (pre-sm_60).
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
static inline __device__ void atomicAdd(double *address, double val) {
  unsigned long long* address_as_ull =
    reinterpret_cast<unsigned long long*>(address);
  unsigned long long old = *address_as_ull;
  unsigned long long assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val +
                                         __longlong_as_double(assumed)));
    // Uses integer comparison to avoid a hang in case of NaN (since NaN != NaN).
  } while (assumed != old);
}
#endif  // defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
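/*!
 * Illustrative sketch (not part of the original header): a kernel that relies on the
 * double-precision atomicAdd overload above on pre-sm_60 devices (sm_60+ provides a
 * native hardware atomicAdd(double*, double)).
 * \code
 * __global__ void SumKernel(const double* in, double* out, int n) {
 *   int i = blockIdx.x * blockDim.x + threadIdx.x;
 *   if (i < n) atomicAdd(out, in[i]);
 * }
 * \endcode
 */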
// Overload atomicAdd for mshadow half-precision values via a 32-bit CAS on the
// aligned word that contains the 16-bit value.
#if defined(__CUDA_ARCH__)
static inline __device__ void atomicAdd(mshadow::half::half_t *address,
                                        mshadow::half::half_t val) {
  unsigned int *address_as_ui =
      reinterpret_cast<unsigned int *>(reinterpret_cast<char *>(address) -
                                       (reinterpret_cast<size_t>(address) & 2));
  unsigned int old = *address_as_ui;
  unsigned int assumed;

  do {
    assumed = old;
    mshadow::half::half_t hsum;
    hsum.half_ =
        reinterpret_cast<size_t>(address) & 2 ? (old >> 16) : (old & 0xffff);
    hsum += val;
    old = reinterpret_cast<size_t>(address) & 2
              ? (old & 0xffff) | (hsum.half_ << 16)
              : (old & 0xffff0000) | hsum.half_;
    old = atomicCAS(address_as_ui, assumed, old);
  } while (assumed != old);
}
// Overload atomicAdd for uint8_t via a 32-bit CAS on the aligned containing word.
static inline __device__ void atomicAdd(uint8_t *address, uint8_t val) {
  unsigned int *address_as_ui = (unsigned int *)(address - ((size_t)address & 0x3));
  unsigned int old = *address_as_ui;
  unsigned int shift = (((size_t)address & 0x3) << 3);
  unsigned int sum;
  unsigned int assumed;

  do {
    assumed = old;
    sum = val + static_cast<uint8_t>((old >> shift) & 0xff);
    old = (old & ~(0x000000ff << shift)) | (sum << shift);
    old = atomicCAS(address_as_ui, assumed, old);
  } while (assumed != old);
}
// Overload atomicAdd for int8_t via a 32-bit CAS on the aligned containing word.
static inline __device__ void atomicAdd(int8_t *address, int8_t val) {
  unsigned int *address_as_ui = (unsigned int *)(address - ((size_t)address & 0x3));
  unsigned int old = *address_as_ui;
  unsigned int shift = (((size_t)address & 0x3) << 3);
  unsigned int sum;
  unsigned int assumed;

  do {
    assumed = old;
    sum = val + static_cast<int8_t>((old >> shift) & 0xff);
    old = (old & ~(0x000000ff << shift)) | (sum << shift);
    old = atomicCAS(address_as_ui, assumed, old);
  } while (assumed != old);
}
// Overload atomicAdd for signed int64 by reinterpreting it as unsigned long long.
static inline __device__ void atomicAdd(int64_t *address, int64_t val) {
  atomicAdd(reinterpret_cast<unsigned long long*>(address),
            static_cast<unsigned long long>(val));
}
// Read-only load that uses __ldg() (read-only cache path) where available.
template <typename DType>
__device__ inline DType ldg(const DType* address) {
#if __CUDA_ARCH__ >= 350
  return __ldg(address);
#else
  return *address;
#endif
}
#endif  // defined(__CUDA_ARCH__)
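/*!
 * Illustrative sketch (not part of the original header): ldg() routes read-only loads
 * through the read-only (texture) cache on sm_35+ and degrades to a plain load elsewhere.
 * Names are hypothetical.
 * \code
 * __global__ void ScaleKernel(const float* in, float* out, float alpha, int n) {
 *   int i = blockIdx.x * blockDim.x + threadIdx.x;
 *   if (i < n) out[i] = alpha * ldg(in + i);
 * }
 * \endcode
 */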
#endif  // MXNET_COMMON_CUDA_UTILS_H_