mxnet
base.h
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License. You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied. See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
24 #ifndef MXNET_BASE_H_
25 #define MXNET_BASE_H_
26 
27 #include "dmlc/base.h"
28 #include <string>
29 #include "dmlc/io.h"
30 #include "dmlc/type_traits.h"
31 #include "dmlc/parameter.h"
32 #include "mshadow/tensor.h"
33 // nnvm headers for symbolic construction.
34 #include "nnvm/op.h"
35 #include "nnvm/symbolic.h"
36 #include "libinfo.h"
37 #include "tuple.h"
38 
39 
44 #if DMLC_USE_CXX11 && defined(__GNUC__) && !defined(__clang_version__)
45 #if __GNUC__ == 4 && __GNUC_MINOR__ < 8
46 #error "Currently we need g++ 4.8 or higher to fully support c++11 features"
47 #define override
48 #define final
49 #endif
50 #endif
51 
55 #ifdef _MSC_VER
56 #ifdef MXNET_EXPORTS
57 #define MXNET_API __declspec(dllexport)
58 #else
59 #define MXNET_API __declspec(dllimport)
60 #endif
61 #else
62 #define MXNET_API
63 #endif
64 
68 #ifndef MXNET_PREDICT_ONLY
69 #define MXNET_PREDICT_ONLY 0
70 #endif
71 
73 #define MXNET_MAJOR 1
74 
75 #define MXNET_MINOR 9
76 
77 #define MXNET_PATCH 1
78 
79 #define MXNET_VERSION (MXNET_MAJOR*10000 + MXNET_MINOR*100 + MXNET_PATCH)
80 
81 #define MXNET_MAKE_VERSION(major, minor, patch) ((major)*10000 + (minor)*100 + patch)
82 
85 #define PROFILER_MESSAGE_FUNCNAME (__FUNCTION__)
86 
88 namespace mxnet {
90 typedef mshadow::cpu cpu;
92 typedef mshadow::gpu gpu;
94 typedef mshadow::index_t index_t;
96 typedef mshadow::default_real_t real_t;
98 using Op = nnvm::Op;
99 
101 struct Context {
103  enum DeviceType {
104  kCPU = cpu::kDevMask,
105  kGPU = gpu::kDevMask,
106  kCPUPinned = 3,
107  kCPUShared = 5,
108  };
110  DeviceType dev_type;
112  int32_t dev_id;
114  Context() : dev_type(kCPU), dev_id(0) {}
119  inline DeviceType dev_mask() const {
120  if (dev_type == kCPUPinned || dev_type == kCPUShared) return kCPU;
121  return dev_type;
122  }
126  inline int real_dev_id() const {
127  if (dev_type == kCPUPinned || dev_type == kGPU) return dev_id;
128  return 0;
129  }
135  inline bool operator<(const Context &b) const;
141  inline bool operator==(const Context &b) const {
142  return dev_type == b.dev_type && dev_id == b.dev_id;
143  }
149  inline bool operator!=(const Context &b) const {
150  return !(*this == b);
151  }
156  inline void Save(dmlc::Stream *strm) const {
157  strm->Write(&dev_type, sizeof(dev_type));
158  strm->Write(&dev_id, sizeof(dev_id));
159  }
165  inline bool Load(dmlc::Stream *strm) {
166  if (strm->Read(&dev_type, sizeof(dev_type)) != sizeof(dev_type)) return false;
167  if (strm->Read(&dev_id, sizeof(int32_t)) != sizeof(int32_t)) return false;
168  return true;
169  }
171  static const int32_t kMaxDevType = 6;
173  static const int32_t kMaxDevID = 16;
179  inline static Context Create(DeviceType dev_type, int32_t dev_id = -1);
181  inline static Context CPU(int32_t dev_id = 0);
187  inline static Context GPU(int32_t dev_id = -1);
192  inline static int32_t GetGPUCount();
197  inline static bool GPUDriverPresent();
202  inline static int32_t GetGPUStreamsPerWorker();
210  inline static void GetGPUMemoryInformation(int dev, uint64_t *free, uint64_t *total);
216  inline static Context CPUPinned(int32_t dev_id = -1);
222  inline static Context CPUShared(int32_t dev_id = 0);
228  inline static Context FromString(const std::string& str);
229 
230  private:
231 #if MXNET_USE_CUDA
232  static void CudaLibChecks();
233 #endif
234 #if MXNET_USE_CUDNN
235  static void CuDNNLibChecks();
236 #endif
237 };
238 
239 #if MXNET_USE_CUDA
240 
241 class GPUAuxStream {
242  public:
247  explicit GPUAuxStream(mshadow::Stream<gpu> *primary_stream) :
248  primary_stream_(primary_stream),
249  aux_stream_(primary_stream),
250  gpu_stream_sync_event_(nullptr) {
251  if (Context::GetGPUStreamsPerWorker() >= 2) {
252  // Create auxiliary stream on the same device with the same properties as the primary stream
253  bool primary_has_blas_handle =
254  primary_stream->blas_handle_ownership_ == mshadow::Stream<gpu>::OwnHandle;
255  bool primary_has_dnn_handle =
256  primary_stream->dnn_handle_ownership_ == mshadow::Stream<gpu>::OwnHandle;
257  aux_stream_ = mshadow::NewStream<gpu>(primary_has_blas_handle,
258  primary_has_dnn_handle,
259  primary_stream->dev_id);
260  MSHADOW_CUDA_CALL(cudaEventCreateWithFlags(&gpu_stream_sync_event_, cudaEventDisableTiming));
261  }
262  }
264  ~GPUAuxStream() {
265  // If the aux_stream_ == primary_stream_, then we created no new streams to destroy.
266  if (aux_stream_ != primary_stream_) {
267  MSHADOW_CATCH_ERROR(mshadow::DeleteStream<gpu>(aux_stream_));
268  MSHADOW_CATCH_ERROR(cudaEventDestroy(gpu_stream_sync_event_));
269  }
270  }
274  void PreAuxStreamUseSync() {
275  // If the aux_stream_ == primary_stream_, then no synchronization is necessary.
276  if (aux_stream_ != primary_stream_)
277  StreamSync(primary_stream_, aux_stream_, gpu_stream_sync_event_);
278  }
282  void PostAuxStreamUseSync() {
283  // If the aux_stream_ == primary_stream_, then no synchronization is necessary.
284  if (aux_stream_ != primary_stream_)
285  StreamSync(aux_stream_, primary_stream_, gpu_stream_sync_event_);
286  }
288  mshadow::Stream<gpu> *GetStream() { return aux_stream_; }
295  static void StreamSync(mshadow::Stream<gpu> *s1, mshadow::Stream<gpu> *s2, cudaEvent_t event) {
296  MSHADOW_CUDA_CALL(cudaEventRecord(event, s1->stream_));
297  MSHADOW_CUDA_CALL(cudaStreamWaitEvent(s2->stream_, event, 0));
298  }
299 
300  private:
301  mshadow::Stream<gpu> *primary_stream_;
302  mshadow::Stream<gpu> *aux_stream_;
303  cudaEvent_t gpu_stream_sync_event_;
304 };
305 
314 class SyncedGPUAuxStream {
315  public:
320  explicit SyncedGPUAuxStream(GPUAuxStream *gpu_aux_stream) : gpu_aux_stream_(gpu_aux_stream) {
321  gpu_aux_stream_->PreAuxStreamUseSync();
322  }
324  ~SyncedGPUAuxStream() {
325  gpu_aux_stream_->PostAuxStreamUseSync();
326  }
328  SyncedGPUAuxStream(const SyncedGPUAuxStream&) = delete;
330  void operator=(const SyncedGPUAuxStream&) = delete;
332  SyncedGPUAuxStream(SyncedGPUAuxStream&&) = default;
334  SyncedGPUAuxStream& operator=(SyncedGPUAuxStream&&) = default;
336  inline mshadow::Stream<gpu>* GetStream() const {
337  return gpu_aux_stream_->GetStream();
338  }
339 
340  private:
341  GPUAuxStream *gpu_aux_stream_;
342 };
343 #endif // MXNET_USE_CUDA
344 
349 struct RunContext {
351  Context ctx;
355  void *stream;
359  void *aux_stream;
363  bool is_bulk;
369  template<typename xpu>
370  inline mshadow::Stream<xpu> *get_stream() const {
371  return static_cast<mshadow::Stream<xpu>*>(stream);
372  }
373 #if MXNET_USE_CUDA
374 
378  SyncedGPUAuxStream get_gpu_aux_stream() const {
379  return SyncedGPUAuxStream(static_cast<GPUAuxStream*>(aux_stream));
380  }
381 #endif
382 
383  inline const Context& get_ctx() const {
384  return ctx;
385  }
386 };
387 } // namespace mxnet
388 
390 namespace mxnet {
391 // implementing Context
392 inline bool Context::operator<(const Context &b) const {
393  if (dev_type == b.dev_type) {
394  return dev_id < b.dev_id;
395  } else {
396  return dev_type < b.dev_type;
397  }
398 }
399 inline Context Context::Create(DeviceType dev_type, int32_t dev_id) {
400  Context ctx;
401  ctx.dev_type = dev_type;
402  ctx.dev_id = dev_id < 0 ? 0 : dev_id;
403  if (dev_type & kGPU) {
404 #if MXNET_USE_CUDA
405  CudaLibChecks();
406 #endif
407 #if MXNET_USE_CUDNN
408  CuDNNLibChecks();
409 #endif
410  if (dev_id < 0) {
411 #if MXNET_USE_CUDA
412  CHECK_EQ(cudaGetDevice(&ctx.dev_id), cudaSuccess);
413 #else
414  LOG(FATAL) << "Please compile with CUDA enabled for cuda features";
415 #endif
416  }
417  }
418  return ctx;
419 }
420 inline Context Context::CPU(int32_t dev_id) {
421  return Create(kCPU, dev_id);
422 }
423 
424 inline Context Context::CPUPinned(int32_t dev_id) {
425  return Create(kCPUPinned, dev_id);
426 }
427 
428 inline Context Context::CPUShared(int32_t dev_id) {
429  return Create(kCPUShared, dev_id);
430 }
431 
432 inline Context Context::GPU(int32_t dev_id) {
433  return Create(kGPU, dev_id);
434 }
435 
436 inline bool Context::GPUDriverPresent() {
437 #if MXNET_USE_CUDA
438  int cuda_driver_version = 0;
439  CHECK_EQ(cudaDriverGetVersion(&cuda_driver_version), cudaSuccess);
440  return cuda_driver_version > 0;
441 #else
442  return false;
443 #endif
444 }
445 
446 inline int32_t Context::GetGPUCount() {
447 #if MXNET_USE_CUDA
448  if (!GPUDriverPresent()) {
449  return 0;
450  }
451  int32_t count;
452  cudaError_t e = cudaGetDeviceCount(&count);
453  // TODO(junwu): Remove e == cudaErrorInsufficientDriver
454  // This is skipped for working around wheel build system with older CUDA driver.
455  if (e == cudaErrorNoDevice || e == cudaErrorInsufficientDriver) {
456  return 0;
457  }
458  CHECK_EQ(e, cudaSuccess) << " CUDA: " << cudaGetErrorString(e);
459  return count;
460 #else
461  return 0;
462 #endif
463 }
464 
465 inline int32_t Context::GetGPUStreamsPerWorker() {
466  // The default number of streams available if the user has not set MXNET_GPU_WORKER_NSTREAMS.
467  const int32_t default_num_streams = 1;
468  // The get_aux_stream() interface can supply one additional stream beyond the standard one.
469  static int32_t num_streams =
470  dmlc::GetEnv("MXNET_GPU_WORKER_NSTREAMS", default_num_streams) >= 2 ? 2 : 1;
471  return num_streams;
472 }
473 
474 inline void Context::GetGPUMemoryInformation(int dev, uint64_t *free_mem,
475  uint64_t *total_mem) {
476 #if MXNET_USE_CUDA
477 
478  size_t memF, memT;
479  cudaError_t e;
480 
481  int curDevice;
482  e = cudaGetDevice(&curDevice);
483  CHECK_EQ(e, cudaSuccess) << " CUDA: " << cudaGetErrorString(e);
484 
485  e = cudaSetDevice(dev);
486  CHECK_EQ(e, cudaSuccess) << " CUDA: " << cudaGetErrorString(e);
487 
488  e = cudaMemGetInfo(&memF, &memT);
489  CHECK_EQ(e, cudaSuccess) << " CUDA: " << cudaGetErrorString(e);
490 
491  e = cudaSetDevice(curDevice);
492  CHECK_EQ(e, cudaSuccess) << " CUDA: " << cudaGetErrorString(e);
493 
494  *free_mem = static_cast<uint64_t>(memF);
495  *total_mem = static_cast<uint64_t>(memT);
496 
497 #else
498  LOG(FATAL)
499  << "This call is only supported for MXNet built with CUDA support.";
500 #endif
501 }
502 
503 inline Context Context::FromString(const std::string& str) {
504  Context ret;
505  try {
506  const std::string::size_type l = str.find('(');
507  CHECK_NE(l, std::string::npos);
508  const std::string::size_type r = str.find(')');
509  CHECK_EQ(r, str.length()-1);
510 
511  const std::string type = str.substr(0, l);
512  int id = std::stoi(str.substr(l+1, r-l-1));
513  if (type == "cpu") {
514  ret = CPU(id);
515  } else if (type == "gpu") {
516  ret = GPU(id);
517  } else if (type == "cpu_pinned") {
518  ret = CPUPinned(id);
519  } else if (type == "cpu_shared") {
520  ret = CPUShared(id);
521  } else {
522  LOG(FATAL) << "Invalid context string " << str;
523  }
524  } catch (...) {
525  LOG(FATAL) << "Invalid context string " << str;
526  }
527  return ret;
528 }
529 
530 inline std::ostream& operator<<(std::ostream &out, const Context &ctx) {
531  if (ctx.dev_type == Context::kCPU) {
532  out << "cpu(";
533  } else if (ctx.dev_type == Context::kGPU) {
534  out << "gpu(";
535  } else if (ctx.dev_type == Context::kCPUPinned) {
536  out << "cpu_pinned(";
537  } else if (ctx.dev_type == Context::kCPUShared) {
538  out << "cpu_shared(";
539  } else {
540  out << "unknown(";
541  }
542  out << ctx.dev_id << ")";
543  return out;
544 }
545 
546 // describe op registration point
547 #define STRINGIZE_DETAIL(x) #x
548 #define STRINGIZE(x) STRINGIZE_DETAIL(x)
549 #define MXNET_DESCRIBE(...) describe(__VA_ARGS__ "\n\nFrom:" __FILE__ ":" STRINGIZE(__LINE__))
550 #define ADD_FILELINE "\n\nDefined in " __FILE__ ":L" STRINGIZE(__LINE__)
551 
552 
553 #if MXNET_USE_MKLDNN == 1 || MXNET_USE_INTGEMM == 1
554 constexpr size_t kMKLDNNAlign = 64;
555 #endif
556 
557 } // namespace mxnet
558 
559 namespace std {
560 template<> struct hash<mxnet::Context> {
561  size_t operator()(const mxnet::Context& ctx) const {
562  size_t res = 0;
563  res = dmlc::HashCombine(res, static_cast<size_t>(ctx.dev_type));
564  res = dmlc::HashCombine(res, static_cast<size_t>(ctx.dev_id));
565  return res;
566  }
567 };
568 
569 #if __cplusplus < 201402L && !defined(_MSC_VER)
570 template<typename T, typename... Args>
571 inline std::unique_ptr<T> make_unique(Args&&... args) {
572  return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
573 }
574 #endif
575 } // namespace std
576 
577 #include "./tensor_blob.h"
579 #endif // MXNET_BASE_H_
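
A minimal usage sketch (not part of base.h) of the Context and RunContext APIs defined above. It assumes an application built against MXNet with this header reachable as <mxnet/base.h>; MyOperatorBody is a hypothetical function used only to illustrate how the primary and auxiliary GPU streams carried by a RunContext are obtained.

// Illustrative sketch only, assuming <mxnet/base.h> is on the include path.
#include <mxnet/base.h>
#include <iostream>
#include <unordered_set>

#if MXNET_USE_CUDA
// Hypothetical operator body: shows how the primary and auxiliary GPU streams
// are retrieved from a RunContext supplied by the execution engine.
void MyOperatorBody(const mxnet::RunContext& run_ctx) {
  mshadow::Stream<mxnet::gpu>* primary = run_ctx.get_stream<mxnet::gpu>();
  // The RAII wrapper syncs the aux stream with the primary stream on
  // construction and destruction, so work on both streams stays ordered
  // with respect to surrounding primary-stream work.
  mxnet::SyncedGPUAuxStream aux = run_ctx.get_gpu_aux_stream();
  mshadow::Stream<mxnet::gpu>* secondary = aux.GetStream();
  // ... launch independent kernels on `primary` and `secondary` ...
  (void)primary; (void)secondary;
}
#endif

int main() {
  // Contexts can be created directly or parsed from their string form.
  mxnet::Context cpu_ctx = mxnet::Context::CPU();                       // cpu(0)
  mxnet::Context gpu_ctx = mxnet::Context::GPU(1);                      // gpu(1)
  mxnet::Context pinned  = mxnet::Context::FromString("cpu_pinned(0)");

  // dev_mask() folds kCPUPinned/kCPUShared back to kCPU for kernel dispatch.
  std::cout << pinned << " has dev_mask " << pinned.dev_mask() << std::endl;

  // operator==, operator< and std::hash<mxnet::Context> (all defined above)
  // make Context usable as a map/set key.
  std::unordered_set<mxnet::Context> seen{cpu_ctx, gpu_ctx, pinned};
  std::cout << "distinct contexts: " << seen.size() << std::endl;
  return 0;
}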