mxnet
base.h
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License. You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied. See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
25 #ifndef MXNET_BASE_H_
26 #define MXNET_BASE_H_
27 
28 #include "dmlc/base.h"
29 #include <string>
30 #include "dmlc/io.h"
31 #include "dmlc/type_traits.h"
32 #include "dmlc/parameter.h"
33 #include "mshadow/tensor.h"
34 // nnvm headers for symbolic construction.
35 #include "nnvm/op.h"
36 #include "nnvm/symbolic.h"
37 #include "libinfo.h"
38 #include "tuple.h"
39 
40 
/*
 * Compiler sanity check: with GCC (not Clang) in C++11 mode, require g++ >= 4.8,
 * which is the first release with full-enough C++11 support for this codebase.
 */
#if DMLC_USE_CXX11 && defined(__GNUC__) && !defined(__clang_version__)
#if __GNUC__ == 4 && __GNUC_MINOR__ < 8
#error "Currently we need g++ 4.8 or higher to fully support c++11 features"
// Neuter the C++11 contextual keywords so the #error above is the only
// diagnostic the user sees on ancient compilers.
#define override
#define final
#endif
#endif

/*
 * MXNET_API: DLL export/import annotation for public symbols.
 * On MSVC this expands to dllexport when building MXNet itself
 * (MXNET_EXPORTS defined) and dllimport for consumers; elsewhere it is empty.
 */
#ifdef _MSC_VER
#ifdef MXNET_EXPORTS
#define MXNET_API __declspec(dllexport)
#else
#define MXNET_API __declspec(dllimport)
#endif
#else
#define MXNET_API
#endif

/*
 * MXNET_PREDICT_ONLY: set to 1 for a stripped-down inference-only build;
 * defaults to 0 (full build) when not set by the build system.
 */
#ifndef MXNET_PREDICT_ONLY
#define MXNET_PREDICT_ONLY 0
#endif
72 
/*! \brief major version */
#define MXNET_MAJOR 1
/*! \brief minor version */
#define MXNET_MINOR 5
/*! \brief patch version */
#define MXNET_PATCH 1
/*! \brief mxnet version encoded as major*10000 + minor*100 + patch */
#define MXNET_VERSION (MXNET_MAJOR*10000 + MXNET_MINOR*100 + MXNET_PATCH)
/*!
 * \brief encode a (major, minor, patch) triple the same way as MXNET_VERSION.
 * Fix: `patch` is now parenthesized like `major`/`minor`; the unparenthesized
 * form mis-expanded argument expressions containing low-precedence operators
 * (e.g. a conditional expression passed as the patch argument).
 */
#define MXNET_MAKE_VERSION(major, minor, patch) ((major)*10000 + (minor)*100 + (patch))
/*! \brief use current function name as the profiler message */
#define PROFILER_MESSAGE_FUNCNAME (__FUNCTION__)
87 
89 namespace mxnet {
91 typedef mshadow::cpu cpu;
93 typedef mshadow::gpu gpu;
97 typedef mshadow::default_real_t real_t;
99 using Op = nnvm::Op;
100 
102 struct Context {
104  enum DeviceType {
105  kCPU = cpu::kDevMask,
106  kGPU = gpu::kDevMask,
109  };
113  int32_t dev_id;
115  Context() : dev_type(kCPU), dev_id(0) {}
120  inline DeviceType dev_mask() const {
121  if (dev_type == kCPUPinned || dev_type == kCPUShared) return kCPU;
122  return dev_type;
123  }
127  inline int real_dev_id() const {
128  if (dev_type == kCPUPinned || dev_type == kGPU) return dev_id;
129  return 0;
130  }
136  inline bool operator<(const Context &b) const;
142  inline bool operator==(const Context &b) const {
143  return dev_type == b.dev_type && dev_id == b.dev_id;
144  }
150  inline bool operator!=(const Context &b) const {
151  return !(*this == b);
152  }
157  inline void Save(dmlc::Stream *strm) const {
158  strm->Write(&dev_type, sizeof(dev_type));
159  strm->Write(&dev_id, sizeof(dev_id));
160  }
166  inline bool Load(dmlc::Stream *strm) {
167  if (strm->Read(&dev_type, sizeof(dev_type)) != sizeof(dev_type)) return false;
168  if (strm->Read(&dev_id, sizeof(int32_t)) != sizeof(int32_t)) return false;
169  return true;
170  }
172  static const int32_t kMaxDevType = 6;
174  static const int32_t kMaxDevID = 16;
180  inline static Context Create(DeviceType dev_type, int32_t dev_id = -1);
182  inline static Context CPU(int32_t dev_id = 0);
188  inline static Context GPU(int32_t dev_id = -1);
193  inline static int32_t GetGPUCount();
198  inline static int32_t GetGPUStreamsPerWorker();
206  inline static void GetGPUMemoryInformation(int dev, uint64_t *free, uint64_t *total);
212  inline static Context CPUPinned(int32_t dev_id = -1);
218  inline static Context CPUShared(int32_t dev_id = 0);
224  inline static Context FromString(const std::string& str);
225 };
226 
227 #if MXNET_USE_CUDA
228 
230  public:
235  explicit GPUAuxStream(mshadow::Stream<gpu> *primary_stream) :
236  primary_stream_(primary_stream),
237  aux_stream_(primary_stream),
238  gpu_stream_sync_event_(nullptr) {
239  if (Context::GetGPUStreamsPerWorker() >= 2) {
240  // Create auxiliary stream on the same device with the same properties as the primary stream
241  bool primary_has_blas_handle =
242  primary_stream->blas_handle_ownership_ == mshadow::Stream<gpu>::OwnHandle;
243  bool primary_has_dnn_handle =
244  primary_stream->dnn_handle_ownership_ == mshadow::Stream<gpu>::OwnHandle;
245  aux_stream_ = mshadow::NewStream<gpu>(primary_has_blas_handle,
246  primary_has_dnn_handle,
247  primary_stream->dev_id);
248  MSHADOW_CUDA_CALL(cudaEventCreateWithFlags(&gpu_stream_sync_event_, cudaEventDisableTiming));
249  }
250  }
253  // If the aux_stream_ == primary_stream_, then we created no new streams to destroy.
254  if (aux_stream_ != primary_stream_) {
255  MSHADOW_CATCH_ERROR(mshadow::DeleteStream<gpu>(aux_stream_));
256  MSHADOW_CATCH_ERROR(cudaEventDestroy(gpu_stream_sync_event_));
257  }
258  }
263  // If the aux_stream_ == primary_stream_, then no synchronization is necessary.
264  if (aux_stream_ != primary_stream_)
265  StreamSync(primary_stream_, aux_stream_, gpu_stream_sync_event_);
266  }
271  // If the aux_stream_ == primary_stream_, then no synchronization is necessary.
272  if (aux_stream_ != primary_stream_)
273  StreamSync(aux_stream_, primary_stream_, gpu_stream_sync_event_);
274  }
276  mshadow::Stream<gpu> *GetStream() { return aux_stream_; }
283  static void StreamSync(mshadow::Stream<gpu> *s1, mshadow::Stream<gpu> *s2, cudaEvent_t event) {
284  MSHADOW_CUDA_CALL(cudaEventRecord(event, s1->stream_));
285  MSHADOW_CUDA_CALL(cudaStreamWaitEvent(s2->stream_, event, 0));
286  }
287 
288  private:
289  mshadow::Stream<gpu> *primary_stream_;
290  mshadow::Stream<gpu> *aux_stream_;
291  cudaEvent_t gpu_stream_sync_event_;
292 };
293 
303  public:
308  explicit SyncedGPUAuxStream(GPUAuxStream *gpu_aux_stream) : gpu_aux_stream_(gpu_aux_stream) {
309  gpu_aux_stream_->PreAuxStreamUseSync();
310  }
313  gpu_aux_stream_->PostAuxStreamUseSync();
314  }
316  SyncedGPUAuxStream(const SyncedGPUAuxStream&) = delete;
318  void operator=(const SyncedGPUAuxStream&) = delete;
322  SyncedGPUAuxStream& operator=(SyncedGPUAuxStream&&) = default;
324  inline mshadow::Stream<gpu>* GetStream() const {
325  return gpu_aux_stream_->GetStream();
326  }
327 
328  private:
329  GPUAuxStream *gpu_aux_stream_;
330 };
331 #endif // MXNET_USE_CUDA
332 
337 struct RunContext {
343  void *stream;
347  void *aux_stream;
351  bool is_bulk;
357  template<typename xpu>
358  inline mshadow::Stream<xpu>* get_stream() const {
359  return static_cast<mshadow::Stream<xpu>*>(stream);
360  }
361 #if MXNET_USE_CUDA
362 
367  return SyncedGPUAuxStream(static_cast<GPUAuxStream*>(aux_stream));
368  }
369 #endif
370 
371  inline const Context& get_ctx() const {
372  return ctx;
373  }
374 };
375 } // namespace mxnet
376 
378 namespace mxnet {
379 // implementing Context
380 inline bool Context::operator<(const Context &b) const {
381  if (dev_type == b.dev_type) {
382  return dev_id < b.dev_id;
383  } else {
384  return dev_type < b.dev_type;
385  }
386 }
388  Context ctx;
389  ctx.dev_type = dev_type;
390  if (dev_id < 0) {
391  ctx.dev_id = 0;
392  if (dev_type & kGPU) {
393 #if MXNET_USE_CUDA
394  CHECK_EQ(cudaGetDevice(&ctx.dev_id), cudaSuccess);
395 #else
396  LOG(FATAL) << "Please compile with CUDA enabled for cuda features";
397 #endif
398  }
399  } else {
400  ctx.dev_id = dev_id;
401  }
402  return ctx;
403 }
404 inline Context Context::CPU(int32_t dev_id) {
405  return Create(kCPU, dev_id);
406 }
407 
408 inline Context Context::CPUPinned(int32_t dev_id) {
409  return Create(kCPUPinned, dev_id);
410 }
411 
412 inline Context Context::CPUShared(int32_t dev_id) {
413  return Create(kCPUShared, dev_id);
414 }
415 
416 inline Context Context::GPU(int32_t dev_id) {
417  return Create(kGPU, dev_id);
418 }
419 
420 inline int32_t Context::GetGPUCount() {
421 #if MXNET_USE_CUDA
422  int32_t count;
423  cudaError_t e = cudaGetDeviceCount(&count);
424  if (e == cudaErrorNoDevice) {
425  return 0;
426  }
427  CHECK_EQ(e, cudaSuccess) << " CUDA: " << cudaGetErrorString(e);
428  return count;
429 #else
430  return 0;
431 #endif
432 }
433 
434 inline int32_t Context::GetGPUStreamsPerWorker() {
435  // The default number of streams available if the user has not set MXNET_GPU_WORKER_NSTREAMS.
436  const int32_t default_num_streams = 1;
437  // The get_aux_stream() interface can supply one additional stream beyond the standard one.
438  static int32_t num_streams =
439  dmlc::GetEnv("MXNET_GPU_WORKER_NSTREAMS", default_num_streams) >= 2 ? 2 : 1;
440  return num_streams;
441 }
442 
443 inline void Context::GetGPUMemoryInformation(int dev, uint64_t *free_mem,
444  uint64_t *total_mem) {
445 #if MXNET_USE_CUDA
446 
447  size_t memF, memT;
448  cudaError_t e;
449 
450  int curDevice;
451  e = cudaGetDevice(&curDevice);
452  CHECK_EQ(e, cudaSuccess) << " CUDA: " << cudaGetErrorString(e);
453 
454  e = cudaSetDevice(dev);
455  CHECK_EQ(e, cudaSuccess) << " CUDA: " << cudaGetErrorString(e);
456 
457  e = cudaMemGetInfo(&memF, &memT);
458  CHECK_EQ(e, cudaSuccess) << " CUDA: " << cudaGetErrorString(e);
459 
460  e = cudaSetDevice(curDevice);
461  CHECK_EQ(e, cudaSuccess) << " CUDA: " << cudaGetErrorString(e);
462 
463  *free_mem = static_cast<uint64_t>(memF);
464  *total_mem = static_cast<uint64_t>(memT);
465 
466 #else
467  LOG(FATAL)
468  << "This call is only supported for MXNet built with CUDA support.";
469 #endif
470 }
471 
472 inline Context Context::FromString(const std::string& str) {
473  Context ret;
474  try {
475  const std::string::size_type l = str.find('(');
476  CHECK_NE(l, std::string::npos);
477  const std::string::size_type r = str.find(')');
478  CHECK_EQ(r, str.length()-1);
479 
480  const std::string type = str.substr(0, l);
481  int id = std::stoi(str.substr(l+1, r-l-1));
482  if (type == "cpu") {
483  ret = CPU(id);
484  } else if (type == "gpu") {
485  ret = GPU(id);
486  } else if (type == "cpu_pinned") {
487  ret = CPUPinned(id);
488  } else if (type == "cpu_shared") {
489  ret = CPUShared(id);
490  } else {
491  LOG(FATAL) << "Invalid context string " << str;
492  }
493  } catch (...) {
494  LOG(FATAL) << "Invalid context string " << str;
495  }
496  return ret;
497 }
498 
499 inline std::ostream& operator<<(std::ostream &out, const Context &ctx) {
500  if (ctx.dev_type == Context::kCPU) {
501  out << "cpu(";
502  } else if (ctx.dev_type == Context::kGPU) {
503  out << "gpu(";
504  } else if (ctx.dev_type == Context::kCPUPinned) {
505  out << "cpu_pinned(";
506  } else if (ctx.dev_type == Context::kCPUShared) {
507  out << "cpu_shared(";
508  } else {
509  out << "unknown(";
510  }
511  out << ctx.dev_id << ")";
512  return out;
513 }
514 
// describe op registration point
// Two-level stringize: STRINGIZE_DETAIL pastes its argument verbatim, while
// STRINGIZE first lets the preprocessor expand macros such as __LINE__, so
// STRINGIZE(__LINE__) yields "123" rather than "__LINE__".
#define STRINGIZE_DETAIL(x) #x
#define STRINGIZE(x) STRINGIZE_DETAIL(x)
// Append the file/line of the registration site to an operator description.
#define MXNET_DESCRIBE(...) describe(__VA_ARGS__ "\n\nFrom:" __FILE__ ":" STRINGIZE(__LINE__))
// Suffix noting where an operator was defined, for generated documentation.
#define ADD_FILELINE "\n\nDefined in " __FILE__ ":L" STRINGIZE(__LINE__)
520 
521 
#if MXNET_USE_MKLDNN == 1
// Byte alignment used for MKL-DNN buffers (presumably the library's preferred
// alignment -- confirm against MKL-DNN docs).
constexpr size_t kMKLDNNAlign = 64;
#endif
525 
526 } // namespace mxnet
527 
528 namespace std {
529 template<> struct hash<mxnet::Context> {
530  size_t operator()(const mxnet::Context& ctx) const {
531  size_t res = 0;
532  res = dmlc::HashCombine(res, static_cast<size_t>(ctx.dev_type));
533  res = dmlc::HashCombine(res, static_cast<size_t>(ctx.dev_id));
534  return res;
535  }
536 };
537 
// Backfill of std::make_unique (a C++14 library feature) for pre-C++14
// compilers; MSVC provides it even in C++11 mode, hence the exclusion.
// NOTE(review): adding declarations to namespace std is undefined behavior per
// the C++ standard; this works on the toolchains MXNet targets, but callers
// should migrate to -std=c++14 (or later) so this block compiles away.
#if __cplusplus < 201402L && !defined(_MSC_VER)
template<typename T, typename... Args>
inline std::unique_ptr<T> make_unique(Args&&... args) {
  return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
}
#endif
544 } // namespace std
545 
546 #include "./tensor_blob.h"
548 #endif // MXNET_BASE_H_
DeviceType dev_mask() const
Get corresponding device mask.
Definition: base.h:120
static const int32_t kMaxDevID
the maximal device index
Definition: base.h:174
bool is_bulk
indicator of whether this execution is run in bulk mode
Definition: base.h:351
namespace of mxnet
Definition: base.h:89
static void GetGPUMemoryInformation(int dev, uint64_t *free, uint64_t *total)
get the free and total available memory on a GPU
SyncedGPUAuxStream(GPUAuxStream *gpu_aux_stream)
constructor.
Definition: base.h:308
mshadow::Stream< xpu > * get_stream() const
get mshadow stream from Context
Definition: base.h:358
void PostAuxStreamUseSync()
Makes future primary stream work wait on the completion of existing aux stream work.
Definition: base.h:270
static int32_t GetGPUCount()
bool Load(dmlc::Stream *strm)
load the content from binary stream
Definition: base.h:166
mshadow::default_real_t real_t
data type that will be used to store ndarray
Definition: base.h:97
static Context GPU(int32_t dev_id=-1)
Definition: tuple.h:709
GPUAuxStream(mshadow::Stream< gpu > *primary_stream)
constructor.
Definition: base.h:235
Context ctx
base Context
Definition: base.h:339
bool operator<(const Context &b) const
Comparator, used to enable Context as std::map key.
static const int32_t kMaxDevType
the maximal device type
Definition: base.h:172
execution time context. The information needed in runtime for actual execution.
Definition: base.h:337
Holds an auxiliary mshadow gpu stream that can be synced with a primary stream.
Definition: base.h:229
mshadow::Stream< gpu > * GetStream()
Getter for created auxiliary stream.
Definition: base.h:276
Provides automatic coordination of an auxiliary stream with a primary one. This object, upon construction, prepares an aux stream for use by syncing it with enqueued primary-stream work. Object destruction will sync again so future primary-stream work will wait on enqueued aux-stream work. If MXNET_GPU_WORKER_NSTREAMS == 1, then this defaults simply: the primary stream will equal the aux stream and the syncs will be executed as nops. See ./src/operator/cudnn/cudnn_convolution-inl.h for a usage example.
Definition: base.h:302
void * aux_stream
the auxiliary stream of the device, can be NULL or Stream<gpu>* in GPU mode
Definition: base.h:347
~GPUAuxStream()
destructor
Definition: base.h:252
DeviceType dev_type
the device type we run the op on
Definition: base.h:111
Definition: base.h:105
static int32_t GetGPUStreamsPerWorker()
int32_t dev_id
device id we are going to run it on
Definition: base.h:113
Definition: base.h:107
mshadow::Stream< gpu > * GetStream() const
Getter for underlying mshadow::Stream<gpu>.
Definition: base.h:324
void * stream
the stream of the device, can be NULL or Stream<gpu>* in GPU mode
Definition: base.h:343
void Save(dmlc::Stream *strm) const
save the content into binary stream
Definition: base.h:157
mshadow::gpu gpu
mxnet gpu
Definition: base.h:93
const Context & get_ctx() const
get the base Context from RunContext
Definition: base.h:371
Definition: base.h:106
DeviceType
Type of device.
Definition: base.h:104
static Context CPUShared(int32_t dev_id=0)
mshadow::cpu cpu
mxnet cpu
Definition: base.h:91
~SyncedGPUAuxStream()
destructor
Definition: base.h:312
int real_dev_id() const
Returns dev_id for kGPU and kCPUPinned, 0 otherwise.
Definition: base.h:127
nnvm::Op Op
operator structure from NNVM
Definition: base.h:99
Context()
default constructor
Definition: base.h:115
static Context Create(DeviceType dev_type, int32_t dev_id=-1)
Create a new context.
bool operator!=(const Context &b) const
check if current context not equals another one
Definition: base.h:150
static Context CPU(int32_t dev_id=0)
std::ostream & operator<<(std::ostream &out, const NDArray &ndarray)
SyncedGPUAuxStream get_gpu_aux_stream() const
get an RAII object that transparently handles the syncing of the auxiliary stream.
Definition: base.h:366
static Context CPUPinned(int32_t dev_id=-1)
void PreAuxStreamUseSync()
Makes future aux stream work wait on the completion of existing primary stream work.
Definition: base.h:262
Data structures Tuple and TShape that store dynamically sized shapes.
Definition: base.h:108
static Context FromString(const std::string &str)
static void StreamSync(mshadow::Stream< gpu > *s1, mshadow::Stream< gpu > *s2, cudaEvent_t event)
Make future work enqueued to s2 wait on completion of current work enqueued to s1.
Definition: base.h:283
mshadow::index_t index_t
index type usually use unsigned
Definition: base.h:95
TBlob class that holds a common representation of an arbitrary-dimension tensor; can be used to transform...
Context information about the execution environment.
Definition: base.h:102
get features of the MXNet library at runtime
bool operator==(const Context &b) const
check if current context equals another one
Definition: base.h:142
unsigned index_t
Definition: base.h:37