25 #ifndef MSHADOW_TENSOR_GPU_INL_H_    26 #define MSHADOW_TENSOR_GPU_INL_H_    37   cudaGetDeviceCount(&device_count);
    38   CHECK_GT(device_count, 0) << 
"Cannot find CUDA device. Please check CUDA-Configuration";
    44   CHECK_LT(device_id, device_count) << 
"Incorrect Device ID";
    55 template<
int dim, 
typename DType>
    61                                       obj->
size(dim - 1) * 
sizeof(DType),
    62                                       obj->
shape_.FlatTo2D()[0]));
    67                                       obj->
shape_.Size() * 
sizeof(DType), 1));
    70 template<
int dim, 
typename DType>
    75 template<
typename A, 
typename B, 
int dim, 
typename DType>
    80   CHECK_EQ(_dst.
shape_, _src.
shape_) << 
"Copy:shape mismatch";
    85                                       dst.
size(1) * 
sizeof(DType),
    93 template<
int dim, 
typename DType>
    97   Copy(dst, src, cudaMemcpyDeviceToHost, stream);
    99 template<
int dim, 
typename DType>
   103   Copy(dst, src, cudaMemcpyDeviceToDevice, stream);
   105 template<
int dim, 
typename DType>
   109   Copy(dst, src, cudaMemcpyHostToDevice, stream);
   111 #endif  // MSHADOW_USE_CUDA   116 #include "./cuda/tensor_gpu-inl.cuh"   119 template<
typename Saver, 
typename R, 
int dim,
   120          typename DType, 
typename E, 
int etype>
   124       ::Error_All_Tensor_in_Exp_Must_Have_Same_Type();
   127   CHECK(eshape[0] == 0 || eshape == dshape)
   128     << 
"Assignment: Shape of Tensors are not consistent with target, "   129     << 
"eshape: " << eshape << 
" dshape:" << dshape;
   136 template<
typename Saver, 
typename Reducer,
   137          typename R, 
typename DType, 
typename E, 
int etype>
   142       ::Error_TypeCheck_Not_Pass_For_Reduce_Exp();
   144       ::Check(exp.
self()).FlatTo2D();
   146   CHECK_EQ(eshape[1], dshape[0]) << 
"MapReduceKeepLowest::reduction dimension do not match";
   147   CHECK_NE(eshape[0], 0U) << 
"can not reduce over empty tensor";
   148   cuda::MapReduceKeepLowest<Saver, Reducer>
   153 template<
typename Saver, 
typename Reducer, 
int dimkeep,
   154          typename R, 
typename DType, 
typename E, 
int etype>
   159       ::Error_TypeCheck_Not_Pass_For_Reduce_Exp();
   164   CHECK_EQ(eshape[dimkeep], dshape[0]) << 
"MapReduceKeepHighDim::reduction dimension do not match";
   168                            eshape.ProdShape(dimkeep + 1, EShape::kSubdim),
   169                            eshape[EShape::kSubdim]);
   171   cuda::MapReduceKeepDim1<Saver, Reducer>
   175 template<
typename DType>
   181 template<
typename DType>
   187 template<
typename DType>
   194 template<
typename DType>
   202 template<
typename DType>
   206                         const DType &ignore_label) {
   210 template<
typename DType>
   214                               const DType &ignore_label,
   219 template<
typename DType>
   226 template<
typename DType>
   230                         const DType &ignore_label) {
   234 template<
bool clip, 
typename IndexType, 
typename DType>
   238   cuda::AddTakeGrad<clip, IndexType, DType>(dst, index, src);
   241 template<
typename IndexType, 
typename DType>
   249 template<
typename KDType, 
typename VDType>
   255 template<
typename IndexType, 
typename DType>
   263 #endif  // MSHADOW_TENSOR_GPU_INL_H_ void FreeSpace(Tensor< cpu, dim, DType > *obj)
CPU/GPU: free the space of tensor, will set obj.dptr to NULL. 
Definition: tensor_cpu-inl.h:140
void IndexFill(Tensor< cpu, 2, DType > dst, const Tensor< cpu, 1, IndexType > &index, const Tensor< cpu, 2, DType > &src)
CPU/GPU: Fill the values of the destination matrix to specific rows in the source matrix...
Definition: tensor_cpu-inl.h:547
void SoftmaxGrad(Tensor< cpu, 2, DType > dst, const Tensor< cpu, 2, DType > &src, const Tensor< cpu, 1, DType > &label)
CPU/GPU: softmax gradient. 
Definition: tensor_cpu-inl.h:306
void SmoothSoftmaxGrad(Tensor< cpu, 2, DType > dst, const Tensor< cpu, 2, DType > &src, const Tensor< cpu, 1, DType > &label, const float alpha)
Definition: tensor_cpu-inl.h:323
PaddingExp< SrcExp, DType, ExpInfo< SrcExp >::kDim > pad(const Exp< SrcExp, DType, etype > &src, index_t pad)
padding expression: pads an image with zeros on its boundaries; padding affects shape[0] and shape[1]
Definition: pad.h:71
DType * dptr_
pointer to the data 
Definition: tensor.h:434
Tensor RValue, this is the super type of all kinds of possible tensors. 
Definition: tensor.h:409
const SubType & self(void) const
Definition: expression.h:82
used to help static type check 
Definition: expr_engine-inl.h:330
void Copy(Tensor< cpu, dim, DType > dst, const Tensor< cpu, dim, DType > &src, Stream< cpu > *stream=NULL)
copy data from one tensor to another, with same shape 
Definition: tensor_cpu-inl.h:145
void MapExp(TRValue< R, cpu, dim, DType > *dst, const expr::Exp< E, DType, etype > &exp)
CPU/GPU: map an expression to a tensor; this function calls MapPlan.
Definition: tensor_cpu-inl.h:207
Definition: stream_gpu-inl.h:37
Shape< dimension > shape_
shape of the tensor 
Definition: tensor.h:436
MSHADOW_XINLINE Shape< 4 > Shape4(index_t s0, index_t s1, index_t s2, index_t s3)
construct a four-dimensional shape; stride will equal s0
Definition: tensor.h:240
void SortByKey(Tensor< cpu, 1, KDType > keys, Tensor< cpu, 1, VDType > values, bool is_ascend=true)
CPU/GPU: Sort key-value pairs stored in separate places. (Stable sort is performed!) ...
Definition: tensor_cpu-inl.h:558
void Softmax(Tensor< cpu, 2, DType > dst, const Tensor< cpu, 2, DType > &energy)
CPU/GPU: normalize softmax: dst[i][j] = exp(energy[i][j]) /(sum_j exp(energy[i][j])) ...
Definition: tensor_cpu-inl.h:483
#define MSHADOW_CUDA_CALL(func)
Protected cuda call in mshadow. 
Definition: base.h:278
void MapReduceKeepLowest(TRValue< R, cpu, 1, DType > *dst, const expr::Exp< E, DType, etype > &exp, DType scale=1)
CPU/GPU: map an expression and reduce it to a 1D Tensor in the lowest dimension (dimension 0) ...
Definition: tensor_cpu-inl.h:223
static Shape< dim > Check(const E &t)
header file of tensor data structure and functions. This lib requires explicit memory allocation and d...
Definition: expr_engine-inl.h:345
int32_t index_t
type that will be used for index 
Definition: base.h:343
void AllocSpace(Tensor< cpu, dim, DType > *obj, bool pad=MSHADOW_ALLOC_PAD)
CPU/GPU: allocate space for CTensor according to the shape in the obj; this function is responsible t...
Definition: tensor_cpu-inl.h:116
MSHADOW_XINLINE Tensor< Device, 2, DType > FlatTo2D(void) const
flatten the tensor to 2 dimension, collapse the higher dimensions together 
Definition: tensor.h:519
MSHADOW_XINLINE index_t size(int idx) const
return size of i-th dimension, start counting from highest dimension 
Definition: tensor.h:505
void ShutdownTensorEngine< gpu >(void)
Definition: tensor_gpu-inl.h:49
void AddTakeGradLargeBatch(Tensor< cpu, 2, DType > dst, const Tensor< cpu, 1, IndexType > &sorted, const Tensor< cpu, 1, IndexType > &index, const Tensor< cpu, 2, DType > &src)
CPU/GPU: Gradient accumulation for the embedding matrix: dst[sorted[i]] += src[index[i]]. Called when the bat...
Definition: tensor_cpu-inl.h:537
runtime shape checking template: gets the shape of an expression and reports an error if the shapes mismatch ...
Definition: expr_engine-inl.h:364
void InitTensorEngine< gpu >(int dev_id)
Definition: tensor_gpu-inl.h:33
void MapReduceKeepHighDim(TRValue< R, cpu, 1, DType > *dst, const expr::Exp< E, DType, etype > &exp, DType scale=1)
CPU/GPU: map an expression and reduce it to a 1D Tensor in the third dimension (dimension 2) ...
Definition: tensor_cpu-inl.h:250
defines how expression exp can be evaluated and stored into dst 
Definition: expression.h:79
Plan< BinaryMapExp< OP, TA, TB, DType, etype >, DType > MakePlan(const BinaryMapExp< OP, TA, TB, DType, etype > &e)
Definition: expr_engine-inl.h:239
void SetDevice< gpu >(int devid)
Definition: tensor_gpu-inl.h:52
void AddTakeGrad(Tensor< cpu, 2, DType > dst, const Tensor< cpu, 1, IndexType > &index, const Tensor< cpu, 2, DType > &src)
CPU/GPU: Gradient accumulation for the embedding matrix: dst[index[i]] += src[i]. Called when the featuredim ...
Definition: tensor_cpu-inl.h:516
overloaded + operator between half_t and bf16_t 
Definition: base.h:334
index_t stride_
stores the stride information in the x dimension; this is used to deal with pitch allocation in gpu or ss...
Definition: tensor.h:441
general tensor 
Definition: tensor.h:420
#define MSHADOW_MIN_PAD_RATIO
x dimension of data must be bigger than pad_size * ratio to be allocated padded memory; otherwise use tight a...
Definition: base.h:83
computation stream structure, used for asynchronous computations
Definition: tensor.h:383