25 #ifndef MSHADOW_TENSOR_GPU_INL_H_ 26 #define MSHADOW_TENSOR_GPU_INL_H_ 37 cudaGetDeviceCount(&device_count);
38 CHECK_GT(device_count, 0) <<
"Cannot find CUDA device. Please check CUDA-Configuration";
44 CHECK_LT(device_id, device_count) <<
"Incorrect Device ID";
55 template<
int dim,
typename DType>
61 obj->
size(dim - 1) *
sizeof(DType),
62 obj->
shape_.FlatTo2D()[0]));
67 obj->
shape_.Size() *
sizeof(DType), 1));
70 template<
int dim,
typename DType>
75 template<
typename A,
typename B,
int dim,
typename DType>
80 CHECK_EQ(_dst.
shape_, _src.
shape_) <<
"Copy:shape mismatch";
85 dst.
size(1) *
sizeof(DType),
93 template<
int dim,
typename DType>
97 Copy(dst, src, cudaMemcpyDeviceToHost, stream);
99 template<
int dim,
typename DType>
103 Copy(dst, src, cudaMemcpyDeviceToDevice, stream);
105 template<
int dim,
typename DType>
109 Copy(dst, src, cudaMemcpyHostToDevice, stream);
111 #endif // MSHADOW_USE_CUDA 116 #include "./cuda/tensor_gpu-inl.cuh" 119 template<
typename Saver,
typename R,
int dim,
120 typename DType,
typename E,
int etype>
124 ::Error_All_Tensor_in_Exp_Must_Have_Same_Type();
127 CHECK(eshape[0] == 0 || eshape == dshape)
128 <<
"Assignment: Shape of Tensors are not consistent with target, " 129 <<
"eshape: " << eshape <<
" dshape:" << dshape;
136 template<
typename Saver,
typename Reducer,
137 typename R,
typename DType,
typename E,
int etype>
142 ::Error_TypeCheck_Not_Pass_For_Reduce_Exp();
144 ::Check(exp.
self()).FlatTo2D();
146 CHECK_EQ(eshape[1], dshape[0]) <<
"MapReduceKeepLowest::reduction dimension do not match";
147 CHECK_NE(eshape[0], 0U) <<
"can not reduce over empty tensor";
148 cuda::MapReduceKeepLowest<Saver, Reducer>
153 template<
typename Saver,
typename Reducer,
int dimkeep,
154 typename R,
typename DType,
typename E,
int etype>
159 ::Error_TypeCheck_Not_Pass_For_Reduce_Exp();
164 CHECK_EQ(eshape[dimkeep], dshape[0]) <<
"MapReduceKeepHighDim::reduction dimension do not match";
168 eshape.ProdShape(dimkeep + 1, EShape::kSubdim),
169 eshape[EShape::kSubdim]);
171 cuda::MapReduceKeepDim1<Saver, Reducer>
175 template<
typename DType>
181 template<
typename DType>
187 template<
typename DType>
194 template<
typename DType>
202 template<
typename DType>
206 const DType &ignore_label) {
210 template<
typename DType>
214 const DType &ignore_label,
219 template<
typename DType>
226 template<
typename DType>
230 const DType &ignore_label) {
234 template<
bool clip,
typename IndexType,
typename DType>
238 cuda::AddTakeGrad<clip, IndexType, DType>(dst, index, src);
241 template<
typename IndexType,
typename DType>
249 template<
typename KDType,
typename VDType>
255 template<
typename IndexType,
typename DType>
263 #endif // MSHADOW_TENSOR_GPU_INL_H_ void FreeSpace(Tensor< cpu, dim, DType > *obj)
CPU/GPU: free the space of tensor, will set obj.dptr to NULL.
Definition: tensor_cpu-inl.h:140
void IndexFill(Tensor< cpu, 2, DType > dst, const Tensor< cpu, 1, IndexType > &index, const Tensor< cpu, 2, DType > &src)
CPU/GPU: Fill the values of the destination matrix to specific rows in the source matrix...
Definition: tensor_cpu-inl.h:547
void SoftmaxGrad(Tensor< cpu, 2, DType > dst, const Tensor< cpu, 2, DType > &src, const Tensor< cpu, 1, DType > &label)
CPU/GPU: softmax gradient.
Definition: tensor_cpu-inl.h:306
void SmoothSoftmaxGrad(Tensor< cpu, 2, DType > dst, const Tensor< cpu, 2, DType > &src, const Tensor< cpu, 1, DType > &label, const float alpha)
Definition: tensor_cpu-inl.h:323
PaddingExp< SrcExp, DType, ExpInfo< SrcExp >::kDim > pad(const Exp< SrcExp, DType, etype > &src, index_t pad)
padding expression, pad an image with zeros on boundaries, padding affects shape[0], and shape[1]
Definition: pad.h:71
DType * dptr_
pointer to the data
Definition: tensor.h:434
Tensor RValue, this is the super type of all kinds of possible tensors.
Definition: tensor.h:409
const SubType & self(void) const
Definition: expression.h:82
used to help static type check
Definition: expr_engine-inl.h:330
void Copy(Tensor< cpu, dim, DType > dst, const Tensor< cpu, dim, DType > &src, Stream< cpu > *stream=NULL)
copy data from one tensor to another, with same shape
Definition: tensor_cpu-inl.h:145
void MapExp(TRValue< R, cpu, dim, DType > *dst, const expr::Exp< E, DType, etype > &exp)
CPU/GPU: map an expression to a tensor, this function calls MapPlan.
Definition: tensor_cpu-inl.h:207
Definition: stream_gpu-inl.h:37
Shape< dimension > shape_
shape of the tensor
Definition: tensor.h:436
MSHADOW_XINLINE Shape< 4 > Shape4(index_t s0, index_t s1, index_t s2, index_t s3)
construct a four dimension shape, stride will equal s0
Definition: tensor.h:240
void SortByKey(Tensor< cpu, 1, KDType > keys, Tensor< cpu, 1, VDType > values, bool is_ascend=true)
CPU/GPU: Sort key-value pairs stored in separate places. (Stable sort is performed!) ...
Definition: tensor_cpu-inl.h:558
void Softmax(Tensor< cpu, 2, DType > dst, const Tensor< cpu, 2, DType > &energy)
CPU/GPU: normalize softmax: dst[i][j] = exp(energy[i][j]) /(sum_j exp(energy[i][j])) ...
Definition: tensor_cpu-inl.h:483
#define MSHADOW_CUDA_CALL(func)
Protected cuda call in mshadow.
Definition: base.h:278
void MapReduceKeepLowest(TRValue< R, cpu, 1, DType > *dst, const expr::Exp< E, DType, etype > &exp, DType scale=1)
CPU/GPU: map an expression, do reduction to 1D Tensor in lowest dimension (dimension 0) ...
Definition: tensor_cpu-inl.h:223
static Shape< dim > Check(const E &t)
header file of tensor data structure and functions This lib requires explicit memory allocation and d...
Definition: expr_engine-inl.h:345
int32_t index_t
type that will be used for index
Definition: base.h:343
void AllocSpace(Tensor< cpu, dim, DType > *obj, bool pad=MSHADOW_ALLOC_PAD)
CPU/GPU: allocate space for CTensor, according to the shape in the obj this function is responsible t...
Definition: tensor_cpu-inl.h:116
MSHADOW_XINLINE Tensor< Device, 2, DType > FlatTo2D(void) const
flatten the tensor to 2 dimension, collapse the higher dimensions together
Definition: tensor.h:519
MSHADOW_XINLINE index_t size(int idx) const
return size of i-th dimension, start counting from highest dimension
Definition: tensor.h:505
void ShutdownTensorEngine< gpu >(void)
Definition: tensor_gpu-inl.h:49
void AddTakeGradLargeBatch(Tensor< cpu, 2, DType > dst, const Tensor< cpu, 1, IndexType > &sorted, const Tensor< cpu, 1, IndexType > &index, const Tensor< cpu, 2, DType > &src)
CPU/GPU: Gradient accumulate of embedding matrix. dst[sorted[i]] += src[index[i]] Called when the bat...
Definition: tensor_cpu-inl.h:537
runtime shape checking template get the shape of an expression, report error if shape mismatch ...
Definition: expr_engine-inl.h:364
void InitTensorEngine< gpu >(int dev_id)
Definition: tensor_gpu-inl.h:33
void MapReduceKeepHighDim(TRValue< R, cpu, 1, DType > *dst, const expr::Exp< E, DType, etype > &exp, DType scale=1)
CPU/GPU: map an expression, do reduction to 1D Tensor in third dimension (dimension 2) ...
Definition: tensor_cpu-inl.h:250
defines how expression exp can be evaluated and stored into dst
Definition: expression.h:79
Plan< BinaryMapExp< OP, TA, TB, DType, etype >, DType > MakePlan(const BinaryMapExp< OP, TA, TB, DType, etype > &e)
Definition: expr_engine-inl.h:239
void SetDevice< gpu >(int devid)
Definition: tensor_gpu-inl.h:52
void AddTakeGrad(Tensor< cpu, 2, DType > dst, const Tensor< cpu, 1, IndexType > &index, const Tensor< cpu, 2, DType > &src)
CPU/GPU: Gradient accumulate of embedding matrix. dst[index[i]] += src[i] Called when the featuredim ...
Definition: tensor_cpu-inl.h:516
overloaded + operator between half_t and bf16_t
Definition: base.h:334
index_t stride_
storing the stride information in x dimension this is used to deal with pitch allocation in gpu or ss...
Definition: tensor.h:441
general tensor
Definition: tensor.h:420
#define MSHADOW_MIN_PAD_RATIO
x dimension of data must be bigger pad_size * ratio to be alloced padded memory, otherwise use tide a...
Definition: base.h:83
computation stream structure, used for asynchronous computations
Definition: tensor.h:383