mxnet
tensor_gpu-inl.h
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*!
 * \file tensor_gpu-inl.h
 * \brief implementation of GPU host code
 * \author Bing Xu, Tianqi Chen
 */
#ifndef MSHADOW_TENSOR_GPU_INL_H_
#define MSHADOW_TENSOR_GPU_INL_H_
#include "./base.h"
#include "./tensor.h"

namespace mshadow {
#if MSHADOW_USE_CUDA
template<>
inline void InitTensorEngine<gpu>(int dev_id) {
  cudaDeviceProp prop;
  int device_id = 0;
  int device_count = 0;
  cudaGetDeviceCount(&device_count);
  CHECK_GT(device_count, 0) << "Cannot find CUDA device. Please check CUDA configuration.";
  if (dev_id < 0) {
    device_id = 0;
  } else {
    device_id = dev_id;
  }
  CHECK_LT(device_id, device_count) << "Incorrect Device ID";
  MSHADOW_CUDA_CALL(cudaSetDevice(device_id));
  MSHADOW_CUDA_CALL(cudaGetDeviceProperties(&prop, device_id));
}
template<>
inline void ShutdownTensorEngine<gpu>(void) {
}
template<>
inline void SetDevice<gpu>(int devid) {
  MSHADOW_CUDA_CALL(cudaSetDevice(devid));
}
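
// Illustrative lifecycle of the engine hooks above (a sketch, not part of
// this header; device id 0 is an assumption for a single-GPU machine):
//
//   mshadow::InitTensorEngine<mshadow::gpu>(0);      // validate and select device 0
//   mshadow::SetDevice<mshadow::gpu>(0);             // switch devices later if needed
//   // ... allocate tensors and launch computations ...
//   mshadow::ShutdownTensorEngine<mshadow::gpu>();   // currently a no-op for gpu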
template<int dim, typename DType>
inline void AllocSpace(Tensor<gpu, dim, DType> *obj, bool pad) {
  size_t pitch;
  // a common choice of CUDA memory alignment unit is 32
  if (pad && obj->size(dim - 1) >= MSHADOW_MIN_PAD_RATIO * 32) {
    MSHADOW_CUDA_CALL(cudaMallocPitch(reinterpret_cast<void**>(&(obj->dptr_)), &pitch,
                                      obj->size(dim - 1) * sizeof(DType),
                                      obj->shape_.FlatTo2D()[0]));
    obj->stride_ = static_cast<index_t>(pitch / sizeof(DType));
  } else {
    obj->stride_ = obj->size(dim - 1);
    MSHADOW_CUDA_CALL(cudaMallocPitch(reinterpret_cast<void**>(&(obj->dptr_)), &pitch,
                                      obj->shape_.Size() * sizeof(DType), 1));
  }
}
template<int dim, typename DType>
inline void FreeSpace(Tensor<gpu, dim, DType> *obj) {
  MSHADOW_CUDA_CALL(cudaFree(obj->dptr_));
  obj->dptr_ = NULL;
}
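
// A minimal sketch of the allocate/free pair above (illustrative, not part of
// this header). Rows may be padded by cudaMallocPitch, so index through
// stride_ rather than assuming contiguous rows:
//
//   mshadow::Tensor<mshadow::gpu, 2, float> mat(mshadow::Shape2(64, 100));
//   mshadow::AllocSpace(&mat, /*pad=*/true);  // may set mat.stride_ > 100
//   // ... use mat on the device ...
//   mshadow::FreeSpace(&mat);                 // frees memory, sets mat.dptr_ to NULL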
template<typename A, typename B, int dim, typename DType>
inline void Copy(Tensor<A, dim, DType> _dst,
                 Tensor<B, dim, DType> _src,
                 cudaMemcpyKind kind,
                 Stream<gpu> *stream) {
  CHECK_EQ(_dst.shape_, _src.shape_) << "Copy:shape mismatch";
  Tensor<A, 2, DType> dst = _dst.FlatTo2D();
  Tensor<B, 2, DType> src = _src.FlatTo2D();
  MSHADOW_CUDA_CALL(cudaMemcpy2DAsync(dst.dptr_, dst.stride_ * sizeof(DType),
                                      src.dptr_, src.stride_ * sizeof(DType),
                                      dst.size(1) * sizeof(DType),
                                      dst.size(0), kind,
                                      Stream<gpu>::GetStream(stream)));
  // use synchronous call behavior for the default (zero) stream
  if (stream == NULL) {
    MSHADOW_CUDA_CALL(cudaStreamSynchronize(0));
  }
}
template<int dim, typename DType>
inline void Copy(Tensor<cpu, dim, DType> dst,
                 const Tensor<gpu, dim, DType> &src,
                 Stream<gpu> *stream) {
  Copy(dst, src, cudaMemcpyDeviceToHost, stream);
}
template<int dim, typename DType>
inline void Copy(Tensor<gpu, dim, DType> dst,
                 const Tensor<gpu, dim, DType> &src,
                 Stream<gpu> *stream) {
  Copy(dst, src, cudaMemcpyDeviceToDevice, stream);
}
template<int dim, typename DType>
inline void Copy(Tensor<gpu, dim, DType> dst,
                 const Tensor<cpu, dim, DType> &src,
                 Stream<gpu> *stream) {
  Copy(dst, src, cudaMemcpyHostToDevice, stream);
}
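
// Illustrative host/device round trip through the overloads above (a sketch;
// passing a NULL stream makes each copy synchronous, per the generic Copy):
//
//   mshadow::Tensor<mshadow::cpu, 2, float> h(mshadow::Shape2(64, 100));
//   mshadow::Tensor<mshadow::gpu, 2, float> d(mshadow::Shape2(64, 100));
//   mshadow::AllocSpace(&h);
//   mshadow::AllocSpace(&d);
//   mshadow::Copy(d, h, static_cast<mshadow::Stream<mshadow::gpu>*>(NULL));  // H2D
//   mshadow::Copy(h, d, static_cast<mshadow::Stream<mshadow::gpu>*>(NULL));  // D2H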
#endif  // MSHADOW_USE_CUDA
}  // namespace mshadow

// the following part is included only if the compiler is nvcc
#ifdef __CUDACC__
#include "./cuda/tensor_gpu-inl.cuh"

namespace mshadow {
template<typename Saver, typename R, int dim,
         typename DType, typename E, int etype>
inline void MapExp(TRValue<R, gpu, dim, DType> *dst,
                   const expr::Exp<E, DType, etype> &exp) {
  expr::TypeCheckPass<expr::TypeCheck<gpu, dim, DType, E>::kMapPass>
      ::Error_All_Tensor_in_Exp_Must_Have_Same_Type();
  Shape<dim> eshape = expr::ShapeCheck<dim, E>::Check(exp.self());
  Shape<dim> dshape = expr::ShapeCheck<dim, R>::Check(dst->self());
  CHECK(eshape[0] == 0 || eshape == dshape)
    << "Assignment: Shape of Tensors are not consistent with target, "
    << "eshape: " << eshape << " dshape:" << dshape;
  cuda::MapPlan<Saver>(MakePlan(dst->self()),
                       MakePlan(exp.self()),
                       dshape.FlatTo2D(),
                       Stream<gpu>::GetStream(expr::StreamInfo<gpu, R>::Get(dst->self())));
}

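// What MapExp enables (illustrative sketch, not part of this header): an
// expression-template assignment such as the one below is evaluated by a
// single fused CUDA kernel over the flattened 2-D view of the operands
// (tensors assumed already allocated and filled, all with the same shape):
//
//   mshadow::Tensor<mshadow::gpu, 2, float> a, b, c;
//   c = a * 2.0f + b;  // dispatches to MapExp with the assignment saver
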
template<typename Saver, typename Reducer,
         typename R, typename DType, typename E, int etype>
inline void MapReduceKeepLowest(TRValue<R, gpu, 1, DType> *dst,
                                const expr::Exp<E, DType, etype> &exp,
                                DType scale) {
  expr::TypeCheckPass<expr::TypeCheck<gpu, 1, DType, E>::kRedPass>
      ::Error_TypeCheck_Not_Pass_For_Reduce_Exp();
  Shape<2> eshape = expr::ShapeCheck<expr::ExpInfo<E>::kDim, E>
      ::Check(exp.self()).FlatTo2D();
  Shape<1> dshape = expr::ShapeCheck<1, R>::Check(dst->self());
  CHECK_EQ(eshape[1], dshape[0]) << "MapReduceKeepLowest::reduction dimension does not match";
  CHECK_NE(eshape[0], 0U) << "can not reduce over empty tensor";
  cuda::MapReduceKeepLowest<Saver, Reducer>
      (MakePlan(dst->self()), MakePlan(exp.self()), scale, eshape,
       Stream<gpu>::GetStream(expr::StreamInfo<gpu, R>::Get(dst->self())));
}

template<typename Saver, typename Reducer, int dimkeep,
         typename R, typename DType, typename E, int etype>
inline void MapReduceKeepHighDim(TRValue<R, gpu, 1, DType> *dst,
                                 const expr::Exp<E, DType, etype> &exp,
                                 DType scale) {
  expr::TypeCheckPass<expr::TypeCheck<gpu, dimkeep, DType, E>::kRedPass>
      ::Error_TypeCheck_Not_Pass_For_Reduce_Exp();
  typedef Shape<expr::ExpInfo<E>::kDim> EShape;
  EShape eshape = expr::ShapeCheck<expr::ExpInfo<E>::kDim, E>
      ::Check(exp.self());
  Shape<1> dshape = expr::ShapeCheck<1, R>::Check(dst->self());
  CHECK_EQ(eshape[dimkeep], dshape[0]) << "MapReduceKeepHighDim::reduction dimension does not match";
  // use equivalent form
  Shape<4> pshape = Shape4(eshape.ProdShape(0, dimkeep),
                           eshape[dimkeep],
                           eshape.ProdShape(dimkeep + 1, EShape::kSubdim),
                           eshape[EShape::kSubdim]);
  // call the equivalent reduction that keeps dimension 1
  cuda::MapReduceKeepDim1<Saver, Reducer>
      (MakePlan(dst->self()), MakePlan(exp.self()), scale, pshape,
       Stream<gpu>::GetStream(expr::StreamInfo<gpu, R>::Get(dst->self())));
}
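
// Illustrative reduction through the entry points above (a sketch, not part
// of this header): summing an [n][m] tensor over its rows yields a length-m
// vector; the sum_rows expression is expected to lower to MapReduceKeepLowest.
//
//   mshadow::Tensor<mshadow::gpu, 2, float> mat(mshadow::Shape2(4, 8));
//   mshadow::Tensor<mshadow::gpu, 1, float> colsum(mshadow::Shape1(8));
//   // ... allocate and fill ...
//   colsum = mshadow::expr::sum_rows(mat);
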
template<typename DType>
inline void Softmax(Tensor<gpu, 2, DType> dst,
                    const Tensor<gpu, 2, DType>& src) {
  cuda::Softmax(dst, src);
}

template<typename DType>
inline void Softmax(Tensor<gpu, 3, DType> dst,
                    const Tensor<gpu, 3, DType>& src) {
  cuda::Softmax(dst, src);
}

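// Illustrative use of the Softmax wrappers above (a sketch): row-wise
// normalization dst[i][j] = exp(src[i][j]) / sum_k exp(src[i][k]).
//
//   mshadow::Tensor<mshadow::gpu, 2, float> energy(mshadow::Shape2(16, 10));
//   mshadow::Tensor<mshadow::gpu, 2, float> prob(mshadow::Shape2(16, 10));
//   // ... allocate and fill energy ...
//   mshadow::Softmax(prob, energy);
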
template<typename DType>
inline void SoftmaxGrad(const Tensor<gpu, 2, DType> &dst,
                        const Tensor<gpu, 2, DType> &src,
                        const Tensor<gpu, 1, DType> &label) {
  cuda::SoftmaxGrad(dst, src, label);
}

template<typename DType>
inline void SmoothSoftmaxGrad(const Tensor<gpu, 2, DType> &dst,
                              const Tensor<gpu, 2, DType> &src,
                              const Tensor<gpu, 1, DType> &label,
                              const float alpha) {
  cuda::SmoothSoftmaxGrad(dst, src, label, alpha);
}

template<typename DType>
inline void SoftmaxGrad(const Tensor<gpu, 2, DType> &dst,
                        const Tensor<gpu, 2, DType> &src,
                        const Tensor<gpu, 1, DType> &label,
                        const DType &ignore_label) {
  cuda::SoftmaxGrad(dst, src, label, ignore_label);
}

template<typename DType>
inline void SmoothSoftmaxGrad(const Tensor<gpu, 2, DType> &dst,
                              const Tensor<gpu, 2, DType> &src,
                              const Tensor<gpu, 1, DType> &label,
                              const DType &ignore_label,
                              const float alpha) {
  cuda::SmoothSoftmaxGrad(dst, src, label, ignore_label, alpha);
}

template<typename DType>
inline void SoftmaxGrad(const Tensor<gpu, 3, DType> &dst,
                        const Tensor<gpu, 3, DType> &src,
                        const Tensor<gpu, 2, DType> &label) {
  cuda::SoftmaxGrad(dst, src, label);
}

template<typename DType>
inline void SoftmaxGrad(const Tensor<gpu, 3, DType> &dst,
                        const Tensor<gpu, 3, DType> &src,
                        const Tensor<gpu, 2, DType> &label,
                        const DType &ignore_label) {
  cuda::SoftmaxGrad(dst, src, label, ignore_label);
}

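// Illustrative use of the gradient wrappers above (a sketch for the 2-D case;
// names and sizes are placeholders): given softmax outputs and per-row target
// labels, dst receives the gradient of the softmax cross-entropy loss.
//
//   mshadow::Tensor<mshadow::gpu, 2, float> grad(mshadow::Shape2(16, 10));
//   mshadow::Tensor<mshadow::gpu, 2, float> prob(mshadow::Shape2(16, 10));
//   mshadow::Tensor<mshadow::gpu, 1, float> label(mshadow::Shape1(16));
//   // ... allocate; prob holds Softmax output, label holds class ids ...
//   mshadow::SoftmaxGrad(grad, prob, label);
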
template<bool clip, typename IndexType, typename DType>
inline void AddTakeGrad(Tensor<gpu, 2, DType> dst,
                        const Tensor<gpu, 1, IndexType>& index,
                        const Tensor<gpu, 2, DType> &src) {
  cuda::AddTakeGrad<clip, IndexType, DType>(dst, index, src);
}

template<typename IndexType, typename DType>
inline void AddTakeGradLargeBatch(Tensor<gpu, 2, DType> dst,
                                  const Tensor<gpu, 1, IndexType>& sorted,
                                  const Tensor<gpu, 1, IndexType>& index,
                                  const Tensor<gpu, 2, DType> &src) {
  cuda::AddTakeGradLargeBatch(dst, sorted, index, src);
}

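// Illustrative embedding-gradient accumulation via the wrappers above (a
// sketch; vocab/batch/dim are placeholder sizes). Semantically this performs
// dst[index[i]] += src[i]; the clip template flag controls how out-of-range
// indices are handled.
//
//   const int vocab = 1000, batch = 32, dim = 64;
//   mshadow::Tensor<mshadow::gpu, 2, float> wgrad(mshadow::Shape2(vocab, dim));
//   mshadow::Tensor<mshadow::gpu, 1, int>   idx(mshadow::Shape1(batch));
//   mshadow::Tensor<mshadow::gpu, 2, float> ograd(mshadow::Shape2(batch, dim));
//   // ... allocate and fill ...
//   mshadow::AddTakeGrad<true, int, float>(wgrad, idx, ograd);
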
template<typename KDType, typename VDType>
inline void SortByKey(Tensor<gpu, 1, KDType> keys, Tensor<gpu, 1, VDType> values,
                      bool is_ascend) {
  cuda::SortByKey(keys, values, is_ascend);
}

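// Illustrative key-value sort via the wrapper above (a sketch): keys and
// values are parallel 1-D tensors sorted together by key; a stable sort is
// performed, matching the CPU counterpart.
//
//   mshadow::Tensor<mshadow::gpu, 1, float> keys(mshadow::Shape1(256));
//   mshadow::Tensor<mshadow::gpu, 1, int>   vals(mshadow::Shape1(256));
//   // ... allocate and fill ...
//   mshadow::SortByKey(keys, vals, /*is_ascend=*/true);
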
template<typename IndexType, typename DType>
inline void IndexFill(Tensor<gpu, 2, DType> dst,
                      const Tensor<gpu, 1, IndexType>& index,
                      const Tensor<gpu, 2, DType> &src) {
  cuda::IndexFill(dst, index, src);
}
}  // namespace mshadow
#endif  // __CUDACC__
#endif  // MSHADOW_TENSOR_GPU_INL_H_