mxnet
tensor_gpu-inl.h
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*!
 * \file tensor_gpu-inl.h
 * \brief implementation of GPU host code
 * \author Bing Xu, Tianqi Chen
 */
#ifndef MSHADOW_TENSOR_GPU_INL_H_
#define MSHADOW_TENSOR_GPU_INL_H_
#include "./base.h"
#include "./tensor.h"

namespace mshadow {
#if MSHADOW_USE_CUDA
template<>
inline void InitTensorEngine<gpu>(int dev_id) {
  cudaDeviceProp prop;
  int device_id = 0;
  int device_count = 0;
  cudaGetDeviceCount(&device_count);
  CHECK_GT(device_count, 0) << "Cannot find CUDA device. Please check CUDA configuration.";
  if (dev_id < 0) {
    device_id = 0;
  } else {
    device_id = dev_id;
  }
  CHECK_LT(device_id, device_count) << "Incorrect Device ID";
  MSHADOW_CUDA_CALL(cudaSetDevice(device_id));
  MSHADOW_CUDA_CALL(cudaGetDeviceProperties(&prop, device_id));
}
template<>
inline void ShutdownTensorEngine<gpu>(void) {
}
template<>
inline void SetDevice<gpu>(int devid) {
  MSHADOW_CUDA_CALL(cudaSetDevice(devid));
}
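
// Illustrative lifecycle of the engine hooks above (a sketch, not part of
// this header; device id 0 is an assumption for a single-GPU machine):
//
//   mshadow::InitTensorEngine<mshadow::gpu>(0);      // validate and select device 0
//   mshadow::SetDevice<mshadow::gpu>(0);             // switch devices later if needed
//   // ... allocate tensors and launch computations ...
//   mshadow::ShutdownTensorEngine<mshadow::gpu>();   // currently a no-op for gpu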
template<int dim, typename DType>
inline void AllocSpace(Tensor<gpu, dim, DType> *obj, bool pad) {
  size_t pitch;
  // a common choice of CUDA memory alignment unit is 32
  if (pad && obj->size(dim - 1) >= MSHADOW_MIN_PAD_RATIO * 32) {
    MSHADOW_CUDA_CALL(cudaMallocPitch(reinterpret_cast<void**>(&(obj->dptr_)), &pitch,
                                      obj->size(dim - 1) * sizeof(DType),
                                      obj->shape_.FlatTo2D()[0]));
    obj->stride_ = static_cast<index_t>(pitch / sizeof(DType));
  } else {
    obj->stride_ = obj->size(dim - 1);
    MSHADOW_CUDA_CALL(cudaMallocPitch(reinterpret_cast<void**>(&(obj->dptr_)), &pitch,
                                      obj->shape_.Size() * sizeof(DType), 1));
  }
}
template<int dim, typename DType>
inline void FreeSpace(Tensor<gpu, dim, DType> *obj) {
  MSHADOW_CUDA_CALL(cudaFree(obj->dptr_));
  obj->dptr_ = NULL;
}
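
// A minimal sketch of the allocate/free pair above (illustrative, not part of
// this header). Rows may be padded by cudaMallocPitch, so index through
// stride_ rather than assuming contiguous rows:
//
//   mshadow::Tensor<mshadow::gpu, 2, float> mat(mshadow::Shape2(64, 100));
//   mshadow::AllocSpace(&mat, /*pad=*/true);  // may set mat.stride_ > 100
//   // ... use mat on the device ...
//   mshadow::FreeSpace(&mat);                 // frees memory, sets mat.dptr_ to NULL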
template<typename A, typename B, int dim, typename DType>
inline void Copy(Tensor<A, dim, DType> _dst,
                 Tensor<B, dim, DType> _src,
                 cudaMemcpyKind kind,
                 Stream<gpu> *stream) {
  CHECK_EQ(_dst.shape_, _src.shape_) << "Copy:shape mismatch";
  Tensor<A, 2, DType> dst = _dst.FlatTo2D();
  Tensor<B, 2, DType> src = _src.FlatTo2D();
  MSHADOW_CUDA_CALL(cudaMemcpy2DAsync(dst.dptr_, dst.stride_ * sizeof(DType),
                                      src.dptr_, src.stride_ * sizeof(DType),
                                      dst.size(1) * sizeof(DType),
                                      dst.size(0), kind,
                                      Stream<gpu>::GetStream(stream)));
  // use synchronous call behavior for the default (zero) stream
  if (stream == NULL) {
    MSHADOW_CUDA_CALL(cudaStreamSynchronize(0));
  }
}
template<int dim, typename DType>
inline void Copy(Tensor<cpu, dim, DType> dst,
                 const Tensor<gpu, dim, DType> &src,
                 Stream<gpu> *stream) {
  Copy(dst, src, cudaMemcpyDeviceToHost, stream);
}
template<int dim, typename DType>
inline void Copy(Tensor<gpu, dim, DType> dst,
                 const Tensor<gpu, dim, DType> &src,
                 Stream<gpu> *stream) {
  Copy(dst, src, cudaMemcpyDeviceToDevice, stream);
}
template<int dim, typename DType>
inline void Copy(Tensor<gpu, dim, DType> dst,
                 const Tensor<cpu, dim, DType> &src,
                 Stream<gpu> *stream) {
  Copy(dst, src, cudaMemcpyHostToDevice, stream);
}
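
// Illustrative host/device round trip through the overloads above (a sketch;
// passing a NULL stream makes each copy synchronous, per the generic Copy):
//
//   mshadow::Tensor<mshadow::cpu, 2, float> h(mshadow::Shape2(64, 100));
//   mshadow::Tensor<mshadow::gpu, 2, float> d(mshadow::Shape2(64, 100));
//   mshadow::AllocSpace(&h);
//   mshadow::AllocSpace(&d);
//   mshadow::Copy(d, h, static_cast<mshadow::Stream<mshadow::gpu>*>(NULL));  // H2D
//   mshadow::Copy(h, d, static_cast<mshadow::Stream<mshadow::gpu>*>(NULL));  // D2H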
#endif  // MSHADOW_USE_CUDA
}  // namespace mshadow

// the following part is included only if the compiler is nvcc
#ifdef __CUDACC__
#include "./cuda/tensor_gpu-inl.cuh"

namespace mshadow {
template<typename Saver, typename R, int dim,
         typename DType, typename E, int etype>
inline void MapExp(TRValue<R, gpu, dim, DType> *dst,
                   const expr::Exp<E, DType, etype> &exp) {
  expr::TypeCheckPass<expr::TypeCheck<gpu, dim, DType, E>::kMapPass>
      ::Error_All_Tensor_in_Exp_Must_Have_Same_Type();
  Shape<dim> eshape = expr::ShapeCheck<dim, E>::Check(exp.self());
  Shape<dim> dshape = expr::ShapeCheck<dim, R>::Check(dst->self());
  CHECK(eshape[0] == 0 || eshape == dshape)
    << "Assignment: Shape of Tensors are not consistent with target, "
    << "eshape: " << eshape << " dshape:" << dshape;
  cuda::MapPlan<Saver>(MakePlan(dst->self()),
                       MakePlan(exp.self()),
                       dshape.FlatTo2D(),
                       Stream<gpu>::GetStream(expr::StreamInfo<gpu, R>::Get(dst->self())));
}

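// What MapExp enables (illustrative sketch, not part of this header): an
// expression-template assignment such as the one below is evaluated by a
// single fused CUDA kernel over the flattened 2-D view of the operands
// (tensors assumed already allocated and filled, all with the same shape):
//
//   mshadow::Tensor<mshadow::gpu, 2, float> a, b, c;
//   c = a * 2.0f + b;  // dispatches to MapExp with the assignment saver
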
template<typename Saver, typename Reducer,
         typename R, typename DType, typename E, int etype>
inline void MapReduceKeepLowest(TRValue<R, gpu, 1, DType> *dst,
                                const expr::Exp<E, DType, etype> &exp,
                                DType scale) {
  expr::TypeCheckPass<expr::TypeCheck<gpu, 1, DType, E>::kRedPass>
      ::Error_TypeCheck_Not_Pass_For_Reduce_Exp();
  Shape<2> eshape = expr::ShapeCheck<expr::ExpInfo<E>::kDim, E>
      ::Check(exp.self()).FlatTo2D();
  Shape<1> dshape = expr::ShapeCheck<1, R>::Check(dst->self());
  CHECK_EQ(eshape[1], dshape[0]) << "MapReduceKeepLowest::reduction dimension does not match";
  CHECK_NE(eshape[0], 0U) << "can not reduce over empty tensor";
  cuda::MapReduceKeepLowest<Saver, Reducer>
      (MakePlan(dst->self()), MakePlan(exp.self()), scale, eshape,
       Stream<gpu>::GetStream(expr::StreamInfo<gpu, R>::Get(dst->self())));
}

template<typename Saver, typename Reducer, int dimkeep,
         typename R, typename DType, typename E, int etype>
inline void MapReduceKeepHighDim(TRValue<R, gpu, 1, DType> *dst,
                                 const expr::Exp<E, DType, etype> &exp,
                                 DType scale) {
  expr::TypeCheckPass<expr::TypeCheck<gpu, dimkeep, DType, E>::kRedPass>
      ::Error_TypeCheck_Not_Pass_For_Reduce_Exp();
  typedef Shape<expr::ExpInfo<E>::kDim> EShape;
  EShape eshape = expr::ShapeCheck<expr::ExpInfo<E>::kDim, E>
      ::Check(exp.self());
  Shape<1> dshape = expr::ShapeCheck<1, R>::Check(dst->self());
  CHECK_EQ(eshape[dimkeep], dshape[0]) << "MapReduceKeepHighDim::reduction dimension does not match";
  // use equivalent form
  Shape<4> pshape = Shape4(eshape.ProdShape(0, dimkeep),
                           eshape[dimkeep],
                           eshape.ProdShape(dimkeep + 1, EShape::kSubdim),
                           eshape[EShape::kSubdim]);
  // call the equivalent reduction that keeps dimension 1
  cuda::MapReduceKeepDim1<Saver, Reducer>
      (MakePlan(dst->self()), MakePlan(exp.self()), scale, pshape,
       Stream<gpu>::GetStream(expr::StreamInfo<gpu, R>::Get(dst->self())));
}
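
// Illustrative reduction through the entry points above (a sketch, not part
// of this header): summing an [n][m] tensor over its rows yields a length-m
// vector; the sum_rows expression is expected to lower to MapReduceKeepLowest.
//
//   mshadow::Tensor<mshadow::gpu, 2, float> mat(mshadow::Shape2(4, 8));
//   mshadow::Tensor<mshadow::gpu, 1, float> colsum(mshadow::Shape1(8));
//   // ... allocate and fill ...
//   colsum = mshadow::expr::sum_rows(mat);
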
template<typename DType>
inline void Softmax(Tensor<gpu, 2, DType> dst,
                    const Tensor<gpu, 2, DType>& src) {
  cuda::Softmax(dst, src);
}

template<typename DType>
inline void Softmax(Tensor<gpu, 3, DType> dst,
                    const Tensor<gpu, 3, DType>& src) {
  cuda::Softmax(dst, src);
}

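// Illustrative use of the Softmax wrappers above (a sketch): row-wise
// normalization dst[i][j] = exp(src[i][j]) / sum_k exp(src[i][k]).
//
//   mshadow::Tensor<mshadow::gpu, 2, float> energy(mshadow::Shape2(16, 10));
//   mshadow::Tensor<mshadow::gpu, 2, float> prob(mshadow::Shape2(16, 10));
//   // ... allocate and fill energy ...
//   mshadow::Softmax(prob, energy);
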
template<typename DType>
inline void SoftmaxGrad(const Tensor<gpu, 2, DType> &dst,
                        const Tensor<gpu, 2, DType> &src,
                        const Tensor<gpu, 1, DType> &label) {
  cuda::SoftmaxGrad(dst, src, label);
}

template<typename DType>
inline void SmoothSoftmaxGrad(const Tensor<gpu, 2, DType> &dst,
                              const Tensor<gpu, 2, DType> &src,
                              const Tensor<gpu, 1, DType> &label,
                              const float alpha) {
  cuda::SmoothSoftmaxGrad(dst, src, label, alpha);
}

template<typename DType>
inline void SoftmaxGrad(const Tensor<gpu, 2, DType> &dst,
                        const Tensor<gpu, 2, DType> &src,
                        const Tensor<gpu, 1, DType> &label,
                        const DType &ignore_label) {
  cuda::SoftmaxGrad(dst, src, label, ignore_label);
}

template<typename DType>
inline void SmoothSoftmaxGrad(const Tensor<gpu, 2, DType> &dst,
                              const Tensor<gpu, 2, DType> &src,
                              const Tensor<gpu, 1, DType> &label,
                              const DType &ignore_label,
                              const float alpha) {
  cuda::SmoothSoftmaxGrad(dst, src, label, ignore_label, alpha);
}

template<typename DType>
inline void SoftmaxGrad(const Tensor<gpu, 3, DType> &dst,
                        const Tensor<gpu, 3, DType> &src,
                        const Tensor<gpu, 2, DType> &label) {
  cuda::SoftmaxGrad(dst, src, label);
}

template<typename DType>
inline void SoftmaxGrad(const Tensor<gpu, 3, DType> &dst,
                        const Tensor<gpu, 3, DType> &src,
                        const Tensor<gpu, 2, DType> &label,
                        const DType &ignore_label) {
  cuda::SoftmaxGrad(dst, src, label, ignore_label);
}

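// Illustrative use of the gradient wrappers above (a sketch for the 2-D case;
// names and sizes are placeholders): given softmax outputs and per-row target
// labels, dst receives the gradient of the softmax cross-entropy loss.
//
//   mshadow::Tensor<mshadow::gpu, 2, float> grad(mshadow::Shape2(16, 10));
//   mshadow::Tensor<mshadow::gpu, 2, float> prob(mshadow::Shape2(16, 10));
//   mshadow::Tensor<mshadow::gpu, 1, float> label(mshadow::Shape1(16));
//   // ... allocate; prob holds Softmax output, label holds class ids ...
//   mshadow::SoftmaxGrad(grad, prob, label);
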
template<bool clip, typename IndexType, typename DType>
inline void AddTakeGrad(Tensor<gpu, 2, DType> dst,
                        const Tensor<gpu, 1, IndexType>& index,
                        const Tensor<gpu, 2, DType> &src) {
  cuda::AddTakeGrad<clip, IndexType, DType>(dst, index, src);
}

template<typename IndexType, typename DType>
inline void AddTakeGradLargeBatch(Tensor<gpu, 2, DType> dst,
                                  const Tensor<gpu, 1, IndexType>& sorted,
                                  const Tensor<gpu, 1, IndexType>& index,
                                  const Tensor<gpu, 2, DType> &src) {
  cuda::AddTakeGradLargeBatch(dst, sorted, index, src);
}

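// Illustrative embedding-gradient accumulation via the wrappers above (a
// sketch; vocab/batch/dim are placeholder sizes). Semantically this performs
// dst[index[i]] += src[i]; the clip template flag controls how out-of-range
// indices are handled.
//
//   const int vocab = 1000, batch = 32, dim = 64;
//   mshadow::Tensor<mshadow::gpu, 2, float> wgrad(mshadow::Shape2(vocab, dim));
//   mshadow::Tensor<mshadow::gpu, 1, int>   idx(mshadow::Shape1(batch));
//   mshadow::Tensor<mshadow::gpu, 2, float> ograd(mshadow::Shape2(batch, dim));
//   // ... allocate and fill ...
//   mshadow::AddTakeGrad<true, int, float>(wgrad, idx, ograd);
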
template<typename KDType, typename VDType>
inline void SortByKey(Tensor<gpu, 1, KDType> keys, Tensor<gpu, 1, VDType> values,
                      bool is_ascend) {
  cuda::SortByKey(keys, values, is_ascend);
}

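// Illustrative key-value sort via the wrapper above (a sketch): keys and
// values are parallel 1-D tensors sorted together by key; a stable sort is
// performed, matching the CPU counterpart.
//
//   mshadow::Tensor<mshadow::gpu, 1, float> keys(mshadow::Shape1(256));
//   mshadow::Tensor<mshadow::gpu, 1, int>   vals(mshadow::Shape1(256));
//   // ... allocate and fill ...
//   mshadow::SortByKey(keys, vals, /*is_ascend=*/true);
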
template<typename IndexType, typename DType>
inline void IndexFill(Tensor<gpu, 2, DType> dst,
                      const Tensor<gpu, 1, IndexType>& index,
                      const Tensor<gpu, 2, DType> &src) {
  cuda::IndexFill(dst, index, src);
}
}  // namespace mshadow
#endif  // __CUDACC__
#endif  // MSHADOW_TENSOR_GPU_INL_H_