docs/api/tensor__cpu-inl_8h_source.html

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 #ifndef MSHADOW_TENSOR_CPU_INL_H_
 #define MSHADOW_TENSOR_CPU_INL_H_
 #include <algorithm>
 #include <cstring>
 #include <functional>
 #include <utility>
 #include <vector>
 #include "./base.h"
 #include "./tensor.h"
 #include "./packet-inl.h"
 #include "./dot_engine-inl.h"

 namespace mshadow {
 // CPU specialization of InitTensorEngine: the body is empty, so the call
 // does nothing and dev_id is ignored.
 template<>
 inline void InitTensorEngine<cpu>(int dev_id) {
 }
 // CPU specialization of ShutdownTensorEngine: the body is empty, so the
 // call does nothing.
 template<>
 inline void ShutdownTensorEngine<cpu>(void) {
 }

 // CPU specialization of SetDevice: the body is empty, so devid is ignored
 // and nothing is changed.
 template<>
 inline void SetDevice<cpu>(int devid) {
 }
 // CPU specialization of NewStream: heap-allocates a default-constructed
 // Stream<cpu> and returns the pointer. None of the three arguments is
 // consulted by this specialization.
 // \return pointer obtained from `new`; DeleteStream<cpu> releases it with `delete`
 template<>
 inline Stream<cpu> *NewStream<cpu>(bool create_blas_handle,
                                    bool create_dnn_handle,
                                    int dev_id) {
   return new Stream<cpu>();
 }
 // CPU specialization of DeleteStream: destroys the stream with `delete`.
 // \param stream pointer to release; it is handed straight to `delete`
 template<>
 inline void DeleteStream<cpu>(Stream<cpu> *stream) {
   delete stream;
 }

 // Streams a shape as a Python-style tuple of its extents, e.g. "(2,3)".
 // A one-dimensional shape gets the trailing comma of a 1-tuple: "(5,)".
 // \param os    output stream to write to
 // \param shape shape whose ndim extents are printed in index order
 // \return os, to allow chaining
 template<int ndim>
 inline std::ostream &operator<<(std::ostream &os, const Shape<ndim> &shape) { // NOLINT(*)
   os << '(';
   int axis = 0;
   while (axis < ndim) {
     os << shape[axis];
     ++axis;
     // separate consecutive extents, but do not trail the last one
     if (axis < ndim) os << ',';
   }
   // a lone extent still needs a comma to read as a tuple
   if (ndim == 1) os << ',';
   os << ')';
   return os;
 }

 // Device-specific raw host allocation. Only declared here; the explicit
 // specializations that follow supply the implementation per device type.
 // \param size forwarded by the visible specializations to the underlying
 //             allocator as the requested size
 // \return pointer to the allocated block
 template<typename xpu>
 inline void *AllocHost_(size_t size);
 // Counterpart of AllocHost_: releases a block for the given device type.
 // Only declared here; specialized per device type below.
 // \param dptr pointer to release
 template<typename xpu>
 inline void FreeHost_(void * dptr);

 #ifdef __CUDACC__
 // gpu flavour of AllocHost_ (this region is compiled only when __CUDACC__
 // is defined): requests `size` bytes from cudaMallocHost with the
 // cudaHostAllocPortable flag. The call is wrapped in MSHADOW_CUDA_CALL.
 // \return the pointer filled in by cudaMallocHost
 template<>
 inline void *AllocHost_<gpu>(size_t size) {
   void *dptr;
   MSHADOW_CUDA_CALL(cudaMallocHost(&dptr, size, cudaHostAllocPortable));
   return dptr;
 }
 // gpu flavour of FreeHost_: hands dptr to cudaFreeHost, wrapped in
 // MSHADOW_CUDA_CALL.
 template<>
 inline void FreeHost_<gpu>(void *dptr) {
   MSHADOW_CUDA_CALL(cudaFreeHost(dptr));
 }
 #endif

 // cpu flavour of AllocHost_: asks packet::AlignedMallocPitch for a single
 // line of `size` bytes and returns the resulting pointer.
 template<>
 inline void *AllocHost_<cpu>(size_t size) {
   // the allocator reports a pitch, which has no use for a one-line request
   size_t unused_pitch;
   void *host_ptr = packet::AlignedMallocPitch(&unused_pitch, size, 1);
   return host_ptr;
 }
 // cpu flavour of FreeHost_: releases dptr through packet::AlignedFree, the
 // counterpart of the AlignedMallocPitch call used by AllocHost_<cpu>.
 template<>
 inline void FreeHost_<cpu>(void *dptr) {
   packet::AlignedFree(dptr);
 }

 // Allocates host storage for *obj through AllocHost_<xpu> and stores the
 // result in obj->dptr_.
 // \tparam xpu device type selecting which AllocHost_ specialization is used
 // \param obj tensor to fill in; its stride_ is overwritten with
 //            size(dim - 1) and it must then report itself contiguous
 template<typename xpu, int dim, typename DType>
 inline void AllocHost(Tensor<cpu, dim, DType> *obj) {
   // NOTE(review): stride_ is assigned first, presumably because
   // CheckContiguous()/MSize() read it -- keep this order.
   obj->stride_ = obj->size(dim - 1);
   CHECK_EQ(obj->CheckContiguous(), true) << "AllocHost";
   const size_t num_bytes = obj->MSize() * sizeof(DType);
   obj->dptr_ = reinterpret_cast<DType*>(AllocHost_<xpu>(num_bytes));
 }
 // Releases the host storage of *obj through FreeHost_<xpu> and resets
 // obj->dptr_ to NULL.
 // \tparam xpu device type selecting which FreeHost_ specialization is used
 // \param obj tensor whose data pointer is released; a NULL pointer is
 //            reported as a fatal "double free"
 template<typename xpu, int dim, typename DType>
 inline void FreeHost(Tensor<cpu, dim, DType> *obj) {
   if (NULL == obj->dptr_) {
     LOG(FATAL) << "FreeHost:: double free";
   }
   void *host_ptr = obj->dptr_;
   FreeHost_<xpu>(host_ptr);
   // mark the tensor as released so a second call trips the check above
   obj->dptr_ = NULL;
 }

 // Allocates the data buffer of a CPU tensor and sets obj->stride_ and
 // obj->dptr_ accordingly.
 // \param obj tensor whose shape_ is read to size the buffer
 // \param pad when true, one line per row of the 2-D flattened shape is
 //            requested and stride_ becomes the reported pitch expressed in
 //            elements; when false, a single line holding shape_.Size()
 //            elements is requested and stride_ is size(dim - 1)
 template<int dim, typename DType>
 inline void AllocSpace(Tensor<cpu, dim, DType> *obj, bool pad) {
   size_t pitch_bytes;
   void *buffer;
   if (!pad) {
     // dense layout: rows follow each other without gaps
     obj->stride_ = obj->size(dim - 1);
     buffer = packet::AlignedMallocPitch
         (&pitch_bytes, obj->shape_.Size() * sizeof(DType), 1);
   } else {
     // pitched layout: the allocator decides how far apart rows start
     const size_t row_bytes = obj->size(dim - 1) * sizeof(DType);
     buffer = packet::AlignedMallocPitch
         (&pitch_bytes, row_bytes, obj->shape_.FlatTo2D()[0]);
     obj->stride_ = static_cast<index_t>(pitch_bytes / sizeof(DType));
   }
   obj->dptr_ = reinterpret_cast<DType*>(buffer);
 }
 // Shortcut that builds a tensor of the given shape, allocates its storage
 // with AllocSpace and assigns initv to it through MapExp.
 // \param shape   shape of the new tensor
 // \param initv   value assigned to the tensor after allocation
 // \param pad     forwarded to AllocSpace
 // \param stream_ stored in the tensor's stream_ member before allocation
 // \return the tensor; its storage is not released here
 template<typename Device, typename DType, int dim>
 inline Tensor<Device, dim, DType>
 NewTensor(const Shape<dim> &shape, DType initv, bool pad, Stream<Device> *stream_) {
   Tensor<Device, dim, DType> result(shape);
   result.stream_ = stream_;
   AllocSpace(&result, pad);
   const expr::ScalarExp<DType> fill(initv);
   MapExp<sv::saveto>(&result, fill);
   return result;
 }
 // Releases the data buffer of a CPU tensor through packet::AlignedFree and
 // resets obj->dptr_ to NULL. Unlike FreeHost, no NULL check is made first.
 // \param obj tensor whose dptr_ is released
 template<int dim, typename DType>
 inline void FreeSpace(Tensor<cpu, dim, DType> *obj) {
   packet::AlignedFree(obj->dptr_);
   obj->dptr_ = NULL;
 }
 // Copies the contents of _src into _dst; both must have the same shape.
 // \param _dst   destination tensor
 // \param _src   source tensor
 // \param stream not used by this CPU implementation
 template<int dim, typename DType>
 inline void Copy(Tensor<cpu, dim, DType> _dst,
                  const Tensor<cpu, dim, DType> &_src,
                  Stream<cpu> *stream) {
   CHECK_EQ(_dst.shape_, _src.shape_)
       << "Copy:shape mismatch:" << _dst.shape_ << " vs " << _src.shape_;
   const bool both_contiguous = _dst.CheckContiguous() && _src.CheckContiguous();
   if (both_contiguous) {
     // no gaps on either side: one block copy covers every element
     memcpy(_dst.dptr_, _src.dptr_, sizeof(DType) * _dst.shape_.Size());
     return;
   }
   // otherwise copy the 2-D flattened views one row at a time
   Tensor<cpu, 2, DType> dst_rows = _dst.FlatTo2D();
   Tensor<cpu, 2, DType> src_rows = _src.FlatTo2D();
   const index_t num_rows = dst_rows.size(0);
   const size_t row_bytes = sizeof(DType) * dst_rows.size(1);
   for (index_t r = 0; r < num_rows; ++r) {
     memcpy(dst_rows[r].dptr_, src_rows[r].dptr_, row_bytes);
   }
 }

 // Evaluates `plan` at every (row, column) of the 2-D flattened destination
 // and stores each value through Saver::Save.
 // \tparam Saver policy whose Save<DType>(dst_ref, value) performs the store
 // \param dst  destination; its shape is obtained via ShapeCheck and
 //             flattened to two dimensions
 // \param plan evaluation plan of the source expression
 template<typename Saver, typename R, int dim,
          typename DType, typename E>
 inline void MapPlan(TRValue<R, cpu, dim, DType> *dst,
                     const expr::Plan<E, DType> &plan) {
   const Shape<2> flat = expr::ShapeCheck<dim, R>::Check(dst->self()).FlatTo2D();
   expr::Plan<R, DType> dst_plan = expr::MakePlan(dst->self());
   const index_t num_rows = flat[0];
   const index_t num_cols = flat[1];
   // rows are distributed over OpenMP threads unless compiled by nvcc
 #ifndef __CUDACC__
   #pragma omp parallel for
 #endif
   for (openmp_index_t row = 0; row < num_rows; ++row) {
     for (index_t col = 0; col < num_cols; ++col) {
       Saver::template Save<DType>(dst_plan.REval(row, col), plan.Eval(row, col));
     }
   }
 }
 // Dispatch helper used by MapExp to choose between the packet (SIMD) path
 // and the plain element-wise path.
 // This primary template is the fallback: it builds a plan for the
 // expression and evaluates it with MapPlan.
 // NOTE(review): MakePlan is called unqualified, so it is presumably found
 // through argument-dependent lookup -- do not qualify it without checking.
 template<bool pass_check, typename Saver,
          typename R, int dim,
          typename DType, typename E, int etype>
 struct MapExpCPUEngine {
   inline static void Map(TRValue<R, cpu, dim, DType> *dst,
                          const expr::Exp<E, DType, etype> &exp) {
     MapPlan<Saver>(dst, MakePlan(exp.self()));
   }
 };

 // Specialization selected when pass_check is true and the destination is a
 // plain Tensor<cpu, dim, DType>.
 // At run time it additionally asks PacketAlignCheck about both the
 // expression and the destination: if both checks succeed the packet plan is
 // used (MapPacketPlan), otherwise it falls back to the element-wise MapPlan.
 template<typename SV, int dim, typename DType, typename E, int etype>
 struct MapExpCPUEngine<true, SV, Tensor<cpu, dim, DType>,
                        dim, DType, E, etype> {
   inline static void Map(Tensor<cpu, dim, DType> *dst,
                          const expr::Exp<E, DType, etype> &exp) {
     if (expr::PacketAlignCheck<dim, E, MSHADOW_DEFAULT_PACKET>::Check(exp.self()) &&
         expr::PacketAlignCheck<dim, Tensor<cpu, dim, DType>, MSHADOW_DEFAULT_PACKET>::Check(*dst)) {
       // both sides satisfy the packet alignment check: vectorized path
       expr::MapPacketPlan<SV>(dst->self(),
                               expr::MakePacketPlan<MSHADOW_DEFAULT_PACKET>(exp.self()));
     } else {
       // alignment check failed on at least one side: scalar path
       MapPlan<SV>(dst, MakePlan(exp.self()));
     }
   }
 };


 // Evaluates an expression into a CPU tensor.
 // After a static type check and a run-time shape check the work is handed
 // to MapExpCPUEngine, selected by PacketCheck<E, ...>::kPass.
 // \tparam Saver store policy forwarded to the engine
 // \param dst destination tensor
 // \param exp expression to evaluate; its shape must equal the destination
 //            shape unless its first extent is 0, in which case the shapes
 //            are not compared
 template<typename Saver, typename R, int dim,
          typename DType, typename E, int etype>
 inline void MapExp(TRValue<R, cpu, dim, DType> *dst,
                    const expr::Exp<E, DType, etype> &exp) {
   expr::TypeCheckPass<expr::TypeCheck<cpu, dim, DType, E>::kMapPass>
       ::Error_All_Tensor_in_Exp_Must_Have_Same_Type();
   Shape<dim> eshape = expr::ShapeCheck<dim, E>::Check(exp.self());
   Shape<dim> dshape = expr::ShapeCheck<dim, R>::Check(dst->self());
   CHECK(eshape[0] == 0 || eshape == dshape)
       << "Assignment: Shape of Tensors are not consistent with target, "
       << "eshape: " << eshape << " dshape:" << dshape;
   // name the engine once so the dispatch reads as a single call
   typedef MapExpCPUEngine<expr::PacketCheck<E, MSHADOW_DEFAULT_PACKET>::kPass,
                           Saver, R, dim, DType, E, etype> Engine;
   Engine::Map(dst->ptrself(), exp);
 }

 // Reduces the 2-D flattened expression over its rows, producing one value
 // per column, and stores value * scale into the 1-D destination.
 // \tparam Saver   store policy (Save<DType>(dst_ref, value))
 // \tparam Reducer reduction policy (Reduce(accumulator, value))
 // \param dst   1-D destination; its length must equal the column count
 // \param exp   expression to reduce; must have at least one row
 // \param scale factor applied to each reduced value before it is stored
 template<typename Saver, typename Reducer,
          typename R, typename DType, typename E, int etype>
 inline void MapReduceKeepLowest(TRValue<R, cpu, 1, DType> *dst,
                                 const expr::Exp<E, DType, etype> &exp,
                                 DType scale) {
   expr::TypeCheckPass<expr::TypeCheck<cpu, 1, DType, E>::kRedPass>
       ::Error_TypeCheck_Not_Pass_For_Reduce_Exp();
   Shape<2> eshape = expr::ShapeCheck<expr::ExpInfo<E>::kDim, E>
       ::Check(exp.self()).FlatTo2D();
   Shape<1> dshape = expr::ShapeCheck<1, R>::Check(dst->self());
   CHECK_EQ(eshape[1], dshape[0]) << "MapReduceKeepLowest::reduction dimension do not match";
   CHECK_NE(eshape[0], 0U) << "can not reduce over empty tensor";
   expr::Plan<R, DType> dplan = MakePlan(dst->self());
   expr::Plan<E, DType> splan = MakePlan(exp.self());
   const index_t num_rows = eshape[0];
   const index_t num_cols = eshape[1];
   // each column is reduced independently, so columns are split over threads
 #ifndef __CUDACC__
   #pragma omp parallel for
 #endif
   for (openmp_index_t col = 0; col < num_cols; ++col) {
     // seed with row 0 (guaranteed to exist by the check above)
     DType acc = splan.Eval(0, col);
     index_t row = 1;
     while (row < num_rows) {
       Reducer::Reduce(acc, splan.Eval(row, col));
       ++row;
     }
     Saver::template Save<DType>(dplan.REval(0, col), acc * scale);
   }
 }

 // Reduces an expression over every dimension except `dimkeep`, producing
 // one value per index of that dimension, and stores value * scale into the
 // 1-D destination.
 // \tparam Saver   store policy (Save<DType>(dst_ref, value))
 // \tparam Reducer reduction policy (SetInitValue / Reduce)
 // \tparam dimkeep dimension of the expression that is kept
 // \param dst   1-D destination; its length must equal eshape[dimkeep]
 // \param exp   expression to reduce
 // \param scale factor applied to each reduced value before it is stored
 template<typename Saver, typename Reducer, int dimkeep,
          typename R, typename DType, typename E, int etype>
 inline void MapReduceKeepHighDim(TRValue<R, cpu, 1, DType> *dst,
                                  const expr::Exp<E, DType, etype> &exp,
                                  DType scale) {
   expr::TypeCheckPass<expr::TypeCheck<cpu, dimkeep, DType, E>::kRedPass>
       ::Error_TypeCheck_Not_Pass_For_Reduce_Exp();
   typedef Shape<expr::ExpInfo<E>::kDim> EShape;
   EShape eshape = expr::ShapeCheck<expr::ExpInfo<E>::kDim, E>
       ::Check(exp.self());
   Shape<1> dshape = expr::ShapeCheck<1, R>::Check(dst->self());
   CHECK_EQ(eshape[dimkeep], dshape[0])
     << "MapReduceKeepHighDim::reduction dimension do not match";
   // use an equivalent 4-D form: (outer, kept, inner rows, last extent)
   // NOTE(review): assumes ProdShape(a, b) multiplies the extents in [a, b) -- confirm
   Shape<4> pshape = Shape4(eshape.ProdShape(0, dimkeep),
                            eshape[dimkeep],
                            eshape.ProdShape(dimkeep + 1, EShape::kSubdim),
                            eshape[EShape::kSubdim]);
   const index_t num_outer = pshape[0];
   const index_t num_kept = pshape[1];
   const index_t num_inner_rows = pshape[2];
   const index_t num_inner_cols = pshape[3];
   expr::Plan<R, DType> dplan = MakePlan(dst->self());
   expr::Plan<E, DType> splan = MakePlan(exp.self());
   // every kept index is reduced independently, one per loop iteration
 #ifndef __CUDACC__
   #pragma omp parallel for
 #endif
   for (openmp_index_t c = 0; c < num_kept; ++c) {
     DType total;
     Reducer::SetInitValue(total);
     for (index_t n = 0; n < num_outer; ++n) {
       // reduce one (outer, kept) slab on its own, then fold it into the total
       DType partial;
       Reducer::SetInitValue(partial);
       for (index_t y = 0; y < num_inner_rows; ++y) {
         for (index_t x = 0; x < num_inner_cols; ++x) {
           Reducer::Reduce(partial,
                           splan.Eval((n * num_kept + c) * num_inner_rows + y, x));
         }
       }
       Reducer::Reduce(total, partial);
     }
     Saver::template Save<DType>(dplan.REval(0, c), DType(total * scale));
   }
 }

 // Softmax over a vector: dst[x] = exp(energy[x] - max) / sum_x exp(energy[x] - max).
 // The maximum is subtracted before exponentiating, so the largest term is
 // exp(0).
 // \param dst    output vector; its length drives every loop
 // \param energy input vector; it is indexed with dst's length and no
 //               length comparison is made here
 // An empty dst returns immediately instead of reading energy[0].
 template<typename DType>
 inline void Softmax(Tensor<cpu, 1, DType> dst,
                     const Tensor<cpu, 1, DType> &energy) {
   const index_t len = dst.size(0);
   // nothing to normalize; also avoids reading energy[0] of an empty vector
   if (len == 0) return;
   DType mmax = energy[0];
   for (index_t x = 1; x < len; ++x) {
     if (mmax < energy[x]) mmax = energy[x];
   }
   DType sum = DType(0.0f);
   for (index_t x = 0; x < len; ++x) {
     dst[x] = std::exp(energy[x] - mmax);
     sum += dst[x];
   }
   for (index_t x = 0; x < len; ++x) {
     dst[x] /= sum;
   }
 }

 // Softmax gradient for a 2-D batch: each row of dst is a copy of the same
 // row of src, except that 1 is subtracted in the column selected by
 // label[row] (truncated to int). A label that equals no column index
 // leaves the row a plain copy.
 // \param dst   output; its extents drive both loops
 // \param src   input, indexed with dst's extents
 // \param label one entry per row
 template<typename DType>
 inline void SoftmaxGrad(Tensor<cpu, 2, DType> dst,
                         const Tensor<cpu, 2, DType> &src,
                         const Tensor<cpu, 1, DType> &label) {
   const index_t num_rows = dst.size(0);
   const index_t num_cols = dst.size(1);
 #pragma omp parallel for
   for (openmp_index_t row = 0; row < num_rows; ++row) {
     const index_t hot = static_cast<int>(label[row]);
     for (index_t col = 0; col < num_cols; ++col) {
       if (col != hot) {
         dst[row][col] = src[row][col];
       } else {
         dst[row][hot] = src[row][hot] - 1.0f;
       }
     }
   }
 }

 // Label-smoothed softmax gradient for a 2-D batch.
 // In the column selected by label[row] (truncated to int) the result is
 // src - 1 + alpha; in every other column it is src - alpha / (columns - 1).
 // \param dst   output; its extents drive both loops
 // \param src   input, indexed with dst's extents
 // \param label one entry per row
 // \param alpha smoothing amount
 template<typename DType>
 inline void SmoothSoftmaxGrad(Tensor<cpu, 2, DType> dst,
                         const Tensor<cpu, 2, DType> &src,
                         const Tensor<cpu, 1, DType> &label,
                         const float alpha) {
   // share of alpha handed to each column other than the labelled one
   const float smooth_grad = (alpha / (dst.size(1) - 1));
   const index_t num_rows = dst.size(0);
   const index_t num_cols = dst.size(1);
 #pragma omp parallel for
   for (openmp_index_t row = 0; row < num_rows; ++row) {
     const index_t hot = static_cast<int>(label[row]);
     for (index_t col = 0; col < num_cols; ++col) {
       if (col != hot) {
         dst[row][col] = src[row][col] - smooth_grad;
       } else {
         dst[row][hot] = src[row][hot] - 1.0f + alpha;
       }
     }
   }
 }


 // Softmax gradient for a 2-D batch with an ignored label value.
 // Rows whose label (truncated to int) equals ignore_label (truncated to
 // int) are set to zero; every other row is a copy of src with 1 subtracted
 // in the labelled column.
 // \param dst          output; its extents drive both loops
 // \param src          input, indexed with dst's extents
 // \param label        one entry per row
 // \param ignore_label label value marking rows that get a zero gradient
 template<typename DType>
 inline void SoftmaxGrad(Tensor<cpu, 2, DType> dst,
                         const Tensor<cpu, 2, DType> &src,
                         const Tensor<cpu, 1, DType> &label,
                         const DType &ignore_label) {
   const index_t num_rows = dst.size(0);
   const int num_cols = static_cast<int>(dst.size(1));
 #pragma omp parallel for
   for (openmp_index_t row = 0; row < num_rows; ++row) {
     const int k = static_cast<int>(label[row]);
     // the ignore decision is per row, so take it once before touching columns
     if (static_cast<int>(ignore_label) == k) {
       for (int col = 0; col < num_cols; ++col) {
         dst[row][col] = 0.0f;
       }
       continue;
     }
     for (int col = 0; col < num_cols; ++col) {
       if (col != k) {
         dst[row][col] = src[row][col];
       } else {
         dst[row][k] = src[row][k] - 1.0f;
       }
     }
   }
 }

 // Label-smoothed softmax gradient for a 2-D batch with an ignored label.
 // Rows whose label (truncated to int) equals ignore_label (truncated to
 // int) are set to zero. Otherwise the labelled column receives
 // src - 1 + alpha and every other column src - alpha / (columns - 1).
 // \param dst          output; its extents drive both loops
 // \param src          input, indexed with dst's extents
 // \param label        one entry per row
 // \param ignore_label label value marking rows that get a zero gradient
 // \param alpha        smoothing amount
 template<typename DType>
 inline void SmoothSoftmaxGrad(Tensor<cpu, 2, DType> dst,
                               const Tensor<cpu, 2, DType> &src,
                               const Tensor<cpu, 1, DType> &label,
                               const DType &ignore_label,
                               const float alpha) {
   // share of alpha handed to each column other than the labelled one
   const float smooth_grad = (alpha / (dst.size(1) - 1));
   const index_t num_rows = dst.size(0);
   const int num_cols = static_cast<int>(dst.size(1));
 #pragma omp parallel for
   for (openmp_index_t row = 0; row < num_rows; ++row) {
     const int k = static_cast<int>(label[row]);
     // the ignore decision is per row, so take it once before touching columns
     if (static_cast<int>(ignore_label) == k) {
       for (int col = 0; col < num_cols; ++col) {
         dst[row][col] = 0.0f;
       }
       continue;
     }
     for (int col = 0; col < num_cols; ++col) {
       if (col != k) {
         dst[row][col] = src[row][col] - smooth_grad;
       } else {
         dst[row][k] = src[row][k] - 1.0f + alpha;
       }
     }
   }
 }

 // Softmax gradient for a 3-D tensor where the middle dimension is the one
 // selected by the label: for every (i0, i2) pair, dst[i0][.][i2] is a copy
 // of src except that 1 is subtracted at the index label[i0][i2]
 // (truncated to int).
 // \param dst   output; its extents drive all loops
 // \param src   input, indexed with dst's extents
 // \param label indexed as label[i0][i2]
 template<typename DType>
 inline void SoftmaxGrad(Tensor<cpu, 3, DType> dst,
                         const Tensor<cpu, 3, DType> &src,
                         const Tensor<cpu, 2, DType> &label) {
   const index_t size0 = dst.size(0);
   const int size1 = static_cast<int>(dst.size(1));
   const index_t size2 = dst.size(2);
   // the parallel loop runs over the last dimension
 #pragma omp parallel for
   for (openmp_index_t i2 = 0; i2 < size2; ++i2) {
     for (index_t i0 = 0; i0 < size0; ++i0) {
       const int k = static_cast<int>(label[i0][i2]);
       for (int i1 = 0; i1 < size1; ++i1) {
         if (i1 != k) {
           dst[i0][i1][i2] = src[i0][i1][i2];
         } else {
           dst[i0][k][i2] = src[i0][k][i2] - 1.0f;
         }
       }
     }
   }
 }

 // Label-smoothed softmax gradient for a 3-D tensor where the middle
 // dimension is the one selected by the label. At index label[i0][i2]
 // (truncated to int) the result is src - 1 + alpha; at every other middle
 // index it is src - alpha / (size(1) - 1).
 // \param dst   output; its extents drive all loops
 // \param src   input, indexed with dst's extents
 // \param label indexed as label[i0][i2]
 // \param alpha smoothing amount
 template<typename DType>
 inline void SmoothSoftmaxGrad(Tensor<cpu, 3, DType> dst,
                         const Tensor<cpu, 3, DType> &src,
                         const Tensor<cpu, 2, DType> &label,
                         const float alpha) {
   // share of alpha handed to each middle index other than the labelled one
   const float smooth_grad = (alpha / (dst.size(1) - 1));
   const index_t size0 = dst.size(0);
   const int size1 = static_cast<int>(dst.size(1));
   const index_t size2 = dst.size(2);
 #pragma omp parallel for
   for (openmp_index_t i2 = 0; i2 < size2; ++i2) {
     for (index_t i0 = 0; i0 < size0; ++i0) {
       const int k = static_cast<int>(label[i0][i2]);
       for (int i1 = 0; i1 < size1; ++i1) {
         if (i1 != k) {
           dst[i0][i1][i2] = src[i0][i1][i2] - smooth_grad;
         } else {
           dst[i0][k][i2] = src[i0][k][i2] - 1.0f + alpha;
         }
       }
     }
   }
 }

 // Softmax gradient for a 3-D tensor with an ignored label value.
 // For every (i0, i2) pair whose label (truncated to int) equals
 // ignore_label (truncated to int) the whole middle dimension is zeroed;
 // otherwise it is a copy of src with 1 subtracted at the labelled index.
 // \param dst          output; its extents drive all loops
 // \param src          input, indexed with dst's extents
 // \param label        indexed as label[i0][i2]
 // \param ignore_label label value marking positions that get a zero gradient
 template<typename DType>
 inline void SoftmaxGrad(Tensor<cpu, 3, DType> dst,
                         const Tensor<cpu, 3, DType> &src,
                         const Tensor<cpu, 2, DType> &label,
                         const DType &ignore_label) {
   const index_t size0 = dst.size(0);
   const int size1 = static_cast<int>(dst.size(1));
   const index_t size2 = dst.size(2);
 #pragma omp parallel for
   for (openmp_index_t i2 = 0; i2 < size2; ++i2) {
     for (index_t i0 = 0; i0 < size0; ++i0) {
       const int k = static_cast<int>(label[i0][i2]);
       if (k == static_cast<int>(ignore_label)) {
         // ignored position: no gradient at any middle index
         for (int i1 = 0; i1 < size1; ++i1) {
           dst[i0][i1][i2] = DType(0.0f);
         }
         continue;
       }
       for (int i1 = 0; i1 < size1; ++i1) {
         if (i1 != k) {
           dst[i0][i1][i2] = src[i0][i1][i2];
         } else {
           dst[i0][k][i2] = src[i0][k][i2] - 1.0f;
         }
       }
     }
   }
 }

 // Label-smoothed softmax gradient for a 3-D tensor with an ignored label.
 // For every (i0, i2) pair whose label (truncated to int) equals
 // ignore_label (truncated to int) the whole middle dimension is zeroed.
 // Otherwise the labelled index receives src - 1 + alpha and every other
 // middle index src - alpha / (size(1) - 1).
 // \param dst          output; its extents drive all loops
 // \param src          input, indexed with dst's extents
 // \param label        indexed as label[i0][i2]
 // \param ignore_label label value marking positions that get a zero gradient
 // \param alpha        smoothing amount
 template<typename DType>
 inline void SmoothSoftmaxGrad(Tensor<cpu, 3, DType> dst,
                         const Tensor<cpu, 3, DType> &src,
                         const Tensor<cpu, 2, DType> &label,
                         const DType &ignore_label,
                         const float alpha) {
   // share of alpha handed to each middle index other than the labelled one
   const float smooth_grad = (alpha / (dst.size(1) - 1));
   const index_t size0 = dst.size(0);
   const int size1 = static_cast<int>(dst.size(1));
   const index_t size2 = dst.size(2);
 #pragma omp parallel for
   for (openmp_index_t i2 = 0; i2 < size2; ++i2) {
     for (index_t i0 = 0; i0 < size0; ++i0) {
       const int k = static_cast<int>(label[i0][i2]);
       if (k == static_cast<int>(ignore_label)) {
         // ignored position: no gradient at any middle index
         for (int i1 = 0; i1 < size1; ++i1) {
           dst[i0][i1][i2] = DType(0.0f);
         }
         continue;
       }
       for (int i1 = 0; i1 < size1; ++i1) {
         if (i1 != k) {
           dst[i0][i1][i2] = src[i0][i1][i2] - smooth_grad;
         } else {
           dst[i0][k][i2] = src[i0][k][i2] - 1.0f + alpha;
         }
       }
     }
   }
 }

 // Row-wise softmax of a 2-D tensor: each row of energy is normalized into
 // the matching row of dst by the 1-D Softmax overload.
 // \param dst    output; must have the same shape as energy
 // \param energy input
 template<typename DType>
 inline void Softmax(Tensor<cpu, 2, DType> dst,
                     const Tensor<cpu, 2, DType> &energy) {
   CHECK_EQ(dst.shape_, energy.shape_) << "Softmax: shape mismatch";
   const index_t num_rows = dst.size(0);
 #pragma omp parallel for
   for (openmp_index_t row = 0; row < num_rows; ++row) {
     Tensor<cpu, 1, DType> out_row = dst[row];
     const Tensor<cpu, 1, DType> in_row = energy[row];
     Softmax(out_row, in_row);
   }
 }

 // Softmax of a 3-D tensor along its middle dimension: for every (y, n)
 // pair, dst[y][.][n] = exp(energy[y][.][n] - max) / sum exp(energy[y][.][n] - max).
 // The maximum is subtracted before exponentiating, so the largest term is
 // exp(0).
 // \param dst    output; must have the same shape as energy
 // \param energy input
 // A middle dimension of extent 0 returns right after the shape check
 // instead of reading energy[y][0][n].
 template<typename DType>
 inline void Softmax(Tensor<cpu, 3, DType> dst,
                     const Tensor<cpu, 3, DType> &energy) {
   CHECK_EQ(dst.shape_, energy.shape_) << "Softmax: shape mismatch";
   const index_t len = dst.size(1);
   // nothing to normalize; also avoids reading index 0 of an empty dimension
   if (len == 0) return;
 #pragma omp parallel for
   for (openmp_index_t y = 0; y < dst.size(0); ++y) {
     for (index_t n = 0; n < dst.size(2); ++n) {
       DType mmax = energy[y][0][n];
       for (index_t x = 1; x < len; ++x) {
         if (mmax < energy[y][x][n]) mmax = energy[y][x][n];
       }
       DType sum = DType(0.0f);
       for (index_t x = 0; x < len; ++x) {
         dst[y][x][n] = std::exp(energy[y][x][n] - mmax);
         sum += dst[y][x][n];
       }
       for (index_t x = 0; x < len; ++x) {
         dst[y][x][n] /= sum;
       }
     }
   }
 }

 // Accumulates rows of src into rows of dst chosen by index:
 // dst[row(index[n])] += src[n] for every n, processed in order.
 // \tparam clip when true an index is clamped into [0, rows - 1]; when false
 //              it is wrapped with the remainder of division by the row
 //              count, with a negative remainder shifted up by the row count
 // \param dst   matrix that is accumulated into
 // \param index one destination row index per row of src
 // \param src   rows to add; dst's column count is the number of elements
 //              added per row
 template<bool clip, typename IndexType, typename DType>
 inline void AddTakeGrad(Tensor<cpu, 2, DType> dst,
                         const Tensor<cpu, 1, IndexType>& index,
                         const Tensor<cpu, 2, DType> &src) {
   const index_t num_rows = dst.shape_[0];
   const index_t num_cols = dst.shape_[1];
   const index_t count = index.size(0);
   for (index_t n = 0; n < count; ++n) {
     index_t row = index[n];
     if (!clip) {
       row %= num_rows;
       if (row < 0) row += num_rows;
     } else if (row <= 0) {
       row = 0;
     } else if (row >= num_rows) {
       row = num_rows - 1;
     }
     Tensor<cpu, 1, DType> out_row = dst[row];
     const Tensor<cpu, 1, DType> in_row = src[n];
     for (index_t c = 0; c < num_cols; ++c) {
       out_row[c] += in_row[c];
     }
   }
 }

 // Accumulates rows of src into rows of dst using two parallel index
 // vectors: dst[sorted[n]] += src[index[n]] for every n, processed in order.
 // No clamping or wrapping is applied to either index.
 // \param dst    matrix that is accumulated into
 // \param sorted destination row for step n; its length drives the loop
 // \param index  source row for step n
 // \param src    matrix whose rows are added
 template<typename IndexType, typename DType>
 inline void AddTakeGradLargeBatch(Tensor<cpu, 2, DType> dst,
                                   const Tensor<cpu, 1, IndexType>& sorted,
                                   const Tensor<cpu, 1, IndexType>& index,
                                   const Tensor<cpu, 2, DType> &src) {
   const index_t count = sorted.size(0);
   for (index_t n = 0; n < count; ++n) {
     Tensor<cpu, 1, DType> out_row = dst[sorted[n]];
     out_row += src[index[n]];
   }
 }

 // Overwrites selected rows of dst with rows of src:
 // dst[index[n]] = src[n] for every n, processed in order, so a repeated
 // index keeps the last row written.
 // \param dst   matrix whose rows are overwritten
 // \param index destination row for each row of src; no range check is made
 // \param src   rows to copy; its column count is the number of elements
 //              copied per row
 template<typename IndexType, typename DType>
 inline void IndexFill(Tensor<cpu, 2, DType> dst,
                       const Tensor<cpu, 1, IndexType>& index,
                       const Tensor<cpu, 2, DType> &src) {
   const index_t count = index.size(0);
   const index_t width = src.size(1);
   for (index_t n = 0; n < count; ++n) {
     Tensor<cpu, 1, DType> out_row = dst[index[n]];
     const Tensor<cpu, 1, DType> in_row = src[n];
     for (index_t c = 0; c < width; ++c) {
       out_row[c] = in_row[c];
     }
   }
 }

 // Stable key/value sort on CPU: reorders `keys` and `values` in place so
 // that the keys are ordered while each value stays with its key. Pairs with
 // equal keys keep their original relative order (std::stable_sort).
 // \param keys      1-D tensor of sort keys; overwritten with the sorted keys
 // \param values    1-D tensor of the same length; permuted alongside keys
 // \param is_ascend true orders keys with operator<, false with operator>
 template<typename KDType, typename VDType>
 inline void SortByKey(Tensor<cpu, 1, KDType> keys, Tensor<cpu, 1, VDType> values,
                       bool is_ascend) {
   CHECK_EQ(keys.CheckContiguous(), true);
   CHECK_EQ(values.CheckContiguous(), true);
   CHECK_EQ(keys.size(0), values.size(0))
     << "The sizes of key/value are not equal! keys_size: " << keys.size(0)
     << ", values_size: " << values.size(0);
   const index_t num_items = keys.size(0);
   // Snapshot both tensors and sort a permutation of positions rather than
   // the data itself; the snapshots are then scattered back in sorted order.
   std::vector<size_t> idx(num_items);
   std::vector<KDType> keys_vec(num_items);
   std::vector<VDType> values_vec(num_items);
   for (index_t i = 0; i < num_items; ++i) {
     idx[i] = static_cast<size_t>(i);
     keys_vec[i] = keys[i];
     values_vec[i] = values[i];
   }
   if (is_ascend) {
     std::stable_sort(idx.begin(), idx.end(),
                      [&keys_vec](size_t i1, size_t i2)
                        {return keys_vec[i1] < keys_vec[i2]; });
   } else {
     std::stable_sort(idx.begin(), idx.end(),
                      [&keys_vec](size_t i1, size_t i2)
                        {return keys_vec[i1] > keys_vec[i2]; });
   }
   for (index_t i = 0; i < num_items; ++i) {
     keys[i] = keys_vec[idx[i]];
     values[i] = values_vec[idx[i]];
   }
 }

 // Sorts `values` within each segment using two ascending SortByKey passes.
 // The first pass orders the pairs by value; the second orders them by
 // segment. NOTE(review): this relies on the second pass being stable, so
 // that the value order from the first pass survives inside each segment.
 // \param values   values to sort; reordered in place
 // \param segments segment id of each value; reordered in place with values
 template<typename Device, typename VDType, typename SDType>
 inline void VectorizedSort(Tensor<Device, 1, VDType> values, Tensor<Device, 1, SDType> segments) {
   // We can sort each segments using two stable sorts
   SortByKey(values, segments, true);
   SortByKey(segments, values, true);
 }

 // blas related
 // Dot product of two vectors through the BLAS engine; the result is written
 // to the single element of dst.
 // \param dst one-element tensor receiving the result
 // \param lhs left operand; its stream_ is the one handed to the engine
 // \param rhs right operand; must have the same length as lhs
 template<typename Device, typename DType>
 inline void VectorDot(Tensor<Device, 1, DType> dst,
                       const Tensor<Device, 1, DType> &lhs,
                       const Tensor<Device, 1, DType> &rhs) {
   CHECK_EQ(lhs.size(0), rhs.size(0))
       << "VectorDot: Shape mismatch";
   CHECK_EQ(dst.size(0), 1U)
       << "VectorDot: expect dst to be scalar";
   typedef expr::BLASEngine<Device, DType> Engine;
   Engine::SetStream(lhs.stream_);
   // both operands are read with an increment of 1 between elements
   Engine::dot(lhs.stream_, lhs.size(0), lhs.dptr_, 1, rhs.dptr_, 1, dst.dptr_);
 }

 // Batched matrix multiply through the BLAS engine:
 // dst = alpha * op(lhs) op(rhs) + beta * dst, applied per batch entry,
 // where op transposes the two trailing dimensions when the matching
 // template flag is set.
 // \tparam transpose_left  whether lhs is used transposed
 // \tparam transpose_right whether rhs is used transposed
 // \param dst       output batch; dst.shape_[0] is the batch size and its
 //                  stream_ is the one handed to the engine
 // \param lhs       left operand batch
 // \param rhs       right operand batch
 // \param alpha     scale applied to the product
 // \param beta      scale applied to the existing contents of dst
 // \param workspace pointer workspace with at least 3 * batch_size entries
 // All four tensors must be contiguous. The batch sizes and the matrix
 // shapes (after applying the transpose flags) must be compatible.
 template<bool transpose_left, bool transpose_right, typename Device, typename DType>
 inline void BatchGEMM(Tensor<Device, 3, DType> dst,
                       const Tensor<Device, 3, DType> &lhs,
                       const Tensor<Device, 3, DType> &rhs,
                       DType alpha,
                       DType beta,
                       Tensor<Device, 1, DType*> workspace) {
   index_t batch_size = dst.shape_[0];
   expr::BLASEngine<Device, DType>::SetStream(dst.stream_);
   // shapes of the operands as they take part in the product
   Shape<3> sleft = transpose_left ? Shape3(lhs.shape_[0], lhs.shape_[2], lhs.shape_[1])
     : lhs.shape_;
   Shape<3> sright = transpose_right ? Shape3(rhs.shape_[0], rhs.shape_[2], rhs.shape_[1])
     : rhs.shape_;
   CHECK_EQ(dst.CheckContiguous(), true);
   CHECK_EQ(lhs.CheckContiguous(), true);
   CHECK_EQ(rhs.CheckContiguous(), true);
   CHECK(sleft[0] == batch_size && sright[0] == batch_size)
     << "BatchGEMM: batchsize must be equal.\n"
     << "dst: " << dst.shape_ << "\n"
     << "lhs: " << sleft << "\n"
     << "rhs: " << sright << "\n";
   CHECK(dst.size(1) == sleft[1] && dst.size(2) == sright[2] && sleft[2] == sright[1])
     << "BatchGEMM: matrix shape mismatch\n"
     << "dst: " << dst.shape_ << "\n"
     << "lhs: " << sleft << "\n"
     << "rhs: " << sright << "\n";
   CHECK(workspace.size(0) >= 3 * batch_size)
     << "Workspace size must be at least " << 3 * batch_size;
   CHECK_EQ(workspace.CheckContiguous(), true);
   // use column major argument to compatible with most BLAS:
   // rhs is passed as the first matrix and lhs as the second, with the
   // transpose flags swapped to match.
   const index_t m = transpose_right ? rhs.size(1) : rhs.size(2);
   const index_t n = transpose_left ? lhs.size(2) : lhs.size(1);
   const index_t k = transpose_right ? rhs.size(2) : rhs.size(1);
   expr::BLASEngine<Device, DType>::batched_gemm
     (dst.stream_,
     transpose_right, transpose_left,
     m, n, k,
     alpha,
     rhs.dptr_, rhs.stride_,
     lhs.dptr_, lhs.stride_,
     beta,
     dst.dptr_, dst.stride_, batch_size,
     workspace.dptr_);
 }
 }  // namespace mshadow
 #endif  // MSHADOW_TENSOR_CPU_INL_H_
mshadow::VectorDot
void VectorDot(Tensor< Device, 1, DType > dst, const Tensor< Device, 1, DType > &lhs, const Tensor< Device, 1, DType > &rhs)
CPU/GPU: 1 dimension vector dot.
Definition: tensor_cpu-inl.h:597

mshadow::expr::BLASEngine::batched_gemm
static void batched_gemm(Stream< Device > *stream, bool transa, bool transb, int m, int n, int k, DType alpha, const DType *A, int lda, const DType *B, int ldb, DType beta, DType *C, int ldc, int batch_count, DType **workspace)
Definition: dot_engine-inl.h:91

mshadow::FreeSpace
void FreeSpace(Tensor< cpu, dim, DType > *obj)
CPU/GPU: free the space of tensor, will set obj.dptr to NULL.
Definition: tensor_cpu-inl.h:140

mshadow::ShutdownTensorEngine< cpu >
void ShutdownTensorEngine< cpu >(void)
Definition: tensor_cpu-inl.h:41

mshadow::Tensor< Device, 1, DType >::stream_
Stream< Device > * stream_
Definition: tensor.h:574

mshadow::IndexFill
void IndexFill(Tensor< cpu, 2, DType > dst, const Tensor< cpu, 1, IndexType > &index, const Tensor< cpu, 2, DType > &src)
CPU/GPU: Fill specific rows of the destination matrix with the values of the source matrix...
Definition: tensor_cpu-inl.h:547

mshadow::SoftmaxGrad
void SoftmaxGrad(Tensor< cpu, 2, DType > dst, const Tensor< cpu, 2, DType > &src, const Tensor< cpu, 1, DType > &label)
CPU/GPU: softmax gradient.
Definition: tensor_cpu-inl.h:306

mshadow::SmoothSoftmaxGrad
void SmoothSoftmaxGrad(Tensor< cpu, 2, DType > dst, const Tensor< cpu, 2, DType > &src, const Tensor< cpu, 1, DType > &label, const float alpha)
Definition: tensor_cpu-inl.h:323

mshadow::expr::pad
PaddingExp< SrcExp, DType, ExpInfo< SrcExp >::kDim > pad(const Exp< SrcExp, DType, etype > &src, index_t pad)
padding expression, pad an image with zeros on boundaries, padding affects shape[0] and shape[1]
Definition: pad.h:71

mshadow::BatchGEMM
void BatchGEMM(Tensor< Device, 3, DType > dst, const Tensor< Device, 3, DType > &lhs, const Tensor< Device, 3, DType > &rhs, DType alpha, DType beta, Tensor< Device, 1, DType *> workspace)
CPU/GPU: dst = alpha * op(lhs) op(rhs) + beta * dst.
Definition: tensor_cpu-inl.h:610

mshadow::Tensor::dptr_
DType * dptr_
pointer to the data
Definition: tensor.h:434

mshadow::FreeHost_
void FreeHost_(void *dptr)

mshadow::TRValue
Tensor RValue, this is the super type of all kinds of possible tensors.
Definition: tensor.h:409

mshadow::expr::Exp< Container, DType, type::kRValue >::self
const Container & self(void) const
Definition: expression.h:82

mshadow::expr::Plan
Definition: expr_engine-inl.h:58

mshadow::SetDevice< cpu >
void SetDevice< cpu >(int devid)
Definition: tensor_cpu-inl.h:45

mshadow::expr::TypeCheckPass
used to help static type check
Definition: expr_engine-inl.h:330

mshadow::packet::AlignedFree
void AlignedFree(void *ptr)
free aligned space
Definition: packet-inl.h:106

mshadow::Copy
void Copy(Tensor< cpu, dim, DType > dst, const Tensor< cpu, dim, DType > &src, Stream< cpu > *stream=NULL)
copy data from one tensor to another, with same shape
Definition: tensor_cpu-inl.h:145

mshadow::Shape< dim >

mshadow::Tensor::MSize
MSHADOW_XINLINE index_t MSize(void) const
Definition: tensor.h:497

mshadow::MapExp
void MapExp(TRValue< R, cpu, dim, DType > *dst, const expr::Exp< E, DType, etype > &exp)
CPU/GPU: map an expression to a tensor; this function calls MapPlan.
Definition: tensor_cpu-inl.h:207

mshadow::expr::Exp< Container, DType, type::kRValue >::ptrself
Container * ptrself(void)
Definition: expression.h:86

mshadow::Tensor::shape_
Shape< dimension > shape_
shape of the tensor
Definition: tensor.h:436

mshadow::expr::PacketAlignCheck
Definition: packet-inl.h:379

mshadow::Shape4
MSHADOW_XINLINE Shape< 4 > Shape4(index_t s0, index_t s1, index_t s2, index_t s3)
construct a four dimension shape, stride will equal s0
Definition: tensor.h:240

mshadow::SortByKey
void SortByKey(Tensor< cpu, 1, KDType > keys, Tensor< cpu, 1, VDType > values, bool is_ascend=true)
CPU/GPU: Sort key-value pairs stored in separate places. (Stable sort is performed!) ...
Definition: tensor_cpu-inl.h:558

mshadow::Softmax
void Softmax(Tensor< cpu, 2, DType > dst, const Tensor< cpu, 2, DType > &energy)
CPU/GPU: normalize softmax: dst[i][j] = exp(energy[i][j]) /(sum_j exp(energy[i][j])) ...
Definition: tensor_cpu-inl.h:483

mshadow::VectorizedSort
void VectorizedSort(Tensor< Device, 1, VDType > values, Tensor< Device, 1, SDType > segments)
CPU/GPU: Sort the keys within each segment. (Stable sort is performed!) Segments is defined as an asc...
Definition: tensor_cpu-inl.h:589

mshadow::packet::AlignedMallocPitch
void * AlignedMallocPitch(size_t *out_pitch, size_t lspace, size_t num_line)
analogous to cudaMallocPitch, allocate an aligned space with num_line * lspace cells
Definition: packet-inl.h:77

MSHADOW_CUDA_CALL
#define MSHADOW_CUDA_CALL(func)
Protected cuda call in mshadow.
Definition: base.h:278

mshadow::MapReduceKeepLowest
void MapReduceKeepLowest(TRValue< R, cpu, 1, DType > *dst, const expr::Exp< E, DType, etype > &exp, DType scale=1)
CPU/GPU: map an expression, do reduction to 1D Tensor in lowest dimension (dimension 0) ...
Definition: tensor_cpu-inl.h:223

mshadow::expr::ShapeCheck::Check
static Shape< dim > Check(const E &t)

tensor.h
header file of tensor data structure and functions This lib requires explicit memory allocation and d...

mshadow::cpu
device name CPU
Definition: tensor.h:39

mshadow::expr::Plan::Eval
MSHADOW_XINLINE DType Eval(index_t y, index_t x) const
evaluate the expression at index [y][x] to be implemented by SubType, for RValue, the return type wil...

mshadow::AllocHost_
void * AllocHost_(size_t size)

mshadow::Tensor< Device, 1, DType >::size
MSHADOW_XINLINE index_t size(index_t i) const
Definition: tensor.h:606

mshadow::FreeHost_< cpu >
void FreeHost_< cpu >(void *dptr)
Definition: tensor_cpu-inl.h:95

mshadow::index_t
int32_t index_t
type that will be used for index
Definition: base.h:343

mshadow::AllocSpace
void AllocSpace(Tensor< cpu, dim, DType > *obj, bool pad=MSHADOW_ALLOC_PAD)
CPU/GPU: allocate space for CTensor, according to the shape in the obj; this function is responsible t...
Definition: tensor_cpu-inl.h:116

mshadow::Tensor< Device, 1, DType >::dptr_
DType * dptr_
Definition: tensor.h:571

mshadow::Tensor::FlatTo2D
MSHADOW_XINLINE Tensor< Device, 2, DType > FlatTo2D(void) const
flatten the tensor to 2 dimension, collapse the higher dimensions together
Definition: tensor.h:519

packet-inl.h
Generic packet vectorization code.

mshadow::InitTensorEngine< cpu >
void InitTensorEngine< cpu >(int dev_id)
Definition: tensor_cpu-inl.h:38

mshadow::Tensor::size
MSHADOW_XINLINE index_t size(int idx) const
return size of i-th dimension, start counting from highest dimension
Definition: tensor.h:505

mshadow::AddTakeGradLargeBatch
void AddTakeGradLargeBatch(Tensor< cpu, 2, DType > dst, const Tensor< cpu, 1, IndexType > &sorted, const Tensor< cpu, 1, IndexType > &index, const Tensor< cpu, 2, DType > &src)
CPU/GPU: Gradient accumulate of embedding matrix. dst[sorted[i]] += src[index[i]] Called when the bat...
Definition: tensor_cpu-inl.h:537

mshadow::Tensor::CheckContiguous
MSHADOW_XINLINE bool CheckContiguous(void) const
Definition: tensor.h:491

mshadow::expr::BLASEngine::dot
static void dot(Stream< Device > *stream, int n, const DType *X, int incX, const DType *Y, int incY, DType *ret)
Definition: dot_engine-inl.h:125

mshadow::AllocHost
void AllocHost(Tensor< cpu, dim, DType > *obj)
Definition: tensor_cpu-inl.h:100

mshadow::expr::ShapeCheck
runtime shape checking template get the shape of an expression, report error if shape mismatch ...
Definition: expr_engine-inl.h:364

mshadow::NewStream< cpu >
Stream< cpu > * NewStream< cpu >(bool create_blas_handle, bool create_dnn_handle, int dev_id)
Definition: tensor_cpu-inl.h:48

mshadow::MapPlan
void MapPlan(TRValue< R, cpu, dim, DType > *dst, const expr::Plan< E, DType > &plan)
Definition: tensor_cpu-inl.h:163

mshadow::MapExpCPUEngine
Definition: tensor_cpu-inl.h:182

mshadow::expr::ScalarExp
scalar expression
Definition: expression.h:95

mshadow::MapReduceKeepHighDim
void MapReduceKeepHighDim(TRValue< R, cpu, 1, DType > *dst, const expr::Exp< E, DType, etype > &exp, DType scale=1)
CPU/GPU: map an expression, do reduction to 1D Tensor in third dimension (dimension 2) ...
Definition: tensor_cpu-inl.h:250

mshadow::AllocHost_< cpu >
void * AllocHost_< cpu >(size_t size)
Definition: tensor_cpu-inl.h:90

mshadow::NewTensor
Tensor< Device, dim, DType > NewTensor(const Shape< dim > &shape, DType initv, bool pad=MSHADOW_ALLOC_PAD, Stream< Device > *stream=NULL)
CPU/GPU: short cut to allocate and initialize a Tensor.
Definition: tensor_cpu-inl.h:132

mshadow::expr::Exp
defines how expression exp can be evaluated and stored into dst
Definition: expression.h:79

mshadow::expr::MakePlan
Plan< BinaryMapExp< OP, TA, TB, DType, etype >, DType > MakePlan(const BinaryMapExp< OP, TA, TB, DType, etype > &e)
Definition: expr_engine-inl.h:239

mshadow::AddTakeGrad
void AddTakeGrad(Tensor< cpu, 2, DType > dst, const Tensor< cpu, 1, IndexType > &index, const Tensor< cpu, 2, DType > &src)
CPU/GPU: Gradient accumulate of embedding matrix. dst[index[i]] += src[i] Called when the featuredim ...
Definition: tensor_cpu-inl.h:516

mshadow::Tensor< Device, 1, DType >
Definition: tensor.h:568

mshadow::Shape3
MSHADOW_XINLINE Shape< 3 > Shape3(index_t s0, index_t s1, index_t s2)
construct a three dimension shape, stride will equal s0
Definition: tensor.h:227

mshadow
overloaded + operator between half_t and bf16_t
Definition: base.h:334

mshadow::FreeHost
void FreeHost(Tensor< cpu, dim, DType > *obj)
Definition: tensor_cpu-inl.h:107

mshadow::Tensor::stride_
index_t stride_
storing the stride information in x dimension this is used to deal with pitch allocation in gpu or ss...
Definition: tensor.h:441

MSHADOW_DEFAULT_PACKET
#define MSHADOW_DEFAULT_PACKET
Definition: packet-inl.h:47

mshadow::Tensor
general tensor
Definition: tensor.h:420

base.h

mshadow::expr::BLASEngine::SetStream
static void SetStream(Stream< Device > *stream)
Definition: dot_engine-inl.h:82

mshadow::DeleteStream< cpu >
void DeleteStream< cpu >(Stream< cpu > *stream)
Definition: tensor_cpu-inl.h:54

mshadow::MapExpCPUEngine< true, SV, Tensor< cpu, dim, DType >, dim, DType, E, etype >::Map
static void Map(Tensor< cpu, dim, DType > *dst, const expr::Exp< E, DType, etype > &exp)
Definition: tensor_cpu-inl.h:192

mshadow::openmp_index_t
index_t openmp_index_t
openmp index for linux
Definition: base.h:351

mshadow::Tensor::stream_
Stream< Device > * stream_
stream where the computation lies stream is a device dependency concept where each computation ...
Definition: tensor.h:446

dot_engine-inl.h
definitions of how Matrix Multiplications can be evaluated

mshadow::MapExpCPUEngine::Map
static void Map(TRValue< R, cpu, dim, DType > *dst, const expr::Exp< E, DType, etype > &exp)
Definition: tensor_cpu-inl.h:183

mshadow::Stream
computation stream structure, used for asynchronous computations
Definition: tensor.h:383