mxnet
packet-inl.h
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License. You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied. See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
25 #ifndef MSHADOW_PACKET_INL_H_
26 #define MSHADOW_PACKET_INL_H_
27 
28 #if defined(__APPLE__) || defined(__FreeBSD__)
29 #include <stdlib.h>
30 #else
31 #include <malloc.h>
32 #endif
33 #include "./base.h"
34 #include "./tensor.h"
35 #include "./expression.h"
36 
37 
38 namespace mshadow {
40 namespace packet {
41 
/*!
 * \brief packet architectures a Packet can be specialised for.
 *
 * NOTE(review): the enumerator lines were absent from this listing. The names
 * are taken from the MSHADOW_DEFAULT_PACKET definitions, which refer to
 * ::mshadow::packet::kSSE2 and ::mshadow::packet::kPlain; confirm the
 * declaration order against upstream.
 */
enum PacketArch {
  kPlain,
  kSSE2
};
46 
47 #if MSHADOW_USE_SSE
48 #define MSHADOW_DEFAULT_PACKET ::mshadow::packet::kSSE2
49 #else
50 #define MSHADOW_DEFAULT_PACKET ::mshadow::packet::kPlain
51 #endif
52 
53 // whether packet operator is enabled.
59 template<typename DType, PacketArch Arch = MSHADOW_DEFAULT_PACKET>
60 struct Packet;
61 
62 template<PacketArch Arch>
63 struct AlignBytes {
64  static const index_t value = 4;
65 };
66 
67 } // namespace packet
68 } // namespace mshadow
69 
70 namespace mshadow {
71 namespace packet {
78 inline void* AlignedMallocPitch(size_t *out_pitch,
79  size_t lspace,
80  size_t num_line) {
82  const index_t mask = (1 << bits) - 1;
83 
84  size_t pitch = ((lspace + mask) >> bits) << bits;
85  *out_pitch = pitch;
86 #ifdef _MSC_VER
87  void *res = _aligned_malloc(pitch * num_line, 1 << bits);
88 #else
89  void *res;
90  int ret = posix_memalign(&res, 1 << bits, pitch * num_line);
91  CHECK_EQ(ret, 0) << "AlignedMallocPitch failed";
92 #endif
93  if (res == NULL) {
94  LOG(FATAL) << "AlignedMallocPitch failed";
95  }
96 #if __GNUC__ >= 6
97 #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
98 #endif
99  return res;
100 #pragma GCC diagnostic pop
101 }
102 
/*!
 * \brief free aligned space
 * \param ptr pointer passed to the platform's release routine:
 *        _aligned_free under MSVC, free everywhere else
 */
inline void AlignedFree(void *ptr) {
#ifndef _MSC_VER
  free(ptr);
#else
  _aligned_free(ptr);
#endif
}
114 
116 template<PacketArch Arch>
117 inline bool CheckAlign(size_t pitch) {
118  const index_t bits = AlignBytes<Arch>::value;
119  return !(pitch & ((1 << bits) - 1));
120 }
121 
123 template<PacketArch Arch>
124 inline bool CheckAlign(void *ptr) {
125  return CheckAlign<Arch>(reinterpret_cast<size_t>(ptr));
126 }
127 
133 template<typename DType, PacketArch Arch>
134 inline index_t UpperAlign(index_t size) {
136  const index_t mask = (1 << bits) - 1;
137  const index_t fsize = sizeof(DType);
138  return (((size * fsize + mask) >> bits) << bits) / fsize;
139 }
140 
146 template<typename DType, PacketArch Arch>
147 inline index_t LowerAlign(index_t size) {
149  const index_t fsize = sizeof(DType);
150  return (((size * fsize) >> bits) << bits) / fsize;
151 }
152 
159 template<typename OP, typename DType, PacketArch Arch>
160 struct PacketOp {
161  static const bool kEnabled = false;
162 };
163 // specialization of operators
164 template<typename DType, PacketArch Arch>
165 struct PacketOp<op::plus, DType, Arch> {
166  static const bool kEnabled = true;
168  const Packet<DType, Arch>& rhs) {
169  return lhs + rhs;
170  }
171 };
172 template<typename DType, PacketArch Arch>
173 struct PacketOp<op::minus, DType, Arch> {
174  static const bool kEnabled = true;
176  const Packet<DType, Arch>& rhs) {
177  return lhs - rhs;
178  }
179 };
180 template<typename DType, PacketArch Arch>
181 struct PacketOp<op::mul, DType, Arch> {
182  static const bool kEnabled = true;
184  const Packet<DType, Arch>& rhs) {
185  return lhs * rhs;
186  }
187 };
188 template<typename DType, PacketArch Arch>
189 struct PacketOp<op::div, DType, Arch> {
190  static const bool kEnabled = true;
192  const Packet<DType, Arch>& rhs) {
193  return lhs / rhs;
194  }
195 };
196 
197 template<typename DType, PacketArch Arch>
198 struct PacketOp<op::identity, DType, Arch> {
199  static const bool kEnabled = true;
201  return src;
202  }
203 };
204 
205 
206 // savers to do storage
// Generic saver: Save(dst, src) finishes by storing a packet named `ans` to dst.
// NOTE(review): listing lines 210-211 are absent from this text, so the
// declaration and computation of `ans` are not visible here; restore them
// from upstream before relying on this block.
207 template<typename SV, typename TFloat, PacketArch Arch>
208 struct Saver{
209  MSHADOW_CINLINE static void Save(TFloat *dst, const Packet<TFloat, Arch>& src) {
212  ans.Store(dst);
213  }
214 };
215 template<typename TFloat, PacketArch Arch>
216 struct Saver<sv::saveto, TFloat, Arch> {
217  MSHADOW_CINLINE static void Save(TFloat *dst, const Packet<TFloat, Arch>& src) {
218  src.Store(dst);
219  }
220 };
221 } // namespace packet
222 } // namespace mshadow
223 
224 #include "packet/plain-inl.h"
225 #if MSHADOW_USE_SSE && !defined(__CUDACC__)
226 #include "packet/sse-inl.h"
227 #endif
228 
229 namespace mshadow {
230 namespace expr {
231 
233 
234 // same as plan, but use packet
235 template<typename ExpType, typename DType, PacketArch Arch>
236 class PacketPlan {
237  public:
243  MSHADOW_CINLINE DType Eval(index_t y, index_t x) const;
244 };
245 
246 template <typename Device, int dim, typename DType, PacketArch Arch>
247 class PacketPlan<Tensor<Device, dim, DType>, DType, Arch> {
248  public:
250  :dptr_(t.dptr_), stride_(t.stride_) {}
252  return packet::Packet<DType, Arch>::Load(&dptr_[y * stride_ + x]);
253  }
254  MSHADOW_CINLINE DType Eval(index_t y, index_t x) const {
255  return dptr_[y * stride_ + x];
256  }
257 
258  private:
259  const DType *dptr_;
260  index_t stride_;
261 };
262 
263 template<typename DType, PacketArch Arch>
264 class PacketPlan<ScalarExp<DType>, DType, Arch> {
265  public:
266  explicit PacketPlan(DType scalar) : scalar_(scalar) {}
268  return packet::Packet<DType, Arch>::Fill(scalar_);
269  }
270  MSHADOW_CINLINE DType Eval(index_t y, index_t x) const {
271  return scalar_;
272  }
273 
274  private:
275  DType scalar_;
276 };
277 
278 template<typename OP, typename TA, typename TB, int etype, typename DType, PacketArch Arch>
279 class PacketPlan<BinaryMapExp<OP, TA, TB, DType, etype>, DType, Arch> {
280  public:
282  : lhs_(lhs), rhs_(rhs) {}
284  return packet::PacketOp<OP, DType, Arch>::Map(lhs_.EvalPacket(y, x), rhs_.EvalPacket(y, x));
285  }
286  MSHADOW_CINLINE DType Eval(index_t y, index_t x) const {
287  return OP::Map(lhs_.Eval(y, x), rhs_.Eval(y, x));
288  }
289 
290  private:
293 };
294 
295 template<typename OP, typename TA, int etype, typename DType, PacketArch Arch>
296 class PacketPlan<UnaryMapExp<OP, TA, DType, etype>, DType, Arch> {
297  public:
298  PacketPlan(const PacketPlan<TA, DType, Arch> &src) : src_(src) {}
300  return packet::PacketOp<OP, DType, Arch>::Map(src_.EvalPacket(y, x));
301  }
302  MSHADOW_CINLINE DType Eval(index_t y, index_t x) const {
303  return OP::Map(src_.Eval(y, x));
304  }
305 
306  private:
308 };
309 
310 template<PacketArch Arch, typename OP, typename TA, typename TB, typename DType, int etype>
313 
// MakePacketPlan overloads: each builds the PacketPlan for one expression kind.
// NOTE(review): most signature lines (listing lines 315, 319, 323-325,
// 328-329 and 334) are absent from this text, so the parameter lists below
// cannot be read here; restore them from upstream before compiling.
//
// Scalar overload: the plan is built from e.scalar_.
314 template<PacketArch Arch, typename DType>
316  return PacketPlan<ScalarExp<DType>, DType, Arch>(e.scalar_);
317 }
// Overload that builds the plan from e.self().
318 template<PacketArch Arch, typename T, typename DType>
320  return PacketPlan<T, DType, Arch>(e.self());
321 }
// NOTE(review): the entire body of this overload is missing from the listing.
322 template<PacketArch Arch, typename T, int dim, typename DType>
326 }
// Unary overload: plans e.src_ first, then wraps it in the unary plan.
327 template<PacketArch Arch, typename OP, typename TA, typename DType, int etype>
330  return PacketPlan<UnaryMapExp<OP, TA, DType, etype>, DType, Arch>(MakePacketPlan<Arch>(e.src_));
331 }
// Binary overload: plans e.lhs_ and e.rhs_, then combines them in the binary plan.
332 template<PacketArch Arch, typename OP, typename TA, typename TB, typename DType, int etype>
333 inline PacketPlan<BinaryMapExp<OP, TA, TB, DType, etype>, DType, Arch>
335  return PacketPlan<BinaryMapExp<OP, TA, TB, DType, etype>,
336  DType, Arch>(MakePacketPlan<Arch>(e.lhs_), MakePacketPlan<Arch>(e.rhs_));
337 }
338 
346 template<typename E, PacketArch Arch>
347 struct PacketCheck{
348  static const bool kPass = false;
349 };
350 template<PacketArch Arch>
351 struct PacketCheck<float, Arch> {
352  static const bool kPass = true;
353 };
354 template<PacketArch Arch>
355 struct PacketCheck<double, Arch> {
356  static const bool kPass = true;
357 };
358 template<typename DType, PacketArch Arch>
359 struct PacketCheck<ScalarExp<DType>, Arch> {
360  static const bool kPass = PacketCheck<DType, Arch>::kPass;
361 };
362 template<int dim, typename DType, PacketArch Arch>
363 struct PacketCheck<Tensor<cpu, dim, DType>, Arch> {
364  static const bool kPass = PacketCheck<DType, Arch>::kPass;
365 };
366 template<typename OP, typename TA, typename DType, int etype, PacketArch Arch>
367 struct PacketCheck<UnaryMapExp<OP, TA, DType, etype>, Arch> {
368  static const bool kPass = PacketCheck<TA, Arch>::kPass &&
370 };
371 template<typename OP, typename TA, typename TB, typename DType, int etype, PacketArch Arch>
372 struct PacketCheck< BinaryMapExp<OP, TA, TB, DType, etype>, Arch> {
373  static const bool kPass = packet::PacketOp<OP, DType, Arch>::kEnabled &&
375 };
376 //----------------------------------------------------
377 // Check if data is aligned and allow packet operation
378 //----------------------------------------------------
379 template<int dim, typename E, PacketArch Arch>
381  inline static bool Check(const E &exp) {
382  return false;
383  }
384 };
385 template<int dim, typename DType, PacketArch Arch>
386 struct PacketAlignCheck<dim, ScalarExp<DType>, Arch> {
387  inline static bool Check(const ScalarExp<DType> &exp) {
388  return true;
389  }
390 };
391 template<int dim, typename DType, PacketArch Arch>
392 struct PacketAlignCheck<dim, Tensor<cpu, dim, DType>, Arch> {
393  inline static bool Check(const Tensor<cpu, dim, DType> &t) {
394  return packet::CheckAlign<Arch>(t.dptr_) &&
395  packet::CheckAlign<Arch>(t.stride_ * sizeof(DType));
396  }
397 };
398 template<int dim, typename OP, typename TA, typename DType, int etype, PacketArch Arch>
399 struct PacketAlignCheck<dim, UnaryMapExp<OP, TA, DType, etype>, Arch> {
400  inline static bool Check(const UnaryMapExp<OP, TA, DType, etype> &t) {
402  }
403 };
404 template<int dim, typename OP, typename TA, typename TB,
405  typename DType, int etype, PacketArch Arch>
406 struct PacketAlignCheck<dim, BinaryMapExp<OP, TA, TB, DType, etype>, Arch> {
407  inline static bool Check(const BinaryMapExp<OP, TA, TB, DType, etype> &t) {
410  }
411 };
412 
416 template<typename SV, typename E, int dim, typename DType, PacketArch Arch>
418  const expr::PacketPlan<E, DType, Arch>& plan) {
419  Tensor<cpu, 2, DType> dst = _dst.FlatTo2D();
420  const index_t xlen = packet::LowerAlign<DType, Arch>(dst.size(1));
421  const size_t packetSize = packet::Packet<DType, Arch>::size;
422 #ifndef __CUDACC__
423  #pragma omp parallel for
424 #endif
425  for (openmp_index_t y = 0; y < dst.size(0); ++y) {
426  for (index_t x = 0; x < xlen; x += packetSize) {
427  packet::Saver<SV, DType, Arch>::Save(&dst[y][x], plan.EvalPacket(y, x));
428  }
429  for (index_t x = xlen; x < dst.size(1); ++x) {
430  SV::Save(dst[y][x], plan.Eval(y, x));
431  }
432  }
433 }
434 } // namespace expr
435 } // namespace mshadow
436 #endif // MSHADOW_PACKET_INL_H_
static MSHADOW_CINLINE Packet< DType, Arch > Map(const Packet< DType, Arch > &lhs, const Packet< DType, Arch > &rhs)
Definition: packet-inl.h:191
ScalarExp< DType > scalar(DType s)
create a scalar expression
Definition: expression.h:104
Definition: packet-inl.h:236
DType * dptr_
pointer to the data
Definition: tensor.h:435
const Container & self(void) const
Definition: expression.h:83
void AlignedFree(void *ptr)
free aligned space
Definition: packet-inl.h:107
const TB & rhs_
right operand
Definition: expression.h:340
MSHADOW_CINLINE DType Eval(index_t y, index_t x) const
Definition: packet-inl.h:286
Definition: packet-inl.h:380
MSHADOW_CINLINE DType Eval(index_t y, index_t x) const
Definition: packet-inl.h:302
static bool Check(const E &exp)
Definition: packet-inl.h:381
binary map expression lhs [op] rhs
Definition: expression.h:335
static const index_t value
Definition: packet-inl.h:64
PacketPlan< UnaryMapExp< OP, TA, DType, etype >, DType, Arch > MakePacketPlan(const UnaryMapExp< OP, TA, DType, etype > &e)
Definition: packet-inl.h:329
void * AlignedMallocPitch(size_t *out_pitch, size_t lspace, size_t num_line)
analogous to cudaMallocPitch: allocates an aligned space with num_line * lspace cells
Definition: packet-inl.h:78
Definition: packet-inl.h:43
base class of all rvalues
Definition: expression.h:149
DType scalar_
scalar value
Definition: expression.h:98
PacketArch
Definition: packet-inl.h:42
PacketPlan(const PacketPlan< TA, DType, Arch > &lhs, const PacketPlan< TB, DType, Arch > &rhs)
Definition: packet-inl.h:281
static MSHADOW_CINLINE Packet< DType, Arch > Map(const Packet< DType, Arch > &src)
Definition: packet-inl.h:200
header file of tensor data structure and functions This lib requires explicit memory allocation and d...
device name CPU
Definition: tensor.h:40
MaskExp< IndexExp, SrcExp, DType > mask(const Exp< IndexExp, DType, e1 > &index, const Exp< SrcExp, DType, e2 > &src)
Definition: mask.h:58
definitions of abstract expressions and expressions template
MSHADOW_CINLINE DType Eval(index_t y, index_t x) const
Definition: packet-inl.h:254
static MSHADOW_CINLINE Packet< DType, Arch > Map(const Packet< DType, Arch > &lhs, const Packet< DType, Arch > &rhs)
Definition: packet-inl.h:175
int32_t index_t
type that will be used for index
Definition: base.h:336
PacketPlan(const PacketPlan< TA, DType, Arch > &src)
Definition: packet-inl.h:298
MSHADOW_CINLINE DType Eval(index_t y, index_t x) const
Definition: packet-inl.h:270
static MSHADOW_CINLINE Packet< DType, Arch > Map(const Packet< DType, Arch > &lhs, const Packet< DType, Arch > &rhs)
Definition: packet-inl.h:183
MSHADOW_XINLINE Tensor< Device, 2, DType > FlatTo2D(void) const
flatten the tensor to 2 dimension, collapse the higher dimensions together
Definition: tensor.h:520
static bool Check(const BinaryMapExp< OP, TA, TB, DType, etype > &t)
Definition: packet-inl.h:407
Definition: packet-inl.h:44
MSHADOW_XINLINE index_t size(int idx) const
return size of i-th dimension, start counting from highest dimension
Definition: tensor.h:506
support of sse2 packet optimization of some operations
static MSHADOW_CINLINE void Save(TFloat *dst, const Packet< TFloat, Arch > &src)
Definition: packet-inl.h:217
generic Packet operator
Definition: packet-inl.h:160
MSHADOW_CINLINE packet::Packet< DType > EvalPacket(index_t y, index_t x) const
Definition: packet-inl.h:299
PacketPlan(DType scalar)
Definition: packet-inl.h:266
MSHADOW_CINLINE packet::Packet< DType, Arch > EvalPacket(index_t y, index_t x) const
Definition: packet-inl.h:267
PacketPlan(const Tensor< Device, dim, DType > &t)
Definition: packet-inl.h:249
bool CheckAlign(size_t pitch)
check if a pointer is aligned
Definition: packet-inl.h:117
index_t LowerAlign(index_t size)
get lower bound of aligned index of size
Definition: packet-inl.h:147
Definition: packet-inl.h:63
const TA & src_
source expression
Definition: expression.h:408
#define MSHADOW_CINLINE
cpu force inline
Definition: base.h:226
index_t UpperAlign(index_t size)
get upper bound of aligned index of size
Definition: packet-inl.h:134
unary map expression op(src)
Definition: expression.h:405
scalar expression
Definition: expression.h:96
Definition: packet-inl.h:208
const SubType & real_self(void) const
true self of subtype
Definition: expr_engine-inl.h:50
static MSHADOW_CINLINE void Save(TFloat *dst, const Packet< TFloat, Arch > &src)
Definition: packet-inl.h:209
static MSHADOW_CINLINE Packet< DType, Arch > Map(const Packet< DType, Arch > &lhs, const Packet< DType, Arch > &rhs)
Definition: packet-inl.h:167
MSHADOW_CINLINE packet::Packet< DType, Arch > EvalPacket(index_t y, index_t x) const
Definition: packet-inl.h:283
a general class that allows extension that makes tensors of some shape
Definition: expr_engine-inl.h:44
const TA & lhs_
left operand
Definition: expression.h:338
overloaded + operator between half_t and bf16_t
Definition: base.h:327
MSHADOW_CINLINE packet::Packet< DType, Arch > EvalPacket(index_t y, index_t x) const
evaluate the expression at index [y][x], x will be aligned to Packet<DType, Arch>::Size() ...
index_t stride_
storing the stride information in x dimension this is used to deal with pitch allocation in gpu or ss...
Definition: tensor.h:442
static bool Check(const UnaryMapExp< OP, TA, DType, etype > &t)
Definition: packet-inl.h:400
MSHADOW_CINLINE packet::Packet< DType, Arch > EvalPacket(index_t y, index_t x) const
Definition: packet-inl.h:251
general tensor
Definition: tensor.h:421
static bool Check(const Tensor< cpu, dim, DType > &t)
Definition: packet-inl.h:393
void MapPacketPlan(Tensor< cpu, dim, DType > _dst, const expr::PacketPlan< E, DType, Arch > &plan)
use PacketPlan to compute result
Definition: packet-inl.h:417
support of plain packet that uses the plain datatype.
Generic packet type.
Definition: packet-inl.h:60
MSHADOW_CINLINE DType Eval(index_t y, index_t x) const
static bool Check(const ScalarExp< DType > &exp)
Definition: packet-inl.h:387
index_t openmp_index_t
openmp index for linux
Definition: base.h:344
static check packet enable
Definition: packet-inl.h:347