IPSDK  4_1_0_2
IPSDK : Image Processing Software Development Kit
CastReg.h
Go to the documentation of this file.
1 // CastReg.h:
3 // ------------
4 //
14 
15 #ifndef __IPSDKUTIL_INSTRUCTIONSET_DETAIL_AVX2_CASTREG_H__
16 #define __IPSDKUTIL_INSTRUCTIONSET_DETAIL_AVX2_CASTREG_H__
17 
22 
23 namespace ipsdk {
24 namespace simd {
25 namespace detail {
26 
27 IPSDK_FORCEINLINE
28 __m256 _custom_mm256_cvtepu32_ps(const __m256i& v)
29 {
30  const __m256 two16 = _mm256_set1_ps(65536.0f); // 2**16 constant
31 
32  // Avoid double rounding by doing two exact conversions
33  // of high and low 16-bit segments
34  const __m256i hi = _mm256_srli_epi32(v, 16);
35  const __m256i lo = _mm256_srli_epi32(_mm256_slli_epi32(v, 16), 16);
36  const __m256 fHi = _mm256_mul_ps(_mm256_cvtepi32_ps(hi), two16);
37  //const __m256 fHi = _mm256_cvtepi32_ps(hi);
38  const __m256 fLo = _mm256_cvtepi32_ps(lo);
39 
40  // do single rounding according to current rounding mode
41  return _mm256_add_ps(fHi, fLo);
42 }
43 
44 IPSDK_FORCEINLINE
45 void _custom_mm256_cvtepu32_ps(const __m256i& v, __m256& out)
46 {
47  const __m256 two16 = _mm256_set1_ps(65536.0f); // 2**16 constant
48 
49  // Avoid double rounding by doing two exact conversions
50  // of high and low 16-bit segments
51  const __m256i hi = _mm256_srli_epi32(v, 16);
52  const __m256i lo = _mm256_srli_epi32(_mm256_slli_epi32(v, 16), 16);
53  const __m256 fHi = _mm256_mul_ps(_mm256_cvtepi32_ps(hi), two16);
54  //const __m256 fHi = _mm256_cvtepi32_ps(hi);
55  const __m256 fLo = _mm256_cvtepi32_ps(lo);
56 
57  // do single rounding according to current rounding mode
58  out = _mm256_add_ps(fHi, fLo);
59 }
60 
61 IPSDK_FORCEINLINE
62 __m256i _custom_mm256_cvttps_epu32(const __m256& f)
63 {
64  const __m256 two31 = _mm256_set1_ps(2147483648.0f); // 2**31 constant
65 
66  // check for overflow before conversion to int
67  const __m256 overflow = _mm256_cmp_ps(f, two31, _CMP_GT_OQ);
68 
69  // bias the value to signed space if it is >= 2**31
70  __m256 result_ps = _mm256_sub_ps(f, _mm256_and_ps(overflow, two31));
71 
72  // clip at zero
73  result_ps = _mm256_max_ps(result_ps, _mm256_set1_ps(.0f));
74 
75  // convert to int with saturation
76  __m256i result = _mm256_cvttps_epi32(result_ps); // rounding mode should be round to nearest
77 
78  // unbias
79  return _mm256_add_epi32(result, _mm256_cvttps_epi32(overflow));
80 
81  // patch up the overflow case
82  //result = _mm_or_si128(result, (__m128i)overflow2);
83 }
84 
85 IPSDK_FORCEINLINE
86 void _custom_mm256_cvttps_epu32(const __m256& f, __m256i& out)
87 {
88  const __m256 two31 = _mm256_set1_ps(2147483648.0f); // 2**31 constant
89 
90  // check for overflow before conversion to int
91  const __m256 overflow = _mm256_cmp_ps(f, two31, _CMP_GT_OQ);
92 
93  // bias the value to signed space if it is >= 2**31
94  __m256 result_ps = _mm256_sub_ps(f, _mm256_and_ps(overflow, two31));
95 
96  // clip at zero
97  result_ps = _mm256_max_ps(result_ps, _mm256_set1_ps(.0f));
98 
99  // convert to int with saturation
100  __m256i result = _mm256_cvttps_epi32(result_ps); // rounding mode should be round to nearest
101 
102  // unbias
103  out = _mm256_add_epi32(result, _mm256_cvttps_epi32(overflow));
104 
105  // patch up the overflow case
106  //result = _mm_or_si128(result, (__m128i)overflow2);
107 }
108 
109 IPSDK_FORCEINLINE
110 __m256d _custom_mm256_cvtepu32_pd(const __m256i& v)
111 {
112  const __m256d two16 = _mm256_set1_pd(65536.0f); // 2**16 constant
113 
114  // Avoid double rounding by doing two exact conversions
115  // of high and low 16-bit segments
116  const __m256i hi = _mm256_srli_epi32(v, 16);
117  const __m256i lo = _mm256_srli_epi32(_mm256_slli_epi32(v, 16), 16);
118  const __m256d fHi = _mm256_mul_pd(_mm256_cvtepi32_pd(_mm256_castsi256_si128(hi)), two16);
119  //const __m256 fHi = _mm256_cvtepi32_ps(hi);
120  const __m256d fLo = _mm256_cvtepi32_pd(_mm256_castsi256_si128(lo));
121 
122  // do single rounding according to current rounding mode
123  return _mm256_add_pd(fHi, fLo);
124 }
125 
126 IPSDK_FORCEINLINE
127 void _custom_mm256_cvtepu32_pd(const __m256i& v, __m256d& out)
128 {
129  const __m256d two16 = _mm256_set1_pd(65536.0f); // 2**16 constant
130 
131  // Avoid double rounding by doing two exact conversions
132  // of high and low 16-bit segments
133  const __m256i hi = _mm256_srli_epi32(v, 16);
134  const __m256i lo = _mm256_srli_epi32(_mm256_slli_epi32(v, 16), 16);
135  const __m256d fHi = _mm256_mul_pd(_mm256_cvtepi32_pd(_mm256_castsi256_si128(hi)), two16);
136  //const __m256 fHi = _mm256_cvtepi32_ps(hi);
137  const __m256d fLo = _mm256_cvtepi32_pd(_mm256_castsi256_si128(lo));
138 
139  // do single rounding according to current rounding mode
140  out = _mm256_add_pd(fHi, fLo);
141 }
142 
143 IPSDK_FORCEINLINE
144 __m128i _custom_mm256_cvttpd_epu32(const __m256d& f)
145 {
146  const __m256d two31 = _mm256_set1_pd(2147483648.0f); // 2**31 constant
147 
148  // check for overflow before conversion to int
149  const __m256d overflow = _mm256_cmp_pd(f, two31, _CMP_GT_OQ);
150 
151  // bias the value to signed space if it is >= 2**31
152  __m256d result_pd = _mm256_sub_pd(f, _mm256_and_pd(overflow, two31));
153 
154  // clip at zero
155  result_pd = _mm256_max_pd(result_pd, _mm256_set1_pd(.0f));
156 
157  // convert to int with saturation
158  const __m128i result = _mm256_cvttpd_epi32(result_pd); // rounding mode should be round to nearest
159 
160  // unbias
161  const __m128i overflowi = _mm256_cvttpd_epi32(overflow);
162 
163  return _mm_add_epi32(result, overflowi);
164 
165  // patch up the overflow case
166  //result = _mm_or_si128(result, (__m128i)overflow2);
167 }
168 
169 IPSDK_FORCEINLINE
170 void _custom_mm256_cvttpd_epu32(const __m256d& f, __m128i& out)
171 {
172  const __m256d two31 = _mm256_set1_pd(2147483648.0f); // 2**31 constant
173 
174  // check for overflow before conversion to int
175  const __m256d overflow = _mm256_cmp_pd(f, two31, _CMP_GT_OQ);
176 
177  // bias the value to signed space if it is >= 2**31
178  __m256d result_pd = _mm256_sub_pd(f, _mm256_and_pd(overflow, two31));
179 
180  // clip at zero
181  result_pd = _mm256_max_pd(result_pd, _mm256_set1_pd(.0f));
182 
183  // convert to int with saturation
184  const __m128i result = _mm256_cvttpd_epi32(result_pd); // rounding mode should be round to nearest
185 
186  // unbias
187  const __m128i overflowi = _mm256_cvttpd_epi32(overflow);
188 
189  out = _mm_add_epi32(result, overflowi);
190 
191  // patch up the overflow case
192  //result = _mm_or_si128(result, (__m128i)overflow2);
193 }
194 
197 
200 template <typename TIn, typename TOut>
201 struct CastReg<eInstructionSet::eIS_Avx2, TIn, TOut,
202  typename boost::enable_if_c<
203  boost::is_same<TIn, TOut>::value || (boost::is_integral<TIn>::value
204  && boost::is_integral<TOut>::value && sizeof(TIn)==sizeof(TOut))
205  >::type
206 >
207 {
208  static IPSDK_FORCEINLINE
209  void act(const typename AvxType<TIn>::Type& in,
210  typename AvxType<TOut>::Type& out)
211  {
212  out = in;
213  }
214 };
215 
// Widening cast of unsigned 8-bit lanes to a 2-byte output type
// (enabled when sizeof(TOut) == 2).
// NOTE(review): the `struct ...` header line of this specialization is
// missing from this extracted listing.
218 template <typename TOut>
220  typename boost::enable_if<
221  typename boost::mpl::equal_to<
222  boost::mpl::int_<sizeof(TOut)>,
223  boost::mpl::int_<2>
224  >::type
225  >::type
226 >
227 {
  // in   : register of 8-bit unsigned lanes
  // outl : receives the widened lanes coming from the low 128 bits of `in`
  // outh : receives the widened lanes coming from the high 128 bits of `in`
228  static IPSDK_FORCEINLINE
229  void act(const AvxType<ipUInt8>::Type& in,
230  typename AvxType<TOut>::Type& outl,
231  typename AvxType<TOut>::Type& outh)
232  {
  // The unpack intrinsics work inside each 128-bit half separately; swapping
  // the two middle 64-bit quarters first keeps the outputs in element order.
233  __m256i inPermuted = _mm256_permute4x64_epi64(in, _MM_SHUFFLE(3, 1, 2, 0));
  // Interleaving with zero bytes zero-extends every byte to 16 bits.
234  outl = _mm256_unpacklo_epi8(inPermuted, _mm256_set1_epi8(0));
235  outh = _mm256_unpackhi_epi8(inPermuted, _mm256_set1_epi8(0));
236  }
237 };
238 
// Widening cast of signed 8-bit lanes to a 2-byte output type
// (enabled when sizeof(TOut) == 2).
// NOTE(review): the `struct ...` header line of this specialization is
// missing from this extracted listing.
241 template <typename TOut>
243  typename boost::enable_if_c<sizeof(TOut)==2>::type>
244 {
  // in   : register of 8-bit signed lanes
  // outl : receives the widened lanes coming from the low 128 bits of `in`
  // outh : receives the widened lanes coming from the high 128 bits of `in`
245  static IPSDK_FORCEINLINE
246  void act(const AvxType<ipInt8>::Type& in,
247  typename AvxType<TOut>::Type& outl,
248  typename AvxType<TOut>::Type& outh)
249  {
  // The unpack intrinsics work inside each 128-bit half separately; swapping
  // the two middle 64-bit quarters first keeps the outputs in element order.
250  const int permuteMask = _MM_SHUFFLE(3, 1, 2, 0);
251  __m256i inPermuted = _mm256_permute4x64_epi64(in, permuteMask);
  // Interleaving the register with itself puts each byte into the upper half
  // of a 16-bit lane; the arithmetic right shift by 8 then sign-extends it.
252  outl = _mm256_unpacklo_epi8(inPermuted, inPermuted);
253  outl = _mm256_srai_epi16(outl, 8);
254  outh = _mm256_unpackhi_epi8(inPermuted, inPermuted);
255  outh = _mm256_srai_epi16(outh, 8);
256  }
257 };
258 
// Cast of signed 32-bit integer lanes to single-precision floats.
// NOTE(review): the `struct ...` header line and the declaration of the
// `out` parameter are missing from this extracted listing.
261 template <>
263 {
264  static IPSDK_FORCEINLINE
265  void act(const AvxType<ipInt32>::Type& in,
267  {
  // direct signed 32-bit integer to float conversion
268  out = _mm256_cvtepi32_ps(in);
269  }
270 };
271 
// Cast of unsigned 32-bit integer lanes to single-precision floats.
// NOTE(review): the `struct ...` header line and the declaration of the
// `out` parameter are missing from this extracted listing.
274 template <>
276 {
277  static IPSDK_FORCEINLINE
278  void act(const AvxType<ipUInt32>::Type& in,
280  {
  // delegates to the custom unsigned conversion helper defined in this file
281  out = _custom_mm256_cvtepu32_ps(in);
282  }
283 };
284 
// Cast of single-precision floats to signed 32-bit integers.
// NOTE(review): the `struct ...` header line and the declaration of the
// `out` parameter are missing from this extracted listing.
287 template <>
289 {
290  static IPSDK_FORCEINLINE
291  void act(const AvxType<ipReal32>::Type& in,
293  {
  // truncating conversion (the "tt" variant rounds toward zero)
294  out = _mm256_cvttps_epi32(in);
295  }
296 };
297 
// Cast of single-precision floats to unsigned 32-bit integers.
// NOTE(review): the `struct ...` header line and the declaration of the
// `out` parameter are missing from this extracted listing.
300 template <>
302 {
303  static IPSDK_FORCEINLINE
304  void act(const AvxType<ipReal32>::Type& in,
306  {
  // delegates to the out-parameter form of the custom helper in this file
307  _custom_mm256_cvttps_epu32(in, out);
308  }
309 };
310 
// Cast of signed 32-bit integer lanes to double precision, producing two
// output registers.
// NOTE(review): the `struct ...` header line and the declarations of the
// `outl` / `outh` parameters are missing from this extracted listing.
313 template <>
315 {
316  static IPSDK_FORCEINLINE
317  void act(const AvxType<ipInt32>::Type& in,
320  {
  // low 128 bits of `in` (four integers) -> outl
321  outl = _mm256_cvtepi32_pd(_mm256_castsi256_si128(in));
  // high 128 bits of `in` (four integers) -> outh
322  outh = _mm256_cvtepi32_pd(_mm256_extractf128_si256(in, 1));
323  }
324 };
325 
// Cast of unsigned 32-bit integer lanes to double precision, producing two
// output registers.
// NOTE(review): the `struct ...` header line and the declarations of the
// `outl` / `outh` parameters are missing from this extracted listing.
328 template <>
330 {
331  static IPSDK_FORCEINLINE
332  void act(const AvxType<ipUInt32>::Type& in,
335  {
  // the helper converts only the four lanes held in the low 128 bits
336  outl = _custom_mm256_cvtepu32_pd(in);
  // move the high 128 bits of `in` into the low half of a zeroed register so
  // that the same helper can convert them
337  AvxType<ipUInt32>::Type inShift = _mm256_setzero_si256();
338  inShift = _mm256_inserti128_si256(inShift, _mm256_extractf128_si256(in, 1), 0);
  // NOTE(review): the result of this shift is discarded, so the statement has
  // no effect; it looks like a leftover and is a candidate for removal.
339  _mm256_srli_si256(in, 8);
340  outh = _custom_mm256_cvtepu32_pd(inShift);
341  }
342 };
343 
// Cast of two double-precision registers to one register of signed 32-bit
// integers.
// NOTE(review): the `struct ...` header line and the declaration of the
// `out` parameter are missing from this extracted listing.
346 template <>
348 {
349  static IPSDK_FORCEINLINE
350  void act(const AvxType<ipReal64>::Type& inl,
351  const AvxType<ipReal64>::Type& inh,
353  {
  // truncating conversions, four integers per input register
354  const __m128i outl = _mm256_cvttpd_epi32(inl);
355  const __m128i outh = _mm256_cvttpd_epi32(inh);
  // NOTE(review): the first insert reads the incoming value of `out`; its
  // high half is overwritten by the second insert, so the final result does
  // not depend on it.
356  out = _mm256_inserti128_si256(out, outl, 0);
357  out = _mm256_inserti128_si256(out, outh, 1);
358  }
359 };
360 
// Cast of two double-precision registers to one register of unsigned 32-bit
// integers.
// NOTE(review): the `struct ...` header line and the declaration of the
// `out` parameter are missing from this extracted listing.
363 template <>
365 {
366  static IPSDK_FORCEINLINE
367  void act(const AvxType<ipReal64>::Type& inl,
368  const AvxType<ipReal64>::Type& inh,
370  {
  // custom unsigned conversions, four integers per input register
371  const __m128i outl = _custom_mm256_cvttpd_epu32(inl);
372  const __m128i outh = _custom_mm256_cvttpd_epu32(inh);
  // NOTE(review): the first insert reads the incoming value of `out`; its
  // high half is overwritten by the second insert, so the final result does
  // not depend on it.
373  out = _mm256_inserti128_si256(out, outl, 0);
374  out = _mm256_inserti128_si256(out, outh, 1);
375  }
376 };
377 
// Widening cast of signed 32-bit integer lanes to an 8-byte integral output
// type (enabled when sizeof(TOut) == 8 and TOut is integral).
// NOTE(review): the `struct ...` header line is missing from this extracted
// listing, and so are listing lines 394-395 where `hi` must be defined.
380 template <typename TOut>
382  typename boost::enable_if_c<sizeof(TOut)==8 &&
383  boost::is_integral<TOut>::value>::type>
384 {
385  static IPSDK_FORCEINLINE
386  void act(const AvxType<ipInt32>::Type& in,
387  typename AvxType<TOut>::Type& outl,
388  typename AvxType<TOut>::Type& outh)
389  {
  // The unpack intrinsics work inside each 128-bit half separately; swapping
  // the two middle 64-bit quarters first keeps the outputs in element order.
390  const AvxType<ipInt32>::Type inPermuted =
391  _mm256_permute4x64_epi64(in, _MM_SHUFFLE(3, 1, 2, 0));
392 
393  const AvxType<ipInt32>::Type zero = _mm256_setzero_si256();
  // `hi` supplies the upper 32 bits of each widened lane
  // (presumably the sign mask of inPermuted -- confirm against the full file)
396  outl = _mm256_unpacklo_epi32(inPermuted, hi);
397  outh = _mm256_unpackhi_epi32(inPermuted, hi);
398  }
399 };
400 
// Widening cast of unsigned 32-bit integer lanes to an 8-byte integral output
// type (enabled when sizeof(TOut) == 8 and TOut is integral).
// NOTE(review): the `struct ...` header line of this specialization is
// missing from this extracted listing.
403 template <typename TOut>
405  typename boost::enable_if_c<sizeof(TOut)==8 &&
406  boost::is_integral<TOut>::value>::type>
407 {
408  static IPSDK_FORCEINLINE
409  void act(const AvxType<ipUInt32>::Type& in,
410  typename AvxType<TOut>::Type& outl,
411  typename AvxType<TOut>::Type& outh)
412  {
  // The unpack intrinsics work inside each 128-bit half separately; swapping
  // the two middle 64-bit quarters first keeps the outputs in element order.
413  const AvxType<ipInt32>::Type inPermuted =
414  _mm256_permute4x64_epi64(in, _MM_SHUFFLE(3, 1, 2, 0));
  // Interleaving with zero zero-extends every 32-bit lane to 64 bits.
415  const AvxType<ipUInt32>::Type zero = _mm256_setzero_si256();
416  outl = _mm256_unpacklo_epi32(inPermuted, zero);
417  outh = _mm256_unpackhi_epi32(inPermuted, zero);
418  }
419 };
420 
// Widening cast of signed 16-bit integer lanes to a 4-byte integral output
// type (enabled when sizeof(TOut) == 4 and TOut is integral).
// NOTE(review): the `struct ...` header line of this specialization is
// missing from this extracted listing.
423 template <typename TOut>
425  typename boost::enable_if_c<sizeof(TOut)==4
426  && boost::is_integral<TOut>::value>::type>
427 {
428  static IPSDK_FORCEINLINE
429  void act(const AvxType<ipInt16>::Type& in,
430  typename AvxType<TOut>::Type& outl,
431  typename AvxType<TOut>::Type& outh)
432  {
  // The unpack intrinsics work inside each 128-bit half separately; swapping
  // the two middle 64-bit quarters first keeps the outputs in element order.
433  __m256i inPermuted = _mm256_permute4x64_epi64(in, _MM_SHUFFLE(3, 1, 2, 0));
434 
  // Interleaving the register with itself puts each value into the upper half
  // of a 32-bit lane; the arithmetic right shift by 16 then sign-extends it.
435  outl = _mm256_unpacklo_epi16(inPermuted, inPermuted);
436  outh = _mm256_unpackhi_epi16(inPermuted, inPermuted);
437  outl = _mm256_srai_epi32(outl, 16);
438  outh = _mm256_srai_epi32(outh, 16);
439  }
440 };
441 
// Widening cast of unsigned 16-bit integer lanes to a 4-byte integral output
// type (enabled when sizeof(TOut) == 4 and TOut is integral).
// NOTE(review): the `struct ...` header line of this specialization is
// missing from this extracted listing.
444 template <typename TOut>
446  typename boost::enable_if_c<sizeof(TOut)==4 &&
447  boost::is_integral<TOut>::value>::type>
448 {
449  static IPSDK_FORCEINLINE
450  void act(const AvxType<ipUInt16>::Type& in,
451  typename AvxType<TOut>::Type& outl,
452  typename AvxType<TOut>::Type& outh)
453  {
  // The unpack intrinsics work inside each 128-bit half separately; swapping
  // the two middle 64-bit quarters first keeps the outputs in element order.
454  __m256i inPermuted = _mm256_permute4x64_epi64(in, _MM_SHUFFLE(3, 1, 2, 0));
455 
  // Interleaving with zero zero-extends every 16-bit lane to 32 bits.
456  outl = _mm256_unpacklo_epi16(inPermuted, _mm256_set1_epi16(0));
457  outh = _mm256_unpackhi_epi16(inPermuted, _mm256_set1_epi16(0));
458  }
459 };
460 
// Cast from a 2-byte integral input type (enabled when sizeof(TIn) == 2 and
// TIn is integral).
// NOTE(review): the `struct ...` header line, the output parameter
// declarations and the statements of the body (listing lines 474-476) are
// missing from this extracted listing; the output type cannot be determined
// from the visible lines.
463 template <typename TIn>
465  typename boost::enable_if_c<sizeof(TIn)==2 &&
466  boost::is_integral<TIn>::value>::type>
467 {
468  static IPSDK_FORCEINLINE
469  void act(const typename AvxType<TIn>::Type& in,
472  {
  // intermediate 32-bit integer registers
  // (presumably the low and high widened halves of `in` -- TODO confirm)
473  AvxType<ipInt32>::Type in32l, in32h;
477  }
478 };
479 
// Cast of single-precision floats to double precision, producing two output
// registers.
// NOTE(review): the `struct ...` header line and the declarations of the
// `outl` / `outh` parameters are missing from this extracted listing.
482 template <>
484 {
485  static IPSDK_FORCEINLINE
486  void act(const AvxType<ipReal32>::Type& in,
489  {
  // low 128 bits of `in` (four floats) -> outl
490  outl = _mm256_cvtps_pd(_mm256_extractf128_ps(in, 0));
  // high 128 bits of `in` (four floats) -> outh
491  outh = _mm256_cvtps_pd(_mm256_extractf128_ps(in, 1));
492  }
493 };
494 
// Narrowing cast of two registers of a 2-byte input type into one register of
// bytes (enabled when sizeof(TIn) == 2).
// NOTE(review): the `struct ...` header line and the declaration of the
// `out` parameter are missing from this extracted listing.
498 template <typename TIn>
500  typename boost::enable_if_c<sizeof(TIn)==2>::type
501 >
502 {
503  static IPSDK_FORCEINLINE
504  void act(const typename AvxType<TIn>::Type& inl,
505  const typename AvxType<TIn>::Type& inh,
507  {
  // pack 16-bit lanes, read as signed, to bytes with unsigned saturation
508  out = _mm256_packus_epi16(inl, inh);
  // the pack works inside each 128-bit half separately; swapping the two
  // middle 64-bit quarters puts the bytes of inl before those of inh
509  out = _mm256_permute4x64_epi64(out, _MM_SHUFFLE(3, 1, 2, 0));
510  }
511 };
512 
// Narrowing cast of two registers of a 2-byte input type into one register of
// bytes (enabled when sizeof(TIn) == 2).
// NOTE(review): the `struct ...` header line and the declaration of the
// `out` parameter are missing from this extracted listing.
515 template <typename TIn>
517  typename boost::enable_if_c<sizeof(TIn)==2>::type>
518 {
519  static IPSDK_FORCEINLINE
520  void act(const typename AvxType<TIn>::Type& inl,
521  const typename AvxType<TIn>::Type& inh,
523  {
  // pack 16-bit lanes, read as signed, to bytes with signed saturation
524  out = _mm256_packs_epi16(inl, inh);
  // the pack works inside each 128-bit half separately; swapping the two
  // middle 64-bit quarters puts the bytes of inl before those of inh
525  out = _mm256_permute4x64_epi64(out, _MM_SHUFFLE(3, 1, 2, 0));
526  }
527 };
528 
// Narrowing cast of two registers of a 4-byte integral input type into one
// register of 16-bit lanes (enabled when sizeof(TIn) == 4 and TIn is
// integral).
// NOTE(review): the `struct ...` header line and the declaration of the
// `out` parameter are missing from this extracted listing.
531 template <typename TIn>
533  typename boost::enable_if_c<sizeof(TIn)==4 && boost::is_integral<TIn>::value>::type>
534 {
535  static IPSDK_FORCEINLINE
536  void act(const typename AvxType<TIn>::Type& inl,
537  const typename AvxType<TIn>::Type& inh,
539  {
  // pack 32-bit lanes, read as signed, to 16 bits with signed saturation
540  out = _mm256_packs_epi32(inl, inh);
  // the pack works inside each 128-bit half separately; swapping the two
  // middle 64-bit quarters puts the lanes of inl before those of inh
541  out = _mm256_permute4x64_epi64(out, _MM_SHUFFLE(3, 1, 2, 0));
542  }
543 };
544 
// Narrowing cast of two registers of a 4-byte integral input type into one
// register of 16-bit lanes (enabled when sizeof(TIn) == 4 and TIn is
// integral).
// NOTE(review): the `struct ...` header line and the declaration of the
// `out` parameter are missing from this extracted listing.
547 template <typename TIn>
549  typename boost::enable_if_c<sizeof(TIn)==4 && boost::is_integral<TIn>::value>::type>
550 {
551  static IPSDK_FORCEINLINE
552  void act(const typename AvxType<TIn>::Type& inl,
553  const typename AvxType<TIn>::Type& inh,
555  {
  // pack 32-bit lanes, read as signed, to 16 bits with unsigned saturation
556  out = _mm256_packus_epi32(inl, inh);
  // the pack works inside each 128-bit half separately; swapping the two
  // middle 64-bit quarters puts the lanes of inl before those of inh
557  out = _mm256_permute4x64_epi64(out, _MM_SHUFFLE(3, 1, 2, 0));
558  }
559 };
560 
// Narrowing cast of two single-precision registers into one register of a
// 2-byte output type (enabled when sizeof(TOut) == 2).
// NOTE(review): the `struct ...` header line and the first line of each of
// the three calls in the body (listing lines 573, 575 and 577) are missing
// from this extracted listing; only their argument lists are visible.
563 template <typename TOut>
565  typename boost::enable_if_c<sizeof(TOut)==2>::type>
566 {
567  static IPSDK_FORCEINLINE
568  void act(const AvxType<ipReal32>::Type& inl,
569  const AvxType<ipReal32>::Type& inh,
570  typename AvxType<TOut>::Type& out)
571  {
  // The visible argument lists show a two-stage conversion: each float
  // register is first written to a 32-bit integer register, then both
  // integer registers are combined into `out`.
572  AvxType<ipInt32>::Type inlInt32, inhInt32;
574  inl, inlInt32);
576  inh, inhInt32);
578  inlInt32, inhInt32, out);
579  }
580 };
581 
// Narrowing cast of two double-precision registers into one single-precision
// register.
// NOTE(review): the `struct ...` header line and the declaration of the
// `out` parameter are missing from this extracted listing.
584 template <>
586 {
587  static IPSDK_FORCEINLINE
588  void act(const AvxType<ipReal64>::Type& inl,
589  const AvxType<ipReal64>::Type& inh,
591  {
  // four floats converted from inl go to the low 128 bits of `out`
592  out = _mm256_castps128_ps256(_mm256_cvtpd_ps(inl));
  // four floats converted from inh go to the high 128 bits of `out`
593  out = _mm256_insertf128_ps(out, _mm256_cvtpd_ps(inh), 1);
594  }
595 };
596 
597 /*
598 // TODO
599 template <typename TIn, typename TOut>
600 struct CastReg<eInstructionSet::eIS_Avx2, TIn, TOut,
601  typename boost::enable_if_c<sizeof(TIn)==2 && sizeof(TOut)==1 && (!boost::is_signed<TIn>::value || !boost::is_signed<TOut>::value)>::type>
602 {
603  static IPSDK_FORCEINLINE
604  void act(const typename AvxType<TIn>::Type& inl,
605  const typename AvxType<TIn>::Type& inh,
606  typename AvxType<TOut>::Type& out)
607  {
608  out = _mm256_packus_epi16(inl, inh);
609  out = _mm256_permute4x64_epi64(out, _MM_SHUFFLE(3, 1, 2, 0));
610  }
611 };
612 
613 // TODO
614 template <>
615 struct CastReg<eInstructionSet::eIS_Avx2, ipInt16, ipInt8>
616 {
617  static IPSDK_FORCEINLINE
618  void act(const AvxType<ipInt16>::Type& inl,
619  const AvxType<ipInt16>::Type& inh,
620  AvxType<ipInt16>::Type& out)
621  {
622  out = _mm256_packs_epi16(inl, inh);
623  out = _mm256_permute4x64_epi64(out, _MM_SHUFFLE(3, 1, 2, 0));
624  }
625 };*/
626 
629 
630 } // end of namespace detail
631 } // end of namespace simd
632 } // end of namespace ipsdk
633 
634 #endif // __IPSDKUTIL_INSTRUCTIONSET_DETAIL_AVX2_CASTREG_H__
int8_t ipInt8
Base types definition.
Definition: BaseTypes.h:48
Defines the IPSDK_FORCEINLINE.
Main namespace for IPSDK library.
Definition: AlgorithmFunctionEfficiency.h:22
double ipReal64
Base types definition.
Definition: BaseTypes.h:57
int32_t ipInt32
Base types definition.
Definition: BaseTypes.h:52
Definition: CastReg.h:30
int16_t ipInt16
Base types definition.
Definition: BaseTypes.h:50
uint8_t ipUInt8
Base types definition.
Definition: BaseTypes.h:49
eInstructionSet
Enumerate for processor instruction set description.
Definition: InstructionSetTypes.h:31
Advanced Vector Extensions 2.
Definition: InstructionSetTypes.h:48
Definition of import/export macro for library.
structure used to retrieve AVX type associated to a base type
Definition: AvxTypes.h:33
Definition: IsLessRegDecl.h:30
uint16_t ipUInt16
Base types definition.
Definition: BaseTypes.h:51
float ipReal32
Base types definition.
Definition: BaseTypes.h:56
uint32_t ipUInt32
Base types definition.
Definition: BaseTypes.h:53