IPSDK  4_1_0_2
IPSDK : Image Processing Software Development Kit
CastReg.h
Go to the documentation of this file.
1 // CastReg.h:
3 // ------------
4 //
14 
15 #ifndef __IPSDKUTIL_INSTRUCTIONSET_DETAIL_SSE2_CASTREG_H__
16 #define __IPSDKUTIL_INSTRUCTIONSET_DETAIL_SSE2_CASTREG_H__
17 
21 
22 #include <boost/mpl/and.hpp>
23 #include <boost/mpl/equal_to.hpp>
24 #include <boost/mpl/int.hpp>
25 #include <boost/mpl/not.hpp>
26 #include <boost/mpl/or.hpp>
27 #include <boost/type_traits/is_signed.hpp>
28 
29 namespace ipsdk {
30 namespace simd {
31 namespace detail {
32 
// Converts four unsigned 32-bit integers to single-precision floats.
//
// SSE2 only provides a signed int32 -> float conversion, so every lane is
// split into its upper and lower 16-bit halves. Both halves are small
// non-negative numbers that convert exactly; the scaled upper half plus the
// lower half is then rounded once by the final addition.
//
// v : four packed unsigned 32-bit integers.
// Returns the four values as packed single-precision floats.
IPSDK_FORCEINLINE __m128 _custom_mm_cvtepu32_ps(const __m128i v)
{
    // isolate the two 16-bit halves of each lane
    const __m128i upperHalf = _mm_srli_epi32(v, 16);
    const __m128i lowerHalf = _mm_and_si128(v, _mm_set1_epi32(0x0000FFFF));

    // exact conversions; multiplying by 2^16 restores the upper half's weight
    const __m128 upperScaled =
        _mm_mul_ps(_mm_cvtepi32_ps(upperHalf), _mm_set1_ps(65536.0f));
    const __m128 lowerAsFloat = _mm_cvtepi32_ps(lowerHalf);

    // the only rounding step, performed in the current rounding mode
    return _mm_add_ps(upperScaled, lowerAsFloat);
}
47 
// Out-parameter flavour of the unsigned 32-bit integer -> float conversion.
//
// v   : four packed unsigned 32-bit integers.
// out : receives the four values as packed single-precision floats.
//
// Each lane is cut into two 16-bit pieces that the signed SSE2 conversion
// handles exactly; recombining them with one addition rounds only once.
IPSDK_FORCEINLINE void _custom_mm_cvtepu32_ps(const __m128i v, __m128& out)
{
    const __m128i low16Mask = _mm_set1_epi32(0x0000FFFF);
    const __m128 scale = _mm_set1_ps(65536.0f);

    // high piece, converted and moved back to its 2^16 weight
    const __m128 highPart = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srli_epi32(v, 16)), scale);

    // low piece, converted as-is
    const __m128 lowPart = _mm_cvtepi32_ps(_mm_and_si128(v, low16Mask));

    // single rounding, following the current rounding mode
    out = _mm_add_ps(highPart, lowPart);
}
62 
// Truncating conversion of four single-precision floats to unsigned 32-bit
// integers, built on top of the signed SSE2 conversion.
//
// f : values to convert (taken by value and modified locally).
// Returns the four converted lanes as a packed integer register.
//
// Lanes in [2^31, 2^32) are shifted down by 2^31 before the signed conversion
// and shifted back up afterwards with an integer addition.
// Negative lanes give 0; NaN lanes give 0 as well because _mm_max_ps yields
// its second operand when an operand is NaN.
// Lanes >= 2^32 are not saturated: the signed conversion returns 0x80000000
// for them and the final addition wraps the result to 0.
IPSDK_FORCEINLINE __m128i _custom_mm_cvttps_epu32(__m128 f)
{
    const __m128 two31 = _mm_set1_ps(2147483648.0f);

    // all-ones mask in every lane that does not fit a signed 32-bit integer
    const __m128 overflow = _mm_cmpge_ps(f, two31);

    // 0x80000000 in each overflowing lane, 0 elsewhere. The value is derived
    // with an integer shift of the mask rather than by converting the mask
    // (a NaN bit pattern) with _mm_cvttps_epi32, which would raise the
    // invalid-operation flag and depend on the "integer indefinite" result.
    const __m128i addval = _mm_slli_epi32(_mm_castps_si128(overflow), 31);

    // bias the value to signed space if it is >= 2**31
    f = _mm_sub_ps(f, _mm_and_ps(overflow, two31));

    // clip at zero
    f = _mm_max_ps(f, _mm_setzero_ps());

    // convert to int; the cvtt form always truncates toward zero,
    // independently of the current rounding mode
    const __m128i result = _mm_cvttps_epi32(f);

    // unbias
    return _mm_add_epi32(result, addval);
}
97 
// Out-parameter flavour of the truncating float -> unsigned 32-bit integer
// conversion, built on top of the signed SSE2 conversion.
//
// f   : values to convert (taken by value and modified locally).
// out : receives the four converted lanes.
//
// Lanes in [2^31, 2^32) are shifted down by 2^31 before the signed conversion
// and shifted back up afterwards with an integer addition.
// Negative lanes give 0; NaN lanes give 0 as well because _mm_max_ps yields
// its second operand when an operand is NaN.
// Lanes >= 2^32 are not saturated: the signed conversion returns 0x80000000
// for them and the final addition wraps the result to 0.
IPSDK_FORCEINLINE void _custom_mm_cvttps_epu32(__m128 f, __m128i& out)
{
    const __m128 two31 = _mm_set1_ps(2147483648.0f);

    // all-ones mask in every lane that does not fit a signed 32-bit integer
    const __m128 overflow = _mm_cmpge_ps(f, two31);

    // 0x80000000 in each overflowing lane, 0 elsewhere. The value is derived
    // with an integer shift of the mask rather than by converting the mask
    // (a NaN bit pattern) with _mm_cvttps_epi32, which would raise the
    // invalid-operation flag and depend on the "integer indefinite" result.
    const __m128i addval = _mm_slli_epi32(_mm_castps_si128(overflow), 31);

    // bias the value to signed space if it is >= 2**31
    f = _mm_sub_ps(f, _mm_and_ps(overflow, two31));

    // clip at zero
    f = _mm_max_ps(f, _mm_setzero_ps());

    // convert to int; the cvtt form always truncates toward zero,
    // independently of the current rounding mode
    const __m128i result = _mm_cvttps_epi32(f);

    // unbias
    out = _mm_add_epi32(result, addval);
}
130 
// Converts the two lowest unsigned 32-bit integers of a register to
// double-precision floats (_mm_cvtepi32_pd only reads lanes 0 and 1).
//
// SSE2 only provides a signed int32 -> double conversion, so each lane is
// split into 16-bit halves that convert exactly and are then recombined.
//
// v : packed unsigned 32-bit integers; lanes 0 and 1 are converted.
// Returns the two values as packed doubles.
IPSDK_FORCEINLINE __m128d _custom_mm_cvtepu32_pd(const __m128i v)
{
    // isolate the two 16-bit halves of each lane
    const __m128i upperHalf = _mm_srli_epi32(v, 16);
    const __m128i lowerHalf = _mm_and_si128(v, _mm_set1_epi32(0x0000FFFF));

    // exact conversions; multiplying by 2^16 restores the upper half's weight
    const __m128d upperScaled =
        _mm_mul_pd(_mm_cvtepi32_pd(upperHalf), _mm_set1_pd(65536.0));
    const __m128d lowerAsDouble = _mm_cvtepi32_pd(lowerHalf);

    // recombine the halves (current rounding mode applies to the addition)
    return _mm_add_pd(upperScaled, lowerAsDouble);
}
145 
// Out-parameter flavour of the unsigned 32-bit integer -> double conversion.
//
// v   : packed unsigned 32-bit integers; only lanes 0 and 1 are converted
//       because _mm_cvtepi32_pd reads the two lowest lanes.
// out : receives the two values as packed doubles.
//
// Each lane is cut into two 16-bit pieces that the signed SSE2 conversion
// handles exactly, then the pieces are added back together.
IPSDK_FORCEINLINE void _custom_mm_cvtepu32_pd(const __m128i v, __m128d& out)
{
    const __m128i low16Mask = _mm_set1_epi32(0x0000FFFF);
    const __m128d scale = _mm_set1_pd(65536.0);

    // high piece, converted and moved back to its 2^16 weight
    const __m128d highPart = _mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_epi32(v, 16)), scale);

    // low piece, converted as-is
    const __m128d lowPart = _mm_cvtepi32_pd(_mm_and_si128(v, low16Mask));

    // recombine (current rounding mode applies to the addition)
    out = _mm_add_pd(highPart, lowPart);
}
160 
// Truncating conversion of two double-precision floats to unsigned 32-bit
// integers, built on top of the signed SSE2 conversion.
//
// f : values to convert (taken by value and modified locally).
// Returns a register whose lanes 0 and 1 hold the converted values and whose
// lanes 2 and 3 are zero.
//
// Lanes in [2^31, 2^32) are shifted down by 2^31 before the signed conversion
// and shifted back up afterwards with an integer addition.
// Negative lanes give 0; NaN lanes give 0 as well because _mm_max_pd yields
// its second operand when an operand is NaN.
// Lanes >= 2^32 are not saturated: the signed conversion returns 0x80000000
// for them and the final addition wraps the result to 0.
IPSDK_FORCEINLINE __m128i _custom_mm_cvttpd_epu32(__m128d f)
{
    const __m128d two31 = _mm_set1_pd(2147483648.0);

    // all-ones 64-bit mask in every lane that does not fit a signed int32
    const __m128d overflow = _mm_cmpge_pd(f, two31);

    // 0x80000000 in 32-bit lanes 0/1 for overflowing inputs, 0 elsewhere.
    // Built with integer operations instead of converting the mask (a NaN bit
    // pattern) with _mm_cvttpd_epi32, which would raise the invalid-operation
    // flag and depend on the "integer indefinite" result:
    //  - the shuffle gathers the low half of each 64-bit mask into lanes 0/1,
    //  - the shift keeps only bit 31,
    //  - _mm_move_epi64 clears lanes 2/3 so the upper half of the result
    //    stays zero, as produced by _mm_cvttpd_epi32.
    const __m128i packedMask =
        _mm_shuffle_epi32(_mm_castpd_si128(overflow), _MM_SHUFFLE(2, 0, 2, 0));
    const __m128i addval = _mm_move_epi64(_mm_slli_epi32(packedMask, 31));

    // bias the value to signed space if it is >= 2**31
    f = _mm_sub_pd(f, _mm_and_pd(overflow, two31));

    // clip at zero
    f = _mm_max_pd(f, _mm_setzero_pd());

    // convert to int; the cvtt form always truncates toward zero,
    // independently of the current rounding mode
    const __m128i result = _mm_cvttpd_epi32(f);

    // unbias
    return _mm_add_epi32(result, addval);
}
195 
// Out-parameter flavour of the truncating double -> unsigned 32-bit integer
// conversion, built on top of the signed SSE2 conversion.
//
// f   : values to convert (taken by value and modified locally).
// out : receives the converted values in lanes 0 and 1; lanes 2 and 3 are
//       set to zero.
//
// Lanes in [2^31, 2^32) are shifted down by 2^31 before the signed conversion
// and shifted back up afterwards with an integer addition.
// Negative lanes give 0; NaN lanes give 0 as well because _mm_max_pd yields
// its second operand when an operand is NaN.
// Lanes >= 2^32 are not saturated: the signed conversion returns 0x80000000
// for them and the final addition wraps the result to 0.
IPSDK_FORCEINLINE void _custom_mm_cvttpd_epu32(__m128d f, __m128i& out)
{
    const __m128d two31 = _mm_set1_pd(2147483648.0);

    // all-ones 64-bit mask in every lane that does not fit a signed int32
    const __m128d overflow = _mm_cmpge_pd(f, two31);

    // 0x80000000 in 32-bit lanes 0/1 for overflowing inputs, 0 elsewhere.
    // Built with integer operations instead of converting the mask (a NaN bit
    // pattern) with _mm_cvttpd_epi32, which would raise the invalid-operation
    // flag and depend on the "integer indefinite" result:
    //  - the shuffle gathers the low half of each 64-bit mask into lanes 0/1,
    //  - the shift keeps only bit 31,
    //  - _mm_move_epi64 clears lanes 2/3 so the upper half of the result
    //    stays zero, as produced by _mm_cvttpd_epi32.
    const __m128i packedMask =
        _mm_shuffle_epi32(_mm_castpd_si128(overflow), _MM_SHUFFLE(2, 0, 2, 0));
    const __m128i addval = _mm_move_epi64(_mm_slli_epi32(packedMask, 31));

    // bias the value to signed space if it is >= 2**31
    f = _mm_sub_pd(f, _mm_and_pd(overflow, two31));

    // clip at zero
    f = _mm_max_pd(f, _mm_setzero_pd());

    // convert to int; the cvtt form always truncates toward zero,
    // independently of the current rounding mode
    const __m128i result = _mm_cvttpd_epi32(f);

    // unbias
    out = _mm_add_epi32(result, addval);
}
228 
// Packs eight 32-bit lanes (four from v0, then four from v1) into eight
// 16-bit lanes using SSE2 only.
//
// Each lane is first reduced to its low 16 bits, sign-extended back to
// 32 bits, so the signed-saturating pack that follows never saturates.
// The result therefore holds the low 16 bits of every input lane: values
// outside [0, 65535] wrap around instead of being clamped.
//
// v0 : lanes that fill the lower half of the result.
// v1 : lanes that fill the upper half of the result.
// Returns the packed 16-bit lanes.
IPSDK_FORCEINLINE __m128i _custom_mm_packus_epi32(__m128i v0, __m128i v1)
{
    // keep bits 0..15 of each lane and replicate bit 15 into bits 16..31
    const __m128i lowerLanes = _mm_srai_epi32(_mm_slli_epi32(v0, 16), 16);
    const __m128i upperLanes = _mm_srai_epi32(_mm_slli_epi32(v1, 16), 16);

    return _mm_packs_epi32(lowerLanes, upperLanes);
}
237 
// Out-parameter flavour of the SSE2 32-bit -> 16-bit pack.
//
// v0  : lanes that fill the lower half of the result.
// v1  : lanes that fill the upper half of the result.
// out : receives the eight packed 16-bit lanes.
//
// Every lane is reduced to its sign-extended low 16 bits before the
// signed-saturating pack, so no saturation can occur: each output lane is
// the low 16 bits of the matching input lane (out-of-range values wrap).
IPSDK_FORCEINLINE void _custom_mm_packus_epi32(__m128i v0, __m128i v1, __m128i& out)
{
    // shift the low half-word to the top, then bring it back with sign fill
    const __m128i v0Top = _mm_slli_epi32(v0, 16);
    const __m128i v1Top = _mm_slli_epi32(v1, 16);
    const __m128i v0Low16 = _mm_srai_epi32(v0Top, 16);
    const __m128i v1Low16 = _mm_srai_epi32(v1Top, 16);

    out = _mm_packs_epi32(v0Low16, v1Low16);
}
246 
249 
// Pass-through SSE2 register cast.
// The enable_if_c condition selects this specialization when TIn and TOut are
// the same type, or when both are integral types of identical size.
252 template <typename TIn, typename TOut>
253 struct CastReg<eInstructionSet::eIS_Sse2, TIn, TOut,
254  typename boost::enable_if_c<
255  boost::is_same<TIn, TOut>::value || (boost::is_integral<TIn>::value
256  && boost::is_integral<TOut>::value && sizeof(TIn)==sizeof(TOut))
257  >::type
258 >
259 {
// in  : source register.
// out : destination register; receives a plain copy of `in`.
260  static IPSDK_FORCEINLINE
261  void act(const typename Sse2Type<TIn>::Type& in,
262  typename Sse2Type<TOut>::Type& out)
263  {
// no per-lane conversion is performed, the register is copied unchanged
264  out = in;
265  }
266 };
267 
// Widening cast from a register of ipUInt8 lanes to two registers of TOut
// lanes; the enable_if condition restricts TOut to 2-byte types.
270 template <typename TOut>
272  typename boost::enable_if<
273  typename boost::mpl::equal_to<
274  boost::mpl::int_<sizeof(TOut)>,
275  boost::mpl::int_<2>
276  >::type
277  >::type
278 >
279 {
// in   : sixteen unsigned 8-bit lanes.
// outl : receives the lower eight lanes, widened to 16 bits.
// outh : receives the upper eight lanes, widened to 16 bits.
280  static IPSDK_FORCEINLINE
281  void act(const Sse2Type<ipUInt8>::Type& in,
282  typename Sse2Type<TOut>::Type& outl,
283  typename Sse2Type<TOut>::Type& outh)
284  {
// interleaving each byte with a zero byte zero-extends it to 16 bits
285  outl = _mm_unpacklo_epi8(in, _mm_set1_epi8(0));
286  outh = _mm_unpackhi_epi8(in, _mm_set1_epi8(0));
287  }
288 };
289 
// Widening cast from a register of ipInt8 lanes to two registers of TOut
// lanes; the enable_if_c condition restricts TOut to 2-byte types.
292 template <typename TOut>
294  typename boost::enable_if_c<sizeof(TOut)==2>::type>
295 {
296 
// in   : sixteen signed 8-bit lanes.
// outl : receives the lower eight lanes, sign-extended to 16 bits.
// outh : receives the upper eight lanes, sign-extended to 16 bits.
297  static IPSDK_FORCEINLINE
298  void act(const Sse2Type<ipInt8>::Type& in,
299  typename Sse2Type<TOut>::Type& outl,
300  typename Sse2Type<TOut>::Type& outh)
301  {
// duplicate every byte into both halves of a 16-bit lane, then an
// arithmetic right shift by 8 leaves the byte value with its sign extended
302  outl = _mm_unpacklo_epi8(in, in);
303  outl = _mm_srai_epi16(outl, 8);
304  outh = _mm_unpackhi_epi8(in, in);
305  outh = _mm_srai_epi16(outh, 8);
306  }
307 };
308 
// Full specialization whose act() converts a register of ipInt32 lanes to
// single-precision floats.
311 template <>
313 {
314  static IPSDK_FORCEINLINE
315  void act(const Sse2Type<ipInt32>::Type& in,
317  {
// native SSE2 signed int32 -> float conversion, result stored in `out`
318  out = _mm_cvtepi32_ps(in);
319  }
320 };
321 
// Full specialization whose act() converts a register of ipUInt32 lanes to
// single-precision floats.
324 template <>
326 {
327  static IPSDK_FORCEINLINE
328  void act(const Sse2Type<ipUInt32>::Type& in,
330  {
// SSE2 has no unsigned conversion; use the helper defined earlier in this file
331  out = _custom_mm_cvtepu32_ps(in);
332  }
333 };
334 
// Full specialization whose act() converts a register of ipReal32 lanes to
// 32-bit integers with the signed truncating conversion.
337 template <>
339 {
340  static IPSDK_FORCEINLINE
341  void act(const Sse2Type<ipReal32>::Type& in,
343  {
// cvtt form: truncates toward zero
344  out = _mm_cvttps_epi32(in);
345  }
346 };
347 
// Full specialization whose act() converts a register of ipReal32 lanes to
// unsigned 32-bit integers.
350 template <>
352 {
353  static IPSDK_FORCEINLINE
354  void act(const Sse2Type<ipReal32>::Type& in,
356  {
// delegate to the out-parameter overload of the helper defined earlier in
// this file (truncating float -> unsigned conversion)
357  _custom_mm_cvttps_epu32(in, out);
358  }
359 };
360 
// Full specialization whose act() widens a register of ipInt32 lanes to two
// registers of double-precision floats (outl and outh).
363 template <>
365 {
366  static IPSDK_FORCEINLINE
367  void act(const Sse2Type<ipInt32>::Type& in,
370  {
// _mm_cvtepi32_pd converts the two lowest lanes only
371  outl = _mm_cvtepi32_pd(in);
// shift the register right by 8 bytes so lanes 2 and 3 become the lowest ones
372  outh = _mm_cvtepi32_pd(_mm_srli_si128(in, 8));
373  }
374 };
375 
// Full specialization whose act() widens a register of ipUInt32 lanes to two
// registers of double-precision floats (outl and outh).
378 template <>
380 {
381  static IPSDK_FORCEINLINE
382  void act(const Sse2Type<ipUInt32>::Type& in,
385  {
// disabled scalar fallback (store, convert lane by lane, reload), kept for
// reference
386  /*ipUInt32 bufUInt32[4];
387  ipReal64 bufReal64[4];
388 
389  _mm_storeu_si128(
390  reinterpret_cast<Sse2Type<ipUInt32>::Type*>(bufUInt32), in);
391 
392  for(ipUInt32 i=0; i<4; ++i)
393  bufReal64[i] = bufUInt32[i];
394  outl = _mm_loadu_pd(bufReal64);
395  outh = _mm_loadu_pd(bufReal64+2);*/
// lanes 0 and 1 through the unsigned helper defined earlier in this file
396  outl = _custom_mm_cvtepu32_pd(in);
// shift right by 8 bytes so lanes 2 and 3 become the lowest ones
397  outh = _custom_mm_cvtepu32_pd(_mm_srli_si128(in, 8));
398  }
399 };
400 
// Full specialization whose act() narrows two registers of ipReal64 lanes
// (inl, inh) into a single register of 32-bit integers, truncating toward zero.
403 template <>
405 {
406  static IPSDK_FORCEINLINE
407  void act(const Sse2Type<ipReal64>::Type& inl,
408  const Sse2Type<ipReal64>::Type& inh,
410  {
// each conversion fills the two lowest 32-bit lanes and zeroes the upper two
411  Sse2Type<ipInt32>::Type outl = _mm_cvttpd_epi32(inl);
412  Sse2Type<ipInt32>::Type outh = _mm_cvttpd_epi32(inh);
// move the inh results into the upper 8 bytes and merge with the inl results
413  out = _mm_or_si128(outl, _mm_slli_si128(outh, 8));
414  }
415 };
416 
// Full specialization whose act() narrows two registers of ipReal64 lanes
// (inl, inh) into a single register of unsigned 32-bit integers.
419 template <>
421 {
422  static IPSDK_FORCEINLINE
423  void act(const Sse2Type<ipReal64>::Type& inl,
424  const Sse2Type<ipReal64>::Type& inh,
426  {
427  Sse2Type<ipInt32>::Type outl, outh;
// truncating double -> unsigned conversion via the helper defined earlier in
// this file; the merge below relies on the helper leaving lanes 2 and 3 zero
428  _custom_mm_cvttpd_epu32(inl, outl);
429  _custom_mm_cvttpd_epu32(inh, outh);
// move the inh results into the upper 8 bytes and merge with the inl results
430  out = _mm_or_si128(outl, _mm_slli_si128(outh, 8));
431  }
432 };
433 
// Widening cast from a register of ipInt32 lanes to two registers of TOut
// lanes; the enable_if_c condition restricts TOut to 8-byte integral types.
436 template <typename TOut>
438  typename boost::enable_if_c<sizeof(TOut)==8 &&
439  boost::is_integral<TOut>::value>::type>
440 {
// in   : four signed 32-bit lanes.
// outl : receives lanes 0 and 1, sign-extended to 64 bits.
// outh : receives lanes 2 and 3, sign-extended to 64 bits.
441  static IPSDK_FORCEINLINE
442  void act(const Sse2Type<ipInt32>::Type& in,
443  typename Sse2Type<TOut>::Type& outl,
444  typename Sse2Type<TOut>::Type& outh)
445  {
446  const Sse2Type<ipInt32>::Type zero = _mm_setzero_si128();
// all-ones for negative lanes, zero otherwise: the upper half of the
// sign-extended 64-bit value
447  const Sse2Type<ipInt32>::Type hi =
448  _mm_cmplt_epi32(in, zero);
// interleave each value with its sign word to form 64-bit lanes
449  outl = _mm_unpacklo_epi32(in, hi);
450  outh = _mm_unpackhi_epi32(in, hi);
451  }
452 };
453 
// Widening cast from a register of ipUInt32 lanes to two registers of TOut
// lanes; the enable_if_c condition restricts TOut to 8-byte integral types.
456 template <typename TOut>
458  typename boost::enable_if_c<sizeof(TOut)==8 &&
459  boost::is_integral<TOut>::value>::type>
460 {
// in   : four unsigned 32-bit lanes.
// outl : receives lanes 0 and 1, zero-extended to 64 bits.
// outh : receives lanes 2 and 3, zero-extended to 64 bits.
461  static IPSDK_FORCEINLINE
462  void act(const Sse2Type<ipUInt32>::Type& in,
463  typename Sse2Type<TOut>::Type& outl,
464  typename Sse2Type<TOut>::Type& outh)
465  {
// interleaving each value with a zero word zero-extends it to 64 bits
466  outl = _mm_unpacklo_epi32(in, _mm_set1_epi32(0));
467  outh = _mm_unpackhi_epi32(in, _mm_set1_epi32(0));
468  }
469 };
470 
// Widening cast from a register of ipInt16 lanes to two registers of TOut
// lanes; the enable_if_c condition restricts TOut to 4-byte integral types.
473 template <typename TOut>
475  typename boost::enable_if_c<sizeof(TOut)==4
476  && boost::is_integral<TOut>::value>::type>
477 {
// in   : eight signed 16-bit lanes.
// outl : receives lanes 0..3, sign-extended to 32 bits.
// outh : receives lanes 4..7, sign-extended to 32 bits.
478  static IPSDK_FORCEINLINE
479  void act(const Sse2Type<ipInt16>::Type& in,
480  typename Sse2Type<TOut>::Type& outl,
481  typename Sse2Type<TOut>::Type& outh)
482  {
// duplicate every 16-bit value into both halves of a 32-bit lane
483  outl = _mm_unpacklo_epi16(in, in);
484  outh = _mm_unpackhi_epi16(in, in);
// arithmetic shift right by 16 leaves the value with its sign extended
485  outl = _mm_srai_epi32(outl, 16);
486  outh = _mm_srai_epi32(outh, 16);
487  }
488 };
489 
// Widening cast from a register of ipUInt16 lanes to two registers of TOut
// lanes; the enable_if_c condition restricts TOut to 4-byte integral types.
492 template <typename TOut>
494  typename boost::enable_if_c<sizeof(TOut)==4 &&
495  boost::is_integral<TOut>::value>::type>
496 {
// in   : eight unsigned 16-bit lanes.
// outl : receives lanes 0..3, zero-extended to 32 bits.
// outh : receives lanes 4..7, zero-extended to 32 bits.
497  static IPSDK_FORCEINLINE
498  void act(const Sse2Type<ipUInt16>::Type& in,
499  typename Sse2Type<TOut>::Type& outl,
500  typename Sse2Type<TOut>::Type& outh)
501  {
// interleaving each value with a zero half-word zero-extends it to 32 bits
502  outl = _mm_unpacklo_epi16(in, _mm_set1_epi16(0));
503  outh = _mm_unpackhi_epi16(in, _mm_set1_epi16(0));
504  }
505 };
506 
// Cast from a register of TIn lanes; the enable_if_c condition restricts TIn
// to 2-byte integral types.
// NOTE(review): the output parameters of act() and the statements that use
// the temporaries are not visible here; presumably the input is first widened
// to two ipInt32 registers and then converted further -- confirm against the
// complete header.
509 template <typename TIn>
511  typename boost::enable_if_c<sizeof(TIn)==2 &&
512  boost::is_integral<TIn>::value>::type>
513 {
514  static IPSDK_FORCEINLINE
515  void act(const typename Sse2Type<TIn>::Type& in,
518  {
// intermediate 32-bit integer registers (low and high halves)
519  Sse2Type<ipInt32>::Type in32l, in32h;
523  }
524 };
525 
// Full specialization whose act() widens a register of ipReal32 lanes to two
// registers of double-precision floats (outl and outh).
528 template <>
530 {
531  static IPSDK_FORCEINLINE
532  void act(const Sse2Type<ipReal32>::Type& in,
535  {
// _mm_cvtps_pd converts the two lowest float lanes only
536  outl = _mm_cvtps_pd(in);
// the shuffle brings lanes 2 and 3 down to positions 0 and 1 first
537  outh = _mm_cvtps_pd(_mm_shuffle_ps(in, in, _MM_SHUFFLE(3, 2, 3, 2)));
538  }
539 };
540 
// Narrowing cast from two registers of TIn lanes (inl, inh) to one register
// of 8-bit lanes; the enable_if_c condition restricts TIn to 2-byte types.
544 template <typename TIn>
546  typename boost::enable_if_c<sizeof(TIn)==2>::type
547 >
548 {
549  static IPSDK_FORCEINLINE
550  void act(const typename Sse2Type<TIn>::Type& inl,
551  const typename Sse2Type<TIn>::Type& inh,
553  {
// pack with unsigned saturation: inputs are read as signed 16-bit values and
// clamped to [0, 255]
// NOTE(review): if TIn can be an unsigned 16-bit type, values >= 0x8000 are
// read as negative and clamp to 0 -- confirm this is intended
554  out = _mm_packus_epi16(inl, inh);
555  }
556 };
557 
// Narrowing cast from two registers of TIn lanes (inl, inh) to one register
// of 8-bit lanes; the enable_if_c condition restricts TIn to 2-byte types.
560 template <typename TIn>
562  typename boost::enable_if_c<sizeof(TIn)==2>::type>
563 {
564  static IPSDK_FORCEINLINE
565  void act(const typename Sse2Type<TIn>::Type& inl,
566  const typename Sse2Type<TIn>::Type& inh,
568  {
// pack with signed saturation: inputs are read as signed 16-bit values and
// clamped to [-128, 127]
569  out = _mm_packs_epi16(inl, inh);
570  }
571 };
572 
// Narrowing cast from two registers of TIn lanes (inl, inh) to one register
// of 16-bit lanes; the enable_if_c condition restricts TIn to 4-byte integral
// types.
575 template <typename TIn>
577  typename boost::enable_if_c<sizeof(TIn)==4 && boost::is_integral<TIn>::value>::type>
578 {
579  static IPSDK_FORCEINLINE
580  void act(const typename Sse2Type<TIn>::Type& inl,
581  const typename Sse2Type<TIn>::Type& inh,
583  {
// pack with signed saturation: inputs are read as signed 32-bit values and
// clamped to [-32768, 32767]
584  out = _mm_packs_epi32(inl, inh);
585  }
586 };
587 
// Narrowing cast from two registers of TIn lanes (inl, inh) to one register
// of 16-bit lanes; the enable_if_c condition restricts TIn to 4-byte integral
// types.
590 template <typename TIn>
592  typename boost::enable_if_c<sizeof(TIn)==4 && boost::is_integral<TIn>::value>::type>
593 {
594  static IPSDK_FORCEINLINE
595  void act(const typename Sse2Type<TIn>::Type& inl,
596  const typename Sse2Type<TIn>::Type& inh,
598  {
// helper defined earlier in this file: keeps the low 16 bits of every lane
// (values outside the 16-bit range wrap, they are not saturated)
599  out = _custom_mm_packus_epi32(inl, inh);
600  }
601 };
602 
// Narrowing cast from two registers of ipReal32 lanes (inl, inh) to one
// register of TOut lanes; the enable_if_c condition restricts TOut to 2-byte
// types.
// The conversion is staged: each input is first turned into an ipInt32
// register, then the two intermediates are combined into `out`.
// NOTE(review): the names of the three callees are not visible here;
// presumably nested CastReg<...>::act calls -- confirm against the complete
// header.
605 template <typename TOut>
607  typename boost::enable_if_c<sizeof(TOut)==2>::type>
608 {
609  static IPSDK_FORCEINLINE
610  void act(const Sse2Type<ipReal32>::Type& inl,
611  const Sse2Type<ipReal32>::Type& inh,
612  typename Sse2Type<TOut>::Type& out)
613  {
// intermediate 32-bit integer registers for the two inputs
614  Sse2Type<ipInt32>::Type inlInt32, inhInt32;
// step 1: inl -> inlInt32
616  inl, inlInt32);
// step 2: inh -> inhInt32
618  inh, inhInt32);
// step 3: both intermediates -> out
620  inlInt32, inhInt32, out);
621  }
622 };
623 
// Full specialization whose act() narrows two registers of ipReal64 lanes
// (inl, inh) into a single register of single-precision floats.
626 template <>
628 {
629  static IPSDK_FORCEINLINE
630  void act(const Sse2Type<ipReal64>::Type& inl,
631  const Sse2Type<ipReal64>::Type& inh,
633  {
// each _mm_cvtpd_ps puts its two results in the two lowest float lanes;
// _mm_movelh_ps then places the inl pair in lanes 0-1 and the inh pair in
// lanes 2-3
634  out = _mm_movelh_ps(_mm_cvtpd_ps(inl), _mm_cvtpd_ps(inh));
635  }
636 };
637 
640 
641 } // end of namespace detail
642 } // end of namespace simd
643 } // end of namespace ipsdk
644 
645 #endif // __IPSDKUTIL_INSTRUCTIONSET_DETAIL_SSE2_CASTREG_H__
int8_t ipInt8
Base types definition.
Definition: BaseTypes.h:48
Defines the IPSDK_FORCEINLINE.
Main namespace for IPSDK library.
Definition: AlgorithmFunctionEfficiency.h:22
double ipReal64
Base types definition.
Definition: BaseTypes.h:57
int32_t ipInt32
Base types definition.
Definition: BaseTypes.h:52
Definition: CastReg.h:30
int16_t ipInt16
Base types definition.
Definition: BaseTypes.h:50
uint8_t ipUInt8
Base types definition.
Definition: BaseTypes.h:49
eInstructionSet
Enumerate for processor instruction set description.
Definition: InstructionSetTypes.h:31
Definition of import/export macro for library.
Streaming SIMD Extensions 2.
Definition: InstructionSetTypes.h:36
structure used to retrieve SSE2 type associated to a base type
Definition: Sse2Types.h:32
uint16_t ipUInt16
Base types definition.
Definition: BaseTypes.h:51
float ipReal32
Base types definition.
Definition: BaseTypes.h:56
uint32_t ipUInt32
Base types definition.
Definition: BaseTypes.h:53