15 #ifndef __IPSDKUTIL_INSTRUCTIONSET_DETAIL_SSE2_CASTREG_H__ 16 #define __IPSDKUTIL_INSTRUCTIONSET_DETAIL_SSE2_CASTREG_H__ 22 #include <boost/mpl/and.hpp> 23 #include <boost/mpl/equal_to.hpp> 24 #include <boost/mpl/int.hpp> 25 #include <boost/mpl/not.hpp> 26 #include <boost/mpl/or.hpp> 27 #include <boost/type_traits/is_signed.hpp> 33 IPSDK_FORCEINLINE __m128 _custom_mm_cvtepu32_ps(
const __m128i v)
35 const __m128 two16 = _mm_set1_ps(static_cast<float>(1 << 16));
39 const __m128i hi = _mm_srli_epi32(v, 16);
40 const __m128i lo = _mm_srli_epi32(_mm_slli_epi32((__m128i)v, 16), 16);
41 const __m128 fHi = _mm_mul_ps(_mm_cvtepi32_ps(hi), two16);
42 const __m128 fLo = _mm_cvtepi32_ps(lo);
45 return _mm_add_ps(fHi, fLo);
48 IPSDK_FORCEINLINE
void _custom_mm_cvtepu32_ps(
const __m128i v, __m128& out)
50 const __m128 two16 = _mm_set1_ps(static_cast<float>(1 << 16));
54 const __m128i hi = _mm_srli_epi32(v, 16);
55 const __m128i lo = _mm_srli_epi32(_mm_slli_epi32((__m128i)v, 16), 16);
56 const __m128 fHi = _mm_mul_ps(_mm_cvtepi32_ps(hi), two16);
57 const __m128 fLo = _mm_cvtepi32_ps(lo);
60 out = _mm_add_ps(fHi, fLo);
63 IPSDK_FORCEINLINE __m128i _custom_mm_cvttps_epu32(__m128 f)
65 const __m128 two31 = _mm_set1_ps(
66 static_cast<float>(static_cast<unsigned int>(1) << 31));
68 const __m128 zero = _mm_xor_ps(f, f);
71 const __m128 overflow = _mm_cmpge_ps(f, two31);
73 const __m128 subval = _mm_and_ps(overflow, two31);
75 __m128i addval = _mm_cvttps_epi32(overflow);
81 f = _mm_sub_ps(f, subval);
84 f = _mm_max_ps(f, zero);
87 result = _mm_cvttps_epi32(f);
90 result = _mm_add_epi32(result, addval);
98 IPSDK_FORCEINLINE
void _custom_mm_cvttps_epu32(__m128 f, __m128i& out)
100 const __m128 two31 = _mm_set1_ps(
101 static_cast<float>(static_cast<unsigned int>(1) << 31));
103 const __m128 zero = _mm_xor_ps(f, f);
106 const __m128 overflow = _mm_cmpge_ps(f, two31);
108 const __m128 subval = _mm_and_ps(overflow, two31);
110 __m128i addval = _mm_cvttps_epi32(overflow);
116 f = _mm_sub_ps(f, subval);
119 f = _mm_max_ps(f, zero);
122 result = _mm_cvttps_epi32(f);
125 out = _mm_add_epi32(result, addval);
131 IPSDK_FORCEINLINE __m128d _custom_mm_cvtepu32_pd(
const __m128i v)
133 const __m128d two16 = _mm_set1_pd(static_cast<ipReal64>(1 << 16));
137 const __m128i hi = _mm_srli_epi32(v, 16);
138 const __m128i lo = _mm_srli_epi32(_mm_slli_epi32((__m128i)v, 16), 16);
139 const __m128d fHi = _mm_mul_pd(_mm_cvtepi32_pd(hi), two16);
140 const __m128d fLo = _mm_cvtepi32_pd(lo);
143 return _mm_add_pd(fHi, fLo);
146 IPSDK_FORCEINLINE
void _custom_mm_cvtepu32_pd(
const __m128i v, __m128d& out)
148 const __m128d two16 = _mm_set1_pd(static_cast<ipReal64>(1 << 16));
152 const __m128i hi = _mm_srli_epi32(v, 16);
153 const __m128i lo = _mm_srli_epi32(_mm_slli_epi32((__m128i)v, 16), 16);
154 const __m128d fHi = _mm_mul_pd(_mm_cvtepi32_pd(hi), two16);
155 const __m128d fLo = _mm_cvtepi32_pd(lo);
158 out = _mm_add_pd(fHi, fLo);
161 IPSDK_FORCEINLINE __m128i _custom_mm_cvttpd_epu32(__m128d f)
163 const __m128d two31 = _mm_set1_pd(
164 static_cast<ipReal64>(static_cast<unsigned int>(1) << 31));
166 const __m128d zero = _mm_xor_pd(f, f);
169 const __m128d overflow = _mm_cmpge_pd(f, two31);
171 const __m128d subval = _mm_and_pd(overflow, two31);
173 __m128i addval = _mm_cvttpd_epi32(overflow);
179 f = _mm_sub_pd(f, subval);
182 f = _mm_max_pd(f, zero);
185 result = _mm_cvttpd_epi32(f);
188 result = _mm_add_epi32(result, addval);
196 IPSDK_FORCEINLINE
void _custom_mm_cvttpd_epu32(__m128d f, __m128i& out)
198 const __m128d two31 = _mm_set1_pd(
199 static_cast<ipReal64>(static_cast<unsigned int>(1) << 31));
201 const __m128d zero = _mm_xor_pd(f, f);
204 const __m128d overflow = _mm_cmpge_pd(f, two31);
206 const __m128d subval = _mm_and_pd(overflow, two31);
208 __m128i addval = _mm_cvttpd_epi32(overflow);
214 f = _mm_sub_pd(f, subval);
217 f = _mm_max_pd(f, zero);
220 result = _mm_cvttpd_epi32(f);
223 out = _mm_add_epi32(result, addval);
229 IPSDK_FORCEINLINE __m128i _custom_mm_packus_epi32(__m128i v0, __m128i v1)
231 v0 = _mm_slli_epi32 (v0, 16);
232 v0 = _mm_srai_epi32 (v0, 16);
233 v1 = _mm_slli_epi32 (v1, 16);
234 v1 = _mm_srai_epi32 (v1, 16);
235 return _mm_packs_epi32 (v0, v1);
238 IPSDK_FORCEINLINE
void _custom_mm_packus_epi32(__m128i v0, __m128i v1, __m128i& out)
240 v0 = _mm_slli_epi32 (v0, 16);
241 v0 = _mm_srai_epi32 (v0, 16);
242 v1 = _mm_slli_epi32 (v1, 16);
243 v1 = _mm_srai_epi32 (v1, 16);
244 out = _mm_packs_epi32 (v0, v1);
// Specialization enabled when TIn and TOut are the same type, or when both
// are integral types of identical size.
// NOTE(review): the struct header and the body of the function are not
// visible in this listing; presumably a plain register copy — confirm
// against the original header.
252 template <
typename TIn,
typename TOut>
254 typename boost::enable_if_c<
255 boost::is_same<TIn, TOut>::value || (boost::is_integral<TIn>::value
256 && boost::is_integral<TOut>::value && sizeof(TIn)==sizeof(TOut))
260 static IPSDK_FORCEINLINE
// Widening of sixteen 8-bit lanes to 16-bit lanes by zero extension.
// The enabling condition compares sizeof(TOut) through boost::mpl::equal_to;
// NOTE(review): the rest of the condition and the function signature are not
// visible in this listing — presumably an unsigned 8-bit input, confirm.
270 template <
typename TOut>
272 typename boost::enable_if<
273 typename boost::mpl::equal_to<
274 boost::mpl::int_<sizeof(TOut)>,
280 static IPSDK_FORCEINLINE
// interleaving with zero bytes zero-extends each byte to 16 bits:
// outl receives the lower eight input bytes, outh the upper eight
285 outl = _mm_unpacklo_epi8(in, _mm_set1_epi8(0));
286 outh = _mm_unpackhi_epi8(in, _mm_set1_epi8(0));
// Widening of sixteen 8-bit lanes to 16-bit lanes by sign extension,
// enabled for 2-byte output types.
// NOTE(review): the struct header and function signature are not visible in
// this listing — presumably a signed 8-bit input, confirm.
292 template <
typename TOut>
294 typename boost::enable_if_c<sizeof(TOut)==2>::type>
297 static IPSDK_FORCEINLINE
// duplicating each byte into both halves of a 16-bit lane and shifting
// right arithmetically by 8 leaves the byte sign-extended to 16 bits
302 outl = _mm_unpacklo_epi8(in, in);
303 outl = _mm_srai_epi16(outl, 8);
304 outh = _mm_unpackhi_epi8(in, in);
305 outh = _mm_srai_epi16(outh, 8);
// Converts four signed 32-bit integer lanes to single-precision floats.
// NOTE(review): struct header and function signature are not visible here.
314 static IPSDK_FORCEINLINE
318 out = _mm_cvtepi32_ps(in);
// Converts four unsigned 32-bit integer lanes to single-precision floats
// through the _custom_mm_cvtepu32_ps helper (SSE2 has no native unsigned
// conversion).
// NOTE(review): struct header and function signature are not visible here.
327 static IPSDK_FORCEINLINE
331 out = _custom_mm_cvtepu32_ps(in);
// Converts four single-precision floats to signed 32-bit integers with
// truncation toward zero.
// NOTE(review): struct header and function signature are not visible here.
340 static IPSDK_FORCEINLINE
344 out = _mm_cvttps_epi32(in);
// Converts four single-precision floats to unsigned 32-bit integers with
// truncation, through the _custom_mm_cvttps_epu32 helper.
// NOTE(review): struct header and function signature are not visible here.
353 static IPSDK_FORCEINLINE
357 _custom_mm_cvttps_epu32(in, out);
// Converts four signed 32-bit integer lanes to two registers of doubles.
// NOTE(review): struct header and function signature are not visible here.
366 static IPSDK_FORCEINLINE
// _mm_cvtepi32_pd converts the two lowest lanes only
371 outl = _mm_cvtepi32_pd(in);
// shift the register right by 8 bytes to bring the two upper lanes down
372 outh = _mm_cvtepi32_pd(_mm_srli_si128(in, 8));
// Converts four unsigned 32-bit integer lanes to two registers of doubles
// through the _custom_mm_cvtepu32_pd helper.
// NOTE(review): struct header, function signature and lines 382-395 of the
// original file are not visible here.
381 static IPSDK_FORCEINLINE
// the helper converts the two lowest lanes only
396 outl = _custom_mm_cvtepu32_pd(in);
// shift the register right by 8 bytes to bring the two upper lanes down
397 outh = _custom_mm_cvtepu32_pd(_mm_srli_si128(in, 8));
// Merges two partial conversion results into a single register.
// NOTE(review): the statements producing outl/outh are not visible in this
// listing — presumably signed truncating double-to-int32 conversions whose
// results sit in the lower half with a zeroed upper half; confirm.
406 static IPSDK_FORCEINLINE
// outh is moved to the upper 8 bytes and OR-ed with outl
413 out = _mm_or_si128(outl, _mm_slli_si128(outh, 8));
// Converts two registers of doubles to four unsigned 32-bit integers with
// truncation, through the _custom_mm_cvttpd_epu32 helper.
// NOTE(review): struct header, function signature and the declarations of
// outl/outh are not visible here.
422 static IPSDK_FORCEINLINE
// each call fills the two lowest 32-bit lanes of its output
428 _custom_mm_cvttpd_epu32(inl, outl);
429 _custom_mm_cvttpd_epu32(inh, outh);
// outh is moved to the upper 8 bytes and OR-ed with outl
430 out = _mm_or_si128(outl, _mm_slli_si128(outh, 8));
// Widening of four 32-bit lanes to 64-bit integral lanes by sign extension,
// enabled for 8-byte integral output types.
// NOTE(review): the struct header, the function signature and the start of
// the statement that ends on line 448 are not visible in this listing;
// presumably `hi` receives the comparison result — confirm.
436 template <
typename TOut>
438 typename boost::enable_if_c<sizeof(TOut)==8 &&
439 boost::is_integral<TOut>::value>::type>
441 static IPSDK_FORCEINLINE
// all-ones in negative lanes, zero otherwise: the upper word of the
// sign-extended value
448 _mm_cmplt_epi32(in, zero);
// interleave each value with its sign word to build 64-bit lanes
449 outl = _mm_unpacklo_epi32(in, hi);
450 outh = _mm_unpackhi_epi32(in, hi);
// Widening of four 32-bit lanes to 64-bit integral lanes by zero extension,
// enabled for 8-byte integral output types.
// NOTE(review): struct header and function signature are not visible here —
// presumably an unsigned 32-bit input, confirm.
456 template <
typename TOut>
458 typename boost::enable_if_c<sizeof(TOut)==8 &&
459 boost::is_integral<TOut>::value>::type>
461 static IPSDK_FORCEINLINE
// interleaving with zero words zero-extends each value to 64 bits
466 outl = _mm_unpacklo_epi32(in, _mm_set1_epi32(0));
467 outh = _mm_unpackhi_epi32(in, _mm_set1_epi32(0));
// Widening of eight 16-bit lanes to 32-bit integral lanes by sign extension,
// enabled for 4-byte integral output types.
// NOTE(review): struct header and function signature are not visible here —
// presumably a signed 16-bit input, confirm.
473 template <
typename TOut>
475 typename boost::enable_if_c<sizeof(TOut)==4
476 && boost::is_integral<TOut>::value>::type>
478 static IPSDK_FORCEINLINE
// duplicate each 16-bit value into both halves of a 32-bit lane...
483 outl = _mm_unpacklo_epi16(in, in);
484 outh = _mm_unpackhi_epi16(in, in);
// ...then shift right arithmetically by 16 to leave it sign-extended
485 outl = _mm_srai_epi32(outl, 16);
486 outh = _mm_srai_epi32(outh, 16);
// Widening of eight 16-bit lanes to 32-bit integral lanes by zero extension,
// enabled for 4-byte integral output types.
// NOTE(review): struct header and function signature are not visible here —
// presumably an unsigned 16-bit input, confirm.
492 template <
typename TOut>
494 typename boost::enable_if_c<sizeof(TOut)==4 &&
495 boost::is_integral<TOut>::value>::type>
497 static IPSDK_FORCEINLINE
// interleaving with zero half-words zero-extends each value to 32 bits
502 outl = _mm_unpacklo_epi16(in, _mm_set1_epi16(0));
503 outh = _mm_unpackhi_epi16(in, _mm_set1_epi16(0));
// Specialization enabled for 2-byte integral input types.
// NOTE(review): the struct header, the output type and the whole function
// body (original lines 515-530) are not visible in this listing.
509 template <
typename TIn>
511 typename boost::enable_if_c<sizeof(TIn)==2 &&
512 boost::is_integral<TIn>::value>::type>
514 static IPSDK_FORCEINLINE
// Converts four single-precision floats to two registers of doubles.
// NOTE(review): struct header and function signature are not visible here.
531 static IPSDK_FORCEINLINE
// _mm_cvtps_pd converts the two lowest lanes only
536 outl = _mm_cvtps_pd(in);
// the shuffle moves lanes 2 and 3 into the two lowest positions first
537 outh = _mm_cvtps_pd(_mm_shuffle_ps(in, in, _MM_SHUFFLE(3, 2, 3, 2)));
// Narrowing of two registers of 16-bit lanes into sixteen 8-bit lanes,
// enabled for 2-byte input types.
// NOTE(review): struct header and function signature are not visible here —
// presumably an unsigned 8-bit output, confirm.
544 template <
typename TIn>
546 typename boost::enable_if_c<sizeof(TIn)==2>::type
549 static IPSDK_FORCEINLINE
// inputs are read as signed 16-bit values and saturated to [0, 255]
554 out = _mm_packus_epi16(inl, inh);
// Narrowing of two registers of 16-bit lanes into sixteen 8-bit lanes,
// enabled for 2-byte input types.
// NOTE(review): struct header and function signature are not visible here —
// presumably a signed 8-bit output, confirm.
560 template <
typename TIn>
562 typename boost::enable_if_c<sizeof(TIn)==2>::type>
564 static IPSDK_FORCEINLINE
// inputs are read as signed 16-bit values and saturated to [-128, 127]
569 out = _mm_packs_epi16(inl, inh);
// Narrowing of two registers of 32-bit lanes into eight 16-bit lanes,
// enabled for 4-byte integral input types.
// NOTE(review): struct header and function signature are not visible here —
// presumably a signed 16-bit output, confirm.
575 template <
typename TIn>
577 typename boost::enable_if_c<sizeof(TIn)==4 && boost::is_integral<TIn>::value>::type>
579 static IPSDK_FORCEINLINE
// inputs are read as signed 32-bit values and saturated to [-32768, 32767]
584 out = _mm_packs_epi32(inl, inh);
// Narrowing of two registers of 32-bit lanes into eight 16-bit lanes,
// enabled for 4-byte integral input types.
// NOTE(review): struct header and function signature are not visible here —
// presumably an unsigned 16-bit output, confirm.
590 template <
typename TIn>
592 typename boost::enable_if_c<sizeof(TIn)==4 && boost::is_integral<TIn>::value>::type>
594 static IPSDK_FORCEINLINE
// the helper keeps the low 16 bits of each lane (no saturation)
599 out = _custom_mm_packus_epi32(inl, inh);
// Specialization enabled for 2-byte output types.
// NOTE(review): the struct header, the function signature and most of the
// body (original lines 610-619) are not visible in this listing. The
// visible tail suggests the inputs are first converted to two 32-bit
// integer registers (inlInt32, inhInt32) that are then narrowed into `out`
// — confirm against the original header.
605 template <
typename TOut>
607 typename boost::enable_if_c<sizeof(TOut)==2>::type>
609 static IPSDK_FORCEINLINE
620 inlInt32, inhInt32, out);
// Converts two registers of doubles to one register of four floats.
// NOTE(review): struct header and function signature are not visible here.
629 static IPSDK_FORCEINLINE
// each _mm_cvtpd_ps fills the two lowest float lanes; _mm_movelh_ps places
// the pair from inl in the lower half and the pair from inh in the upper half
634 out = _mm_movelh_ps(_mm_cvtpd_ps(inl), _mm_cvtpd_ps(inh));
645 #endif // __IPSDKUTIL_INSTRUCTIONSET_DETAIL_SSE2_CASTREG_H__
int8_t ipInt8
Base types definition.
Definition: BaseTypes.h:48
Defines the IPSDK_FORCEINLINE.
Main namespace for IPSDK library.
Definition: AlgorithmFunctionEfficiency.h:22
double ipReal64
Base types definition.
Definition: BaseTypes.h:57
int32_t ipInt32
Base types definition.
Definition: BaseTypes.h:52
int16_t ipInt16
Base types definition.
Definition: BaseTypes.h:50
uint8_t ipUInt8
Base types definition.
Definition: BaseTypes.h:49
eInstructionSet
Enumeration describing the processor instruction set.
Definition: InstructionSetTypes.h:31
Definition of import/export macro for library.
Streaming SIMD Extensions 2.
Definition: InstructionSetTypes.h:36
Structure used to retrieve the SSE2 type associated with a base type.
Definition: Sse2Types.h:32
uint16_t ipUInt16
Base types definition.
Definition: BaseTypes.h:51
float ipReal32
Base types definition.
Definition: BaseTypes.h:56
uint32_t ipUInt32
Base types definition.
Definition: BaseTypes.h:53