// Include guard opening, followed by the value-returning conversion of packed
// unsigned 32-bit integers to packed single-precision floats.
// The signed conversion _mm256_cvtepi32_ps is the only one used here, so each
// 32-bit lane is split into two 16-bit halves (both non-negative when read as
// signed integers), each half is converted, and the halves are recombined in
// floating point.
15 #ifndef __IPSDKUTIL_INSTRUCTIONSET_DETAIL_AVX2_CASTREG_H__ 16 #define __IPSDKUTIL_INSTRUCTIONSET_DETAIL_AVX2_CASTREG_H__ 28 __m256 _custom_mm256_cvtepu32_ps(
const __m256i& v)
// 2^16: weight of the upper half once it has been shifted down to bit 0.
30 const __m256 two16 = _mm256_set1_ps(65536.0f);
// Upper 16 bits of every lane, moved to the low end (logical shift, zero fill).
34 const __m256i hi = _mm256_srli_epi32(v, 16);
// Lower 16 bits of every lane: shifting left then right clears the upper half.
35 const __m256i lo = _mm256_srli_epi32(_mm256_slli_epi32(v, 16), 16);
// Convert the upper half and restore its weight.
36 const __m256 fHi = _mm256_mul_ps(_mm256_cvtepi32_ps(hi), two16);
// Convert the lower half.
38 const __m256 fLo = _mm256_cvtepi32_ps(lo);
// Sum of both contributions is the converted lane value.
41 return _mm256_add_ps(fHi, fLo);
45 void _custom_mm256_cvtepu32_ps(
const __m256i& v, __m256& out)
47 const __m256 two16 = _mm256_set1_ps(65536.0f);
51 const __m256i hi = _mm256_srli_epi32(v, 16);
52 const __m256i lo = _mm256_srli_epi32(_mm256_slli_epi32(v, 16), 16);
53 const __m256 fHi = _mm256_mul_ps(_mm256_cvtepi32_ps(hi), two16);
55 const __m256 fLo = _mm256_cvtepi32_ps(lo);
58 out = _mm256_add_ps(fHi, fLo);
/// @brief Converts packed single-precision floats to packed unsigned 32-bit
///        integers, truncating toward zero.
///
/// Only the float -> signed int32 conversion is used. Lanes strictly above
/// 2^31 are lowered by 2^31 before that conversion and 2^31 is added back in
/// the integer domain afterwards. Negative lanes and NaN lanes yield 0. Lanes
/// of 2^32 and above do not fit the destination and are not saturated (they
/// come out as 0).
///
/// @param[in] f  register holding eight float lanes
/// @return register holding the eight truncated unsigned 32-bit lanes
__m256i _custom_mm256_cvttps_epu32(
    const __m256& f)
{
    const __m256 two31 = _mm256_set1_ps(2147483648.0f);

    // All-ones in every lane strictly greater than 2^31, zero elsewhere.
    // A lane equal to 2^31 is not flagged: the signed conversion below then
    // returns 0x80000000 for it, which is exactly the expected bit pattern.
    const __m256 overflow = _mm256_cmp_ps(f, two31, _CMP_GT_OQ);

    // Bring flagged lanes back into the signed range.
    const __m256 reduced = _mm256_sub_ps(f, _mm256_and_ps(overflow, two31));

    // Clamp negative lanes to zero (a NaN lane also becomes zero because the
    // max operation returns its second operand when an operand is NaN).
    const __m256 clamped = _mm256_max_ps(reduced, _mm256_setzero_ps());

    const __m256i truncated = _mm256_cvttps_epi32(clamped);

    // Turn the all-ones mask into the integer 2^31 (0x80000000) with a plain
    // shift instead of feeding the mask, which is a NaN bit pattern, to the
    // float -> int converter.
    const __m256i correction =
        _mm256_slli_epi32(_mm256_castps_si256(overflow), 31);

    return _mm256_add_epi32(truncated, correction);
}
86 void _custom_mm256_cvttps_epu32(
const __m256& f, __m256i& out)
88 const __m256 two31 = _mm256_set1_ps(2147483648.0f);
91 const __m256 overflow = _mm256_cmp_ps(f, two31, _CMP_GT_OQ);
94 __m256 result_ps = _mm256_sub_ps(f, _mm256_and_ps(overflow, two31));
97 result_ps = _mm256_max_ps(result_ps, _mm256_set1_ps(.0f));
100 __m256i result = _mm256_cvttps_epi32(result_ps);
103 out = _mm256_add_epi32(result, _mm256_cvttps_epi32(overflow));
/// @brief Converts the four lower unsigned 32-bit integer lanes of a 256-bit
///        register to four packed doubles.
///
/// Only the signed int32 -> double conversion is used, so every lane is split
/// into two 16-bit halves (both non-negative as signed integers), each half
/// is converted and the halves are recombined in floating point.
/// The upper 128 bits of @p v do not contribute to the result.
///
/// @param[in] v  register whose lower four 32-bit lanes are converted
/// @return register holding the four converted double lanes
__m256d _custom_mm256_cvtepu32_pd(
    const __m256i& v)
{
    // A __m256d only has room for four lanes: narrow first so the integer
    // work is not done on lanes that are thrown away.
    const __m128i low = _mm256_castsi256_si128(v);

    // Upper and lower 16 bits of every lane.
    const __m128i hi = _mm_srli_epi32(low, 16);
    const __m128i lo = _mm_and_si128(low, _mm_set1_epi32(0xFFFF));

    // 2^16: weight of the upper half once shifted down to bit 0.
    const __m256d two16 = _mm256_set1_pd(65536.0);

    const __m256d fHi = _mm256_mul_pd(_mm256_cvtepi32_pd(hi), two16);
    const __m256d fLo = _mm256_cvtepi32_pd(lo);

    return _mm256_add_pd(fHi, fLo);
}
127 void _custom_mm256_cvtepu32_pd(
const __m256i& v, __m256d& out)
129 const __m256d two16 = _mm256_set1_pd(65536.0f);
133 const __m256i hi = _mm256_srli_epi32(v, 16);
134 const __m256i lo = _mm256_srli_epi32(_mm256_slli_epi32(v, 16), 16);
135 const __m256d fHi = _mm256_mul_pd(_mm256_cvtepi32_pd(_mm256_castsi256_si128(hi)), two16);
137 const __m256d fLo = _mm256_cvtepi32_pd(_mm256_castsi256_si128(lo));
140 out = _mm256_add_pd(fHi, fLo);
/// @brief Converts four packed doubles to four packed unsigned 32-bit
///        integers, truncating toward zero.
///
/// Only the double -> signed int32 conversion is used. Lanes strictly above
/// 2^31 are lowered by 2^31 before that conversion and 2^31 is added back in
/// the integer domain afterwards. Negative lanes and NaN lanes yield 0. Lanes
/// of 2^32 and above do not fit the destination and are not saturated (they
/// come out as 0).
///
/// @param[in] f  register holding four double lanes
/// @return register holding the four truncated unsigned 32-bit lanes
__m128i _custom_mm256_cvttpd_epu32(
    const __m256d& f)
{
    const __m256d two31 = _mm256_set1_pd(2147483648.0);

    // All-ones in every lane strictly greater than 2^31, zero elsewhere.
    // A lane equal to 2^31 is not flagged: the signed conversion below then
    // returns 0x80000000 for it, which is exactly the expected bit pattern.
    const __m256d overflow = _mm256_cmp_pd(f, two31, _CMP_GT_OQ);

    // Bring flagged lanes back into the signed range.
    const __m256d reduced = _mm256_sub_pd(f, _mm256_and_pd(overflow, two31));

    // Clamp negative lanes to zero (a NaN lane also becomes zero because the
    // max operation returns its second operand when an operand is NaN).
    const __m256d clamped = _mm256_max_pd(reduced, _mm256_setzero_pd());

    const __m128i truncated = _mm256_cvttpd_epi32(clamped);

    // The all-ones mask is a NaN bit pattern; the converter answers an
    // invalid operand with 0x80000000, i.e. the integer 2^31, and answers the
    // all-zero mask with 0. This yields the per-lane correction directly in
    // 32-bit lanes.
    const __m128i correction = _mm256_cvttpd_epi32(overflow);

    return _mm_add_epi32(truncated, correction);
}
170 void _custom_mm256_cvttpd_epu32(
const __m256d& f, __m128i& out)
172 const __m256d two31 = _mm256_set1_pd(2147483648.0f);
175 const __m256d overflow = _mm256_cmp_pd(f, two31, _CMP_GT_OQ);
178 __m256d result_pd = _mm256_sub_pd(f, _mm256_and_pd(overflow, two31));
181 result_pd = _mm256_max_pd(result_pd, _mm256_set1_pd(.0f));
184 const __m128i result = _mm256_cvttpd_epi32(result_pd);
187 const __m128i overflowi = _mm256_cvttpd_epi32(overflow);
189 out = _mm_add_epi32(result, overflowi);
// Specialization selector: enabled when TIn and TOut are the same type, or
// when both are integral types of identical size.
// NOTE(review): the struct head and the member function that follows the
// IPSDK_FORCEINLINE line are not part of this excerpt.
200 template <
typename TIn,
typename TOut>
202 typename boost::enable_if_c<
203 boost::is_same<TIn, TOut>::value || (boost::is_integral<TIn>::value
204 && boost::is_integral<TOut>::value && sizeof(TIn)==sizeof(TOut))
208 static IPSDK_FORCEINLINE
// Widening of 8-bit elements to 16-bit elements by zero extension, split over
// two output registers (outl, outh).
// NOTE(review): the enable_if condition compares sizeof(TOut) with a value
// that is not visible in this excerpt; the input element type is not visible
// either - zero extension suggests an unsigned 8-bit source, confirm.
218 template <
typename TOut>
220 typename boost::enable_if<
221 typename boost::mpl::equal_to<
222 boost::mpl::int_<sizeof(TOut)>,
228 static IPSDK_FORCEINLINE
// The unpack instructions work inside each 128-bit half. Reordering the four
// 64-bit quarters from (q0,q1,q2,q3) to (q0,q2,q1,q3) beforehand makes the
// "lo" unpack consume q0 and q1 and the "hi" unpack consume q2 and q3, so the
// outputs keep the elements in their original order.
233 __m256i inPermuted = _mm256_permute4x64_epi64(in, _MM_SHUFFLE(3, 1, 2, 0));
// Interleave every byte with a zero byte: the zero becomes the upper byte of
// the resulting 16-bit element.
234 outl = _mm256_unpacklo_epi8(inPermuted, _mm256_set1_epi8(0));
235 outh = _mm256_unpackhi_epi8(inPermuted, _mm256_set1_epi8(0));
// Widening of 8-bit elements to 16-bit elements by sign extension, split over
// two output registers (outl, outh). Enabled for 2-byte TOut.
// NOTE(review): the input element type is not visible in this excerpt; sign
// extension suggests a signed 8-bit source, confirm.
241 template <
typename TOut>
243 typename boost::enable_if_c<sizeof(TOut)==2>::type>
245 static IPSDK_FORCEINLINE
// (q0,q1,q2,q3) -> (q0,q2,q1,q3) so that the per-128-bit unpacks below
// deliver the elements in their original order.
250 const int permuteMask = _MM_SHUFFLE(3, 1, 2, 0);
251 __m256i inPermuted = _mm256_permute4x64_epi64(in, permuteMask);
// Duplicate every byte into both halves of a 16-bit element, then shift
// arithmetically right by 8: the upper byte is replaced by copies of the sign
// bit and the lower byte holds the original value.
252 outl = _mm256_unpacklo_epi8(inPermuted, inPermuted);
253 outl = _mm256_srai_epi16(outl, 8);
254 outh = _mm256_unpackhi_epi8(inPermuted, inPermuted);
255 outh = _mm256_srai_epi16(outh, 8);
// Packed signed 32-bit integers -> packed single-precision floats, one
// register in, one register out.
264 static IPSDK_FORCEINLINE
268 out = _mm256_cvtepi32_ps(in);
// Packed unsigned 32-bit integers -> packed single-precision floats, using
// the value-returning _custom_mm256_cvtepu32_ps helper defined in this file.
277 static IPSDK_FORCEINLINE
281 out = _custom_mm256_cvtepu32_ps(in);
// Packed single-precision floats -> packed signed 32-bit integers with
// truncation toward zero.
290 static IPSDK_FORCEINLINE
294 out = _mm256_cvttps_epi32(in);
// Packed single-precision floats -> packed unsigned 32-bit integers, using
// the out-parameter _custom_mm256_cvttps_epu32 helper defined in this file.
303 static IPSDK_FORCEINLINE
307 _custom_mm256_cvttps_epu32(in, out);
// Packed signed 32-bit integers -> packed doubles. A 256-bit register holds
// eight 32-bit lanes but only four doubles, hence two output registers.
316 static IPSDK_FORCEINLINE
// Lower 128 bits of the input give the first four doubles.
321 outl = _mm256_cvtepi32_pd(_mm256_castsi256_si128(in));
// Upper 128 bits of the input give the last four doubles.
322 outh = _mm256_cvtepi32_pd(_mm256_extractf128_si256(in, 1));
// Packed unsigned 32-bit integers -> packed doubles, split over two output
// registers, using the _custom_mm256_cvtepu32_pd helper (which converts the
// lower 128 bits of its argument).
331 static IPSDK_FORCEINLINE
// First four lanes: the helper reads the lower 128 bits of in.
336 outl = _custom_mm256_cvtepu32_pd(in);
// Move the upper 128 bits of in into the lower 128 bits of inShift.
// NOTE(review): inShift is read here as the first operand; no earlier
// assignment to it is visible in this excerpt. If it is uninitialized, this
// reads an indeterminate value (only its upper half survives and the helper
// ignores that half) - consider _mm256_castsi128_si256 on the extracted half.
338 inShift = _mm256_inserti128_si256(inShift, _mm256_extractf128_si256(in, 1), 0);
// NOTE(review): the result of this byte shift is discarded, so the statement
// has no effect; it looks like a leftover and can probably be removed.
339 _mm256_srli_si256(in, 8);
// Last four lanes.
340 outh = _custom_mm256_cvtepu32_pd(inShift);
// Two registers of packed doubles -> one register of packed signed 32-bit
// integers, truncating toward zero.
349 static IPSDK_FORCEINLINE
// Each conversion produces four 32-bit lanes in a 128-bit register.
354 const __m128i outl = _mm256_cvttpd_epi32(inl);
355 const __m128i outh = _mm256_cvttpd_epi32(inh);
// Assemble the result: inl fills the lower 128 bits, inh the upper 128 bits.
// The first insert takes the previous content of out as its base operand;
// both halves are overwritten by the two inserts.
356 out = _mm256_inserti128_si256(out, outl, 0);
357 out = _mm256_inserti128_si256(out, outh, 1);
// Two registers of packed doubles -> one register of packed unsigned 32-bit
// integers, using the value-returning _custom_mm256_cvttpd_epu32 helper
// defined in this file.
366 static IPSDK_FORCEINLINE
// Each conversion produces four 32-bit lanes in a 128-bit register.
371 const __m128i outl = _custom_mm256_cvttpd_epu32(inl);
372 const __m128i outh = _custom_mm256_cvttpd_epu32(inh);
// Assemble the result: inl fills the lower 128 bits, inh the upper 128 bits.
// The first insert takes the previous content of out as its base operand;
// both halves are overwritten by the two inserts.
373 out = _mm256_inserti128_si256(out, outl, 0);
374 out = _mm256_inserti128_si256(out, outh, 1);
// Widening of 32-bit elements to 64-bit elements, split over two output
// registers. Enabled for 8-byte integral TOut.
// NOTE(review): the statements that declare inPermuted and compute hi are not
// part of this excerpt; hi presumably carries the sign extension word of each
// element - confirm against the full source.
380 template <
typename TOut>
382 typename boost::enable_if_c<sizeof(TOut)==8 &&
383 boost::is_integral<TOut>::value>::type>
385 static IPSDK_FORCEINLINE
// (q0,q1,q2,q3) -> (q0,q2,q1,q3) so that the per-128-bit unpacks below
// deliver the elements in their original order.
391 _mm256_permute4x64_epi64(in, _MM_SHUFFLE(3, 1, 2, 0));
// Interleave every 32-bit element with the matching element of hi, which
// becomes the upper half of the 64-bit result.
396 outl = _mm256_unpacklo_epi32(inPermuted, hi);
397 outh = _mm256_unpackhi_epi32(inPermuted, hi);
// Widening of 32-bit elements to 64-bit elements, split over two output
// registers. Enabled for 8-byte integral TOut.
// NOTE(review): the statements that declare inPermuted and zero are not part
// of this excerpt; zero is presumably an all-zero register, which would make
// this a zero extension - confirm against the full source.
403 template <
typename TOut>
405 typename boost::enable_if_c<sizeof(TOut)==8 &&
406 boost::is_integral<TOut>::value>::type>
408 static IPSDK_FORCEINLINE
// (q0,q1,q2,q3) -> (q0,q2,q1,q3) so that the per-128-bit unpacks below
// deliver the elements in their original order.
414 _mm256_permute4x64_epi64(in, _MM_SHUFFLE(3, 1, 2, 0));
// Interleave every 32-bit element with the matching element of zero, which
// becomes the upper half of the 64-bit result.
416 outl = _mm256_unpacklo_epi32(inPermuted, zero);
417 outh = _mm256_unpackhi_epi32(inPermuted, zero);
// Widening of 16-bit elements to 32-bit elements by sign extension, split
// over two output registers. Enabled for 4-byte integral TOut.
// NOTE(review): the input element type is not visible in this excerpt; sign
// extension suggests a signed 16-bit source, confirm.
423 template <
typename TOut>
425 typename boost::enable_if_c<sizeof(TOut)==4
426 && boost::is_integral<TOut>::value>::type>
428 static IPSDK_FORCEINLINE
// (q0,q1,q2,q3) -> (q0,q2,q1,q3) so that the per-128-bit unpacks below
// deliver the elements in their original order.
433 __m256i inPermuted = _mm256_permute4x64_epi64(in, _MM_SHUFFLE(3, 1, 2, 0));
// Duplicate every 16-bit element into both halves of a 32-bit element ...
435 outl = _mm256_unpacklo_epi16(inPermuted, inPermuted);
436 outh = _mm256_unpackhi_epi16(inPermuted, inPermuted);
// ... then shift arithmetically right by 16: the upper half is replaced by
// copies of the sign bit and the lower half holds the original value.
437 outl = _mm256_srai_epi32(outl, 16);
438 outh = _mm256_srai_epi32(outh, 16);
// Widening of 16-bit elements to 32-bit elements by zero extension, split
// over two output registers. Enabled for 4-byte integral TOut.
// NOTE(review): the input element type is not visible in this excerpt; zero
// extension suggests an unsigned 16-bit source, confirm.
444 template <
typename TOut>
446 typename boost::enable_if_c<sizeof(TOut)==4 &&
447 boost::is_integral<TOut>::value>::type>
449 static IPSDK_FORCEINLINE
// (q0,q1,q2,q3) -> (q0,q2,q1,q3) so that the per-128-bit unpacks below
// deliver the elements in their original order.
454 __m256i inPermuted = _mm256_permute4x64_epi64(in, _MM_SHUFFLE(3, 1, 2, 0));
// Interleave every 16-bit element with a zero element: the zero becomes the
// upper half of the resulting 32-bit element.
456 outl = _mm256_unpacklo_epi16(inPermuted, _mm256_set1_epi16(0));
457 outh = _mm256_unpackhi_epi16(inPermuted, _mm256_set1_epi16(0));
// Specialization selector: enabled for 2-byte integral TIn.
// NOTE(review): the struct head and the member function that follows the
// IPSDK_FORCEINLINE line are not part of this excerpt, so the destination
// type and the conversion steps cannot be described from here.
463 template <
typename TIn>
465 typename boost::enable_if_c<sizeof(TIn)==2 &&
466 boost::is_integral<TIn>::value>::type>
468 static IPSDK_FORCEINLINE
// Packed single-precision floats -> packed doubles. A 256-bit register holds
// eight floats but only four doubles, hence two output registers.
485 static IPSDK_FORCEINLINE
// Lower four floats.
490 outl = _mm256_cvtps_pd(_mm256_extractf128_ps(in, 0));
// Upper four floats.
491 outh = _mm256_cvtps_pd(_mm256_extractf128_ps(in, 1));
// Narrowing of two registers of 16-bit elements to one register of 8-bit
// elements with unsigned saturation. Enabled for 2-byte TIn.
// NOTE(review): _mm256_packus_epi16 reads its inputs as SIGNED 16-bit values
// and saturates to [0, 255]. The visible condition only checks sizeof(TIn),
// so if an unsigned 16-bit TIn reaches this code, values above 32767 would be
// treated as negative and saturate to 0 - confirm against the full source.
498 template <
typename TIn>
500 typename boost::enable_if_c<sizeof(TIn)==2>::type
503 static IPSDK_FORCEINLINE
// The pack works inside each 128-bit half, producing the 64-bit quarters
// (inl.lo, inh.lo, inl.hi, inh.hi).
508 out = _mm256_packus_epi16(inl, inh);
// Swap the two middle quarters to restore (inl.lo, inl.hi, inh.lo, inh.hi).
509 out = _mm256_permute4x64_epi64(out, _MM_SHUFFLE(3, 1, 2, 0));
// Narrowing of two registers of 16-bit elements to one register of 8-bit
// elements with signed saturation. Enabled for 2-byte TIn.
// NOTE(review): _mm256_packs_epi16 reads its inputs as SIGNED 16-bit values
// and saturates to [-128, 127]. The visible condition only checks
// sizeof(TIn), so if an unsigned 16-bit TIn reaches this code, values above
// 32767 would saturate to -128 instead of 127 - confirm against the full
// source.
515 template <
typename TIn>
517 typename boost::enable_if_c<sizeof(TIn)==2>::type>
519 static IPSDK_FORCEINLINE
// The pack works inside each 128-bit half, producing the 64-bit quarters
// (inl.lo, inh.lo, inl.hi, inh.hi).
524 out = _mm256_packs_epi16(inl, inh);
// Swap the two middle quarters to restore (inl.lo, inl.hi, inh.lo, inh.hi).
525 out = _mm256_permute4x64_epi64(out, _MM_SHUFFLE(3, 1, 2, 0));
// Narrowing of two registers of 32-bit elements to one register of 16-bit
// elements with signed saturation. Enabled for 4-byte integral TIn.
// NOTE(review): _mm256_packs_epi32 reads its inputs as SIGNED 32-bit values
// and saturates to [-32768, 32767]. The visible condition does not
// distinguish signed from unsigned TIn; an unsigned input above 2^31 - 1
// would saturate to -32768 - confirm against the full source.
531 template <
typename TIn>
533 typename boost::enable_if_c<sizeof(TIn)==4 && boost::is_integral<TIn>::value>::type>
535 static IPSDK_FORCEINLINE
// The pack works inside each 128-bit half, producing the 64-bit quarters
// (inl.lo, inh.lo, inl.hi, inh.hi).
540 out = _mm256_packs_epi32(inl, inh);
// Swap the two middle quarters to restore (inl.lo, inl.hi, inh.lo, inh.hi).
541 out = _mm256_permute4x64_epi64(out, _MM_SHUFFLE(3, 1, 2, 0));
// Narrowing of two registers of 32-bit elements to one register of 16-bit
// elements with unsigned saturation. Enabled for 4-byte integral TIn.
// NOTE(review): _mm256_packus_epi32 reads its inputs as SIGNED 32-bit values
// and saturates to [0, 65535]. The visible condition does not distinguish
// signed from unsigned TIn; an unsigned input above 2^31 - 1 would saturate
// to 0 instead of 65535 - confirm against the full source.
547 template <
typename TIn>
549 typename boost::enable_if_c<sizeof(TIn)==4 && boost::is_integral<TIn>::value>::type>
551 static IPSDK_FORCEINLINE
// The pack works inside each 128-bit half, producing the 64-bit quarters
// (inl.lo, inh.lo, inl.hi, inh.hi).
556 out = _mm256_packus_epi32(inl, inh);
// Swap the two middle quarters to restore (inl.lo, inl.hi, inh.lo, inh.hi).
557 out = _mm256_permute4x64_epi64(out, _MM_SHUFFLE(3, 1, 2, 0));
// Specialization selector: enabled for 2-byte TOut.
// NOTE(review): only the tail of one call is visible in this excerpt. It
// passes two intermediates named inlInt32 / inhInt32 together with out, which
// suggests a two-step conversion through 32-bit integers - confirm against
// the full source.
563 template <
typename TOut>
565 typename boost::enable_if_c<sizeof(TOut)==2>::type>
567 static IPSDK_FORCEINLINE
578 inlInt32, inhInt32, out);
// Two registers of packed doubles -> one register of packed single-precision
// floats.
587 static IPSDK_FORCEINLINE
// Four floats from inl go to the lower 128 bits; the cast leaves the upper
// 128 bits undefined until the next statement fills them.
592 out = _mm256_castps128_ps256(_mm256_cvtpd_ps(inl));
// Four floats from inh go to the upper 128 bits.
593 out = _mm256_insertf128_ps(out, _mm256_cvtpd_ps(inh), 1);
634 #endif // __IPSDKUTIL_INSTRUCTIONSET_DETAIL_AVX2_CASTREG_H__
int8_t ipInt8
Base types definition.
Definition: BaseTypes.h:48
Defines the IPSDK_FORCEINLINE.
Main namespace for IPSDK library.
Definition: AlgorithmFunctionEfficiency.h:22
double ipReal64
Base types definition.
Definition: BaseTypes.h:57
int32_t ipInt32
Base types definition.
Definition: BaseTypes.h:52
int16_t ipInt16
Base types definition.
Definition: BaseTypes.h:50
uint8_t ipUInt8
Base types definition.
Definition: BaseTypes.h:49
eInstructionSet
Enumerate for processor instruction set description.
Definition: InstructionSetTypes.h:31
Advanced Vector Extensions 2.
Definition: InstructionSetTypes.h:48
Definition of import/export macro for library.
structure used to retrieve AVX type associated to a base type
Definition: AvxTypes.h:33
Definition: IsLessRegDecl.h:30
uint16_t ipUInt16
Base types definition.
Definition: BaseTypes.h:51
float ipReal32
Base types definition.
Definition: BaseTypes.h:56
uint32_t ipUInt32
Base types definition.
Definition: BaseTypes.h:53