#ifndef __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_AVX_AVX_MATHFUN_H__
#define __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_AVX_AVX_MATHFUN_H__

#include <immintrin.h>

/* 32-byte alignment attributes differ between MSVC and gcc/clang/icc. */
#ifdef _MSC_VER
# define ALIGN32_BEG __declspec(align(32))
# define ALIGN32_END
#else
# define ALIGN32_BEG
# define ALIGN32_END __attribute__((aligned(32)))
#endif

/* Short names for the vector types used throughout this header. */
typedef __m256  v8sf; // vector of 8 floats (AVX)
typedef __m256i v8si; // vector of 8 ints   (AVX)
typedef __m128i v4si; // vector of 4 ints   (SSE2)

/* Declare an aligned constant of 4 packed 32-bit ints (SSE2 fallback path). */
#define _PI32AVX_CONST(Name, Val) \
  static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = { Val, Val, Val, Val }

_PI32AVX_CONST(1, 1);
_PI32AVX_CONST(inv1, ~1);
_PI32AVX_CONST(2, 2);
_PI32AVX_CONST(4, 4);
/* Declare an aligned constant of 8 packed floats. */
#define _PS256_CONST(Name, Val) \
  static const ALIGN32_BEG float _ps256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val }
/* Declare an aligned constant of 8 packed 32-bit ints. */
#define _PI32_CONST256(Name, Val) \
  static const ALIGN32_BEG int _pi32_256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val }
/* Declare an aligned constant of 8 packed values of an arbitrary type (used for bit masks). */
#define _PS256_CONST_TYPE(Name, Type, Val) \
  static const ALIGN32_BEG Type _ps256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val }

_PS256_CONST(1  , 1.0f);
_PS256_CONST(0p5, 0.5f);
_PS256_CONST_TYPE(min_norm_pos, int, static_cast<int>(0x00800000));
_PS256_CONST_TYPE(mant_mask, int, static_cast<int>(0x7f800000));
_PS256_CONST_TYPE(inv_mant_mask, int, static_cast<int>(~0x7f800000));

_PS256_CONST_TYPE(sign_mask, int, static_cast<int>(0x80000000));
_PS256_CONST_TYPE(inv_sign_mask, int, static_cast<int>(~0x80000000));
_PI32_CONST256(0, 0);
_PI32_CONST256(1, 1);
_PI32_CONST256(inv1, ~1);
_PI32_CONST256(2, 2);
_PI32_CONST256(4, 4);
_PI32_CONST256(0x7f, 0x7f);
_PS256_CONST(cephes_SQRTHF, 0.707106781186547524f);
_PS256_CONST(cephes_log_p0, 7.0376836292E-2f);
_PS256_CONST(cephes_log_p1, -1.1514610310E-1f);
_PS256_CONST(cephes_log_p2, 1.1676998740E-1f);
_PS256_CONST(cephes_log_p3, -1.2420140846E-1f);
_PS256_CONST(cephes_log_p4, +1.4249322787E-1f);
_PS256_CONST(cephes_log_p5, -1.6668057665E-1f);
_PS256_CONST(cephes_log_p6, +2.0000714765E-1f);
_PS256_CONST(cephes_log_p7, -2.4999993993E-1f);
_PS256_CONST(cephes_log_p8, +3.3333331174E-1f);
_PS256_CONST(cephes_log_q1, -2.12194440e-4f);
_PS256_CONST(cephes_log_q2, 0.693359375f);
#if !(defined __linux__ && defined __AVX2__) && !(defined(_MSC_VER) && _MSC_VER >= 1700)

/* Helper union used to split a 256-bit integer vector into two 128-bit
   halves so that AVX2 integer ops can be emulated with SSE2. */
typedef union imm_xmm_union {
  v8si imm;
  v4si xmm[2];
} imm_xmm_union;

#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) {        \
    imm_xmm_union u __attribute__((aligned(32)));    \
    u.imm = imm_;                                    \
    xmm0_ = u.xmm[0];                                \
    xmm1_ = u.xmm[1];                                \
}

#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) {        \
    imm_xmm_union u __attribute__((aligned(32)));    \
    u.xmm[0]=xmm0_; u.xmm[1]=xmm1_; imm_ = u.imm;    \
}

/* Emulate an AVX2 shift-by-immediate on a 256-bit integer vector with two
   SSE2 shifts on the 128-bit halves. */
#define AVX2_BITOP_USING_SSE2(fn)                    \
inline v8si custom_mm256_##fn(v8si x, int a)         \
{                                                    \
  v4si x1, x2;                                       \
  v8si ret;                                          \
  COPY_IMM_TO_XMM(x, x1, x2);                        \
  x1 = _mm_##fn(x1, a);                              \
  x2 = _mm_##fn(x2, a);                              \
  COPY_XMM_TO_IMM(x1, x2, ret);                      \
  return ret;                                        \
}

AVX2_BITOP_USING_SSE2(slli_epi32)
AVX2_BITOP_USING_SSE2(srli_epi32)
/* Emulate an AVX2 element-wise integer op on 256-bit vectors with two SSE2
   ops on the 128-bit halves. */
#define AVX2_INTOP_USING_SSE2(fn)                    \
inline v8si custom_mm256_##fn(v8si x, v8si y)        \
{                                                    \
  v4si x1, x2, y1, y2;                               \
  v8si ret;                                          \
  COPY_IMM_TO_XMM(x, x1, x2);                        \
  COPY_IMM_TO_XMM(y, y1, y2);                        \
  x1 = _mm_##fn(x1, y1);                             \
  x2 = _mm_##fn(x2, y2);                             \
  COPY_XMM_TO_IMM(x1, x2, ret);                      \
  return ret;                                        \
}

AVX2_INTOP_USING_SSE2(sub_epi32)
AVX2_INTOP_USING_SSE2(add_epi32)

#endif
/* Natural logarithm computed for 8 floats at once.
   Lanes with x <= 0 do not produce a valid logarithm (see the final mask below). */
inline v8sf log256_ps(v8sf x) {
  v8si imm0;
  v8sf one = *(v8sf*)_ps256_1;
  v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS);

  /* cut off denormalized values */
  x = _mm256_max_ps(x, *(v8sf*)_ps256_min_norm_pos);

#if !(defined __linux__ && defined __AVX2__) && !(defined(_MSC_VER) && _MSC_VER >= 1700)
  imm0 = custom_mm256_srli_epi32(_mm256_castps_si256(x), 23);
#else
  imm0 = _mm256_srli_epi32(_mm256_castps_si256(x), 23);
#endif
  /* keep only the fractional part */
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_mant_mask);
  x = _mm256_or_ps(x, *(v8sf*)_ps256_0p5);

#if !(defined __linux__ && defined __AVX2__) && !(defined(_MSC_VER) && _MSC_VER >= 1700)
  imm0 = custom_mm256_sub_epi32(imm0, *(v8si*)_pi32_256_0x7f);
#else
  imm0 = _mm256_sub_epi32(imm0, *(v8si*)_pi32_256_0x7f);
#endif

  v8sf e = _mm256_cvtepi32_ps(imm0);
  e = _mm256_add_ps(e, one);
  /* if( x < SQRTHF ) { e -= 1; x = x + x - 1.0; } else { x = x - 1.0; } */
  v8sf mask = _mm256_cmp_ps(x, *(v8sf*)_ps256_cephes_SQRTHF, _CMP_LT_OS);
  v8sf tmp = _mm256_and_ps(x, mask);
  x = _mm256_sub_ps(x, one);
  e = _mm256_sub_ps(e, _mm256_and_ps(one, mask));
  x = _mm256_add_ps(x, tmp);

  v8sf z = _mm256_mul_ps(x, x);
  v8sf y = *(v8sf*)_ps256_cephes_log_p0;
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p1);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p2);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p3);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p4);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p5);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p6);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p7);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p8);
  y = _mm256_mul_ps(y, x);

  y = _mm256_mul_ps(y, z);
  tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q1);
  y = _mm256_add_ps(y, tmp);

  tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);

  tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q2);
  x = _mm256_add_ps(x, y);
  x = _mm256_add_ps(x, tmp);
  /* lanes where the input was <= 0 are forced to zero, valid lanes keep the result */
  x = _mm256_or_ps(
      _mm256_and_ps(_mm256_setzero_ps(), invalid_mask),
      _mm256_andnot_ps(invalid_mask, x));
  return x;
}
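/* Usage sketch (illustrative, not part of the original header): apply the
   vectorized natural logarithm to a buffer, 8 floats per iteration.  The
   helper name and the assumption that 'n' is a multiple of 8 are ours. */
inline void example_log256_buffer(const float* in, float* out, int n) {
  for (int i = 0; i < n; i += 8) {
    v8sf v = _mm256_loadu_ps(in + i);         // unaligned load of 8 floats
    _mm256_storeu_ps(out + i, log256_ps(v));  // per-lane natural log
  }
}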
_PS256_CONST(exp_hi, 88.3762626647949f);
_PS256_CONST(exp_lo, -88.3762626647949f);

_PS256_CONST(cephes_LOG2EF, 1.44269504088896341f);
_PS256_CONST(cephes_exp_C1, 0.693359375f);
_PS256_CONST(cephes_exp_C2, -2.12194440e-4f);

_PS256_CONST(cephes_exp_p0, 1.9875691500E-4f);
_PS256_CONST(cephes_exp_p1, 1.3981999507E-3f);
_PS256_CONST(cephes_exp_p2, 8.3334519073E-3f);
_PS256_CONST(cephes_exp_p3, 4.1665795894E-2f);
_PS256_CONST(cephes_exp_p4, 1.6666665459E-1f);
_PS256_CONST(cephes_exp_p5, 5.0000001201E-1f);
/* Exponential computed for 8 floats at once. */
inline v8sf exp256_ps(v8sf x) {
  v8sf tmp = _mm256_setzero_ps(), fx;
  v8si imm0;
  v8sf one = *(v8sf*)_ps256_1;
  x = _mm256_min_ps(x, *(v8sf*)_ps256_exp_hi);
  x = _mm256_max_ps(x, *(v8sf*)_ps256_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_LOG2EF);
  fx = _mm256_add_ps(fx, *(v8sf*)_ps256_0p5);

  tmp = _mm256_floor_ps(fx);

  /* if greater, subtract 1 */
  v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);
  mask = _mm256_and_ps(mask, one);
  fx = _mm256_sub_ps(tmp, mask);
  tmp = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C1);
  v8sf z = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C2);
  x = _mm256_sub_ps(x, tmp);
  x = _mm256_sub_ps(x, z);

  z = _mm256_mul_ps(x, x);

  v8sf y = *(v8sf*)_ps256_cephes_exp_p0;
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p1);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p2);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p3);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p4);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p5);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, x);
  y = _mm256_add_ps(y, one);
  /* build 2^n */
  imm0 = _mm256_cvttps_epi32(fx);

#if !(defined __linux__ && defined __AVX2__) && !(defined(_MSC_VER) && _MSC_VER >= 1700)
  imm0 = custom_mm256_add_epi32(imm0, *(v8si*)_pi32_256_0x7f);
  imm0 = custom_mm256_slli_epi32(imm0, 23);
#else
  imm0 = _mm256_add_epi32(imm0, *(v8si*)_pi32_256_0x7f);
  imm0 = _mm256_slli_epi32(imm0, 23);
#endif
  v8sf pow2n = _mm256_castsi256_ps(imm0);
  y = _mm256_mul_ps(y, pow2n);
  return y;
}
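/* Usage sketch (illustrative, not part of the original header): a lane-wise
   logistic function 1 / (1 + exp(-x)) built on top of exp256_ps.  The helper
   name is ours. */
inline v8sf example_sigmoid256(v8sf x) {
  v8sf one = *(v8sf*)_ps256_1;
  v8sf e = exp256_ps(_mm256_sub_ps(_mm256_setzero_ps(), x)); // exp(-x)
  return _mm256_div_ps(one, _mm256_add_ps(one, e));          // 1 / (1 + exp(-x))
}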
_PS256_CONST(minus_cephes_DP1, -0.78515625f);
_PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4f);
_PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8f);
_PS256_CONST(sincof_p0, -1.9515295891E-4f);
_PS256_CONST(sincof_p1, 8.3321608736E-3f);
_PS256_CONST(sincof_p2, -1.6666654611E-1f);
_PS256_CONST(coscof_p0, 2.443315711809948E-005f);
_PS256_CONST(coscof_p1, -1.388731625493765E-003f);
_PS256_CONST(coscof_p2, 4.166664568298827E-002f);
_PS256_CONST(cephes_FOPI, 1.27323954473516f); // 4 / Pi
/* Evaluation of 8 sines at once using AVX intrinsics. */
inline v8sf sin256_ps(v8sf x) { // any x
  v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y;
  v8si imm0, imm2;

#if !(defined __linux__ && defined __AVX2__) && !(defined(_MSC_VER) && _MSC_VER >= 1700)
  v4si imm0_1, imm0_2;
  v4si imm2_1, imm2_2;
#endif

  sign_bit = x;
  /* take the absolute value */
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit = _mm256_and_ps(sign_bit, *(v8sf*)_ps256_sign_mask);
  /* scale by 4/Pi */
  y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);

#if (defined __linux__ && defined __AVX2__) || (defined(_MSC_VER) && _MSC_VER >= 1700)
  /* store the integer part of y in imm2 */
  imm2 = _mm256_cvttps_epi32(y);

  /* j = (j+1) & (~1) (see the cephes sources) */
  imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1);
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1);
  y = _mm256_cvtepi32_ps(imm2);

  /* get the swap sign flag */
  imm0 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_4);
  imm0 = _mm256_slli_epi32(imm0, 29);

  /* get the polynom selection mask */
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_2);
  imm2 = _mm256_cmpeq_epi32(imm2, *(v8si*)_pi32_256_0);
#else
  /* store the integer part of y in imm2_1 / imm2_2 */
  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);

  /* j = (j+1) & (~1) (see the cephes sources) */
  imm2_1 = _mm_add_epi32(imm2_1, *(v4si*)_pi32avx_1);
  imm2_2 = _mm_add_epi32(imm2_2, *(v4si*)_pi32avx_1);

  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_inv1);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_inv1);

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
  y = _mm256_cvtepi32_ps(imm2);

  /* get the swap sign flag */
  imm0_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_4);
  imm0_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_4);

  imm0_1 = _mm_slli_epi32(imm0_1, 29);
  imm0_2 = _mm_slli_epi32(imm0_2, 29);

  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);

  /* get the polynom selection mask */
  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_2);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_2);

  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
#endif

  v8sf swap_sign_bit = _mm256_castsi256_ps(imm0);
  v8sf poly_mask = _mm256_castsi256_ps(imm2);
  sign_bit = _mm256_xor_ps(sign_bit, swap_sign_bit);
  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3 */
  xmm1 = *(v8sf*)_ps256_minus_cephes_DP1;
  xmm2 = *(v8sf*)_ps256_minus_cephes_DP2;
  xmm3 = *(v8sf*)_ps256_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);
  /* Evaluate the first polynom (the cosine branch) */
  y = *(v8sf*)_ps256_coscof_p0;
  v8sf z = _mm256_mul_ps(x, x);

  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_1);
  /* Evaluate the second polynom (the sine branch) */
  v8sf y2 = *(v8sf*)_ps256_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);
  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  y2 = _mm256_and_ps(xmm3, y2);
  y = _mm256_andnot_ps(xmm3, y);
  y = _mm256_add_ps(y, y2);

  /* update the sign */
  y = _mm256_xor_ps(y, sign_bit);
  return y;
}
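/* Usage sketch (illustrative, not part of the original header): per-lane sine
   of a buffer of angles given in radians.  The helper name and the assumption
   that 'n' is a multiple of 8 are ours. */
inline void example_sin256_buffer(const float* angles, float* out, int n) {
  for (int i = 0; i < n; i += 8)
    _mm256_storeu_ps(out + i, sin256_ps(_mm256_loadu_ps(angles + i)));
}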
/* Evaluation of 8 cosines at once; almost the same as sin256_ps. */
inline v8sf cos256_ps(v8sf x) { // any x
  v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, y;
  v8si imm0, imm2;

#if !(defined __linux__ && defined __AVX2__) && !(defined(_MSC_VER) && _MSC_VER >= 1700)
  v4si imm0_1, imm0_2;
  v4si imm2_1, imm2_2;
#endif

  /* take the absolute value */
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);
  /* scale by 4/Pi */
  y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);

#if (defined __linux__ && defined __AVX2__) || (defined(_MSC_VER) && _MSC_VER >= 1700)
  imm2 = _mm256_cvttps_epi32(y);

  imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1);
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1);
  y = _mm256_cvtepi32_ps(imm2);
  imm2 = _mm256_sub_epi32(imm2, *(v8si*)_pi32_256_2);

  imm0 = _mm256_andnot_si256(imm2, *(v8si*)_pi32_256_4);
  imm0 = _mm256_slli_epi32(imm0, 29);

  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_2);
  imm2 = _mm256_cmpeq_epi32(imm2, *(v8si*)_pi32_256_0);
#else
  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);

  imm2_1 = _mm_add_epi32(imm2_1, *(v4si*)_pi32avx_1);
  imm2_2 = _mm_add_epi32(imm2_2, *(v4si*)_pi32avx_1);

  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_inv1);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_inv1);

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
  y = _mm256_cvtepi32_ps(imm2);

  imm2_1 = _mm_sub_epi32(imm2_1, *(v4si*)_pi32avx_2);
  imm2_2 = _mm_sub_epi32(imm2_2, *(v4si*)_pi32avx_2);

  imm0_1 = _mm_andnot_si128(imm2_1, *(v4si*)_pi32avx_4);
  imm0_2 = _mm_andnot_si128(imm2_2, *(v4si*)_pi32avx_4);

  imm0_1 = _mm_slli_epi32(imm0_1, 29);
  imm0_2 = _mm_slli_epi32(imm0_2, 29);

  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);

  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_2);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_2);

  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
#endif

  v8sf sign_bit = _mm256_castsi256_ps(imm0);
  v8sf poly_mask = _mm256_castsi256_ps(imm2);
  xmm1 = *(v8sf*)_ps256_minus_cephes_DP1;
  xmm2 = *(v8sf*)_ps256_minus_cephes_DP2;
  xmm3 = *(v8sf*)_ps256_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);
  y = *(v8sf*)_ps256_coscof_p0;
  v8sf z = _mm256_mul_ps(x, x);

  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_1);
  v8sf y2 = *(v8sf*)_ps256_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);
  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  y2 = _mm256_and_ps(xmm3, y2);
  y = _mm256_andnot_ps(xmm3, y);
  y = _mm256_add_ps(y, y2);

  /* update the sign */
  y = _mm256_xor_ps(y, sign_bit);
  return y;
}
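/* Usage sketch (illustrative, not part of the original header): rotate 8
   2-D points (px[i], py[i]) by the per-lane angles in 'theta' using cos256_ps
   and sin256_ps.  All names are ours; when both sine and cosine are needed,
   sincos256_ps below computes them in one pass. */
inline void example_rotate8(v8sf theta, v8sf* px, v8sf* py) {
  v8sf c = cos256_ps(theta);
  v8sf s = sin256_ps(theta);
  v8sf xr = _mm256_sub_ps(_mm256_mul_ps(*px, c), _mm256_mul_ps(*py, s)); // x' = x*cos - y*sin
  v8sf yr = _mm256_add_ps(_mm256_mul_ps(*px, s), _mm256_mul_ps(*py, c)); // y' = x*sin + y*cos
  *px = xr;
  *py = yr;
}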
/* Computes both the sine and the cosine of the 8 packed floats in x.  Since
   sin256_ps and cos256_ps share almost all of their work, computing both at
   once is nearly as fast as computing either one alone. */
inline void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
  v8sf xmm1, xmm2, xmm3 = _mm256_setzero_ps(), sign_bit_sin, y;
  v8si imm0, imm2, imm4;

#if !(defined __linux__ && defined __AVX2__) && !(defined(_MSC_VER) && _MSC_VER >= 1700)
  v4si imm0_1, imm0_2;
  v4si imm2_1, imm2_2;
  v4si imm4_1, imm4_2;
#endif

  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(v8sf*)_ps256_sign_mask);

  /* scale by 4/Pi */
  y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);
#if (defined __linux__ && defined __AVX2__) || (defined(_MSC_VER) && _MSC_VER >= 1700)
  /* store the integer part of y in imm2 */
  imm2 = _mm256_cvttps_epi32(y);

  /* j = (j+1) & (~1) (see the cephes sources) */
  imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1);
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1);

  y = _mm256_cvtepi32_ps(imm2);
  imm4 = imm2;
  imm0 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_4);
  imm0 = _mm256_slli_epi32(imm0, 29);

  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_2);
  imm2 = _mm256_cmpeq_epi32(imm2, *(v8si*)_pi32_256_0);
#else
  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
  imm2_1 = _mm_add_epi32(imm2_1, *(v4si*)_pi32avx_1);
  imm2_2 = _mm_add_epi32(imm2_2, *(v4si*)_pi32avx_1);

  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_inv1);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_inv1);

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
  y = _mm256_cvtepi32_ps(imm2);

  imm4_1 = imm2_1;
  imm4_2 = imm2_2;
  imm0_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_4);
  imm0_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_4);

  imm0_1 = _mm_slli_epi32(imm0_1, 29);
  imm0_2 = _mm_slli_epi32(imm0_2, 29);

  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);

  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_2);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_2);

  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
#endif

  v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
  v8sf poly_mask = _mm256_castsi256_ps(imm2);
  xmm1 = *(v8sf*)_ps256_minus_cephes_DP1;
  xmm2 = *(v8sf*)_ps256_minus_cephes_DP2;
  xmm3 = *(v8sf*)_ps256_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);
#if (defined __linux__ && defined __AVX2__) || (defined(_MSC_VER) && _MSC_VER >= 1700)
  imm4 = _mm256_sub_epi32(imm4, *(v8si*)_pi32_256_2);
  imm4 = _mm256_andnot_si256(imm4, *(v8si*)_pi32_256_4);
  imm4 = _mm256_slli_epi32(imm4, 29);
#else
  imm4_1 = _mm_sub_epi32(imm4_1, *(v4si*)_pi32avx_2);
  imm4_2 = _mm_sub_epi32(imm4_2, *(v4si*)_pi32avx_2);

  imm4_1 = _mm_andnot_si128(imm4_1, *(v4si*)_pi32avx_4);
  imm4_2 = _mm_andnot_si128(imm4_2, *(v4si*)_pi32avx_4);

  imm4_1 = _mm_slli_epi32(imm4_1, 29);
  imm4_2 = _mm_slli_epi32(imm4_2, 29);

  COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4);
#endif

  v8sf sign_bit_cos = _mm256_castsi256_ps(imm4);
  sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin);
  v8sf z = _mm256_mul_ps(x, x);
  y = *(v8sf*)_ps256_coscof_p0;

  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_1);
  v8sf y2 = *(v8sf*)_ps256_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);
  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  v8sf ysin2 = _mm256_and_ps(xmm3, y2);
  v8sf ysin1 = _mm256_andnot_ps(xmm3, y);
  y2 = _mm256_sub_ps(y2, ysin2);
  y = _mm256_sub_ps(y, ysin1);

  xmm1 = _mm256_add_ps(ysin1, ysin2);
  xmm2 = _mm256_add_ps(y, y2);

  /* update the sign */
  *s = _mm256_xor_ps(xmm1, sign_bit_sin);
  *c = _mm256_xor_ps(xmm2, sign_bit_cos);
}
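/* Usage sketch (illustrative, not part of the original header): when both the
   sine and the cosine of the same 8 angles are needed, sincos256_ps shares the
   argument reduction and polynomial evaluation between them.  The helper name
   is ours. */
inline void example_sincos8(const float* angles, float* sines, float* cosines) {
  v8sf s, c;
  sincos256_ps(_mm256_loadu_ps(angles), &s, &c); // 8 angles in radians
  _mm256_storeu_ps(sines, s);
  _mm256_storeu_ps(cosines, c);
}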
#endif // __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_AVX_AVX_MATHFUN_H__