// SIMD math helpers (sse_mathfun style): alignment macros and the packed
// constants used by the vectorized log/exp/sin/cos routines below.
// NOTE(review): this extraction appears to have lost the original
// preprocessor conditionals (compiler checks, #ifdef USE_SSE2); e.g. the
// MSVC-style ALIGN16_BEG and GCC-style ALIGN16_END definitions below are
// normally selected per compiler, not both active -- confirm against the
// upstream sse_mathfun.h.
32 #ifndef __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_SSE2_SSE_MATHFUN_H__ 33 #define __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_SSE2_SSE_MATHFUN_H__ 37 #include <xmmintrin.h> 42 # define ALIGN16_BEG __declspec(align(16)) 46 # define ALIGN16_END __attribute__((aligned(16))) 53 # include <emmintrin.h> 60 #define _PS_CONST(Name, Val) \ 61 static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val } 62 #define _PI32_CONST(Name, Val) \ 63 static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val } 64 #define _PS_CONST_TYPE(Name, Type, Val) \ 65 static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val } 70 _PS_CONST_TYPE(min_norm_pos,
// 0x00800000: smallest positive normalized float (used to clamp log input).
int, 0x00800000);
// 0x7f800000 is the IEEE-754 EXPONENT field mask. NOTE(review): the names
// "mant_mask"/"inv_mant_mask" follow the upstream header even though the
// bits are the exponent's; inv_mant_mask (~0x7f800000) keeps sign+mantissa.
71 _PS_CONST_TYPE(mant_mask,
int, 0x7f800000);
72 _PS_CONST_TYPE(inv_mant_mask,
int, ~0x7f800000);
// 0x80000000: the float sign bit, and its complement for fabs-by-masking.
74 _PS_CONST_TYPE(sign_mask,
int, (
int)0x80000000);
75 _PS_CONST_TYPE(inv_sign_mask,
int, ~0x80000000);
// ~1: clears the low bit -- used to round the quadrant index to even.
78 _PI32_CONST(inv1, ~1);
// 0x7f = 127, the IEEE-754 single-precision exponent bias.
81 _PI32_CONST(0x7f, 0x7f);
// sqrt(1/2): threshold for the log argument normalization step.
83 _PS_CONST(cephes_SQRTHF, 0.707106781186547524f);
// Minimax polynomial coefficients for log(1+x) on [sqrt(1/2)-1, sqrt(2)-1]
// (cephes logf), evaluated by Horner's rule in log_ps below.
84 _PS_CONST(cephes_log_p0, 7.0376836292E-2f);
85 _PS_CONST(cephes_log_p1, - 1.1514610310E-1f);
86 _PS_CONST(cephes_log_p2, 1.1676998740E-1f);
87 _PS_CONST(cephes_log_p3, - 1.2420140846E-1f);
88 _PS_CONST(cephes_log_p4, + 1.4249322787E-1f);
89 _PS_CONST(cephes_log_p5, - 1.6668057665E-1f);
90 _PS_CONST(cephes_log_p6, + 2.0000714765E-1f);
91 _PS_CONST(cephes_log_p7, - 2.4999993993E-1f);
92 _PS_CONST(cephes_log_p8, + 3.3333331174E-1f);
// q1 + q2 is a two-piece split of ln(2) so e*ln(2) is added with extra
// precision in log_ps.
93 _PS_CONST(cephes_log_q1, -2.12194440e-4f);
94 _PS_CONST(cephes_log_q2, 0.693359375f);
// Union + copy macros to move one 128-bit SSE register to/from a pair of
// 64-bit MMX registers (used by the non-SSE2 integer paths below).
// NOTE(review): the union's members and closing brace are missing from this
// extraction, as are the #ifdef USE_SSE2 / #else / #endif guards that
// normally separate the SSE2 (emm*) and MMX (mm*) branches throughout this
// file -- the back-to-back duplicate logic below (e.g. `v4sf e` declared
// twice) only makes sense with those conditionals restored; confirm against
// the upstream sse_mathfun.h.
97 typedef union xmm_mm_union {
// log_ps: natural logarithm of four packed floats (cephes logf ported to
// SSE). Input lanes <= 0 are invalid (log undefined) and patched at the end.
102 #define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \ 103 xmm_mm_union u; u.xmm = xmm_; \ 108 #define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { \ 109 xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \ 118 v4sf log_ps(v4sf x) {
124 v4sf one = *(v4sf*)_ps_1;
// Remember which lanes had x <= 0; they get overridden after the computation.
126 v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());
// Clamp to the smallest normalized float so exponent extraction never sees
// zero or denormals.
128 x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos);
// Extract the biased exponent: shift the exponent field (bits 23..30) down.
// MMX path (two 64-bit halves):
132 COPY_XMM_TO_MM(x, mm0, mm1);
133 mm0 = _mm_srli_pi32(mm0, 23);
134 mm1 = _mm_srli_pi32(mm1, 23);
// SSE2 path (whole register at once). NOTE(review): emm0/mm0/mm1 are used
// without a visible declaration -- presumably lost with the stripped #ifdefs.
136 emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
// Keep only sign+mantissa bits, then OR in 0.5's exponent: forces x into
// [0.5, 1).
139 x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
140 x = _mm_or_ps(x, *(v4sf*)_ps_0p5);
// Unbias the exponent (subtract 127) and convert it to float: e.
144 mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
145 mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
146 v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
149 emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
150 v4sf e = _mm_cvtepi32_ps(emm0);
153 e = _mm_add_ps(e, one);
// Branch-free adjustment: where x < sqrt(1/2), use e-1 and 2x-1 instead of
// e and x-1, keeping the polynomial argument small.
161 v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
162 v4sf tmp = _mm_and_ps(x, mask);
163 x = _mm_sub_ps(x, one);
164 e = _mm_sub_ps(e, _mm_and_ps(one, mask));
165 x = _mm_add_ps(x, tmp);
168 v4sf z = _mm_mul_ps(x,x);
// Horner evaluation of the cephes log polynomial: y = P(x) * x * z, z = x^2.
170 v4sf y = *(v4sf*)_ps_cephes_log_p0;
171 y = _mm_mul_ps(y, x);
172 y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
173 y = _mm_mul_ps(y, x);
174 y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
175 y = _mm_mul_ps(y, x);
176 y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
177 y = _mm_mul_ps(y, x);
178 y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
179 y = _mm_mul_ps(y, x);
180 y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
181 y = _mm_mul_ps(y, x);
182 y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
183 y = _mm_mul_ps(y, x);
184 y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
185 y = _mm_mul_ps(y, x);
186 y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
187 y = _mm_mul_ps(y, x);
189 y = _mm_mul_ps(y, z);
// Add the exponent contribution e*ln(2), split as q1 (small part) here and
// q2 (large part) below for extra precision, and subtract z/2.
192 tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
193 y = _mm_add_ps(y, tmp);
196 tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
197 y = _mm_sub_ps(y, tmp);
199 tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
200 x = _mm_add_ps(x, y);
201 x = _mm_add_ps(x, tmp);
// Patch invalid (x <= 0) lanes. The visible expression selects 0 for invalid
// lanes and the computed value elsewhere. NOTE(review): the start of this
// statement (the assignment and the _mm_or_ps it belongs to) is missing from
// the extraction, and upstream sse_mathfun forces NaN here instead -- confirm
// which behavior is intended.
205 _mm_and_ps(_mm_setzero_ps(), invalid_mask),
206 _mm_andnot_ps(invalid_mask, x));
// Constants for exp_ps. exp_hi/exp_lo clamp the input so the 2^n exponent
// construction below cannot overflow/underflow the float exponent field.
210 _PS_CONST(exp_hi, 88.3762626647949f);
211 _PS_CONST(exp_lo, -88.3762626647949f);
// log2(e), used to compute n = round(x / ln 2).
213 _PS_CONST(cephes_LOG2EF, 1.44269504088896341f);
// Cody-Waite split of ln(2): C1 + (-C2) reconstructs ln 2 with extra
// precision during argument reduction x - n*ln(2).
214 _PS_CONST(cephes_exp_C1, 0.693359375f);
215 _PS_CONST(cephes_exp_C2, -2.12194440e-4f);
// Minimax polynomial coefficients for exp on the reduced interval
// (cephes expf), Horner-evaluated in exp_ps.
217 _PS_CONST(cephes_exp_p0, 1.9875691500E-4f);
218 _PS_CONST(cephes_exp_p1, 1.3981999507E-3f);
219 _PS_CONST(cephes_exp_p2, 8.3334519073E-3f);
220 _PS_CONST(cephes_exp_p3, 4.1665795894E-2f);
221 _PS_CONST(cephes_exp_p4, 1.6666665459E-1f);
222 _PS_CONST(cephes_exp_p5, 5.0000001201E-1f);
// exp_ps: e^x for four packed floats (cephes expf ported to SSE).
// Strategy: x = n*ln(2) + r, exp(x) = 2^n * exp(r) with a polynomial for
// exp(r). NOTE(review): the #ifdef USE_SSE2 / #else structure separating the
// emm* (SSE2) and mm* (MMX) branches appears stripped from this extraction;
// emm0/mm0/mm1 and (in the MMX branch) pow2n are used without visible
// declarations -- confirm against the upstream header.
225 v4sf exp_ps(v4sf x) {
226 v4sf tmp = _mm_setzero_ps(), fx;
232 v4sf one = *(v4sf*)_ps_1;
// Clamp so the 2^n construction below stays within float exponent range.
234 x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
235 x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);
// fx = x * log2(e) + 0.5 -- the +0.5 biases the truncation toward rounding.
238 fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
239 fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);
// Truncate fx to integer and back to float (MMX path splits into halves).
244 tmp = _mm_movehl_ps(tmp, fx);
245 mm0 = _mm_cvttps_pi32(fx);
246 mm1 = _mm_cvttps_pi32(tmp);
248 tmp = _mm_cvtpi32x2_ps(mm0, mm1);
250 emm0 = _mm_cvttps_epi32(fx);
251 tmp = _mm_cvtepi32_ps(emm0);
// Truncation rounds toward zero; subtract 1 where it overshot (tmp > fx)
// to get floor(fx).
254 v4sf mask = _mm_cmpgt_ps(tmp, fx);
255 mask = _mm_and_ps(mask, one);
256 fx = _mm_sub_ps(tmp, mask);
// Cody-Waite reduction: x -= fx*C1 + fx*C2 (C1 + C2 ~= ln 2), done in two
// steps for precision.
258 tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
259 v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
260 x = _mm_sub_ps(x, tmp);
261 x = _mm_sub_ps(x, z);
// Horner polynomial for exp on the reduced argument: y = P(x)*x^2 + x + 1.
265 v4sf y = *(v4sf*)_ps_cephes_exp_p0;
266 y = _mm_mul_ps(y, x);
267 y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
268 y = _mm_mul_ps(y, x);
269 y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
270 y = _mm_mul_ps(y, x);
271 y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
272 y = _mm_mul_ps(y, x);
273 y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
274 y = _mm_mul_ps(y, x);
275 y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
// z was reassigned to x*x between these steps in the upstream source
// (`z = _mm_mul_ps(x,x);` is not visible here) -- NOTE(review): confirm,
// otherwise this multiplies by the stale fx*C2 value.
276 y = _mm_mul_ps(y, z);
277 y = _mm_add_ps(y, x);
278 y = _mm_add_ps(y, one);
// Build 2^n: (n + 127) << 23 placed in the exponent field, then scale.
// MMX path:
282 z = _mm_movehl_ps(z, fx);
283 mm0 = _mm_cvttps_pi32(fx);
284 mm1 = _mm_cvttps_pi32(z);
285 mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
286 mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
287 mm0 = _mm_slli_pi32(mm0, 23);
288 mm1 = _mm_slli_pi32(mm1, 23);
291 COPY_MM_TO_XMM(mm0, mm1, pow2n);
// SSE2 path:
294 emm0 = _mm_cvttps_epi32(fx);
295 emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
296 emm0 = _mm_slli_epi32(emm0, 23);
297 v4sf pow2n = _mm_castsi128_ps(emm0);
299 y = _mm_mul_ps(y, pow2n);
// Constants for sin_ps / cos_ps / sincos_ps.
// -DP1, -DP2, -DP3: a three-piece split of pi/4 so x - y*(pi/4) can be
// accumulated in extended precision during range reduction.
303 _PS_CONST(minus_cephes_DP1, -0.78515625f);
304 _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4f);
305 _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8f);
// Minimax polynomial coefficients for sin (sincof_*) and cos (coscof_*)
// on the reduced octant (cephes sinf/cosf).
306 _PS_CONST(sincof_p0, -1.9515295891E-4f);
307 _PS_CONST(sincof_p1, 8.3321608736E-3f);
308 _PS_CONST(sincof_p2, -1.6666654611E-1f);
309 _PS_CONST(coscof_p0, 2.443315711809948E-005f);
310 _PS_CONST(coscof_p1, -1.388731625493765E-003f);
311 _PS_CONST(coscof_p2, 4.166664568298827E-002f);
// 4/pi: scales the input to octant units for the quadrant computation.
312 _PS_CONST(cephes_FOPI, 1.27323954473516f);
// sin_ps: sine of four packed floats (cephes sinf ported to SSE).
// Range-reduces x to an octant via j = (int)(x * 4/pi), then evaluates either
// the sin or the cos polynomial per lane depending on the octant, fixing the
// sign at the end. NOTE(review): the #ifdef USE_SSE2 structure and several
// statements appear lost in this extraction: emm0/emm2 declarations, the
// initial `sign_bit = x;` capture before the fabs below, and the
// `xmm3 = poly_mask;` assignment the final select relies on -- confirm
// against the upstream header.
344 v4sf sin_ps(v4sf x) {
345 v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
350 v2si mm0, mm1, mm2, mm3;
// fabs(x) by clearing the sign bit; keep the original sign in sign_bit.
354 x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
356 sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
// Scale to octants: y = x * 4/pi.
359 y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
// SSE2 path: j = (trunc(y) + 1) & ~1 rounds the octant index up to even.
363 emm2 = _mm_cvttps_epi32(y);
365 emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
366 emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
367 y = _mm_cvtepi32_ps(emm2);
// Bit 2 of j (j & 4), shifted left 29 to land on the float sign bit, flips
// the result's sign for the corresponding quadrants.
370 emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
371 emm0 = _mm_slli_epi32(emm0, 29);
// poly_mask: all-ones in lanes where (j & 2) == 0, used to choose between
// the two polynomials below.
378 emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
379 emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
381 v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
382 v4sf poly_mask = _mm_castsi128_ps(emm2);
383 sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
// MMX path: same quadrant computation on two 64-bit halves.
387 xmm2 = _mm_movehl_ps(xmm2, y);
388 mm2 = _mm_cvttps_pi32(y);
389 mm3 = _mm_cvttps_pi32(xmm2);
391 mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
392 mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
393 mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
394 mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
395 y = _mm_cvtpi32x2_ps(mm2, mm3);
397 mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
398 mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
399 mm0 = _mm_slli_pi32(mm0, 29);
400 mm1 = _mm_slli_pi32(mm1, 29);
402 mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
403 mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
404 mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
405 mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
406 v4sf swap_sign_bit, poly_mask;
407 COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
408 COPY_MM_TO_XMM(mm2, mm3, poly_mask);
409 sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
// Extended-precision range reduction: x = x - y*(pi/4), with pi/4 split into
// DP1 + DP2 + DP3 and subtracted in three steps.
415 xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
416 xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
417 xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
418 xmm1 = _mm_mul_ps(y, xmm1);
419 xmm2 = _mm_mul_ps(y, xmm2);
420 xmm3 = _mm_mul_ps(y, xmm3);
421 x = _mm_add_ps(x, xmm1);
422 x = _mm_add_ps(x, xmm2);
423 x = _mm_add_ps(x, xmm3);
// First polynomial (cos form): y = C(z)*z^2 - z/2 + 1, z = x^2.
426 y = *(v4sf*)_ps_coscof_p0;
427 v4sf z = _mm_mul_ps(x,x);
429 y = _mm_mul_ps(y, z);
430 y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
431 y = _mm_mul_ps(y, z);
432 y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
433 y = _mm_mul_ps(y, z);
434 y = _mm_mul_ps(y, z);
435 v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
436 y = _mm_sub_ps(y, tmp);
437 y = _mm_add_ps(y, *(v4sf*)_ps_1);
// Second polynomial (sin form): y2 = S(z)*z*x + x.
441 v4sf y2 = *(v4sf*)_ps_sincof_p0;
442 y2 = _mm_mul_ps(y2, z);
443 y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
444 y2 = _mm_mul_ps(y2, z);
445 y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
446 y2 = _mm_mul_ps(y2, z);
447 y2 = _mm_mul_ps(y2, x);
448 y2 = _mm_add_ps(y2, x);
// Per-lane select between the two polynomials. NOTE(review): xmm3 is used
// as the mask here but the visible code last assigned it a DP3 product;
// upstream assigns `xmm3 = poly_mask;` first -- that line seems lost.
452 y2 = _mm_and_ps(xmm3, y2);
453 y = _mm_andnot_ps(xmm3, y);
454 y = _mm_add_ps(y,y2);
// Apply the accumulated sign.
456 y = _mm_xor_ps(y, sign_bit);
// cos_ps: cosine of four packed floats (cephes cosf ported to SSE).
// Same octant-based range reduction as sin_ps; the quadrant index is shifted
// by 2 (j -= 2) and the sign comes from ~j & 4, so the same two polynomials
// produce cosine. NOTE(review): as in sin_ps, the #ifdef USE_SSE2 structure,
// the emm0/emm2 declarations, and the `xmm3 = poly_mask;` assignment used by
// the final select appear lost in this extraction -- confirm upstream.
462 v4sf cos_ps(v4sf x) {
463 v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
467 v2si mm0, mm1, mm2, mm3;
// fabs(x): cosine is even, no input sign to keep.
470 x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
// Scale to octants: y = x * 4/pi.
473 y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
// SSE2 path: j = ((trunc(y) + 1) & ~1) - 2.
477 emm2 = _mm_cvttps_epi32(y);
479 emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
480 emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
481 y = _mm_cvtepi32_ps(emm2);
483 emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
// Result sign from (~j) & 4, moved onto the float sign bit.
486 emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
487 emm0 = _mm_slli_epi32(emm0, 29);
// poly_mask: lanes where (j & 2) == 0 after the shift.
489 emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
490 emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
492 v4sf sign_bit = _mm_castsi128_ps(emm0);
493 v4sf poly_mask = _mm_castsi128_ps(emm2);
// MMX path: same computation on two 64-bit halves.
496 xmm2 = _mm_movehl_ps(xmm2, y);
497 mm2 = _mm_cvttps_pi32(y);
498 mm3 = _mm_cvttps_pi32(xmm2);
501 mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
502 mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
503 mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
504 mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
506 y = _mm_cvtpi32x2_ps(mm2, mm3);
509 mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
510 mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);
515 mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
516 mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
517 mm0 = _mm_slli_pi32(mm0, 29);
518 mm1 = _mm_slli_pi32(mm1, 29);
520 mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
521 mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
523 mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
524 mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
526 v4sf sign_bit, poly_mask;
527 COPY_MM_TO_XMM(mm0, mm1, sign_bit);
528 COPY_MM_TO_XMM(mm2, mm3, poly_mask);
// Extended-precision range reduction: x -= y * (DP1 + DP2 + DP3) ~= y*pi/4.
533 xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
534 xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
535 xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
536 xmm1 = _mm_mul_ps(y, xmm1);
537 xmm2 = _mm_mul_ps(y, xmm2);
538 xmm3 = _mm_mul_ps(y, xmm3);
539 x = _mm_add_ps(x, xmm1);
540 x = _mm_add_ps(x, xmm2);
541 x = _mm_add_ps(x, xmm3);
// Cos-form polynomial: y = C(z)*z^2 - z/2 + 1, z = x^2.
544 y = *(v4sf*)_ps_coscof_p0;
545 v4sf z = _mm_mul_ps(x,x);
547 y = _mm_mul_ps(y, z);
548 y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
549 y = _mm_mul_ps(y, z);
550 y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
551 y = _mm_mul_ps(y, z);
552 y = _mm_mul_ps(y, z);
553 v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
554 y = _mm_sub_ps(y, tmp);
555 y = _mm_add_ps(y, *(v4sf*)_ps_1);
// Sin-form polynomial: y2 = S(z)*z*x + x.
559 v4sf y2 = *(v4sf*)_ps_sincof_p0;
560 y2 = _mm_mul_ps(y2, z);
561 y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
562 y2 = _mm_mul_ps(y2, z);
563 y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
564 y2 = _mm_mul_ps(y2, z);
565 y2 = _mm_mul_ps(y2, x);
566 y2 = _mm_add_ps(y2, x);
// Per-lane polynomial select. NOTE(review): xmm3 is used as the mask but
// was last assigned a DP3 product above; upstream's `xmm3 = poly_mask;`
// appears lost in extraction.
570 y2 = _mm_and_ps(xmm3, y2);
571 y = _mm_andnot_ps(xmm3, y);
572 y = _mm_add_ps(y,y2);
// Apply the computed sign.
574 y = _mm_xor_ps(y, sign_bit);
// sincos_ps: sine and cosine of four packed floats in one pass, written to
// *s and *c. Shares one range reduction and both polynomial evaluations,
// then routes each polynomial to the sin or cos output per lane.
// NOTE(review): several statements appear lost in this extraction along with
// the #ifdef USE_SSE2 guards: the initial `sign_bit_sin = x;` capture, the
// `emm4 = emm2;` (and mm4/mm5) quadrant save that line 666/672 consume, and
// the `xmm3 = poly_mask;` assignment used by the final select -- confirm
// against the upstream header.
582 void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
583 v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
585 v4si emm0, emm2, emm4;
587 v2si mm0, mm1, mm2, mm3, mm4, mm5;
// fabs(x); the sine output keeps the input's sign bit.
591 x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
593 sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);
// Scale to octants: y = x * 4/pi.
596 y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
// SSE2 path: j = (trunc(y) + 1) & ~1.
600 emm2 = _mm_cvttps_epi32(y);
603 emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
604 emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
605 y = _mm_cvtepi32_ps(emm2);
// Sine sign swap from j & 4, moved onto the float sign bit.
610 emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
611 emm0 = _mm_slli_epi32(emm0, 29);
612 v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);
// poly_mask: lanes where (j & 2) == 0.
615 emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
616 emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
617 v4sf poly_mask = _mm_castsi128_ps(emm2);
// MMX path: same quadrant computation on two 64-bit halves.
620 xmm3 = _mm_movehl_ps(xmm3, y);
621 mm2 = _mm_cvttps_pi32(y);
622 mm3 = _mm_cvttps_pi32(xmm3);
625 mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
626 mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
627 mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
628 mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
630 y = _mm_cvtpi32x2_ps(mm2, mm3);
636 mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
637 mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
638 mm0 = _mm_slli_pi32(mm0, 29);
639 mm1 = _mm_slli_pi32(mm1, 29);
640 v4sf swap_sign_bit_sin;
641 COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);
645 mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
646 mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
647 mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
648 mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
650 COPY_MM_TO_XMM(mm2, mm3, poly_mask);
// Extended-precision range reduction: x -= y * (DP1 + DP2 + DP3) ~= y*pi/4.
655 xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
656 xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
657 xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
658 xmm1 = _mm_mul_ps(y, xmm1);
659 xmm2 = _mm_mul_ps(y, xmm2);
660 xmm3 = _mm_mul_ps(y, xmm3);
661 x = _mm_add_ps(x, xmm1);
662 x = _mm_add_ps(x, xmm2);
663 x = _mm_add_ps(x, xmm3);
// Cosine sign from (~(j-2)) & 4 on the saved quadrant index (emm4/mm4/mm5;
// see NOTE above about the missing save).
666 emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
667 emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
668 emm4 = _mm_slli_epi32(emm4, 29);
669 v4sf sign_bit_cos = _mm_castsi128_ps(emm4);
672 mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
673 mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
674 mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
675 mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
676 mm4 = _mm_slli_pi32(mm4, 29);
677 mm5 = _mm_slli_pi32(mm5, 29);
679 COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
683 sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
// Cos-form polynomial: y = C(z)*z^2 - z/2 + 1, z = x^2.
687 v4sf z = _mm_mul_ps(x,x);
688 y = *(v4sf*)_ps_coscof_p0;
690 y = _mm_mul_ps(y, z);
691 y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
692 y = _mm_mul_ps(y, z);
693 y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
694 y = _mm_mul_ps(y, z);
695 y = _mm_mul_ps(y, z);
696 v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
697 y = _mm_sub_ps(y, tmp);
698 y = _mm_add_ps(y, *(v4sf*)_ps_1);
// Sin-form polynomial: y2 = S(z)*z*x + x.
702 v4sf y2 = *(v4sf*)_ps_sincof_p0;
703 y2 = _mm_mul_ps(y2, z);
704 y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
705 y2 = _mm_mul_ps(y2, z);
706 y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
707 y2 = _mm_mul_ps(y2, z);
708 y2 = _mm_mul_ps(y2, x);
709 y2 = _mm_add_ps(y2, x);
// Route each polynomial to sin/cos per lane: xmm1 becomes the sine result,
// xmm2 the cosine result (xmm3 acts as the poly_mask here; see NOTE above).
713 v4sf ysin2 = _mm_and_ps(xmm3, y2);
714 v4sf ysin1 = _mm_andnot_ps(xmm3, y);
715 y2 = _mm_sub_ps(y2,ysin2);
716 y = _mm_sub_ps(y, ysin1);
718 xmm1 = _mm_add_ps(ysin1,ysin2);
719 xmm2 = _mm_add_ps(y,y2);
// Apply per-output signs and store.
722 *s = _mm_xor_ps(xmm1, sign_bit_sin);
723 *c = _mm_xor_ps(xmm2, sign_bit_cos);
726 #endif // __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_SSE2_SSE_MATHFUN_H__