// IPSDK  4_1_0_2
// IPSDK : Image Processing Software Development Kit
// sse_mathfun.h
1 /* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log
2 
3  Inspired by Intel Approximate Math library, and based on the
4  corresponding algorithms of the cephes math library
5 
6  The default is to use the SSE1 version. If you define USE_SSE2 the
7  the SSE2 intrinsics will be used in place of the MMX intrinsics. Do
8  not expect any significant performance improvement with SSE2.
9 */
10 
11 /* Copyright (C) 2007 Julien Pommier
12 
13  This software is provided 'as-is', without any express or implied
14  warranty. In no event will the authors be held liable for any damages
15  arising from the use of this software.
16 
17  Permission is granted to anyone to use this software for any purpose,
18  including commercial applications, and to alter it and redistribute it
19  freely, subject to the following restrictions:
20 
21  1. The origin of this software must not be misrepresented; you must not
22  claim that you wrote the original software. If you use this software
23  in a product, an acknowledgment in the product documentation would be
24  appreciated but is not required.
25  2. Altered source versions must be plainly marked as such, and must not be
26  misrepresented as being the original software.
27  3. This notice may not be removed or altered from any source distribution.
28 
29  (this is the zlib license)
30 */
31 
#ifndef __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_SSE2_SSE_MATHFUN_H__
#define __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_SSE2_SSE_MATHFUN_H__

/* Force the SSE2 code paths below; the SSE1+MMX fallback branches
   (#ifndef USE_SSE2) become dead code in this build. */
#define USE_SSE2

#include <xmmintrin.h>

/* yes I know, the top of this file is quite ugly */

/* 16-byte alignment spelling differs between compilers: MSVC puts the
   specifier before the declaration, gcc/icc put an attribute after it. */
#ifdef _MSC_VER /* visual c++ */
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END
#else /* gcc or icc */
# define ALIGN16_BEG
# define ALIGN16_END __attribute__((aligned(16)))
#endif

/* __m128 is ugly to write */
typedef __m128 v4sf; // vector of 4 float (sse1)

#ifdef USE_SSE2
# include <emmintrin.h>
typedef __m128i v4si; // vector of 4 int (sse2)
#else
typedef __m64 v2si; // vector of 2 int (mmx)
#endif

/* declare some SSE constants -- why can't I figure a better way to do that? */
/* Each macro declares one 16-byte-aligned, 4-lane splatted constant:
   _PS_CONST      -> float lanes, named _ps_<Name>
   _PI32_CONST    -> int lanes,   named _pi32_<Name>
   _PS_CONST_TYPE -> lanes of an arbitrary Type, named _ps_<Name>
                     (used to declare float bit-masks via int initializers) */
#define _PS_CONST(Name, Val) \
 static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PI32_CONST(Name, Val) \
 static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PS_CONST_TYPE(Name, Type, Val) \
 static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
66 
/* common float constants */
_PS_CONST(1 , 1.0f);
_PS_CONST(0p5, 0.5f);
/* the smallest non denormalized float number */
_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
/* IEEE-754 single-precision bit masks (note: "mant_mask" actually selects
   the 8 exponent bits, 0x7f800000; "inv_mant_mask" keeps sign + mantissa) */
_PS_CONST_TYPE(mant_mask, int, 0x7f800000);
_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);

/* sign bit, and everything except the sign bit */
_PS_CONST_TYPE(sign_mask, int, (int)0x80000000);
_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);

/* small splatted integer constants; 0x7f is the IEEE-754 single-precision
   exponent bias */
_PI32_CONST(1, 1);
_PI32_CONST(inv1, ~1);
_PI32_CONST(2, 2);
_PI32_CONST(4, 4);
_PI32_CONST(0x7f, 0x7f);

/* cephes logf: sqrt(1/2) threshold, degree-8 polynomial coefficients
   (p0..p8) and the two-part log(2) correction terms (q1, q2) */
_PS_CONST(cephes_SQRTHF, 0.707106781186547524f);
_PS_CONST(cephes_log_p0, 7.0376836292E-2f);
_PS_CONST(cephes_log_p1, - 1.1514610310E-1f);
_PS_CONST(cephes_log_p2, 1.1676998740E-1f);
_PS_CONST(cephes_log_p3, - 1.2420140846E-1f);
_PS_CONST(cephes_log_p4, + 1.4249322787E-1f);
_PS_CONST(cephes_log_p5, - 1.6668057665E-1f);
_PS_CONST(cephes_log_p6, + 2.0000714765E-1f);
_PS_CONST(cephes_log_p7, - 2.4999993993E-1f);
_PS_CONST(cephes_log_p8, + 3.3333331174E-1f);
_PS_CONST(cephes_log_q1, -2.12194440e-4f);
_PS_CONST(cephes_log_q2, 0.693359375f);
#ifndef USE_SSE2
/* SSE1+MMX path only: alias one __m128 with two __m64 halves through a
   union so float data can be handed to the MMX integer units and back */
typedef union xmm_mm_union {
 __m128 xmm;
 __m64 mm[2];
} xmm_mm_union;

/* split xmm_ into its low (mm0_) and high (mm1_) 64-bit halves */
#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \
 xmm_mm_union u; u.xmm = xmm_; \
 mm0_ = u.mm[0]; \
 mm1_ = u.mm[1]; \
}

/* recombine two 64-bit halves (mm0_ low, mm1_ high) into xmm_ */
#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { \
 xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \
 }

#endif // USE_SSE2
113 
/* natural logarithm computed for 4 simultaneous floats.
   NOTE: this IPSDK variant returns 0 (not NaN as in the original
   sse_mathfun) for x <= 0 -- see the final masking in log_ps.
*/
117 inline
118 v4sf log_ps(v4sf x) {
119 #ifdef USE_SSE2
120  v4si emm0;
121 #else
122  v2si mm0, mm1;
123 #endif
124  v4sf one = *(v4sf*)_ps_1;
125 
126  v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());
127 
128  x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos); /* cut off denormalized stuff */
129 
130 #ifndef USE_SSE2
131  /* part 1: x = frexpf(x, &e); */
132  COPY_XMM_TO_MM(x, mm0, mm1);
133  mm0 = _mm_srli_pi32(mm0, 23);
134  mm1 = _mm_srli_pi32(mm1, 23);
135 #else
136  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
137 #endif
138  /* keep only the fractional part */
139  x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
140  x = _mm_or_ps(x, *(v4sf*)_ps_0p5);
141 
142 #ifndef USE_SSE2
143  /* now e=mm0:mm1 contain the really base-2 exponent */
144  mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
145  mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
146  v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
147  _mm_empty(); /* bye bye mmx */
148 #else
149  emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
150  v4sf e = _mm_cvtepi32_ps(emm0);
151 #endif
152 
153  e = _mm_add_ps(e, one);
154 
155  /* part2:
156  if( x < SQRTHF ) {
157  e -= 1;
158  x = x + x - 1.0;
159  } else { x = x - 1.0; }
160  */
161  v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
162  v4sf tmp = _mm_and_ps(x, mask);
163  x = _mm_sub_ps(x, one);
164  e = _mm_sub_ps(e, _mm_and_ps(one, mask));
165  x = _mm_add_ps(x, tmp);
166 
167 
168  v4sf z = _mm_mul_ps(x,x);
169 
170  v4sf y = *(v4sf*)_ps_cephes_log_p0;
171  y = _mm_mul_ps(y, x);
172  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
173  y = _mm_mul_ps(y, x);
174  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
175  y = _mm_mul_ps(y, x);
176  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
177  y = _mm_mul_ps(y, x);
178  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
179  y = _mm_mul_ps(y, x);
180  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
181  y = _mm_mul_ps(y, x);
182  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
183  y = _mm_mul_ps(y, x);
184  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
185  y = _mm_mul_ps(y, x);
186  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
187  y = _mm_mul_ps(y, x);
188 
189  y = _mm_mul_ps(y, z);
190 
191 
192  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
193  y = _mm_add_ps(y, tmp);
194 
195 
196  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
197  y = _mm_sub_ps(y, tmp);
198 
199  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
200  x = _mm_add_ps(x, y);
201  x = _mm_add_ps(x, tmp);
202  //x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
203  //x = _mm_or_ps(x, invalid_mask);
204  x = _mm_or_ps(
205  _mm_and_ps(_mm_setzero_ps(), invalid_mask),
206  _mm_andnot_ps(invalid_mask, x)); // negative or null arg will be 0
207  return x;
208 }
209 
/* clamp bounds for exp_ps input (beyond these, float exp over/underflows) */
_PS_CONST(exp_hi, 88.3762626647949f);
_PS_CONST(exp_lo, -88.3762626647949f);

/* log2(e), and log(2) split in two parts (C1 + C2 ~ 0.6931472) for
   extended-precision argument reduction */
_PS_CONST(cephes_LOG2EF, 1.44269504088896341f);
_PS_CONST(cephes_exp_C1, 0.693359375f);
_PS_CONST(cephes_exp_C2, -2.12194440e-4f);

/* cephes expf degree-5 polynomial coefficients */
_PS_CONST(cephes_exp_p0, 1.9875691500E-4f);
_PS_CONST(cephes_exp_p1, 1.3981999507E-3f);
_PS_CONST(cephes_exp_p2, 8.3334519073E-3f);
_PS_CONST(cephes_exp_p3, 4.1665795894E-2f);
_PS_CONST(cephes_exp_p4, 1.6666665459E-1f);
_PS_CONST(cephes_exp_p5, 5.0000001201E-1f);
223 
inline
v4sf exp_ps(v4sf x) {
  /* exp of 4 floats at once (cephes expf): reduce by multiples of log(2)
     (subtracted in two parts C1+C2 for precision), evaluate a degree-5
     polynomial on the remainder, then scale by 2^n built directly in the
     float exponent field. Input is clamped to [exp_lo, exp_hi]. */
  v4sf tmp = _mm_setzero_ps(), fx;
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  /* clamp so the 2^n scaling below cannot over/underflow */
  x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
  x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);

  /* express exp(x) as exp(g + n*log(2)); fx = x*log2(e) + 0.5 */
  fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
  fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);

  /* how to perform a floorf with SSE: just below */
#ifndef USE_SSE2
  /* step 1 : cast to int */
  tmp = _mm_movehl_ps(tmp, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(tmp);
  /* step 2 : cast back to float */
  tmp = _mm_cvtpi32x2_ps(mm0, mm1);
#else
  emm0 = _mm_cvttps_epi32(fx);
  tmp = _mm_cvtepi32_ps(emm0);
#endif
  /* if greater, subtract 1 (truncation rounds toward zero, not down) */
  v4sf mask = _mm_cmpgt_ps(tmp, fx);
  mask = _mm_and_ps(mask, one);
  fx = _mm_sub_ps(tmp, mask);

  /* x -= fx * log(2), applied as two partial products for precision */
  tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
  v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
  x = _mm_sub_ps(x, tmp);
  x = _mm_sub_ps(x, z);

  z = _mm_mul_ps(x,x);

  /* degree-5 polynomial, Horner evaluation */
  v4sf y = *(v4sf*)_ps_cephes_exp_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, x);
  y = _mm_add_ps(y, one);

  /* build 2^n by writing n+127 into the exponent bits of each lane */
#ifndef USE_SSE2
  z = _mm_movehl_ps(z, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(z);
  mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
  mm0 = _mm_slli_pi32(mm0, 23);
  mm1 = _mm_slli_pi32(mm1, 23);

  v4sf pow2n;
  COPY_MM_TO_XMM(mm0, mm1, pow2n);
  _mm_empty();
#else
  emm0 = _mm_cvttps_epi32(fx);
  emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
  emm0 = _mm_slli_epi32(emm0, 23);
  v4sf pow2n = _mm_castsi128_ps(emm0);
#endif
  y = _mm_mul_ps(y, pow2n);
  return y;
}
302 
/* -Pi/4 split into three decreasing-magnitude parts (DP1+DP2+DP3) for
   extended-precision argument reduction in sin/cos */
_PS_CONST(minus_cephes_DP1, -0.78515625f);
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4f);
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8f);
/* cephes sinf polynomial coefficients */
_PS_CONST(sincof_p0, -1.9515295891E-4f);
_PS_CONST(sincof_p1, 8.3321608736E-3f);
_PS_CONST(sincof_p2, -1.6666654611E-1f);
/* cephes cosf polynomial coefficients */
_PS_CONST(coscof_p0, 2.443315711809948E-005f);
_PS_CONST(coscof_p1, -1.388731625493765E-003f);
_PS_CONST(coscof_p2, 4.166664568298827E-002f);
_PS_CONST(cephes_FOPI, 1.27323954473516f); // 4 / M_PI
313 
314 
315 /* evaluation of 4 sines at onces, using only SSE1+MMX intrinsics so
316  it runs also on old athlons XPs and the pentium III of your grand
317  mother.
318 
319  The code is the exact rewriting of the cephes sinf function.
320  Precision is excellent as long as x < 8192 (I did not bother to
321  take into account the special handling they have for greater values
322  -- it does not return garbage for arguments over 8192, though, but
323  the extra precision is missing).
324 
325  Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
326  surprising but correct result.
327 
328  Performance is also surprisingly good, 1.33 times faster than the
329  macos vsinf SSE2 function, and 1.5 times faster than the
330  __vrs4_sinf of amd's ACML (which is only available in 64 bits). Not
331  too bad for an SSE1 function (with no special tuning) !
332  However the latter libraries probably have a much better handling of NaN,
333  Inf, denormalized and other special arguments..
334 
335  On my core 1 duo, the execution of this function takes approximately 95 cycles.
336 
337  From what I have observed on the experiments with Intel AMath lib, switching to an
338  SSE2 version would improve the perf by only 10%.
339 
340  Since it is based on SSE intrinsics, it has to be compiled at -O2 to
341  deliver full speed.
342 */
inline
v4sf sin_ps(v4sf x) { // any x
  /* sin of 4 floats at once; exact rewrite of the cephes sinf function
     (see the long comment above for accuracy/performance notes).
     Scheme: fold |x| into an octant index j via y = |x|*4/Pi, reduce the
     argument, evaluate both the sin and cos polynomials, and blend/sign
     per lane according to the octant. */
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;

#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  sign_bit = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in mm0 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  /* get the swap sign flag: bit 2 of j, shifted by 29 into the float
     sign-bit position (bit 31) */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynom selection mask
     there is one polynom for 0 <= x <= Pi/4
     and another one for Pi/4<x<=Pi/2

     Both branches will be computed.
  */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);

#else
  /* store the integer part of y in mm0:mm1 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);
  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
  y = _mm_cvtpi32x2_ps(mm2, mm3);
  /* get the swap sign flag */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  /* get the polynom selection mask */
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf swap_sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
  _mm_empty(); /* good-bye mmx */
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynom (0 <= x <= Pi/4): the cosine series */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynom (Pi/4 <= x <= 0): the sine series */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2); //, xmm3);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y,y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);
  return y;
}
459 
460 /* almost the same as sin_ps */
inline
v4sf cos_ps(v4sf x) { // any x
  /* cos of 4 floats at once; same scheme as sin_ps, except the octant
     index is shifted by 2 (the emm2 - 2 below) and the swap-sign flag is
     derived with an andnot instead of an and. No input sign handling is
     needed since cos is even. */
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in mm0 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  /* shift the octant index by 2 (the cos phase offset) */
  emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);

  /* get the swap sign flag */
  emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynom selection mask */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm0:mm1 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);


  mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);

  /* get the swap sign flag in mm0:mm1 and the
     polynom selection mask in mm2:mm3 */

  mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);

  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());

  v4sf sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  _mm_empty(); /* good-bye mmx */
#endif
  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynom (0 <= x <= Pi/4): the cosine series */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynom (Pi/4 <= x <= 0): the sine series */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2); //, xmm3);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y,y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);

  return y;
}
578 
579 /* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them..
580  it is almost as fast, and gives you a free cosine with your sine */
inline
void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
  /* Computes sin(x) into *s and cos(x) into *c for 4 floats at once.
     Shares the argument reduction and both polynomial evaluations between
     the two results, so it costs barely more than sin_ps alone. */
  v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
#ifdef USE_SSE2
  v4si emm0, emm2, emm4;
#else
  v2si mm0, mm1, mm2, mm3, mm4, mm5;
#endif
  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);

  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  /* keep a copy of the octant index for the cosine sign below */
  emm4 = emm2;

  /* get the swap sign flag for the sine */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);

  /* get the polynom selection mask for the sine*/
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
  v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm3 = _mm_movehl_ps(xmm3, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm3);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm4 = mm2;
  mm5 = mm3;

  /* get the swap sign flag for the sine */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  v4sf swap_sign_bit_sin;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);

  /* get the polynom selection mask for the sine */

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf poly_mask;
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

#ifdef USE_SSE2
  /* get the sign flag for the cosine (octant index offset by 2, as in cos_ps) */
  emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
  emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
  emm4 = _mm_slli_epi32(emm4, 29);
  v4sf sign_bit_cos = _mm_castsi128_ps(emm4);
#else
  /* get the sign flag for the cosine */
  mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
  mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
  mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
  mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
  mm4 = _mm_slli_pi32(mm4, 29);
  mm5 = _mm_slli_pi32(mm5, 29);
  v4sf sign_bit_cos;
  COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
  _mm_empty(); /* good-bye mmx */
#endif

  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);


  /* Evaluate the first polynom (0 <= x <= Pi/4): the cosine series */
  v4sf z = _mm_mul_ps(x,x);
  y = *(v4sf*)_ps_coscof_p0;

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynom (Pi/4 <= x <= 0): the sine series */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynoms: the sine gets one
     branch per lane, the cosine gets the complementary branch */
  xmm3 = poly_mask;
  v4sf ysin2 = _mm_and_ps(xmm3, y2);
  v4sf ysin1 = _mm_andnot_ps(xmm3, y);
  y2 = _mm_sub_ps(y2,ysin2);
  y = _mm_sub_ps(y, ysin1);

  xmm1 = _mm_add_ps(ysin1,ysin2);
  xmm2 = _mm_add_ps(y,y2);

  /* update the sign */
  *s = _mm_xor_ps(xmm1, sign_bit_sin);
  *c = _mm_xor_ps(xmm2, sign_bit_cos);
}
725 
#endif // __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_SSE2_SSE_MATHFUN_H__