// IPSDK 4_1_0_2 -- Image Processing Software Development Kit
// avx_mathfun.h
1 /*
2  AVX implementation of sin, cos, sincos, exp and log
3 
4  Based on "sse_mathfun.h", by Julien Pommier
5  http://gruntthepeon.free.fr/ssemath/
6 
7  Copyright (C) 2012 Giovanni Garberoglio
8  Interdisciplinary Laboratory for Computational Science (LISC)
9  Fondazione Bruno Kessler and University of Trento
10  via Sommarive, 18
11  I-38123 Trento (Italy)
12 
13  This software is provided 'as-is', without any express or implied
14  warranty. In no event will the authors be held liable for any damages
15  arising from the use of this software.
16 
17  Permission is granted to anyone to use this software for any purpose,
18  including commercial applications, and to alter it and redistribute it
19  freely, subject to the following restrictions:
20 
21  1. The origin of this software must not be misrepresented; you must not
22  claim that you wrote the original software. If you use this software
23  in a product, an acknowledgment in the product documentation would be
24  appreciated but is not required.
25  2. Altered source versions must be plainly marked as such, and must not be
26  misrepresented as being the original software.
27  3. This notice may not be removed or altered from any source distribution.
28 
29  (this is the zlib license)
30 */
31 
32 #ifndef __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_AVX_AVX_MATHFUN_H__
33 #define __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_AVX_AVX_MATHFUN_H__
34 
35 #include <immintrin.h>
36 
37 /* yes I know, the top of this file is quite ugly */
38 #ifdef _MSC_VER /* visual c++ */
39 # define ALIGN32_BEG __declspec(align(32))
40 # define ALIGN32_END
41 #else /* gcc or icc */
42 # define ALIGN32_BEG
43 # define ALIGN32_END __attribute__((aligned(32)))
44 #endif
45 
/* __m128 is ugly to write */
typedef __m256 v8sf;  // vector of 8 float (avx)
typedef __m256i v8si; // vector of 8 int (avx)
typedef __m128i v4si; // vector of 4 int (sse2) -- one half of a v8si
50 
/* Declare a 32-byte-aligned array of four identical 32-bit ints,
   used as a 128-bit SSE2 integer constant by the AVX2-emulation paths. */
#define _PI32AVX_CONST(Name, Val) \
 static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = { Val, Val, Val, Val }

_PI32AVX_CONST(1, 1);
_PI32AVX_CONST(inv1, ~1);
_PI32AVX_CONST(2, 2);
_PI32AVX_CONST(4, 4);
58 
59 
/* declare some AVX constants -- why can't I figure a better way to do that? */
/* Each macro declares a 32-byte-aligned array of eight identical lanes;
   the arrays are loaded with *(v8sf*)/(v8si*) casts throughout this file. */
#define _PS256_CONST(Name, Val) \
 static const ALIGN32_BEG float _ps256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val }
#define _PI32_CONST256(Name, Val) \
 static const ALIGN32_BEG int _pi32_256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val }
#define _PS256_CONST_TYPE(Name, Type, Val) \
 static const ALIGN32_BEG Type _ps256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val }

_PS256_CONST(1 , 1.0f);
_PS256_CONST(0p5, 0.5f);
/* the smallest non denormalized float number */
_PS256_CONST_TYPE(min_norm_pos, int, static_cast<int>(0x00800000));
/* IEEE-754 single precision: exponent field (bits 23..30) and its complement */
_PS256_CONST_TYPE(mant_mask, int, static_cast<int>(0x7f800000));
_PS256_CONST_TYPE(inv_mant_mask, int, static_cast<int>(~0x7f800000));

/* IEEE-754 single precision: sign bit (bit 31) and its complement */
_PS256_CONST_TYPE(sign_mask, int, static_cast<int>(0x80000000));
_PS256_CONST_TYPE(inv_sign_mask, int, static_cast<int>(~0x80000000));

/* small integer constants used by the range-reduction logic below */
_PI32_CONST256(0, 0);
_PI32_CONST256(1, 1);
_PI32_CONST256(inv1, ~1);
_PI32_CONST256(2, 2);
_PI32_CONST256(4, 4);
_PI32_CONST256(0x7f, 0x7f); /* single-precision exponent bias */

/* cephes minimax-polynomial coefficients for logf */
_PS256_CONST(cephes_SQRTHF, 0.707106781186547524f); /* sqrt(0.5) */
_PS256_CONST(cephes_log_p0, 7.0376836292E-2f);
_PS256_CONST(cephes_log_p1, - 1.1514610310E-1f);
_PS256_CONST(cephes_log_p2, 1.1676998740E-1f);
_PS256_CONST(cephes_log_p3, - 1.2420140846E-1f);
_PS256_CONST(cephes_log_p4, + 1.4249322787E-1f);
_PS256_CONST(cephes_log_p5, - 1.6668057665E-1f);
_PS256_CONST(cephes_log_p6, + 2.0000714765E-1f);
_PS256_CONST(cephes_log_p7, - 2.4999993993E-1f);
_PS256_CONST(cephes_log_p8, + 3.3333331174E-1f);
_PS256_CONST(cephes_log_q1, -2.12194440e-4f);
_PS256_CONST(cephes_log_q2, 0.693359375f);
97 
#if !(defined __linux__ && defined __AVX2__) && !(defined(_MSC_VER) && _MSC_VER >= 1700)

/* No native AVX2 integer instructions available: a 256-bit integer vector
   is split into two 128-bit halves so the work can be done with SSE2. */
typedef union imm_xmm_union {
 v8si imm;
 v4si xmm[2];
} imm_xmm_union;

/* NOTE(review): __attribute__((aligned(32))) is a GCC/Clang extension, yet this
   fallback branch is also selected for MSVC older than 1700 -- confirm that
   configuration is actually supported, or guard with ALIGN32_BEG/ALIGN32_END. */
#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) { \
 imm_xmm_union u __attribute__((aligned(32))); \
 u.imm = imm_; \
 xmm0_ = u.xmm[0]; \
 xmm1_ = u.xmm[1]; \
}

#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) { \
 imm_xmm_union u __attribute__((aligned(32))); \
 u.xmm[0]=xmm0_; u.xmm[1]=xmm1_; imm_ = u.imm; \
 }


/* Emulate a 256-bit shift-by-immediate (fn = slli_epi32 / srli_epi32)
   by applying the SSE2 form to each 128-bit half. */
#define AVX2_BITOP_USING_SSE2(fn) \
namespace {\
inline v8si custom_mm256_##fn(v8si x, int a) \
{ \
 /* use SSE2 instruction to perform the bitop AVX2 */ \
 v4si x1, x2; \
 v8si ret; \
 COPY_IMM_TO_XMM(x, x1, x2); \
 x1 = _mm_##fn(x1,a); \
 x2 = _mm_##fn(x2,a); \
 COPY_XMM_TO_IMM(x1, x2, ret); \
 return(ret); \
} \
}

//#warning "Using SSE2 to perform AVX2 bitshift ops"
AVX2_BITOP_USING_SSE2(slli_epi32)
AVX2_BITOP_USING_SSE2(srli_epi32)

/* Emulate a 256-bit element-wise integer op (fn = add_epi32 / sub_epi32 / ...)
   by applying the SSE2 form to each 128-bit half. */
#define AVX2_INTOP_USING_SSE2(fn) \
namespace {\
inline v8si custom_mm256_##fn(v8si x, v8si y) \
{ \
 /* use SSE2 instructions to perform the AVX2 integer operation */ \
 v4si x1, x2; \
 v4si y1, y2; \
 v8si ret; \
 COPY_IMM_TO_XMM(x, x1, x2); \
 COPY_IMM_TO_XMM(y, y1, y2); \
 x1 = _mm_##fn(x1,y1); \
 x2 = _mm_##fn(x2,y2); \
 COPY_XMM_TO_IMM(x1, x2, ret); \
 return(ret); \
}\
}

//#warning "Using SSE2 to perform AVX2 integer ops"
//AVX2_INTOP_USING_SSE2(and_si128)
//AVX2_INTOP_USING_SSE2(andnot_si128)
//AVX2_INTOP_USING_SSE2(cmpeq_epi32)
AVX2_INTOP_USING_SSE2(sub_epi32)
AVX2_INTOP_USING_SSE2(add_epi32)

#endif /* no native AVX2: SSE2 emulation of the integer ops */
162 
163 
164 /* natural logarithm computed for 8 simultaneous float
165  return NaN for x <= 0
166 */
167 inline
168 v8sf log256_ps(v8sf x) {
169  v8si imm0;
170  v8sf one = *(v8sf*)_ps256_1;
171 
172  //v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps());
173  v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS);
174 
175  x = _mm256_max_ps(x, *(v8sf*)_ps256_min_norm_pos); /* cut off denormalized stuff */
176 
177  // can be done with AVX2
178 #if !(defined __linux__ && defined __AVX2__) && !(defined(_MSC_VER) && _MSC_VER >= 1700)
179  imm0 = custom_mm256_srli_epi32(_mm256_castps_si256(x), 23);
180 #else
181  imm0 = _mm256_srli_epi32(_mm256_castps_si256(x), 23);
182 #endif
183 
184  /* keep only the fractional part */
185  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_mant_mask);
186  x = _mm256_or_ps(x, *(v8sf*)_ps256_0p5);
187 
188  // this is again another AVX2 instruction
189 
190 #if !(defined __linux__ && defined __AVX2__) && !(defined(_MSC_VER) && _MSC_VER >= 1700)
191  imm0 = custom_mm256_sub_epi32(imm0, *(v8si*)_pi32_256_0x7f);
192 #else
193  imm0 = _mm256_sub_epi32(imm0, *(v8si*)_pi32_256_0x7f);
194 #endif
195  v8sf e = _mm256_cvtepi32_ps(imm0);
196 
197  e = _mm256_add_ps(e, one);
198 
199  /* part2:
200  if( x < SQRTHF ) {
201  e -= 1;
202  x = x + x - 1.0;
203  } else { x = x - 1.0; }
204  */
205  //v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF);
206  v8sf mask = _mm256_cmp_ps(x, *(v8sf*)_ps256_cephes_SQRTHF, _CMP_LT_OS);
207  v8sf tmp = _mm256_and_ps(x, mask);
208  x = _mm256_sub_ps(x, one);
209  e = _mm256_sub_ps(e, _mm256_and_ps(one, mask));
210  x = _mm256_add_ps(x, tmp);
211 
212  v8sf z = _mm256_mul_ps(x,x);
213 
214  v8sf y = *(v8sf*)_ps256_cephes_log_p0;
215  y = _mm256_mul_ps(y, x);
216  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p1);
217  y = _mm256_mul_ps(y, x);
218  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p2);
219  y = _mm256_mul_ps(y, x);
220  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p3);
221  y = _mm256_mul_ps(y, x);
222  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p4);
223  y = _mm256_mul_ps(y, x);
224  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p5);
225  y = _mm256_mul_ps(y, x);
226  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p6);
227  y = _mm256_mul_ps(y, x);
228  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p7);
229  y = _mm256_mul_ps(y, x);
230  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p8);
231  y = _mm256_mul_ps(y, x);
232 
233  y = _mm256_mul_ps(y, z);
234 
235  tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q1);
236  y = _mm256_add_ps(y, tmp);
237 
238 
239  tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
240  y = _mm256_sub_ps(y, tmp);
241 
242  tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q2);
243  x = _mm256_add_ps(x, y);
244  x = _mm256_add_ps(x, tmp);
245  //x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN
246  x = _mm256_or_ps(
247  _mm256_and_ps(_mm256_setzero_ps(), invalid_mask),
248  _mm256_andnot_ps(invalid_mask, x)); // negative or null arg will be 0
249  return x;
250 }
251 
/* clamp bounds for exp256_ps: beyond these the single-precision result
   overflows / underflows */
_PS256_CONST(exp_hi, 88.3762626647949f);
_PS256_CONST(exp_lo, -88.3762626647949f);

/* log2(e), and log(2) split in two parts for extended-precision reduction */
_PS256_CONST(cephes_LOG2EF, 1.44269504088896341f);
_PS256_CONST(cephes_exp_C1, 0.693359375f);
_PS256_CONST(cephes_exp_C2, -2.12194440e-4f);

/* cephes minimax-polynomial coefficients for expf */
_PS256_CONST(cephes_exp_p0, 1.9875691500E-4f);
_PS256_CONST(cephes_exp_p1, 1.3981999507E-3f);
_PS256_CONST(cephes_exp_p2, 8.3334519073E-3f);
_PS256_CONST(cephes_exp_p3, 4.1665795894E-2f);
_PS256_CONST(cephes_exp_p4, 1.6666665459E-1f);
_PS256_CONST(cephes_exp_p5, 5.0000001201E-1f);
265 
/* exponential computed for 8 simultaneous float.
   Input is clamped to [exp_lo, exp_hi]; exp(x) is evaluated as
   exp(g) * 2^n with g in [-log(2)/2, log(2)/2]. */
inline
v8sf exp256_ps(v8sf x) {
  v8sf tmp = _mm256_setzero_ps(), fx;
  v8si imm0;
  v8sf one = *(v8sf*)_ps256_1;

  /* clamp to the range where the single-precision result is finite */
  x = _mm256_min_ps(x, *(v8sf*)_ps256_exp_hi);
  x = _mm256_max_ps(x, *(v8sf*)_ps256_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_LOG2EF);
  fx = _mm256_add_ps(fx, *(v8sf*)_ps256_0p5);

  /* how to perform a floorf with SSE: just below */
  //imm0 = _mm256_cvttps_epi32(fx);
  //tmp = _mm256_cvtepi32_ps(imm0);

  tmp = _mm256_floor_ps(fx);

  /* if greater, subtract 1 */
  v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);
  mask = _mm256_and_ps(mask, one);
  fx = _mm256_sub_ps(tmp, mask);

  /* extended-precision reduction: x -= fx*log(2), with log(2) in two parts */
  tmp = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C1);
  v8sf z = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C2);
  x = _mm256_sub_ps(x, tmp);
  x = _mm256_sub_ps(x, z);

  z = _mm256_mul_ps(x,x);

  /* Horner evaluation of the cephes polynomial for exp(g) */
  v8sf y = *(v8sf*)_ps256_cephes_exp_p0;
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p1);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p2);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p3);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p4);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p5);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, x);
  y = _mm256_add_ps(y, one);

  /* build 2^n by placing n+127 in the exponent field */
#if !(defined __linux__ && defined __AVX2__) && !(defined(_MSC_VER) && _MSC_VER >= 1700)
  imm0 = _mm256_cvttps_epi32(fx);
  imm0 = custom_mm256_add_epi32(imm0, *(v8si*)_pi32_256_0x7f);
  imm0 = custom_mm256_slli_epi32(imm0, 23);
#else
  imm0 = _mm256_cvttps_epi32(fx);
  imm0 = _mm256_add_epi32(imm0, *(v8si*)_pi32_256_0x7f);
  imm0 = _mm256_slli_epi32(imm0, 23);
#endif

  v8sf pow2n = _mm256_castsi256_ps(imm0);
  y = _mm256_mul_ps(y, pow2n);
  return y;
}
329 
/* pi/4 split in three parts (negated) for extended-precision range reduction */
_PS256_CONST(minus_cephes_DP1, -0.78515625f);
_PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4f);
_PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8f);
/* cephes minimax-polynomial coefficients for sinf and cosf */
_PS256_CONST(sincof_p0, -1.9515295891E-4f);
_PS256_CONST(sincof_p1, 8.3321608736E-3f);
_PS256_CONST(sincof_p2, -1.6666654611E-1f);
_PS256_CONST(coscof_p0, 2.443315711809948E-005f);
_PS256_CONST(coscof_p1, -1.388731625493765E-003f);
_PS256_CONST(coscof_p2, 4.166664568298827E-002f);
_PS256_CONST(cephes_FOPI, 1.27323954473516f); // 4 / M_PI
340 
341 
/* evaluation of 8 sines at onces using AVX intrisics

   The code is the exact rewriting of the cephes sinf function.
   Precision is excellent as long as x < 8192 (I did not bother to
   take into account the special handling they have for greater values
   -- it does not return garbage for arguments over 8192, though, but
   the extra precision is missing).

   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
   surprising but correct result.

*/
inline
v8sf sin256_ps(v8sf x) { // any x
  v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y;
  v8si imm0, imm2;

#if !(defined __linux__ && defined __AVX2__) && !(defined(_MSC_VER) && _MSC_VER >= 1700)
  v4si imm0_1, imm0_2;
  v4si imm2_1, imm2_2;
#endif

  sign_bit = x;
  /* take the absolute value */
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit = _mm256_and_ps(sign_bit, *(v8sf*)_ps256_sign_mask);

  /* scale by 4/Pi */
  y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);

  /*
    Here we start a series of integer operations, which are in the
    realm of AVX2.
    If we don't have AVX, let's perform them using SSE2 directives
  */

#if (defined __linux__ && defined __AVX2__) || (defined(_MSC_VER) && _MSC_VER >= 1700)
  /* store the integer part of y in mm0 */
  imm2 = _mm256_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1);
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1);
  y = _mm256_cvtepi32_ps(imm2);

  /* get the swap sign flag */
  imm0 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_4);
  imm0 = _mm256_slli_epi32(imm0, 29);
  /* get the polynom selection mask
     there is one polynom for 0 <= x <= Pi/4
     and another one for Pi/4<x<=Pi/2

     Both branches will be computed.
  */
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_2);
  imm2 = _mm256_cmpeq_epi32(imm2,*(v8si*)_pi32_256_0);
#else
  /* we use SSE2 routines to perform the integer ops */
  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y),imm2_1,imm2_2);

  /* j=(j+1) & (~1), done per 128-bit half */
  imm2_1 = _mm_add_epi32(imm2_1, *(v4si*)_pi32avx_1);
  imm2_2 = _mm_add_epi32(imm2_2, *(v4si*)_pi32avx_1);

  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_inv1);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_inv1);

  COPY_XMM_TO_IMM(imm2_1,imm2_2,imm2);
  y = _mm256_cvtepi32_ps(imm2);

  /* swap sign flag */
  imm0_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_4);
  imm0_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_4);

  imm0_1 = _mm_slli_epi32(imm0_1, 29);
  imm0_2 = _mm_slli_epi32(imm0_2, 29);

  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);

  /* polynom selection mask */
  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_2);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_2);

  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
#endif

  v8sf swap_sign_bit = _mm256_castsi256_ps(imm0);
  v8sf poly_mask = _mm256_castsi256_ps(imm2);
  sign_bit = _mm256_xor_ps(sign_bit, swap_sign_bit);

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v8sf*)_ps256_minus_cephes_DP1;
  xmm2 = *(v8sf*)_ps256_minus_cephes_DP2;
  xmm3 = *(v8sf*)_ps256_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);

  /* Evaluate the first polynom (0 <= x <= Pi/4) */
  y = *(v8sf*)_ps256_coscof_p0;
  v8sf z = _mm256_mul_ps(x,x);

  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_1);

  /* Evaluate the second polynom (Pi/4 <= x <= 0) */

  v8sf y2 = *(v8sf*)_ps256_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);

  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  y2 = _mm256_and_ps(xmm3, y2); //, xmm3);
  y = _mm256_andnot_ps(xmm3, y);
  y = _mm256_add_ps(y,y2);
  /* update the sign */
  y = _mm256_xor_ps(y, sign_bit);

  return y;
}
480 
/* almost the same as sin_ps: cos(x) = sin(x + pi/2) is obtained by
   offsetting the octant index by -2 before deriving sign/selection masks */
inline
v8sf cos256_ps(v8sf x) { // any x
  v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, y;
  v8si imm0, imm2;

#if !(defined __linux__ && defined __AVX2__) && !(defined(_MSC_VER) && _MSC_VER >= 1700)
  v4si imm0_1, imm0_2;
  v4si imm2_1, imm2_2;
#endif

  /* take the absolute value */
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);

  /* scale by 4/Pi */
  y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);

#if (defined __linux__ && defined __AVX2__) || (defined(_MSC_VER) && _MSC_VER >= 1700)
  /* store the integer part of y in mm0 */
  imm2 = _mm256_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1);
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1);
  y = _mm256_cvtepi32_ps(imm2);
  imm2 = _mm256_sub_epi32(imm2, *(v8si*)_pi32_256_2);

  /* get the swap sign flag */
  imm0 = _mm256_andnot_si256(imm2, *(v8si*)_pi32_256_4);
  imm0 = _mm256_slli_epi32(imm0, 29);
  /* get the polynom selection mask */
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_2);
  imm2 = _mm256_cmpeq_epi32(imm2, *(v8si*)_pi32_256_0);
#else

  /* we use SSE2 routines to perform the integer ops */
  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y),imm2_1,imm2_2);

  /* j=(j+1) & (~1), done per 128-bit half */
  imm2_1 = _mm_add_epi32(imm2_1, *(v4si*)_pi32avx_1);
  imm2_2 = _mm_add_epi32(imm2_2, *(v4si*)_pi32avx_1);

  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_inv1);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_inv1);

  COPY_XMM_TO_IMM(imm2_1,imm2_2,imm2);
  y = _mm256_cvtepi32_ps(imm2);

  /* j -= 2 (cosine phase offset) */
  imm2_1 = _mm_sub_epi32(imm2_1, *(v4si*)_pi32avx_2);
  imm2_2 = _mm_sub_epi32(imm2_2, *(v4si*)_pi32avx_2);

  /* swap sign flag */
  imm0_1 = _mm_andnot_si128(imm2_1, *(v4si*)_pi32avx_4);
  imm0_2 = _mm_andnot_si128(imm2_2, *(v4si*)_pi32avx_4);

  imm0_1 = _mm_slli_epi32(imm0_1, 29);
  imm0_2 = _mm_slli_epi32(imm0_2, 29);

  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);

  /* polynom selection mask */
  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_2);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_2);

  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
#endif

  v8sf sign_bit = _mm256_castsi256_ps(imm0);
  v8sf poly_mask = _mm256_castsi256_ps(imm2);

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v8sf*)_ps256_minus_cephes_DP1;
  xmm2 = *(v8sf*)_ps256_minus_cephes_DP2;
  xmm3 = *(v8sf*)_ps256_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);

  /* Evaluate the first polynom (0 <= x <= Pi/4) */
  y = *(v8sf*)_ps256_coscof_p0;
  v8sf z = _mm256_mul_ps(x,x);

  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_1);

  /* Evaluate the second polynom (Pi/4 <= x <= 0) */

  v8sf y2 = *(v8sf*)_ps256_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);

  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  y2 = _mm256_and_ps(xmm3, y2); //, xmm3);
  y = _mm256_andnot_ps(xmm3, y);
  y = _mm256_add_ps(y,y2);
  /* update the sign */
  y = _mm256_xor_ps(y, sign_bit);

  return y;
}
597 
/* since sin256_ps and cos256_ps are almost identical, sincos256_ps could replace both of them..
   it is almost as fast, and gives you a free cosine with your sine.
   Writes sin(x) to *s and cos(x) to *c; shares one range reduction and
   one pair of polynomial evaluations between the two results. */
inline
void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {

  v8sf xmm1, xmm2, xmm3 = _mm256_setzero_ps(), sign_bit_sin, y;
  v8si imm0, imm2, imm4;

#if !(defined __linux__ && defined __AVX2__) && !(defined(_MSC_VER) && _MSC_VER >= 1700)
  v4si imm0_1, imm0_2;
  v4si imm2_1, imm2_2;
  v4si imm4_1, imm4_2;
#endif

  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(v8sf*)_ps256_sign_mask);

  /* scale by 4/Pi */
  y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);

#if (defined __linux__ && defined __AVX2__) || (defined(_MSC_VER) && _MSC_VER >= 1700)
  /* store the integer part of y in imm2 */
  imm2 = _mm256_cvttps_epi32(y);

  /* j=(j+1) & (~1) (see the cephes sources) */
  imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1);
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1);

  y = _mm256_cvtepi32_ps(imm2);
  imm4 = imm2; /* keep the octant index for the cosine sign below */

  /* get the swap sign flag for the sine */
  imm0 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_4);
  imm0 = _mm256_slli_epi32(imm0, 29);
  //v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);

  /* get the polynom selection mask for the sine*/
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_2);
  imm2 = _mm256_cmpeq_epi32(imm2, *(v8si*)_pi32_256_0);
  //v8sf poly_mask = _mm256_castsi256_ps(imm2);
#else
  /* we use SSE2 routines to perform the integer ops */
  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y),imm2_1,imm2_2);

  /* j=(j+1) & (~1), done per 128-bit half */
  imm2_1 = _mm_add_epi32(imm2_1, *(v4si*)_pi32avx_1);
  imm2_2 = _mm_add_epi32(imm2_2, *(v4si*)_pi32avx_1);

  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_inv1);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_inv1);

  COPY_XMM_TO_IMM(imm2_1,imm2_2,imm2);
  y = _mm256_cvtepi32_ps(imm2);

  /* keep the octant index for the cosine sign below */
  imm4_1 = imm2_1;
  imm4_2 = imm2_2;

  /* swap sign flag for the sine */
  imm0_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_4);
  imm0_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_4);

  imm0_1 = _mm_slli_epi32(imm0_1, 29);
  imm0_2 = _mm_slli_epi32(imm0_2, 29);

  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);

  /* polynom selection mask for the sine */
  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_2);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_2);

  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
#endif
  v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
  v8sf poly_mask = _mm256_castsi256_ps(imm2);

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v8sf*)_ps256_minus_cephes_DP1;
  xmm2 = *(v8sf*)_ps256_minus_cephes_DP2;
  xmm3 = *(v8sf*)_ps256_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);

  /* derive the cosine sign from the saved octant index (j - 2) */
#if (defined __linux__ && defined __AVX2__) || (defined(_MSC_VER) && _MSC_VER >= 1700)
  imm4 = _mm256_sub_epi32(imm4, *(v8si*)_pi32_256_2);
  imm4 = _mm256_andnot_si256(imm4, *(v8si*)_pi32_256_4);
  imm4 = _mm256_slli_epi32(imm4, 29);
#else
  imm4_1 = _mm_sub_epi32(imm4_1, *(v4si*)_pi32avx_2);
  imm4_2 = _mm_sub_epi32(imm4_2, *(v4si*)_pi32avx_2);

  imm4_1 = _mm_andnot_si128(imm4_1, *(v4si*)_pi32avx_4);
  imm4_2 = _mm_andnot_si128(imm4_2, *(v4si*)_pi32avx_4);

  imm4_1 = _mm_slli_epi32(imm4_1, 29);
  imm4_2 = _mm_slli_epi32(imm4_2, 29);

  COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4);
#endif

  v8sf sign_bit_cos = _mm256_castsi256_ps(imm4);

  sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin);

  /* Evaluate the first polynom (0 <= x <= Pi/4) */
  v8sf z = _mm256_mul_ps(x,x);
  y = *(v8sf*)_ps256_coscof_p0;

  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_1);

  /* Evaluate the second polynom (Pi/4 <= x <= 0) */

  v8sf y2 = *(v8sf*)_ps256_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);

  /* select the correct result from the two polynoms; each output takes one
     polynomial where the mask selects it and the other elsewhere */
  xmm3 = poly_mask;
  v8sf ysin2 = _mm256_and_ps(xmm3, y2);
  v8sf ysin1 = _mm256_andnot_ps(xmm3, y);
  y2 = _mm256_sub_ps(y2,ysin2);
  y = _mm256_sub_ps(y, ysin1);

  xmm1 = _mm256_add_ps(ysin1,ysin2);
  xmm2 = _mm256_add_ps(y,y2);

  /* update the sign */
  *s = _mm256_xor_ps(xmm1, sign_bit_sin);
  *c = _mm256_xor_ps(xmm2, sign_bit_cos);
}
748 
749 #endif // __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_AVX_AVX_MATHFUN_H__
// Definition: avx_mathfun.h:100