IPSDK  4_1_0_2
IPSDK : Image Processing Software Development Kit
CosReg.h
Go to the documentation of this file.
1 // CosReg.h:
3 // -------------------
4 //
15 
16 #ifndef __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_COMMON_COSREG_H__
17 #define __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_COMMON_COSREG_H__
18 
21 #include <IPSDKUtil/InstructionSet/Arithmetic/detail/AbsReg.h>
22 #include <IPSDKUtil/InstructionSet/Arithmetic/detail/AddReg.h>
25 #include <IPSDKUtil/InstructionSet/Arithmetic/detail/PolynomReg.h>
35 
36 #include <boost/mpl/less.hpp>
37 
38 namespace ipsdk {
39 namespace simd {
40 namespace detail {
41 
44 
// Partial specialization of CosReg for ipReal32 registers of instruction set IS.
// The commented-out third template parameter below is a boost::enable_if guard
// built on boost::mpl::less<int_<IS>, int_<eIS_Avx512>>; it is disabled here.
// NOTE(review): this listing skips several original lines (the embedded numbering
// jumps over 58, 80, 88, 91-92, 94, 97, 103, 114-115, 134 and 154); statements on
// those lines are not shown here and are flagged where they matter below.
46 template <
47  eInstructionSet::domain IS,
48  eInstructionSet::domain ISFma/*,
49  typename boost::enable_if<
50  typename boost::mpl::less<
51  boost::mpl::int_<IS>,
52  boost::mpl::int_<eInstructionSet::eIS_Avx512>
53  >::type
54  >::type*/>
55 struct CosReg<IS, ISFma, ipReal32>
56 {
 // Value-returning overload: forwards to the two-argument act() below and
 // returns the locally filled register.
 // NOTE(review): the return-type line (listing line 58) is missing; the body
 // returns a RegType<IS, ipReal32>::Type.
57  static IPSDK_FORCEINLINE
59  act(const typename RegType<IS, ipReal32>::Type& in)
60  {
61  typename RegType<IS, ipReal32>::Type out;
62  act(in, out);
63  return out;
64  }
65 
 // Output-parameter overload: reads `in` and writes the result into `out`.
 // `out` is also used as scratch storage for intermediate values.
 // Throughout this function the helper calls are used as act(a, b, result).
66  static IPSDK_FORCEINLINE
67  void
68  act(const typename RegType<IS, ipReal32>::Type& in,
69  typename RegType<IS, ipReal32>::Type& out)
70  {
71  typedef typename RegType<IS, ipReal32>::Type RegReal32;
72  typedef typename RegType<IS, ipInt32>::Type RegInt32;
73 
74  RegReal32 x = in;
75  RegReal32 xmm1, xmm2, xmm3;
 // NOTE(review): xmm2 is overwritten by the DP2 product further down; no read
 // of this zero value is visible in between - confirm against the missing lines.
76  AssignReg<IS, ipReal32>::act(xmm2, 0.0f);
77  RegInt32 emm0, emm2;
78 
 // NOTE(review): the statement for this step (listing line 80) is not shown.
79  /* take the absolute value */
81 
82  /* scale by 4/Pi */
83  RegReal32 cephesFOPI;
84  AssignReg<IS, ipReal32>::act(cephesFOPI, 1.27323954473516f); // 4 / M_PI
85  MulReg<IS, ipReal32>::act(x, cephesFOPI, out);
86 
 // NOTE(review): the comment below mentions "mm0", but the visible code works on
 // emm2; the conversion statement itself (listing line 88) is not shown.
87  /* store the integer part of out in mm0 */
89  /* j=(j+1) & (~1) (see the cephes sources) */
 // NOTE(review): the initialisations of zero, one and two (listing lines 91, 92,
 // 94) are not shown; their values are presumably 0, 1 and 2 - confirm.
90  RegInt32 zero, one, invOne, two;
93  AssignReg<IS, ipInt32>::act(invOne, ~1);
95  AddReg<IS, ipInt32>::act(emm2, one, emm2);
96  BitwiseAndReg<IS, ipInt32>::act(emm2, invOne, emm2);
98 
 // Offsets the index by `two` before the sign and polynomial selection below.
99  SubReg<IS, ipInt32>::act(emm2, two, emm2);
100 
101  /* get the swap sign flag */
 // NOTE(review): the initialisation of `four` (listing line 103) is not shown.
 // If it holds 4 (bit 2), the shift by 29 moves that bit to bit 31 - confirm.
 // The operand order of BitwiseAndNotReg (which input is complemented) is not
 // visible from this file.
102  RegInt32 four;
104  BitwiseAndNotReg<IS, ipInt32>::act(four, emm2, emm0);
105  ShiftLeftReg<IS, ipInt32>::act(emm0, 29, emm0);
106 
107  /* get the polynom selection mask */
 // emm2 is reduced to its `two` bit, compared against zero, and the comparison
 // mask is cast back into emm2.
108  BitwiseAndReg<IS, ipInt32>::act(emm2, two, emm2);
109  typename RegMaskType<IS, ipInt32>::Type emm2Mask;
110  IsEqualReg<IS, ipInt32>::act(emm2, zero, emm2Mask);
111  CastReg<IS, ipInt32, ipInt32>::act(emm2Mask, emm2);
112 
 // NOTE(review): the statements that fill sign_bit and poly_mask (listing lines
 // 114-115) are not shown; presumably they reinterpret emm0 and emm2 - confirm.
113  RegReal32 sign_bit, poly_mask;
116 
117  /* The magic pass: "Extended precision modular arithmetic"
118  x = ((x - out * DP1) - out * DP2) - out * DP3; */
 // The three constants are stored negated so the subtraction in the formula
 // above is carried out with AddReg; their magnitudes sum to approximately Pi/4.
119  RegReal32 minusCephesDP1, minusCephesDP2, minusCephesDP3;
120  AssignReg<IS, ipReal32>::act(minusCephesDP1, -0.78515625f);
121  AssignReg<IS, ipReal32>::act(minusCephesDP2, -2.4187564849853515625e-4f);
122  AssignReg<IS, ipReal32>::act(minusCephesDP3, -3.77489497744594108e-8f);
123 
124  MulReg<IS, ipReal32>::act(out, minusCephesDP1, xmm1);
125  MulReg<IS, ipReal32>::act(out, minusCephesDP2, xmm2);
126  MulReg<IS, ipReal32>::act(out, minusCephesDP3, xmm3);
127  AddReg<IS, ipReal32>::act(x, xmm1, x);
128  AddReg<IS, ipReal32>::act(x, xmm2, x);
129  AddReg<IS, ipReal32>::act(x, xmm3, x);
130 
131  /* Evaluate the first polynom (0 <= x <= Pi/4) */
132  RegReal32 z;
133  MulReg<IS, ipReal32>::act(x, x, z);
 // NOTE(review): the head of this call (listing line 134) is not shown; the
 // arguments are z, three cosine coefficients and the destination `out`.
135  z,
136  4.166664568298827E-002f, // coscof_p2
137  -1.388731625493765E-003f, // coscof_p1
138  2.443315711809948E-005f, // coscof_p0
139  out);
140 
 // out = out * z * z - 0.5 * z + 1
141  MulReg<IS, ipReal32>::act(out, z, out);
142  MulReg<IS, ipReal32>::act(out, z, out);
143 
144  RegReal32 halfOne, oneReal32;
145  AssignReg<IS, ipReal32>::act(halfOne, 0.5f);
146  AssignReg<IS, ipReal32>::act(oneReal32, 1.0f);
147  RegReal32 tmp;
148  MulReg<IS, ipReal32>::act(z, halfOne, tmp);
149  SubReg<IS, ipReal32>::act(out, tmp, out);
150  AddReg<IS, ipReal32>::act(out, oneReal32, out);
151 
 // NOTE(review): the interval in the comment below cannot hold as written
 // (Pi/4 <= x <= 0); both polynomials are evaluated on the same z = x * x.
152  /* Evaluate the second polynom (Pi/4 <= x <= 0) */
153  RegReal32 y2;
 // NOTE(review): the head of this call (listing line 154) is not shown; the
 // arguments are z, three sine coefficients and the destination `y2`.
155  z,
156  -1.6666654611E-1f, // sincof_p2
157  8.3321608736E-3f, // sincof_p1
158  -1.9515295891E-4f, // sincof_p0
159  y2);
 // y2 = y2 * z * x + x
160  MulReg<IS, ipReal32>::act(y2, z, y2);
161  MulReg<IS, ipReal32>::act(y2, x, y2);
162  AddReg<IS, ipReal32>::act(y2, x, y2);
163 
164  /* select the correct result from the two polynoms */
 // y2 is masked with poly_mask, `out` is combined with poly_mask through
 // BitwiseAndNotReg, and the two masked values are then added together.
 // NOTE(review): this only selects one result per lane if BitwiseAndNotReg
 // complements the mask operand - confirm its operand order.
165  BitwiseAndReg<IS, ipReal32>::act(poly_mask, y2, y2);
166  BitwiseAndNotReg<IS, ipReal32>::act(out, poly_mask, out);
167  AddReg<IS, ipReal32>::act(out, y2, out);
168  /* update the sign */
 // XOR with sign_bit changes the result wherever sign_bit has bits set.
169  BitwiseXOrReg<IS, ipReal32>::act(out, sign_bit, out);
170  }
171 };
172 
175 
176 } // end of namespace detail
177 } // end of namespace simd
178 } // end of namespace ipsdk
179 
180 #endif // __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_COMMON_COSREG_H__
template structure which is specialized to implement the computation of a polynom of degree 8 applied...
Definition: PolynomReg.h:43
Defines the IPSDK_FORCEINLINE.
Main namespace for IPSDK library.
Definition: AlgorithmFunctionEfficiency.h:22
Definition: BitwiseAndNotReg.h:30
Definition: SubReg.h:39
Definition: IsEqualRegDecl.h:35
IsEqualReg<eInstructionSet::domain instructionSet, typename T, typename Enable=void> structure...
Definition: CastReg.h:30
RegType class.
Definition: ShiftLeftReg.h:30
Definition: BitwiseXOrReg.h:30
Definition of import/export macro for library.
template structure which is specialized to implement the arithmetic addition on 2 scalars or 2 regist...
Definition: AddReg.h:37
Definition: MulReg.h:39
Definition: RegMaskType.h:29
Definition: BitwiseCastReg.h:29
template structure which is specialized to implement the computation of abs function on a scalar or a...
Definition: AbsReg.h:46
Definition: RegType.h:29
template structure which is specialized to implement the computation of cosine on a scalar or a regis...
Definition: CosReg.h:39
Definition: BitwiseAndReg.h:30
Definition: AssignRegDecl.h:31
float ipReal32
Base types definition.
Definition: BaseTypes.h:56