IPSDK  4_1_0_2
IPSDK : Image Processing Software Development Kit
MulReg.h
Go to the documentation of this file.
1 // MulReg.h:
3 // -------------------
4 //
14 
15 #ifndef __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_SSE2_MULREG_H__
16 #define __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_SSE2_MULREG_H__
17 
21 
22 namespace ipsdk {
23 namespace simd {
24 namespace detail {
25 
28 
29 static IPSDK_FORCEINLINE
30 __m128i
31 _custom_mm_mullo_epi8(__m128i a, __m128i b)
32 {
33  /*
34  // There is no 8-bit multiply in SSE2. Split into two 16-bit multiplies
35  __m128i aodd = _mm_srli_epi16(a,8); // odd numbered elements of a
36  __m128i bodd = _mm_srli_epi16(b,8); // odd numbered elements of b
37  __m128i muleven = _mm_mullo_epi16(a,b); // product of even numbered elements
38  __m128i mulodd = _mm_mullo_epi16(aodd,bodd); // product of odd numbered elements
39  mulodd = _mm_slli_epi16(mulodd,8); // put odd numbered elements back in place
40  __m128i mask = _mm_set1_epi32(0x00FF00FF); // mask for even positions
41  __m128i product = selectb(mask,muleven,mulodd); // interleave even and odd
42  return product;*/
43  __m128i mask = _mm_set1_epi16(0xFF);
44  return _mm_or_si128 ( _mm_and_si128(mask, _mm_mullo_epi16(a, b))
45  , _mm_slli_epi16
46  ( _mm_and_si128 ( mask
47  , _mm_mullo_epi16
48  ( _mm_srli_epi16(a, 8)
49  , _mm_srli_epi16(b, 8)
50  )
51  )
52  , 8
53  )
54  );
55 }
56 
57 static IPSDK_FORCEINLINE
58 void
59 _custom_mm_mullo_epi8(__m128i a, __m128i b, __m128i& out)
60 {
61  __m128i mask = _mm_set1_epi16(0xFF);
62  out = _mm_or_si128 ( _mm_and_si128(mask, _mm_mullo_epi16(a, b))
63  , _mm_slli_epi16
64  ( _mm_and_si128 ( mask
65  , _mm_mullo_epi16
66  ( _mm_srli_epi16(a, 8)
67  , _mm_srli_epi16(b, 8)
68  )
69  )
70  , 8
71  )
72  );
73 }
74 
75 IPSDK_FORCEINLINE
76 __m128i
77 _custom_mm_mullo_epi32(__m128i a, __m128i b)
78 {
79  /*__m128i a13 = _mm_shuffle_epi32(a, 0xF5); // (-,a3,-,a1)
80  __m128i b13 = _mm_shuffle_epi32(b, 0xF5); // (-,b3,-,b1)
81  __m128i prod02 = _mm_mul_epu32(a, b); // (-,a2*b2,-,a0*b0)
82  __m128i prod13 = _mm_mul_epu32(a13, b13); // (-,a3*b3,-,a1*b1)
83  __m128i prod01 = _mm_unpacklo_epi32(prod02,prod13); // (-,-,a1*b1,a0*b0)
84  __m128i prod23 = _mm_unpackhi_epi32(prod02,prod13); // (-,-,a3*b3,a2*b2)
85 
86  return _mm_unpacklo_epi64(prod01,prod23); // (ab3,ab2,ab1,ab0)*/
87 
88  /*__m128 af = _mm_castsi128_ps(a);
89  __m128 bf = _mm_castsi128_ps(b);
90  __m128 resf = _mm_mul_ps(af, bf);
91  return _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b)));*/
92 
93  return _mm_or_si128(
94  _mm_and_si128(
95  _mm_mul_epu32(a, b),
96  _mm_setr_epi32(0xffffffff,0,0xffffffff,0)
97  )
98  , _mm_slli_si128(
99  _mm_and_si128(
100  _mm_mul_epu32( _mm_srli_si128(a, 4)
101  , _mm_srli_si128(b, 4)
102  )
103  , _mm_setr_epi32(0xffffffff,0,0xffffffff,0)
104  )
105  , 4
106  )
107  );
108 }
109 
110 IPSDK_FORCEINLINE
111 void
112 _custom_mm_mullo_epi32(__m128i a, __m128i b, __m128i& out)
113 {
114  out = _mm_or_si128(
115  _mm_and_si128(
116  _mm_mul_epu32(a, b),
117  _mm_setr_epi32(0xffffffff,0,0xffffffff,0)
118  )
119  , _mm_slli_si128(
120  _mm_and_si128(
121  _mm_mul_epu32( _mm_srli_si128(a, 4)
122  , _mm_srli_si128(b, 4)
123  )
124  , _mm_setr_epi32(0xffffffff,0,0xffffffff,0)
125  )
126  , 4
127  )
128  );
129 }
130 
// Partial specialization selected for integral types T with sizeof(T) == 1.
// Both act() overloads multiply two SSE2 registers through
// _custom_mm_mullo_epi8.
// NOTE(review): listing line 134, which should carry the declaration these
// template arguments belong to, is absent from this extract - confirm the
// struct name against the original header.
133 template <typename T>
135  typename boost::enable_if_c<boost::is_integral<T>::value
136  && sizeof(T) == 1>::type>
137 {
// Returns the product of in1 and in2 as computed by _custom_mm_mullo_epi8.
138  static IPSDK_FORCEINLINE
139  typename Sse2Type<T>::Type
140  act(const typename Sse2Type<T>::Type& in1,
141  const typename Sse2Type<T>::Type& in2)
142  {
143  return _custom_mm_mullo_epi8(in1, in2);
144  }
145 
// Same operation; the result is written to out instead of being returned.
146  static IPSDK_FORCEINLINE
147  void
148  act(const typename Sse2Type<T>::Type& in1,
149  const typename Sse2Type<T>::Type& in2,
150  typename Sse2Type<T>::Type& out)
151  {
152  _custom_mm_mullo_epi8(in1, in2, out);
153  }
154 };
155 
// Partial specialization selected for integral types T with sizeof(T) == 2.
// Both act() overloads use the _mm_mullo_epi16 intrinsic directly.
// NOTE(review): listing line 159, which should carry the declaration these
// template arguments belong to, is absent from this extract - confirm the
// struct name against the original header.
158 template <typename T>
160  typename boost::enable_if_c<boost::is_integral<T>::value
161  && sizeof(T) == 2>::type>
162 {
// Returns _mm_mullo_epi16(in1, in2).
163  static IPSDK_FORCEINLINE
164  typename Sse2Type<T>::Type
165  act(const typename Sse2Type<T>::Type& in1,
166  const typename Sse2Type<T>::Type& in2)
167  {
168  return _mm_mullo_epi16(in1, in2);
169  }
170 
// Same operation; the result is written to out instead of being returned.
171  static IPSDK_FORCEINLINE
172  void
173  act(const typename Sse2Type<T>::Type& in1,
174  const typename Sse2Type<T>::Type& in2,
175  typename Sse2Type<T>::Type& out)
176  {
177  out = _mm_mullo_epi16(in1, in2);
178  }
179 };
180 
// Partial specialization selected for integral types T with sizeof(T) == 4.
// Both act() overloads multiply two SSE2 registers through
// _custom_mm_mullo_epi32.
// NOTE(review): listing line 184, which should carry the declaration these
// template arguments belong to, is absent from this extract - confirm the
// struct name against the original header.
183 template <typename T>
185  typename boost::enable_if_c<boost::is_integral<T>::value
186  && sizeof(T) == 4>::type>
187 {
// Returns the product of in1 and in2 as computed by _custom_mm_mullo_epi32.
188  static IPSDK_FORCEINLINE
189  typename Sse2Type<T>::Type
190  act(const typename Sse2Type<T>::Type& in1,
191  const typename Sse2Type<T>::Type& in2)
192  {
193  return _custom_mm_mullo_epi32(in1, in2);
194  }
195 
// Same operation; the result is written to out instead of being returned.
196  static IPSDK_FORCEINLINE
197  void
198  act(const typename Sse2Type<T>::Type& in1,
199  const typename Sse2Type<T>::Type& in2,
200  typename Sse2Type<T>::Type& out)
201  {
202  _custom_mm_mullo_epi32(in1, in2, out);
203  }
204 };
205 
// Full specialization whose act() overloads take Sse2Type<ipReal32>::Type
// operands and multiply them with _mm_mul_ps.
// NOTE(review): listing lines 209 (declaration following "template <>"),
// 212 (return type of the first act) and 223 (last parameter of the second
// act, presumably the "out" reference assigned in its body) are absent from
// this extract - restore them from the original header.
208 template <>
210 {
// Returns _mm_mul_ps(in1, in2).
211  static IPSDK_FORCEINLINE
213  act(const Sse2Type<ipReal32>::Type& in1,
214  const Sse2Type<ipReal32>::Type& in2)
215  {
216  return _mm_mul_ps(in1, in2);
217  }
218 
// Same operation; the result is assigned to out.
219  static IPSDK_FORCEINLINE
220  void
221  act(const Sse2Type<ipReal32>::Type& in1,
222  const Sse2Type<ipReal32>::Type& in2,
224  {
225  out = _mm_mul_ps(in1, in2);
226  }
227 };
228 
// Full specialization whose act() overloads take Sse2Type<ipReal64>::Type
// operands and multiply them with _mm_mul_pd.
// NOTE(review): listing lines 232 (declaration following "template <>"),
// 235 (return type of the first act) and 246 (last parameter of the second
// act, presumably the "out" reference assigned in its body) are absent from
// this extract - restore them from the original header.
231 template <>
233 {
// Returns _mm_mul_pd(in1, in2).
234  static IPSDK_FORCEINLINE
236  act(const Sse2Type<ipReal64>::Type& in1,
237  const Sse2Type<ipReal64>::Type& in2)
238  {
239  return _mm_mul_pd(in1, in2);
240  }
241 
// Same operation; the result is assigned to out.
242  static IPSDK_FORCEINLINE
243  void
244  act(const Sse2Type<ipReal64>::Type& in1,
245  const Sse2Type<ipReal64>::Type& in2,
247  {
248  out = _mm_mul_pd(in1, in2);
249  }
250 };
251 
254 
255 } // end of namespace detail
256 } // end of namespace simd
257 } // end of namespace ipsdk
258 
259 #endif // __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_SSE2_MULREG_H__
Defines the IPSDK_FORCEINLINE.
Main namespace for IPSDK library.
Definition: AlgorithmFunctionEfficiency.h:22
double ipReal64
Base types definition.
Definition: BaseTypes.h:57
eInstructionSet
Enumeration describing the processor instruction set.
Definition: InstructionSetTypes.h:31
Predefined types for Sse2 instruction set management.
Definition of import/export macro for library.
Definition: MulReg.h:39
Streaming SIMD Extensions 2.
Definition: InstructionSetTypes.h:36
Structure used to retrieve the SSE2 type associated with a base type.
Definition: Sse2Types.h:32
float ipReal32
Base types definition.
Definition: BaseTypes.h:56