IPSDK  4_1_0_2
IPSDK : Image Processing Software Development Kit
UniformRandomLCGReg.h
Go to the documentation of this file.
1 // UniformRandomLCGReg.h:
3 // -------------------
4 //
14 
15 #ifndef __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_SSE2_UNIFORMRANDOMLCGREG_H__
16 #define __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_SSE2_UNIFORMRANDOMLCGREG_H__
17 
20 
21 #include <IPSDKUtil/InstructionSet/Arithmetic/detail/AddReg.h>
24 #include <IPSDKUtil/InstructionSet/Arithmetic/detail/SignedUnsignedOpReg.h>
27 
30 
31 #include <boost/type_traits/make_unsigned.hpp>
32 
33 namespace ipsdk {
34 namespace simd {
35 namespace detail {
36 
39 
// SSE2 specialization of the uniform random generator whose range bounds are
// ipInt32 values. Intermediate scaling is carried out in ipReal64 (FloatType).
// NOTE(review): the struct declaration line (listing line 43) and several
// signature/typedef lines are missing from this extract; the element type is
// inferred from the ipInt32 parameters below - confirm against the real header.
42 template <>
44 {
45  typedef ipReal64 FloatType;
46 
    // Builds the 4-lane seed register from a scalar seed.
    // _mm_set_epi32 takes its arguments from the highest lane to the lowest,
    // so the lanes (low to high) hold: seed+1, seed, seed+1, seed.
47  static
48  IPSDK_FORCEINLINE
50  initSeed(ipUInt32 seed)
51  {
52  return _mm_set_epi32(seed, seed+1, seed, seed+1);
53  }
54 
    // Returns the factor (nMax - nMin + 1) / 2^32 loaded into a FloatType
    // register. Both bounds are converted to FloatType before the subtraction.
    // Presumably AssignRegFloat::act broadcasts the scalar to every lane - its
    // typedef is not visible in this extract.
55  static
56  IPSDK_FORCEINLINE
58  computeRangeMultiplier(ipInt32 nMin, ipInt32 nMax)
59  {
62 
63  const FloatType fMultiplier =
64  (static_cast<FloatType>(nMax) - static_cast<FloatType>(nMin) + 1.0) /
65  4294967296.0;
66  return AssignRegFloat::act(fMultiplier);
67  }
68 
    // Returns nMin converted to FloatType and loaded into a register.
    // nMax is not used by the visible body.
69  static
70  IPSDK_FORCEINLINE
72  computeRangeOffset(ipInt32 nMin, ipInt32 nMax)
73  {
75  return AssignRegFloat::act(static_cast<FloatType>(nMin));
76  }
77 
78  // Updates the seed register in place (one generator step).
    // Each of the four lanes has its own multiplier and additive constant.
    // NOTE(review): the signature (listing line 82) and the statements that
    // apply the constants to the seed (listing lines 91-92) are missing from
    // this extract.
79  static
80  IPSDK_FORCEINLINE
81  void
83  {
84  const ipUInt32 mult[4] = { 214013, 17405, 214013, 69069 };
85  const ipUInt32 gadd[4] = { 2531011, 10395331, 13737667, 1 };
86 
87  //__m128i cur_seed_split;
    // unaligned loads: the local arrays carry no 16-byte alignment guarantee
88  const __m128i multiplier = _mm_loadu_si128((__m128i*) mult);
89  const __m128i adder = _mm_loadu_si128((__m128i*) gadd);
90 
93  }
94 
    // Produces four 32-bit random values (one per lane) and advances the seed.
    // The seed is stepped three times; after each step bits 16..30 of every
    // lane are kept (15 bits). The three draws are assembled as
    //   (r1 << 17) + (r2 << 2) + (r3 & 3)   ->   15 + 15 + 2 = 32 bits.
    // NOTE(review): the parameter list (listing line 99) is missing; rSeed is
    // modified by act(rSeed), so it is presumably taken by non-const reference.
95  static
96  IPSDK_FORCEINLINE
98  computeRandom32bits(
100  {
101  typedef AddReg<eInstructionSet::eIS_Sse2, ipInt32> AddRegInt32;
102  typedef AssignReg<eInstructionSet::eIS_Sse2, ipInt32> AssignRegInt32;
103  typedef BitwiseAndReg<eInstructionSet::eIS_Sse2, ipInt32> BitwiseAndRegInt32;
105 
106  const RegInt32 rMask7FFF = AssignRegInt32::act(0x7FFF);
107  const RegInt32 rMask3 = AssignRegInt32::act(3);
108 
    // first draw: 15 bits per lane
109  act(rSeed);
110  RegInt32 r1 = _mm_srli_epi32(rSeed, 16);
111  r1 = BitwiseAndRegInt32::act(r1, rMask7FFF);
112 
    // second draw: 15 bits per lane
113  act(rSeed);
114  RegInt32 r2 = _mm_srli_epi32(rSeed, 16);
115  r2 = BitwiseAndRegInt32::act(r2, rMask7FFF);
116 
    // third draw: only its two lowest bits are used below
117  act(rSeed);
118  RegInt32 r3 = _mm_srli_epi32(rSeed, 16);
119  r3 = BitwiseAndRegInt32::act(r3, rMask7FFF);
120 
    // the three bit fields do not overlap, so the additions act as bitwise ORs
121  RegInt32 rRandVal = AddRegInt32::act(_mm_slli_epi32(r1, 17),
122  _mm_slli_epi32(r2, 2));
123  rRandVal = AddRegInt32::act(rRandVal,
124  BitwiseAndRegInt32::act(r3, rMask3));
125  return rRandVal;
126  }
127 
    // Generates four random ipInt32 values mapped onto the requested range.
    // The 32-bit raw values are processed as two pairs of doubles (lower two
    // lanes, then upper two lanes): bias by 2^31, scale by rRangeMultiplier,
    // floor, add rRangeOffset, convert back to int32 and merge both pairs.
    // NOTE(review): the remaining parameters (rRangeOffset, rSeed) and several
    // typedefs (RegInt32, RegFloat, AddRegFloat, SubRegFloat, MulRegFloat) are
    // on lines missing from this extract.
128  static
129  IPSDK_FORCEINLINE
131  act(const RegType<eInstructionSet::eIS_Sse2, FloatType>::Type& rRangeMultiplier,
134  {
141  typedef IsGreaterReg<eInstructionSet::eIS_Sse2, FloatType> IsGreaterRegFloat;
142  typedef BitwiseSelectReg<eInstructionSet::eIS_Sse2, FloatType> BitwiseSelectRegFloat;
143 
144  RegInt32 rRandVal = computeRandom32bits(rSeed);
145  const RegFloat rUIntHlfRangeF = AssignRegFloat::act(2147483648.0);
146 
    // _mm_cvtepi32_pd converts the two lowest int32 lanes as signed values;
    // adding 2^31 moves them from [-2^31, 2^31) to [0, 2^32)
147  RegFloat rRandValF1 = _mm_cvtepi32_pd(rRandVal);
148  rRandValF1 = AddRegFloat::act(rRandValF1, rUIntHlfRangeF);
149  rRandValF1 = MulRegFloat::act(rRandValF1, rRangeMultiplier);
150 
151  // here, rRandValF1 belongs to range [0; range]
152 
153  // compute floor
    // _mm_cvttpd_epi32 truncates toward zero but can only produce int32
    // results, so lanes greater than 2147483647 are first reduced by
    // rMaxInt32F, truncated, and then shifted back by rMaxInt32F.
    // Presumably BitwiseSelectRegFloat::act(mask, a, b) yields a where the
    // mask is set and b elsewhere - confirm against BitwiseSelectReg.h.
154  const RegFloat rMaxInt32F = AssignRegFloat::act(2147483647.0);
155  RegFloat rGTMaxInt32F = IsGreaterRegFloat::act(rRandValF1, rMaxInt32F);
156  rRandValF1 = BitwiseSelectRegFloat::act(rGTMaxInt32F,
157  AddRegFloat::act(_mm_cvtepi32_pd(_mm_cvttpd_epi32(SubRegFloat::act(rRandValF1, rMaxInt32F))), rMaxInt32F),
158  _mm_cvtepi32_pd(_mm_cvttpd_epi32(rRandValF1)));
159 
160  rRandValF1 = AddRegFloat::act(rRandValF1, rRangeOffset);
161 
    // same processing for the two upper int32 lanes, brought down into the
    // lower half by an 8-byte right shift of the whole register
162  RegFloat rRandValF2 = _mm_cvtepi32_pd(_mm_srli_si128(rRandVal, 8));
163  rRandValF2 = AddRegFloat::act(rRandValF2, rUIntHlfRangeF);
164  rRandValF2 = MulRegFloat::act(rRandValF2, rRangeMultiplier);
165  // compute floor
166  rGTMaxInt32F = IsGreaterRegFloat::act(rRandValF2, rMaxInt32F);
167  rRandValF2 = BitwiseSelectRegFloat::act(rGTMaxInt32F,
168  AddRegFloat::act(_mm_cvtepi32_pd(_mm_cvttpd_epi32(SubRegFloat::act(rRandValF2, rMaxInt32F))), rMaxInt32F),
169  _mm_cvtepi32_pd(_mm_cvttpd_epi32(rRandValF2)));
170  rRandValF2 = AddRegFloat::act(rRandValF2, rRangeOffset);
171 
    // each conversion leaves its two int32 results in the lower two lanes
172  RegInt32 rRandVal1 = _mm_cvtpd_epi32(rRandValF1);
173  RegInt32 rRandVal2 = _mm_cvtpd_epi32(rRandValF2);
174 
    // SSE2 has no integer shuffle taking two sources, so the registers are
    // reinterpreted as floats (bit pattern unchanged) and merged with
    // _mm_shuffle_ps: lanes 0-1 come from rRandVal1, lanes 2-3 from rRandVal2
175  __m128 rRandVal1F32 = _mm_castsi128_ps(rRandVal1);
176  __m128 rRandVal2F32 = _mm_castsi128_ps(rRandVal2);
177  __m128 rRandValF32 = _mm_shuffle_ps(rRandVal1F32, rRandVal2F32, _MM_SHUFFLE(1, 0, 1, 0));
178  return _mm_castps_si128(rRandValF32);
179  }
180 };
181 
// SSE2 specialization of the uniform random generator whose range bounds are
// ipUInt32 values. Intermediate scaling is carried out in ipReal64 (FloatType).
// NOTE(review): the struct declaration line (listing line 185) and most
// delegating call lines are missing from this extract; the element type is
// inferred from the ipUInt32 parameters below - confirm against the real header.
184 template <>
186 {
187  typedef ipReal64 FloatType;
188 
    // Builds the seed register from a scalar seed.
    // NOTE(review): the body (listing line 194) is missing from this extract.
189  static
190  IPSDK_FORCEINLINE
192  initSeed(ipUInt32 seed)
193  {
195  }
196 
    // Returns the factor (nMax - nMin + 1) / 2^32 loaded into a FloatType
    // register. Both bounds are converted to FloatType before the subtraction.
197  static
198  IPSDK_FORCEINLINE
200  computeRangeMultiplier(ipUInt32 nMin, ipUInt32 nMax)
201  {
204 
205  const FloatType fMultiplier =
206  (static_cast<FloatType>(nMax) - static_cast<FloatType>(nMin) + 1.0) /
207  4294967296.0;
208  return AssignRegFloat::act(fMultiplier);
209  }
210 
    // Returns (nMin - 2^31) as FloatType loaded into a register.
    // nMax is not used by the visible body.
    // NOTE(review): the 2^31 bias presumably keeps the intermediate result
    // inside the signed 32-bit range; act() below combines the result with
    // INT32_MIN afterwards - confirm.
211  static
212  IPSDK_FORCEINLINE
214  computeRangeOffset(ipUInt32 nMin, ipUInt32 nMax)
215  {
217  return AssignRegFloat::act(static_cast<FloatType>(nMin) - 2147483648.0);
218  }
219 
220  // Updates the seed register in place.
    // NOTE(review): the signature and body (listing lines 224 and 226) are
    // missing from this extract.
221  static
222  IPSDK_FORCEINLINE
223  void
225  {
227  }
228 
    // Generates four random values for the unsigned 32-bit case.
    // A call whose name is on a missing line (listing line 241) produces
    // rRandInt32 from the multiplier, offset and seed; that result is then
    // combined with a register holding NumericLimits<ipInt32>::min() by an
    // operation on another missing line (listing line 249).
    // NOTE(review): presumably this delegates to the signed generator and
    // removes the 2^31 bias introduced by computeRangeOffset - confirm.
229  static
230  IPSDK_FORCEINLINE
232  act(const RegType<eInstructionSet::eIS_Sse2, FloatType>::Type& rRangeMultiplier,
235  {
238  typedef AssignReg<eInstructionSet::eIS_Sse2, ipInt32> AssignRegInt32;
239 
240  const RegInt32 rRandInt32 =
242  rRangeMultiplier,
243  rRangeOffset,
244  rSeed);
245 
246  const RegInt32 rMinInt32 =
247  AssignRegInt32::act(NumericLimits<ipInt32>::min());
248 
250  rRandInt32, rMinInt32);
251  return rRet;
252  }
253 };
254 
// SSE2 partial specialization of the uniform random generator, enabled for
// integral types T with sizeof(T) == 2. Intermediate scaling is carried out
// in ipReal32 (FloatType).
// NOTE(review): the struct declaration line (listing line 258) and several
// signature/typedef lines are missing from this extract.
257 template <typename T>
259  typename boost::enable_if_c<
260  boost::is_integral<T>::value
261  && sizeof(T) == 2
262  >::type>
263 {
264  typedef ipReal32 FloatType;
265 
    // Builds the seed register from a scalar seed.
    // NOTE(review): the body (listing line 271) is missing from this extract.
266  static
267  IPSDK_FORCEINLINE
269  initSeed(ipUInt32 seed)
270  {
272  }
273 
    // Returns the factor (tMax - tMin + 1) / 2^16 loaded into a FloatType
    // register; 65536 matches the 16-bit raw values assembled in
    // computeRand16BitsOnInt32Vals.
274  static
275  IPSDK_FORCEINLINE
277  computeRangeMultiplier(T tMin, T tMax)
278  {
281 
282  const FloatType fMultiplier =
283  (static_cast<FloatType>(tMax) - static_cast<FloatType>(tMin) + 1.0f) /
284  65536.0f;
285  return AssignRegFloat::act(fMultiplier);
286  }
287 
    // Returns tMin converted to FloatType and loaded into a register.
    // tMax is not used by the visible body.
288  static
289  IPSDK_FORCEINLINE
291  computeRangeOffset(T tMin, T tMax)
292  {
294 
295  return AssignRegFloat::act(static_cast<FloatType>(tMin));
296  }
297 
298  // Updates the seed register in place.
    // NOTE(review): the signature and body (listing lines 302 and 304) are
    // missing from this extract.
299  static
300  IPSDK_FORCEINLINE
301  void
303  {
305  }
306 
    // Computes two registers of four int32 lanes each (rVal1, rVal2), every
    // lane holding one random value already mapped onto the requested range.
    // The seed is stepped three times, 15 bits per lane being kept each time:
    //   rVal1 raw value = (r1 << 1) + (r2 & 1)          -> 16 bits
    //   rVal2 raw value = (r3 << 1) + ((r2 >> 1) & 1)   -> 16 bits
    // NOTE(review): the parameter list (listing lines 311-315) is missing;
    // the body uses rRangeMultiplier, rRangeOffset, rSeed, rVal1 and rVal2.
307  static
308  IPSDK_FORCEINLINE
309  void
310  computeRand16BitsOnInt32Vals(
316  {
318  typedef AddReg<eInstructionSet::eIS_Sse2, ipInt32> AddRegInt32;
322  typedef BitwiseAndReg<eInstructionSet::eIS_Sse2, ipInt32> BitwiseAndRegInt32;
323  typedef AssignReg<eInstructionSet::eIS_Sse2, ipInt32> AssignRegInt32;
324 
325  const RegInt32 rMask7FFF = AssignRegInt32::act(0x7FFF);
326  const RegInt32 rMask1 = AssignRegInt32::act(1);
327 
    // three generator steps, keeping bits 16..30 of each lane every time
328  act(rSeed);
329  RegInt32 r1 = _mm_srli_epi32(rSeed, 16);
330  r1 = BitwiseAndRegInt32::act(r1, rMask7FFF);
331 
332  act(rSeed);
333  RegInt32 r2 = _mm_srli_epi32(rSeed, 16);
334  r2 = BitwiseAndRegInt32::act(r2, rMask7FFF);
335 
336  act(rSeed);
337  RegInt32 r3 = _mm_srli_epi32(rSeed, 16);
338  r3 = BitwiseAndRegInt32::act(r3, rMask7FFF);
339 
    // first output: 15 bits of r1 extended with bit 0 of r2
340  RegFloat rRandValF = _mm_cvtepi32_ps(
341  AddRegInt32::act(_mm_slli_epi32(r1, 1), BitwiseAndRegInt32::act(r2, rMask1)));
342 
343  rRandValF = MulRegFloat::act(rRandValF, rRangeMultiplier);
344  // as rRandValF only contains non-negative values, truncating toward zero is equivalent to calling "floor"
345  rRandValF = _mm_cvtepi32_ps(_mm_cvttps_epi32(rRandValF));
346  rRandValF = AddRegFloat::act(rRandValF, rRangeOffset);
347  rVal1 = _mm_cvtps_epi32(rRandValF);
348 
    // second output: 15 bits of r3 extended with bit 1 of r2
349  rRandValF = _mm_cvtepi32_ps(
350  AddRegInt32::act(
351  _mm_slli_epi32(r3, 1),
352  BitwiseAndRegInt32::act(_mm_srli_epi32(r2, 1), rMask1)));
353  rRandValF = MulRegFloat::act(rRandValF, rRangeMultiplier);
354  // as rRandValF only contains non-negative values, truncating toward zero is equivalent to calling "floor"
355  rRandValF = _mm_cvtepi32_ps(_mm_cvttps_epi32(rRandValF));
356  rRandValF = AddRegFloat::act(rRandValF, rRangeOffset);
357  rVal2 = _mm_cvtps_epi32(rRandValF);
358  }
359 
    // Generates one register of random T values: two int32 registers are
    // computed by computeRand16BitsOnInt32Vals and handed to
    // CastReg<..., ipInt32, T>::act, which fills rRandVal.
    // NOTE(review): presumably CastReg narrows the 2 x 4 int32 lanes into
    // 8 lanes of T - confirm against CastReg.h. The remaining parameters
    // (rRangeOffset, rSeed) are on lines missing from this extract.
360  static
361  IPSDK_FORCEINLINE
363  act(const RegType<eInstructionSet::eIS_Sse2, FloatType>::Type& rRangeMultiplier,
366  {
368  typedef typename RegType<eInstructionSet::eIS_Sse2, T>::Type RegT;
369 
370  RegInt32 rRandVal1, rRandVal2;
371  computeRand16BitsOnInt32Vals(
372  rRangeMultiplier,
373  rRangeOffset,
374  rSeed,
375  rRandVal1,
376  rRandVal2);
377 
378  RegT rRandVal;
379  CastReg<eInstructionSet::eIS_Sse2, ipInt32, T>::act(rRandVal1, rRandVal2, rRandVal);
380  return rRandVal;
381  }
382 };
383 
// SSE2 partial specialization of the uniform random generator, enabled for
// integral types T with sizeof(T) == 1. Intermediate scaling is carried out
// in ipReal32 (FloatType).
// NOTE(review): the struct declaration line (listing line 389) and several
// signature/typedef lines are missing from this extract.
388 template <typename T>
390  typename boost::enable_if_c<
391  boost::is_integral<T>::value
392  && sizeof(T) == 1
393  >::type>
394 {
395  typedef ipReal32 FloatType;
396 
    // Builds the seed register from a scalar seed.
    // NOTE(review): the body (listing line 402) is missing from this extract.
397  static
398  IPSDK_FORCEINLINE
400  initSeed(ipUInt32 seed)
401  {
403  }
404 
    // Returns the factor (tMax - tMin + 1) / 2^15 loaded into a FloatType
    // register; 32768 matches the 15-bit raw values (mask 0x7FFF) drawn in
    // computeRandomOn32BitsVal.
405  static
406  IPSDK_FORCEINLINE
408  computeRangeMultiplier(T tMin, T tMax)
409  {
413 
414  const FloatType fMultiplier =
415  (static_cast<FloatType>(tMax) - static_cast<FloatType>(tMin) + 1.0f) /
416  32768.0f;
417  return AssignRegFloat::act(fMultiplier);
418  }
419 
    // Returns tMin converted to FloatType and loaded into a register.
    // tMax is not used by the visible body.
420  static
421  IPSDK_FORCEINLINE
423  computeRangeOffset(T tMin, T tMax)
424  {
426 
427  return AssignRegFloat::act(static_cast<FloatType>(tMin));
428  }
429 
430  // Updates the seed register in place.
    // NOTE(review): the signature and body (listing lines 434 and 436) are
    // missing from this extract.
431  static
432  IPSDK_FORCEINLINE
433  void
435  {
437  }
438 
    // Steps the seed once and returns four int32 lanes, each holding one
    // random value mapped onto the requested range: bits 16..30 of each seed
    // lane (15 bits) are scaled by rRangeMultiplier, truncated and shifted by
    // rRangeOffset.
    // NOTE(review): the parameter list (listing lines 443-445) is missing;
    // the body uses rRangeMultiplier, rRangeOffset and rSeed.
439  static
440  IPSDK_FORCEINLINE
442  computeRandomOn32BitsVal(
446  {
449  typedef BitwiseAndReg<eInstructionSet::eIS_Sse2, ipInt32> BitwiseAndRegInt32;
452  typedef AssignReg<eInstructionSet::eIS_Sse2, ipInt32> AssignRegInt32;
453 
454  const RegInt32 rMask = AssignRegInt32::act(0x7FFF);
455 
456  act(rSeed);
457  RegFloat rRandValF = _mm_cvtepi32_ps(
458  BitwiseAndRegInt32::act(_mm_srli_epi32(rSeed, 16), rMask));
459  rRandValF = MulRegFloat::act(rRandValF, rRangeMultiplier);
460  // as rRandValF only contains non-negative values, truncating toward zero is equivalent to calling "floor"
461  rRandValF = _mm_cvtepi32_ps(_mm_cvttps_epi32(rRandValF));
462  rRandValF = AddRegFloat::act(rRandValF, rRangeOffset);
463  return _mm_cvtps_epi32(rRandValF);
464  }
465 
    // Generates one register of random T values. Four successive draws of
    // four int32 lanes each are narrowed pairwise to 16-bit lanes with signed
    // saturation (_mm_packs_epi32); the two packed registers are then passed,
    // together with rRet, to a call whose name is on a missing line
    // (listing line 490).
    // NOTE(review): presumably that call narrows the 16-bit lanes to T;
    // the remaining parameters (rRangeOffset, rSeed) are also on missing
    // lines. The AddRegInt32, BitwiseAndRegInt32 and AssignRegInt32 typedefs
    // are not referenced by any visible statement of this function.
466  static
467  IPSDK_FORCEINLINE
469  act(const RegType<eInstructionSet::eIS_Sse2, FloatType>::Type& rRangeMultiplier,
472  {
479  typedef typename RegType<eInstructionSet::eIS_Sse2, T>::Type RegT;
480  typedef AddReg<eInstructionSet::eIS_Sse2, ipInt32> AddRegInt32;
481  typedef BitwiseAndReg<eInstructionSet::eIS_Sse2, ipInt32> BitwiseAndRegInt32;
482  typedef AssignReg<eInstructionSet::eIS_Sse2, ipInt32> AssignRegInt32;
483 
    // each call advances rSeed, so the four registers hold different draws
484  const RegInt32 rRandVal1 = computeRandomOn32BitsVal(rRangeMultiplier, rRangeOffset, rSeed);
485  const RegInt32 rRandVal2 = computeRandomOn32BitsVal(rRangeMultiplier, rRangeOffset, rSeed);
486  const RegInt32 rRandVal3 = computeRandomOn32BitsVal(rRangeMultiplier, rRangeOffset, rSeed);
487  const RegInt32 rRandVal4 = computeRandomOn32BitsVal(rRangeMultiplier, rRangeOffset, rSeed);
488 
489  RegT rRet;
491  _mm_packs_epi32(rRandVal1, rRandVal2),
492  _mm_packs_epi32(rRandVal3, rRandVal4),
493  rRet);
494  return rRet;
495  }
496 };
497 
// SSE2 specialization of the uniform random generator whose range bounds are
// ipReal32 values. Intermediate scaling is carried out in ipReal64 (FloatType)
// and the result is narrowed to single precision at the end.
// NOTE(review): the struct declaration line (listing line 501) and several
// signature/typedef lines are missing from this extract; the element type is
// inferred from the ipReal32 parameters below - confirm against the real header.
500 template <>
502 {
503  typedef ipReal64 FloatType;
504 
    // Builds the seed register from a scalar seed.
    // NOTE(review): the body (listing line 510) is missing from this extract.
505  static
506  IPSDK_FORCEINLINE
508  initSeed(ipUInt32 seed)
509  {
511  }
512 
    // Returns the factor (fMax - fMin) / (2^32 - 1) loaded into a FloatType
    // register. Unlike the integer variants there is no "+ 1" in the
    // numerator and the divisor is 4294967295 rather than 4294967296.
    // NOTE(review): presumably this maps the largest 32-bit raw value exactly
    // onto fMax - confirm that the raw value spans the full 32 bits.
513  static
514  IPSDK_FORCEINLINE
516  computeRangeMultiplier(ipReal32 fMin, ipReal32 fMax)
517  {
520 
521  const FloatType fMultiplier =
522  (static_cast<FloatType>(fMax) - static_cast<FloatType>(fMin)) /
523  4294967295.0;
524  return AssignRegFloat::act(fMultiplier);
525  }
526 
    // Returns fMin converted to FloatType and loaded into a register.
    // fMax is not used by the visible body.
527  static
528  IPSDK_FORCEINLINE
530  computeRangeOffset(ipReal32 fMin, ipReal32 fMax)
531  {
533  return AssignRegFloat::act(static_cast<FloatType>(fMin));
534  }
535 
536  // Updates the seed register in place.
    // NOTE(review): the signature and body (listing lines 540 and 542) are
    // missing from this extract.
537  static
538  IPSDK_FORCEINLINE
539  void
541  {
543  }
544 
    // Generates four random ipReal32 values mapped onto the requested range.
    // The four int32 raw values are processed as two pairs of doubles: bias
    // by 2^31, scale by rRangeMultiplier and add rRangeOffset. Each pair is
    // then narrowed to float and both pairs are merged into one register.
    // NOTE(review): the expression producing rRandVal (listing line 559),
    // the remaining parameters (rRangeOffset, rSeed) and the typedefs are on
    // lines missing from this extract.
545  static
546  IPSDK_FORCEINLINE
548  act(const RegType<eInstructionSet::eIS_Sse2, FloatType>::Type& rRangeMultiplier,
551  {
557 
558  const RegInt32 rRandVal =
560  const RegFloat rUIntHlfRangeF = AssignRegFloat::act(2147483648.0);
561 
    // the two lowest int32 lanes, converted as signed values and shifted by
    // 2^31 so that they become non-negative before scaling
562  RegFloat rRandValF1 = _mm_cvtepi32_pd(rRandVal);
563  rRandValF1 = AddRegFloat::act(rRandValF1, rUIntHlfRangeF);
564  rRandValF1 = MulRegFloat::act(rRandValF1, rRangeMultiplier);
565  rRandValF1 = AddRegFloat::act(rRandValF1, rRangeOffset);
566 
    // same processing for the two upper int32 lanes, brought down into the
    // lower half by an 8-byte right shift of the whole register
567  RegFloat rRandValF2 = _mm_cvtepi32_pd(_mm_srli_si128(rRandVal, 8));
568  rRandValF2 = AddRegFloat::act(rRandValF2, rUIntHlfRangeF);
569  rRandValF2 = MulRegFloat::act(rRandValF2, rRangeMultiplier);
570  rRandValF2 = AddRegFloat::act(rRandValF2, rRangeOffset);
571 
    // _mm_cvtpd_ps leaves its two float results in the lower two lanes;
    // the shuffle takes lanes 0-1 from the first pair and lanes 2-3 from
    // the second pair
572  return _mm_shuffle_ps(
573  _mm_cvtpd_ps(rRandValF1), _mm_cvtpd_ps(rRandValF2),
574  _MM_SHUFFLE(1, 0, 1, 0));
575  }
576 };
577 
580 
581 } // end of namespace detail
582 } // end of namespace simd
583 } // end of namespace ipsdk
584 
585 #endif // __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_SSE2_UNIFORMRANDOMLCGREG_H__
Defines the IPSDK_FORCEINLINE.
Main namespace for IPSDK library.
Definition: AlgorithmFunctionEfficiency.h:22
Definition: SubReg.h:39
Definition: IsGreaterRegDecl.h:30
double ipReal64
Base types definition.
Definition: BaseTypes.h:57
int32_t ipInt32
Base types definition.
Definition: BaseTypes.h:52
Definition: NumericLimits.h:27
Definition: CastReg.h:30
RegType class.
Definition: BitwiseSelectReg.h:30
eInstructionSet
Enumerate for processor instruction set description.
Definition: InstructionSetTypes.h:31
Definition: UniformRandomLCGReg.h:29
Definition of import/export macro for library.
template structure which is specialized to implement the arithmetic addition on 2 scalars or 2 regist...
Definition: AddReg.h:37
Definition: MulReg.h:39
Streaming SIMD Extensions 2.
Definition: InstructionSetTypes.h:36
Definition: RegType.h:29
Definition: SignedUnsignedOpReg.h:51
Definition: BitwiseAndReg.h:30
Definition: AssignRegDecl.h:31
float ipReal32
Base types definition.
Definition: BaseTypes.h:56
uint32_t ipUInt32
Base types definition.
Definition: BaseTypes.h:53