IPSDK  4_1_0_2
IPSDK : Image Processing Software Development Kit
UniformRandomLCGReg.h
Go to the documentation of this file.
1 // UniformRandomLCGReg.h:
3 // -------------------
4 //
14 
15 #ifndef __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_AVX2_UNIFORMRANDOMLCGREG_H__
16 #define __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_AVX2_UNIFORMRANDOMLCGREG_H__
17 
20 
21 #include <IPSDKUtil/InstructionSet/Arithmetic/detail/AddReg.h>
24 #include <IPSDKUtil/InstructionSet/Arithmetic/detail/SignedUnsignedOpReg.h>
27 
30 
31 #include <boost/type_traits/make_unsigned.hpp>
32 
33 namespace ipsdk {
34 namespace simd {
35 namespace detail {
36 
39 
// NOTE(review): this text is a documentation-page listing. The leading numbers are the
// listing's own line numbers, and several listing lines (43, 49, 58, 72, 83, ...) are
// absent from this extract, including the struct declaration itself.
// Judging by the eIS_Avx2 typedefs and the ipInt32 range parameters below, this is
// presumably the AVX2 UniformRandomLCGReg specialization for ipInt32 -- TODO confirm
// against the original header.
42 template <>
44 {
 // Floating point type used for the range scaling arithmetic in this specialization.
45  typedef ipReal64 FloatType;
46 
 // Builds a seed register of eight 32-bit lanes alternating between seed and seed+1.
 // (The return type line is not visible in this extract.)
47  static
48  IPSDK_FORCEINLINE
50  initSeed(ipUInt32 seed)
51  {
52  return _mm256_set_epi32(seed, seed+1, seed, seed+1,
53  seed, seed+1, seed, seed+1);
54  }
55 
 // Computes the scale factor (nMax - nMin + 1) / 2^32 in FloatType and hands it to
 // AssignRegFloat::act (typedef not visible here; presumably a broadcast to all lanes).
56  static
57  IPSDK_FORCEINLINE
59  computeRangeMultiplier(ipInt32 nMin, ipInt32 nMax)
60  {
63 
64  const FloatType fMultiplier =
65  (static_cast<FloatType>(nMax) - static_cast<FloatType>(nMin) + 1.0) /
66  4294967296.0; // 2^32
67  return AssignRegFloat::act(fMultiplier);
68  }
69 
 // Returns the range offset built from nMin converted to FloatType.
 // nMax is not used by the visible code.
70  static
71  IPSDK_FORCEINLINE
73  computeRangeOffset(ipInt32 nMin, ipInt32 nMax)
74  {
76  return AssignRegFloat::act(static_cast<FloatType>(nMin));
77  }
78 
 // Seed update step. The function name/parameter line (83) is missing; this is
 // presumably the act(rSeed) overload called by computeRandom32bits below.
79  // updates seed
80  static
81  IPSDK_FORCEINLINE
82  void
84  {
 // One multiplier and one additive constant per 32-bit lane (eight distinct pairs).
85  const ipUInt32 mult[8] = { 214013, 17405, 214013, 69069, 1664525, 22695477, 1103515245, 134775813 };
86  const ipUInt32 gadd[8] = { 2531011, 10395331, 13737667, 1, 1013904223, 1, 12345, 1 };
87  //static const ipUInt32 mask[4] = { 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF };
88 
89  //__m128i cur_seed_split;
 // Unaligned loads of the two constant tables into 256-bit registers.
90  const __m256i multiplier = _mm256_loadu_si256((__m256i*) mult);
91  const __m256i adder = _mm256_loadu_si256((__m256i*) gadd);
92 
 // NOTE(review): the statements that apply multiplier/adder to the seed (listing
 // lines 93-94) are missing from this extract.
95  }
96 
 // Assembles 32 random bits per lane from three successive seed updates.
 // (Return type and parameter lines are not visible; the body reads and updates rSeed.)
97  static
98  IPSDK_FORCEINLINE
100  computeRandom32bits(
102  {
103  typedef AddReg<eInstructionSet::eIS_Avx2, ipInt32> AddRegInt32;
104  typedef AssignReg<eInstructionSet::eIS_Avx2, ipInt32> AssignRegInt32;
105  typedef BitwiseAndReg<eInstructionSet::eIS_Avx2, ipInt32> BitwiseAndRegInt32;
107 
108  const RegInt32 rMask7FFF = AssignRegInt32::act(0x7FFF);
109  const RegInt32 rMask3 = AssignRegInt32::act(3);
110 
 // Each draw: update the seed, then keep bits 16..30 of every lane (15 bits).
111  act(rSeed);
112  RegInt32 r1 = _mm256_srli_epi32(rSeed, 16);
113  r1 = BitwiseAndRegInt32::act(r1, rMask7FFF);
114 
115  act(rSeed);
116  RegInt32 r2 = _mm256_srli_epi32(rSeed, 16);
117  r2 = BitwiseAndRegInt32::act(r2, rMask7FFF);
118 
119  act(rSeed);
120  RegInt32 r3 = _mm256_srli_epi32(rSeed, 16);
121  r3 = BitwiseAndRegInt32::act(r3, rMask7FFF);
122 
 // Bit layout of the result: r1 -> bits 17..31, r2 -> bits 2..16, low 2 bits of r3 -> bits 0..1.
123  RegInt32 rRandVal = AddRegInt32::act(_mm256_slli_epi32(r1, 17),
124  _mm256_slli_epi32(r2, 2));
125  rRandVal = AddRegInt32::act(rRandVal,
126  BitwiseAndRegInt32::act(r3, rMask3));
127  return rRandVal;
128  }
129 
 // Generates eight ipInt32 random values scaled into the requested range.
 // (Return type and the rRangeOffset / rSeed parameter lines are not visible.)
130  static
131  IPSDK_FORCEINLINE
133  act(const RegType<eInstructionSet::eIS_Avx2, FloatType>::Type& rRangeMultiplier,
136  {
142 
143  RegInt32 rRandVal = computeRandom32bits(rSeed);
 // 2^31: added after the signed int32 -> double conversion so values lie in [0, 2^32).
144  const RegFloat rUIntHlfRangeF = AssignRegFloat::act(2147483648.0);
145 
 // Lower four lanes: convert to double, shift to non-negative, scale, floor, add offset.
146  RegFloat rRandValF1 =
147  _mm256_cvtepi32_pd(_mm256_extractf128_si256(rRandVal, 0));
148  rRandValF1 = AddRegFloat::act(rRandValF1, rUIntHlfRangeF);
149  rRandValF1 = MulRegFloat::act(rRandValF1, rRangeMultiplier);
150 
151  rRandValF1 = _mm256_floor_pd(rRandValF1);
152  rRandValF1 = AddRegFloat::act(rRandValF1, rRangeOffset);
153 
 // Upper four lanes: same processing.
154  RegFloat rRandValF2 = _mm256_cvtepi32_pd(
155  _mm256_extractf128_si256(rRandVal, 1));
156  rRandValF2 = AddRegFloat::act(rRandValF2, rUIntHlfRangeF);
157  rRandValF2 = MulRegFloat::act(rRandValF2, rRangeMultiplier);
158 
159  rRandValF2 = _mm256_floor_pd(rRandValF2);
160  rRandValF2 = AddRegFloat::act(rRandValF2, rRangeOffset);
161 
162 
 // Convert both halves back to int32 and reassemble them into one 256-bit register.
163  RegInt32 rRet = _mm256_castsi128_si256(_mm256_cvtpd_epi32(rRandValF1));
164  rRet = _mm256_inserti128_si256(
165  rRet, _mm256_cvtpd_epi32(rRandValF2), 1);
166 
167  return rRet;
168  }
169 };
170 
// NOTE(review): the struct declaration line (listing line 174) is missing from this
// extract. Given the ipUInt32 range parameters below, this is presumably the AVX2
// UniformRandomLCGReg specialization for ipUInt32 -- TODO confirm.
173 template <>
175 {
 // Floating point type used for the range scaling arithmetic in this specialization.
176  typedef ipReal64 FloatType;
177 
 // Seed register initialization; return type and body (listing line 183) are not visible.
178  static
179  IPSDK_FORCEINLINE
181  initSeed(ipUInt32 seed)
182  {
184  }
185 
 // Computes the scale factor (nMax - nMin + 1) / 2^32 in FloatType and hands it to
 // AssignRegFloat::act (typedef not visible in this extract).
186  static
187  IPSDK_FORCEINLINE
189  computeRangeMultiplier(ipUInt32 nMin, ipUInt32 nMax)
190  {
193 
194  const FloatType fMultiplier =
195  (static_cast<FloatType>(nMax) - static_cast<FloatType>(nMin) + 1.0) /
196  4294967296.0; // 2^32
197  return AssignRegFloat::act(fMultiplier);
198  }
199 
 // Returns the range offset nMin - 2^31. nMax is not used by the visible code.
 // NOTE(review): the 2^31 bias presumably matches the signed/unsigned adjustment
 // performed in act() below -- confirm against the missing lines.
200  static
201  IPSDK_FORCEINLINE
203  computeRangeOffset(ipUInt32 nMin, ipUInt32 nMax)
204  {
206  return AssignRegFloat::act(static_cast<FloatType>(nMin) - 2147483648.0);
207  }
208 
 // Seed update step; name/parameter line (213) and body (215) are not visible.
209  // updates seed
210  static
211  IPSDK_FORCEINLINE
212  void
214  {
216  }
217 
 // Generates random values for this specialization.
 // (Return type and the rRangeOffset / rSeed parameter lines are not visible.)
218  static
219  IPSDK_FORCEINLINE
221  act(const RegType<eInstructionSet::eIS_Avx2, FloatType>::Type& rRangeMultiplier,
224  {
227  typedef AssignReg<eInstructionSet::eIS_Avx2, ipInt32> AssignRegInt32;
228 
 // NOTE(review): the callee (listing line 230) is missing; presumably the ipInt32
 // specialization's act() is invoked with these three arguments -- TODO confirm.
229  const RegInt32 rRandInt32 =
231  rRangeMultiplier,
232  rRangeOffset,
233  rSeed);
234 
 // Register filled with the minimum ipInt32 value.
235  const RegInt32 rMinInt32 =
236  AssignRegInt32::act(NumericLimits<ipInt32>::min());
237 
 // NOTE(review): the operation combining rRandInt32 and rMinInt32 (listing line 239)
 // is missing; presumably a signed/unsigned register operation producing the
 // unsigned result -- TODO confirm.
238  const RegUInt32 rRet =
240  rRandInt32,
241  rMinInt32);
242 
243  return rRet;
244  }
245 };
246 
// Partial specialization selected for integral types T with sizeof(T) == 2.
// NOTE(review): the struct declaration line (listing line 250) is missing from this
// extract; presumably the AVX2 UniformRandomLCGReg specialization -- TODO confirm.
249 template <typename T>
251  typename boost::enable_if_c<
252  boost::is_integral<T>::value
253  && sizeof(T) == 2
254  >::type>
255 {
 // Single precision is used for the range scaling arithmetic in this specialization.
256  typedef ipReal32 FloatType;
257 
 // Seed register initialization; return type and body (listing line 263) are not visible.
258  static
259  IPSDK_FORCEINLINE
261  initSeed(ipUInt32 seed)
262  {
264  }
265 
 // Computes the scale factor (tMax - tMin + 1) / 2^16 in FloatType and hands it to
 // AssignRegFloat::act (typedef not visible in this extract).
266  static
267  IPSDK_FORCEINLINE
269  computeRangeMultiplier(T tMin, T tMax)
270  {
273 
274  const FloatType fMultiplier =
275  (static_cast<FloatType>(tMax) - static_cast<FloatType>(tMin) + 1.0f) /
276  65536.0f; // 2^16
277  return AssignRegFloat::act(fMultiplier);
278  }
279 
 // Returns the range offset built from tMin converted to FloatType.
 // tMax is not used by the visible code.
280  static
281  IPSDK_FORCEINLINE
283  computeRangeOffset(T tMin, T tMax)
284  {
286 
287  return AssignRegFloat::act(static_cast<FloatType>(tMin));
288  }
289 
 // Seed update step; name/parameter line (294) and body (296) are not visible.
290  // updates seed
291  static
292  IPSDK_FORCEINLINE
293  void
295  {
297  }
298 
 // Produces two registers of 32-bit integers, each lane holding a 16-bit random value
 // scaled into the requested range. The parameter lines (303-307) are not visible;
 // the body reads rRangeMultiplier and rRangeOffset, updates rSeed, and writes
 // rVal1 and rVal2.
299  static
300  IPSDK_FORCEINLINE
301  void
302  computeRand16BitsOnInt32Vals(
308  {
310  typedef AddReg<eInstructionSet::eIS_Avx2, ipInt32> AddRegInt32;
314  typedef BitwiseAndReg<eInstructionSet::eIS_Avx2, ipInt32> BitwiseAndRegInt32;
315  typedef AssignReg<eInstructionSet::eIS_Avx2, ipInt32> AssignRegInt32;
316 
317  const RegInt32 rMask7FFF = AssignRegInt32::act(0x7FFF);
318  const RegInt32 rMask1 = AssignRegInt32::act(1);
319 
 // Each draw: update the seed, then keep bits 16..30 of every lane (15 bits).
320  act(rSeed);
321  RegInt32 r1 = _mm256_srli_epi32(rSeed, 16);
322  r1 = BitwiseAndRegInt32::act(r1, rMask7FFF);
323 
324  act(rSeed);
325  RegInt32 r2 = _mm256_srli_epi32(rSeed, 16);
326  r2 = BitwiseAndRegInt32::act(r2, rMask7FFF);
327 
328  act(rSeed);
329  RegInt32 r3 = _mm256_srli_epi32(rSeed, 16);
330  r3 = BitwiseAndRegInt32::act(r3, rMask7FFF);
331 
 // First 16-bit value: the 15 bits of r1 shifted left by one, plus bit 0 of r2.
332  RegFloat rRandValF = _mm256_cvtepi32_ps(
333  AddRegInt32::act(_mm256_slli_epi32(r1, 1), BitwiseAndRegInt32::act(r2, rMask1)));
334 
335  rRandValF = MulRegFloat::act(rRandValF, rRangeMultiplier);
336  // rRandValF holds only non-negative values here, so truncating (float -> int32 -> float) is equivalent to calling "floor"
337  rRandValF = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(rRandValF));
338  rRandValF = AddRegFloat::act(rRandValF, rRangeOffset);
339  rVal1 = _mm256_cvtps_epi32(rRandValF);
340 
 // Second 16-bit value: the 15 bits of r3 shifted left by one, plus bit 1 of r2.
341  rRandValF = _mm256_cvtepi32_ps(
342  AddRegInt32::act(
343  _mm256_slli_epi32(r3, 1),
344  BitwiseAndRegInt32::act(_mm256_srli_epi32(r2, 1), rMask1)));
345  rRandValF = MulRegFloat::act(rRandValF, rRangeMultiplier);
346  // rRandValF holds only non-negative values here, so truncating (float -> int32 -> float) is equivalent to calling "floor"
347  rRandValF = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(rRandValF));
348  rRandValF = AddRegFloat::act(rRandValF, rRangeOffset);
349  rVal2 = _mm256_cvtps_epi32(rRandValF);
350  }
351 
 // Generates one register of T random values in the requested range.
 // (Return type and the rRangeOffset / rSeed parameter lines are not visible.)
352  static
353  IPSDK_FORCEINLINE
355  act(const RegType<eInstructionSet::eIS_Avx2, FloatType>::Type& rRangeMultiplier,
358  {
360  typedef typename RegType<eInstructionSet::eIS_Avx2, T>::Type RegT;
361 
362  RegInt32 rRandVal1, rRandVal2;
363  computeRand16BitsOnInt32Vals(
364  rRangeMultiplier,
365  rRangeOffset,
366  rSeed,
367  rRandVal1,
368  rRandVal2);
369 
 // Combine the two ipInt32 registers into a single register of T via CastReg.
370  RegT rRandVal;
371  CastReg<eInstructionSet::eIS_Avx2, ipInt32, T>::act(rRandVal1, rRandVal2, rRandVal);
372  return rRandVal;
373  }
374 };
375 
376 
// Partial specialization selected for integral types T with sizeof(T) == 1.
// NOTE(review): the struct declaration line (listing line 380) is missing from this
// extract; presumably the AVX2 UniformRandomLCGReg specialization -- TODO confirm.
379 template <typename T>
381  typename boost::enable_if_c<
382  boost::is_integral<T>::value
383  && sizeof(T) == 1
384  >::type>
385 {
 // Single precision is used for the range scaling arithmetic in this specialization.
386  typedef ipReal32 FloatType;
387 
 // Seed register initialization; return type and body (listing line 393) are not visible.
388  static
389  IPSDK_FORCEINLINE
391  initSeed(ipUInt32 seed)
392  {
394  }
395 
 // Computes the scale factor (tMax - tMin + 1) / 2^15 in FloatType. The divisor is
 // 2^15 because computeRandomOn32BitsVal below keeps 15 random bits per value.
396  static
397  IPSDK_FORCEINLINE
399  computeRangeMultiplier(T tMin, T tMax)
400  {
404 
405  const FloatType fMultiplier =
406  (static_cast<FloatType>(tMax) - static_cast<FloatType>(tMin) + 1.0f) /
407  32768.0f; // 2^15
408  return AssignRegFloat::act(fMultiplier);
409  }
410 
 // Returns the range offset built from tMin converted to FloatType.
 // tMax is not used by the visible code.
411  static
412  IPSDK_FORCEINLINE
414  computeRangeOffset(T tMin, T tMax)
415  {
417 
418  return AssignRegFloat::act(static_cast<FloatType>(tMin));
419  }
420 
 // Seed update step; name/parameter line (425) and body (427) are not visible.
421  // updates seed
422  static
423  IPSDK_FORCEINLINE
424  void
426  {
428  }
429 
 // Performs one seed update and returns a register of 32-bit integers, each lane
 // holding a 15-bit random value scaled into the requested range.
 // (Return type and parameter lines are not visible; the body reads rRangeMultiplier
 // and rRangeOffset and updates rSeed.)
430  static
431  IPSDK_FORCEINLINE
433  computeRandomOn32BitsVal(
437  {
440  typedef BitwiseAndReg<eInstructionSet::eIS_Avx2, ipInt32> BitwiseAndRegInt32;
443  typedef AssignReg<eInstructionSet::eIS_Avx2, ipInt32> AssignRegInt32;
444 
445  const RegInt32 rMask = AssignRegInt32::act(0x7FFF);
446 
447  act(rSeed);
 // Keep bits 16..30 of every lane (15 bits) and convert them to float.
448  RegFloat rRandValF = _mm256_cvtepi32_ps(
449  BitwiseAndRegInt32::act(_mm256_srli_epi32(rSeed, 16), rMask));
 // Scale, floor, add the range offset, then convert back to int32.
450  rRandValF = MulRegFloat::act(rRandValF, rRangeMultiplier);
451  rRandValF = _mm256_floor_ps(rRandValF);
452  rRandValF = AddRegFloat::act(rRandValF, rRangeOffset);
453  return _mm256_cvtps_epi32(rRandValF);
454  }
455 
 // Generates one register of T random values in the requested range.
 // (Return type and the rRangeOffset / rSeed parameter lines are not visible.)
456  static
457  IPSDK_FORCEINLINE
459  act(const RegType<eInstructionSet::eIS_Avx2, FloatType>::Type& rRangeMultiplier,
462  {
469  typedef typename RegType<eInstructionSet::eIS_Avx2, T>::Type RegT;
 // NOTE(review): the three typedefs below are not referenced by any visible line of this function.
470  typedef AddReg<eInstructionSet::eIS_Avx2, ipInt32> AddRegInt32;
471  typedef BitwiseAndReg<eInstructionSet::eIS_Avx2, ipInt32> BitwiseAndRegInt32;
472  typedef AssignReg<eInstructionSet::eIS_Avx2, ipInt32> AssignRegInt32;
473 
 // Four independent draws, each advancing rSeed once.
474  const RegInt32 rRandVal1 = computeRandomOn32BitsVal(rRangeMultiplier, rRangeOffset, rSeed);
475  const RegInt32 rRandVal2 = computeRandomOn32BitsVal(rRangeMultiplier, rRangeOffset, rSeed);
476  const RegInt32 rRandVal3 = computeRandomOn32BitsVal(rRangeMultiplier, rRangeOffset, rSeed);
477  const RegInt32 rRandVal4 = computeRandomOn32BitsVal(rRangeMultiplier, rRangeOffset, rSeed);
478 
 // The int32 registers are packed pairwise to 16-bit lanes, then handed to a call
 // whose callee (listing line 480) is missing; presumably a CastReg from 16-bit
 // lanes to T that fills rRet -- TODO confirm.
479  RegT rRet;
481  _mm256_packs_epi32(rRandVal1, rRandVal2),
482  _mm256_packs_epi32(rRandVal3, rRandVal4),
483  rRet);
484  return rRet;
485  }
486 };
487 
// NOTE(review): the struct declaration line (listing line 491) is missing from this
// extract. Given the ipReal32 range parameters below, this is presumably the AVX2
// UniformRandomLCGReg specialization for ipReal32 -- TODO confirm.
490 template <>
492 {
 // Double precision is used for the range scaling arithmetic in this specialization.
493  typedef ipReal64 FloatType;
494 
 // Seed register initialization; return type and body (listing line 500) are not visible.
495  static
496  IPSDK_FORCEINLINE
498  initSeed(ipUInt32 seed)
499  {
501  }
502 
 // Computes the scale factor (fMax - fMin) / (2^32 - 1) in FloatType, so that the
 // largest 32-bit random value maps onto fMax.
503  static
504  IPSDK_FORCEINLINE
506  computeRangeMultiplier(ipReal32 fMin, ipReal32 fMax)
507  {
510 
511  const FloatType fMultiplier =
512  (static_cast<FloatType>(fMax) - static_cast<FloatType>(fMin)) /
513  4294967295.0; // 2^32 - 1
514  return AssignRegFloat::act(fMultiplier);
515  }
516 
 // Returns the range offset built from fMin converted to FloatType.
 // fMax is not used by the visible code.
517  static
518  IPSDK_FORCEINLINE
520  computeRangeOffset(ipReal32 fMin, ipReal32 fMax)
521  {
523  return AssignRegFloat::act(static_cast<FloatType>(fMin));
524  }
525 
 // Seed update step; name/parameter line (530) and body (532) are not visible.
526  // updates seed
527  static
528  IPSDK_FORCEINLINE
529  void
531  {
533  }
534 
 // Generates eight ipReal32 random values scaled into the requested range.
 // (Return type and the rRangeOffset / rSeed parameter lines are not visible.)
535  static
536  IPSDK_FORCEINLINE
538  act(const RegType<eInstructionSet::eIS_Avx2, FloatType>::Type& rRangeMultiplier,
541  {
548 
 // NOTE(review): the initializer (listing line 550) is missing; presumably the
 // 32-bit random generator of the ipInt32 specialization is called here -- TODO confirm.
549  const RegInt32 rRandVal =
 // 2^31: added after the signed int32 -> double conversion so values lie in [0, 2^32).
551  const RegFloat rUIntHlfRangeF = AssignRegFloat::act(2147483648.0);
552 
 // Lower four lanes: convert to double, shift to non-negative, scale, add offset (no floor).
553  RegFloat rRandValF1 =
554  _mm256_cvtepi32_pd(_mm256_extractf128_si256(rRandVal, 0));
555  rRandValF1 = AddRegFloat::act(rRandValF1, rUIntHlfRangeF);
556  rRandValF1 = MulRegFloat::act(rRandValF1, rRangeMultiplier);
557  rRandValF1 = AddRegFloat::act(rRandValF1, rRangeOffset);
558 
 // Upper four lanes: same processing.
559  RegFloat rRandValF2 = _mm256_cvtepi32_pd(
560  _mm256_extractf128_si256(rRandVal, 1));
561  rRandValF2 = AddRegFloat::act(rRandValF2, rUIntHlfRangeF);
562  rRandValF2 = MulRegFloat::act(rRandValF2, rRangeMultiplier);
563  rRandValF2 = AddRegFloat::act(rRandValF2, rRangeOffset);
564 
 // Narrow both halves from double to float and reassemble them into one 256-bit register.
565  RegFloat32 rRet = _mm256_castps128_ps256(_mm256_cvtpd_ps(rRandValF1));
566  rRet = _mm256_insertf128_ps(
567  rRet, _mm256_cvtpd_ps(rRandValF2), 1);
568  return rRet;
569  }
570 };
571 
574 
575 } // end of namespace detail
576 } // end of namespace simd
577 } // end of namespace ipsdk
578 
579 #endif // __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_AVX2_UNIFORMRANDOMLCGREG_H__
Defines the IPSDK_FORCEINLINE.
Main namespace for IPSDK library.
Definition: AlgorithmFunctionEfficiency.h:22
Definition: SubReg.h:39
double ipReal64
Base types definition.
Definition: BaseTypes.h:57
int32_t ipInt32
Base types definition.
Definition: BaseTypes.h:52
Definition: NumericLimits.h:27
Definition: CastReg.h:30
RegType class.
eInstructionSet
Enumerate for processor instruction set description.
Definition: InstructionSetTypes.h:31
Advanced Vector Extensions 2.
Definition: InstructionSetTypes.h:48
Definition: UniformRandomLCGReg.h:29
Definition of import/export macro for library.
template structure which is specialized to implement the arithmetic addition on 2 scalars or 2 regist...
Definition: AddReg.h:37
Definition: MulReg.h:39
Definition: RegType.h:29
Definition: SignedUnsignedOpReg.h:51
Definition: BitwiseAndReg.h:30
Definition: AssignRegDecl.h:31
float ipReal32
Base types definition.
Definition: BaseTypes.h:56
uint32_t ipUInt32
Base types definition.
Definition: BaseTypes.h:53