IPSDK  4_1_0_2
IPSDK : Image Processing Software Development Kit
UniformRandomLCGReg.h
Go to the documentation of this file.
1 // UniformRandomLCGReg.h:
3 // -------------------
4 //
14 
15 #ifndef __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_AVX512_UNIFORMRANDOMLCGREG_H__
16 #define __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_AVX512_UNIFORMRANDOMLCGREG_H__
17 
20 
21 #include <IPSDKUtil/InstructionSet/Arithmetic/detail/AddReg.h>
24 #include <IPSDKUtil/InstructionSet/Arithmetic/detail/FloorReg.h>
25 #include <IPSDKUtil/InstructionSet/Arithmetic/detail/SignedUnsignedOpReg.h>
28 
31 
32 #include <boost/type_traits/make_unsigned.hpp>
33 
34 namespace ipsdk {
35 namespace simd {
36 namespace detail {
37 
40 
// AVX-512 specialization of the uniform random generator ("LCG" in the file
// name: linear congruential generator) working on sixteen 32-bit lanes.
// NOTE(review): the struct declaration (listing line 44), the return types and
// the local Reg*/Assign*/Add*/Mul* typedefs are absent from this extracted
// listing. The ipInt32 parameters below suggest this is the ipInt32
// specialization -- confirm against the original header.
43 template <>
45 {
    // Floating point type used for the range scaling computations.
46  typedef ipReal64 FloatType;
47 
    // Builds the initial generator state from a scalar seed: sixteen 32-bit
    // lanes alternating between seed and seed + 1.
48  static
49  IPSDK_FORCEINLINE
51  initSeed(ipUInt32 seed)
52  {
53  return _mm512_set_epi32(seed, seed+1, seed, seed+1,
54  seed, seed+1, seed, seed+1,
55  seed, seed+1, seed, seed+1,
56  seed, seed+1, seed, seed+1);
57  }
58 
    // Computes the scale factor (nMax - nMin + 1) / 2^32 in FloatType and hands
    // it to AssignRegFloat::act (presumably a broadcast to every lane --
    // AssignRegFloat is declared on a line missing from this listing).
59  static
60  IPSDK_FORCEINLINE
62  computeRangeMultiplier(ipInt32 nMin, ipInt32 nMax)
63  {
66 
67  const FloatType fMultiplier =
68  (static_cast<FloatType>(nMax) - static_cast<FloatType>(nMin) + 1.0) /
69  4294967296.0;
70  return AssignRegFloat::act(fMultiplier);
71  }
72 
    // Returns nMin converted to FloatType through AssignRegFloat::act.
    // nMax is not used by the visible code.
73  static
74  IPSDK_FORCEINLINE
76  computeRangeOffset(ipInt32 nMin, ipInt32 nMax)
77  {
79  return AssignRegFloat::act(static_cast<FloatType>(nMin));
80  }
81 
82  // updates seed
    // NOTE(review): the name/parameter line (listing line 86) is missing; the
    // act(rSeed) calls further down presumably resolve to this overload.
83  static
84  IPSDK_FORCEINLINE
85  void
87  {
    // Per-lane multiplier and increment constants; entries 8..15 repeat
    // entries 0..7.
88  const ipUInt32 mult[16] = { 214013, 17405, 214013, 69069, 1664525, 22695477, 1103515245, 134775813, 214013, 17405, 214013, 69069, 1664525, 22695477, 1103515245, 134775813 };
89  const ipUInt32 gadd[16] = { 2531011, 10395331, 13737667, 1, 1013904223, 1, 12345, 1, 2531011, 10395331, 13737667, 1, 1013904223, 1, 12345, 1 };
90  //static const ipUInt32 mask[4] = { 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF };
91 
92  //__m128i cur_seed_split;
    // Unaligned loads of the two constant tables into 512-bit registers.
93  const __m512i multiplier = _mm512_loadu_si512((__m512i*) mult);
94  const __m512i adder = _mm512_loadu_si512((__m512i*) gadd);
95 
    // NOTE(review): listing lines 96-97, which apply multiplier/adder to the
    // seed, are missing from this extract (presumably
    // seed = seed * multiplier + adder per lane -- TODO confirm).
98  }
99 
    // Produces 32 random bits per lane from three successive seed updates.
    // Each update contributes bits 16..30 of the seed (shift right by 16, mask
    // 0x7FFF), i.e. 15 bits.
100  static
101  IPSDK_FORCEINLINE
103  computeRandom32bits(
105  {
108  typedef BitwiseAndReg<eInstructionSet::eIS_Avx512, ipInt32> BitwiseAndRegInt32;
110 
111  const RegInt32 rMask7FFF = AssignRegInt32::act(0x7FFF);
112  const RegInt32 rMask3 = AssignRegInt32::act(3);
113 
114  act(rSeed);
115  RegInt32 r1 = _mm512_srli_epi32(rSeed, 16);
116  r1 = BitwiseAndRegInt32::act(r1, rMask7FFF);
117 
118  act(rSeed);
119  RegInt32 r2 = _mm512_srli_epi32(rSeed, 16);
120  r2 = BitwiseAndRegInt32::act(r2, rMask7FFF);
121 
122  act(rSeed);
123  RegInt32 r3 = _mm512_srli_epi32(rSeed, 16);
124  r3 = BitwiseAndRegInt32::act(r3, rMask7FFF);
125 
    // Bit layout of the result: r1 -> bits 17..31, r2 -> bits 2..16,
    // low two bits of r3 -> bits 0..1. The fields do not overlap.
126  RegInt32 rRandVal = AddRegInt32::act(_mm512_slli_epi32(r1, 17),
127  _mm512_slli_epi32(r2, 2));
128  rRandVal = AddRegInt32::act(rRandVal,
129  BitwiseAndRegInt32::act(r3, rMask3));
130  return rRandVal;
131  }
132 
    // Draws sixteen random integers scaled by rRangeMultiplier and shifted by
    // rRangeOffset. The work is done in two halves of eight lanes because the
    // scaling is carried out in 64-bit floating point.
    // NOTE(review): the rRangeOffset / rSeed parameter lines (listing lines
    // 137-138) and the typedefs (140-145) are missing from this extract.
133  static
134  IPSDK_FORCEINLINE
136  act(const RegType<eInstructionSet::eIS_Avx512, FloatType>::Type& rRangeMultiplier,
139  {
146 
147  RegInt32 rRandVal = computeRandom32bits(rSeed);
    // 2^31: added after the signed int32 -> double conversion so that the
    // value lies in [0, 2^32) before scaling.
148  const RegFloat rUIntHlfRangeF = AssignRegFloat::act(2147483648.0);
149 
    // Lower eight lanes: convert, bias, scale, floor, then add the offset.
150  RegFloat rRandValF1 =
151  _mm512_cvtepi32_pd(_mm512_castsi512_si256(rRandVal));
152  rRandValF1 = AddRegFloat::act(rRandValF1, rUIntHlfRangeF);
153  rRandValF1 = MulRegFloat::act(rRandValF1, rRangeMultiplier);
154 
155  rRandValF1 = FloorRegFloat::act(rRandValF1);
156  rRandValF1 = AddRegFloat::act(rRandValF1, rRangeOffset);
157 
    // Index vector selecting lanes 8..15 so that the upper half of rRandVal
    // ends up in the low 256 bits for the second conversion.
158  const __m512i duplicateHiMask = _mm512_setr_epi32(
159  8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15);
160 
    // Upper eight lanes: same processing as above.
161  RegFloat rRandValF2 = _mm512_cvtepi32_pd(
162  _mm512_castsi512_si256(_mm512_permutex2var_epi32(
163  rRandVal, duplicateHiMask, rRandVal)));
164  rRandValF2 = AddRegFloat::act(rRandValF2, rUIntHlfRangeF);
165  rRandValF2 = MulRegFloat::act(rRandValF2, rRangeMultiplier);
166 
167  rRandValF2 = FloorRegFloat::act(rRandValF2);
168  rRandValF2 = AddRegFloat::act(rRandValF2, rRangeOffset);
169 
170 
    // Convert both halves back to int32 and reassemble: first half in the low
    // 256 bits, second half inserted into the high 256 bits.
171  RegInt32 rRet = _mm512_castsi256_si512(_mm512_cvtpd_epi32(rRandValF1));
172  rRet = _mm512_inserti64x4(
173  rRet, _mm512_cvtpd_epi32(rRandValF2), 1);
174 
175  return rRet;
176  }
177 };
178 
// AVX-512 specialization whose range bounds are ipUInt32 values.
// NOTE(review): the struct declaration (listing line 182), the return types,
// the local typedefs and the bodies of initSeed and of the seed update are
// absent from this extracted listing -- confirm details against the original
// header.
181 template <>
183 {
    // Floating point type used for the range scaling computations.
184  typedef ipReal64 FloatType;
185 
    // Creates the initial generator state from a scalar seed.
    // NOTE(review): body (listing line 191) missing from this extract.
186  static
187  IPSDK_FORCEINLINE
189  initSeed(ipUInt32 seed)
190  {
192  }
193 
    // Computes the scale factor (nMax - nMin + 1) / 2^32 in FloatType and hands
    // it to AssignRegFloat::act.
194  static
195  IPSDK_FORCEINLINE
197  computeRangeMultiplier(ipUInt32 nMin, ipUInt32 nMax)
198  {
201 
202  const FloatType fMultiplier =
203  (static_cast<FloatType>(nMax) - static_cast<FloatType>(nMin) + 1.0) /
204  4294967296.0;
205  return AssignRegFloat::act(fMultiplier);
206  }
207 
    // Returns nMin - 2^31 as the offset. nMax is not used by the visible code.
    // Presumably the 2^31 shift keeps the intermediate result inside the signed
    // 32-bit range; act() below compensates using the minimum ipInt32 value.
208  static
209  IPSDK_FORCEINLINE
211  computeRangeOffset(ipUInt32 nMin, ipUInt32 nMax)
212  {
214  return AssignRegFloat::act(static_cast<FloatType>(nMin) - 2147483648.0);
215  }
216 
217  // updates seed
    // NOTE(review): name/parameter line (221) and body (223) missing from this
    // extract.
218  static
219  IPSDK_FORCEINLINE
220  void
222  {
224  }
225 
    // Draws sixteen random unsigned 32-bit values.
    // NOTE(review): the callee names on listing lines 238 and 247 are missing.
    // The visible arguments suggest that a signed int32 random register is
    // computed first and then combined with the minimum ipInt32 value to
    // obtain the unsigned result -- TODO confirm.
226  static
227  IPSDK_FORCEINLINE
229  act(const RegType<eInstructionSet::eIS_Avx512, FloatType>::Type& rRangeMultiplier,
232  {
236 
237  const RegInt32 rRandInt32 =
239  rRangeMultiplier,
240  rRangeOffset,
241  rSeed);
242 
243  const RegInt32 rMinInt32 =
244  AssignRegInt32::act(NumericLimits<ipInt32>::min());
245 
246  const RegUInt32 rRet =
248  rRandInt32,
249  rMinInt32);
250 
251  return rRet;
252  }
253 };
254 
// AVX-512 partial specialization enabled for integral types T with
// sizeof(T) == 2. Range scaling is done in single precision (ipReal32).
// NOTE(review): the struct declaration (listing line 258), the return types,
// the local typedefs, several parameter lines and the bodies of initSeed and
// of the seed update are absent from this extracted listing.
257 template <typename T>
259  typename boost::enable_if_c<
260  boost::is_integral<T>::value
261  && sizeof(T) == 2
262  >::type>
263 {
    // Floating point type used for the range scaling computations.
264  typedef ipReal32 FloatType;
265 
    // Creates the initial generator state from a scalar seed.
    // NOTE(review): body (listing line 271) missing from this extract.
266  static
267  IPSDK_FORCEINLINE
269  initSeed(ipUInt32 seed)
270  {
272  }
273 
    // Computes the scale factor (tMax - tMin + 1) / 2^16 in FloatType and hands
    // it to AssignRegFloat::act.
274  static
275  IPSDK_FORCEINLINE
277  computeRangeMultiplier(T tMin, T tMax)
278  {
281 
282  const FloatType fMultiplier =
283  (static_cast<FloatType>(tMax) - static_cast<FloatType>(tMin) + 1.0f) /
284  65536.0f;
285  return AssignRegFloat::act(fMultiplier);
286  }
287 
    // Returns tMin converted to FloatType through AssignRegFloat::act.
    // tMax is not used by the visible code.
288  static
289  IPSDK_FORCEINLINE
291  computeRangeOffset(T tMin, T tMax)
292  {
294 
295  return AssignRegFloat::act(static_cast<FloatType>(tMin));
296  }
297 
298  // updates seed
    // NOTE(review): name/parameter line (302) and body (304) missing from this
    // extract.
299  static
300  IPSDK_FORCEINLINE
301  void
303  {
305  }
306 
    // Generates two registers of sixteen random values each, stored as 32-bit
    // integers in rVal1 and rVal2, from three seed updates.
    // NOTE(review): the parameter list (listing lines 311-315) is missing; the
    // body reads rRangeMultiplier, rRangeOffset and rSeed and writes rVal1 and
    // rVal2.
307  static
308  IPSDK_FORCEINLINE
309  void
310  computeRand16BitsOnInt32Vals(
316  {
322  typedef BitwiseAndReg<eInstructionSet::eIS_Avx512, ipInt32> BitwiseAndRegInt32;
324 
325  const RegInt32 rMask7FFF = AssignRegInt32::act(0x7FFF);
326  const RegInt32 rMask1 = AssignRegInt32::act(1);
327 
    // Each seed update contributes bits 16..30 of the seed, i.e. 15 bits.
328  act(rSeed);
329  RegInt32 r1 = _mm512_srli_epi32(rSeed, 16);
330  r1 = BitwiseAndRegInt32::act(r1, rMask7FFF);
331 
332  act(rSeed);
333  RegInt32 r2 = _mm512_srli_epi32(rSeed, 16);
334  r2 = BitwiseAndRegInt32::act(r2, rMask7FFF);
335 
336  act(rSeed);
337  RegInt32 r3 = _mm512_srli_epi32(rSeed, 16);
338  r3 = BitwiseAndRegInt32::act(r3, rMask7FFF);
339 
    // First 16-bit value: the 15 bits of r1 shifted left by one, plus bit 0 of
    // r2 as the lowest bit.
340  RegFloat rRandValF = _mm512_cvtepi32_ps(
341  AddRegInt32::act(_mm512_slli_epi32(r1, 1), BitwiseAndRegInt32::act(r2, rMask1)));
342 
343  rRandValF = MulRegFloat::act(rRandValF, rRangeMultiplier);
344  // rRandValF holds no negative value here (assuming tMax >= tMin), so the truncating float -> int32 -> float round trip below is equivalent to calling "floor"
345  rRandValF = _mm512_cvtepi32_ps(_mm512_cvttps_epi32(rRandValF));
346  rRandValF = AddRegFloat::act(rRandValF, rRangeOffset);
347  rVal1 = _mm512_cvtps_epi32(rRandValF);
348 
    // Second 16-bit value: the 15 bits of r3 shifted left by one, plus bit 1 of
    // r2 as the lowest bit.
349  rRandValF = _mm512_cvtepi32_ps(
350  AddRegInt32::act(
351  _mm512_slli_epi32(r3, 1),
352  BitwiseAndRegInt32::act(_mm512_srli_epi32(r2, 1), rMask1)));
353  rRandValF = MulRegFloat::act(rRandValF, rRangeMultiplier);
354  // same truncation-as-floor trick as for the first value
355  rRandValF = _mm512_cvtepi32_ps(_mm512_cvttps_epi32(rRandValF));
356  rRandValF = AddRegFloat::act(rRandValF, rRangeOffset);
357  rVal2 = _mm512_cvtps_epi32(rRandValF);
358  }
359 
    // Draws one register of random T values: computes two int32 registers and
    // converts them into a single register of T through CastReg.
    // NOTE(review): the rRangeOffset / rSeed parameter lines (listing lines
    // 364-365) and a typedef line (367) are missing from this extract.
360  static
361  IPSDK_FORCEINLINE
363  act(const RegType<eInstructionSet::eIS_Avx512, FloatType>::Type& rRangeMultiplier,
366  {
368  typedef typename RegType<eInstructionSet::eIS_Avx512, T>::Type RegT;
369 
370  RegInt32 rRandVal1, rRandVal2;
371  computeRand16BitsOnInt32Vals(
372  rRangeMultiplier,
373  rRangeOffset,
374  rSeed,
375  rRandVal1,
376  rRandVal2);
377 
378  RegT rRandVal;
379  CastReg<eInstructionSet::eIS_Avx512, ipInt32, T>::act(rRandVal1, rRandVal2, rRandVal);
380  return rRandVal;
381  }
382 };
383 
384 
// AVX-512 partial specialization enabled for integral types T with
// sizeof(T) == 1. Range scaling is done in single precision (ipReal32).
// NOTE(review): the struct declaration (listing line 388), the return types,
// the local typedefs, several parameter lines and the bodies of initSeed and
// of the seed update are absent from this extracted listing.
387 template <typename T>
389  typename boost::enable_if_c<
390  boost::is_integral<T>::value
391  && sizeof(T) == 1
392  >::type>
393 {
    // Floating point type used for the range scaling computations.
394  typedef ipReal32 FloatType;
395 
    // Creates the initial generator state from a scalar seed.
    // NOTE(review): body (listing line 401) missing from this extract.
396  static
397  IPSDK_FORCEINLINE
399  initSeed(ipUInt32 seed)
400  {
402  }
403 
    // Computes the scale factor (tMax - tMin + 1) / 2^15 in FloatType and hands
    // it to AssignRegFloat::act. The divisor matches the 15 random bits
    // (mask 0x7FFF) that computeRandomOn32BitsVal extracts per draw.
404  static
405  IPSDK_FORCEINLINE
407  computeRangeMultiplier(T tMin, T tMax)
408  {
412 
413  const FloatType fMultiplier =
414  (static_cast<FloatType>(tMax) - static_cast<FloatType>(tMin) + 1.0f) /
415  32768.0f;
416  return AssignRegFloat::act(fMultiplier);
417  }
418 
    // Returns tMin converted to FloatType through AssignRegFloat::act.
    // tMax is not used by the visible code.
419  static
420  IPSDK_FORCEINLINE
422  computeRangeOffset(T tMin, T tMax)
423  {
425 
426  return AssignRegFloat::act(static_cast<FloatType>(tMin));
427  }
428 
429  // updates seed
    // NOTE(review): name/parameter line (433) and body (435) missing from this
    // extract.
430  static
431  IPSDK_FORCEINLINE
432  void
434  {
436  }
437 
    // Generates sixteen random values stored as 32-bit integers from a single
    // seed update.
    // NOTE(review): the parameter list (listing lines 442-444) is missing; the
    // body reads rRangeMultiplier, rRangeOffset and rSeed.
438  static
439  IPSDK_FORCEINLINE
441  computeRandomOn32BitsVal(
445  {
448  typedef BitwiseAndReg<eInstructionSet::eIS_Avx512, ipInt32> BitwiseAndRegInt32;
453 
454  const RegInt32 rMask = AssignRegInt32::act(0x7FFF);
455 
    // Take bits 16..30 of the updated seed (15 bits), convert to float, scale,
    // floor, add the offset and convert back to int32.
456  act(rSeed);
457  RegFloat rRandValF = _mm512_cvtepi32_ps(
458  BitwiseAndRegInt32::act(_mm512_srli_epi32(rSeed, 16), rMask));
459  rRandValF = MulRegFloat::act(rRandValF, rRangeMultiplier);
460  rRandValF = FloorRegFloat::act(rRandValF);
461  rRandValF = AddRegFloat::act(rRandValF, rRangeOffset);
462  return _mm512_cvtps_epi32(rRandValF);
463  }
464 
    // Draws one register of random T values from four successive int32 draws.
    // NOTE(review): the rRangeOffset / rSeed parameter lines (listing lines
    // 469-470), several typedefs and the callee name on line 489 are missing
    // from this extract.
465  static
466  IPSDK_FORCEINLINE
468  act(const RegType<eInstructionSet::eIS_Avx512, FloatType>::Type& rRangeMultiplier,
471  {
478  typedef typename RegType<eInstructionSet::eIS_Avx512, T>::Type RegT;
480  typedef BitwiseAndReg<eInstructionSet::eIS_Avx512, ipInt32> BitwiseAndRegInt32;
482 
483  const RegInt32 rRandVal1 = computeRandomOn32BitsVal(rRangeMultiplier, rRangeOffset, rSeed);
484  const RegInt32 rRandVal2 = computeRandomOn32BitsVal(rRangeMultiplier, rRangeOffset, rSeed);
485  const RegInt32 rRandVal3 = computeRandomOn32BitsVal(rRangeMultiplier, rRangeOffset, rSeed);
486  const RegInt32 rRandVal4 = computeRandomOn32BitsVal(rRangeMultiplier, rRangeOffset, rSeed);
487 
    // The int32 registers are first narrowed pairwise to 16-bit elements with
    // _mm512_packs_epi32; the two packed registers are then passed, together
    // with rRet, to the call whose name is on the missing line (presumably a
    // 16-bit -> T conversion that fills rRet -- TODO confirm).
488  RegT rRet;
490  _mm512_packs_epi32(rRandVal1, rRandVal2),
491  _mm512_packs_epi32(rRandVal3, rRandVal4),
492  rRet);
493  return rRet;
494  }
495 };
496 
// AVX-512 specialization whose range bounds are ipReal32 values; the scaling
// itself is carried out in double precision (ipReal64).
// NOTE(review): the struct declaration (listing line 500), the return types,
// the local typedefs, several parameter lines and the bodies of initSeed and
// of the seed update are absent from this extracted listing.
499 template <>
501 {
    // Floating point type used for the range scaling computations.
502  typedef ipReal64 FloatType;
503 
    // Creates the initial generator state from a scalar seed.
    // NOTE(review): body (listing line 509) missing from this extract.
504  static
505  IPSDK_FORCEINLINE
507  initSeed(ipUInt32 seed)
508  {
510  }
511 
    // Computes the scale factor (fMax - fMin) / (2^32 - 1) in FloatType and
    // hands it to AssignRegFloat::act. Unlike the integer variants there is no
    // "+ 1" and the divisor is 2^32 - 1.
512  static
513  IPSDK_FORCEINLINE
515  computeRangeMultiplier(ipReal32 fMin, ipReal32 fMax)
516  {
519 
520  const FloatType fMultiplier =
521  (static_cast<FloatType>(fMax) - static_cast<FloatType>(fMin)) /
522  4294967295.0;
523  return AssignRegFloat::act(fMultiplier);
524  }
525 
    // Returns fMin converted to FloatType through AssignRegFloat::act.
    // fMax is not used by the visible code.
526  static
527  IPSDK_FORCEINLINE
529  computeRangeOffset(ipReal32 fMin, ipReal32 fMax)
530  {
532  return AssignRegFloat::act(static_cast<FloatType>(fMin));
533  }
534 
535  // updates seed
    // NOTE(review): name/parameter line (539) and body (541) missing from this
    // extract.
536  static
537  IPSDK_FORCEINLINE
538  void
540  {
542  }
543 
    // Draws sixteen random single precision values. The 32-bit random integers
    // are processed in two halves of eight lanes in double precision, then
    // converted to float and merged.
    // NOTE(review): the rRangeOffset / rSeed parameter lines (listing lines
    // 548-549), the typedefs (551-556) and the initializer of rRandVal (559)
    // are missing from this extract.
544  static
545  IPSDK_FORCEINLINE
547  act(const RegType<eInstructionSet::eIS_Avx512, FloatType>::Type& rRangeMultiplier,
550  {
557 
558  const RegInt32 rRandVal =
    // 2^31: added after the signed int32 -> double conversion so that the
    // value lies in [0, 2^32) before scaling.
560  const RegFloat rUIntHlfRangeF = AssignRegFloat::act(2147483648.0);
561 
    // Lower eight lanes: convert, bias, scale, add the offset (no floor here).
562  RegFloat rRandValF1 =
563  _mm512_cvtepi32_pd(_mm512_castsi512_si256(rRandVal));
564  rRandValF1 = AddRegFloat::act(rRandValF1, rUIntHlfRangeF);
565  rRandValF1 = MulRegFloat::act(rRandValF1, rRangeMultiplier);
566  rRandValF1 = AddRegFloat::act(rRandValF1, rRangeOffset);
567 
    // Index vector selecting lanes 8..15 so that the upper half of rRandVal
    // ends up in the low 256 bits for the second conversion.
568  const __m512i duplicateHiMask = _mm512_setr_epi32(
569  8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15);
570 
    // Upper eight lanes: same processing as above.
571  RegFloat rRandValF2 = _mm512_cvtepi32_pd(
572  _mm512_castsi512_si256(_mm512_permutex2var_epi32(
573  rRandVal, duplicateHiMask, rRandVal)));
574  rRandValF2 = AddRegFloat::act(rRandValF2, rUIntHlfRangeF);
575  rRandValF2 = MulRegFloat::act(rRandValF2, rRangeMultiplier);
576  rRandValF2 = AddRegFloat::act(rRandValF2, rRangeOffset);
577 
    // Narrow both halves to float (256 bits each) and merge them: immediate
    // 0x44 takes the two low 128-bit lanes of the first operand followed by
    // the two low 128-bit lanes of the second operand.
578  RegFloat32 rRet = _mm512_shuffle_f32x4(
579  _mm512_castps256_ps512(_mm512_cvtpd_ps(rRandValF1)),
580  _mm512_castps256_ps512(_mm512_cvtpd_ps(rRandValF2)),
581  0x44);
582  return rRet;
583  }
584 };
585 
588 
589 } // end of namespace detail
590 } // end of namespace simd
591 } // end of namespace ipsdk
592 
593 #endif // __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_AVX512_UNIFORMRANDOMLCGREG_H__
Defines the IPSDK_FORCEINLINE.
Main namespace for IPSDK library.
Definition: AlgorithmFunctionEfficiency.h:22
Definition: SubReg.h:39
double ipReal64
Base types definition.
Definition: BaseTypes.h:57
template structure which is specialized to implement the computation of value rounded to closest even...
Definition: FloorReg.h:36
int32_t ipInt32
Base types definition.
Definition: BaseTypes.h:52
(including foundation and byte and word instructions)
Definition: InstructionSetTypes.h:51
Definition: NumericLimits.h:27
Definition: CastReg.h:30
RegType class.
eInstructionSet
Enumerate for processor instruction set description.
Definition: InstructionSetTypes.h:31
Definition: UniformRandomLCGReg.h:29
Definition of import/export macro for library.
template structure which is specialized to implement the arithmetic addition on 2 scalars or 2 regist...
Definition: AddReg.h:37
Definition: MulReg.h:39
Definition: RegType.h:29
Definition: SignedUnsignedOpReg.h:51
Definition: BitwiseAndReg.h:30
Definition: AssignRegDecl.h:31
float ipReal32
Base types definition.
Definition: BaseTypes.h:56
uint32_t ipUInt32
Base types definition.
Definition: BaseTypes.h:53