15 #ifndef __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_AVX2_UNIFORMRANDOMLCGREG_H__ 16 #define __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_AVX2_UNIFORMRANDOMLCGREG_H__ 21 #include <IPSDKUtil/InstructionSet/Arithmetic/detail/AddReg.h> 24 #include <IPSDKUtil/InstructionSet/Arithmetic/detail/SignedUnsignedOpReg.h> 31 #include <boost/type_traits/make_unsigned.hpp> 52 return _mm256_set_epi32(seed, seed+1, seed, seed+1,
53 seed, seed+1, seed, seed+1);
64 const FloatType fMultiplier =
65 (
static_cast<FloatType
>(nMax) - static_cast<FloatType>(nMin) + 1.0) /
67 return AssignRegFloat::act(fMultiplier);
76 return AssignRegFloat::act(static_cast<FloatType>(nMin));
85 const ipUInt32 mult[8] = { 214013, 17405, 214013, 69069, 1664525, 22695477, 1103515245, 134775813 };
86 const ipUInt32 gadd[8] = { 2531011, 10395331, 13737667, 1, 1013904223, 1, 12345, 1 };
90 const __m256i multiplier = _mm256_loadu_si256((__m256i*) mult);
91 const __m256i adder = _mm256_loadu_si256((__m256i*) gadd);
108 const RegInt32 rMask7FFF = AssignRegInt32::act(0x7FFF);
109 const RegInt32 rMask3 = AssignRegInt32::act(3);
112 RegInt32 r1 = _mm256_srli_epi32(rSeed, 16);
113 r1 = BitwiseAndRegInt32::act(r1, rMask7FFF);
116 RegInt32 r2 = _mm256_srli_epi32(rSeed, 16);
117 r2 = BitwiseAndRegInt32::act(r2, rMask7FFF);
120 RegInt32 r3 = _mm256_srli_epi32(rSeed, 16);
121 r3 = BitwiseAndRegInt32::act(r3, rMask7FFF);
123 RegInt32 rRandVal = AddRegInt32::act(_mm256_slli_epi32(r1, 17),
124 _mm256_slli_epi32(r2, 2));
125 rRandVal = AddRegInt32::act(rRandVal,
126 BitwiseAndRegInt32::act(r3, rMask3));
143 RegInt32 rRandVal = computeRandom32bits(rSeed);
144 const RegFloat rUIntHlfRangeF = AssignRegFloat::act(2147483648.0);
146 RegFloat rRandValF1 =
147 _mm256_cvtepi32_pd(_mm256_extractf128_si256(rRandVal, 0));
148 rRandValF1 = AddRegFloat::act(rRandValF1, rUIntHlfRangeF);
149 rRandValF1 = MulRegFloat::act(rRandValF1, rRangeMultiplier);
151 rRandValF1 = _mm256_floor_pd(rRandValF1);
152 rRandValF1 = AddRegFloat::act(rRandValF1, rRangeOffset);
154 RegFloat rRandValF2 = _mm256_cvtepi32_pd(
155 _mm256_extractf128_si256(rRandVal, 1));
156 rRandValF2 = AddRegFloat::act(rRandValF2, rUIntHlfRangeF);
157 rRandValF2 = MulRegFloat::act(rRandValF2, rRangeMultiplier);
159 rRandValF2 = _mm256_floor_pd(rRandValF2);
160 rRandValF2 = AddRegFloat::act(rRandValF2, rRangeOffset);
163 RegInt32 rRet = _mm256_castsi128_si256(_mm256_cvtpd_epi32(rRandValF1));
164 rRet = _mm256_inserti128_si256(
165 rRet, _mm256_cvtpd_epi32(rRandValF2), 1);
194 const FloatType fMultiplier =
195 (
static_cast<FloatType
>(nMax) - static_cast<FloatType>(nMin) + 1.0) /
197 return AssignRegFloat::act(fMultiplier);
206 return AssignRegFloat::act(static_cast<FloatType>(nMin) - 2147483648.0);
229 const RegInt32 rRandInt32 =
235 const RegInt32 rMinInt32 =
238 const RegUInt32 rRet =
249 template <
typename T>
251 typename boost::enable_if_c<
252 boost::is_integral<T>::value
269 computeRangeMultiplier(T tMin, T tMax)
274 const FloatType fMultiplier =
275 (
static_cast<FloatType
>(tMax) - static_cast<FloatType>(tMin) + 1.0f) /
277 return AssignRegFloat::act(fMultiplier);
283 computeRangeOffset(T tMin, T tMax)
287 return AssignRegFloat::act(static_cast<FloatType>(tMin));
302 computeRand16BitsOnInt32Vals(
317 const RegInt32 rMask7FFF = AssignRegInt32::act(0x7FFF);
318 const RegInt32 rMask1 = AssignRegInt32::act(1);
321 RegInt32 r1 = _mm256_srli_epi32(rSeed, 16);
322 r1 = BitwiseAndRegInt32::act(r1, rMask7FFF);
325 RegInt32 r2 = _mm256_srli_epi32(rSeed, 16);
326 r2 = BitwiseAndRegInt32::act(r2, rMask7FFF);
329 RegInt32 r3 = _mm256_srli_epi32(rSeed, 16);
330 r3 = BitwiseAndRegInt32::act(r3, rMask7FFF);
332 RegFloat rRandValF = _mm256_cvtepi32_ps(
333 AddRegInt32::act(_mm256_slli_epi32(r1, 1), BitwiseAndRegInt32::act(r2, rMask1)));
335 rRandValF = MulRegFloat::act(rRandValF, rRangeMultiplier);
337 rRandValF = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(rRandValF));
338 rRandValF = AddRegFloat::act(rRandValF, rRangeOffset);
339 rVal1 = _mm256_cvtps_epi32(rRandValF);
341 rRandValF = _mm256_cvtepi32_ps(
343 _mm256_slli_epi32(r3, 1),
344 BitwiseAndRegInt32::act(_mm256_srli_epi32(r2, 1), rMask1)));
345 rRandValF = MulRegFloat::act(rRandValF, rRangeMultiplier);
347 rRandValF = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(rRandValF));
348 rRandValF = AddRegFloat::act(rRandValF, rRangeOffset);
349 rVal2 = _mm256_cvtps_epi32(rRandValF);
362 RegInt32 rRandVal1, rRandVal2;
363 computeRand16BitsOnInt32Vals(
379 template <
typename T>
381 typename boost::enable_if_c<
382 boost::is_integral<T>::value
399 computeRangeMultiplier(T tMin, T tMax)
405 const FloatType fMultiplier =
406 (
static_cast<FloatType
>(tMax) - static_cast<FloatType>(tMin) + 1.0f) /
408 return AssignRegFloat::act(fMultiplier);
414 computeRangeOffset(T tMin, T tMax)
418 return AssignRegFloat::act(static_cast<FloatType>(tMin));
433 computeRandomOn32BitsVal(
445 const RegInt32 rMask = AssignRegInt32::act(0x7FFF);
448 RegFloat rRandValF = _mm256_cvtepi32_ps(
449 BitwiseAndRegInt32::act(_mm256_srli_epi32(rSeed, 16), rMask));
450 rRandValF = MulRegFloat::act(rRandValF, rRangeMultiplier);
451 rRandValF = _mm256_floor_ps(rRandValF);
452 rRandValF = AddRegFloat::act(rRandValF, rRangeOffset);
453 return _mm256_cvtps_epi32(rRandValF);
474 const RegInt32 rRandVal1 = computeRandomOn32BitsVal(rRangeMultiplier, rRangeOffset, rSeed);
475 const RegInt32 rRandVal2 = computeRandomOn32BitsVal(rRangeMultiplier, rRangeOffset, rSeed);
476 const RegInt32 rRandVal3 = computeRandomOn32BitsVal(rRangeMultiplier, rRangeOffset, rSeed);
477 const RegInt32 rRandVal4 = computeRandomOn32BitsVal(rRangeMultiplier, rRangeOffset, rSeed);
481 _mm256_packs_epi32(rRandVal1, rRandVal2),
482 _mm256_packs_epi32(rRandVal3, rRandVal4),
511 const FloatType fMultiplier =
512 (
static_cast<FloatType
>(fMax) - static_cast<FloatType>(fMin)) /
514 return AssignRegFloat::act(fMultiplier);
523 return AssignRegFloat::act(static_cast<FloatType>(fMin));
549 const RegInt32 rRandVal =
551 const RegFloat rUIntHlfRangeF = AssignRegFloat::act(2147483648.0);
553 RegFloat rRandValF1 =
554 _mm256_cvtepi32_pd(_mm256_extractf128_si256(rRandVal, 0));
555 rRandValF1 = AddRegFloat::act(rRandValF1, rUIntHlfRangeF);
556 rRandValF1 = MulRegFloat::act(rRandValF1, rRangeMultiplier);
557 rRandValF1 = AddRegFloat::act(rRandValF1, rRangeOffset);
559 RegFloat rRandValF2 = _mm256_cvtepi32_pd(
560 _mm256_extractf128_si256(rRandVal, 1));
561 rRandValF2 = AddRegFloat::act(rRandValF2, rUIntHlfRangeF);
562 rRandValF2 = MulRegFloat::act(rRandValF2, rRangeMultiplier);
563 rRandValF2 = AddRegFloat::act(rRandValF2, rRangeOffset);
565 RegFloat32 rRet = _mm256_castps128_ps256(_mm256_cvtpd_ps(rRandValF1));
566 rRet = _mm256_insertf128_ps(
567 rRet, _mm256_cvtpd_ps(rRandValF2), 1);
579 #endif // __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_AVX2_UNIFORMRANDOMLCGREG_H__
Defines the IPSDK_FORCEINLINE.
Main namespace for IPSDK library.
Definition: AlgorithmFunctionEfficiency.h:22
double ipReal64
Base types definition.
Definition: BaseTypes.h:57
int32_t ipInt32
Base types definition.
Definition: BaseTypes.h:52
Definition: NumericLimits.h:27
eInstructionSet
Enumerate for processor instruction set description.
Definition: InstructionSetTypes.h:31
Advanced Vector Extensions 2.
Definition: InstructionSetTypes.h:48
Definition: UniformRandomLCGReg.h:29
Definition of import/export macro for library.
template structure which is specialized to implement the arithmetic addition on 2 scalars or 2 regist...
Definition: AddReg.h:37
Definition: SignedUnsignedOpReg.h:51
Definition: BitwiseAndReg.h:30
Definition: AssignRegDecl.h:31
float ipReal32
Base types definition.
Definition: BaseTypes.h:56
uint32_t ipUInt32
Base types definition.
Definition: BaseTypes.h:53