15 #ifndef __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_SSE2_SCANPACK_H__ 16 #define __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_SSE2_SCANPACK_H__ 20 #include <IPSDKUtil/InstructionSet/Arithmetic/detail/ScanReg.h> 38 static IPSDK_FORCEINLINE
// Carry propagation across the four registers of the pack: lane 3 of each
// completed register is broadcast to all lanes and added to the next register.
// NOTE(review): this listing omits the statements between the numbered lines
// below; presumably each out._val[k] already holds its own in-register scan
// before the carry is added - confirm against the full file.
45 __m128 offset = _mm_shuffle_ps(out._val[0], out._val[0], _MM_SHUFFLE(3, 3, 3, 3));
48 out._val[1] = _mm_add_ps(out._val[1], offset);
49 offset = _mm_shuffle_ps(out._val[1], out._val[1], _MM_SHUFFLE(3, 3, 3, 3));
52 out._val[2] = _mm_add_ps(out._val[2], offset);
// Fix: both shuffle operands must be out._val[2]. With selector (3, 3, 3, 3)
// the two upper result lanes come from the second operand, so passing
// out._val[1] there put the stale total of register 1 into half of the carry.
53 offset = _mm_shuffle_ps(out._val[2], out._val[2], _MM_SHUFFLE(3, 3, 3, 3));
// Fix: add the carry to out._val[3] itself (as done for registers 1 and 2)
// instead of overwriting it with out._val[2] + carry.
56 out._val[3] = _mm_add_ps(out._val[3], offset);
57 offset = _mm_shuffle_ps(out._val[3], out._val[3], _MM_SHUFFLE(3, 3, 3, 3));
62 static IPSDK_FORCEINLINE
// Carry propagation across the four registers of the pack: lane 3 of each
// completed register is broadcast to all lanes and added to the next register.
// NOTE(review): this listing omits the statements between the numbered lines
// below; presumably each out._val[k] already holds its own in-register scan
// before the carry is added - confirm against the full file.
69 __m128 offset = _mm_shuffle_ps(out._val[0], out._val[0], _MM_SHUFFLE(3, 3, 3, 3));
72 out._val[1] = _mm_add_ps(out._val[1], offset);
73 offset = _mm_shuffle_ps(out._val[1], out._val[1], _MM_SHUFFLE(3, 3, 3, 3));
76 out._val[2] = _mm_add_ps(out._val[2], offset);
// Fix: both shuffle operands must be out._val[2]. With selector (3, 3, 3, 3)
// the two upper result lanes come from the second operand, so passing
// out._val[1] there put the stale total of register 1 into half of the carry.
77 offset = _mm_shuffle_ps(out._val[2], out._val[2], _MM_SHUFFLE(3, 3, 3, 3));
// Fix: add the carry to out._val[3] itself (as done for registers 1 and 2)
// instead of overwriting it with out._val[2] + carry.
80 out._val[3] = _mm_add_ps(out._val[3], offset);
81 offset = _mm_shuffle_ps(out._val[3], out._val[3], _MM_SHUFFLE(3, 3, 3, 3));
88 static IPSDK_FORCEINLINE
// Carry added to every lane of each processed register; starts at zero.
91 __m128 regOffset = _mm_setzero_ps();
// Register-wise loop up to nbAlignedElts; i advances by
// NbEltsPerReg<ePT_Sse, ipReal32>::Value on each iteration.
99 for(; i < nbAlignedElts; i+=simd::NbEltsPerReg<ePackType::ePT_Sse, ipReal32>::Value) {
// Add the carry from the previous iteration to all lanes of regOut.
// NOTE(review): the statements that produce regOut are not visible in this listing.
103 regOut = _mm_add_ps(regOut, regOffset);
// Broadcast lane 3 of regOut to all four lanes; this becomes the next carry.
105 regOffset = _mm_shuffle_ps(regOut, regOut, _MM_SHUFFLE(3, 3, 3, 3));
// Element-wise loop over the remaining indices up to nbElts.
107 for (; i < nbElts; ++i) {
// NOTE(review): reads out[i - 1]; this listing does not show how i == 0 is
// handled when the register-wise loop processed nothing - confirm.
110 out[i] = out[i - 1] + in[i];
118 static IPSDK_FORCEINLINE
// Carry added to both lanes of each processed register; starts at zero.
121 __m128d regOffset = _mm_setzero_pd();
// Register-wise loop up to nbAlignedElts; i advances by
// NbEltsPerReg<ePT_Sse, ipReal64>::Value on each iteration.
129 for (; i < nbAlignedElts; i += simd::NbEltsPerReg<ePackType::ePT_Sse, ipReal64>::Value) {
// Add the carry from the previous iteration to both lanes of regOut.
// NOTE(review): the statements that produce regOut are not visible in this listing.
133 regOut = _mm_add_pd(regOut, regOffset);
// Broadcast the upper lane (index 1) of regOut to both lanes; this becomes the next carry.
135 regOffset = _mm_shuffle_pd(regOut, regOut, _MM_SHUFFLE2(1, 1));
// Element-wise loop over the remaining indices up to nbElts.
137 for (; i < nbElts; ++i) {
// NOTE(review): reads out[i - 1]; this listing does not show how i == 0 is
// handled when the register-wise loop processed nothing - confirm.
140 out[i] = out[i - 1] + in[i];
152 #endif // __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_SSE2_SCANPACK_H__
Defines the IPSDK_FORCEINLINE macro.
Main namespace for IPSDK library.
Definition: AlgorithmFunctionEfficiency.h:22
Template structure specialized to implement the cumulative sum on 2 registers, depending on the instruction set used and on the types of the buffers loaded into the registers.
Definition: ScanReg.h:36
uint64_t ipUInt64
Base types definition.
Definition: BaseTypes.h:55
double ipReal64
Base types definition.
Definition: BaseTypes.h:57
Definition: NbEltsPerReg.h:28
eInstructionSet
Enumeration describing processor instruction sets.
Definition: InstructionSetTypes.h:31
Definition: ScanPack.h:35
Definition of import/export macro for library.
structure containing intrinsic registers used to store vectorized data
Definition: BaseReg.h:29
Streaming SIMD Extensions 2.
Definition: InstructionSetTypes.h:36
Definition: UnloadReg.h:30
Definition: LoadRegDecl.h:30
Definition: ScanPack.h:30
float ipReal32
Base types definition.
Definition: BaseTypes.h:56
structure containing intrinsic registers used to store vectorized data
Definition: BasePackDecl.h:29