IPSDK  4_1_0_2
IPSDK : Image Processing Software Development Kit
ScanPack.h
Go to the documentation of this file.
1 // ScanPack.h:
3 // -------------------
4 //
14 
15 #ifndef __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_SSE2_SCANPACK_H__
16 #define __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_SSE2_SCANPACK_H__
17 
20 #include <IPSDKUtil/InstructionSet/Arithmetic/detail/ScanReg.h>
25 
26 namespace ipsdk {
27 namespace simd {
28 namespace detail {
29 
32 
// Cumulative sum over the four SSE registers (_val[0.._val[3], ipReal32 lanes) of a pack.
// Each register is scanned on its own with ScanReg; the last lane of the previous
// register's result is then broadcast and added as a running offset.
// NOTE(review): the struct header, both act() signatures and the declaration of `out`
// are elided from this listing and are intentionally left untouched.
35 template <>
37 {
38  static IPSDK_FORCEINLINE
41  {
43  out._val[0] = ScanReg<eInstructionSet::eIS_Sse2, ipReal32>::act(in._val[0]);
44 
45  __m128 offset = _mm_shuffle_ps(out._val[0], out._val[0], _MM_SHUFFLE(3, 3, 3, 3)); // broadcast lane 3 (running total)
46 
47  out._val[1] = ScanReg<eInstructionSet::eIS_Sse2, ipReal32>::act(in._val[1]);
48  out._val[1] = _mm_add_ps(out._val[1], offset);
49  offset = _mm_shuffle_ps(out._val[1], out._val[1], _MM_SHUFFLE(3, 3, 3, 3));
50 
51  out._val[2] = ScanReg<eInstructionSet::eIS_Sse2, ipReal32>::act(in._val[2]);
52  out._val[2] = _mm_add_ps(out._val[2], offset);
53  offset = _mm_shuffle_ps(out._val[2], out._val[2], _MM_SHUFFLE(3, 3, 3, 3)); // both operands must be _val[2]: the upper lanes come from the 2nd operand
54 
55  out._val[3] = ScanReg<eInstructionSet::eIS_Sse2, ipReal32>::act(in._val[3]); // scan the 4th input register (was in._val[2])
56  out._val[3] = _mm_add_ps(out._val[3], offset); // offset the 4th scanned register (was out._val[2])
58 
59  return out;
60  }
61 
62  static IPSDK_FORCEINLINE
63  void
66  {
68 
69  __m128 offset = _mm_shuffle_ps(out._val[0], out._val[0], _MM_SHUFFLE(3, 3, 3, 3)); // broadcast lane 3 (running total)
70 
72  out._val[1] = _mm_add_ps(out._val[1], offset);
73  offset = _mm_shuffle_ps(out._val[1], out._val[1], _MM_SHUFFLE(3, 3, 3, 3));
74 
76  out._val[2] = _mm_add_ps(out._val[2], offset);
77  offset = _mm_shuffle_ps(out._val[2], out._val[2], _MM_SHUFFLE(3, 3, 3, 3)); // both operands must be _val[2]: the upper lanes come from the 2nd operand
78 
80  out._val[3] = _mm_add_ps(out._val[3], offset); // offset the 4th register (was out._val[2]); NOTE(review): verify elided line 79 scans _val[3]
82  }
83 };
84 
// Cumulative sum of a contiguous ipReal32 buffer: out[i] = in[0] + ... + in[i].
// in      source buffer, nbElts elements
// out     destination buffer, nbElts elements
// nbElts  number of elements to process
// Whole SSE registers are handled by the vector loop, carrying the running total in
// regOffset; the remaining elements are handled by the scalar tail loop.
// NOTE(review): the struct header and the load / scan / unload lines of the vector
// loop are elided from this listing and are intentionally left untouched.
85 template <>
87 {
88  static IPSDK_FORCEINLINE
89  void act(const ipReal32* in, ipReal32* out, ipUInt64 nbElts)
90  {
91  __m128 regOffset = _mm_setzero_ps();
92 
93  const ipUInt64 nbAlignedElts = nbElts - (nbElts%simd::NbEltsPerReg<ePackType::ePT_Sse, ipReal32>::Value);
94 
95  ipUInt64 i=0;
96 
98 
99  for(; i < nbAlignedElts; i+=simd::NbEltsPerReg<ePackType::ePT_Sse, ipReal32>::Value) {
100 
103  regOut = _mm_add_ps(regOut, regOffset);
105  regOffset = _mm_shuffle_ps(regOut, regOut, _MM_SHUFFLE(3, 3, 3, 3)); // broadcast lane 3 (running total) for the next register
106  }
107  for (; i < nbElts; ++i) {
108 
109  // scalar tail of the cumulative sum; element 0 has no predecessor, so do not read out[i - 1] when i == 0
110  out[i] = (i == 0) ? in[i] : out[i - 1] + in[i];
111  }
112  }
113 };
114 
// Cumulative sum of a contiguous ipReal64 buffer: out[i] = in[0] + ... + in[i].
// in      source buffer, nbElts elements
// out     destination buffer, nbElts elements
// nbElts  number of elements to process
// Whole SSE registers are handled by the vector loop, carrying the running total in
// regOffset; the remaining elements are handled by the scalar tail loop.
// NOTE(review): the struct header and the load / scan / unload lines of the vector
// loop are elided from this listing and are intentionally left untouched.
115 template <>
117 {
118  static IPSDK_FORCEINLINE
119  void act(const ipReal64* in, ipReal64* out, ipUInt64 nbElts)
120  {
121  __m128d regOffset = _mm_setzero_pd();
122 
123  const ipUInt64 nbAlignedElts = nbElts - (nbElts%simd::NbEltsPerReg<ePackType::ePT_Sse, ipReal64>::Value);
124 
125  ipUInt64 i = 0;
126 
128 
129  for (; i < nbAlignedElts; i += simd::NbEltsPerReg<ePackType::ePT_Sse, ipReal64>::Value) {
130 
133  regOut = _mm_add_pd(regOut, regOffset);
135  regOffset = _mm_shuffle_pd(regOut, regOut, _MM_SHUFFLE2(1, 1)); // broadcast the high lane (running total) for the next register
136  }
137  for (; i < nbElts; ++i) {
138 
139  // scalar tail of the cumulative sum; element 0 has no predecessor, so do not read out[i - 1] when i == 0
140  out[i] = (i == 0) ? in[i] : out[i - 1] + in[i];
141  }
142  }
143 };
144 
147 
148 } // end of namespace detail
149 } // end of namespace simd
150 } // end of namespace ipsdk
151 
152 #endif // __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_SSE2_SCANPACK_H__
Defines the IPSDK_FORCEINLINE macro.
Main namespace for IPSDK library.
Definition: AlgorithmFunctionEfficiency.h:22
Template structure specialized to implement the cumulative sum on 2 registers, depending on the instruction set used and on the types of the buffers loaded into the registers.
Definition: ScanReg.h:36
uint64_t ipUInt64
Base types definition.
Definition: BaseTypes.h:55
double ipReal64
Base types definition.
Definition: BaseTypes.h:57
Definition: NbEltsPerReg.h:28
eInstructionSet
Enumerate for processor instruction set description.
Definition: InstructionSetTypes.h:31
Definition: ScanPack.h:35
Definition of import/export macro for library.
structure containing intrinsic registers used to store vectorized data
Definition: BaseReg.h:29
Streaming SIMD Extensions 2.
Definition: InstructionSetTypes.h:36
Definition: UnloadReg.h:30
Definition: LoadRegDecl.h:30
Definition: ScanPack.h:30
float ipReal32
Base types definition.
Definition: BaseTypes.h:56
structure containing intrinsic registers used to store vectorized data
Definition: BasePackDecl.h:29