IPSDK  4_1_0_2
IPSDK : Image Processing Software Development Kit
ScanPack.h
Go to the documentation of this file.
1 // ScanPack.h:
3 // -------------------
4 //
14 
15 #ifndef __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_AVX_SCANPACK_H__
16 #define __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_AVX_SCANPACK_H__
17 
20 #include <IPSDKUtil/InstructionSet/Arithmetic/detail/ScanReg.h>
26 
27 namespace ipsdk {
28 namespace simd {
29 namespace detail {
30 
33 
// Specialization of the pack-level cumulative sum (scan) for AVX registers
// holding ipReal32 values. A pack is processed as four registers
// (_val[0].._val[3]); each register is scanned independently with ScanReg and
// then shifted by the running total of all previous registers.
// NOTE(review): the struct declaration line and the act() signature lines are
// absent from this listing; only the visible lines are documented here.
36 template <>
38 {
// Convenience overload: runs the in/out version below and returns the result.
39  static IPSDK_FORCEINLINE
42  {
44  act(in, out);
45  return out;
46  }
47 
// In/out version: writes the cumulative sum of the pack `in` into `out`.
48  static IPSDK_FORCEINLINE
49  void
52  {
// Register 0 needs no carry. permute2f128(x, x, 0x11) copies the upper
// 128-bit half into both halves, and permute_ps(.., 0xff) then replicates
// element 3 of each half, so `offset` holds the last scanned value of the
// register in all eight lanes.
53  out._val[0] = ScanReg<eInstructionSet::eIS_Avx, ipReal32>::act(in._val[0]);
54  __m256 t0 = _mm256_permute2f128_ps(out._val[0], out._val[0], 0x11);
55  __m256 offset = _mm256_permute_ps(t0, 0xff);
56 
// Register 1: local scan plus the running total carried from register 0.
57  out._val[1] = ScanReg<eInstructionSet::eIS_Avx, ipReal32>::act(in._val[1]);
58  out._val[1] = _mm256_add_ps(out._val[1], offset);
59  t0 = _mm256_permute2f128_ps(out._val[1], out._val[1], 0x11);
60  offset = _mm256_permute_ps(t0, 0xff);
61 
// Register 2: same pattern, carrying the total of registers 0 and 1.
62  out._val[2] = ScanReg<eInstructionSet::eIS_Avx, ipReal32>::act(in._val[2]);
63  out._val[2] = _mm256_add_ps(out._val[2], offset);
64  t0 = _mm256_permute2f128_ps(out._val[2], out._val[2], 0x11);
65  offset = _mm256_permute_ps(t0, 0xff);
66 
// Register 3: must scan in._val[3] and add the carry to its own scan result
// (previously in._val[2] / out._val[2] were used here by mistake, which
// discarded the fourth input register entirely).
67  out._val[3] = ScanReg<eInstructionSet::eIS_Avx, ipReal32>::act(in._val[3]);
68  out._val[3] = _mm256_add_ps(out._val[3], offset);
69  }
70 };
71 
// Specialization of the buffer-level cumulative sum for AVX / ipReal32.
// NOTE(review): the struct declaration line and the lines that declare, load
// and store `regOut` are absent from this listing; only the visible lines are
// documented here.
72 template <>
74 {
// Writes into `out` the running sum of the `nbElts` values read from `in`.
75  static IPSDK_FORCEINLINE
76  void act(const ipReal32* in, ipReal32* out, ipUInt64 nbElts)
77  {
// Running total of everything processed so far, replicated in all lanes.
78  __m256 regOffset = _mm256_setzero_ps();
79 
// Largest multiple of the per-register element count not exceeding nbElts.
80  const ipUInt64 nbAlignedElts = nbElts - (nbElts%simd::NbEltsPerReg<ePackType::ePT_Avx, ipReal32>::Value);
81 
82  ipUInt64 i=0;
83 
85 
// Vectorized part: one full register per iteration.
86  for(; i < nbAlignedElts; i+=simd::NbEltsPerReg<ePackType::ePT_Avx, ipReal32>::Value) {
87 
90 
// Shift the register-local result by the running total, then broadcast the
// last lane of the shifted result as the new running total.
91  regOut = _mm256_add_ps(regOut, regOffset);
93  __m256 t0 = _mm256_permute2f128_ps(regOut, regOut, 0x11);
94  regOffset = _mm256_permute_ps(t0, 0xff);
95  }
// Scalar tail for the remaining (fewer than one register) elements.
96  for (; i < nbElts; ++i) {
97 
// Sequential cumulative sum. When no full register was processed the tail
// starts at i == 0, where there is no previous output to read: out[i - 1]
// would wrap the unsigned index and access memory before the buffer.
99  out[i] = (i == 0) ? in[i] : out[i - 1] + in[i];
100  }
101  }
102 };
103 
// Specialization of the buffer-level cumulative sum for AVX / ipReal64.
// NOTE(review): the struct declaration line and the lines that declare, load
// and store `regOut` / `regOut2` are absent from this listing; only the
// visible lines are documented here.
104 template <>
106 {
// Writes into `out` the running sum of the `nbElts` values read from `in`.
107  static IPSDK_FORCEINLINE
108  void act(const ipReal64* in, ipReal64* out, ipUInt64 nbElts)
109  {
// Running total of everything processed so far, replicated in all lanes.
110  __m256d regOffset = _mm256_setzero_pd();
111 
// Largest multiple of the per-register element count not exceeding nbElts.
112  const ipUInt64 nbAlignedElts = nbElts - (nbElts%simd::NbEltsPerReg<ePackType::ePT_Avx, ipReal64>::Value);
113 
114  ipUInt64 i = 0;
115 
117 
// Vectorized part: one full register per iteration.
118  for (; i < nbAlignedElts; i += simd::NbEltsPerReg<ePackType::ePT_Avx, ipReal64>::Value) {
119 
122 
// Shift the register-local result by the running total.
123  regOut = _mm256_add_pd(regOut, regOffset);
125 
// permute2f128(x, x, 0x11) duplicates the upper 128-bit half; shuffle_pd with
// mask 0xF then picks the high element of each half, so the last value of the
// register ends up in every lane as the new running total.
// NOTE(review): the start of the statement on listing line 127 (presumably
// the assignment to regOut2) is missing from this listing.
127  _mm256_permute2f128_pd(regOut, regOut, 0x11);
128  regOffset = _mm256_shuffle_pd(regOut2, regOut2, 0xF);
129  }
// Scalar tail for the remaining (fewer than one register) elements.
130  for (; i < nbElts; ++i) {
131 
// Sequential cumulative sum. When no full register was processed the tail
// starts at i == 0, where there is no previous output to read: out[i - 1]
// would wrap the unsigned index and access memory before the buffer.
133  out[i] = (i == 0) ? in[i] : out[i - 1] + in[i];
134  }
135  }
136 };
139 
140 } // end of namespace detail
141 } // end of namespace simd
142 } // end of namespace ipsdk
143 
144 #endif // __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_AVX_SCANPACK_H__
Defines the IPSDK_FORCEINLINE.
Main namespace for IPSDK library.
Definition: AlgorithmFunctionEfficiency.h:22
Template structure specialized to implement the cumulative sum on 2 registers, depending on the instruction set used and on the types of the buffers loaded in the registers.
Definition: ScanReg.h:36
uint64_t ipUInt64
Base types definition.
Definition: BaseTypes.h:55
double ipReal64
Base types definition.
Definition: BaseTypes.h:57
Definition: NbEltsPerReg.h:28
eInstructionSet
Enumerate for processor instruction set description.
Definition: InstructionSetTypes.h:31
Advanced Vector Extensions.
Definition: InstructionSetTypes.h:44
Definition: ScanPack.h:35
Definition of import/export macro for library.
structure containing intrinsic registers used to store vectorized data
Definition: BaseReg.h:29
Definition: UnloadReg.h:30
Definition: LoadRegDecl.h:30
Definition: ScanPack.h:30
float ipReal32
Base types definition.
Definition: BaseTypes.h:56
structure containing intrinsic registers used to store vectorized data
Definition: BasePackDecl.h:29