IPSDK  4_1_0_2
IPSDK : Image Processing Software Development Kit
ScanPack.h
Go to the documentation of this file.
1 // ScanPack.h:
3 // -------------------
4 //
14 
15 #ifndef __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_AVX512_SCANPACK_H__
16 #define __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_AVX512_SCANPACK_H__
17 
25 
26 namespace ipsdk {
27 namespace simd {
28 namespace detail {
29 
32 
35 template <>
37 {
38  static IPSDK_FORCEINLINE
41  {
43  }
44 
45  static IPSDK_FORCEINLINE
46  void
49  {
51  }
52 };
53 
// Specialization operating on ipReal32 buffers with 512-bit __m512 registers.
// NOTE(review): this listing omits several original lines (the embedded
// numbering skips 55, 66, 70-71 and 74), so the struct header, the declaration
// of regOut and whatever loads/stores it are not visible here.
54 template <>
56 {
57  static IPSDK_FORCEINLINE
// in     : buffer that is read
// out    : buffer that is written
// nbElts : number of elements to process
58  void act(const ipReal32* in, ipReal32* out, ipUInt64 nbElts)
59  {
// Value added to every lane of the current register; starts at zero.
60  __m512 regOffset = _mm512_setzero_ps();
61 
// Largest multiple of the per-register element count that is <= nbElts.
62  const ipUInt64 nbAlignedElts = nbElts - (nbElts%simd::NbEltsPerReg<ePackType::ePT_Avx512, ipReal32>::Value);
63 
64  ipUInt64 i = 0;
65 
67 
// Vector part: one full register per iteration.
68  for (; i < nbAlignedElts; i += simd::NbEltsPerReg<ePackType::ePT_Avx512, ipReal32>::Value) {
69 
72 
// Shift the current register by the offset carried from the previous one.
73  regOut = _mm512_add_ps(regOut, regOffset);
// Every index is 15 and both table operands are regOut, so all lanes of
// regOffset receive lane 15 (the last lane) of regOut.
75  regOffset = _mm512_permutex2var_ps(regOut, _mm512_set1_epi32(15), regOut);
76  }
// Scalar part: remaining elements that do not fill a whole register.
77  for (; i < nbElts; ++i) {
78 
// Running sum: previous output plus current input.
// NOTE(review): if nbElts is smaller than one register's element count,
// nbAlignedElts is 0 and this loop starts at i == 0, so out[i - 1] would
// index before the buffer — confirm against the complete header.
79  // running sum (the original comment said "update of mean")
80  out[i] = out[i - 1] + in[i];
81  }
82  }
83 };
84 
// Specialization operating on ipReal64 buffers with 512-bit __m512d registers.
// NOTE(review): this listing omits several original lines (the embedded
// numbering skips 86, 97, 101-102 and 105), so the struct header, the
// declaration of regOut and whatever loads/stores it are not visible here.
85 template <>
87 {
88  static IPSDK_FORCEINLINE
// in     : buffer that is read
// out    : buffer that is written
// nbElts : number of elements to process
89  void act(const ipReal64* in, ipReal64* out, ipUInt64 nbElts)
90  {
// Value added to every lane of the current register; starts at zero.
91  __m512d regOffset = _mm512_setzero_pd();
92 
// Largest multiple of the per-register element count that is <= nbElts.
93  const ipUInt64 nbAlignedElts = nbElts - (nbElts%simd::NbEltsPerReg<ePackType::ePT_Avx512, ipReal64>::Value);
94 
95  ipUInt64 i = 0;
96 
98 
// Vector part: one full register per iteration.
99  for (; i < nbAlignedElts; i += simd::NbEltsPerReg<ePackType::ePT_Avx512, ipReal64>::Value) {
100 
103 
// Shift the current register by the offset carried from the previous one.
104  regOut = _mm512_add_pd(regOut, regOffset);
// Every index is 63: its low three bits select lane 7, and since both table
// operands are regOut the source-select bit does not matter. All lanes of
// regOffset therefore receive lane 7 (the last lane) of regOut.
106  regOffset = _mm512_permutex2var_pd(regOut, _mm512_set1_epi64(63), regOut);
107  }
// Scalar part: remaining elements that do not fill a whole register.
108  for (; i < nbElts; ++i) {
109 
// Running sum: previous output plus current input.
// NOTE(review): if nbElts is smaller than one register's element count,
// nbAlignedElts is 0 and this loop starts at i == 0, so out[i - 1] would
// index before the buffer — confirm against the complete header.
110  // running sum (the original comment said "update of mean")
111  out[i] = out[i - 1] + in[i];
112  }
113  }
114 };
117 
118 } // end of namespace detail
119 } // end of namespace simd
120 } // end of namespace ipsdk
121 
122 #endif // __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_AVX512_SCANPACK_H__
Defines the IPSDK_FORCEINLINE.
Main namespace for IPSDK library.
Definition: AlgorithmFunctionEfficiency.h:22
Template structure specialized to implement the cumulative sum on 2 registers, depending on the instruction set in use and on the element types of the buffers loaded into the registers.
Definition: ScanReg.h:36
uint64_t ipUInt64
Base types definition.
Definition: BaseTypes.h:55
double ipReal64
Base types definition.
Definition: BaseTypes.h:57
(including foundation and byte and word instructions)
Definition: InstructionSetTypes.h:51
Definition: NbEltsPerReg.h:28
eInstructionSet
Enumerate for processor instruction set description.
Definition: InstructionSetTypes.h:31
Definition: ScanPack.h:35
Definition of import/export macro for library.
structure containing intrinsic registers used to store vectorized data
Definition: BaseReg.h:29
Definition: UnloadReg.h:30
Definition: LoadRegDecl.h:30
Definition: ScanPack.h:30
float ipReal32
Base types definition.
Definition: BaseTypes.h:56
structure containing intrinsic registers used to store vectorized data
Definition: BasePackDecl.h:29