IPSDK  4_1_0_2
IPSDK : Image Processing Software Development Kit
ScanReg.h
Go to the documentation of this file.
1 // ScanReg.h:
3 // -------------------
4 //
14 
15 #ifndef __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_AVX512_SCANREG_H__
16 #define __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_AVX512_SCANREG_H__
17 
20 #include <IPSDKUtil/InstructionSet/Arithmetic/detail/ScanReg.h>
22 
23 namespace ipsdk {
24 namespace simd {
25 namespace detail {
26 
27 template <>
// Explicit specialization providing the AVX-512 cumulative sum over the
// 32-bit float lanes of one register.
// NOTE(review): the embedded listing numbers jump (27 -> 29, 30 -> 32,
// 33 -> 36, 41 -> 43), so the specialization header, the return type of the
// first act(), its body and the second parameter of the two-argument act()
// are not visible in this listing -- confirm against the real header.
29 {
 // Overload returning the result by value. The lines that declare and fill
 // `out` are not visible here; presumably they forward to the two-argument
 // overload below -- TODO confirm.
30  static IPSDK_FORCEINLINE
32  act(const Avx512Type<ipReal32>::Type& in)
33  {
36  return out;
37  }
38 
 // Writes into `out` the running sum of the 16 lanes of `in`:
 // out[i] = in[0] + in[1] + ... + in[i].
 // Built from four "shifted copy + add" steps with shift distances of
 // 1, 2, 4 and 8 lanes. In each step the mask of the maskz permute clears
 // the low lanes that have no source lane, so they receive +0.
 // (assumes `out` is a writable Avx512Type<ipReal32>::Type compatible with
 // __m512; its declaration is not visible in this listing)
39  static IPSDK_FORCEINLINE
40  void
41  act(const Avx512Type<ipReal32>::Type& in,
43  {
44  __m512 t0;
45  // step 1: t0[i] = in[i-1] for i >= 1, t0[0] = 0 (mask 0xfffe): shift by 1 lane (32 bits) toward higher lanes, then add
46  t0 = _mm512_maskz_permutex2var_ps(0xfffe, in, _mm512_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14), in);
47  out = _mm512_add_ps(in, t0);
48 
49  // step 2: t0[i] = out[i-2] for i >= 2, lanes 0..1 zeroed (mask 0xfffc): shift by 2 lanes (64 bits), then add
50  t0 = _mm512_maskz_permutex2var_ps(0xfffc, out, _mm512_setr_epi32(0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13), out);
51  out = _mm512_add_ps(out, t0);
52 
53  // step 3: t0[i] = out[i-4] for i >= 4, lanes 0..3 zeroed (mask 0xfff0): shift by 4 lanes (128 bits), then add
54  t0 = _mm512_maskz_permutex2var_ps(0xfff0, out, _mm512_setr_epi32(0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), out);
55  out = _mm512_add_ps(out, t0);
56 
57  // step 4: t0[i] = out[i-8] for i >= 8, lanes 0..7 zeroed (mask 0xff00): shift by 8 lanes (256 bits), then add
58  t0 = _mm512_maskz_permutex2var_ps(0xff00, out, _mm512_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7), out);
59  out = _mm512_add_ps(out, t0);
60  }
61 };
62 
63 template <>
// Explicit specialization providing the AVX-512 cumulative sum over the
// 64-bit float lanes of one register.
// NOTE(review): the embedded listing numbers jump (63 -> 65, 66 -> 68,
// 69 -> 72, 77 -> 79), so the specialization header, the return type of the
// first act(), its body and the second parameter of the two-argument act()
// are not visible in this listing -- confirm against the real header.
65 {
 // Overload returning the result by value. The lines that declare and fill
 // `out` are not visible here; presumably they forward to the two-argument
 // overload below -- TODO confirm.
66  static IPSDK_FORCEINLINE
68  act(const Avx512Type<ipReal64>::Type& in)
69  {
72  return out;
73  }
74 
 // Writes into `out` the running sum of the 8 lanes of `in`:
 // out[i] = in[0] + in[1] + ... + in[i].
 // Built from three "shifted copy + add" steps with shift distances of
 // 1, 2 and 4 lanes. In each step the mask of the maskz permute clears the
 // low lanes that have no source lane, so they receive +0.
 // (assumes `out` is a writable Avx512Type<ipReal64>::Type compatible with
 // __m512d; its declaration is not visible in this listing)
75  static IPSDK_FORCEINLINE
76  void
77  act(const Avx512Type<ipReal64>::Type& in,
79  {
80  __m512d t0;
81  // step 1: t0[i] = in[i-1] for i >= 1, t0[0] = 0 (mask 0xfe): shift by 1 lane (64 bits) toward higher lanes, then add
82  t0 = _mm512_maskz_permutex2var_pd(0xfe, in, _mm512_setr_epi64(0, 0, 1, 2, 3, 4, 5, 6), in);
83  out = _mm512_add_pd(in, t0);
84 
85  // step 2: t0[i] = out[i-2] for i >= 2, lanes 0..1 zeroed (mask 0xfc): shift by 2 lanes (128 bits), then add
86  t0 = _mm512_maskz_permutex2var_pd(0xfc, out, _mm512_setr_epi64(0, 0, 0, 1, 2, 3, 4, 5), out);
87  out = _mm512_add_pd(out, t0);
88 
89  // step 3: t0[i] = out[i-4] for i >= 4, lanes 0..3 zeroed (mask 0xf0): shift by 4 lanes (256 bits), then add
90  t0 = _mm512_maskz_permutex2var_pd(0xf0, out, _mm512_setr_epi64(0, 0, 0, 0, 0, 1, 2, 3), out);
91  out = _mm512_add_pd(out, t0);
92  }
93 };
94 
97 
98 } // end of namespace detail
99 } // end of namespace simd
100 } // end of namespace ipsdk
101 
102 #endif // __IPSDKUTIL_INSTRUCTIONSET_ARITHMETIC_DETAIL_AVX512_SCANREG_H__
Defines the IPSDK_FORCEINLINE.
Main namespace for IPSDK library.
Definition: AlgorithmFunctionEfficiency.h:22
Template structure specialized to implement the cumulative sum on 2 registers, depending on the instruction set in use and on the types of the buffers loaded in the registers.
Definition: ScanReg.h:36
double ipReal64
Base types definition.
Definition: BaseTypes.h:57
(including foundation and byte and word instructions)
Definition: InstructionSetTypes.h:51
eInstructionSet
Enumerate for processor instruction set description.
Definition: InstructionSetTypes.h:31
Predefined types for Avx512 instruction set management.
Definition of import/export macro for library.
float ipReal32
Base types definition.
Definition: BaseTypes.h:56
structure used to retrieve AVX512 type associated to a base type
Definition: Avx512Types.h:36