IPSDK  4_1_0_2
IPSDK : Image Processing Software Development Kit
GatherRegImpl.h
1 // GatherRegImpl.h:
3 // ------------
4 //
14 
15 #ifndef __IPSDKUTIL_INSTRUCTIONSET_DETAIL_AVX512_GATHERREGIMPL_H__
16 #define __IPSDKUTIL_INSTRUCTIONSET_DETAIL_AVX512_GATHERREGIMPL_H__
17 
23 
24 #include <boost/mpl/and.hpp>
25 #include <boost/type_traits/is_same.hpp>
26 #include <boost/type_traits/is_signed.hpp>
27 
28 namespace ipsdk {
29 namespace simd {
30 namespace detail {
31 
34 
35 template <typename T>
36 IPSDK_FORCEINLINE
37 void
38 GatherReg<eInstructionSet::eIS_Avx512, T,
39  typename boost::enable_if<
40  typename boost::mpl::and_<typename boost::is_integral<T>::type,
41  typename boost::mpl::equal_to<boost::mpl::int_<sizeof(T)>, boost::mpl::int_<1> >::type
42  >::type
43  >::type
44 >
45 ::act(const T* baseAddress,
46  const ipUInt32* indexes,
47  typename Avx512Type<T>::Type& out)
48 {
49  T contiguous[64];
50  for(ipUInt32 i=0; i<64; ++i) {
51  contiguous[i] = *(baseAddress + indexes[i]);
52  }
53  LoadReg<eInstructionSet::eIS_Avx512, T>::act(out, contiguous);
54 }
55 
56 template <typename T>
57 IPSDK_FORCEINLINE
58 void
59 GatherReg<eInstructionSet::eIS_Avx512, T,
60  typename boost::enable_if<
61  typename boost::mpl::and_<typename boost::is_integral<T>::type,
62  typename boost::mpl::equal_to<boost::mpl::int_<sizeof(T)>, boost::mpl::int_<1> >::type
63  >::type
64  >::type
65 >
66 ::act(const T* baseAddress,
67  const Avx512Type<ipUInt32>::Type& regIdx1,
68  const Avx512Type<ipUInt32>::Type& regIdx2,
69  const Avx512Type<ipUInt32>::Type& regIdx3,
70  const Avx512Type<ipUInt32>::Type& regIdx4,
71  typename Avx512Type<T>::Type& out)
72 {
73  ipUInt32 indexes[64];
74  UnloadReg<eInstructionSet::eIS_Avx512, ipUInt32>::act(regIdx1, indexes);
75  UnloadReg<eInstructionSet::eIS_Avx512, ipUInt32>::act(regIdx2, indexes+16);
76  UnloadReg<eInstructionSet::eIS_Avx512, ipUInt32>::act(regIdx3, indexes+32);
77  UnloadReg<eInstructionSet::eIS_Avx512, ipUInt32>::act(regIdx4, indexes+48);
78 
79  T contiguous[64];
80  for(ipUInt32 i=0; i<64; ++i) {
81  contiguous[i] = *(baseAddress + indexes[i]);
82  }
83  LoadReg<eInstructionSet::eIS_Avx512, T>::act(out, contiguous);
84 }
85 
86 template <typename T>
87 IPSDK_FORCEINLINE
88 void
89 GatherReg<eInstructionSet::eIS_Avx512, T,
90  typename boost::enable_if<
91  typename boost::mpl::and_<typename boost::is_integral<T>::type,
92  typename boost::mpl::equal_to<boost::mpl::int_<sizeof(T)>, boost::mpl::int_<2> >::type
93  >::type
94  >::type
95 >
96 ::act(const T* baseAddress,
97  const ipUInt32* indexes,
98  typename Avx512Type<T>::Type& out)
99 {
100  T contiguous[32];
101  for(ipUInt32 i=0; i<32; ++i) {
102  contiguous[i] = *(baseAddress + indexes[i]);
103  }
104  LoadReg<eInstructionSet::eIS_Avx512, T>::act(out, contiguous);
105 }
106 
107 template <typename T>
108 IPSDK_FORCEINLINE
109 void
110 GatherReg<eInstructionSet::eIS_Avx512, T,
111  typename boost::enable_if<
112  typename boost::mpl::and_<typename boost::is_integral<T>::type,
113  typename boost::mpl::equal_to<boost::mpl::int_<sizeof(T)>, boost::mpl::int_<2> >::type
114  >::type
115  >::type
116 >
117 ::act(const T* baseAddress,
118  const Avx512Type<ipUInt32>::Type& regIdx1,
119  const Avx512Type<ipUInt32>::Type& regIdx2,
120  typename Avx512Type<T>::Type& out)
121 {
122  ipUInt32 indexes[32];
123  UnloadReg<eInstructionSet::eIS_Avx512, ipUInt32>::act(regIdx1, indexes);
124  UnloadReg<eInstructionSet::eIS_Avx512, ipUInt32>::act(regIdx2, indexes+16);
125 
126  T contiguous[32];
127  for(ipUInt32 i=0; i<32; ++i) {
128  contiguous[i] = *(baseAddress + indexes[i]);
129  }
130  LoadReg<eInstructionSet::eIS_Avx512, T>::act(out, contiguous);
131 }
132 
133 template <typename T>
134 IPSDK_FORCEINLINE
135 void
136 GatherReg<eInstructionSet::eIS_Avx512, T,
137  typename boost::enable_if<
138  typename boost::mpl::and_<typename boost::is_integral<T>::type,
139  typename boost::mpl::equal_to<boost::mpl::int_<sizeof(T)>, boost::mpl::int_<4> >::type
140  >::type
141  >::type
142 >
143 ::act(const T* baseAddress, const ipUInt32* indexes,
144  typename Avx512Type<T>::Type& out)
145 {
146  out = _mm512_set_epi32(
147  baseAddress[indexes[15]],
148  baseAddress[indexes[14]],
149  baseAddress[indexes[13]],
150  baseAddress[indexes[12]],
151  baseAddress[indexes[11]],
152  baseAddress[indexes[10]],
153  baseAddress[indexes[9]],
154  baseAddress[indexes[8]],
155  baseAddress[indexes[7]],
156  baseAddress[indexes[6]],
157  baseAddress[indexes[5]],
158  baseAddress[indexes[4]],
159  baseAddress[indexes[3]],
160  baseAddress[indexes[2]],
161  baseAddress[indexes[1]],
162  baseAddress[indexes[0]]);
163 }
164 
165 template <typename T>
166 IPSDK_FORCEINLINE
167 void
168 GatherReg<eInstructionSet::eIS_Avx512, T,
169  typename boost::enable_if<
170  typename boost::mpl::and_<typename boost::is_integral<T>::type,
171  typename boost::mpl::equal_to<boost::mpl::int_<sizeof(T)>, boost::mpl::int_<4> >::type
172  >::type
173  >::type
174 >
175 ::act(const T* baseAddress, const Avx512Type<ipUInt32>::Type& regIdx,
176  typename Avx512Type<T>::Type& out)
177 {
178  const int* baseAddrCvt = reinterpret_cast<const int*>(baseAddress);
179  const Avx512Type<ipUInt32>::Type regIdxLocal = regIdx;
180  out = _mm512_i32gather_epi32(regIdxLocal, baseAddrCvt, 4);
181 }
182 
183 IPSDK_FORCEINLINE
184 void
185 GatherReg<eInstructionSet::eIS_Avx512, ipReal32>::act(
186  const ipReal32* baseAddress, const ipUInt32* indexes,
187  Avx512Type<ipReal32>::Type& out)
188 {
189  out = _mm512_set_ps(
190  baseAddress[indexes[15]],
191  baseAddress[indexes[14]],
192  baseAddress[indexes[13]],
193  baseAddress[indexes[12]],
194  baseAddress[indexes[11]],
195  baseAddress[indexes[10]],
196  baseAddress[indexes[9]],
197  baseAddress[indexes[8]],
198  baseAddress[indexes[7]],
199  baseAddress[indexes[6]],
200  baseAddress[indexes[5]],
201  baseAddress[indexes[4]],
202  baseAddress[indexes[3]],
203  baseAddress[indexes[2]],
204  baseAddress[indexes[1]],
205  baseAddress[indexes[0]]);
206 }
207 
208 // gather implementation for Avx2 for real32 type
209 IPSDK_FORCEINLINE
210 void
211 GatherReg<eInstructionSet::eIS_Avx512, ipReal32>::act(
212  const ipReal32* baseAddress, const Avx512Type<ipUInt32>::Type& regIdx,
213  Avx512Type<ipReal32>::Type& out)
214 {
215  const Avx512Type<ipUInt32>::Type regIdxLocal = regIdx;
216  out = _mm512_i32gather_ps(regIdxLocal, baseAddress, 4);
217 }
218 
219 IPSDK_FORCEINLINE
220 void
221 GatherReg<eInstructionSet::eIS_Avx512, ipReal64>::act(
222  const ipReal64* baseAddress, const ipUInt32* indexes,
223  Avx512Type<ipReal64>::Type& out)
224 {
225  out = _mm512_set_pd(
226  baseAddress[indexes[7]],
227  baseAddress[indexes[6]],
228  baseAddress[indexes[5]],
229  baseAddress[indexes[4]],
230  baseAddress[indexes[3]],
231  baseAddress[indexes[2]],
232  baseAddress[indexes[1]],
233  baseAddress[indexes[0]]);
234 }
235 
236 // gather implementation for Avx2 for real32 type
237 IPSDK_FORCEINLINE
238 void
239 GatherReg<eInstructionSet::eIS_Avx512, ipReal64>::act(
240  const ipReal64* baseAddress, const __m256i& regIdx,
241  Avx512Type<ipReal64>::Type& out)
242 {
243  const __m256i regIdxLocal = regIdx;
244  out = _mm512_i32gather_pd(regIdxLocal, baseAddress, 8);
245 }
246 
249 
250 } // end of namespace detail
251 } // end of namespace simd
252 } // end of namespace ipsdk
253 
254 #endif // __IPSDKUTIL_INSTRUCTIONSET_DETAIL_AVX512_GATHERREGIMPL_H__
Defines the IPSDK_FORCEINLINE.
Main namespace for IPSDK library.
Definition: AlgorithmFunctionEfficiency.h:22
double ipReal64
Base types definition.
Definition: BaseTypes.h:57
(including foundation and byte and word instructions)
Definition: InstructionSetTypes.h:51
Definition of import/export macro for library.
unload function; unloads a pack into a memory buffer
float ipReal32
Base types definition.
Definition: BaseTypes.h:56
uint32_t ipUInt32
Base types definition.
Definition: BaseTypes.h:53