IPSDK  4_1_0_2
IPSDK : Image Processing Software Development Kit
GatherRegImpl.h
1 // GatherRegImpl.h:
3 // ------------
4 //
14 
15 #ifndef __IPSDKUTIL_INSTRUCTIONSET_DETAIL_AVX2_GATHERREGIMPL_H__
16 #define __IPSDKUTIL_INSTRUCTIONSET_DETAIL_AVX2_GATHERREGIMPL_H__
17 
23 
24 #include <boost/mpl/and.hpp>
25 #include <boost/type_traits/is_same.hpp>
26 #include <boost/type_traits/is_signed.hpp>
27 
28 namespace ipsdk {
29 namespace simd {
30 namespace detail {
31 
34 
35 template <typename T>
36 IPSDK_FORCEINLINE
37 void
38 GatherReg<eInstructionSet::eIS_Avx2, T,
39  typename boost::enable_if<
40  typename boost::mpl::and_<typename boost::is_integral<T>::type,
41  typename boost::mpl::equal_to<boost::mpl::int_<sizeof(T)>, boost::mpl::int_<1> >::type
42  >::type
43  >::type
44 >
45 ::act(const T* baseAddress,
46  const ipUInt32* indexes,
47  typename AvxType<T>::Type& out)
48 {
49  T contiguous[32];
50  for(ipUInt32 i=0; i<32; ++i) {
51  contiguous[i] = *(baseAddress + indexes[i]);
52  }
53  LoadReg<eInstructionSet::eIS_Avx2, T>::act(out, contiguous);
54 }
55 
56 template <typename T>
57 IPSDK_FORCEINLINE
58 void
59 GatherReg<eInstructionSet::eIS_Avx2, T,
60  typename boost::enable_if<
61  typename boost::mpl::and_<typename boost::is_integral<T>::type,
62  typename boost::mpl::equal_to<boost::mpl::int_<sizeof(T)>, boost::mpl::int_<1> >::type
63  >::type
64  >::type
65 >
66 ::act(const T* baseAddress,
67  const AvxType<ipUInt32>::Type& regIdx1,
68  const AvxType<ipUInt32>::Type& regIdx2,
69  const AvxType<ipUInt32>::Type& regIdx3,
70  const AvxType<ipUInt32>::Type& regIdx4,
71  typename AvxType<T>::Type& out)
72 {
73  ipUInt32 indexes[32];
74  UnloadReg<eInstructionSet::eIS_Avx2, ipUInt32>::act(regIdx1, indexes);
75  UnloadReg<eInstructionSet::eIS_Avx2, ipUInt32>::act(regIdx2, indexes+8);
76  UnloadReg<eInstructionSet::eIS_Avx2, ipUInt32>::act(regIdx3, indexes+16);
77  UnloadReg<eInstructionSet::eIS_Avx2, ipUInt32>::act(regIdx4, indexes+24);
78 
79  T contiguous[32];
80  for(ipUInt32 i=0; i<32; ++i) {
81  contiguous[i] = *(baseAddress + indexes[i]);
82  }
83  LoadReg<eInstructionSet::eIS_Avx2, T>::act(out, contiguous);
84 }
85 
86 template <typename T>
87 IPSDK_FORCEINLINE
88 void
89 GatherReg<eInstructionSet::eIS_Avx2, T,
90  typename boost::enable_if<
91  typename boost::mpl::and_<typename boost::is_integral<T>::type,
92  typename boost::mpl::equal_to<boost::mpl::int_<sizeof(T)>, boost::mpl::int_<2> >::type
93  >::type
94  >::type
95 >
96 ::act(const T* baseAddress,
97  const ipUInt32* indexes,
98  typename AvxType<T>::Type& out)
99 {
100  T contiguous[16];
101  for(ipUInt32 i=0; i<16; ++i) {
102  contiguous[i] = *(baseAddress + indexes[i]);
103  }
104  LoadReg<eInstructionSet::eIS_Avx2, T>::act(out, contiguous);
105 }
106 
107 template <typename T>
108 IPSDK_FORCEINLINE
109 void
110 GatherReg<eInstructionSet::eIS_Avx2, T,
111  typename boost::enable_if<
112  typename boost::mpl::and_<typename boost::is_integral<T>::type,
113  typename boost::mpl::equal_to<boost::mpl::int_<sizeof(T)>, boost::mpl::int_<2> >::type
114  >::type
115  >::type
116 >
117 ::act(const T* baseAddress,
118  const AvxType<ipUInt32>::Type& regIdx1,
119  const AvxType<ipUInt32>::Type& regIdx2,
120  typename AvxType<T>::Type& out)
121 {
122  ipUInt32 indexes[16];
123  UnloadReg<eInstructionSet::eIS_Avx2, ipUInt32>::act(regIdx1, indexes);
124  UnloadReg<eInstructionSet::eIS_Avx2, ipUInt32>::act(regIdx2, indexes+8);
125 
126  T contiguous[16];
127  for(ipUInt32 i=0; i<16; ++i) {
128  contiguous[i] = *(baseAddress + indexes[i]);
129  }
130  LoadReg<eInstructionSet::eIS_Avx2, T>::act(out, contiguous);
131 }
132 
133 template <typename T>
134 IPSDK_FORCEINLINE
135 void
136 GatherReg<eInstructionSet::eIS_Avx2, T,
137  typename boost::enable_if<
138  typename boost::mpl::and_<typename boost::is_integral<T>::type,
139  typename boost::mpl::equal_to<boost::mpl::int_<sizeof(T)>, boost::mpl::int_<4> >::type
140  >::type
141  >::type
142 >
143 ::act(const T* baseAddress, const ipUInt32* indexes,
144  typename AvxType<T>::Type& out)
145 {
146  out = _mm256_set_epi32(
147  baseAddress[indexes[7]],
148  baseAddress[indexes[6]],
149  baseAddress[indexes[5]],
150  baseAddress[indexes[4]],
151  baseAddress[indexes[3]],
152  baseAddress[indexes[2]],
153  baseAddress[indexes[1]],
154  baseAddress[indexes[0]]);
155 }
156 
157 template <typename T>
158 IPSDK_FORCEINLINE
159 void
160 GatherReg<eInstructionSet::eIS_Avx2, T,
161  typename boost::enable_if<
162  typename boost::mpl::and_<typename boost::is_integral<T>::type,
163  typename boost::mpl::equal_to<boost::mpl::int_<sizeof(T)>, boost::mpl::int_<4> >::type
164  >::type
165  >::type
166 >
167 ::act(const T* baseAddress, const AvxType<ipUInt32>::Type& regIdx,
168  typename AvxType<T>::Type& out)
169 {
170  const int* baseAddrCvt = reinterpret_cast<const int*>(baseAddress);
171  const AvxType<ipUInt32>::Type regIdxLocal = regIdx;
172  out = _mm256_i32gather_epi32(baseAddrCvt, regIdxLocal, 4);
173 }
174 
175 IPSDK_FORCEINLINE
176 void
177 GatherReg<eInstructionSet::eIS_Avx2, ipReal32>::act(
178  const ipReal32* baseAddress, const ipUInt32* indexes,
179  AvxType<ipReal32>::Type& out)
180 {
181  out = _mm256_set_ps(
182  baseAddress[indexes[7]],
183  baseAddress[indexes[6]],
184  baseAddress[indexes[5]],
185  baseAddress[indexes[4]],
186  baseAddress[indexes[3]],
187  baseAddress[indexes[2]],
188  baseAddress[indexes[1]],
189  baseAddress[indexes[0]]);
190 }
191 
192 // gather implementation for Avx2 for real32 type
193 IPSDK_FORCEINLINE
194 void
195 GatherReg<eInstructionSet::eIS_Avx2, ipReal32>::act(
196  const ipReal32* baseAddress, const AvxType<ipUInt32>::Type& regIdx,
197  AvxType<ipReal32>::Type& out)
198 {
199  const AvxType<ipUInt32>::Type regIdxLocal = regIdx;
200  out = _mm256_i32gather_ps(baseAddress, regIdxLocal, 4);
201 }
202 
203 IPSDK_FORCEINLINE
204 void
205 GatherReg<eInstructionSet::eIS_Avx2, ipReal64>::act(
206  const ipReal64* baseAddress, const ipUInt32* indexes,
207  AvxType<ipReal64>::Type& out)
208 {
209  out = _mm256_set_pd(
210  baseAddress[indexes[3]],
211  baseAddress[indexes[2]],
212  baseAddress[indexes[1]],
213  baseAddress[indexes[0]]);
214 }
215 
216 // gather implementation for Avx2 for real32 type
217 IPSDK_FORCEINLINE
218 void
219 GatherReg<eInstructionSet::eIS_Avx2, ipReal64>::act(
220  const ipReal64* baseAddress, const __m128i& regIdx,
221  AvxType<ipReal64>::Type& out)
222 {
223  const __m128i regIdxLocal = regIdx;
224  out = _mm256_i32gather_pd(baseAddress, regIdxLocal, 8);
225 }
226 
229 
230 } // end of namespace detail
231 } // end of namespace simd
232 } // end of namespace ipsdk
233 
234 #endif // __IPSDKUTIL_INSTRUCTIONSET_DETAIL_AVX2_GATHERREGIMPL_H__
Defines the IPSDK_FORCEINLINE.
Main namespace for IPSDK library.
Definition: AlgorithmFunctionEfficiency.h:22
double ipReal64
Base types definition.
Definition: BaseTypes.h:57
Advanced Vector Extensions 2.
Definition: InstructionSetTypes.h:48
Definition of import/export macro for library.
unload function; unloads a pack into a memory buffer
float ipReal32
Base types definition.
Definition: BaseTypes.h:56
uint32_t ipUInt32
Base types definition.
Definition: BaseTypes.h:53