IPSDK  4_1_0_2
IPSDK : Image Processing Software Development Kit
CudaSrcMacros.h
Go to the documentation of this file.
1 // CudaSrcMacros.h:
3 // ------------------------------------
4 //
14 
15 #ifndef __IPSDKIMAGEPROCESSING_CUDASRCMACROS_H__
16 #define __IPSDKIMAGEPROCESSING_CUDASRCMACROS_H__
17 
// Trick to explicitly ignore a parameter (silences "unused variable" warnings).
// The cast is wrapped in outer parentheses so the macro expands to a single
// parenthesized expression and composes safely inside larger expressions
// (e.g. comma expressions) instead of only as a standalone statement.
#define IPSDKCUDA_UNUSED_VAR(x) ((void)(x))
20 
21 // Split the data from the ImageSizeInfo structure to 5 unsigned long scalars
22 // The macro input argument is an instance (or reference) of ImageSizeInfo
// NOTE(review): the fields are accessed with '.', so the argument cannot be a
// pointer (the original comment said "pointer"). The macro declares the const
// locals sizeX, sizeY, sizeZ, sizeC and sizeT in the enclosing scope.
23 #define IPSDKCUDA_SPLIT_SIZES_FROM_STRUCT(imgSizeInfo) \
24  const unsigned long sizeX = imgSizeInfo._sizeX; \
25  const unsigned long sizeY = imgSizeInfo._sizeY; \
26  const unsigned long sizeZ = imgSizeInfo._sizeZ; \
27  const unsigned long sizeC = imgSizeInfo._sizeC; \
28  const unsigned long sizeT = imgSizeInfo._sizeT;
29 
30 // Define the number of threads per block and the number of blocks in the grid according to nbThreadsPerBlockDim
// Expects pLauncherInfo in scope (provides _blockSize and _imgSizeInfo).
// Declares the locals sizeX..sizeT, nbPlans, gridSize and blockSize used to
// launch a kernel over the whole image; the grid Z axis spans all Z*C*T plans.
// NOTE(review): nbBlocksZ divides nbPlans by nbThreadsPerBlockDim, but
// blockSize.z is clamped to sizeZ; when sizeZ < nbThreadsPerBlockDim the
// product gridSize.z * blockSize.z can be smaller than nbPlans — confirm the
// kernels launched with this grid iterate accordingly.
31 #define IPSDKCUDA_SET_GRID() \
32  const unsigned short nbThreadsPerBlockDim = pLauncherInfo->_blockSize; \
33  const unsigned long sizeX = pLauncherInfo->_imgSizeInfo._sizeX; \
34  const unsigned long sizeY = pLauncherInfo->_imgSizeInfo._sizeY; \
35  const unsigned long sizeZ = pLauncherInfo->_imgSizeInfo._sizeZ; \
36  const unsigned long sizeC = pLauncherInfo->_imgSizeInfo._sizeC; \
37  const unsigned long sizeT = pLauncherInfo->_imgSizeInfo._sizeT; \
38  const unsigned long nbPlans = sizeZ * sizeC * sizeT; \
39  const unsigned long nbBlocksX = sizeX / nbThreadsPerBlockDim + (sizeX%nbThreadsPerBlockDim == 0 ? 0 : 1); \
40  const unsigned long nbBlocksY = sizeY / nbThreadsPerBlockDim + (sizeY%nbThreadsPerBlockDim == 0 ? 0 : 1); \
41  const unsigned long nbBlocksZ = nbPlans / nbThreadsPerBlockDim + (nbPlans%nbThreadsPerBlockDim == 0 ? 0 : 1); \
42  dim3 gridSize(nbBlocksX, nbBlocksY, nbBlocksZ); \
43  const unsigned long blockSizeX = nbThreadsPerBlockDim; \
44  const unsigned long blockSizeY = nbThreadsPerBlockDim; \
45  const unsigned long blockSizeZ = (sizeZ < nbThreadsPerBlockDim ? sizeZ : nbThreadsPerBlockDim); \
46  dim3 blockSize(blockSizeX, blockSizeY, blockSizeZ);
47 
48 // Define the number of threads per block and the number of blocks in the grid according to nbThreadsPerBlockDim
49 // for 2D processes. The difference with the generic macro is that nbBlocksZ and gridSizeZ are switched.
50 // It allows to apply a 2D process to several plan with only a 2D shared buffer
// Expects pLauncherInfo in scope. Each Z slot of the grid handles exactly one
// plan (blockSize.z == 1, gridSize.z == nbPlans = sizeZ * sizeC * sizeT).
51 #define IPSDKCUDA_SET_GRID2D() \
52  const unsigned short nbThreadsPerBlockDim = pLauncherInfo->_blockSize; \
53  const unsigned long sizeX = pLauncherInfo->_imgSizeInfo._sizeX; \
54  const unsigned long sizeY = pLauncherInfo->_imgSizeInfo._sizeY; \
55  const unsigned long sizeZ = pLauncherInfo->_imgSizeInfo._sizeZ; \
56  const unsigned long sizeC = pLauncherInfo->_imgSizeInfo._sizeC; \
57  const unsigned long sizeT = pLauncherInfo->_imgSizeInfo._sizeT; \
58  const unsigned long nbPlans = sizeZ * sizeC * sizeT; \
59  const unsigned long nbBlocksX = sizeX / nbThreadsPerBlockDim + (sizeX % nbThreadsPerBlockDim == 0 ? 0 : 1); \
60  const unsigned long nbBlocksY = sizeY / nbThreadsPerBlockDim + (sizeY % nbThreadsPerBlockDim == 0 ? 0 : 1); \
61  const unsigned long nbBlocksZ = nbPlans; \
62  dim3 gridSize(nbBlocksX, nbBlocksY, nbBlocksZ); \
63  const unsigned long blockSizeX = nbThreadsPerBlockDim; \
64  const unsigned long blockSizeY = nbThreadsPerBlockDim; \
65  const unsigned long blockSizeZ = 1; \
66  dim3 blockSize(blockSizeX, blockSizeY, blockSizeZ);
67 
68 // Define the number of threads per block and the number of blocks in the grid according to nbThreadsPerBlockDim
69 // for 3D processes. It allows to apply a 3D process to several plan with only a 3D shared buffer
// Expects pLauncherInfo in scope. The grid covers a single Z/Y/X volume;
// nbVolumes (sizeC * sizeT) is declared for the caller to iterate over the
// remaining channel/time dimensions. blockSize.z is clamped to sizeZ.
70 #define IPSDKCUDA_SET_GRID3D() \
71  const unsigned short nbThreadsPerBlockDim = pLauncherInfo->_blockSize; \
72  const unsigned long sizeX = pLauncherInfo->_imgSizeInfo._sizeX; \
73  const unsigned long sizeY = pLauncherInfo->_imgSizeInfo._sizeY; \
74  const unsigned long sizeZ = pLauncherInfo->_imgSizeInfo._sizeZ; \
75  const unsigned long sizeC = pLauncherInfo->_imgSizeInfo._sizeC; \
76  const unsigned long sizeT = pLauncherInfo->_imgSizeInfo._sizeT; \
77  const unsigned long nbVolumes = sizeC * sizeT; \
78  const unsigned long blockSizeX = nbThreadsPerBlockDim; \
79  const unsigned long blockSizeY = nbThreadsPerBlockDim; \
80  const unsigned long blockSizeZ = (nbThreadsPerBlockDim > sizeZ ? sizeZ : nbThreadsPerBlockDim); \
81  dim3 blockSize(blockSizeX, blockSizeY, blockSizeZ); \
82  const unsigned long nbBlocksX = sizeX / blockSizeX + (sizeX % blockSizeX == 0 ? 0 : 1); \
83  const unsigned long nbBlocksY = sizeY / blockSizeY + (sizeY % blockSizeY == 0 ? 0 : 1); \
84  const unsigned long nbBlocksZ = (sizeZ / blockSizeZ + (sizeZ % blockSizeZ == 0 ? 0 : 1)); \
85  dim3 gridSize(nbBlocksX, nbBlocksY, nbBlocksZ);
86 
// Check the error code returned by a CUDA runtime call.
// This macro assumes that a variable res, of type ipsdk::gpu::CudaResult, has
// been declared in the enclosing scope; on failure res._bResult is set to
// false and res._msg receives the CUDA error string.
// The argument is captured in a local so it is evaluated exactly once: the
// original expansion evaluated 'err' twice (comparison + cudaGetErrorString),
// which re-issued the CUDA call on the error path when invoked as
// IPSDKCUDA_CHECK_ERROR(cudaMalloc(...)).
#define IPSDKCUDA_CHECK_ERROR(err) \
 { \
  const cudaError_t ipsdkCudaLocalErr = (err); \
  if (ipsdkCudaLocalErr != cudaSuccess) { \
   res._bResult = false; \
   res._msg = cudaGetErrorString(ipsdkCudaLocalErr); \
  } \
 }
94 
// Wait for the device to finish all preceding work and check for errors.
// Catches both the synchronization error (return value of
// cudaDeviceSynchronize, which the original discarded) and any sticky launch
// or asynchronous execution error reported by cudaGetLastError.
// If an error occurred, the CudaResult is modified to notify about the error.
// NOTE: IPSDKCUDA_CHECK_ERROR writes to a variable literally named res, so
// the argument passed here must be that variable.
#define IPSDKCUDA_CHECK_KERNEL_EXEC(res) \
 { \
  cudaError_t syncError = cudaDeviceSynchronize(); \
  IPSDKCUDA_CHECK_ERROR(syncError) \
  cudaError_t lastError = cudaGetLastError(); \
  IPSDKCUDA_CHECK_ERROR(lastError) \
 }
104 // Start instruction block dedicated to free dynamically allocated memory
105 // by using the CudaResult variable res
// The block body runs only when a previous check flagged an error
// (res._bResult == false). It must be closed with
// IPSDKCUDA_END_FREE_MEMORY_IF_ERROR, which returns res to the caller.
106 #define IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
107  if(!res._bResult) {
108 
109 // End instruction block dedicated to free dynamically allocated memory
// Returns res (the error status) from the enclosing function and closes the
// if-block opened by IPSDKCUDA_START_FREE_MEMORY_IF_ERROR; the enclosing
// function must therefore return a CudaResult.
110 #define IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
111  return res;\
112  }
113 
114 // Declares a templated shared buffer
// Declares pBuf as a T* view over the kernel's dynamically allocated shared
// memory (extern __shared__); the byte size must be supplied as the third
// kernel launch parameter. All expansions inside one kernel alias the same
// pSharedMem storage.
115 #define IPSDKCUDA_DECLARE_SHARED_BUFFER(T, pBuf) \
116  extern __shared__ char pSharedMem[]; \
117  T* pBuf = reinterpret_cast<T*>(pSharedMem);
118 
119 
// Declares knlInfo_d (ipsdk::gpu::FilteringKernelInfo<dataType>) and allocates
// its offset/coefficient arrays on the device, copying them from the host-side
// kernel information pKnlInfo_h.
// Assumes a CudaResult variable res is in scope; on any CUDA error every
// device buffer already allocated by this macro is released and res is
// returned from the enclosing function.
// Fixes vs original:
// - cudaFree is called on the device pointers themselves; the original passed
//   the ADDRESS of the host-side pointer members (cudaFree(&knlInfo_d._p...)),
//   which freed nothing and leaked the device memory,
// - the pointers are NULL-initialized so the cleanup can unconditionally free
//   all of them (cudaFree(NULL) is a no-op), closing the leaks where some
//   successfully allocated buffers were skipped by the partial free lists.
#define IPSDKCUDA_TRANSFERT_KERNEL_INFO_2D(pKnlInfo_h, knlInfo_d, dataType) \
 ipsdk::gpu::FilteringKernelInfo<dataType> knlInfo_d; \
 knlInfo_d._nbData = pKnlInfo_h->_nbData; \
 knlInfo_d._paddingX = pKnlInfo_h->_paddingX; \
 knlInfo_d._paddingY = pKnlInfo_h->_paddingY; \
 knlInfo_d._pOffsetsX = NULL; \
 knlInfo_d._pOffsetsY = NULL; \
 knlInfo_d._pCoefs = NULL; \
 IPSDKCUDA_CHECK_ERROR(cudaMalloc((void **)&knlInfo_d._pOffsetsX, pKnlInfo_h->_nbData * sizeof(int))); \
 IPSDKCUDA_CHECK_ERROR(cudaMalloc((void **)&knlInfo_d._pOffsetsY, pKnlInfo_h->_nbData * sizeof(int))); \
 IPSDKCUDA_CHECK_ERROR(cudaMalloc((void **)&knlInfo_d._pCoefs, pKnlInfo_h->_nbData * sizeof(dataType))); \
 IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
 IPSDKCUDA_FREE_KERNEL_INFO_2D(knlInfo_d) \
 IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
 IPSDKCUDA_CHECK_ERROR(cudaMemcpy(knlInfo_d._pOffsetsX, pKnlInfo_h->_pOffsetsX, pKnlInfo_h->_nbData * sizeof(int), cudaMemcpyHostToDevice)); \
 IPSDKCUDA_CHECK_ERROR(cudaMemcpy(knlInfo_d._pOffsetsY, pKnlInfo_h->_pOffsetsY, pKnlInfo_h->_nbData * sizeof(int), cudaMemcpyHostToDevice)); \
 IPSDKCUDA_CHECK_ERROR(cudaMemcpy(knlInfo_d._pCoefs, pKnlInfo_h->_pCoefs, pKnlInfo_h->_nbData * sizeof(dataType), cudaMemcpyHostToDevice)); \
 IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
 IPSDKCUDA_FREE_KERNEL_INFO_2D(knlInfo_d) \
 IPSDKCUDA_END_FREE_MEMORY_IF_ERROR()
154 
155 // Free the kernel information on device
// Releases the three device arrays (X/Y offsets and coefficients) allocated
// by IPSDKCUDA_TRANSFERT_KERNEL_INFO_2D. Return codes of cudaFree are ignored.
156 #define IPSDKCUDA_FREE_KERNEL_INFO_2D(knlInfo_d) \
157  cudaFree(knlInfo_d._pOffsetsX); \
158  cudaFree(knlInfo_d._pOffsetsY); \
159  cudaFree(knlInfo_d._pCoefs);
160 
// Declares knlInfo_d (ipsdk::gpu::FilteringKernelInfo<dataType>) and allocates
// its X/Y/Z offset and coefficient arrays on the device, copying them from the
// host-side kernel information pKnlInfo_h.
// Assumes a CudaResult variable res is in scope; on any CUDA error every
// device buffer already allocated by this macro is released and res is
// returned from the enclosing function.
// Fixes vs original:
// - cudaFree is called on the device pointers themselves, not on the address
//   of the host-side pointer members (cudaFree(&...) freed nothing),
// - the pointers are NULL-initialized so the cleanup can unconditionally free
//   all of them (cudaFree(NULL) is a no-op); the original free lists lagged
//   one allocation behind and leaked buffers on several error paths.
#define IPSDKCUDA_TRANSFERT_KERNEL_INFO_3D(pKnlInfo_h, knlInfo_d, dataType) \
 ipsdk::gpu::FilteringKernelInfo<dataType> knlInfo_d; \
 knlInfo_d._nbData = pKnlInfo_h->_nbData; \
 knlInfo_d._paddingX = pKnlInfo_h->_paddingX; \
 knlInfo_d._paddingY = pKnlInfo_h->_paddingY; \
 knlInfo_d._paddingZ = pKnlInfo_h->_paddingZ; \
 knlInfo_d._pOffsetsX = NULL; \
 knlInfo_d._pOffsetsY = NULL; \
 knlInfo_d._pOffsetsZ = NULL; \
 knlInfo_d._pCoefs = NULL; \
 IPSDKCUDA_CHECK_ERROR(cudaMalloc((void **)&knlInfo_d._pOffsetsX, pKnlInfo_h->_nbData * sizeof(int))); \
 IPSDKCUDA_CHECK_ERROR(cudaMalloc((void **)&knlInfo_d._pOffsetsY, pKnlInfo_h->_nbData * sizeof(int))); \
 IPSDKCUDA_CHECK_ERROR(cudaMalloc((void **)&knlInfo_d._pOffsetsZ, pKnlInfo_h->_nbData * sizeof(int))); \
 IPSDKCUDA_CHECK_ERROR(cudaMalloc((void **)&knlInfo_d._pCoefs, pKnlInfo_h->_nbData * sizeof(dataType))); \
 IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
 IPSDKCUDA_FREE_KERNEL_INFO_3D(knlInfo_d) \
 IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
 IPSDKCUDA_CHECK_ERROR(cudaMemcpy(knlInfo_d._pOffsetsX, pKnlInfo_h->_pOffsetsX, pKnlInfo_h->_nbData * sizeof(int), cudaMemcpyHostToDevice)); \
 IPSDKCUDA_CHECK_ERROR(cudaMemcpy(knlInfo_d._pOffsetsY, pKnlInfo_h->_pOffsetsY, pKnlInfo_h->_nbData * sizeof(int), cudaMemcpyHostToDevice)); \
 IPSDKCUDA_CHECK_ERROR(cudaMemcpy(knlInfo_d._pOffsetsZ, pKnlInfo_h->_pOffsetsZ, pKnlInfo_h->_nbData * sizeof(int), cudaMemcpyHostToDevice)); \
 IPSDKCUDA_CHECK_ERROR(cudaMemcpy(knlInfo_d._pCoefs, pKnlInfo_h->_pCoefs, pKnlInfo_h->_nbData * sizeof(dataType), cudaMemcpyHostToDevice)); \
 IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
 IPSDKCUDA_FREE_KERNEL_INFO_3D(knlInfo_d) \
 IPSDKCUDA_END_FREE_MEMORY_IF_ERROR()
212 
213 // Free the kernel information on device
// Releases the four device arrays (X/Y/Z offsets and coefficients) allocated
// by IPSDKCUDA_TRANSFERT_KERNEL_INFO_3D. Return codes of cudaFree are ignored.
214 #define IPSDKCUDA_FREE_KERNEL_INFO_3D(knlInfo_d) \
215  cudaFree(knlInfo_d._pOffsetsX); \
216  cudaFree(knlInfo_d._pOffsetsY); \
217  cudaFree(knlInfo_d._pOffsetsZ); \
218  cudaFree(knlInfo_d._pCoefs);
219 
220 
// Declares seInfo_d (ipsdk::gpu::StructuringElementInfo) and allocates its X/Y
// offset arrays on the device, copying them from the host-side structuring
// element information pSEInfo_h.
// Assumes a CudaResult variable res is in scope; on any CUDA error every
// device buffer already allocated by this macro is released and res is
// returned from the enclosing function.
// Fixes vs original:
// - cudaFree is called on the device pointers themselves, not on the address
//   of the host-side pointer members (cudaFree(&...) freed nothing),
// - the pointers are NULL-initialized so the cleanup can unconditionally free
//   both of them (cudaFree(NULL) is a no-op).
#define IPSDKCUDA_TRANSFERT_SE_INFO_2D(pSEInfo_h, seInfo_d) \
 ipsdk::gpu::StructuringElementInfo seInfo_d; \
 seInfo_d._nbData = pSEInfo_h->_nbData; \
 seInfo_d._paddingX = pSEInfo_h->_paddingX; \
 seInfo_d._paddingY = pSEInfo_h->_paddingY; \
 seInfo_d._pOffsetsX = NULL; \
 seInfo_d._pOffsetsY = NULL; \
 IPSDKCUDA_CHECK_ERROR(cudaMalloc((void **)&seInfo_d._pOffsetsX, pSEInfo_h->_nbData * sizeof(int))); \
 IPSDKCUDA_CHECK_ERROR(cudaMalloc((void **)&seInfo_d._pOffsetsY, pSEInfo_h->_nbData * sizeof(int))); \
 IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
 IPSDKCUDA_FREE_SE_INFO_2D(seInfo_d) \
 IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
 IPSDKCUDA_CHECK_ERROR(cudaMemcpy(seInfo_d._pOffsetsX, pSEInfo_h->_pOffsetsX, pSEInfo_h->_nbData * sizeof(int), cudaMemcpyHostToDevice)); \
 IPSDKCUDA_CHECK_ERROR(cudaMemcpy(seInfo_d._pOffsetsY, pSEInfo_h->_pOffsetsY, pSEInfo_h->_nbData * sizeof(int), cudaMemcpyHostToDevice)); \
 IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
 IPSDKCUDA_FREE_SE_INFO_2D(seInfo_d) \
 IPSDKCUDA_END_FREE_MEMORY_IF_ERROR()
242 
243 // Free the structuring element information on device
// Releases the X/Y offset device arrays allocated by
// IPSDKCUDA_TRANSFERT_SE_INFO_2D. Return codes of cudaFree are ignored.
244 #define IPSDKCUDA_FREE_SE_INFO_2D(seInfo_d) \
245  cudaFree(seInfo_d._pOffsetsX); \
246  cudaFree(seInfo_d._pOffsetsY);
247 
// Declares seInfo_d (ipsdk::gpu::StructuringElementInfo) and allocates its
// X/Y/Z offset arrays on the device, copying them from the host-side
// structuring element information pSEInfo_h.
// Assumes a CudaResult variable res is in scope; on any CUDA error every
// device buffer already allocated by this macro is released and res is
// returned from the enclosing function.
// Fixes vs original:
// - cudaFree is called on the device pointers themselves, not on the address
//   of the host-side pointer members (cudaFree(&...) freed nothing),
// - the pointers are NULL-initialized so the cleanup can unconditionally free
//   all three (cudaFree(NULL) is a no-op); the original free lists lagged one
//   allocation behind and leaked buffers on several error paths.
#define IPSDKCUDA_TRANSFERT_SE_INFO_3D(pSEInfo_h, seInfo_d) \
 ipsdk::gpu::StructuringElementInfo seInfo_d; \
 seInfo_d._nbData = pSEInfo_h->_nbData; \
 seInfo_d._paddingX = pSEInfo_h->_paddingX; \
 seInfo_d._paddingY = pSEInfo_h->_paddingY; \
 seInfo_d._paddingZ = pSEInfo_h->_paddingZ; \
 seInfo_d._pOffsetsX = NULL; \
 seInfo_d._pOffsetsY = NULL; \
 seInfo_d._pOffsetsZ = NULL; \
 IPSDKCUDA_CHECK_ERROR(cudaMalloc((void **)&seInfo_d._pOffsetsX, pSEInfo_h->_nbData * sizeof(int))); \
 IPSDKCUDA_CHECK_ERROR(cudaMalloc((void **)&seInfo_d._pOffsetsY, pSEInfo_h->_nbData * sizeof(int))); \
 IPSDKCUDA_CHECK_ERROR(cudaMalloc((void **)&seInfo_d._pOffsetsZ, pSEInfo_h->_nbData * sizeof(int))); \
 IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
 IPSDKCUDA_FREE_SE_INFO_3D(seInfo_d) \
 IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
 IPSDKCUDA_CHECK_ERROR(cudaMemcpy(seInfo_d._pOffsetsX, pSEInfo_h->_pOffsetsX, pSEInfo_h->_nbData * sizeof(int), cudaMemcpyHostToDevice)); \
 IPSDKCUDA_CHECK_ERROR(cudaMemcpy(seInfo_d._pOffsetsY, pSEInfo_h->_pOffsetsY, pSEInfo_h->_nbData * sizeof(int), cudaMemcpyHostToDevice)); \
 IPSDKCUDA_CHECK_ERROR(cudaMemcpy(seInfo_d._pOffsetsZ, pSEInfo_h->_pOffsetsZ, pSEInfo_h->_nbData * sizeof(int), cudaMemcpyHostToDevice)); \
 IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
 IPSDKCUDA_FREE_SE_INFO_3D(seInfo_d) \
 IPSDKCUDA_END_FREE_MEMORY_IF_ERROR()
283 
284 // Free the structuring element information on device
// Releases the X/Y/Z offset device arrays allocated by
// IPSDKCUDA_TRANSFERT_SE_INFO_3D. Return codes of cudaFree are ignored.
285 #define IPSDKCUDA_FREE_SE_INFO_3D(seInfo_d) \
286  cudaFree(seInfo_d._pOffsetsX); \
287  cudaFree(seInfo_d._pOffsetsY); \
288  cudaFree(seInfo_d._pOffsetsZ);
289 
// Compares 2 buffers (each given as a pointer to the data pointer) and
// evaluates to true when both point to the same data, i.e. the operation is
// performed in situ.
// The whole comparison (and each macro argument) is parenthesized so the
// macro composes safely with surrounding operators: with the original,
// unparenthesized expansion, an expression like !IPSDKCUDA_CHECK_INSITU(a, b)
// applied '!' to the left operand only.
#define IPSDKCUDA_CHECK_INSITU(pBuf1, pBuf2) \
 (((const char*)(*(pBuf1))) == ((const char*)(*(pBuf2))))
293 
// Define the following local variables :
// - dim3 stripGridSize : device grid size used to set up kernels,
// - unsigned long stripSizeX, stripSizeY : strip size along X and Y axis,
// - unsigned long gridSizeY : grid size (number of lines) along Y axis,
// - unsigned long nbLineGridsInImg : number of line grids needed to cover Y,
// - T* pBuffer_d : device strip buffer holding input data plus the
//   neighbourhood padding required by the kernel,
// - T** pTmpBufData_d : device cell holding pBuffer_d,
// - const T** pTmpBuf_d = const_cast<const T**>(pTmpBufData_d).
// Requires sizeX/sizeY, nbBlocksX, blockSizeY, nbThreadsPerBlockDim (from an
// IPSDKCUDA_SET_GRID* macro), pLauncherInfo and a CudaResult res in scope.
// On a CUDA error, FREE_NEIGHBOURBUFFER_MACRO(neighbourInfo_d) releases the
// neighbourhood buffers before returning res.
// Fix vs original: the cleanup paths now invoke the FREE_NEIGHBOURBUFFER_MACRO
// parameter; the original ignored it and hard-coded IPSDKCUDA_FREE_SE_INFO_2D,
// so kernel-info callers freed the wrong structure on error.
#define IPSDKCUDA_COMPUTE_STRIP_SIZE_2D(neighbourInfo_d, T, FREE_NEIGHBOURBUFFER_MACRO) \
 const unsigned long nbBlocksInGrid = pLauncherInfo->_nbMultiProcessors; \
 unsigned long nbLineBlocksPerGrid = nbBlocksInGrid / nbBlocksX + (nbBlocksInGrid % nbBlocksX != 0); \
 nbLineBlocksPerGrid = (nbLineBlocksPerGrid < sizeY / blockSizeY ? nbLineBlocksPerGrid : sizeY / blockSizeY); \
 const unsigned long gridSizeY = blockSizeY * nbLineBlocksPerGrid; \
 const unsigned long nbLineGridsInImg = sizeY / gridSizeY + (sizeY % gridSizeY != 0); \
 const unsigned long stripSizeX = sizeX + 2 * neighbourInfo_d._paddingX; \
 const unsigned long stripSizeY = blockSizeY * nbLineBlocksPerGrid + 2 * neighbourInfo_d._paddingY; \
 const unsigned long nbTmpData = stripSizeX * stripSizeY; \
 T* pBuffer_d; \
 IPSDKCUDA_CHECK_ERROR(cudaMalloc((void**)&(pBuffer_d), nbTmpData * sizeof(T))); \
 T** pTmpBufData_d; \
 IPSDKCUDA_CHECK_ERROR(cudaMalloc((void**)&pTmpBufData_d, sizeof(T*))); \
 IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
 FREE_NEIGHBOURBUFFER_MACRO(neighbourInfo_d) \
 cudaFree(pBuffer_d); \
 IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
 IPSDKCUDA_CHECK_ERROR(cudaMemcpy(pTmpBufData_d, &pBuffer_d, sizeof(T*), cudaMemcpyHostToDevice)); \
 IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
 FREE_NEIGHBOURBUFFER_MACRO(neighbourInfo_d) \
 cudaFree(pBuffer_d); \
 cudaFree(pTmpBufData_d); \
 IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
 const T** pTmpBuf_d = const_cast<const T**>(pTmpBufData_d); \
 const unsigned long nbBlocksX_strip = stripSizeX / nbThreadsPerBlockDim + (stripSizeX % nbThreadsPerBlockDim == 0 ? 0 : 1); \
 const unsigned long nbBlocksY_strip = stripSizeY / nbThreadsPerBlockDim + (stripSizeY % nbThreadsPerBlockDim == 0 ? 0 : 1); \
 const unsigned long nbBlocksZ_strip = 1; \
 dim3 stripGridSize(nbBlocksX_strip, nbBlocksY_strip, nbBlocksZ_strip);
329 
330 // Define the local following variables :
331 // - dim3 stripGridSize : device grid size used to set up kernels,
332 // - unsigned long stripSizeX, stripSizeY : stripSize along X and Y axis,
333 // - unsigned long gridSizeY : grid size (number of lines) along Y axis,
334 // - T* pBuffer_d / T** pTmpBufData_d : temporization buffer (and its device-side pointer cell) storing input data plus the padding neighbourhood,
335 // - const T** pTmpBuf_d = const_cast<const T**>(pTmpBufData_d);
// Requires sizeX/sizeY, nbBlocksX, blockSizeY, nbThreadsPerBlockDim (from an
// IPSDKCUDA_SET_GRID* macro), pLauncherInfo and a CudaResult res in scope.
// Same as IPSDKCUDA_COMPUTE_STRIP_SIZE_2D but with explicit padding values
// and no neighbourhood buffers to release on error.
336 #define IPSDKCUDA_COMPUTE_STRIP_SIZE_2D_WITH_PADDING(paddingX, paddingY, T) \
337  const unsigned long nbBlocksInGrid = pLauncherInfo->_nbMultiProcessors; \
338  unsigned long nbLineBlocksPerGrid = nbBlocksInGrid / nbBlocksX + (nbBlocksInGrid % nbBlocksX != 0); \
339  nbLineBlocksPerGrid = (nbLineBlocksPerGrid < sizeY / blockSizeY ? nbLineBlocksPerGrid : sizeY / blockSizeY); \
340  const unsigned long gridSizeY = blockSizeY * nbLineBlocksPerGrid; \
341  const unsigned long nbLineGridsInImg = sizeY / gridSizeY + (sizeY % gridSizeY != 0); \
342  const unsigned long stripSizeX = sizeX + 2 * paddingX; \
343  const unsigned long stripSizeY = blockSizeY * nbLineBlocksPerGrid + 2 * paddingY; \
344  const unsigned long nbTmpData = stripSizeX * stripSizeY; \
345  T* pBuffer_d; \
346  IPSDKCUDA_CHECK_ERROR(cudaMalloc((void**)&(pBuffer_d), nbTmpData * sizeof(T))); \
347  T** pTmpBufData_d; \
348  IPSDKCUDA_CHECK_ERROR(cudaMalloc((void**)&pTmpBufData_d, sizeof(T*))); \
349  IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
350  cudaFree(pBuffer_d); \
351  IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
352  IPSDKCUDA_CHECK_ERROR(cudaMemcpy(pTmpBufData_d, &pBuffer_d, sizeof(T*), cudaMemcpyHostToDevice)); \
353  IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
354  cudaFree(pBuffer_d); \
355  cudaFree(pTmpBufData_d); \
356  IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
357  const T** pTmpBuf_d = const_cast<const T**>(pTmpBufData_d); \
358  const unsigned long nbBlocksX_strip = stripSizeX / nbThreadsPerBlockDim + (stripSizeX % nbThreadsPerBlockDim == 0 ? 0 : 1); \
359  const unsigned long nbBlocksY_strip = stripSizeY / nbThreadsPerBlockDim + (stripSizeY % nbThreadsPerBlockDim == 0 ? 0 : 1); \
360  const unsigned long nbBlocksZ_strip = 1; \
361  dim3 stripGridSize(nbBlocksX_strip, nbBlocksY_strip, nbBlocksZ_strip);
362 
// Define the following local variables :
// - dim3 stripGridSize : device grid size used to set up kernels,
// - unsigned long stripSizeX, stripSizeY, stripSizeZ : strip size along the
//   X, Y and Z axis,
// - std::vector<T*> vBuffers_h : host-side list of the per-plan device strip
//   buffers (one nbTmpPlanData-sized allocation per Z slot of the strip),
// - T** pTmpBufData_d : device array of those plan pointers,
// - const T** pTmpBuf_d = const_cast<const T**>(pTmpBufData_d).
// Requires sizeX/sizeY/sizeZ, blockSizeX/Y/Z (from IPSDKCUDA_SET_GRID3D),
// pLauncherInfo and a CudaResult res in scope. On a CUDA error,
// FREE_NEIGHBOURBUFFER_MACRO(neighbourInfo_d) and all buffers already
// allocated here are released before returning res.
// Fixes vs original:
// - cudaFree is called on the device plan pointers (vBuffers_h[iz]); the
//   original freed the ADDRESS of the host vector slots
//   (cudaFree(&(vBuffers_h[iz]))), leaking every plan buffer on error,
// - the third cleanup path now invokes the FREE_NEIGHBOURBUFFER_MACRO
//   parameter; the original hard-coded IPSDKCUDA_FREE_SE_INFO_2D there.
#define IPSDKCUDA_COMPUTE_STRIP_SIZE_3D(neighbourInfo_d, T, FREE_NEIGHBOURBUFFER_MACRO) \
 const unsigned long nbBlockPlansInVolume = sizeZ / blockSizeZ + (sizeZ % blockSizeZ != 0); \
 const unsigned long stripSizeX = sizeX + 2 * neighbourInfo_d._paddingX; \
 const unsigned long stripSizeY = sizeY + 2 * neighbourInfo_d._paddingY; \
 const unsigned long stripSizeZ = blockSizeZ + 2 * neighbourInfo_d._paddingZ; \
 const unsigned long nbTmpPlanData = stripSizeX * stripSizeY; \
 std::vector<T*> vBuffers_h; \
 vBuffers_h.resize(stripSizeZ); \
 for (unsigned long z = 0; z < stripSizeZ; ++z) { \
 IPSDKCUDA_CHECK_ERROR(cudaMalloc((void**)&(vBuffers_h[z]), nbTmpPlanData * sizeof(T))); \
 IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
 FREE_NEIGHBOURBUFFER_MACRO(neighbourInfo_d) \
 for (unsigned long iz = 0; iz < z; ++iz) \
 cudaFree(vBuffers_h[iz]); \
 IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
 } \
 T** pTmpBufData_d; \
 IPSDKCUDA_CHECK_ERROR(cudaMalloc((void**)&pTmpBufData_d, stripSizeZ * sizeof(T*))); \
 IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
 FREE_NEIGHBOURBUFFER_MACRO(neighbourInfo_d) \
 for (unsigned long iz = 0; iz < stripSizeZ; ++iz) \
 cudaFree(vBuffers_h[iz]); \
 IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
 IPSDKCUDA_CHECK_ERROR(cudaMemcpy(pTmpBufData_d, vBuffers_h.data(), \
 stripSizeZ * sizeof(T*), cudaMemcpyHostToDevice)); \
 IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
 FREE_NEIGHBOURBUFFER_MACRO(neighbourInfo_d) \
 for (unsigned long iz = 0; iz < stripSizeZ; ++iz) \
 cudaFree(vBuffers_h[iz]); \
 cudaFree(pTmpBufData_d); \
 IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
 const T** pTmpBuf_d = const_cast<const T**>(pTmpBufData_d); \
 const unsigned long nbBlocksX_strip = stripSizeX / blockSizeX + (stripSizeX % blockSizeX == 0 ? 0 : 1); \
 const unsigned long nbBlocksY_strip = stripSizeY / blockSizeY + (stripSizeY % blockSizeY == 0 ? 0 : 1); \
 const unsigned long nbBlocksZ_strip = stripSizeZ / blockSizeZ + (stripSizeZ % blockSizeZ == 0 ? 0 : 1); \
 dim3 stripGridSize(nbBlocksX_strip, nbBlocksY_strip, nbBlocksZ_strip);
405 
// Define the following local variables :
// - dim3 stripGridSize : device grid size used to set up kernels,
// - unsigned long stripSizeX, stripSizeY, stripSizeZ : strip size along the
//   X, Y and Z axis,
// - std::vector<T*> vBuffers_h : host-side list of the per-plan device strip
//   buffers,
// - T** pTmpBufData_d : device array of those plan pointers,
// - const T** pTmpBuf_d = const_cast<const T**>(pTmpBufData_d).
// Same as IPSDKCUDA_COMPUTE_STRIP_SIZE_3D but with explicit padding values
// and no neighbourhood buffers to release on error.
// Fix vs original: cudaFree is called on the device plan pointers
// (vBuffers_h[iz]); the original freed the ADDRESS of the host vector slots
// (cudaFree(&(vBuffers_h[iz]))), leaking every plan buffer on error.
#define IPSDKCUDA_COMPUTE_STRIP_SIZE_3D_WITH_PADDING(paddingX, paddingY, paddingZ, T) \
 const unsigned long nbBlockPlansInVolume = sizeZ / blockSizeZ + (sizeZ % blockSizeZ != 0); \
 const unsigned long stripSizeX = sizeX + 2 * paddingX; \
 const unsigned long stripSizeY = sizeY + 2 * paddingY; \
 const unsigned long stripSizeZ = blockSizeZ + 2 * paddingZ; \
 const unsigned long nbTmpPlanData = stripSizeX * stripSizeY; \
 std::vector<T*> vBuffers_h; \
 vBuffers_h.resize(stripSizeZ); \
 for (unsigned long z = 0; z < stripSizeZ; ++z) { \
 IPSDKCUDA_CHECK_ERROR(cudaMalloc((void**)&(vBuffers_h[z]), nbTmpPlanData * sizeof(T))); \
 IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
 for (unsigned long iz = 0; iz < z; ++iz) \
 cudaFree(vBuffers_h[iz]); \
 IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
 } \
 T** pTmpBufData_d; \
 IPSDKCUDA_CHECK_ERROR(cudaMalloc((void**)&pTmpBufData_d, stripSizeZ * sizeof(T*))); \
 IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
 for (unsigned long iz = 0; iz < stripSizeZ; ++iz) \
 cudaFree(vBuffers_h[iz]); \
 IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
 IPSDKCUDA_CHECK_ERROR(cudaMemcpy(pTmpBufData_d, vBuffers_h.data(), \
 stripSizeZ * sizeof(T*), cudaMemcpyHostToDevice)); \
 IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
 for (unsigned long iz = 0; iz < stripSizeZ; ++iz) \
 cudaFree(vBuffers_h[iz]); \
 cudaFree(pTmpBufData_d); \
 IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
 const T** pTmpBuf_d = const_cast<const T**>(pTmpBufData_d); \
 const unsigned long nbBlocksX_strip = stripSizeX / blockSizeX + (stripSizeX % blockSizeX == 0 ? 0 : 1); \
 const unsigned long nbBlocksY_strip = stripSizeY / blockSizeY + (stripSizeY % blockSizeY == 0 ? 0 : 1); \
 const unsigned long nbBlocksZ_strip = stripSizeZ / blockSizeZ + (stripSizeZ % blockSizeZ == 0 ? 0 : 1); \
 dim3 stripGridSize(nbBlocksX_strip, nbBlocksY_strip, nbBlocksZ_strip);
445 
446 // Start the process for the in situ case (the macro starts 2 FOR loops) and define :
447 // - unsigned long planIdx : the current plan to process,
448 // - unsigned long gridOffsetY : grid position in the image plan
// Copies the current input plan (with its neighbourhood) into the strip
// buffer before the caller overwrites the input in place. Requires the
// variables declared by IPSDKCUDA_SET_GRID2D and
// IPSDKCUDA_COMPUTE_STRIP_SIZE_2D plus borderPolicyType, outOfImageValue,
// pBufIn and a CudaResult res in scope.
// Must be closed with IPSDKCUDA_END_INSITU_DATA_PARSING_2D.
449 #define IPSDKCUDA_START_INSITU_DATA_PARSING_2D(pNeighbourInfo_h, neighbourInfo_d, FREE_NEIGHBOURBUFFER_MACRO) \
450  for (unsigned long planIdx = 0; planIdx < nbPlans; ++planIdx) { \
451  res = ipsdk::util::gpu::copyInputToStrip2d_knlLauncher(pLauncherInfo, planIdx, \
452  stripSizeX, stripSizeY, \
453  pNeighbourInfo_h->_paddingX, pNeighbourInfo_h->_paddingY, \
454  0, 0, \
455  stripGridSize, blockSize, \
456  borderPolicyType, outOfImageValue, \
457  pBufIn, pTmpBufData_d); \
458  IPSDKCUDA_CHECK_KERNEL_EXEC(res) \
459  IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
460  FREE_NEIGHBOURBUFFER_MACRO(neighbourInfo_d) \
461  cudaFree(pBuffer_d); \
462  cudaFree(pTmpBufData_d); \
463  IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
464  for (unsigned long curLineGridIdx = 0; curLineGridIdx < nbLineGridsInImg; ++curLineGridIdx) { \
465  const unsigned long gridOffsetY = curLineGridIdx * gridSizeY;
466 
467 // Start the process for the in situ case (the macro starts 2 FOR loops) and define :
468 // - unsigned long planIdx : the current plan to process,
469 // - unsigned long gridOffsetY : grid position in the image plan
// Same as IPSDKCUDA_START_INSITU_DATA_PARSING_2D but with explicit padding
// values and no neighbourhood buffers to release on error. Must be closed
// with IPSDKCUDA_END_INSITU_DATA_PARSING_2D_WITH_PADDING.
470 #define IPSDKCUDA_START_INSITU_DATA_PARSING_2D_WITH_PADDING(paddingX, paddingY) \
471  for (unsigned long planIdx = 0; planIdx < nbPlans; ++planIdx) { \
472  res = ipsdk::util::gpu::copyInputToStrip2d_knlLauncher(pLauncherInfo, planIdx, \
473  stripSizeX, stripSizeY, \
474  paddingX, paddingY, \
475  0, 0, \
476  stripGridSize, blockSize, \
477  borderPolicyType, outOfImageValue, \
478  pBufIn, pTmpBufData_d); \
479  IPSDKCUDA_CHECK_KERNEL_EXEC(res) \
480  IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
481  cudaFree(pTmpBufData_d); \
482  cudaFree(pBuffer_d); \
483  IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
484  for (unsigned long curLineGridIdx = 0; curLineGridIdx < nbLineGridsInImg; ++curLineGridIdx) { \
485  const unsigned long gridOffsetY = curLineGridIdx * gridSizeY;
486 
487 
488 // Start the process for the in situ case (the macro starts 2 FOR loops) and define :
489 // - unsigned long volumeIdx : the current volume to process,
490 // - unsigned long curBlockPlanIdx : the plans for the current blocks grid positions to process,
491 // - unsigned long planOffset : the first plans used by the current blocks grid,
// Copies the current input volume (with its neighbourhood) into the per-plan
// strip buffers before the caller overwrites the input in place. Requires
// the variables declared by IPSDKCUDA_SET_GRID3D and
// IPSDKCUDA_COMPUTE_STRIP_SIZE_3D plus borderPolicyType, outOfImageValue,
// pBufIn and a CudaResult res in scope.
// Must be closed with IPSDKCUDA_END_INSITU_DATA_PARSING_3D.
492 #define IPSDKCUDA_START_INSITU_DATA_PARSING_3D(pNeighbourInfo_h, neighbourInfo_d, FREE_NEIGHBOURBUFFER_MACRO) \
493  for (unsigned long volumeIdx = 0; volumeIdx < nbVolumes; ++volumeIdx) { \
494  res = ipsdk::util::gpu::copyInputToStrip3d_knlLauncher(pLauncherInfo, volumeIdx, \
495  stripSizeX, stripSizeY, stripSizeZ, \
496  pNeighbourInfo_h->_paddingX, \
497  pNeighbourInfo_h->_paddingY, \
498  pNeighbourInfo_h->_paddingZ, \
499  0, 0, stripGridSize, blockSize, \
500  borderPolicyType, outOfImageValue, \
501  pBufIn, pTmpBufData_d); \
502  IPSDKCUDA_CHECK_KERNEL_EXEC(res) \
503  IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
504  FREE_NEIGHBOURBUFFER_MACRO(neighbourInfo_d) \
505  for (unsigned long z = 0; z < stripSizeZ; ++z) \
506  cudaFree(vBuffers_h[z]); \
507  cudaFree(pTmpBufData_d); \
508  IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
509  for (unsigned long curBlockPlanIdx = 0; curBlockPlanIdx < nbBlockPlansInVolume; ++curBlockPlanIdx) { \
510  const unsigned long planOffset = curBlockPlanIdx * blockSizeZ;
511 
512 
513 // Start the process for the in situ case (the macro starts 2 FOR loops) and define :
514 // - unsigned long volumeIdx : the current volume to process,
515 // - unsigned long curBlockPlanIdx : the plans for the current blocks grid positions to process,
516 // - unsigned long planOffset : the first plans used by the current blocks grid,
// Same as IPSDKCUDA_START_INSITU_DATA_PARSING_3D but with explicit padding
// values and no neighbourhood buffers to release on error. Must be closed
// with IPSDKCUDA_END_INSITU_DATA_PARSING_3D_WITH_PADDING.
517 #define IPSDKCUDA_START_INSITU_DATA_PARSING_3D_WITH_PADDING(paddingX, paddingY, paddingZ) \
518  for (unsigned long volumeIdx = 0; volumeIdx < nbVolumes; ++volumeIdx) { \
519  res = ipsdk::util::gpu::copyInputToStrip3d_knlLauncher(pLauncherInfo, volumeIdx, \
520  stripSizeX, stripSizeY, stripSizeZ, \
521  paddingX, paddingY, paddingZ, \
522  0, 0, stripGridSize, blockSize, \
523  borderPolicyType, outOfImageValue, \
524  pBufIn, pTmpBufData_d); \
525  IPSDKCUDA_CHECK_KERNEL_EXEC(res) \
526  IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
527  for (unsigned long z = 0; z < stripSizeZ; ++z) \
528  cudaFree(vBuffers_h[z]); \
529  cudaFree(pTmpBufData_d); \
530  IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
531  for (unsigned long curBlockPlanIdx = 0; curBlockPlanIdx < nbBlockPlansInVolume; ++curBlockPlanIdx) { \
532  const unsigned long planOffset = curBlockPlanIdx * blockSizeZ;
533 
// End the process for the in situ case :
// - roll up the input data thanks to pTmpBufData_d
// - close the 2 FOR loops started by IPSDKCUDA_START_INSITU_DATA_PARSING_2D
// - free the temporization buffers pBuffer_d and pTmpBufData_d
// Fix vs original: pBuffer_d (the strip data buffer allocated by
// IPSDKCUDA_COMPUTE_STRIP_SIZE_2D) is now released on both the error and the
// success path; the original freed only the pointer cell (pTmpBuf_d) and
// leaked the data buffer. pTmpBufData_d and pTmpBuf_d hold the same device
// address; the non-const alias is freed for clarity.
#define IPSDKCUDA_END_INSITU_DATA_PARSING_2D(pNeighbourInfo_h, neighbourInfo_d, FREE_NEIGHBOURBUFFER_MACRO) \
 res = ipsdk::util::gpu::stripRollUp2d_knlLauncher(pLauncherInfo, \
 pNeighbourInfo_h->_paddingX, pNeighbourInfo_h->_paddingY, \
 stripSizeX, stripSizeY, \
 stripGridSize, blockSize, \
 planIdx, (curLineGridIdx + 1) * gridSizeY, \
 borderPolicyType, outOfImageValue, \
 pBufIn, pTmpBufData_d); \
 IPSDKCUDA_CHECK_KERNEL_EXEC(res) \
 IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
 FREE_NEIGHBOURBUFFER_MACRO(neighbourInfo_d) \
 cudaFree(pBuffer_d); \
 cudaFree(pTmpBufData_d); \
 IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
 } \
 } \
 cudaFree(pBuffer_d); \
 cudaFree(pTmpBufData_d);
554 
// End the process for the in situ case :
// - roll up the input data thanks to pTmpBufData_d
// - close the 2 FOR loops started by
//   IPSDKCUDA_START_INSITU_DATA_PARSING_2D_WITH_PADDING
// - free the temporization buffers pBuffer_d and pTmpBufData_d
// Fix vs original: pBuffer_d (the strip data buffer allocated by
// IPSDKCUDA_COMPUTE_STRIP_SIZE_2D_WITH_PADDING) is now released on both the
// error and the success path; the original freed only the pointer cell
// (pTmpBuf_d) and leaked the data buffer.
#define IPSDKCUDA_END_INSITU_DATA_PARSING_2D_WITH_PADDING(paddingX, paddingY) \
 res = ipsdk::util::gpu::stripRollUp2d_knlLauncher(pLauncherInfo, \
 paddingX, paddingY, \
 stripSizeX, stripSizeY, \
 stripGridSize, blockSize, \
 planIdx, (curLineGridIdx + 1) * gridSizeY, \
 borderPolicyType, outOfImageValue, \
 pBufIn, pTmpBufData_d); \
 IPSDKCUDA_CHECK_KERNEL_EXEC(res) \
 IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
 cudaFree(pBuffer_d); \
 cudaFree(pTmpBufData_d); \
 IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
 } \
 } \
 cudaFree(pBuffer_d); \
 cudaFree(pTmpBufData_d);
574 
575 // End the process for the in situ case :
576 // - roll up the input data thanks to pTmpBufData_d
577 // - close the 2 FOR loops started by IPSDKCUDA_START_INSITU_DATA_PARSING_3D
578 // - free the temporization buffers pTmpBufData_d and vBuffers_h
// On error, FREE_NEIGHBOURBUFFER_MACRO also releases the neighbourhood
// buffers; on success only the buffers allocated by
// IPSDKCUDA_COMPUTE_STRIP_SIZE_3D are released here.
579 #define IPSDKCUDA_END_INSITU_DATA_PARSING_3D(pNeighbourInfo_h, neighbourInfo_d, FREE_NEIGHBOURBUFFER_MACRO) \
580  res = ipsdk::util::gpu::stripRollUp3d_knlLauncher(pLauncherInfo, \
581  pNeighbourInfo_h->_paddingX, \
582  pNeighbourInfo_h->_paddingY, \
583  pNeighbourInfo_h->_paddingZ, \
584  stripSizeX, stripSizeY, stripSizeZ, \
585  stripGridSize, blockSize, \
586  volumeIdx, planOffset + blockSizeZ, \
587  borderPolicyType, outOfImageValue, \
588  pBufIn, pTmpBufData_d); \
589  IPSDKCUDA_CHECK_KERNEL_EXEC(res) \
590  IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
591  FREE_NEIGHBOURBUFFER_MACRO(neighbourInfo_d) \
592  for (unsigned long z = 0; z < stripSizeZ; ++z) \
593  cudaFree(vBuffers_h[z]); \
594  cudaFree(pTmpBufData_d); \
595  IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
596  } \
597  } \
598  for (unsigned long z = 0; z < stripSizeZ; ++z) \
599  cudaFree(vBuffers_h[z]); \
600  cudaFree(pTmpBufData_d);
601 
602 // End the process for the in situ case :
603 // - roll up the input data thanks to pTmpBufData_d
604 // - close the 2 FOR loops started by IPSDKCUDA_START_INSITU_DATA_PARSING_3D_WITH_PADDING
605 // - free the temporization buffers pTmpBufData_d and vBuffers_h
// Same as IPSDKCUDA_END_INSITU_DATA_PARSING_3D but with explicit padding
// values and no neighbourhood buffers to release on error.
606 #define IPSDKCUDA_END_INSITU_DATA_PARSING_3D_WITH_PADDING(paddingX, paddingY, paddingZ) \
607  res = ipsdk::util::gpu::stripRollUp3d_knlLauncher(pLauncherInfo, \
608  paddingX, paddingY, paddingZ, \
609  stripSizeX, stripSizeY, stripSizeZ, \
610  stripGridSize, blockSize, \
611  volumeIdx, planOffset + blockSizeZ, \
612  borderPolicyType, outOfImageValue, \
613  pBufIn, pTmpBufData_d); \
614  IPSDKCUDA_CHECK_KERNEL_EXEC(res) \
615  IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
616  for (unsigned long z = 0; z < stripSizeZ; ++z) \
617  cudaFree(vBuffers_h[z]); \
618  cudaFree(pTmpBufData_d); \
619  IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
620  } \
621  } \
622  for (unsigned long z = 0; z < stripSizeZ; ++z) \
623  cudaFree(vBuffers_h[z]); \
624  cudaFree(pTmpBufData_d);
625 
626 #endif // __IPSDKIMAGEPROCESSING_CUDASRCMACROS_H__