IPSDK
4_1_0_2
IPSDK : Image Processing Software Development Kit
IPSDK_Nightly
include
partner
IPSDKImageProcessing
Algorithm
CudaSrcMacros.h
Go to the documentation of this file.
1
// CudaSrcMacros.h:
3
// ------------------------------------
4
//
14
15
#ifndef __IPSDKIMAGEPROCESSING_CUDASRCMACROS_H__
16
#define __IPSDKIMAGEPROCESSING_CUDASRCMACROS_H__
17
18
// Explicitly discard a variable or parameter that is intentionally unused,
// silencing "unused parameter/variable" compiler warnings.
#define IPSDKCUDA_UNUSED_VAR(x) static_cast<void>(x)
20
21
// Split the data from the ImageSizeInfo structure into 5 unsigned long scalars.
// Declares the local constants sizeX, sizeY, sizeZ, sizeC and sizeT, which the
// other IPSDKCUDA_* macros in this file rely on by name.
// NOTE(review): the argument is accessed with '.', so it must be an
// ImageSizeInfo value or reference, not a pointer as the old comment said.
#define IPSDKCUDA_SPLIT_SIZES_FROM_STRUCT(imgSizeInfo) \
const unsigned long sizeX = imgSizeInfo._sizeX; \
const unsigned long sizeY = imgSizeInfo._sizeY; \
const unsigned long sizeZ = imgSizeInfo._sizeZ; \
const unsigned long sizeC = imgSizeInfo._sizeC; \
const unsigned long sizeT = imgSizeInfo._sizeT;
29
30
// Define the number of threads per block and the number of blocks in the grid
// according to nbThreadsPerBlockDim (read from pLauncherInfo->_blockSize).
// Declares gridSize/blockSize (dim3) plus the sizeX..sizeT, nbPlans and
// blockSize*/nbBlocks* locals used by the other IPSDKCUDA_* macros.
// The Z, C and T dimensions are flattened into nbPlans on the grid Z axis.
// Requires a variable pLauncherInfo in scope.
// NOTE(review): blockSizeZ is clamped to sizeZ while nbBlocksZ is derived from
// nbPlans / nbThreadsPerBlockDim, so when sizeZ < nbThreadsPerBlockDim the
// launched Z threads may not cover all plans unless the kernels loop; also
// blockSizeX*blockSizeY*blockSizeZ can exceed the device per-block thread
// limit for large _blockSize — confirm against the launcher configuration.
#define IPSDKCUDA_SET_GRID() \
const unsigned short nbThreadsPerBlockDim = pLauncherInfo->_blockSize; \
const unsigned long sizeX = pLauncherInfo->_imgSizeInfo._sizeX; \
const unsigned long sizeY = pLauncherInfo->_imgSizeInfo._sizeY; \
const unsigned long sizeZ = pLauncherInfo->_imgSizeInfo._sizeZ; \
const unsigned long sizeC = pLauncherInfo->_imgSizeInfo._sizeC; \
const unsigned long sizeT = pLauncherInfo->_imgSizeInfo._sizeT; \
const unsigned long nbPlans = sizeZ * sizeC * sizeT; \
const unsigned long nbBlocksX = sizeX / nbThreadsPerBlockDim + (sizeX%nbThreadsPerBlockDim == 0 ? 0 : 1); \
const unsigned long nbBlocksY = sizeY / nbThreadsPerBlockDim + (sizeY%nbThreadsPerBlockDim == 0 ? 0 : 1); \
const unsigned long nbBlocksZ = nbPlans / nbThreadsPerBlockDim + (nbPlans%nbThreadsPerBlockDim == 0 ? 0 : 1); \
dim3 gridSize(nbBlocksX, nbBlocksY, nbBlocksZ); \
const unsigned long blockSizeX = nbThreadsPerBlockDim; \
const unsigned long blockSizeY = nbThreadsPerBlockDim; \
const unsigned long blockSizeZ = (sizeZ < nbThreadsPerBlockDim ? sizeZ : nbThreadsPerBlockDim); \
dim3 blockSize(blockSizeX, blockSizeY, blockSizeZ);
47
48
// Define the number of threads per block and the number of blocks in the grid
// according to nbThreadsPerBlockDim, for 2D processes.
// The difference with the generic macro is that the grid Z dimension carries
// one block per plan (nbBlocksZ = nbPlans) and blockSizeZ is 1, which allows
// applying a 2D process to several plans with only a 2D shared buffer.
// Requires a variable pLauncherInfo in scope; declares the same locals as
// IPSDKCUDA_SET_GRID (sizeX..sizeT, nbPlans, gridSize, blockSize, ...).
#define IPSDKCUDA_SET_GRID2D() \
const unsigned short nbThreadsPerBlockDim = pLauncherInfo->_blockSize; \
const unsigned long sizeX = pLauncherInfo->_imgSizeInfo._sizeX; \
const unsigned long sizeY = pLauncherInfo->_imgSizeInfo._sizeY; \
const unsigned long sizeZ = pLauncherInfo->_imgSizeInfo._sizeZ; \
const unsigned long sizeC = pLauncherInfo->_imgSizeInfo._sizeC; \
const unsigned long sizeT = pLauncherInfo->_imgSizeInfo._sizeT; \
const unsigned long nbPlans = sizeZ * sizeC * sizeT; \
const unsigned long nbBlocksX = sizeX / nbThreadsPerBlockDim + (sizeX % nbThreadsPerBlockDim == 0 ? 0 : 1); \
const unsigned long nbBlocksY = sizeY / nbThreadsPerBlockDim + (sizeY % nbThreadsPerBlockDim == 0 ? 0 : 1); \
const unsigned long nbBlocksZ = nbPlans; \
dim3 gridSize(nbBlocksX, nbBlocksY, nbBlocksZ); \
const unsigned long blockSizeX = nbThreadsPerBlockDim; \
const unsigned long blockSizeY = nbThreadsPerBlockDim; \
const unsigned long blockSizeZ = 1; \
dim3 blockSize(blockSizeX, blockSizeY, blockSizeZ);
67
68
// Define the number of threads per block and the number of blocks in the grid
// according to nbThreadsPerBlockDim, for 3D processes.
// The grid covers one volume (X, Y, Z); the C and T dimensions are counted in
// nbVolumes and iterated by the caller. blockSizeZ is clamped to sizeZ so the
// block never exceeds the volume depth, and the grid Z block count is derived
// from that clamped depth (unlike IPSDKCUDA_SET_GRID).
// Requires a variable pLauncherInfo in scope.
#define IPSDKCUDA_SET_GRID3D() \
const unsigned short nbThreadsPerBlockDim = pLauncherInfo->_blockSize; \
const unsigned long sizeX = pLauncherInfo->_imgSizeInfo._sizeX; \
const unsigned long sizeY = pLauncherInfo->_imgSizeInfo._sizeY; \
const unsigned long sizeZ = pLauncherInfo->_imgSizeInfo._sizeZ; \
const unsigned long sizeC = pLauncherInfo->_imgSizeInfo._sizeC; \
const unsigned long sizeT = pLauncherInfo->_imgSizeInfo._sizeT; \
const unsigned long nbVolumes = sizeC * sizeT; \
const unsigned long blockSizeX = nbThreadsPerBlockDim; \
const unsigned long blockSizeY = nbThreadsPerBlockDim; \
const unsigned long blockSizeZ = (nbThreadsPerBlockDim > sizeZ ? sizeZ : nbThreadsPerBlockDim); \
dim3 blockSize(blockSizeX, blockSizeY, blockSizeZ); \
const unsigned long nbBlocksX = sizeX / blockSizeX + (sizeX % blockSizeX == 0 ? 0 : 1); \
const unsigned long nbBlocksY = sizeY / blockSizeY + (sizeY % blockSizeY == 0 ? 0 : 1); \
const unsigned long nbBlocksZ = (sizeZ / blockSizeZ + (sizeZ % blockSizeZ == 0 ? 0 : 1)); \
dim3 gridSize(nbBlocksX, nbBlocksY, nbBlocksZ);
86
87
// Check the error code returned by a CUDA runtime call.
// This macro requires a variable res, of type ipsdk::gpu::CudaResult, to be
// declared in scope; on failure res._bResult is set to false and res._msg
// receives the CUDA error string.
// The argument is evaluated exactly once: it is frequently a call with side
// effects (cudaMalloc, cudaMemcpy, ...), and the previous version expanded it
// twice, re-executing the call on the error path (e.g. a second cudaMalloc
// that leaked and reported the wrong error).
#define IPSDKCUDA_CHECK_ERROR(err) \
{ \
const cudaError_t ipsdkCudaErr_ = (err); \
if (ipsdkCudaErr_ != cudaSuccess) { \
res._bResult = false; \
res._msg = cudaGetErrorString(ipsdkCudaErr_); \
} \
}
94
95
// Wait for device synchronization, then check the last CUDA error.
// If an error occurred during an asynchronous kernel execution, the
// CudaResult variable named res is modified to notify about the error.
// NOTE(review): the macro argument is not expanded in the body; the nested
// IPSDKCUDA_CHECK_ERROR hard-codes the identifier 'res', so a variable named
// res must be in scope and any other argument passed here is silently
// ignored.
#define IPSDKCUDA_CHECK_KERNEL_EXEC(res) \
cudaDeviceSynchronize(); \
{ \
cudaError_t lastError = cudaGetLastError(); \
IPSDKCUDA_CHECK_ERROR(lastError) \
}
104
// Start an instruction block dedicated to freeing dynamically allocated
// memory when a previous CUDA call failed (res._bResult == false).
// Opens a brace that MUST be closed with IPSDKCUDA_END_FREE_MEMORY_IF_ERROR(),
// which also returns res to abort the enclosing function.
#define IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
if(!res._bResult) {
108
109
// End the instruction block opened by IPSDKCUDA_START_FREE_MEMORY_IF_ERROR():
// returns res (the error result) and closes the conditional brace, so the
// enclosing function exits after the cleanup statements ran.
#define IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
return res;\
}
113
114
// Declares a typed view pBuf (of type T*) over the block's dynamically
// allocated shared memory (whose byte size is supplied as the third kernel
// launch parameter). Device-side only.
#define IPSDKCUDA_DECLARE_SHARED_BUFFER(T, pBuf) \
extern __shared__ char pSharedMem[]; \
T* pBuf = reinterpret_cast<T*>(pSharedMem);
118
119
120
// Declares knlInfo_d (ipsdk::gpu::FilteringKernelInfo<dataType>) and fills it
// from the host-side kernel description pKnlInfo_h: scalar fields are copied,
// and the arrays _pOffsetsX, _pOffsetsY and _pCoefs are allocated on the
// device and transferred. On a CUDA error every device buffer allocated so
// far is released and the enclosing function returns res.
// Requires a CudaResult variable named res in scope (see IPSDKCUDA_CHECK_ERROR).
// Fix: cudaFree must receive the device pointer value; the previous version
// passed the address of the host-side pointer member (cudaFree(&...)), which
// is an invalid argument and leaked the device memory (compare with the
// correct IPSDKCUDA_FREE_KERNEL_INFO_2D below).
#define IPSDKCUDA_TRANSFERT_KERNEL_INFO_2D(pKnlInfo_h, knlInfo_d, dataType) \
ipsdk::gpu::FilteringKernelInfo<dataType> knlInfo_d; \
knlInfo_d._nbData = pKnlInfo_h->_nbData; \
knlInfo_d._paddingX = pKnlInfo_h->_paddingX; \
knlInfo_d._paddingY = pKnlInfo_h->_paddingY; \
IPSDKCUDA_CHECK_ERROR(cudaMalloc((void **)&knlInfo_d._pOffsetsX, pKnlInfo_h->_nbData * sizeof(int))); \
IPSDKCUDA_CHECK_ERROR(cudaMalloc((void **)&knlInfo_d._pOffsetsY, pKnlInfo_h->_nbData * sizeof(int))); \
IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
cudaFree(knlInfo_d._pOffsetsX); \
IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
IPSDKCUDA_CHECK_ERROR(cudaMalloc((void **)&knlInfo_d._pCoefs, pKnlInfo_h->_nbData * sizeof(dataType))); \
IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
cudaFree(knlInfo_d._pOffsetsX); \
cudaFree(knlInfo_d._pOffsetsY); \
IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
IPSDKCUDA_CHECK_ERROR(cudaMemcpy(knlInfo_d._pOffsetsX, pKnlInfo_h->_pOffsetsX, pKnlInfo_h->_nbData * sizeof(int), cudaMemcpyHostToDevice)); \
IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
cudaFree(knlInfo_d._pOffsetsX); \
cudaFree(knlInfo_d._pOffsetsY); \
cudaFree(knlInfo_d._pCoefs); \
IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
IPSDKCUDA_CHECK_ERROR(cudaMemcpy(knlInfo_d._pOffsetsY, pKnlInfo_h->_pOffsetsY, pKnlInfo_h->_nbData * sizeof(int), cudaMemcpyHostToDevice)); \
IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
cudaFree(knlInfo_d._pOffsetsX); \
cudaFree(knlInfo_d._pOffsetsY); \
cudaFree(knlInfo_d._pCoefs); \
IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
IPSDKCUDA_CHECK_ERROR(cudaMemcpy(knlInfo_d._pCoefs, pKnlInfo_h->_pCoefs, pKnlInfo_h->_nbData * sizeof(dataType), cudaMemcpyHostToDevice)); \
IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
cudaFree(knlInfo_d._pOffsetsX); \
cudaFree(knlInfo_d._pOffsetsY); \
cudaFree(knlInfo_d._pCoefs); \
IPSDKCUDA_END_FREE_MEMORY_IF_ERROR()
154
155
// Free the device-side arrays of a 2D FilteringKernelInfo previously filled
// by IPSDKCUDA_TRANSFERT_KERNEL_INFO_2D.
#define IPSDKCUDA_FREE_KERNEL_INFO_2D(knlInfo_d) \
cudaFree(knlInfo_d._pOffsetsX); \
cudaFree(knlInfo_d._pOffsetsY); \
cudaFree(knlInfo_d._pCoefs);
160
161
// Declares knlInfo_d (ipsdk::gpu::FilteringKernelInfo<dataType>) and fills it
// from the host-side 3D kernel description pKnlInfo_h: scalar fields are
// copied, and the arrays _pOffsetsX/_pOffsetsY/_pOffsetsZ/_pCoefs are
// allocated on the device and transferred. On a CUDA error every device
// buffer allocated so far is released and the enclosing function returns res.
// Requires a CudaResult variable named res in scope.
// Fix: cudaFree must receive the device pointer value; the previous version
// passed the address of the host-side pointer member (cudaFree(&...)), which
// is an invalid argument and leaked the device memory (compare with the
// correct IPSDKCUDA_FREE_KERNEL_INFO_3D below).
#define IPSDKCUDA_TRANSFERT_KERNEL_INFO_3D(pKnlInfo_h, knlInfo_d, dataType) \
ipsdk::gpu::FilteringKernelInfo<dataType> knlInfo_d; \
knlInfo_d._nbData = pKnlInfo_h->_nbData; \
knlInfo_d._paddingX = pKnlInfo_h->_paddingX; \
knlInfo_d._paddingY = pKnlInfo_h->_paddingY; \
knlInfo_d._paddingZ = pKnlInfo_h->_paddingZ; \
IPSDKCUDA_CHECK_ERROR(cudaMalloc((void **)&knlInfo_d._pOffsetsX, pKnlInfo_h->_nbData * sizeof(int))); \
IPSDKCUDA_CHECK_ERROR(cudaMalloc((void **)&knlInfo_d._pOffsetsY, pKnlInfo_h->_nbData * sizeof(int))); \
IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
cudaFree(knlInfo_d._pOffsetsX); \
IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
IPSDKCUDA_CHECK_ERROR(cudaMalloc((void **)&knlInfo_d._pOffsetsZ, pKnlInfo_h->_nbData * sizeof(int))); \
IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
cudaFree(knlInfo_d._pOffsetsX); \
cudaFree(knlInfo_d._pOffsetsY); \
IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
IPSDKCUDA_CHECK_ERROR(cudaMalloc((void **)&knlInfo_d._pCoefs, pKnlInfo_h->_nbData * sizeof(dataType))); \
IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
cudaFree(knlInfo_d._pOffsetsX); \
cudaFree(knlInfo_d._pOffsetsY); \
cudaFree(knlInfo_d._pOffsetsZ); \
IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
IPSDKCUDA_CHECK_ERROR(cudaMemcpy(knlInfo_d._pOffsetsX, pKnlInfo_h->_pOffsetsX, pKnlInfo_h->_nbData * sizeof(int), cudaMemcpyHostToDevice)); \
IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
cudaFree(knlInfo_d._pOffsetsX); \
cudaFree(knlInfo_d._pOffsetsY); \
cudaFree(knlInfo_d._pOffsetsZ); \
cudaFree(knlInfo_d._pCoefs); \
IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
IPSDKCUDA_CHECK_ERROR(cudaMemcpy(knlInfo_d._pOffsetsY, pKnlInfo_h->_pOffsetsY, pKnlInfo_h->_nbData * sizeof(int), cudaMemcpyHostToDevice)); \
IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
cudaFree(knlInfo_d._pOffsetsX); \
cudaFree(knlInfo_d._pOffsetsY); \
cudaFree(knlInfo_d._pOffsetsZ); \
cudaFree(knlInfo_d._pCoefs); \
IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
IPSDKCUDA_CHECK_ERROR(cudaMemcpy(knlInfo_d._pOffsetsZ, pKnlInfo_h->_pOffsetsZ, pKnlInfo_h->_nbData * sizeof(int), cudaMemcpyHostToDevice)); \
IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
cudaFree(knlInfo_d._pOffsetsX); \
cudaFree(knlInfo_d._pOffsetsY); \
cudaFree(knlInfo_d._pOffsetsZ); \
cudaFree(knlInfo_d._pCoefs); \
IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
IPSDKCUDA_CHECK_ERROR(cudaMemcpy(knlInfo_d._pCoefs, pKnlInfo_h->_pCoefs, pKnlInfo_h->_nbData * sizeof(dataType), cudaMemcpyHostToDevice)); \
IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
cudaFree(knlInfo_d._pOffsetsX); \
cudaFree(knlInfo_d._pOffsetsY); \
cudaFree(knlInfo_d._pOffsetsZ); \
cudaFree(knlInfo_d._pCoefs); \
IPSDKCUDA_END_FREE_MEMORY_IF_ERROR()
212
213
// Free the device-side arrays of a 3D FilteringKernelInfo previously filled
// by IPSDKCUDA_TRANSFERT_KERNEL_INFO_3D.
#define IPSDKCUDA_FREE_KERNEL_INFO_3D(knlInfo_d) \
cudaFree(knlInfo_d._pOffsetsX); \
cudaFree(knlInfo_d._pOffsetsY); \
cudaFree(knlInfo_d._pOffsetsZ); \
cudaFree(knlInfo_d._pCoefs);
219
220
221
// Declares seInfo_d (ipsdk::gpu::StructuringElementInfo) and fills it from
// the host-side description pSEInfo_h: scalar fields are copied, and the
// arrays _pOffsetsX and _pOffsetsY are allocated on the device and
// transferred. On a CUDA error every device buffer allocated so far is
// released and the enclosing function returns res.
// Requires a CudaResult variable named res in scope.
// Fix: cudaFree must receive the device pointer value; the previous version
// passed the address of the host-side pointer member (cudaFree(&...)), which
// is an invalid argument and leaked the device memory (compare with the
// correct IPSDKCUDA_FREE_SE_INFO_2D below).
#define IPSDKCUDA_TRANSFERT_SE_INFO_2D(pSEInfo_h, seInfo_d) \
ipsdk::gpu::StructuringElementInfo seInfo_d; \
seInfo_d._nbData = pSEInfo_h->_nbData; \
seInfo_d._paddingX = pSEInfo_h->_paddingX; \
seInfo_d._paddingY = pSEInfo_h->_paddingY; \
IPSDKCUDA_CHECK_ERROR(cudaMalloc((void **)&seInfo_d._pOffsetsX, pSEInfo_h->_nbData * sizeof(int))); \
IPSDKCUDA_CHECK_ERROR(cudaMalloc((void **)&seInfo_d._pOffsetsY, pSEInfo_h->_nbData * sizeof(int))); \
IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
cudaFree(seInfo_d._pOffsetsX); \
IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
IPSDKCUDA_CHECK_ERROR(cudaMemcpy(seInfo_d._pOffsetsX, pSEInfo_h->_pOffsetsX, pSEInfo_h->_nbData * sizeof(int), cudaMemcpyHostToDevice)); \
IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
cudaFree(seInfo_d._pOffsetsX); \
cudaFree(seInfo_d._pOffsetsY); \
IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
IPSDKCUDA_CHECK_ERROR(cudaMemcpy(seInfo_d._pOffsetsY, pSEInfo_h->_pOffsetsY, pSEInfo_h->_nbData * sizeof(int), cudaMemcpyHostToDevice)); \
IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
cudaFree(seInfo_d._pOffsetsX); \
cudaFree(seInfo_d._pOffsetsY); \
IPSDKCUDA_END_FREE_MEMORY_IF_ERROR()
242
243
// Free the device-side arrays of a 2D StructuringElementInfo previously
// filled by IPSDKCUDA_TRANSFERT_SE_INFO_2D.
#define IPSDKCUDA_FREE_SE_INFO_2D(seInfo_d) \
cudaFree(seInfo_d._pOffsetsX); \
cudaFree(seInfo_d._pOffsetsY);
247
248
// Declares seInfo_d (ipsdk::gpu::StructuringElementInfo) and fills it from
// the host-side 3D description pSEInfo_h: scalar fields are copied, and the
// arrays _pOffsetsX, _pOffsetsY and _pOffsetsZ are allocated on the device
// and transferred. On a CUDA error every device buffer allocated so far is
// released and the enclosing function returns res.
// Requires a CudaResult variable named res in scope.
// Fix: cudaFree must receive the device pointer value; the previous version
// passed the address of the host-side pointer member (cudaFree(&...)), which
// is an invalid argument and leaked the device memory (compare with the
// correct IPSDKCUDA_FREE_SE_INFO_3D below).
#define IPSDKCUDA_TRANSFERT_SE_INFO_3D(pSEInfo_h, seInfo_d) \
ipsdk::gpu::StructuringElementInfo seInfo_d; \
seInfo_d._nbData = pSEInfo_h->_nbData; \
seInfo_d._paddingX = pSEInfo_h->_paddingX; \
seInfo_d._paddingY = pSEInfo_h->_paddingY; \
seInfo_d._paddingZ = pSEInfo_h->_paddingZ; \
IPSDKCUDA_CHECK_ERROR(cudaMalloc((void **)&seInfo_d._pOffsetsX, pSEInfo_h->_nbData * sizeof(int))); \
IPSDKCUDA_CHECK_ERROR(cudaMalloc((void **)&seInfo_d._pOffsetsY, pSEInfo_h->_nbData * sizeof(int))); \
IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
cudaFree(seInfo_d._pOffsetsX); \
IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
IPSDKCUDA_CHECK_ERROR(cudaMalloc((void **)&seInfo_d._pOffsetsZ, pSEInfo_h->_nbData * sizeof(int))); \
IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
cudaFree(seInfo_d._pOffsetsX); \
cudaFree(seInfo_d._pOffsetsY); \
IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
IPSDKCUDA_CHECK_ERROR(cudaMemcpy(seInfo_d._pOffsetsX, pSEInfo_h->_pOffsetsX, pSEInfo_h->_nbData * sizeof(int), cudaMemcpyHostToDevice)); \
IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
cudaFree(seInfo_d._pOffsetsX); \
cudaFree(seInfo_d._pOffsetsY); \
cudaFree(seInfo_d._pOffsetsZ); \
IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
IPSDKCUDA_CHECK_ERROR(cudaMemcpy(seInfo_d._pOffsetsY, pSEInfo_h->_pOffsetsY, pSEInfo_h->_nbData * sizeof(int), cudaMemcpyHostToDevice)); \
IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
cudaFree(seInfo_d._pOffsetsX); \
cudaFree(seInfo_d._pOffsetsY); \
cudaFree(seInfo_d._pOffsetsZ); \
IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
IPSDKCUDA_CHECK_ERROR(cudaMemcpy(seInfo_d._pOffsetsZ, pSEInfo_h->_pOffsetsZ, pSEInfo_h->_nbData * sizeof(int), cudaMemcpyHostToDevice)); \
IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
cudaFree(seInfo_d._pOffsetsX); \
cudaFree(seInfo_d._pOffsetsY); \
cudaFree(seInfo_d._pOffsetsZ); \
IPSDKCUDA_END_FREE_MEMORY_IF_ERROR()
283
284
// Free the device-side arrays of a 3D StructuringElementInfo previously
// filled by IPSDKCUDA_TRANSFERT_SE_INFO_3D.
#define IPSDKCUDA_FREE_SE_INFO_3D(seInfo_d) \
cudaFree(seInfo_d._pOffsetsX); \
cudaFree(seInfo_d._pOffsetsY); \
cudaFree(seInfo_d._pOffsetsZ);
289
290
// Evaluates to true when the two indirect buffers ultimately reference the
// same data, i.e. the pointed-to pointers compare equal once viewed as
// const char* (used to detect in-situ processing: input buffer == output).
#define IPSDKCUDA_CHECK_INSITU(pBuf1, pBuf2) \
(reinterpret_cast<const char*>(*(pBuf1)) == reinterpret_cast<const char*>(*(pBuf2)))
293
294
// Define the following local variables:
// - dim3 stripGridSize : device grid size used to set up the strip kernels,
// - unsigned long stripSizeX, stripSizeY : strip size along the X and Y axes
//   (image size plus twice the neighbourhood padding),
// - unsigned long gridSizeY : grid size (number of lines) processed per pass,
// - T* pBuffer_d / T** pTmpBufData_d : temporisation buffer storing the input
//   data plus the neighbourhood required by the kernel,
// - const T** pTmpBuf_d = const_cast<const T**>(pTmpBufData_d).
// On a CUDA error, the buffers held by neighbourInfo_d are released through
// FREE_NEIGHBOURBUFFER_MACRO and the enclosing function returns res.
// Requires pLauncherInfo, res, and the locals from IPSDKCUDA_SET_GRID*
// (sizeX, sizeY, nbBlocksX, blockSizeY, nbThreadsPerBlockDim) in scope.
// Fix: the FREE_NEIGHBOURBUFFER_MACRO parameter is now actually expanded on
// the error paths; the previous version hard-coded IPSDKCUDA_FREE_SE_INFO_2D,
// which released the wrong members (leaking _pCoefs) when the neighbourhood
// was a filtering kernel.
#define IPSDKCUDA_COMPUTE_STRIP_SIZE_2D(neighbourInfo_d, T, FREE_NEIGHBOURBUFFER_MACRO) \
const unsigned long nbBlocksInGrid = pLauncherInfo->_nbMultiProcessors; \
unsigned long nbLineBlocksPerGrid = nbBlocksInGrid / nbBlocksX + (nbBlocksInGrid % nbBlocksX != 0); \
nbLineBlocksPerGrid = (nbLineBlocksPerGrid < sizeY / blockSizeY ? nbLineBlocksPerGrid : sizeY / blockSizeY); \
const unsigned long gridSizeY = blockSizeY * nbLineBlocksPerGrid; \
const unsigned long nbLineGridsInImg = sizeY / gridSizeY + (sizeY % gridSizeY != 0); \
const unsigned long stripSizeX = sizeX + 2 * neighbourInfo_d._paddingX; \
const unsigned long stripSizeY = blockSizeY * nbLineBlocksPerGrid + 2 * neighbourInfo_d._paddingY; \
const unsigned long nbTmpData = stripSizeX * stripSizeY; \
T* pBuffer_d; \
IPSDKCUDA_CHECK_ERROR(cudaMalloc((void**)&(pBuffer_d), nbTmpData * sizeof(T))); \
T** pTmpBufData_d; \
IPSDKCUDA_CHECK_ERROR(cudaMalloc((void**)&pTmpBufData_d, sizeof(T*))); \
IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
FREE_NEIGHBOURBUFFER_MACRO(neighbourInfo_d) \
cudaFree(pBuffer_d); \
IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
IPSDKCUDA_CHECK_ERROR(cudaMemcpy(pTmpBufData_d, &pBuffer_d, sizeof(T*), cudaMemcpyHostToDevice)); \
IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
FREE_NEIGHBOURBUFFER_MACRO(neighbourInfo_d) \
cudaFree(pBuffer_d); \
cudaFree(pTmpBufData_d); \
IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
const T** pTmpBuf_d = const_cast<const T**>(pTmpBufData_d); \
const unsigned long nbBlocksX_strip = stripSizeX / nbThreadsPerBlockDim + (stripSizeX % nbThreadsPerBlockDim == 0 ? 0 : 1); \
const unsigned long nbBlocksY_strip = stripSizeY / nbThreadsPerBlockDim + (stripSizeY % nbThreadsPerBlockDim == 0 ? 0 : 1); \
const unsigned long nbBlocksZ_strip = 1; \
dim3 stripGridSize(nbBlocksX_strip, nbBlocksY_strip, nbBlocksZ_strip);
329
330
// Define the following local variables:
// - dim3 stripGridSize : device grid size used to set up the strip kernels,
// - unsigned long stripSizeX, stripSizeY : strip size along the X and Y axes
//   (image size plus twice the given padding),
// - unsigned long gridSizeY : grid size (number of lines) processed per pass,
// - T* pBuffer_d / T** pTmpBufData_d : temporisation buffer storing the input
//   data plus the neighbourhood required by the kernel,
// - const T** pTmpBuf_d = const_cast<const T**>(pTmpBufData_d).
// Same as IPSDKCUDA_COMPUTE_STRIP_SIZE_2D but with explicit paddings instead
// of a neighbourhood-info structure (no neighbourhood buffers to free on the
// error paths).
// Requires pLauncherInfo, res, and the locals from IPSDKCUDA_SET_GRID*
// (sizeX, sizeY, nbBlocksX, blockSizeY, nbThreadsPerBlockDim) in scope.
#define IPSDKCUDA_COMPUTE_STRIP_SIZE_2D_WITH_PADDING(paddingX, paddingY, T) \
const unsigned long nbBlocksInGrid = pLauncherInfo->_nbMultiProcessors; \
unsigned long nbLineBlocksPerGrid = nbBlocksInGrid / nbBlocksX + (nbBlocksInGrid % nbBlocksX != 0); \
nbLineBlocksPerGrid = (nbLineBlocksPerGrid < sizeY / blockSizeY ? nbLineBlocksPerGrid : sizeY / blockSizeY); \
const unsigned long gridSizeY = blockSizeY * nbLineBlocksPerGrid; \
const unsigned long nbLineGridsInImg = sizeY / gridSizeY + (sizeY % gridSizeY != 0); \
const unsigned long stripSizeX = sizeX + 2 * paddingX; \
const unsigned long stripSizeY = blockSizeY * nbLineBlocksPerGrid + 2 * paddingY; \
const unsigned long nbTmpData = stripSizeX * stripSizeY; \
T* pBuffer_d; \
IPSDKCUDA_CHECK_ERROR(cudaMalloc((void**)&(pBuffer_d), nbTmpData * sizeof(T))); \
T** pTmpBufData_d; \
IPSDKCUDA_CHECK_ERROR(cudaMalloc((void**)&pTmpBufData_d, sizeof(T*))); \
IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
cudaFree(pBuffer_d); \
IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
IPSDKCUDA_CHECK_ERROR(cudaMemcpy(pTmpBufData_d, &pBuffer_d, sizeof(T*), cudaMemcpyHostToDevice)); \
IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
cudaFree(pBuffer_d); \
cudaFree(pTmpBufData_d); \
IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
const T** pTmpBuf_d = const_cast<const T**>(pTmpBufData_d); \
const unsigned long nbBlocksX_strip = stripSizeX / nbThreadsPerBlockDim + (stripSizeX % nbThreadsPerBlockDim == 0 ? 0 : 1); \
const unsigned long nbBlocksY_strip = stripSizeY / nbThreadsPerBlockDim + (stripSizeY % nbThreadsPerBlockDim == 0 ? 0 : 1); \
const unsigned long nbBlocksZ_strip = 1; \
dim3 stripGridSize(nbBlocksX_strip, nbBlocksY_strip, nbBlocksZ_strip);
362
363
// Define the following local variables:
// - dim3 stripGridSize : device grid size used to set up the strip kernels,
// - unsigned long stripSizeX, stripSizeY, stripSizeZ : strip size along the
//   X, Y and Z axes (data size plus twice the neighbourhood padding),
// - std::vector<T*> vBuffers_h : host-side table of per-plan device buffers,
// - T** pTmpBufData_d : device-side table of those plan buffers,
// - const T** pTmpBuf_d = const_cast<const T**>(pTmpBufData_d).
// On a CUDA error, the buffers held by neighbourInfo_d are released through
// FREE_NEIGHBOURBUFFER_MACRO and the enclosing function returns res.
// Requires pLauncherInfo, res, and the locals from IPSDKCUDA_SET_GRID3D
// (sizeX, sizeY, sizeZ, blockSizeX/Y/Z) in scope.
// Fixes:
// - cudaFree now receives the device pointer value vBuffers_h[iz]; the
//   previous version passed the address of the host-side vector slot
//   (cudaFree(&...)), an invalid argument that leaked the device memory,
// - the last error path now expands the FREE_NEIGHBOURBUFFER_MACRO parameter
//   instead of hard-coding IPSDKCUDA_FREE_SE_INFO_2D, which released the
//   wrong members for kernel/3D neighbourhoods.
#define IPSDKCUDA_COMPUTE_STRIP_SIZE_3D(neighbourInfo_d, T, FREE_NEIGHBOURBUFFER_MACRO) \
const unsigned long nbBlockPlansInVolume = sizeZ / blockSizeZ + (sizeZ % blockSizeZ != 0); \
const unsigned long stripSizeX = sizeX + 2 * neighbourInfo_d._paddingX; \
const unsigned long stripSizeY = sizeY + 2 * neighbourInfo_d._paddingY; \
const unsigned long stripSizeZ = blockSizeZ + 2 * neighbourInfo_d._paddingZ; \
const unsigned long nbTmpPlanData = stripSizeX * stripSizeY; \
std::vector<T*> vBuffers_h; \
vBuffers_h.resize(stripSizeZ); \
for (unsigned long z = 0; z < stripSizeZ; ++z) { \
IPSDKCUDA_CHECK_ERROR(cudaMalloc((void**)&(vBuffers_h[z]), nbTmpPlanData * sizeof(T))); \
IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
FREE_NEIGHBOURBUFFER_MACRO(neighbourInfo_d) \
for (unsigned long iz = 0; iz < z; ++iz) \
cudaFree(vBuffers_h[iz]); \
IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
} \
T** pTmpBufData_d; \
IPSDKCUDA_CHECK_ERROR(cudaMalloc((void**)&pTmpBufData_d, stripSizeZ * sizeof(T*))); \
IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
FREE_NEIGHBOURBUFFER_MACRO(neighbourInfo_d) \
for (unsigned long iz = 0; iz < stripSizeZ; ++iz) \
cudaFree(vBuffers_h[iz]); \
IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
IPSDKCUDA_CHECK_ERROR(cudaMemcpy(pTmpBufData_d, vBuffers_h.data(), \
stripSizeZ * sizeof(T*), cudaMemcpyHostToDevice)); \
IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
FREE_NEIGHBOURBUFFER_MACRO(neighbourInfo_d) \
for (unsigned long iz = 0; iz < stripSizeZ; ++iz) \
cudaFree(vBuffers_h[iz]); \
cudaFree(pTmpBufData_d); \
IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
const T** pTmpBuf_d = const_cast<const T**>(pTmpBufData_d); \
const unsigned long nbBlocksX_strip = stripSizeX / blockSizeX + (stripSizeX % blockSizeX == 0 ? 0 : 1); \
const unsigned long nbBlocksY_strip = stripSizeY / blockSizeY + (stripSizeY % blockSizeY == 0 ? 0 : 1); \
const unsigned long nbBlocksZ_strip = stripSizeZ / blockSizeZ + (stripSizeZ % blockSizeZ == 0 ? 0 : 1); \
dim3 stripGridSize(nbBlocksX_strip, nbBlocksY_strip, nbBlocksZ_strip);
405
406
// Define the following local variables:
// - dim3 stripGridSize : device grid size used to set up the strip kernels,
// - unsigned long stripSizeX, stripSizeY, stripSizeZ : strip size along the
//   X, Y and Z axes (data size plus twice the given padding),
// - std::vector<T*> vBuffers_h : host-side table of per-plan device buffers,
// - T** pTmpBufData_d : device-side table of those plan buffers,
// - const T** pTmpBuf_d = const_cast<const T**>(pTmpBufData_d).
// Same as IPSDKCUDA_COMPUTE_STRIP_SIZE_3D but with explicit paddings instead
// of a neighbourhood-info structure.
// Requires pLauncherInfo, res, and the locals from IPSDKCUDA_SET_GRID3D
// (sizeX, sizeY, sizeZ, blockSizeX/Y/Z) in scope.
// Fix: cudaFree now receives the device pointer value vBuffers_h[iz]; the
// previous version passed the address of the host-side vector slot
// (cudaFree(&...)), an invalid argument that leaked the device memory
// (compare with the correct cudaFree(vBuffers_h[z]) in the END macros).
#define IPSDKCUDA_COMPUTE_STRIP_SIZE_3D_WITH_PADDING(paddingX, paddingY, paddingZ, T) \
const unsigned long nbBlockPlansInVolume = sizeZ / blockSizeZ + (sizeZ % blockSizeZ != 0); \
const unsigned long stripSizeX = sizeX + 2 * paddingX; \
const unsigned long stripSizeY = sizeY + 2 * paddingY; \
const unsigned long stripSizeZ = blockSizeZ + 2 * paddingZ; \
const unsigned long nbTmpPlanData = stripSizeX * stripSizeY; \
std::vector<T*> vBuffers_h; \
vBuffers_h.resize(stripSizeZ); \
for (unsigned long z = 0; z < stripSizeZ; ++z) { \
IPSDKCUDA_CHECK_ERROR(cudaMalloc((void**)&(vBuffers_h[z]), nbTmpPlanData * sizeof(T))); \
IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
for (unsigned long iz = 0; iz < z; ++iz) \
cudaFree(vBuffers_h[iz]); \
IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
} \
T** pTmpBufData_d; \
IPSDKCUDA_CHECK_ERROR(cudaMalloc((void**)&pTmpBufData_d, stripSizeZ * sizeof(T*))); \
IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
for (unsigned long iz = 0; iz < stripSizeZ; ++iz) \
cudaFree(vBuffers_h[iz]); \
IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
IPSDKCUDA_CHECK_ERROR(cudaMemcpy(pTmpBufData_d, vBuffers_h.data(), \
stripSizeZ * sizeof(T*), cudaMemcpyHostToDevice)); \
IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
for (unsigned long iz = 0; iz < stripSizeZ; ++iz) \
cudaFree(vBuffers_h[iz]); \
cudaFree(pTmpBufData_d); \
IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
const T** pTmpBuf_d = const_cast<const T**>(pTmpBufData_d); \
const unsigned long nbBlocksX_strip = stripSizeX / blockSizeX + (stripSizeX % blockSizeX == 0 ? 0 : 1); \
const unsigned long nbBlocksY_strip = stripSizeY / blockSizeY + (stripSizeY % blockSizeY == 0 ? 0 : 1); \
const unsigned long nbBlocksZ_strip = stripSizeZ / blockSizeZ + (stripSizeZ % blockSizeZ == 0 ? 0 : 1); \
dim3 stripGridSize(nbBlocksX_strip, nbBlocksY_strip, nbBlocksZ_strip);
445
446
// Start the process for the in-situ case. The macro opens 2 FOR loops (to be
// closed by IPSDKCUDA_END_INSITU_DATA_PARSING_2D) and defines:
// - unsigned long planIdx : the current plan to process,
// - unsigned long gridOffsetY : grid position in the image plan.
// Copies the input data of the current plan into the strip buffer; on a CUDA
// error, FREE_NEIGHBOURBUFFER_MACRO releases neighbourInfo_d's buffers, the
// strip buffers are freed and the enclosing function returns res.
// Requires res, pLauncherInfo, pBufIn, borderPolicyType, outOfImageValue and
// the locals from IPSDKCUDA_COMPUTE_STRIP_SIZE_2D in scope.
#define IPSDKCUDA_START_INSITU_DATA_PARSING_2D(pNeighbourInfo_h, neighbourInfo_d, FREE_NEIGHBOURBUFFER_MACRO) \
for (unsigned long planIdx = 0; planIdx < nbPlans; ++planIdx) { \
res = ipsdk::util::gpu::copyInputToStrip2d_knlLauncher(pLauncherInfo, planIdx, \
stripSizeX, stripSizeY, \
pNeighbourInfo_h->_paddingX, pNeighbourInfo_h->_paddingY, \
0, 0, \
stripGridSize, blockSize, \
borderPolicyType, outOfImageValue, \
pBufIn, pTmpBufData_d); \
IPSDKCUDA_CHECK_KERNEL_EXEC(res) \
IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
FREE_NEIGHBOURBUFFER_MACRO(neighbourInfo_d) \
cudaFree(pBuffer_d); \
cudaFree(pTmpBufData_d); \
IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
for (unsigned long curLineGridIdx = 0; curLineGridIdx < nbLineGridsInImg; ++curLineGridIdx) { \
const unsigned long gridOffsetY = curLineGridIdx * gridSizeY;
466
467
// Start the process for the in-situ case with explicit paddings. The macro
// opens 2 FOR loops (to be closed by
// IPSDKCUDA_END_INSITU_DATA_PARSING_2D_WITH_PADDING) and defines:
// - unsigned long planIdx : the current plan to process,
// - unsigned long gridOffsetY : grid position in the image plan.
// Requires res, pLauncherInfo, pBufIn, borderPolicyType, outOfImageValue and
// the locals from IPSDKCUDA_COMPUTE_STRIP_SIZE_2D_WITH_PADDING in scope.
#define IPSDKCUDA_START_INSITU_DATA_PARSING_2D_WITH_PADDING(paddingX, paddingY) \
for (unsigned long planIdx = 0; planIdx < nbPlans; ++planIdx) { \
res = ipsdk::util::gpu::copyInputToStrip2d_knlLauncher(pLauncherInfo, planIdx, \
stripSizeX, stripSizeY, \
paddingX, paddingY, \
0, 0, \
stripGridSize, blockSize, \
borderPolicyType, outOfImageValue, \
pBufIn, pTmpBufData_d); \
IPSDKCUDA_CHECK_KERNEL_EXEC(res) \
IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
cudaFree(pTmpBufData_d); \
cudaFree(pBuffer_d); \
IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
for (unsigned long curLineGridIdx = 0; curLineGridIdx < nbLineGridsInImg; ++curLineGridIdx) { \
const unsigned long gridOffsetY = curLineGridIdx * gridSizeY;
486
487
488
// Start the process for the in-situ 3D case. The macro opens 2 FOR loops (to
// be closed by IPSDKCUDA_END_INSITU_DATA_PARSING_3D) and defines:
// - unsigned long volumeIdx : the current volume to process,
// - unsigned long curBlockPlanIdx : the plans for the current blocks grid,
// - unsigned long planOffset : the first plan used by the current blocks grid.
// Copies the input data of the current volume into the strip buffers; on a
// CUDA error, FREE_NEIGHBOURBUFFER_MACRO releases neighbourInfo_d's buffers,
// the strip buffers are freed and the enclosing function returns res.
// Requires res, pLauncherInfo, pBufIn, borderPolicyType, outOfImageValue and
// the locals from IPSDKCUDA_COMPUTE_STRIP_SIZE_3D in scope.
#define IPSDKCUDA_START_INSITU_DATA_PARSING_3D(pNeighbourInfo_h, neighbourInfo_d, FREE_NEIGHBOURBUFFER_MACRO) \
for (unsigned long volumeIdx = 0; volumeIdx < nbVolumes; ++volumeIdx) { \
res = ipsdk::util::gpu::copyInputToStrip3d_knlLauncher(pLauncherInfo, volumeIdx, \
stripSizeX, stripSizeY, stripSizeZ, \
pNeighbourInfo_h->_paddingX, \
pNeighbourInfo_h->_paddingY, \
pNeighbourInfo_h->_paddingZ, \
0, 0, stripGridSize, blockSize, \
borderPolicyType, outOfImageValue, \
pBufIn, pTmpBufData_d); \
IPSDKCUDA_CHECK_KERNEL_EXEC(res) \
IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
FREE_NEIGHBOURBUFFER_MACRO(neighbourInfo_d) \
for (unsigned long z = 0; z < stripSizeZ; ++z) \
cudaFree(vBuffers_h[z]); \
cudaFree(pTmpBufData_d); \
IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
for (unsigned long curBlockPlanIdx = 0; curBlockPlanIdx < nbBlockPlansInVolume; ++curBlockPlanIdx) { \
const unsigned long planOffset = curBlockPlanIdx * blockSizeZ;
511
512
513
// Start the process for the in-situ 3D case with explicit paddings. The macro
// opens 2 FOR loops (to be closed by
// IPSDKCUDA_END_INSITU_DATA_PARSING_3D_WITH_PADDING) and defines:
// - unsigned long volumeIdx : the current volume to process,
// - unsigned long curBlockPlanIdx : the plans for the current blocks grid,
// - unsigned long planOffset : the first plan used by the current blocks grid.
// Requires res, pLauncherInfo, pBufIn, borderPolicyType, outOfImageValue and
// the locals from IPSDKCUDA_COMPUTE_STRIP_SIZE_3D_WITH_PADDING in scope.
#define IPSDKCUDA_START_INSITU_DATA_PARSING_3D_WITH_PADDING(paddingX, paddingY, paddingZ) \
for (unsigned long volumeIdx = 0; volumeIdx < nbVolumes; ++volumeIdx) { \
res = ipsdk::util::gpu::copyInputToStrip3d_knlLauncher(pLauncherInfo, volumeIdx, \
stripSizeX, stripSizeY, stripSizeZ, \
paddingX, paddingY, paddingZ, \
0, 0, stripGridSize, blockSize, \
borderPolicyType, outOfImageValue, \
pBufIn, pTmpBufData_d); \
IPSDKCUDA_CHECK_KERNEL_EXEC(res) \
IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
for (unsigned long z = 0; z < stripSizeZ; ++z) \
cudaFree(vBuffers_h[z]); \
cudaFree(pTmpBufData_d); \
IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
for (unsigned long curBlockPlanIdx = 0; curBlockPlanIdx < nbBlockPlansInVolume; ++curBlockPlanIdx) { \
const unsigned long planOffset = curBlockPlanIdx * blockSizeZ;
533
534
// End the process for the in-situ case:
// - roll up the input data thanks to pTmpBufData_d,
// - close the 2 FOR loops started by IPSDKCUDA_START_INSITU_DATA_PARSING_2D,
// - free the temporisation buffers pBuffer_d and pTmpBuf_d.
// Fix: pBuffer_d (the strip data buffer allocated by
// IPSDKCUDA_COMPUTE_STRIP_SIZE_2D) is now released on both the error path and
// the normal path; the previous version only freed the pointer table
// pTmpBuf_d and leaked pBuffer_d (the 3D counterpart frees all its plan
// buffers, and the START macro's error path frees both).
#define IPSDKCUDA_END_INSITU_DATA_PARSING_2D(pNeighbourInfo_h, neighbourInfo_d, FREE_NEIGHBOURBUFFER_MACRO) \
res = ipsdk::util::gpu::stripRollUp2d_knlLauncher(pLauncherInfo, \
pNeighbourInfo_h->_paddingX, pNeighbourInfo_h->_paddingY, \
stripSizeX, stripSizeY, \
stripGridSize, blockSize, \
planIdx, (curLineGridIdx + 1) * gridSizeY, \
borderPolicyType, outOfImageValue, \
pBufIn, pTmpBufData_d); \
IPSDKCUDA_CHECK_KERNEL_EXEC(res) \
IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
FREE_NEIGHBOURBUFFER_MACRO(neighbourInfo_d) \
cudaFree(pBuffer_d); \
cudaFree(pTmpBuf_d); \
IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
} \
} \
cudaFree(pBuffer_d); \
cudaFree(pTmpBuf_d);
554
555
// End the process for the in-situ case with explicit paddings:
// - roll up the input data thanks to pTmpBufData_d,
// - close the 2 FOR loops started by
//   IPSDKCUDA_START_INSITU_DATA_PARSING_2D_WITH_PADDING,
// - free the temporisation buffers pBuffer_d and pTmpBuf_d.
// Fix: pBuffer_d (the strip data buffer allocated by
// IPSDKCUDA_COMPUTE_STRIP_SIZE_2D_WITH_PADDING) is now released on both the
// error path and the normal path; the previous version only freed the pointer
// table pTmpBuf_d and leaked pBuffer_d.
#define IPSDKCUDA_END_INSITU_DATA_PARSING_2D_WITH_PADDING(paddingX, paddingY) \
res = ipsdk::util::gpu::stripRollUp2d_knlLauncher(pLauncherInfo, \
paddingX, paddingY, \
stripSizeX, stripSizeY, \
stripGridSize, blockSize, \
planIdx, (curLineGridIdx + 1) * gridSizeY, \
borderPolicyType, outOfImageValue, \
pBufIn, pTmpBufData_d); \
IPSDKCUDA_CHECK_KERNEL_EXEC(res) \
IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
cudaFree(pBuffer_d); \
cudaFree(pTmpBuf_d); \
IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
} \
} \
cudaFree(pBuffer_d); \
cudaFree(pTmpBuf_d);
574
575
// End the process for the in-situ 3D case:
// - roll up the input data thanks to pTmpBufData_d,
// - close the 2 FOR loops started by IPSDKCUDA_START_INSITU_DATA_PARSING_3D,
// - free the temporisation buffers: each per-plan device buffer in vBuffers_h
//   and the device pointer table pTmpBufData_d (on both the error path and
//   the normal path).
#define IPSDKCUDA_END_INSITU_DATA_PARSING_3D(pNeighbourInfo_h, neighbourInfo_d, FREE_NEIGHBOURBUFFER_MACRO) \
res = ipsdk::util::gpu::stripRollUp3d_knlLauncher(pLauncherInfo, \
pNeighbourInfo_h->_paddingX, \
pNeighbourInfo_h->_paddingY, \
pNeighbourInfo_h->_paddingZ, \
stripSizeX, stripSizeY, stripSizeZ, \
stripGridSize, blockSize, \
volumeIdx, planOffset + blockSizeZ, \
borderPolicyType, outOfImageValue, \
pBufIn, pTmpBufData_d); \
IPSDKCUDA_CHECK_KERNEL_EXEC(res) \
IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
FREE_NEIGHBOURBUFFER_MACRO(neighbourInfo_d) \
for (unsigned long z = 0; z < stripSizeZ; ++z) \
cudaFree(vBuffers_h[z]); \
cudaFree(pTmpBufData_d); \
IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
} \
} \
for (unsigned long z = 0; z < stripSizeZ; ++z) \
cudaFree(vBuffers_h[z]); \
cudaFree(pTmpBufData_d);
601
602
// End the process for the in-situ 3D case with explicit paddings:
// - roll up the input data thanks to pTmpBufData_d,
// - close the 2 FOR loops started by
//   IPSDKCUDA_START_INSITU_DATA_PARSING_3D_WITH_PADDING,
// - free the temporisation buffers: each per-plan device buffer in vBuffers_h
//   and the device pointer table pTmpBufData_d (on both the error path and
//   the normal path).
#define IPSDKCUDA_END_INSITU_DATA_PARSING_3D_WITH_PADDING(paddingX, paddingY, paddingZ) \
res = ipsdk::util::gpu::stripRollUp3d_knlLauncher(pLauncherInfo, \
paddingX, paddingY, paddingZ, \
stripSizeX, stripSizeY, stripSizeZ, \
stripGridSize, blockSize, \
volumeIdx, planOffset + blockSizeZ, \
borderPolicyType, outOfImageValue, \
pBufIn, pTmpBufData_d); \
IPSDKCUDA_CHECK_KERNEL_EXEC(res) \
IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() \
for (unsigned long z = 0; z < stripSizeZ; ++z) \
cudaFree(vBuffers_h[z]); \
cudaFree(pTmpBufData_d); \
IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
} \
} \
for (unsigned long z = 0; z < stripSizeZ; ++z) \
cudaFree(vBuffers_h[z]); \
cudaFree(pTmpBufData_d);
625
626
#endif // __IPSDKIMAGEPROCESSING_CUDASRCMACROS_H__
Generated on Tue Apr 15 2025 16:20:38 for IPSDK by
1.8.14