const unsigned long nbBlocksInGrid = pLauncherInfo->_nbMultiProcessors; /* Strip setup: block budget per grid is taken from the launcher's _nbMultiProcessors value. */ \
unsigned long nbLineBlocksPerGrid = nbBlocksInGrid / nbBlocksX + (nbBlocksInGrid % nbBlocksX != 0); /* ceil(nbBlocksInGrid / nbBlocksX): rows of blocks handled per grid */ \
nbLineBlocksPerGrid = (nbLineBlocksPerGrid < sizeY / blockSizeY ? nbLineBlocksPerGrid : sizeY / blockSizeY); /* clamp to floor(sizeY / blockSizeY). NOTE(review): this is 0 when sizeY < blockSizeY, which makes gridSizeY 0 and the division below divide by zero - confirm callers guarantee sizeY >= blockSizeY. */ \
const unsigned long gridSizeY = blockSizeY * nbLineBlocksPerGrid; /* extent along Y covered by one grid */ \
const unsigned long nbLineGridsInImg = sizeY / gridSizeY + (sizeY % gridSizeY != 0); /* ceil(sizeY / gridSizeY): number of grids needed to cover sizeY */ \
const unsigned long stripSizeX = sizeX + 2 * neighbourInfo_d._paddingX; /* strip width: sizeX plus _paddingX on both sides */ \
const unsigned long stripSizeY = blockSizeY * nbLineBlocksPerGrid + 2 * neighbourInfo_d._paddingY; /* strip height: same product as gridSizeY plus _paddingY on both sides */ \
const unsigned long nbTmpData = stripSizeX * stripSizeY; /* number of T elements in one padded strip */ \
T* pBuffer_d; \
IPSDKCUDA_CHECK_ERROR(cudaMalloc((void**)&(pBuffer_d), nbTmpData * sizeof(T))); /* device allocation for the strip buffer. NOTE(review): unlike the calls below, no free-on-error block follows this allocation - confirm neighbourInfo_d is released if it fails. */ \
T** pTmpBufData_d; \
IPSDKCUDA_CHECK_ERROR(cudaMalloc((void**)&pTmpBufData_d, sizeof(T*))); /* device allocation holding a single T* slot */ \
IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() /* cleanup section; presumably runs only if the preceding check recorded an error - confirm against the macro definition */ \
IPSDKCUDA_FREE_SE_INFO_2D(neighbourInfo_d) \
cudaFree(pBuffer_d); \
IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
IPSDKCUDA_CHECK_ERROR(cudaMemcpy(pTmpBufData_d, &pBuffer_d, sizeof(T*), cudaMemcpyHostToDevice)); /* store the pBuffer_d pointer value into the device-side slot, so pTmpBufData_d[0] refers to the strip buffer */ \
IPSDKCUDA_START_FREE_MEMORY_IF_ERROR() /* cleanup section for the copy above; also releases the pointer slot */ \
IPSDKCUDA_FREE_SE_INFO_2D(neighbourInfo_d) \
cudaFree(pBuffer_d); \
cudaFree(pTmpBufData_d); \
IPSDKCUDA_END_FREE_MEMORY_IF_ERROR() \
const T** pTmpBuf_d = const_cast<const T**>(pTmpBufData_d); /* same pointer slot viewed as pointer-to-const-T */ \
const unsigned long nbBlocksX_strip = stripSizeX / nbThreadsPerBlockDim + (stripSizeX % nbThreadsPerBlockDim == 0 ? 0 : 1); /* ceil(stripSizeX / nbThreadsPerBlockDim) */ \
const unsigned long nbBlocksY_strip = stripSizeY / nbThreadsPerBlockDim + (stripSizeY % nbThreadsPerBlockDim == 0 ? 0 : 1); /* ceil(stripSizeY / nbThreadsPerBlockDim) */ \
const unsigned long nbBlocksZ_strip = 1; /* single layer of blocks along Z */ \
dim3 stripGridSize(nbBlocksX_strip, nbBlocksY_strip, nbBlocksZ_strip); /* grid dimensions sized to cover the whole padded strip */