From be0c20b7582bbe89fdbd5b611d6d0ea657294e40 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" <marina.kolpakova@itseez.com> Date: Sat, 24 Nov 2012 01:55:03 +0400 Subject: [PATCH] align grid by 4 --- modules/gpu/src/cuda/integral_image.cu | 13 ++++++++++--- modules/gpu/src/imgproc.cpp | 2 +- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/modules/gpu/src/cuda/integral_image.cu b/modules/gpu/src/cuda/integral_image.cu index 558f9085db..a34a52a313 100644 --- a/modules/gpu/src/cuda/integral_image.cu +++ b/modules/gpu/src/cuda/integral_image.cu @@ -357,18 +357,25 @@ namespace cv { namespace gpu { namespace device #endif } - void shfl_integral_gpu(PtrStepSzb img, PtrStepSz<unsigned int> integral, cudaStream_t stream) + void shfl_integral_gpu(const PtrStepSzb& img, PtrStepSz<unsigned int> integral, cudaStream_t stream) { { // each thread handles 16 values, use 1 block/row - const int block = img.cols / 16; + int block = img.cols / 16; + + // safe, because step actually can't be less than 512 bytes + int align = img.cols % 4; + if ( align != 0) + { + block += (4 - align); + } // launch 1 block / row const int grid = img.rows; cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) ); - shfl_integral_horizontal<<<grid, block, 0, stream>>>((PtrStepSz<uint4>) img, (PtrStepSz<uint4>) integral); + shfl_integral_horizontal<<<grid, block, 0, stream>>>((const PtrStepSz<uint4>) img, (PtrStepSz<uint4>) integral); cudaSafeCall( cudaGetLastError() ); } diff --git a/modules/gpu/src/imgproc.cpp b/modules/gpu/src/imgproc.cpp index 0bf9c81c2e..81a2248fdb 100644 --- a/modules/gpu/src/imgproc.cpp +++ b/modules/gpu/src/imgproc.cpp @@ -537,7 +537,7 @@ namespace cv { namespace gpu { namespace device { namespace imgproc { - void shfl_integral_gpu(PtrStepSzb img, PtrStepSz<unsigned int> integral, cudaStream_t stream); + void shfl_integral_gpu(const PtrStepSzb& img, PtrStepSz<unsigned int> integral, cudaStream_t stream); } }}}