From be0c20b7582bbe89fdbd5b611d6d0ea657294e40 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Sat, 24 Nov 2012 01:55:03 +0400
Subject: [PATCH] align grid by 4

---
 modules/gpu/src/cuda/integral_image.cu | 13 ++++++++++---
 modules/gpu/src/imgproc.cpp            |  2 +-
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/modules/gpu/src/cuda/integral_image.cu b/modules/gpu/src/cuda/integral_image.cu
index 558f9085db..a34a52a313 100644
--- a/modules/gpu/src/cuda/integral_image.cu
+++ b/modules/gpu/src/cuda/integral_image.cu
@@ -357,18 +357,25 @@ namespace cv { namespace gpu { namespace device
         #endif
         }
 
-        void shfl_integral_gpu(PtrStepSzb img, PtrStepSz<unsigned int> integral, cudaStream_t stream)
+        void shfl_integral_gpu(const PtrStepSzb& img, PtrStepSz<unsigned int> integral, cudaStream_t stream)
         {
             {
                 // each thread handles 16 values, use 1 block/row
-                const int block = img.cols / 16;
+                int block = img.cols / 16;
+
+                // safe, because the step actually can't be less than 512 bytes
+                int align = img.cols % 4;
+                if ( align != 0)
+                {
+                    block += (4 - align);
+                }
 
                 // launch 1 block / row
                 const int grid = img.rows;
 
                 cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );
 
-                shfl_integral_horizontal<<<grid, block, 0, stream>>>((PtrStepSz<uint4>) img, (PtrStepSz<uint4>) integral);
+                shfl_integral_horizontal<<<grid, block, 0, stream>>>((const PtrStepSz<uint4>) img, (PtrStepSz<uint4>) integral);
                 cudaSafeCall( cudaGetLastError() );
             }
 
diff --git a/modules/gpu/src/imgproc.cpp b/modules/gpu/src/imgproc.cpp
index 0bf9c81c2e..81a2248fdb 100644
--- a/modules/gpu/src/imgproc.cpp
+++ b/modules/gpu/src/imgproc.cpp
@@ -537,7 +537,7 @@ namespace cv { namespace gpu { namespace device
 {
     namespace imgproc
     {
-        void shfl_integral_gpu(PtrStepSzb img, PtrStepSz<unsigned int> integral, cudaStream_t stream);
+        void shfl_integral_gpu(const PtrStepSzb& img, PtrStepSz<unsigned int> integral, cudaStream_t stream);
     }
 }}}