From f1e44fa5caf4baaeafc8b7c5c43dc4d55ffa1229 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Date: Wed, 14 May 2014 12:48:12 +0400
Subject: [PATCH] fix bug #3678 (cuda::integral failures)

---
 modules/cudaarithm/test/test_reductions.cpp         |  2 +-
 .../include/opencv2/cudev/grid/detail/integral.hpp  | 13 +++++++------
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/modules/cudaarithm/test/test_reductions.cpp b/modules/cudaarithm/test/test_reductions.cpp
index 5fd7e2dec9..e3c54055df 100644
--- a/modules/cudaarithm/test/test_reductions.cpp
+++ b/modules/cudaarithm/test/test_reductions.cpp
@@ -850,7 +850,7 @@ CUDA_TEST_P(Integral, Accuracy)
 
 INSTANTIATE_TEST_CASE_P(CUDA_Arithm, Integral, testing::Combine(
     ALL_DEVICES,
-    DIFFERENT_SIZES,
+    testing::Values(cv::Size(128, 128), cv::Size(113, 113), cv::Size(768, 1066)),
     WHOLE_SUBMAT));
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/modules/cudev/include/opencv2/cudev/grid/detail/integral.hpp b/modules/cudev/include/opencv2/cudev/grid/detail/integral.hpp
index 5c90e99893..7af52650c0 100644
--- a/modules/cudev/include/opencv2/cudev/grid/detail/integral.hpp
+++ b/modules/cudev/include/opencv2/cudev/grid/detail/integral.hpp
@@ -439,8 +439,6 @@ namespace integral_detail
 
             T sum = (tidx < cols) && (y < rows) ? *p : 0;
 
-            y += blockDim.y;
-
             sums[threadIdx.x][threadIdx.y] = sum;
             __syncthreads();
 
@@ -467,14 +465,17 @@ namespace integral_detail
             if (threadIdx.y > 0)
                 sum += sums[threadIdx.x][threadIdx.y - 1];
 
-            if (tidx < cols)
+            sum += stepSum;
+            stepSum += sums[threadIdx.x][blockDim.y - 1];
+
+            __syncthreads();
+
+            if ((tidx < cols) && (y < rows))
             {
-                sum += stepSum;
-                stepSum += sums[threadIdx.x][blockDim.y - 1];
                 *p = sum;
             }
 
-            __syncthreads();
+            y += blockDim.y;
         }
     #else
         __shared__ T smem[32][32];