From da9be8231fc153fd70ac4f4d41091d1653d00fd2 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Date: Fri, 1 Aug 2014 11:33:29 +0400
Subject: [PATCH] fix cv::gpu::resize for INTER_LINEAR, now it produces the
 same result as CPU version

---
 modules/gpu/src/cuda/resize.cu   | 34 +++++++++++++++++-------
 modules/gpu/test/test_resize.cpp | 44 +++++++++++++++++++++++++++-----
 2 files changed, 62 insertions(+), 16 deletions(-)
diff --git a/modules/gpu/src/cuda/resize.cu b/modules/gpu/src/cuda/resize.cu
index fa13121f96..1998b3b07c 100644
--- a/modules/gpu/src/cuda/resize.cu
+++ b/modules/gpu/src/cuda/resize.cu
@@ -77,8 +77,8 @@ namespace cv { namespace gpu { namespace device
 
         if (dst_x < dst.cols && dst_y < dst.rows)
         {
-            const float src_x = dst_x * fx;
-            const float src_y = dst_y * fy;
+            const float src_x = (dst_x + 0.5f) * fx - 0.5f;
+            const float src_y = (dst_y + 0.5f) * fy - 0.5f;
 
             work_type out = VecTraits<work_type>::all(0);
 
@@ -86,16 +86,18 @@ namespace cv { namespace gpu { namespace device
             const int y1 = __float2int_rd(src_y);
             const int x2 = x1 + 1;
             const int y2 = y1 + 1;
-            const int x2_read = ::min(x2, src.cols - 1);
-            const int y2_read = ::min(y2, src.rows - 1);
+            const int x1_read = ::max(::min(x1, src.cols - 1), 0);
+            const int y1_read = ::max(::min(y1, src.rows - 1), 0);
+            const int x2_read = ::max(::min(x2, src.cols - 1), 0);
+            const int y2_read = ::max(::min(y2, src.rows - 1), 0);
 
-            T src_reg = src(y1, x1);
+            T src_reg = src(y1_read, x1_read);
             out = out + src_reg * ((x2 - src_x) * (y2 - src_y));
 
-            src_reg = src(y1, x2_read);
+            src_reg = src(y1_read, x2_read);
             out = out + src_reg * ((src_x - x1) * (y2 - src_y));
 
-            src_reg = src(y2_read, x1);
+            src_reg = src(y2_read, x1_read);
             out = out + src_reg * ((x2 - src_x) * (src_y - y1));
 
             src_reg = src(y2_read, x2_read);
@@ -119,6 +121,20 @@ namespace cv { namespace gpu { namespace device
         }
     }
 
+    template <class Ptr2D, typename T> __global__ void resize_linear(const Ptr2D src, PtrStepSz<T> dst, const float fy, const float fx)
+    {
+        const int dst_x = blockDim.x * blockIdx.x + threadIdx.x;
+        const int dst_y = blockDim.y * blockIdx.y + threadIdx.y;
+
+        if (dst_x < dst.cols && dst_y < dst.rows)
+        {
+            const float src_x = (dst_x + 0.5f) * fx - 0.5f;
+            const float src_y = (dst_y + 0.5f) * fy - 0.5f;
+
+            dst(dst_y, dst_x) = src(src_y, src_x);
+        }
+    }
+
     template <typename Ptr2D, typename T> __global__ void resize_area(const Ptr2D src, PtrStepSz<T> dst)
     {
         const int x = blockDim.x * blockIdx.x + threadIdx.x;
@@ -231,7 +247,7 @@ namespace cv { namespace gpu { namespace device
             TextureAccessor<T> texSrc = texAccessor(src, 0, 0);
             LinearFilter< TextureAccessor<T> > filteredSrc(texSrc);
 
-            resize<<<grid, block>>>(filteredSrc, dst, fy, fx);
+            resize_linear<<<grid, block>>>(filteredSrc, dst, fy, fx);
         }
         else
         {
@@ -241,7 +257,7 @@ namespace cv { namespace gpu { namespace device
             BorderReader<TextureAccessor<T>, BrdReplicate<T> > brdSrc(texSrc, brd);
             LinearFilter< BorderReader<TextureAccessor<T>, BrdReplicate<T> > > filteredSrc(brdSrc);
 
-            resize<<<grid, block>>>(filteredSrc, dst, fy, fx);
+            resize_linear<<<grid, block>>>(filteredSrc, dst, fy, fx);
         }
 
         cudaSafeCall( cudaGetLastError() );
diff --git a/modules/gpu/test/test_resize.cpp b/modules/gpu/test/test_resize.cpp
index 88e6b1cab7..460e50c2b4 100644
--- a/modules/gpu/test/test_resize.cpp
+++ b/modules/gpu/test/test_resize.cpp
@@ -73,6 +73,28 @@ namespace
         }
     }
 
+    template <typename T, template <typename> class Interpolator>
+    void resizeLinearImpl(const cv::Mat& src, cv::Mat& dst, double fx, double fy)
+    {
+        const int cn = src.channels();
+
+        cv::Size dsize(cv::saturate_cast<int>(src.cols * fx), cv::saturate_cast<int>(src.rows * fy));
+
+        dst.create(dsize, src.type());
+
+        float ifx = static_cast<float>(1.0 / fx);
+        float ify = static_cast<float>(1.0 / fy);
+
+        for (int y = 0; y < dsize.height; ++y)
+        {
+            for (int x = 0; x < dsize.width; ++x)
+            {
+                for (int c = 0; c < cn; ++c)
+                    dst.at<T>(y, x * cn + c) = Interpolator<T>::getValue(src, (y + 0.5f) * ify - 0.5f, (x + 0.5f) * ifx - 0.5f, c, cv::BORDER_REPLICATE);
+            }
+        }
+    }
+
     void resizeGold(const cv::Mat& src, cv::Mat& dst, double fx, double fy, int interpolation)
     {
         typedef void (*func_t)(const cv::Mat& src, cv::Mat& dst, double fx, double fy);
@@ -90,12 +112,12 @@ namespace
 
         static const func_t linear_funcs[] =
         {
-            resizeImpl<unsigned char, LinearInterpolator>,
-            resizeImpl<signed char, LinearInterpolator>,
-            resizeImpl<unsigned short, LinearInterpolator>,
-            resizeImpl<short, LinearInterpolator>,
-            resizeImpl<int, LinearInterpolator>,
-            resizeImpl<float, LinearInterpolator>
+            resizeLinearImpl<unsigned char, LinearInterpolator>,
+            resizeLinearImpl<signed char, LinearInterpolator>,
+            resizeLinearImpl<unsigned short, LinearInterpolator>,
+            resizeLinearImpl<short, LinearInterpolator>,
+            resizeLinearImpl<int, LinearInterpolator>,
+            resizeLinearImpl<float, LinearInterpolator>
         };
 
         static const func_t cubic_funcs[] =
@@ -203,7 +225,15 @@ INSTANTIATE_TEST_CASE_P(GPU_ImgProc, ResizeSameAsHost, testing::Combine(
     DIFFERENT_SIZES,
     testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
     testing::Values(0.3, 0.5),
-    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_AREA)),
+    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_AREA)),
+    WHOLE_SUBMAT));
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc2, ResizeSameAsHost, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
+    testing::Values(0.3, 0.5, 1.5, 2.0),
+    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR)),
     WHOLE_SUBMAT));
 
 #endif // HAVE_CUDA