From 84022779a1c97e33d0f1b1f43e0d2ea6c4a13405 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <no@email>
Date: Tue, 6 Sep 2011 07:01:23 +0000
Subject: [PATCH] added bicubic interpolation to gpu::remap

---
 modules/gpu/src/cuda/brute_force_matcher.cu   |   2 +-
 modules/gpu/src/cuda/imgproc.cu               |  11 +-
 modules/gpu/src/cuda/surf.cu                  |   1 +
 modules/gpu/src/imgproc.cpp                   |   2 +-
 .../gpu/src/opencv2/gpu/device/filters.hpp    | 135 +++++++++++
 .../gpu/src/opencv2/gpu/device/utility.hpp    | 228 ------------------
 .../src/opencv2/gpu/device/vec_distance.hpp   | 223 +++++++++++++++++
 modules/gpu/test/test_imgproc.cpp             |  26 +-
 samples/gpu/performance/tests.cpp             |  19 +-
 9 files changed, 385 insertions(+), 262 deletions(-)
 create mode 100644 modules/gpu/src/opencv2/gpu/device/filters.hpp
 create mode 100644 modules/gpu/src/opencv2/gpu/device/vec_distance.hpp
diff --git a/modules/gpu/src/cuda/brute_force_matcher.cu b/modules/gpu/src/cuda/brute_force_matcher.cu
index efe3510149..f5afda75b5 100644
--- a/modules/gpu/src/cuda/brute_force_matcher.cu
+++ b/modules/gpu/src/cuda/brute_force_matcher.cu
@@ -42,7 +42,7 @@
 
 #include "internal_shared.hpp"
 #include "opencv2/gpu/device/limits.hpp"
-#include "opencv2/gpu/device/utility.hpp"
+#include "opencv2/gpu/device/vec_distance.hpp"
 
 using namespace cv::gpu;
 using namespace cv::gpu::device;
diff --git a/modules/gpu/src/cuda/imgproc.cu b/modules/gpu/src/cuda/imgproc.cu
index 16fbb2b2ab..537364172b 100644
--- a/modules/gpu/src/cuda/imgproc.cu
+++ b/modules/gpu/src/cuda/imgproc.cu
@@ -45,7 +45,7 @@
 #include "opencv2/gpu/device/vec_traits.hpp"
 #include "opencv2/gpu/device/vec_math.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"
-#include "opencv2/gpu/device/utility.hpp"
+#include "opencv2/gpu/device/filters.hpp"
 
 using namespace cv::gpu;
 using namespace cv::gpu::device;
@@ -186,7 +186,7 @@ namespace cv { namespace gpu { namespace imgproc
     {
         typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D_<T>& dst, const float* borderValue, cudaStream_t stream);
 
-        static const caller_t callers[2][5] = 
+        static const caller_t callers[3][5] = 
         {
             { 
                 RemapDispatcher<PointFilter, BrdReflect101, T>::call, 
@@ -201,6 +201,13 @@ namespace cv { namespace gpu { namespace imgproc
                 RemapDispatcher<LinearFilter, BrdConstant, T>::call, 
                 RemapDispatcher<LinearFilter, BrdReflect, T>::call, 
                 RemapDispatcher<LinearFilter, BrdWrap, T>::call 
+            },
+            { 
+                RemapDispatcher<CubicFilter, BrdReflect101, T>::call, 
+                RemapDispatcher<CubicFilter, BrdReplicate, T>::call, 
+                RemapDispatcher<CubicFilter, BrdConstant, T>::call, 
+                RemapDispatcher<CubicFilter, BrdReflect, T>::call, 
+                RemapDispatcher<CubicFilter, BrdWrap, T>::call 
             }
         };
 
diff --git a/modules/gpu/src/cuda/surf.cu b/modules/gpu/src/cuda/surf.cu
index 0f4d0611bf..ef344964d0 100644
--- a/modules/gpu/src/cuda/surf.cu
+++ b/modules/gpu/src/cuda/surf.cu
@@ -50,6 +50,7 @@
 #include "opencv2/gpu/device/saturate_cast.hpp"
 #include "opencv2/gpu/device/utility.hpp"
 #include "opencv2/gpu/device/functional.hpp"
+#include "opencv2/gpu/device/filters.hpp"
 
 using namespace cv::gpu;
 using namespace cv::gpu::device;
diff --git a/modules/gpu/src/imgproc.cpp b/modules/gpu/src/imgproc.cpp
index 8c6c59013f..19ac1a1514 100644
--- a/modules/gpu/src/imgproc.cpp
+++ b/modules/gpu/src/imgproc.cpp
@@ -131,7 +131,7 @@ void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const Gp
     CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
     CV_Assert(xmap.type() == CV_32F && ymap.type() == CV_32F && xmap.size() == ymap.size());
 
-    CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR);
+    CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
 
     CV_Assert(borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP);
     int gpuBorderType;
diff --git a/modules/gpu/src/opencv2/gpu/device/filters.hpp b/modules/gpu/src/opencv2/gpu/device/filters.hpp
new file mode 100644
index 0000000000..2f8d012117
--- /dev/null
+++ b/modules/gpu/src/opencv2/gpu/device/filters.hpp
@@ -0,0 +1,135 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_GPU_FILTERS_HPP__
+#define __OPENCV_GPU_FILTERS_HPP__
+
+#include "saturate_cast.hpp"
+#include "vec_traits.hpp"
+#include "vec_math.hpp"
+
+namespace cv {  namespace gpu { namespace device
+{
+    template <typename Ptr2D> struct PointFilter // nearest-neighbour sampler over a 2D source accessor of type Ptr2D
+    {
+        typedef typename Ptr2D::elem_type elem_type;
+        typedef float index_type; // operator() takes its coordinates as floats
+
+        explicit __host__ __device__ __forceinline__ PointFilter(const Ptr2D& src_) : src(src_) {}
+        // Returns src at (y, x) after converting each coordinate to int; out-of-range handling is whatever Ptr2D does.
+        __device__ __forceinline__ elem_type operator ()(float y, float x) const
+        {
+            return src(__float2int_rn(y), __float2int_rn(x)); // _rn: round-to-nearest conversion
+        }
+
+        const Ptr2D src; // held by value: a copy of the accessor passed to the constructor
+    };
+
+    template <typename Ptr2D> struct LinearFilter // bilinear sampler over a 2D source accessor of type Ptr2D
+    {
+        typedef typename Ptr2D::elem_type elem_type;
+        typedef float index_type; // operator() takes its coordinates as floats
+
+        explicit __host__ __device__ __forceinline__ LinearFilter(const Ptr2D& src_) : src(src_) {}
+        // Returns the weighted sum of the 2x2 neighbourhood around (y, x), saturated back to elem_type.
+        __device__ __forceinline__ elem_type operator ()(float y, float x) const
+        {
+            typedef typename TypeVec<float, VecTraits<elem_type>::cn>::vec_type work_type; // float accumulator with elem_type's channel count
+
+            work_type out = VecTraits<work_type>::all(0);
+
+            const int x1 = __float2int_rd(x); // _rd: round down (floor), so x1 <= x < x2
+            const int y1 = __float2int_rd(y);
+            const int x2 = x1 + 1;
+            const int y2 = y1 + 1;
+            // Each corner is weighted by the area of the opposite sub-rectangle; the four weights sum to 1.
+            elem_type src_reg = src(y1, x1);
+            out = out + src_reg * ((x2 - x) * (y2 - y));
+
+            src_reg = src(y1, x2);
+            out = out + src_reg * ((x - x1) * (y2 - y));
+
+            src_reg = src(y2, x1);
+            out = out + src_reg * ((x2 - x) * (y - y1));
+
+            src_reg = src(y2, x2);
+            out = out + src_reg * ((x - x1) * (y - y1));
+
+            return saturate_cast<elem_type>(out);
+        }
+
+        const Ptr2D src; // held by value: a copy of the accessor passed to the constructor
+    };
+    
+    template <typename Ptr2D> struct CubicFilter // bicubic (4x4 neighbourhood) sampler over a 2D source accessor of type Ptr2D
+    {
+        typedef typename Ptr2D::elem_type elem_type;
+        typedef float index_type;
+        typedef typename TypeVec<float, VecTraits<elem_type>::cn>::vec_type work_type; // float vector with elem_type's channel count
+
+        explicit __host__ __device__ __forceinline__ CubicFilter(const Ptr2D& src_) : src(src_) {}
+        // Catmull-Rom cubic through p0..p3: yields p1 at x == 0 and p2 at x == 1, so x must be the offset from p1 within [0, 1].
+        static __device__ __forceinline__ work_type cubicInterpolate(const work_type& p0, const work_type& p1, const work_type& p2, const work_type& p3, float x)
+        {
+            return p1 + 0.5f * x * (p2 - p0 + x * (2.0f * p0 - 5.0f * p1 + 4.0f * p2 - p3 + x * (3.0f * (p1 - p2) + p3 - p0)));
+        }
+        // Samples src at (y, x): one horizontal cubic per row yi-1..yi+2, then one vertical cubic over the four row results.
+        __device__ elem_type operator ()(float y, float x) const
+        {
+            const int xi = __float2int_rd(x); // floor, not round-to-nearest: taps xi-1..xi+2 must bracket x with xi <= x < xi+1
+            const int yi = __float2int_rd(y);
+            const float tx = x - xi; // fractional offsets from the p1 tap; non-negative because of the floor above
+            const float ty = y - yi;
+            work_type arr[4];
+            arr[0] = cubicInterpolate(saturate_cast<work_type>(src(yi - 1, xi - 1)), saturate_cast<work_type>(src(yi - 1, xi)), saturate_cast<work_type>(src(yi - 1, xi + 1)), saturate_cast<work_type>(src(yi - 1, xi + 2)), tx);
+            arr[1] = cubicInterpolate(saturate_cast<work_type>(src(yi    , xi - 1)), saturate_cast<work_type>(src(yi    , xi)), saturate_cast<work_type>(src(yi    , xi + 1)), saturate_cast<work_type>(src(yi    , xi + 2)), tx);
+            arr[2] = cubicInterpolate(saturate_cast<work_type>(src(yi + 1, xi - 1)), saturate_cast<work_type>(src(yi + 1, xi)), saturate_cast<work_type>(src(yi + 1, xi + 1)), saturate_cast<work_type>(src(yi + 1, xi + 2)), tx);
+            arr[3] = cubicInterpolate(saturate_cast<work_type>(src(yi + 2, xi - 1)), saturate_cast<work_type>(src(yi + 2, xi)), saturate_cast<work_type>(src(yi + 2, xi + 1)), saturate_cast<work_type>(src(yi + 2, xi + 2)), tx);
+            // The float result can overshoot the input range, hence the saturating cast back to elem_type.
+            return saturate_cast<elem_type>(cubicInterpolate(arr[0], arr[1], arr[2], arr[3], ty));
+        }
+
+        const Ptr2D src; // held by value: a copy of the accessor passed to the constructor
+    };
+}}}
+
+#endif // __OPENCV_GPU_FILTERS_HPP__
diff --git a/modules/gpu/src/opencv2/gpu/device/utility.hpp b/modules/gpu/src/opencv2/gpu/device/utility.hpp
index 67300b4e1f..5b49c4d516 100644
--- a/modules/gpu/src/opencv2/gpu/device/utility.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/utility.hpp
@@ -135,180 +135,6 @@ namespace cv {  namespace gpu { namespace device
         StaticAssert<n >= 8 && n <= 512>::check();
         detail::PredValReductionDispatcher<n <= 64>::reduce<n>(myData, myVal, sdata, sval, tid, pred);
     }
-
-    ///////////////////////////////////////////////////////////////////////////////
-    // Vector Distance
-
-    template <typename T> struct L1Dist
-    {
-        typedef int value_type;
-        typedef int result_type;
-
-        __device__ __forceinline__ L1Dist() : mySum(0) {}
-
-        __device__ __forceinline__ void reduceIter(int val1, int val2)
-        {
-            mySum = __sad(val1, val2, mySum);
-        }
-
-        template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
-        {
-            reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
-        }
-
-        __device__ __forceinline__ operator int() const
-        {
-            return mySum;
-        }
-
-        int mySum;
-    };
-    template <> struct L1Dist<float>
-    {
-        typedef float value_type;
-        typedef float result_type;
-
-        __device__ __forceinline__ L1Dist() : mySum(0.0f) {}
-
-        __device__ __forceinline__ void reduceIter(float val1, float val2)
-        {
-            mySum += ::fabs(val1 - val2);
-        }
-
-        template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
-        {
-            reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
-        }
-
-        __device__ __forceinline__ operator float() const
-        {
-            return mySum;
-        }
-
-        float mySum;
-    };
-
-    struct L2Dist
-    {
-        typedef float value_type;
-        typedef float result_type;
-
-        __device__ __forceinline__ L2Dist() : mySum(0.0f) {}
-
-        __device__ __forceinline__ void reduceIter(float val1, float val2)
-        {
-            float reg = val1 - val2;
-            mySum += reg * reg;
-        }
-
-        template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
-        {
-            reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
-        }
-
-        __device__ __forceinline__ operator float() const
-        {
-            return sqrtf(mySum);
-        }
-
-        float mySum;
-    };
-
-    struct HammingDist
-    {
-        typedef int value_type;
-        typedef int result_type;
-
-        __device__ __forceinline__ HammingDist() : mySum(0) {}
-
-        __device__ __forceinline__ void reduceIter(int val1, int val2)
-        {
-            mySum += __popc(val1 ^ val2);
-        }
-
-        template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
-        {
-            reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
-        }
-
-        __device__ __forceinline__ operator int() const
-        {
-            return mySum;
-        }
-
-        int mySum;
-    };
-
-    // calc distance between two vectors in global memory
-    template <int THREAD_DIM, typename Dist, typename T1, typename T2> 
-    __device__ void calcVecDiffGlobal(const T1* vec1, const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid)
-    {
-        for (int i = tid; i < len; i += THREAD_DIM)
-        {
-            T1 val1;
-            ForceGlob<T1>::Load(vec1, i, val1);
-
-            T2 val2;
-            ForceGlob<T2>::Load(vec2, i, val2);
-
-            dist.reduceIter(val1, val2);
-        }
-
-        dist.reduceAll<THREAD_DIM>(smem, tid);
-    }
-
-    // calc distance between two vectors, first vector is cached in register or shared memory, second vector is in global memory
-    template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T1, typename T2>
-    __device__ __forceinline__ void calcVecDiffCached(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, typename Dist::result_type* smem, int tid)
-    {        
-        detail::VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, LEN_EQ_MAX_LEN>::calc(vecCached, vecGlob, len, dist, tid);
-        
-        dist.reduceAll<THREAD_DIM>(smem, tid);
-    }
-
-    // calc distance between two vectors in global memory
-    template <int THREAD_DIM, typename T1> struct VecDiffGlobal
-    {
-        explicit __device__ __forceinline__ VecDiffGlobal(const T1* vec1_, int = 0, void* = 0, int = 0, int = 0)
-        {
-            vec1 = vec1_;
-        }
-
-        template <typename T2, typename Dist>
-        __device__ __forceinline__ void calc(const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid) const
-        {
-            calcVecDiffGlobal<THREAD_DIM>(vec1, vec2, len, dist, smem, tid);
-        }
-
-        const T1* vec1;
-    };
-
-    // calc distance between two vectors, first vector is cached in register memory, second vector is in global memory
-    template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename U> struct VecDiffCachedRegister
-    {
-        template <typename T1> __device__ __forceinline__ VecDiffCachedRegister(const T1* vec1, int len, U* smem, int glob_tid, int tid)
-        {
-            if (glob_tid < len)
-                smem[glob_tid] = vec1[glob_tid];
-            __syncthreads();
-
-            U* vec1ValsPtr = vec1Vals;
-
-            #pragma unroll
-            for (int i = tid; i < MAX_LEN; i += THREAD_DIM)
-                *vec1ValsPtr++ = smem[i];
-
-            __syncthreads();
-        }
-
-        template <typename T2, typename Dist>
-        __device__ __forceinline__ void calc(const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid) const
-        {
-            calcVecDiffCached<THREAD_DIM, MAX_LEN, LEN_EQ_MAX_LEN>(vec1Vals, vec2, len, dist, smem, tid);
-        }
-
-        U vec1Vals[MAX_LEN / THREAD_DIM];
-    };
     
     ///////////////////////////////////////////////////////////////////////////////
     // Solve linear system
@@ -363,60 +189,6 @@ namespace cv {  namespace gpu { namespace device
 
         return false;
     }
-    
-    ///////////////////////////////////////////////////////////////////////////////
-    // Filters    
-
-    template <typename Ptr2D> struct PointFilter
-    {
-        typedef typename Ptr2D::elem_type elem_type;
-        typedef float index_type;
-
-        explicit __host__ __device__ __forceinline__ PointFilter(const Ptr2D& src_) : src(src_) {}
-         
-        __device__ __forceinline__ elem_type operator ()(float y, float x) const
-        {
-            return src(__float2int_rn(y), __float2int_rn(x));
-        }
-
-        const Ptr2D src;
-    };
-
-    template <typename Ptr2D> struct LinearFilter
-    {
-        typedef typename Ptr2D::elem_type elem_type;
-        typedef float index_type;
-
-        explicit __host__ __device__ __forceinline__ LinearFilter(const Ptr2D& src_) : src(src_) {}
-
-        __device__ __forceinline__ elem_type operator ()(float y, float x) const
-        {
-            typedef typename TypeVec<float, VecTraits<elem_type>::cn>::vec_type work_type;
-
-            work_type out = VecTraits<work_type>::all(0);
-
-            const int x1 = __float2int_rd(x);
-            const int y1 = __float2int_rd(y);
-            const int x2 = x1 + 1;
-            const int y2 = y1 + 1;
-
-            elem_type src_reg = src(y1, x1);
-            out = out + src_reg * ((x2 - x) * (y2 - y));
-
-            src_reg = src(y1, x2);
-            out = out + src_reg * ((x - x1) * (y2 - y));
-
-            src_reg = src(y2, x1);
-            out = out + src_reg * ((x2 - x) * (y - y1));
-
-            src_reg = src(y2, x2);
-            out = out + src_reg * ((x - x1) * (y - y1));
-
-            return saturate_cast<elem_type>(out);
-        }
-
-        const Ptr2D src;
-    };
 }}}
 
 #endif // __OPENCV_GPU_UTILITY_HPP__
diff --git a/modules/gpu/src/opencv2/gpu/device/vec_distance.hpp b/modules/gpu/src/opencv2/gpu/device/vec_distance.hpp
new file mode 100644
index 0000000000..064b74b17b
--- /dev/null
+++ b/modules/gpu/src/opencv2/gpu/device/vec_distance.hpp
@@ -0,0 +1,223 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_GPU_VEC_DISTANCE_HPP__
+#define __OPENCV_GPU_VEC_DISTANCE_HPP__
+
+#include "utility.hpp"
+
+namespace cv {  namespace gpu { namespace device
+{
+
+    template <typename T> struct L1Dist // sum-of-absolute-differences accumulator; generic version works on int values
+    {
+        typedef int value_type;
+        typedef int result_type;
+
+        __device__ __forceinline__ L1Dist() : mySum(0) {}
+        // Adds |val1 - val2| to the running sum (__sad(x, y, z) computes |x - y| + z).
+        __device__ __forceinline__ void reduceIter(int val1, int val2)
+        {
+            mySum = __sad(val1, val2, mySum);
+        }
+        // Forwards mySum to reduce<THREAD_DIM> with a plus functor; presumably a block-wide sum through smem (reduce is defined outside this file).
+        template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
+        {
+            reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
+        }
+        // The distance value is the accumulated sum itself.
+        __device__ __forceinline__ operator int() const
+        {
+            return mySum;
+        }
+
+        int mySum; // running sum, updated by reduceIter and reduceAll
+    };
+    template <> struct L1Dist<float> // float specialization of the sum-of-absolute-differences accumulator
+    {
+        typedef float value_type;
+        typedef float result_type;
+
+        __device__ __forceinline__ L1Dist() : mySum(0.0f) {}
+        // Adds |val1 - val2| to the running sum.
+        __device__ __forceinline__ void reduceIter(float val1, float val2)
+        {
+            mySum += ::fabs(val1 - val2);
+        }
+        // Forwards mySum to reduce<THREAD_DIM> with a plus functor; presumably a block-wide sum through smem (reduce is defined outside this file).
+        template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
+        {
+            reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
+        }
+        // The distance value is the accumulated sum itself.
+        __device__ __forceinline__ operator float() const
+        {
+            return mySum;
+        }
+
+        float mySum; // running sum, updated by reduceIter and reduceAll
+    };
+
+    struct L2Dist // Euclidean distance accumulator: sums squared differences, square root taken on conversion
+    {
+        typedef float value_type;
+        typedef float result_type;
+
+        __device__ __forceinline__ L2Dist() : mySum(0.0f) {}
+        // Adds (val1 - val2)^2 to the running sum.
+        __device__ __forceinline__ void reduceIter(float val1, float val2)
+        {
+            float reg = val1 - val2;
+            mySum += reg * reg;
+        }
+        // Forwards mySum to reduce<THREAD_DIM> with a plus functor; presumably a block-wide sum through smem (reduce is defined outside this file).
+        template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
+        {
+            reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
+        }
+        // Unlike the other accumulators, conversion applies sqrtf: mySum itself holds the squared distance.
+        __device__ __forceinline__ operator float() const
+        {
+            return sqrtf(mySum);
+        }
+
+        float mySum; // running sum of squared differences
+    };
+
+    struct HammingDist // Hamming distance accumulator: counts differing bits between int words
+    {
+        typedef int value_type;
+        typedef int result_type;
+
+        __device__ __forceinline__ HammingDist() : mySum(0) {}
+        // Adds the number of set bits in val1 XOR val2 (__popc is a population count).
+        __device__ __forceinline__ void reduceIter(int val1, int val2)
+        {
+            mySum += __popc(val1 ^ val2);
+        }
+        // Forwards mySum to reduce<THREAD_DIM> with a plus functor; presumably a block-wide sum through smem (reduce is defined outside this file).
+        template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
+        {
+            reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
+        }
+        // The distance value is the accumulated bit count itself.
+        __device__ __forceinline__ operator int() const
+        {
+            return mySum;
+        }
+
+        int mySum; // running count of differing bits
+    };
+
+    // calc distance between two vectors in global memory: thread `tid` feeds elements tid, tid + THREAD_DIM, ... (< len) to dist.reduceIter, then dist.reduceAll is called with smem
+    template <int THREAD_DIM, typename Dist, typename T1, typename T2> 
+    __device__ void calcVecDiffGlobal(const T1* vec1, const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid)
+    {
+        for (int i = tid; i < len; i += THREAD_DIM)
+        {
+            T1 val1;
+            ForceGlob<T1>::Load(vec1, i, val1); // helper defined outside this file; presumably loads vec1[i] into val1 (TODO confirm)
+
+            T2 val2;
+            ForceGlob<T2>::Load(vec2, i, val2);
+
+            dist.reduceIter(val1, val2);
+        }
+        // `template` is required here: reduceAll is a member template of the dependent type Dist.
+        dist.template reduceAll<THREAD_DIM>(smem, tid);
+    }
+
+    // calc distance between two vectors, first vector is cached in register or shared memory, second vector is in global memory
+    template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T1, typename T2>
+    __device__ __forceinline__ void calcVecDiffCached(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, typename Dist::result_type* smem, int tid)
+    {
+        detail::VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, LEN_EQ_MAX_LEN>::calc(vecCached, vecGlob, len, dist, tid); // per-thread accumulation; helper is defined outside this file
+        // `template` is required here: reduceAll is a member template of the dependent type Dist.
+        dist.template reduceAll<THREAD_DIM>(smem, tid);
+    }
+
+    // calc distance between two vectors in global memory (functor form: stores the first vector's pointer, calc() takes the second)
+    template <int THREAD_DIM, typename T1> struct VecDiffGlobal
+    {
+        explicit __device__ __forceinline__ VecDiffGlobal(const T1* vec1_, int = 0, void* = 0, int = 0, int = 0) // the four trailing arguments are accepted but ignored
+        {
+            vec1 = vec1_;
+        }
+        // Delegates to calcVecDiffGlobal with the stored first vector.
+        template <typename T2, typename Dist>
+        __device__ __forceinline__ void calc(const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid) const
+        {
+            calcVecDiffGlobal<THREAD_DIM>(vec1, vec2, len, dist, smem, tid);
+        }
+
+        const T1* vec1; // first vector; only the pointer is stored, nothing is copied
+    };
+
+    // calc distance between two vectors, first vector is cached in register memory, second vector is in global memory
+    template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename U> struct VecDiffCachedRegister
+    {
+        template <typename T1> __device__ __forceinline__ VecDiffCachedRegister(const T1* vec1, int len, U* smem, int glob_tid, int tid)
+        {
+            if (glob_tid < len) // stage vec1 into smem, converting T1 to U; entries at index >= len are not written here
+                smem[glob_tid] = vec1[glob_tid];
+            __syncthreads(); // block barrier outside the branch: every thread of the block must run this constructor
+
+            U* vec1ValsPtr = vec1Vals;
+
+            #pragma unroll
+            for (int i = tid; i < MAX_LEN; i += THREAD_DIM) // NOTE(review): reads smem[i] for i >= len when len < MAX_LEN; confirm those values are never used
+                *vec1ValsPtr++ = smem[i];
+
+            __syncthreads(); // second barrier after all reads of smem; presumably so smem can be reused afterwards (calc passes it on)
+        }
+        // Delegates to calcVecDiffCached with this thread's private copy of its first-vector elements.
+        template <typename T2, typename Dist>
+        __device__ __forceinline__ void calc(const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid) const
+        {
+            calcVecDiffCached<THREAD_DIM, MAX_LEN, LEN_EQ_MAX_LEN>(vec1Vals, vec2, len, dist, smem, tid);
+        }
+
+        U vec1Vals[MAX_LEN / THREAD_DIM]; // this thread's elements tid, tid + THREAD_DIM, ...; sized assuming MAX_LEN is a multiple of THREAD_DIM (TODO confirm)
+    };
+}}}
+
+#endif // __OPENCV_GPU_VEC_DISTANCE_HPP__
diff --git a/modules/gpu/test/test_imgproc.cpp b/modules/gpu/test/test_imgproc.cpp
index b6d132360d..094ed9aa1f 100644
--- a/modules/gpu/test/test_imgproc.cpp
+++ b/modules/gpu/test/test_imgproc.cpp
@@ -210,20 +210,8 @@ struct Remap : testing::TestWithParam< std::tr1::tuple<cv::gpu::DeviceInfo, int,
 
         src = cvtest::randomMat(rng, size, type, 0.0, 256.0, false);
 
-        xmap.create(size, CV_32FC1);
-        ymap.create(size, CV_32FC1);
-
-        for (int y = 0; y < src.rows; ++y)
-        {
-            float* xmap_row = xmap.ptr<float>(y);
-            float* ymap_row = ymap.ptr<float>(y);
-
-            for (int x = 0; x < src.cols; ++x)
-            {
-                xmap_row[x] = src.cols - 1 - x + 10;
-                ymap_row[x] = src.rows - 1 - y + 10;
-            }
-        }
+        xmap = cvtest::randomMat(rng, size, CV_32FC1, -20.0, src.cols + 20, false);
+        ymap = cvtest::randomMat(rng, size, CV_32FC1, -20.0, src.rows + 20, false);
         
         cv::remap(src, dst_gold, xmap, ymap, interpolation, borderType);
     }
@@ -253,13 +241,7 @@ TEST_P(Remap, Accuracy)
         gpuRes.download(dst);
     );
 
-    if (dst_gold.depth() == CV_32F)
-    {
-        dst_gold.convertTo(dst_gold, CV_8U);
-        dst.convertTo(dst, CV_8U);
-    }
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
+    EXPECT_MAT_SIMILAR(dst_gold, dst, 1e-1);
 }
 
 INSTANTIATE_TEST_CASE_P
@@ -272,7 +254,7 @@ INSTANTIATE_TEST_CASE_P
             CV_8UC1, CV_8UC3, CV_8UC4,
             CV_32FC1, CV_32FC3, CV_32FC4
         ),
-        testing::Values((int)cv::INTER_NEAREST, (int)cv::INTER_LINEAR),
+        testing::Values((int)cv::INTER_NEAREST, (int)cv::INTER_LINEAR, (int)cv::INTER_CUBIC),
         testing::Values((int)cv::BORDER_REFLECT101, (int)cv::BORDER_REPLICATE, (int)cv::BORDER_CONSTANT, (int)cv::BORDER_REFLECT, (int)cv::BORDER_WRAP)
     )
 );
diff --git a/samples/gpu/performance/tests.cpp b/samples/gpu/performance/tests.cpp
index d4ea5e6e3a..e03988be38 100644
--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@@ -78,6 +78,9 @@ TEST(remap)
 {
     Mat src, dst, xmap, ymap;
     gpu::GpuMat d_src, d_dst, d_xmap, d_ymap;
+    
+    int interpolation = INTER_LINEAR;
+    int borderMode = BORDER_CONSTANT;
 
     for (int size = 1000; size <= 4000; size *= 2)
     {
@@ -101,7 +104,7 @@ TEST(remap)
         dst.create(xmap.size(), src.type());
 
         CPU_ON;
-        remap(src, dst, xmap, ymap, INTER_LINEAR, BORDER_REPLICATE);
+        remap(src, dst, xmap, ymap, interpolation, borderMode);
         CPU_OFF;
 
         d_src = src;
@@ -110,7 +113,7 @@ TEST(remap)
         d_dst.create(d_xmap.size(), d_src.type());
 
         GPU_ON;
-        gpu::remap(d_src, d_dst, d_xmap, d_ymap, INTER_LINEAR, BORDER_REPLICATE);
+        gpu::remap(d_src, d_dst, d_xmap, d_ymap, interpolation, borderMode);
         GPU_OFF;
     }
 
@@ -136,7 +139,7 @@ TEST(remap)
         dst.create(xmap.size(), src.type());
 
         CPU_ON;
-        remap(src, dst, xmap, ymap, INTER_LINEAR, BORDER_REPLICATE);
+        remap(src, dst, xmap, ymap, interpolation, borderMode);
         CPU_OFF;
 
         d_src = src;
@@ -145,7 +148,7 @@ TEST(remap)
         d_dst.create(d_xmap.size(), d_src.type());
 
         GPU_ON;
-        gpu::remap(d_src, d_dst, d_xmap, d_ymap, INTER_LINEAR, BORDER_REPLICATE);
+        gpu::remap(d_src, d_dst, d_xmap, d_ymap, interpolation, borderMode);
         GPU_OFF;
     }
 
@@ -171,7 +174,7 @@ TEST(remap)
         dst.create(xmap.size(), src.type());
 
         CPU_ON;
-        remap(src, dst, xmap, ymap, INTER_LINEAR, BORDER_REPLICATE);
+        remap(src, dst, xmap, ymap, interpolation, borderMode);
         CPU_OFF;
 
         d_src = src;
@@ -180,7 +183,7 @@ TEST(remap)
         d_dst.create(d_xmap.size(), d_src.type());
 
         GPU_ON;
-        gpu::remap(d_src, d_dst, d_xmap, d_ymap, INTER_LINEAR, BORDER_REPLICATE);
+        gpu::remap(d_src, d_dst, d_xmap, d_ymap, interpolation, borderMode);
         GPU_OFF;
     }
 
@@ -206,7 +209,7 @@ TEST(remap)
         dst.create(xmap.size(), src.type());
 
         CPU_ON;
-        remap(src, dst, xmap, ymap, INTER_LINEAR, BORDER_REPLICATE);
+        remap(src, dst, xmap, ymap, interpolation, borderMode);
         CPU_OFF;
 
         d_src = src;
@@ -215,7 +218,7 @@ TEST(remap)
         d_dst.create(d_xmap.size(), d_src.type());
 
         GPU_ON;
-        gpu::remap(d_src, d_dst, d_xmap, d_ymap, INTER_LINEAR, BORDER_REPLICATE);
+        gpu::remap(d_src, d_dst, d_xmap, d_ymap, interpolation, borderMode);
         GPU_OFF;
     }
 }