diff --git a/modules/gpu/src/opencv2/gpu/device/filters.hpp b/modules/gpu/src/opencv2/gpu/device/filters.hpp
index 9362fde390..537d6aff2f 100644
--- a/modules/gpu/src/opencv2/gpu/device/filters.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/filters.hpp
@@ -59,7 +59,7 @@ namespace cv { namespace gpu { namespace device
 
         __device__ __forceinline__ elem_type operator ()(float y, float x) const
         {
-            return src(__float2int_rd(y), __float2int_rd(x));
+            return src(__float2int_rn(y), __float2int_rn(x)); // read the source element at (y, x), each coordinate converted with round-to-nearest
         }
 
         const Ptr2D src;
@@ -78,9 +78,6 @@ namespace cv { namespace gpu { namespace device
 
             work_type out = VecTraits<work_type>::all(0);
 
-            x -= 0.5f;
-            y -= 0.5f;
-
             const int x1 = __float2int_rd(x);
             const int y1 = __float2int_rd(y);
             const int x2 = x1 + 1;
@@ -112,24 +109,47 @@ namespace cv { namespace gpu { namespace device
 
         explicit __host__ __device__ __forceinline__ CubicFilter(const Ptr2D& src_) : src(src_) {}
 
-        static __device__ __forceinline__ work_type cubicInterpolate(typename TypeTraits<work_type>::ParameterType p0, typename TypeTraits<work_type>::ParameterType p1, typename TypeTraits<work_type>::ParameterType p2, typename TypeTraits<work_type>::ParameterType p3, float x)
+        static __device__ __forceinline__ float bicubicCoeff(float x_) // 1-D cubic weight for a sample at signed distance x_ from the query point
         {
-            return p1 + 0.5f * x * (p2 - p0 + x * (2.0f * p0 - 5.0f * p1 + 4.0f * p2 - p3 + x * (3.0f * (p1 - p2) + p3 - p0)));
+            float x = fabsf(x_); // only the magnitude is used below, so the weight is symmetric in x_
+            if (x <= 1.0f)
+            {
+                return x * x * (1.5f * x - 2.5f) + 1.0f; // 1.5x^3 - 2.5x^2 + 1 (Horner form); equals 1 at x = 0 and 0 at x = 1
+            }
+            else if (x < 2.0f)
+            {
+                return x * (x * (-0.5f * x + 2.5f) - 4.0f) + 2.0f; // -0.5x^3 + 2.5x^2 - 4x + 2 (Horner form); equals 0 at x = 1
+            }
+            else
+            {
+                return 0.0f; // samples at distance 2 or more contribute nothing
+            }
         }
 
         __device__ elem_type operator ()(float y, float x) const
         {
-            const int xi = __float2int_rn(x);
-            const int yi = __float2int_rn(y);
+            const float xmin = ::ceilf(x - 2.0f); // first whole-number column inside [x - 2, x + 2]
+            const float xmax = ::floorf(x + 2.0f); // last whole-number column inside [x - 2, x + 2]
+
+            const float ymin = ::ceilf(y - 2.0f); // first whole-number row inside [y - 2, y + 2]
+            const float ymax = ::floorf(y + 2.0f); // last whole-number row inside [y - 2, y + 2]
+
+            work_type sum = VecTraits<work_type>::all(0); // running weighted sum of the source samples
+            float wsum = 0.0f; // running sum of the weights, used to normalize the result
 
-            work_type arr[4];
+            for (float cy = ymin; cy <= ymax; cy += 1.0f)
+            {
+                for (float cx = xmin; cx <= xmax; cx += 1.0f)
+                {
+                    const float w = bicubicCoeff(x - cx) * bicubicCoeff(y - cy); // 2-D weight = x-direction weight times y-direction weight
+                    sum = sum + w * src(__float2int_rd(cy), __float2int_rd(cx)); // cy and cx are whole-valued floats here, so the conversion only changes the type
+                    wsum += w;
+                }
+            }
 
-            arr[0] = cubicInterpolate(saturate_cast<work_type>(src(yi - 2, xi - 2)), saturate_cast<work_type>(src(yi - 2, xi - 1)), saturate_cast<work_type>(src(yi - 2, xi)), saturate_cast<work_type>(src(yi - 2, xi + 1)), (x - xi + 2.0f) / 4.0f);
-            arr[1] = cubicInterpolate(saturate_cast<work_type>(src(yi - 1, xi - 2)), saturate_cast<work_type>(src(yi - 1, xi - 1)), saturate_cast<work_type>(src(yi - 1, xi)), saturate_cast<work_type>(src(yi - 1, xi + 1)), (x - xi + 2.0f) / 4.0f);
-            arr[2] = cubicInterpolate(saturate_cast<work_type>(src(yi    , xi - 2)), saturate_cast<work_type>(src(yi    , xi - 1)), saturate_cast<work_type>(src(yi    , xi)), saturate_cast<work_type>(src(yi    , xi + 1)), (x - xi + 2.0f) / 4.0f);
-            arr[3] = cubicInterpolate(saturate_cast<work_type>(src(yi + 1, xi - 2)), saturate_cast<work_type>(src(yi + 1, xi - 1)), saturate_cast<work_type>(src(yi + 1, xi)), saturate_cast<work_type>(src(yi + 1, xi + 1)), (x - xi + 2.0f) / 4.0f);
+            work_type res = (!wsum)? VecTraits<work_type>::all(0) : sum / wsum; // normalize by the weight sum; yield all-zero instead of dividing by 0
 
-            return saturate_cast<elem_type>(cubicInterpolate(arr[0], arr[1], arr[2], arr[3], (y - yi + 2.0f) / 4.0f));
+            return saturate_cast<elem_type>(res); // convert the work_type result back to elem_type with saturation
         }
 
         const Ptr2D src;
diff --git a/modules/gpu/test/interpolation.hpp b/modules/gpu/test/interpolation.hpp
index 995b91e19b..e38dc4c969 100644
--- a/modules/gpu/test/interpolation.hpp
+++ b/modules/gpu/test/interpolation.hpp
@@ -54,7 +54,7 @@ template <typename T> struct NearestInterpolator
 {
     static T getValue(const cv::Mat& src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
     {
-        return readVal<T>(src, cvFloor(y), cvFloor(x), c, border_type, borderVal);
+        return readVal<T>(src, cvRound(y), cvRound(x), c, border_type, borderVal); // nearest neighbour: read channel c at the rounded (y, x)
     }
 };
 
@@ -62,9 +62,6 @@ template <typename T> struct LinearInterpolator
 {
     static T getValue(const cv::Mat& src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
     {
-        x -= 0.5f;
-        y -= 0.5f;
-
         int x1 = cvFloor(x);
         int y1 = cvFloor(y);
         int x2 = x1 + 1;
@@ -83,37 +80,47 @@ template <typename T> struct LinearInterpolator
 
 template <typename T> struct CubicInterpolator
 {
-    static float getValue(float p[4], float x)
+    static float bicubicCoeff(float x_) // 1-D cubic weight for a sample at signed distance x_ from the query point
     {
-        return static_cast<float>(p[1] + 0.5 * x * (p[2] - p[0] + x*(2.0*p[0] - 5.0*p[1] + 4.0*p[2] - p[3] + x*(3.0*(p[1] - p[2]) + p[3] - p[0]))));
+        float x = fabsf(x_); // only the magnitude is used below, so the weight is symmetric in x_
+        if (x <= 1.0f)
+        {
+            return x * x * (1.5f * x - 2.5f) + 1.0f; // 1.5x^3 - 2.5x^2 + 1 (Horner form)
+        }
+        else if (x < 2.0f)
+        {
+            return x * (x * (-0.5f * x + 2.5f) - 4.0f) + 2.0f; // -0.5x^3 + 2.5x^2 - 4x + 2 (Horner form)
+        }
+        else
+        {
+            return 0.0f; // samples at distance 2 or more contribute nothing
+        }
     }
 
-    static float getValue(float p[4][4], float x, float y)
+    static T getValue(const cv::Mat& src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar()) // weighted average of channel c over the whole-number grid points within 2 of (y, x)
     {
-        float arr[4];
+        const float xmin = ceilf(x - 2.0f); // first whole-number column inside [x - 2, x + 2]
+        const float xmax = floorf(x + 2.0f); // last whole-number column inside [x - 2, x + 2]
 
-        arr[0] = getValue(p[0], x);
-        arr[1] = getValue(p[1], x);
-        arr[2] = getValue(p[2], x);
-        arr[3] = getValue(p[3], x);
+        const float ymin = ceilf(y - 2.0f); // first whole-number row inside [y - 2, y + 2]
+        const float ymax = floorf(y + 2.0f); // last whole-number row inside [y - 2, y + 2]
 
-        return getValue(arr, y);
-    }
+        float sum  = 0.0f; // running weighted sum of the samples
+        float wsum = 0.0f; // running sum of the weights, used to normalize the result
 
-    static T getValue(const cv::Mat& src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
-    {
-        int ix = cvRound(x);
-        int iy = cvRound(y);
-
-        float vals[4][4] =
+        for (float cy = ymin; cy <= ymax; cy += 1.0f)
         {
-            {(float)readVal<T>(src, iy - 2, ix - 2, c, border_type, borderVal), (float)readVal<T>(src, iy - 2, ix - 1, c, border_type, borderVal), (float)readVal<T>(src, iy - 2, ix, c, border_type, borderVal), (float)readVal<T>(src, iy - 2, ix + 1, c, border_type, borderVal)},
-            {(float)readVal<T>(src, iy - 1, ix - 2, c, border_type, borderVal), (float)readVal<T>(src, iy - 1, ix - 1, c, border_type, borderVal), (float)readVal<T>(src, iy - 1, ix, c, border_type, borderVal), (float)readVal<T>(src, iy - 1, ix + 1, c, border_type, borderVal)},
-            {(float)readVal<T>(src, iy    , ix - 2, c, border_type, borderVal), (float)readVal<T>(src, iy    , ix - 1, c, border_type, borderVal), (float)readVal<T>(src, iy    , ix, c, border_type, borderVal), (float)readVal<T>(src, iy    , ix + 1, c, border_type, borderVal)},
-            {(float)readVal<T>(src, iy + 1, ix - 2, c, border_type, borderVal), (float)readVal<T>(src, iy + 1, ix - 1, c, border_type, borderVal), (float)readVal<T>(src, iy + 1, ix, c, border_type, borderVal), (float)readVal<T>(src, iy + 1, ix + 1, c, border_type, borderVal)},
-        };
+            for (float cx = xmin; cx <= xmax; cx += 1.0f)
+            {
+                const float w = bicubicCoeff(x - cx) * bicubicCoeff(y - cy); // 2-D weight = x-direction weight times y-direction weight
+                sum += w * readVal<T>(src, cvFloor(cy), cvFloor(cx), c, border_type, borderVal); // cy and cx are whole-valued floats here, so cvFloor only changes the type
+                wsum += w;
+            }
+        }
 
-        return cv::saturate_cast<T>(getValue(vals, static_cast<float>((x - ix + 2.0) / 4.0), static_cast<float>((y - iy + 2.0) / 4.0)));
+        float res = (!wsum)? 0 : sum / wsum; // normalize by the weight sum; yield 0 instead of dividing by 0
+
+        return cv::saturate_cast<T>(res);
     }
 };
 
diff --git a/modules/gpu/test/test_remap.cpp b/modules/gpu/test/test_remap.cpp
index 84fde5adba..c61a899142 100644
--- a/modules/gpu/test/test_remap.cpp
+++ b/modules/gpu/test/test_remap.cpp
@@ -163,7 +163,7 @@ TEST_P(Remap, Accuracy)
     cv::Mat dst_gold;
     remapGold(src, xmap, ymap, dst_gold, interpolation, borderType, val);
 
-    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-4 : 1.0);
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-3 : 1.0);
 }
 
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Remap, testing::Combine(