diff --git a/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp b/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp
index d317b2c449..fd22d2a317 100644
--- a/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp
+++ b/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp
@@ -157,43 +157,6 @@ CV_CUDEV_SHFL_VEC_INST(double)
 
 // shfl_up
 
-template <typename T>
-__device__ __forceinline__ T compatible_shfl_up(T val, uint delta, int width = warpSize)
-{
-#if __CUDACC_VER_MAJOR__ < 9
-
-    return shfl_up(val, delta, width);
-
-#else // __CUDACC_VER_MAJOR__ < 9
-
-#if CV_CUDEV_ARCH >= 700
-    return shfl_up_sync(0xFFFFFFFFU, val, delta, width);
-#else
-    const int block_size = Block::blockSize();
-    const int residual = block_size & (warpSize - 1);
-
-    if (0 == residual)
-        return shfl_up_sync(0xFFFFFFFFU, val, delta, width);
-    else
-    {
-        const int n_warps = divUp(block_size, warpSize);
-        const int warp_id = Warp::warpId();
-
-        if (warp_id < n_warps - 1)
-            return shfl_up_sync(0xFFFFFFFFU, val, delta, width);
-        else
-        {
-            // We are at the last threads of a block whose number of threads
-            // is not a multiple of the warp size
-            uint mask = (1LU << residual) - 1;
-            return shfl_up_sync(mask, val, delta, width);
-        }
-    }
-#endif
-
-#endif // __CUDACC_VER_MAJOR__ < 9
-}
-
 #if __CUDACC_VER_MAJOR__ >= 9
 
 template <typename T>
@@ -300,6 +263,43 @@ CV_CUDEV_SHFL_UP_VEC_INST(double)
 
 #endif
 
+template <typename T>
+__device__ __forceinline__ T compatible_shfl_up(T val, uint delta, int width = warpSize)
+{
+#if __CUDACC_VER_MAJOR__ < 9
+
+    return shfl_up(val, delta, width);
+
+#else // __CUDACC_VER_MAJOR__ < 9
+
+#if CV_CUDEV_ARCH >= 700
+    return shfl_up_sync(0xFFFFFFFFU, val, delta, width);
+#else
+    const int block_size = Block::blockSize();
+    const int residual = block_size & (warpSize - 1);
+
+    if (0 == residual)
+        return shfl_up_sync(0xFFFFFFFFU, val, delta, width);
+    else
+    {
+        const int n_warps = divUp(block_size, warpSize);
+        const int warp_id = Warp::warpId();
+
+        if (warp_id < n_warps - 1)
+            return shfl_up_sync(0xFFFFFFFFU, val, delta, width);
+        else
+        {
+            // We are at the last threads of a block whose number of threads
+            // is not a multiple of the warp size
+            uint mask = (1LU << residual) - 1;
+            return shfl_up_sync(mask, val, delta, width);
+        }
+    }
+#endif
+
+#endif // __CUDACC_VER_MAJOR__ < 9
+}
+
 // shfl_down
 
 __device__ __forceinline__ uchar shfl_down(uchar val, uint delta, int width = warpSize)