diff --git a/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp b/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp index d317b2c449..fd22d2a317 100644 --- a/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp +++ b/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp @@ -157,43 +157,6 @@ CV_CUDEV_SHFL_VEC_INST(double) // shfl_up -template -__device__ __forceinline__ T compatible_shfl_up(T val, uint delta, int width = warpSize) -{ -#if __CUDACC_VER_MAJOR__ < 9 - - return shfl_up(val, delta, width); - -#else // __CUDACC_VER_MAJOR__ < 9 - -#if CV_CUDEV_ARCH >= 700 - return shfl_up_sync(0xFFFFFFFFU, val, delta, width); -#else - const int block_size = Block::blockSize(); - const int residual = block_size & (warpSize - 1); - - if (0 == residual) - return shfl_up_sync(0xFFFFFFFFU, val, delta, width); - else - { - const int n_warps = divUp(block_size, warpSize); - const int warp_id = Warp::warpId(); - - if (warp_id < n_warps - 1) - return shfl_up_sync(0xFFFFFFFFU, val, delta, width); - else - { - // We are at the last threads of a block whose number of threads - // is not a multiple of the warp size - uint mask = (1LU << residual) - 1; - return shfl_up_sync(mask, val, delta, width); - } - } -#endif - -#endif // __CUDACC_VER_MAJOR__ < 9 -} - #if __CUDACC_VER_MAJOR__ >= 9 template @@ -300,6 +263,43 @@ CV_CUDEV_SHFL_UP_VEC_INST(double) #endif +template +__device__ __forceinline__ T compatible_shfl_up(T val, uint delta, int width = warpSize) +{ +#if __CUDACC_VER_MAJOR__ < 9 + + return shfl_up(val, delta, width); + +#else // __CUDACC_VER_MAJOR__ < 9 + +#if CV_CUDEV_ARCH >= 700 + return shfl_up_sync(0xFFFFFFFFU, val, delta, width); +#else + const int block_size = Block::blockSize(); + const int residual = block_size & (warpSize - 1); + + if (0 == residual) + return shfl_up_sync(0xFFFFFFFFU, val, delta, width); + else + { + const int n_warps = divUp(block_size, warpSize); + const int warp_id = Warp::warpId(); + + if (warp_id < n_warps - 1) + return shfl_up_sync(0xFFFFFFFFU, val, delta, width); + else + { + // We are at the last threads of a block whose number of threads + // is not a multiple of the warp size + uint mask = (1LU << residual) - 1; + return shfl_up_sync(mask, val, delta, width); + } + } +#endif + +#endif // __CUDACC_VER_MAJOR__ < 9 +} + // shfl_down __device__ __forceinline__ uchar shfl_down(uchar val, uint delta, int width = warpSize)