cudev - Rework some code

- Use shfl_down, instead of __shfl_down, on warp scan - Remove race conditions
6 years ago · 2b6be3cb0f
parent 4c94804bb0
commit 2b6be3cb0f
4 changed files with 25 additions and 4 deletions
--- a/modules/cudev/include/opencv2/cudev/block/scan.hpp
+++ b/modules/cudev/include/opencv2/cudev/block/scan.hpp
@ -135,6 +135,12 @@ __device__ T blockScanInclusive(T data, volatile T* smem, uint tid)
            }
            else
            {
+                // Read from smem[tid]              (T val = smem[tid])
+                // and write to smem[tid + 1]       (smem[tid + 1] = warpScanInclusive(mask, val))
+                // should be explicitly fenced by "__syncwarp" to get rid of
+                // "cuda-memcheck --tool racecheck" warnings.
+                __syncwarp(mask);
+
                // calculate inclusive scan and write back to shared memory with offset 1
                smem[tid + 1] = warpScanInclusive(mask, val);

@ -197,10 +203,18 @@ __device__ T blockScanInclusive(T data, volatile T* smem, uint tid)

        int quot = THREADS_NUM / WARP_SIZE;

+        T val;
+
        if (tid < quot)
        {
            // grab top warp elements
-            T val = smem[tid];
+            val = smem[tid];
+        }
+
+        __syncthreads();
+
+        if (tid < quot)
+        {

            if (0 == (THREADS_NUM & (WARP_SIZE - 1)))
            {
--- a/modules/cudev/include/opencv2/cudev/grid/detail/integral.hpp
+++ b/modules/cudev/include/opencv2/cudev/grid/detail/integral.hpp
@ -63,7 +63,8 @@ namespace integral_detail
        __shared__ D smem[NUM_SCAN_THREADS * 2];
        __shared__ D carryElem;

-        carryElem = 0;
+        if (threadIdx.x == 0)
+            carryElem = 0;

        __syncthreads();

@ -105,7 +106,8 @@ namespace integral_detail
        __shared__ D smem[NUM_SCAN_THREADS * 2];
        __shared__ D carryElem;

-        carryElem = 0;
+        if (threadIdx.x == 0)
+            carryElem = 0;

        __syncthreads();

--- a/modules/cudev/include/opencv2/cudev/warp/scan.hpp
+++ b/modules/cudev/include/opencv2/cudev/warp/scan.hpp
@ -98,7 +98,7 @@ __device__ T warpScanInclusive(T data, volatile T* smem, uint tid)
    #pragma unroll
    for (int i = 1; i <= (WARP_SIZE / 2); i *= 2)
    {
-        const T val = __shfl_up(data, i, WARP_SIZE);
+        const T val = shfl_up(data, i);
        if (laneId >= i)
              data += val;
    }
--- a/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp
+++ b/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp
@ -250,6 +250,11 @@ __device__ double shfl_up(double val, uint delta, int width = warpSize)
    return __hiloint2double(hi, lo);
 }

+__device__ __forceinline__ unsigned long long shfl_up(unsigned long long val, uint delta, int width = warpSize)
+{
+    return __shfl_up(val, delta, width);
+}
+
 #define CV_CUDEV_SHFL_UP_VEC_INST(input_type) \
    __device__ __forceinline__ input_type ## 1 shfl_up(const input_type ## 1 & val, uint delta, int width = warpSize) \
    { \