[*] Fixed two bugs in the reduction functor: out-of-bounds shared memory access, and missing volatile on GF100 and newer

pull/13383/head
Anton Obukhov 13 years ago
parent 4378f398c7
commit ebc3043c86
modules/gpu/src/nvidia/NCVHaarObjectDetection.cu (2 changes)
modules/gpu/src/nvidia/core/NCVAlg.hpp (56 changes)
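
The two fixes are easiest to see in isolation. Once the active threads of the reduction fit in a single warp, the code drops __syncthreads() between steps and relies on warp lockstep; without a volatile qualifier, the compiler on GF100 (Fermi) and newer may keep the shared-memory values in registers and combine stale data. Separately, the tail of the reduction let threads 16..31 index past the end of the shared array. A minimal standalone sketch of both fixes follows (a hypothetical kernel for illustration, not the patched code; on much later architectures, Volta onward, __syncwarp() would also be needed):

// Launch with a single warp: warpReduceSketch<<<1, 32>>>(d_out);
__global__ void warpReduceSketch(int *d_out)
{
    __shared__ int sdata[32];
    sdata[threadIdx.x] = threadIdx.x;   // each of the 32 threads seeds one slot
    __syncthreads();

    // Volatile fix: the steps below run inside one warp with no
    // __syncthreads() between them; without volatile, the compiler may
    // cache sdata values in registers and reduce stale data.
    volatile int *v = sdata;

    // Bounds fix: without the threadIdx.x < 16 guard, threads 16..31 would
    // read v[threadIdx.x + 16], i.e. indices 32..47, past the array end.
    if (threadIdx.x < 16)
    {
        v[threadIdx.x] += v[threadIdx.x + 16];
        v[threadIdx.x] += v[threadIdx.x + 8];
        v[threadIdx.x] += v[threadIdx.x + 4];
        v[threadIdx.x] += v[threadIdx.x + 2];
        v[threadIdx.x] += v[threadIdx.x + 1];
    }

    if (threadIdx.x == 0)
    {
        *d_out = v[0];  // 0 + 1 + ... + 31 == 496
    }
}

The diff below applies the same two ideas to subReduce: the shared array is accessed through a volatile pointer, with an assign() helper on each functor so copies also go through volatile pointers, and the in-warp tail of the reduction is guarded by threadIdx.x < 16.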

@@ -451,7 +451,7 @@ __global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStr
     }
 }
 
 template <NcvBool tbCacheTextureIImg,
           NcvBool tbCacheTextureCascade,
           NcvBool tbDoAtomicCompaction>

@@ -64,40 +64,56 @@ static T divUp(T a, T b)
 
 template<typename T>
 struct functorAddValues
 {
-    static __device__ __inline__ void reduce(T &in1out, T &in2)
+    static __device__ __inline__ void assign(volatile T *dst, volatile T *src)
+    {
+        //Works only for integral types. If you see compiler error here, then you have to specify how to copy your object as a set of integral fields.
+        *dst = *src;
+    }
+    static __device__ __inline__ void reduce(volatile T &in1out, const volatile T &in2)
     {
         in1out += in2;
     }
 };
 
 template<typename T>
 struct functorMinValues
 {
-    static __device__ __inline__ void reduce(T &in1out, T &in2)
-    {
+    static __device__ __inline__ void assign(volatile T *dst, volatile T *src)
+    {
+        //Works only for integral types. If you see compiler error here, then you have to specify how to copy your object as a set of integral fields.
+        *dst = *src;
+    }
+    static __device__ __inline__ void reduce(volatile T &in1out, const volatile T &in2)
+    {
         in1out = in1out > in2 ? in2 : in1out;
     }
 };
 
 template<typename T>
 struct functorMaxValues
 {
-    static __device__ __inline__ void reduce(T &in1out, T &in2)
-    {
+    static __device__ __inline__ void assign(volatile T *dst, volatile T *src)
+    {
+        //Works only for integral types. If you see compiler error here, then you have to specify how to copy your object as a set of integral fields.
+        *dst = *src;
+    }
+    static __device__ __inline__ void reduce(volatile T &in1out, const volatile T &in2)
+    {
         in1out = in1out > in2 ? in1out : in2;
     }
 };
 
 template<typename Tdata, class Tfunc, Ncv32u nThreads>
 static __device__ Tdata subReduce(Tdata threadElem)
 {
     Tfunc functor;
 
-    __shared__ Tdata reduceArr[nThreads];
-    reduceArr[threadIdx.x] = threadElem;
+    __shared__ Tdata _reduceArr[nThreads];
+    volatile Tdata *reduceArr = _reduceArr;
+    functor.assign(reduceArr + threadIdx.x, &threadElem);
     __syncthreads();
 
     if (nThreads >= 256 && threadIdx.x < 128)
@@ -118,18 +134,20 @@ static __device__ Tdata subReduce(Tdata threadElem)
     {
         functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 32]);
     }
-    if (nThreads >= 32)
+    if (nThreads >= 32 && threadIdx.x < 16)
     {
         functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 16]);
+        functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 8]);
+        functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 4]);
+        functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 2]);
+        functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 1]);
     }
-    functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 8]);
-    functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 4]);
-    functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 2]);
-    functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 1]);
-    }
 
     __syncthreads();
 
-    return reduceArr[0];
+    Tdata reduceRes;
+    functor.assign(&reduceRes, reduceArr);
+    return reduceRes;
 }
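
For reference, a hypothetical caller of the fixed subReduce (the kernel and buffer names are invented for illustration; only subReduce and functorAddValues come from NCVAlg.hpp):

// Each thread contributes one element; every thread receives the block-wide
// sum, and thread 0 writes it out.
template <Ncv32u nThreads>
__global__ void blockSumSketch(const Ncv32u *d_in, Ncv32u *d_out)
{
    Ncv32u threadElem = d_in[blockIdx.x * nThreads + threadIdx.x];

    Ncv32u blockSum =
        subReduce<Ncv32u, functorAddValues<Ncv32u>, nThreads>(threadElem);

    if (threadIdx.x == 0)
    {
        d_out[blockIdx.x] = blockSum;
    }
}

// Host side, e.g. 128 threads per block:
//     blockSumSketch<128><<<numBlocks, 128>>>(d_in, d_out);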
