[*] Fixed two bugs in the reduction functor: out-of-bounds shared memory access, and missing volatile on GF100 and newer

pull/13383/head
Anton Obukhov 13 years ago
parent 4378f398c7
commit ebc3043c86
modules/gpu/src/nvidia/NCVHaarObjectDetection.cu (2 changes)
modules/gpu/src/nvidia/core/NCVAlg.hpp (56 changes)
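
The two fixes are easiest to see in isolation. Once the active threads of the reduction fit in a single warp, the code drops __syncthreads() between steps and relies on warp lockstep; without a volatile qualifier, the compiler on GF100 (Fermi) and newer may keep the shared-memory values in registers and combine stale data. Separately, the tail of the reduction let threads 16..31 index past the end of the shared array. A minimal standalone sketch of both fixes follows (a hypothetical kernel for illustration, not the patched code; on much later architectures, Volta onward, __syncwarp() would also be needed):

// Launch with a single warp: warpReduceSketch<<<1, 32>>>(d_out);
__global__ void warpReduceSketch(int *d_out)
{
    __shared__ int sdata[32];
    sdata[threadIdx.x] = threadIdx.x;   // each of the 32 threads seeds one slot
    __syncthreads();

    // Volatile fix: the steps below run inside one warp with no
    // __syncthreads() between them; without volatile, the compiler may
    // cache sdata values in registers and reduce stale data.
    volatile int *v = sdata;

    // Bounds fix: without the threadIdx.x < 16 guard, threads 16..31 would
    // read v[threadIdx.x + 16], i.e. indices 32..47, past the array end.
    if (threadIdx.x < 16)
    {
        v[threadIdx.x] += v[threadIdx.x + 16];
        v[threadIdx.x] += v[threadIdx.x + 8];
        v[threadIdx.x] += v[threadIdx.x + 4];
        v[threadIdx.x] += v[threadIdx.x + 2];
        v[threadIdx.x] += v[threadIdx.x + 1];
    }

    if (threadIdx.x == 0)
    {
        *d_out = v[0];  // 0 + 1 + ... + 31 == 496
    }
}

The diff below applies the same two ideas to subReduce: the shared array is accessed through a volatile pointer, with an assign() helper on each functor so copies also go through volatile pointers, and the in-warp tail of the reduction is guarded by threadIdx.x < 16.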

@@ -451,7 +451,7 @@ __global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStr
     }
 }
 
 template <NcvBool tbCacheTextureIImg,
           NcvBool tbCacheTextureCascade,
           NcvBool tbDoAtomicCompaction>

@@ -64,40 +64,56 @@ static T divUp(T a, T b)
 
 template<typename T>
 struct functorAddValues
 {
-    static __device__ __inline__ void reduce(T &in1out, T &in2)
+    static __device__ __inline__ void assign(volatile T *dst, volatile T *src)
+    {
+        //Works only for integral types. If you see compiler error here, then you have to specify how to copy your object as a set of integral fields.
+        *dst = *src;
+    }
+    static __device__ __inline__ void reduce(volatile T &in1out, const volatile T &in2)
     {
         in1out += in2;
     }
 };
 
 template<typename T>
 struct functorMinValues
 {
-    static __device__ __inline__ void reduce(T &in1out, T &in2)
-    {
+    static __device__ __inline__ void assign(volatile T *dst, volatile T *src)
+    {
+        //Works only for integral types. If you see compiler error here, then you have to specify how to copy your object as a set of integral fields.
+        *dst = *src;
+    }
+    static __device__ __inline__ void reduce(volatile T &in1out, const volatile T &in2)
+    {
         in1out = in1out > in2 ? in2 : in1out;
     }
 };
 
 template<typename T>
 struct functorMaxValues
 {
-    static __device__ __inline__ void reduce(T &in1out, T &in2)
-    {
+    static __device__ __inline__ void assign(volatile T *dst, volatile T *src)
+    {
+        //Works only for integral types. If you see compiler error here, then you have to specify how to copy your object as a set of integral fields.
+        *dst = *src;
+    }
+    static __device__ __inline__ void reduce(volatile T &in1out, const volatile T &in2)
+    {
         in1out = in1out > in2 ? in1out : in2;
     }
 };
 
 template<typename Tdata, class Tfunc, Ncv32u nThreads>
 static __device__ Tdata subReduce(Tdata threadElem)
 {
     Tfunc functor;
 
-    __shared__ Tdata reduceArr[nThreads];
-    reduceArr[threadIdx.x] = threadElem;
+    __shared__ Tdata _reduceArr[nThreads];
+    volatile Tdata *reduceArr = _reduceArr;
+    functor.assign(reduceArr + threadIdx.x, &threadElem);
     __syncthreads();
 
     if (nThreads >= 256 && threadIdx.x < 128)
@@ -118,18 +134,20 @@ static __device__ Tdata subReduce(Tdata threadElem)
     {
         functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 32]);
     }
-    if (nThreads >= 32)
+    if (nThreads >= 32 && threadIdx.x < 16)
     {
         functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 16]);
+        functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 8]);
+        functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 4]);
+        functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 2]);
+        functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 1]);
     }
-    functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 8]);
-    functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 4]);
-    functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 2]);
-    functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 1]);
-    }
 
     __syncthreads();
 
-    return reduceArr[0];
+    Tdata reduceRes;
+    functor.assign(&reduceRes, reduceArr);
+    return reduceRes;
 }
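
For reference, a hypothetical caller of the fixed subReduce (the kernel and buffer names are invented for illustration; only subReduce and functorAddValues come from NCVAlg.hpp):

// Each thread contributes one element; every thread receives the block-wide
// sum, and thread 0 writes it out.
template <Ncv32u nThreads>
__global__ void blockSumSketch(const Ncv32u *d_in, Ncv32u *d_out)
{
    Ncv32u threadElem = d_in[blockIdx.x * nThreads + threadIdx.x];

    Ncv32u blockSum =
        subReduce<Ncv32u, functorAddValues<Ncv32u>, nThreads>(threadElem);

    if (threadIdx.x == 0)
    {
        d_out[blockIdx.x] = blockSum;
    }
}

// Host side, e.g. 128 threads per block:
//     blockSumSketch<128><<<numBlocks, 128>>>(d_in, d_out);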
