diff --git a/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu b/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu
index e0c766a3b6..2c6f65727e 100644
--- a/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu
+++ b/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu
@@ -59,6 +59,7 @@
 #include
 #include "NCV.hpp"
+#include "NCVAlg.hpp"
 #include "NPP_staging/NPP_staging.hpp"
 #include "NCVRuntimeTemplates.hpp"
 #include "NCVHaarObjectDetection.hpp"

@@ -84,11 +85,6 @@ inline __device__ T warpScanInclusive(T idata, volatile T *s_Data)
     pos += K_WARP_SIZE;
     s_Data[pos] = idata;

-    //for(Ncv32u offset = 1; offset < K_WARP_SIZE; offset <<= 1)
-    //{
-    //    s_Data[pos] += s_Data[pos - offset];
-    //}
-
     s_Data[pos] += s_Data[pos - 1];
     s_Data[pos] += s_Data[pos - 2];
     s_Data[pos] += s_Data[pos - 4];
@@ -234,60 +230,6 @@ __device__ Ncv32u getElemIImg(Ncv32u x, Ncv32u *d_IImg)
 }


-__device__ Ncv32f reduceSpecialization(Ncv32f partialSum)
-{
-    __shared__ volatile Ncv32f reductor[NUM_THREADS_CLASSIFIERPARALLEL];
-    reductor[threadIdx.x] = partialSum;
-    __syncthreads();
-
-#if defined CPU_FP_COMPLIANCE
-    if (!threadIdx.x)
-    {
-        Ncv32f sum = 0.0f;
-        for (int i=0; i<NUM_THREADS_CLASSIFIERPARALLEL; i++)
-        {
-            sum += reductor[i];
-        }
-        reductor[0] = sum;
-    }
-#else
-#if NUM_THREADS_CLASSIFIERPARALLEL_LOG2 >= 8
-    if (threadIdx.x < 128)
-    {
-        reductor[threadIdx.x] += reductor[threadIdx.x + 128];
-    }
-    __syncthreads();
-#endif
-#if NUM_THREADS_CLASSIFIERPARALLEL_LOG2 >= 7
-    if (threadIdx.x < 64)
-    {
-        reductor[threadIdx.x] += reductor[threadIdx.x + 64];
-    }
-    __syncthreads();
-#endif
-
-    if (threadIdx.x < 32)
-    {
-#if NUM_THREADS_CLASSIFIERPARALLEL_LOG2 >= 6
-        reductor[threadIdx.x] += reductor[threadIdx.x + 32];
-#endif
-#if NUM_THREADS_CLASSIFIERPARALLEL_LOG2 >= 5
-        reductor[threadIdx.x] += reductor[threadIdx.x + 16];
-#endif
-        reductor[threadIdx.x] += reductor[threadIdx.x + 8];
-        reductor[threadIdx.x] += reductor[threadIdx.x + 4];
-        reductor[threadIdx.x] += reductor[threadIdx.x + 2];
-        reductor[threadIdx.x] += reductor[threadIdx.x + 1];
-    }
-#endif
-
-    __syncthreads();
-
-    return reductor[0];
-}
-
-
 __device__ Ncv32u d_outMaskPosition;
@@ -623,7 +565,14 @@ __global__ void applyHaarClassifierClassifierParallel(Ncv32u *d_IImg, Ncv32u IIm
         curRootNodeOffset += NUM_THREADS_CLASSIFIERPARALLEL;
     }

-    Ncv32f finalStageSum = reduceSpecialization(curStageSum);
+    struct functorAddValues
+    {
+        __device__ void reduce(Ncv32f &in1out, Ncv32f &in2)
+        {
+            in1out += in2;
+        }
+    };
+    Ncv32f finalStageSum = subReduce<Ncv32f, functorAddValues, NUM_THREADS_CLASSIFIERPARALLEL>(curStageSum);

     if (finalStageSum < stageThreshold)
     {
diff --git a/modules/gpu/src/nvidia/core/NCVAlg.hpp b/modules/gpu/src/nvidia/core/NCVAlg.hpp
new file mode 100644
index 0000000000..0dda7c1dcc
--- /dev/null
+++ b/modules/gpu/src/nvidia/core/NCVAlg.hpp
@@ -0,0 +1,97 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef _ncv_alg_hpp_
+#define _ncv_alg_hpp_
+
+#include "NCV.hpp"
+
+
+template <class T>
+static T divUp(T a, T b)
+{
+    return (a + b - 1) / b;
+}
+
+
+template <typename Tdata, class Tfunc, Ncv32u nThreads>
+static __device__ Tdata subReduce(Tdata threadElem)
+{
+    Tfunc functor;
+
+    __shared__ Tdata reduceArr[nThreads];
+    reduceArr[threadIdx.x] = threadElem;
+    __syncthreads();
+
+    if (nThreads >= 256 && threadIdx.x < 128)
+    {
+        functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 128]);
+    }
+    __syncthreads();
+
+    if (nThreads >= 128 && threadIdx.x < 64)
+    {
+        functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 64]);
+    }
+    __syncthreads();
+
+    if (threadIdx.x < 32)
+    {
+        if (nThreads >= 64)
+        {
+            functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 32]);
+        }
+        if (nThreads >= 32)
+        {
+            functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 16]);
+        }
+        functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 8]);
+        functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 4]);
+        functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 2]);
+        functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 1]);
+    }
+
+    __syncthreads();
+    return reduceArr[0];
+}
+
+
+#endif //_ncv_alg_hpp_
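
Usage note: a minimal sketch of how the new subReduce helper is invoked from a kernel. The kernel name, functor name, and the 256-thread block size below are illustrative assumptions; the patch's own call site uses functorAddValues with NUM_THREADS_CLASSIFIERPARALLEL inside applyHaarClassifierClassifierParallel.

// Illustrative only: block-wide sum of one float per thread via subReduce
// from NCVAlg.hpp. Assumes the kernel is launched with 256 threads per block.
#include "NCVAlg.hpp"

struct functorAdd
{
    __device__ void reduce(Ncv32f &in1out, Ncv32f &in2)
    {
        in1out += in2;   // combine the partner thread's partial sum
    }
};

__global__ void blockSumKernel(const Ncv32f *d_in, Ncv32f *d_out)
{
    // Each thread contributes one element; subReduce folds them in shared memory.
    Ncv32f myVal = d_in[blockIdx.x * blockDim.x + threadIdx.x];
    Ncv32f blockSum = subReduce<Ncv32f, functorAdd, 256>(myVal);

    if (threadIdx.x == 0)
    {
        d_out[blockIdx.x] = blockSum;   // thread 0 writes the block's total
    }
}

The third template argument must match the launch block size, since subReduce allocates shared memory for nThreads lanes and synchronizes the whole block.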