minor LBP for GPU

pull/2/head
Anatoly Baksheev 13 years ago
parent 5120d690f0
commit a2430afcac
Changed files:

  1. modules/gpu/src/cascadeclassifier.cpp (4 lines changed)
  2. modules/gpu/src/cuda/lbp.cu (52 lines changed)
  3. modules/gpu/src/opencv2/gpu/device/emulation.hpp (48 lines changed)
  4. modules/gpu/src/opencv2/gpu/device/lbp.hpp (102 lines changed)

@@ -282,7 +282,7 @@ namespace cv { namespace gpu { namespace device
                             DevMem2D_<int4> objects,
                             unsigned int* classified);
-    int connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses);
+    void connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses);
     void bindIntegral(DevMem2Di integral);
     void unbindIntegral();
 }
@@ -294,7 +294,7 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
     CV_Assert(!empty() && scaleFactor > 1 && image.depth() == CV_8U);
     const int defaultObjSearchNum = 100;
-    const float grouping_eps = 0.2;
+    const float grouping_eps = 0.2f;
     if( !objects.empty() && objects.depth() == CV_32S)
         objects.reshape(4, 1);

@@ -216,10 +216,10 @@ namespace cv { namespace gpu { namespace device
     struct Classifier
     {
-        __host__ __device__ __forceinline__ Classifier(const int* _integral, const int _pitch, const Stage* _stages, const ClNode* _nodes, const float* _leaves, const int* _subsets, const uchar4* _features,
-            const int _nstages, const int _clWidth, const int _clHeight, const float _scale, const int _step, const int _subsetSize)
-        : integral(_integral), pitch(_pitch), stages(_stages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), nstages(_nstages), clWidth(_clWidth), clHeight(_clHeight),
-          scale(_scale), step(_step), subsetSize(_subsetSize){}
+        __host__ __device__ __forceinline__ Classifier(const int* _integral, int _pitch, const Stage* _stages, const ClNode* _nodes, const float* _leaves,
+            const int* _subsets, const uchar4* _features, int _nstages, int _clWidth, int _clHeight, float _scale, int _step, int _subsetSize)
+        : integral(_integral), pitch(_pitch), stages(_stages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), nstages(_nstages),
+          clWidth(_clWidth), clHeight(_clHeight), scale(_scale), step(_step), subsetSize(_subsetSize){}
         __device__ __forceinline__ void operator() (int y, int x, DevMem2D_<int4> objects, const unsigned int maxN, unsigned int* n) const
         {
@@ -255,7 +255,7 @@ namespace cv { namespace gpu { namespace device
             rect.z = clWidth;
             rect.w = clHeight;
-#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+#if (__CUDA_ARCH__ < 120)
             int res = __atomicInc(n, maxN);
 #else
             int res = atomicInc(n, maxN);
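Note on the simplified guards in this and the following hunks: shared-memory atomics need compute capability 1.2, so older targets fall back to the software __atomicInc defined in lbp.hpp. Dropping defined (__CUDA_ARCH__) is safe for device-only code because in the host pass, where __CUDA_ARCH__ is undefined and evaluates to 0 inside #if, the preprocessor simply selects the emulated branch that the host never executes. A minimal sketch of the dispatch pattern, with a hypothetical wrapper name:

    // Hypothetical wrapper showing the compile-time dispatch used above:
    // SM 1.2+ gets the hardware atomic, older targets get the software
    // emulation declared in lbp.hpp.
    __device__ __forceinline__ unsigned int incCounter(unsigned int* n, unsigned int maxN)
    {
    #if (__CUDA_ARCH__ < 120)
        return __atomicInc(n, maxN);  // software emulation (tag-and-retry)
    #else
        return atomicInc(n, maxN);    // hardware atomic
    #endif
    }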
@@ -305,7 +305,7 @@ namespace cv { namespace gpu { namespace device
         extern __shared__ int sbuff[];
         int* labels = sbuff;
-        int* rrects = (int*)(sbuff + n);
+        int* rrects = sbuff + n;
         Pr predicate(grouping_eps);
         partition(candidates, n, labels, predicate);
@@ -317,7 +317,7 @@ namespace cv { namespace gpu { namespace device
         __syncthreads();
         int cls = labels[tid];
-#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+#if (__CUDA_ARCH__ < 120)
         __atomicAdd((rrects + cls * 4 + 0), candidates[tid].x);
         __atomicAdd((rrects + cls * 4 + 1), candidates[tid].y);
         __atomicAdd((rrects + cls * 4 + 2), candidates[tid].z);
@@ -332,7 +332,7 @@ namespace cv { namespace gpu { namespace device
         labels[tid] = 0;
         __syncthreads();
-#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+#if (__CUDA_ARCH__ < 120)
         __atomicInc((unsigned int*)labels + cls, n);
 #else
         atomicInc((unsigned int*)labels + cls, n);
@@ -354,13 +354,10 @@ namespace cv { namespace gpu { namespace device
         if (active && active >= groupThreshold)
         {
-            int* r1 = rrects + tid * 4;
-            int4 r_out;
-            r_out.x = r1[0];
-            r_out.y = r1[1];
-            r_out.z = r1[2];
-            r_out.w = r1[3];
-#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+            int* r1 = rrects + tid * 4;
+            int4 r_out = make_int4(r1[0], r1[1], r1[2], r1[3]);
+#if (__CUDA_ARCH__ < 120)
             objects[__atomicInc(nclasses, n)] = r_out;
 #else
             int aidx = atomicInc(nclasses, n);
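The write-out above is a standard atomic compaction: each group that survives the threshold claims a unique slot in objects by atomically bumping the nclasses counter. A self-contained sketch of the idiom (kernel and names hypothetical):

    // Hypothetical sketch: threads whose element passes a test claim a
    // unique, dense output slot via atomicInc, the same idiom used to
    // store the grouped rectangles above. atomicInc returns the old
    // value and wraps at n, so slots stay in [0, n).
    __global__ void compactPassing(const int4* in, int n, int4* out, unsigned int* count)
    {
        int tid = threadIdx.x + blockIdx.x * blockDim.x;
        if (tid >= n) return;
        if (in[tid].w > 0)  // stand-in for the groupThreshold test
        {
            unsigned int slot = atomicInc(count, (unsigned int)n);
            out[slot] = in[tid];
        }
    }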
@@ -371,21 +368,24 @@ namespace cv { namespace gpu { namespace device
     void classifyStumpFixed(const DevMem2Di& integral, const int pitch, const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures,
                             const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize, DevMem2D_<int4> objects, unsigned int* classified)
     {
-        const int THREADS_BLOCK = 256;
-        int work_amount = ceilf(workHeight / (float)step) * ceilf(workWidth / (float)step);
-        int blocks = divUp(work_amount, THREADS_BLOCK);
-        Classifier clr(integral, pitch, (Stage*)mstages.ptr(), (ClNode*)mnodes.ptr(), mleaves.ptr(), msubsets,
-            (uchar4*)mfeatures.ptr(), nstages, clWidth, clHeight, scale, step, subsetSize);
-        lbp_classify_stump<<<blocks, THREADS_BLOCK>>>(clr, objects, objects.cols, classified, workWidth >> 1);
+        int total = ceilf(workHeight / (float)step) * ceilf(workWidth / (float)step);
+        Classifier clr(integral.ptr(), pitch, (Stage*)(mstages.ptr()), (ClNode*)(mnodes.ptr()), mleaves.ptr(), msubsets.ptr(), (uchar4*)(mfeatures.ptr()), nstages, clWidth, clHeight, scale, step, subsetSize);
+        int block = 256;
+        int grid = divUp(total, block);
+        lbp_classify_stump<<<grid, block>>>(clr, objects, objects.cols, classified, workWidth >> 1);
         cudaSafeCall( cudaGetLastError() );
     }

-    int connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
+    void connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
     {
-        int threads = ncandidates;
-        int smem_amount = threads * sizeof(int) + threads * sizeof(int4);
-        disjoin<InSameComponint><<<1, threads, smem_amount>>>((int4*)candidates.ptr(), (int4*)objects.ptr(), ncandidates, groupThreshold, grouping_eps, nclasses);
-        return 0;
+        int block = ncandidates;
+        int smem = block * ( sizeof(int) + sizeof(int4) );
+        disjoin<InSameComponint><<<1, block, smem>>>(candidates, objects, ncandidates, groupThreshold, grouping_eps, nclasses);
+        cudaSafeCall( cudaGetLastError() );
     }
 }
 }}}
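The relaunch arithmetic above is plain ceiling-division grid sizing: one thread per sliding-window position, block threads per block, and enough blocks to cover total. Note also that disjoin is launched as a single block of ncandidates threads, so the grouping step implicitly assumes the candidate count stays below the device's per-block thread limit. For reference, a minimal sketch with a trivial kernel:

    #include <cuda_runtime.h>

    // Ceiling division, the same role divUp plays above.
    inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }

    __global__ void fill(int* data, int total)
    {
        int idx = threadIdx.x + blockIdx.x * blockDim.x;
        if (idx < total)  // the last block may be only partially used
            data[idx] = idx;
    }

    void launch(int* data, int total)
    {
        int block = 256;
        int grid = divUp(total, block);  // enough blocks to cover total
        fill<<<grid, block>>>(data, total);
    }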

@@ -49,17 +49,55 @@ namespace cv { namespace gpu { namespace device
 {
     struct Emulation
     {
-        static __forceinline__ __device__ int Ballot(int predicate, volatile int* cta_buffer)
+        template<int CTA_SIZE>
+        static __forceinline__ __device__ int Ballot(int predicate)
         {
-#if __CUDA_ARCH__ >= 200
-            (void)cta_buffer;
+#if (__CUDA_ARCH__ >= 200)
             return __ballot(predicate);
 #else
+            __shared__ volatile int cta_buffer[CTA_SIZE];
             int tid = threadIdx.x;
             cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
             return warp_reduce(cta_buffer);
 #endif
         }
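Ballot produces a warp-wide bitmask with bit i set when lane i's predicate is non-zero: the hardware __ballot on SM 2.0+, and on older parts an OR-reduction of per-lane masks staged in shared memory, which is why the scratch buffer moved inside the function and is now sized by the new CTA_SIZE template parameter. A hypothetical use:

    // Hypothetical helper: count the lanes in a warp whose value is positive.
    template <int CTA_SIZE>
    __device__ int positiveLanes(int value)
    {
        int mask = Emulation::Ballot<CTA_SIZE>(value > 0);
        return __popc(mask);  // population count of the ballot mask
    }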
+        struct smem
+        {
+            enum { TAG_MASK = (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U };
+            template<typename T>
+            static __device__ __forceinline__ T atomicInc(T* address, T val)
+            {
+#if (__CUDA_ARCH__ < 120)
+                T count;
+                unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
+                do
+                {
+                    count = *address & TAG_MASK;
+                    count = tag | (count + 1);
+                    *address = count;
+                } while (*address != count);
+                return (count & TAG_MASK) - 1;
+#else
+                return ::atomicInc(address, val);
+#endif
+            }
+            template<typename T>
+            static __device__ __forceinline__ void atomicAdd(T* address, T val)
+            {
+#if (__CUDA_ARCH__ < 120)
+                T count;
+                unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
+                do
+                {
+                    count = *address & TAG_MASK;
+                    count = tag | (count + val);
+                    *address = count;
+                } while (*address != count);
+#else
+                ::atomicAdd(address, val);
+#endif
+            }
+            template<typename T>
+            static __device__ __forceinline__ T atomicMin(T* address, T val)
+            {
+#if (__CUDA_ARCH__ < 120)
+                T count = min(*address, val);
+                do
+                {
+                    *address = count;
+                } while (*address > count);
+                return count;
+#else
+                return ::atomicMin(address, val);
+#endif
+            }
+        };
     };
 }}} // namespace cv { namespace gpu { namespace device
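The smem wrappers centralize the pre-SM 1.2 shared-memory atomic emulation that lbp.hpp still open-codes below (the truncated #if bodies above are filled in from those definitions). The emulated path is the classic tag-and-retry trick: each thread stamps its warp lane into the top 5 bits of the word, writes, and retries until it reads back its own stamp, which proves its update survived the warp's racing writes. A commented restatement of the idea (standalone, hypothetical name):

    // Top 5 bits hold a lane tag, the low 27 bits hold the actual value.
    #define TAG_MASK ((1U << ((sizeof(unsigned int) << 3) - 5U)) - 1U)

    template <typename T>
    __device__ T taggedInc(T* address)  // hypothetical name
    {
        T count;
        // The shift keeps only the low 5 bits of threadIdx.x, i.e. the lane id.
        unsigned int tag = threadIdx.x << ((sizeof(unsigned int) << 3) - 5U);
        do
        {
            count = *address & TAG_MASK;  // current value, tag stripped
            count = tag | (count + 1);    // proposed value, stamped with our lane
            *address = count;             // racing write: one lane per warp wins
        } while (*address != count);      // retry until our stamp survives
        return (count & TAG_MASK) - 1;    // previous value, like atomicInc
    }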

@@ -50,45 +50,46 @@ namespace cv { namespace gpu { namespace device {
 namespace lbp{
 #define TAG_MASK ( (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U )
-template<typename T>
-__device__ __forceinline__ T __atomicInc(T* address, T val)
-{
-    T count;
-    unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
-    do
-    {
-        count = *address & TAG_MASK;
-        count = tag | (count + 1);
-        *address = count;
-    } while (*address != count);
-    return (count & TAG_MASK) - 1;
-}
-
-template<typename T>
-__device__ __forceinline__ void __atomicAdd(T* address, T val)
-{
-    T count;
-    unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
-    do
-    {
-        count = *address & TAG_MASK;
-        count = tag | (count + val);
-        *address = count;
-    } while (*address != count);
-}
-
-template<typename T>
-__device__ __forceinline__ T __atomicMin(T* address, T val)
-{
-    T count = min(*address, val);
-    do
-    {
-        *address = count;
-    } while (*address > count);
-    return count;
-}
+template<typename T>
+__device__ __forceinline__ T __atomicInc(T* address, T val)
+{
+    T count;
+    unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
+    do
+    {
+        count = *address & TAG_MASK;
+        count = tag | (count + 1);
+        *address = count;
+    } while (*address != count);
+    return (count & TAG_MASK) - 1;
+}
+
+template<typename T>
+__device__ __forceinline__ void __atomicAdd(T* address, T val)
+{
+    T count;
+    unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
+    do
+    {
+        count = *address & TAG_MASK;
+        count = tag | (count + val);
+        *address = count;
+    } while (*address != count);
+}
+
+template<typename T>
+__device__ __forceinline__ T __atomicMin(T* address, T val)
+{
+    T count = min(*address, val);
+    do
+    {
+        *address = count;
+    } while (*address > count);
+    return count;
+}

 struct Stage
 {
@@ -112,7 +113,7 @@ __device__ __forceinline__ T __atomicMin(T* address, T val)
         __device__ __forceinline__ bool operator()(const int4& r1, const int4& r2) const
         {
-            float delta = eps * (min(r1.z, r2.z) + min(r1.w, r2.w)) * 0.5;
+            float delta = eps * (min(r1.z, r2.z) + min(r1.w, r2.w)) * 0.5f;
             return abs(r1.x - r2.x) <= delta && abs(r1.y - r2.y) <= delta
                 && abs(r1.x + r1.z - r2.x - r2.z) <= delta && abs(r1.y + r1.w - r2.y - r2.w) <= delta;
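The 0.5f suffix keeps the averaging in single precision rather than promoting through double, matching the 0.2f fix in cascadeclassifier.cpp above. As a worked example of the predicate: with eps = 0.2 and two 100x100 rectangles, delta = 0.2 * (100 + 100) * 0.5 = 20, so the corners may differ by up to 20 pixels in each coordinate and the rectangles are still treated as one group, mirroring the eps semantics of cv::groupRectangles.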
@@ -134,22 +135,15 @@ __device__ __forceinline__ T __atomicMin(T* address, T val)
                 int p = labels[tid];
                 int q = labels[id];
-                if (p < q)
-                {
-#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
-                    __atomicMin(labels + id, p);
-#else
-                    atomicMin(labels + id, p);
-#endif
-                }
-                else if (p > q)
-                {
-#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
-                    __atomicMin(labels + tid, q);
-#else
-                    atomicMin(labels + tid, q);
-#endif
-                }
+                if (p != q)
+                {
+                    int m = min(p, q);
+#if (__CUDA_ARCH__ < 120)
+                    __atomicMin(labels + id, m);
+#else
+                    atomicMin(labels + id, m);
+#endif
+                }
             }
         }
         __syncthreads();
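Folding the two branches into a single atomicMin on labels[id] preserves the result because the scan is symmetric: when thread tid sees p > q against partner id, it now does nothing (labels[id] already holds the smaller label), but thread id scanning back over tid hits the p < q case and lowers labels[tid] the same way. A sequential analogue of the min-label propagation, as a hypothetical host-side sketch:

    #include <algorithm>
    #include <utility>
    #include <vector>

    // Repeatedly pull each element's label down to the smallest label among
    // its neighbours until a fixed point is reached; connected elements end
    // up sharing the minimum label of their component.
    void propagateMinLabels(std::vector<int>& labels,
                            const std::vector<std::pair<int, int> >& edges)
    {
        bool changed = true;
        while (changed)
        {
            changed = false;
            for (size_t i = 0; i < edges.size(); ++i)
            {
                int a = edges[i].first, b = edges[i].second;
                int m = std::min(labels[a], labels[b]);
                if (labels[a] != m) { labels[a] = m; changed = true; }
                if (labels[b] != m) { labels[b] = m; changed = true; }
            }
        }
    }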
