From a2430afcac2defd2d7f31a4e47d091be88ff2f0d Mon Sep 17 00:00:00 2001
From: Anatoly Baksheev
Date: Sat, 14 Jul 2012 16:23:56 +0000
Subject: [PATCH] minor LBP for GPU

---
 modules/gpu/src/cascadeclassifier.cpp            |   4 +-
 modules/gpu/src/cuda/lbp.cu                      |  52 ++++-----
 .../gpu/src/opencv2/gpu/device/emulation.hpp     |  48 ++++++++-
 modules/gpu/src/opencv2/gpu/device/lbp.hpp       | 102 +++++++++---------
 4 files changed, 119 insertions(+), 87 deletions(-)

diff --git a/modules/gpu/src/cascadeclassifier.cpp b/modules/gpu/src/cascadeclassifier.cpp
index 0621300109..fd99db34c1 100644
--- a/modules/gpu/src/cascadeclassifier.cpp
+++ b/modules/gpu/src/cascadeclassifier.cpp
@@ -282,7 +282,7 @@ namespace cv { namespace gpu { namespace device
                                 DevMem2D_<int4> objects,
                                 unsigned int* classified);
 
-        int  connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects,int groupThreshold, float grouping_eps, unsigned int* nclasses);
+        void connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects,int groupThreshold, float grouping_eps, unsigned int* nclasses);
 
         void bindIntegral(DevMem2Di integral);
         void unbindIntegral();
     }
@@ -294,7 +294,7 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
     CV_Assert(!empty() && scaleFactor > 1 && image.depth() == CV_8U);
 
     const int defaultObjSearchNum = 100;
-    const float grouping_eps = 0.2;
+    const float grouping_eps = 0.2f;
 
     if( !objects.empty() && objects.depth() == CV_32S)
         objects.reshape(4, 1);
diff --git a/modules/gpu/src/cuda/lbp.cu b/modules/gpu/src/cuda/lbp.cu
index 4efeab5ac8..42ddd036d2 100644
--- a/modules/gpu/src/cuda/lbp.cu
+++ b/modules/gpu/src/cuda/lbp.cu
@@ -216,10 +216,10 @@ namespace cv { namespace gpu { namespace device
 
         struct Classifier
         {
-            __host__ __device__ __forceinline__ Classifier(const int* _integral, const int _pitch, const Stage* _stages, const ClNode* _nodes, const float* _leaves, const int* _subsets, const uchar4* _features,
-                const int _nstages, const int _clWidth, const int _clHeight, const float _scale, const int _step, const int _subsetSize)
-            : integral(_integral), pitch(_pitch), stages(_stages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), nstages(_nstages), clWidth(_clWidth), clHeight(_clHeight),
-              scale(_scale), step(_step), subsetSize(_subsetSize){}
+            __host__ __device__ __forceinline__ Classifier(const int* _integral, int _pitch, const Stage* _stages, const ClNode* _nodes, const float* _leaves,
+                const int* _subsets, const uchar4* _features, int _nstages, int _clWidth, int _clHeight, float _scale, int _step, int _subsetSize)
+            : integral(_integral), pitch(_pitch), stages(_stages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), nstages(_nstages),
+              clWidth(_clWidth), clHeight(_clHeight), scale(_scale), step(_step), subsetSize(_subsetSize){}
 
             __device__ __forceinline__ void operator() (int y, int x, DevMem2D_<int4> objects, const unsigned int maxN, unsigned int* n) const
             {
@@ -255,7 +255,7 @@ namespace cv { namespace gpu { namespace device
                 rect.z = clWidth;
                 rect.w = clHeight;
 
-#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+#if (__CUDA_ARCH__ < 120)
                 int res = __atomicInc(n, maxN);
 #else
                 int res = atomicInc(n, maxN);
@@ -305,7 +305,7 @@ namespace cv { namespace gpu { namespace device
 
             extern __shared__ int sbuff[];
             int* labels = sbuff;
-            int* rrects = (int*)(sbuff + n);
+            int* rrects = sbuff + n;
 
             Pr predicate(grouping_eps);
             partition(candidates, n, labels, predicate);
@@ -317,7 +317,7 @@ namespace cv { namespace gpu { namespace device
             __syncthreads();
 
             int cls = labels[tid];
-#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+#if (__CUDA_ARCH__ < 120)
             __atomicAdd((rrects + cls * 4 + 0), candidates[tid].x);
             __atomicAdd((rrects + cls * 4 + 1), candidates[tid].y);
             __atomicAdd((rrects + cls * 4 + 2), candidates[tid].z);
@@ -332,7 +332,7 @@ namespace cv { namespace gpu { namespace device
             labels[tid] = 0;
 
             __syncthreads();
-#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+#if (__CUDA_ARCH__ < 120)
             __atomicInc((unsigned int*)labels + cls, n);
 #else
             atomicInc((unsigned int*)labels + cls, n);
@@ -354,13 +354,10 @@ namespace cv { namespace gpu { namespace device
 
             if (active && active >= groupThreshold)
             {
-                int* r1 = rrects + tid * 4;
-                int4 r_out;
-                r_out.x = r1[0];
-                r_out.y = r1[1];
-                r_out.z = r1[2];
-                r_out.w = r1[3];
-#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+                int* r1 = rrects + tid * 4;
+                int4 r_out = make_int4(r1[0], r1[1], r1[2], r1[3]);
+
+#if (__CUDA_ARCH__ < 120)
                 objects[__atomicInc(nclasses, n)] = r_out;
 #else
                 int aidx = atomicInc(nclasses, n);
@@ -371,21 +368,24 @@ namespace cv { namespace gpu { namespace device
         void classifyStumpFixed(const DevMem2Di& integral, const int pitch, const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures,
                                 const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize, DevMem2D_<int4> objects, unsigned int* classified)
-        {
-            const int THREADS_BLOCK = 256;
-            int work_amount = ceilf(workHeight / (float)step) * ceilf(workWidth / (float)step);
-            int blocks = divUp(work_amount, THREADS_BLOCK);
+        {
+            Classifier clr(integral, pitch, (Stage*)mstages.ptr(), (ClNode*)mnodes.ptr(), mleaves.ptr(), msubsets,
+                           (uchar4*)mfeatures.ptr(), nstages, clWidth, clHeight, scale, step, subsetSize);
+
+            int total = ceilf(workHeight / (float)step) * ceilf(workWidth / (float)step);
 
-            Classifier clr(integral.ptr(), pitch, (Stage*)(mstages.ptr()), (ClNode*)(mnodes.ptr()), mleaves.ptr(), msubsets.ptr(), (uchar4*)(mfeatures.ptr()), nstages, clWidth, clHeight, scale, step, subsetSize);
-            lbp_classify_stump<<<blocks, THREADS_BLOCK>>>(clr, objects, objects.cols, classified, workWidth >> 1);
+            int block = 256;
+            int grid = divUp(total, block);
+            lbp_classify_stump<<<grid, block>>>(clr, objects, objects.cols, classified, workWidth >> 1);
+            cudaSafeCall( cudaGetLastError() );
         }
 
-        int connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
+        void connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
         {
-            int threads = ncandidates;
-            int smem_amount = threads * sizeof(int) + threads * sizeof(int4);
-            disjoin<<<1, threads, smem_amount>>>((int4*)candidates.ptr(), (int4*)objects.ptr(), ncandidates, groupThreshold, grouping_eps, nclasses);
-            return 0;
+            int block = ncandidates;
+            int smem  = block * ( sizeof(int) + sizeof(int4) );
+            disjoin<<<1, block, smem>>>(candidates, objects, ncandidates, groupThreshold, grouping_eps, nclasses);
+            cudaSafeCall( cudaGetLastError() );
         }
     }
 }}}
\ No newline at end of file
diff --git a/modules/gpu/src/opencv2/gpu/device/emulation.hpp b/modules/gpu/src/opencv2/gpu/device/emulation.hpp
index 9b4de6c1a5..f3923a358f 100644
--- a/modules/gpu/src/opencv2/gpu/device/emulation.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/emulation.hpp
@@ -49,17 +49,55 @@ namespace cv { namespace gpu { namespace device
 {
     struct Emulation
     {
-        static __forceinline__ __device__ int Ballot(int predicate, volatile int* cta_buffer)
+        template<int CTA_SIZE>
+        static __forceinline__ __device__ int Ballot(int predicate)
         {
-        #if __CUDA_ARCH__ >= 200
-            (void)cta_buffer;
+#if (__CUDA_ARCH__ >= 200)
             return __ballot(predicate);
-        #else
+#else
+            __shared__ volatile int cta_buffer[CTA_SIZE];
+            int tid = threadIdx.x;
             cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
             return warp_reduce(cta_buffer);
-        #endif
+#endif
         }
+
+        struct smem
+        {
+            enum { TAG_MASK = (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U };
+
+            template<typename T>
+            static __device__ __forceinline__ T atomicInc(T* address, T val)
+            {
+#if (__CUDA_ARCH__ < 120)
+
+#else
+
+#endif
+            }
+
+            template<typename T>
+            static __device__ __forceinline__ void atomicAdd(T* address, T val)
+            {
+#if (__CUDA_ARCH__ < 120)
+
+#else
+
+#endif
+            }
+
+            template<typename T>
+            __device__ __forceinline__ T __atomicMin(T* address, T val)
+            {
+#if (__CUDA_ARCH__ < 120)
+
+#else
+
+#endif
+            }
+        };
     };
 }}} // namespace cv { namespace gpu { namespace device
diff --git a/modules/gpu/src/opencv2/gpu/device/lbp.hpp b/modules/gpu/src/opencv2/gpu/device/lbp.hpp
index b3cf6dc277..8a7aa0eb92 100644
--- a/modules/gpu/src/opencv2/gpu/device/lbp.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/lbp.hpp
@@ -50,45 +50,46 @@ namespace cv { namespace gpu { namespace device {
 namespace lbp{
 
 #define TAG_MASK ( (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U )
-template<typename T>
-__device__ __forceinline__ T __atomicInc(T* address, T val)
-{
-    T count;
-    unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
-    do
-    {
-        count = *address & TAG_MASK;
-        count = tag | (count + 1);
-        *address = count;
-    } while (*address != count);
-
-    return (count & TAG_MASK) - 1;
-}
-
-template<typename T>
-__device__ __forceinline__ void __atomicAdd(T* address, T val)
-{
-    T count;
-    unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
-    do
-    {
-        count = *address & TAG_MASK;
-        count = tag | (count + val);
-        *address = count;
-    } while (*address != count);
-}
-
-template<typename T>
-__device__ __forceinline__ T __atomicMin(T* address, T val)
-{
-    T count = min(*address, val);
-    do
-    {
-        *address = count;
-    } while (*address > count);
-
-    return count;
-}
+
+    template<typename T>
+    __device__ __forceinline__ T __atomicInc(T* address, T val)
+    {
+        T count;
+        unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
+        do
+        {
+            count = *address & TAG_MASK;
+            count = tag | (count + 1);
+            *address = count;
+        } while (*address != count);
+
+        return (count & TAG_MASK) - 1;
+    }
+
+    template<typename T>
+    __device__ __forceinline__ void __atomicAdd(T* address, T val)
+    {
+        T count;
+        unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
+        do
+        {
+            count = *address & TAG_MASK;
+            count = tag | (count + val);
+            *address = count;
+        } while (*address != count);
+    }
+
+    template<typename T>
+    __device__ __forceinline__ T __atomicMin(T* address, T val)
+    {
+        T count = min(*address, val);
+        do
+        {
+            *address = count;
+        } while (*address > count);
+
+        return count;
+    }
 
     struct Stage
     {
@@ -112,7 +113,7 @@ __device__ __forceinline__ T __atomicMin(T* address, T val)
 
         __device__ __forceinline__ bool operator()(const int4& r1, const int4& r2) const
         {
-            float delta = eps * (min(r1.z, r2.z) + min(r1.w, r2.w)) * 0.5;
+            float delta = eps * (min(r1.z, r2.z) + min(r1.w, r2.w)) * 0.5f;
 
             return abs(r1.x - r2.x) <= delta && abs(r1.y - r2.y) <= delta
                 && abs(r1.x + r1.z - r2.x - r2.z) <= delta && abs(r1.y + r1.w - r2.y - r2.w) <= delta;
@@ -134,22 +135,15 @@ __device__ __forceinline__ T __atomicMin(T* address, T val)
                 int p = labels[tid];
                 int q = labels[id];
-                if (p < q)
-                {
-#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
-                    __atomicMin(labels + id, p);
-#else
-                    atomicMin(labels + id, p);
-#endif
-                }
-                else if (p > q)
-                {
-#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
-                    __atomicMin(labels + tid, q);
+                if (p != q)
+                {
+                    int m = min(p, q);
+#if (__CUDA_ARCH__ < 120)
+                    __atomicMin(labels + id, m);
 #else
-                    atomicMin(labels + tid, q);
+                    atomicMin(labels + id, m);
 #endif
-                }
+                }
             }
         }
         __syncthreads();
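
Note for reviewers (not part of the patch): the `__CUDA_ARCH__ < 120` branches above fall back to the tag-based software atomics defined in lbp.hpp, because shared-memory atomics only exist from compute capability 1.2 onward. The sketch below lifts that tag trick into a self-contained .cu file so it can be compiled and inspected in isolation. The helper name tagged_inc, the test kernel and the host harness are illustrative assumptions, not code from this commit, and the scheme is only safe among the threads of a single warp, which is how the LBP kernels use it.

// Minimal sketch, assuming the same tag encoding as lbp::__atomicInc in this patch.
#include <cstdio>
#include <cuda_runtime.h>

// Upper 5 bits of the word hold a per-lane tag, lower 27 bits hold the value.
#define TAG_MASK ( (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U )

// Software increment of a shared-memory counter for devices without shared-memory
// atomics (__CUDA_ARCH__ < 120). Every contending lane writes "tag | (value + 1)";
// shared-memory writes from one warp are serialized, so exactly one lane sees its
// own tagged value survive and leaves the loop, while the others retry.
__device__ unsigned int tagged_inc(volatile unsigned int* address)
{
    unsigned int count;
    unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
    do
    {
        count = *address & TAG_MASK;   // strip the previous winner's tag
        count = tag | (count + 1);     // propose value+1 stamped with our lane id
        *address = count;
    } while (*address != count);       // retry until our write is the visible one

    return (count & TAG_MASK) - 1;     // value the counter held before our increment
}

// One-warp test: 32 lanes increment the same shared counter; the result must be 32.
__global__ void test_tagged_inc(unsigned int* out)
{
    __shared__ unsigned int counter;
    if (threadIdx.x == 0) counter = 0;
    __syncthreads();

    tagged_inc(&counter);
    __syncthreads();

    if (threadIdx.x == 0) *out = counter & TAG_MASK;
}

int main()
{
    unsigned int h_out = 0, *d_out = 0;
    cudaMalloc(&d_out, sizeof(unsigned int));
    test_tagged_inc<<<1, 32>>>(d_out);   // single warp, so each lane's tag is unique
    cudaMemcpy(&h_out, d_out, sizeof(unsigned int), cudaMemcpyDeviceToHost);
    printf("counter = %u (expected 32)\n", h_out);
    cudaFree(d_out);
    return 0;
}

The `Emulation::smem` stubs added in emulation.hpp are left empty in this commit; presumably their `__CUDA_ARCH__ < 120` branches would host this same tag scheme while the `#else` branches forward to the hardware atomics, but that is not implemented here.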