minor LBP for GPU

pull/2/head
Anatoly Baksheev 13 years ago
parent 5120d690f0
commit a2430afcac
Changed files:

  1. modules/gpu/src/cascadeclassifier.cpp (4 lines changed)
  2. modules/gpu/src/cuda/lbp.cu (52 lines changed)
  3. modules/gpu/src/opencv2/gpu/device/emulation.hpp (48 lines changed)
  4. modules/gpu/src/opencv2/gpu/device/lbp.hpp (102 lines changed)

@@ -282,7 +282,7 @@ namespace cv { namespace gpu { namespace device
                             DevMem2D_<int4> objects,
                             unsigned int* classified);
-    int connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses);
+    void connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses);
     void bindIntegral(DevMem2Di integral);
     void unbindIntegral();
 }
@@ -294,7 +294,7 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
     CV_Assert(!empty() && scaleFactor > 1 && image.depth() == CV_8U);
     const int defaultObjSearchNum = 100;
-    const float grouping_eps = 0.2;
+    const float grouping_eps = 0.2f;
     if( !objects.empty() && objects.depth() == CV_32S)
         objects.reshape(4, 1);

@@ -216,10 +216,10 @@ namespace cv { namespace gpu { namespace device
     struct Classifier
     {
-        __host__ __device__ __forceinline__ Classifier(const int* _integral, const int _pitch, const Stage* _stages, const ClNode* _nodes, const float* _leaves, const int* _subsets, const uchar4* _features,
-            const int _nstages, const int _clWidth, const int _clHeight, const float _scale, const int _step, const int _subsetSize)
-        : integral(_integral), pitch(_pitch), stages(_stages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), nstages(_nstages), clWidth(_clWidth), clHeight(_clHeight),
-          scale(_scale), step(_step), subsetSize(_subsetSize){}
+        __host__ __device__ __forceinline__ Classifier(const int* _integral, int _pitch, const Stage* _stages, const ClNode* _nodes, const float* _leaves,
+            const int* _subsets, const uchar4* _features, int _nstages, int _clWidth, int _clHeight, float _scale, int _step, int _subsetSize)
+        : integral(_integral), pitch(_pitch), stages(_stages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), nstages(_nstages),
+          clWidth(_clWidth), clHeight(_clHeight), scale(_scale), step(_step), subsetSize(_subsetSize){}
         __device__ __forceinline__ void operator() (int y, int x, DevMem2D_<int4> objects, const unsigned int maxN, unsigned int* n) const
         {
@@ -255,7 +255,7 @@ namespace cv { namespace gpu { namespace device
             rect.z = clWidth;
             rect.w = clHeight;
-#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+#if (__CUDA_ARCH__ < 120)
             int res = __atomicInc(n, maxN);
 #else
             int res = atomicInc(n, maxN);
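Note on the simplified guards in this and the following hunks: shared-memory atomics need compute capability 1.2, so older targets fall back to the software __atomicInc defined in lbp.hpp. Dropping defined (__CUDA_ARCH__) is safe for device-only code because in the host pass, where __CUDA_ARCH__ is undefined and evaluates to 0 inside #if, the preprocessor simply selects the emulated branch that the host never executes. A minimal sketch of the dispatch pattern, with a hypothetical wrapper name:

    // Hypothetical wrapper showing the compile-time dispatch used above:
    // SM 1.2+ gets the hardware atomic, older targets get the software
    // emulation declared in lbp.hpp.
    __device__ __forceinline__ unsigned int incCounter(unsigned int* n, unsigned int maxN)
    {
    #if (__CUDA_ARCH__ < 120)
        return __atomicInc(n, maxN);  // software emulation (tag-and-retry)
    #else
        return atomicInc(n, maxN);    // hardware atomic
    #endif
    }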
@@ -305,7 +305,7 @@ namespace cv { namespace gpu { namespace device
         extern __shared__ int sbuff[];
         int* labels = sbuff;
-        int* rrects = (int*)(sbuff + n);
+        int* rrects = sbuff + n;
         Pr predicate(grouping_eps);
         partition(candidates, n, labels, predicate);
@@ -317,7 +317,7 @@ namespace cv { namespace gpu { namespace device
         __syncthreads();
         int cls = labels[tid];
-#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+#if (__CUDA_ARCH__ < 120)
         __atomicAdd((rrects + cls * 4 + 0), candidates[tid].x);
         __atomicAdd((rrects + cls * 4 + 1), candidates[tid].y);
         __atomicAdd((rrects + cls * 4 + 2), candidates[tid].z);
@@ -332,7 +332,7 @@ namespace cv { namespace gpu { namespace device
         labels[tid] = 0;
         __syncthreads();
-#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+#if (__CUDA_ARCH__ < 120)
         __atomicInc((unsigned int*)labels + cls, n);
 #else
         atomicInc((unsigned int*)labels + cls, n);
@@ -354,13 +354,10 @@ namespace cv { namespace gpu { namespace device
         if (active && active >= groupThreshold)
         {
-            int* r1 = rrects + tid * 4;
-            int4 r_out;
-            r_out.x = r1[0];
-            r_out.y = r1[1];
-            r_out.z = r1[2];
-            r_out.w = r1[3];
-#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+            int* r1 = rrects + tid * 4;
+            int4 r_out = make_int4(r1[0], r1[1], r1[2], r1[3]);
+#if (__CUDA_ARCH__ < 120)
             objects[__atomicInc(nclasses, n)] = r_out;
 #else
             int aidx = atomicInc(nclasses, n);
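The write-out above is a standard atomic compaction: each group that survives the threshold claims a unique slot in objects by atomically bumping the nclasses counter. A self-contained sketch of the idiom (kernel and names hypothetical):

    // Hypothetical sketch: threads whose element passes a test claim a
    // unique, dense output slot via atomicInc, the same idiom used to
    // store the grouped rectangles above. atomicInc returns the old
    // value and wraps at n, so slots stay in [0, n).
    __global__ void compactPassing(const int4* in, int n, int4* out, unsigned int* count)
    {
        int tid = threadIdx.x + blockIdx.x * blockDim.x;
        if (tid >= n) return;
        if (in[tid].w > 0)  // stand-in for the groupThreshold test
        {
            unsigned int slot = atomicInc(count, (unsigned int)n);
            out[slot] = in[tid];
        }
    }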
@@ -371,21 +368,24 @@ namespace cv { namespace gpu { namespace device
     void classifyStumpFixed(const DevMem2Di& integral, const int pitch, const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures,
                             const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize, DevMem2D_<int4> objects, unsigned int* classified)
     {
-        const int THREADS_BLOCK = 256;
-        int work_amount = ceilf(workHeight / (float)step) * ceilf(workWidth / (float)step);
-        int blocks = divUp(work_amount, THREADS_BLOCK);
-        Classifier clr(integral, pitch, (Stage*)mstages.ptr(), (ClNode*)mnodes.ptr(), mleaves.ptr(), msubsets,
-            (uchar4*)mfeatures.ptr(), nstages, clWidth, clHeight, scale, step, subsetSize);
-        lbp_classify_stump<<<blocks, THREADS_BLOCK>>>(clr, objects, objects.cols, classified, workWidth >> 1);
+        int total = ceilf(workHeight / (float)step) * ceilf(workWidth / (float)step);
+        Classifier clr(integral.ptr(), pitch, (Stage*)(mstages.ptr()), (ClNode*)(mnodes.ptr()), mleaves.ptr(), msubsets.ptr(), (uchar4*)(mfeatures.ptr()), nstages, clWidth, clHeight, scale, step, subsetSize);
+        int block = 256;
+        int grid = divUp(total, block);
+        lbp_classify_stump<<<grid, block>>>(clr, objects, objects.cols, classified, workWidth >> 1);
         cudaSafeCall( cudaGetLastError() );
     }

-    int connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
+    void connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
     {
-        int threads = ncandidates;
-        int smem_amount = threads * sizeof(int) + threads * sizeof(int4);
-        disjoin<InSameComponint><<<1, threads, smem_amount>>>((int4*)candidates.ptr(), (int4*)objects.ptr(), ncandidates, groupThreshold, grouping_eps, nclasses);
-        return 0;
+        int block = ncandidates;
+        int smem = block * ( sizeof(int) + sizeof(int4) );
+        disjoin<InSameComponint><<<1, block, smem>>>(candidates, objects, ncandidates, groupThreshold, grouping_eps, nclasses);
+        cudaSafeCall( cudaGetLastError() );
     }
 }
 }}}
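The relaunch arithmetic above is plain ceiling-division grid sizing: one thread per sliding-window position, block threads per block, and enough blocks to cover total. Note also that disjoin is launched as a single block of ncandidates threads, so the grouping step implicitly assumes the candidate count stays below the device's per-block thread limit. For reference, a minimal sketch with a trivial kernel:

    #include <cuda_runtime.h>

    // Ceiling division, the same role divUp plays above.
    inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }

    __global__ void fill(int* data, int total)
    {
        int idx = threadIdx.x + blockIdx.x * blockDim.x;
        if (idx < total)  // the last block may be only partially used
            data[idx] = idx;
    }

    void launch(int* data, int total)
    {
        int block = 256;
        int grid = divUp(total, block);  // enough blocks to cover total
        fill<<<grid, block>>>(data, total);
    }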

@@ -49,17 +49,55 @@ namespace cv { namespace gpu { namespace device
 {
     struct Emulation
     {
-        static __forceinline__ __device__ int Ballot(int predicate, volatile int* cta_buffer)
+        template<int CTA_SIZE>
+        static __forceinline__ __device__ int Ballot(int predicate)
         {
-#if __CUDA_ARCH__ >= 200
-            (void)cta_buffer;
+#if (__CUDA_ARCH__ >= 200)
             return __ballot(predicate);
 #else
+            __shared__ volatile int cta_buffer[CTA_SIZE];
             int tid = threadIdx.x;
             cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
             return warp_reduce(cta_buffer);
 #endif
         }
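Ballot produces a warp-wide bitmask with bit i set when lane i's predicate is non-zero: the hardware __ballot on SM 2.0+, and on older parts an OR-reduction of per-lane masks staged in shared memory, which is why the scratch buffer moved inside the function and is now sized by the new CTA_SIZE template parameter. A hypothetical use:

    // Hypothetical helper: count the lanes in a warp whose value is positive.
    template <int CTA_SIZE>
    __device__ int positiveLanes(int value)
    {
        int mask = Emulation::Ballot<CTA_SIZE>(value > 0);
        return __popc(mask);  // population count of the ballot mask
    }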
+        struct smem
+        {
+            enum { TAG_MASK = (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U };
+            template<typename T>
+            static __device__ __forceinline__ T atomicInc(T* address, T val)
+            {
+#if (__CUDA_ARCH__ < 120)
+                T count;
+                unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
+                do
+                {
+                    count = *address & TAG_MASK;
+                    count = tag | (count + 1);
+                    *address = count;
+                } while (*address != count);
+                return (count & TAG_MASK) - 1;
+#else
+                return ::atomicInc(address, val);
+#endif
+            }
+            template<typename T>
+            static __device__ __forceinline__ void atomicAdd(T* address, T val)
+            {
+#if (__CUDA_ARCH__ < 120)
+                T count;
+                unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
+                do
+                {
+                    count = *address & TAG_MASK;
+                    count = tag | (count + val);
+                    *address = count;
+                } while (*address != count);
+#else
+                ::atomicAdd(address, val);
+#endif
+            }
+            template<typename T>
+            static __device__ __forceinline__ T atomicMin(T* address, T val)
+            {
+#if (__CUDA_ARCH__ < 120)
+                T count = min(*address, val);
+                do
+                {
+                    *address = count;
+                } while (*address > count);
+                return count;
+#else
+                return ::atomicMin(address, val);
+#endif
+            }
+        };
     };
 }}} // namespace cv { namespace gpu { namespace device
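The smem wrappers centralize the pre-SM 1.2 shared-memory atomic emulation that lbp.hpp still open-codes below (the truncated #if bodies above are filled in from those definitions). The emulated path is the classic tag-and-retry trick: each thread stamps its warp lane into the top 5 bits of the word, writes, and retries until it reads back its own stamp, which proves its update survived the warp's racing writes. A commented restatement of the idea (standalone, hypothetical name):

    // Top 5 bits hold a lane tag, the low 27 bits hold the actual value.
    #define TAG_MASK ((1U << ((sizeof(unsigned int) << 3) - 5U)) - 1U)

    template <typename T>
    __device__ T taggedInc(T* address)  // hypothetical name
    {
        T count;
        // The shift keeps only the low 5 bits of threadIdx.x, i.e. the lane id.
        unsigned int tag = threadIdx.x << ((sizeof(unsigned int) << 3) - 5U);
        do
        {
            count = *address & TAG_MASK;  // current value, tag stripped
            count = tag | (count + 1);    // proposed value, stamped with our lane
            *address = count;             // racing write: one lane per warp wins
        } while (*address != count);      // retry until our stamp survives
        return (count & TAG_MASK) - 1;    // previous value, like atomicInc
    }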

@@ -50,45 +50,46 @@ namespace cv { namespace gpu { namespace device {
 namespace lbp{
 #define TAG_MASK ( (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U )
-template<typename T>
-__device__ __forceinline__ T __atomicInc(T* address, T val)
-{
-    T count;
-    unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
-    do
-    {
-        count = *address & TAG_MASK;
-        count = tag | (count + 1);
-        *address = count;
-    } while (*address != count);
-    return (count & TAG_MASK) - 1;
-}
-
-template<typename T>
-__device__ __forceinline__ void __atomicAdd(T* address, T val)
-{
-    T count;
-    unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
-    do
-    {
-        count = *address & TAG_MASK;
-        count = tag | (count + val);
-        *address = count;
-    } while (*address != count);
-}
-
-template<typename T>
-__device__ __forceinline__ T __atomicMin(T* address, T val)
-{
-    T count = min(*address, val);
-    do
-    {
-        *address = count;
-    } while (*address > count);
-    return count;
-}
+template<typename T>
+__device__ __forceinline__ T __atomicInc(T* address, T val)
+{
+    T count;
+    unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
+    do
+    {
+        count = *address & TAG_MASK;
+        count = tag | (count + 1);
+        *address = count;
+    } while (*address != count);
+    return (count & TAG_MASK) - 1;
+}
+
+template<typename T>
+__device__ __forceinline__ void __atomicAdd(T* address, T val)
+{
+    T count;
+    unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
+    do
+    {
+        count = *address & TAG_MASK;
+        count = tag | (count + val);
+        *address = count;
+    } while (*address != count);
+}
+
+template<typename T>
+__device__ __forceinline__ T __atomicMin(T* address, T val)
+{
+    T count = min(*address, val);
+    do
+    {
+        *address = count;
+    } while (*address > count);
+    return count;
+}

 struct Stage
 {
@@ -112,7 +113,7 @@ __device__ __forceinline__ T __atomicMin(T* address, T val)
         __device__ __forceinline__ bool operator()(const int4& r1, const int4& r2) const
         {
-            float delta = eps * (min(r1.z, r2.z) + min(r1.w, r2.w)) * 0.5;
+            float delta = eps * (min(r1.z, r2.z) + min(r1.w, r2.w)) * 0.5f;
             return abs(r1.x - r2.x) <= delta && abs(r1.y - r2.y) <= delta
                 && abs(r1.x + r1.z - r2.x - r2.z) <= delta && abs(r1.y + r1.w - r2.y - r2.w) <= delta;
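The 0.5f suffix keeps the averaging in single precision rather than promoting through double, matching the 0.2f fix in cascadeclassifier.cpp above. As a worked example of the predicate: with eps = 0.2 and two 100x100 rectangles, delta = 0.2 * (100 + 100) * 0.5 = 20, so the corners may differ by up to 20 pixels in each coordinate and the rectangles are still treated as one group, mirroring the eps semantics of cv::groupRectangles.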
@@ -134,22 +135,15 @@ __device__ __forceinline__ T __atomicMin(T* address, T val)
                 int p = labels[tid];
                 int q = labels[id];
-                if (p < q)
-                {
-#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
-                    __atomicMin(labels + id, p);
-#else
-                    atomicMin(labels + id, p);
-#endif
-                }
-                else if (p > q)
-                {
-#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
-                    __atomicMin(labels + tid, q);
-#else
-                    atomicMin(labels + tid, q);
-#endif
-                }
+                if (p != q)
+                {
+                    int m = min(p, q);
+#if (__CUDA_ARCH__ < 120)
+                    __atomicMin(labels + id, m);
+#else
+                    atomicMin(labels + id, m);
+#endif
+                }
             }
         }
         __syncthreads();
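Folding the two branches into a single atomicMin on labels[id] preserves the result because the scan is symmetric: when thread tid sees p > q against partner id, it now does nothing (labels[id] already holds the smaller label), but thread id scanning back over tid hits the p < q case and lowers labels[tid] the same way. A sequential analogue of the min-label propagation, as a hypothetical host-side sketch:

    #include <algorithm>
    #include <utility>
    #include <vector>

    // Repeatedly pull each element's label down to the smallest label among
    // its neighbours until a fixed point is reached; connected elements end
    // up sharing the minimum label of their component.
    void propagateMinLabels(std::vector<int>& labels,
                            const std::vector<std::pair<int, int> >& edges)
    {
        bool changed = true;
        while (changed)
        {
            changed = false;
            for (size_t i = 0; i < edges.size(); ++i)
            {
                int a = edges[i].first, b = edges[i].second;
                int m = std::min(labels[a], labels[b]);
                if (labels[a] != m) { labels[a] = m; changed = true; }
                if (labels[b] != m) { labels[b] = m; changed = true; }
            }
        }
    }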
