@@ -53,28 +53,27 @@ namespace cv { namespace gpu { namespace device
struct LBP
{
__device__ __forceinline__ LBP(const LBP& other) {(void)other;}
__device__ __forceinline__ LBP() {}
__host__ __device__ __forceinline__ LBP(const LBP& other) {(void)other;}
__host__ __device__ __forceinline__ LBP() {}
//feature as uchar x, y - left top, z,w - right bottom
__device__ __forceinline__ int operator() (int ty, int tx, int fh, int featurez, int& shift) const
__device__ __forceinline__ int operator() (int ty, int tx, int fh, int fw, int& shift) const
{
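// Sample the integral image at a 4x4 grid of corners (columns fw apart, rows fh apart);
// the running differences below reduce the samples to the 3x3 rectangle sums used for the LBP code.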
int anchors[9];
anchors[0] = tex2D(tintegral, tx, ty);
anchors[1] = tex2D(tintegral, tx + featurez , ty);
anchors[1] = tex2D(tintegral, tx + fw , ty);
anchors[0] -= anchors[1];
anchors[2] = tex2D(tintegral, tx + featurez * 2, ty);
anchors[2] = tex2D(tintegral, tx + fw * 2, ty);
anchors[1] -= anchors[2];
anchors[2] -= tex2D(tintegral, tx + featurez * 3, ty);
anchors[2] -= tex2D(tintegral, tx + fw * 3, ty);
ty += fh;
anchors[3] = tex2D(tintegral, tx, ty);
anchors[4] = tex2D(tintegral, tx + featurez , ty);
anchors[4] = tex2D(tintegral, tx + fw , ty);
anchors[3] -= anchors[4];
anchors[5] = tex2D(tintegral, tx + featurez * 2, ty);
anchors[5] = tex2D(tintegral, tx + fw * 2, ty);
anchors[4] -= anchors[5];
anchors[5] -= tex2D(tintegral, tx + featurez * 3, ty);
anchors[5] -= tex2D(tintegral, tx + fw * 3, ty);
anchors[0] -= anchors[3];
anchors[1] -= anchors[4];
@@ -83,11 +82,11 @@ namespace cv { namespace gpu { namespace device
ty += fh;
anchors[6] = tex2D(tintegral, tx, ty);
anchors[7] = tex2D(tintegral, tx + featurez , ty);
anchors[7] = tex2D(tintegral, tx + fw , ty);
anchors[6] -= anchors[7];
anchors[8] = tex2D(tintegral, tx + featurez * 2, ty);
anchors[8] = tex2D(tintegral, tx + fw * 2, ty);
anchors[7] -= anchors[8];
anchors[8] -= tex2D(tintegral, tx + featurez * 3, ty);
anchors[8] -= tex2D(tintegral, tx + fw * 3, ty);
anchors[3] -= anchors[6];
anchors[4] -= anchors[7];
@@ -109,11 +108,11 @@ namespace cv { namespace gpu { namespace device
ty += fh;
anchors[0] = tex2D(tintegral, tx, ty);
anchors[1] = tex2D(tintegral, tx + featurez , ty);
anchors[1] = tex2D(tintegral, tx + fw , ty);
anchors[0] -= anchors[1];
anchors[2] = tex2D(tintegral, tx + featurez * 2, ty);
anchors[2] = tex2D(tintegral, tx + fw * 2, ty);
anchors[1] -= anchors[2];
anchors[2] -= tex2D(tintegral, tx + featurez * 3, ty);
anchors[2] -= tex2D(tintegral, tx + fw * 3, ty);
anchors[6] -= anchors[0];
anchors[7] -= anchors[1];
@@ -142,54 +141,90 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall( cudaUnbindTexture(&tintegral));
}
__global__ void lbp_classify_stump(const Stage* stages, const int nstages, const ClNode* nodes, const float* leaves, const int* subsets, const uchar4* features,
/* const int* integral,const int istep, const int workWidth,const int workHeight,*/ const int clWidth, const int clHeight, const float scale, const int step,
const int subsetSize, DevMem2D_<int4> objects, unsigned int* n)
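// Functor that carries the cascade tables (stages, nodes, leaves, subsets, LBP features)
// and the scan parameters, so a single object can be passed to the kernels by value.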
struct Classifier
{
int x = threadIdx.x * step;
int y = blockIdx.x * step;
__host__ __device__ __forceinline__ Classifier(const Stage* _stages, const ClNode* _nodes, const float* _leaves, const int* _subsets, const uchar4* _features,
const int _nstages, const int _clWidth, const int _clHeight, const float _scale, const int _step, const int _subsetSize)
: stages(_stages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), nstages(_nstages), clWidth(_clWidth), clHeight(_clHeight),
scale(_scale), step(_step), subsetSize(_subsetSize){}
int current_node = 0;
int current_leave = 0;
LBP evaluator;
for (int s = 0; s < nstages; s++ )
__device__ __forceinline__ void operator() (int y, int x, DevMem2D_<int4> objects, const unsigned int maxN, unsigned int* n) const
{
float sum = 0;
Stage stage = stages[s];
for (int t = 0; t < stage.ntrees; t++)
int current_node = 0;
int current_leave = 0;
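// Walk the boosted stages: each stump adds its leaf value to the stage sum,
// and the window is rejected as soon as a stage sum falls below its threshold.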
for (int s = 0; s < nstages; ++s)
{
ClNode node = nodes[current_node];
uchar4 feature = features[node.featureIdx];
int shift;
int c = evaluator(y + feature.y, x + feature.x, feature.w, feature.z, shift);
int idx = (subsets[ current_node * subsetSize + c] & ( 1 << shift)) ? current_leave : current_leave + 1;
sum += leaves[idx];
current_node += 1;
current_leave += 2;
float sum = 0;
Stage stage = stages[s];
for (int t = 0; t < stage.ntrees; t++)
{
ClNode node = nodes[current_node];
uchar4 feature = features[node.featureIdx];
int shift;
int c = evaluator(y + feature.y, x + feature.x, feature.w, feature.z, shift);
int idx = (subsets[ current_node * subsetSize + c] & ( 1 << shift)) ? current_leave : current_leave + 1;
sum += leaves[idx];
current_node += 1;
current_leave += 2;
}
if (sum < stage.threshold)
return;
}
if (sum < stage.threshold)
return;
}
int4 rect;
rect.x = roundf(x * scale);
rect.y = roundf(y * scale);
rect.z = clWidth;
rect.w = clHeight;
int4 rect;
rect.x = roundf(x * scale);
rect.y = roundf(y * scale);
rect.z = clWidth;
rect.w = clHeight;
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
int res = __atomicInc(n, 100U );
int res = __atomicInc(n, maxN );
#else
int res = atomicInc(n, 100U );
int res = atomicInc(n, maxN );
#endif
objects(0, res) = rect;
objects(0, res) = rect;
}
const Stage* stages;
const ClNode* nodes;
const float* leaves;
const int* subsets;
const uchar4* features;
const int nstages;
const int clWidth;
const int clHeight;
const float scale;
const int step;
const int subsetSize;
const LBP evaluator;
};
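// One thread classifies one window: blockIdx.x selects the scan row, threadIdx.x the column,
// both scaled by the detection step.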
__global__ void lbp_classify_stump(const Classifier classifier, DevMem2D_<int4> objects, const unsigned int maxN, unsigned int* n)
{
int x = threadIdx.x * classifier.step;
int y = blockIdx.x * classifier.step;
classifier(y, x, objects, maxN, n);
}
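// Overload used by classifyStumpFixed: the launch uses a fixed number of threads per block,
// and threads whose column index reaches maxX simply return.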
__global__ void lbp_classify_stump(const Classifier classifier, DevMem2D_<int4> objects, const unsigned int maxN, unsigned int* n, int lines, int maxX)
{
int x = threadIdx.x * lines * classifier.step;
if (x >= maxX) return;
int y = blockIdx.x * classifier.step / lines;
classifier(y, x, objects, maxN, n);
}
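// Groups overlapping candidate rectangles: launched as a single block with one thread per
// candidate; shared memory holds the per-candidate labels and the accumulated rectangles.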
template<typename Pr>
__global__ void disjoin(int4* candidates, int4* objects, unsigned int n, int groupThreshold, float grouping_eps, unsigned int* nclasses)
{
using cv::gpu::device::VecTraits;
unsigned int tid = threadIdx.x;
extern __shared__ int sbuff[];
@@ -207,23 +242,26 @@ namespace cv { namespace gpu { namespace device
int cls = labels[tid];
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
__atomicAdd((int*)( rrects + cls * 4 + 0), candidates[tid].x);
__atomicAdd((int*)( rrects + cls * 4 + 1), candidates[tid].y);
__atomicAdd((int*)( rrects + cls * 4 + 2), candidates[tid].z);
__atomicAdd((int*)( rrects + cls * 4 + 3), candidates[tid].w);
__atomicAdd((rrects + cls * 4 + 0), candidates[tid].x);
__atomicAdd((rrects + cls * 4 + 1), candidates[tid].y);
__atomicAdd((rrects + cls * 4 + 2), candidates[tid].z);
__atomicAdd((rrects + cls * 4 + 3), candidates[tid].w);
#else
atomicAdd((int*)( rrects + cls * 4 + 0), candidates[tid].x);
atomicAdd((int*)( rrects + cls * 4 + 1), candidates[tid].y);
atomicAdd((int*)( rrects + cls * 4 + 2), candidates[tid].z);
atomicAdd((int*)( rrects + cls * 4 + 3), candidates[tid].w);
atomicAdd((rrects + cls * 4 + 0), candidates[tid].x);
atomicAdd((rrects + cls * 4 + 1), candidates[tid].y);
atomicAdd((rrects + cls * 4 + 2), candidates[tid].z);
atomicAdd((rrects + cls * 4 + 3), candidates[tid].w);
#endif
__syncthreads();
labels[tid] = 0;
__syncthreads();
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
__atomicInc((unsigned int*)labels + cls, n);
#else
atomicInc((unsigned int*)labels + cls, n);
#endif
__syncthreads();
*nclasses = 0;
int active = labels[tid];
@@ -235,61 +273,54 @@ namespace cv { namespace gpu { namespace device
r1[1] = saturate_cast<int>(r1[1] * s);
r1[2] = saturate_cast<int>(r1[2] * s);
r1[3] = saturate_cast<int>(r1[3] * s);
}
__syncthreads();
int n1 = active;
__syncthreads();
unsigned int j = 0;
if( active > groupThreshold )
{
for (j = 0; j < n; j++)
{
int n2 = labels[j];
if(!n2 || j == tid || n2 <= groupThreshold )
continue;
int* r2 = rrects + j * 4;
int dx = saturate_cast<int>( r2[2] * grouping_eps );
int dy = saturate_cast<int>( r2[3] * grouping_eps );
if( tid != j && r1[0] >= r2[0] - dx && r1[1] >= r2[1] - dy &&
r1[0] + r1[2] <= r2[0] + r2[2] + dx && r1[1] + r1[3] <= r2[1] + r2[3] + dy &&
(n2 > max(3, n1) || n1 < 3) )
break;
}
if( j == n)
{
if (active && active >= groupThreshold)
{
int* r1 = rrects + tid * 4;
int4 r_out;
r_out.x = r1[0];
r_out.y = r1[1];
r_out.z = r1[2];
r_out.w = r1[3];
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
objects[__atomicInc(nclasses, n)] = VecTraits<int4>::make(r1[0], r1[1], r1[2], r1[3]) ;
objects[__atomicInc(nclasses, n)] = r_out;
#else
objects[atomicInc(nclasses, n)] = VecTraits<int4>::make(r1[0], r1[1], r1[2], r1[3]);
int aidx = atomicInc(nclasses, n);
objects[aidx] = r_out;
#endif
}
}
}
}
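// Host wrapper: one block per scan row and one thread per scan column, one launch per call.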
void classifyStump(const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures,
/*const DevMem2Di& integral,*/ const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize,
DevMem2D_<int4> objects, unsigned int* classified)
const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize, DevMem2D_<int4> objects, unsigned int* classified)
{
int blocks = ceilf(workHeight / (float)step);
int threads = ceilf(workWidth / (float)step);
Classifier clr((Stage*)(mstages.ptr()), (ClNode*)(mnodes.ptr()), mleaves.ptr(), msubsets.ptr(), (uchar4*)(mfeatures.ptr()), nstages, clWidth, clHeight, scale, step, subsetSize);
lbp_classify_stump<<<blocks, threads>>>(clr, objects, objects.cols, classified);
}
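// Host wrapper for wide scans: splits the per-row work into blocks of THREADS_BLOCK threads
// and lets the kernel guard against columns past maxX.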
void classifyStumpFixed(const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures,
const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize, DevMem2D_<int4> objects, unsigned int* classified,
int maxX)
{
const int THREADS_BLOCK = 256;
int blocks = ceilf(workHeight / (float)step);
int threads = ceilf(workWidth / (float)step);
Stage* stages = (Stage*)(mstages.ptr());
ClNode* nodes = (ClNode*)(mnodes.ptr());
const float* leaves = mleaves.ptr();
const int* subsets = msubsets.ptr();
const uchar4* features = (uchar4*)(mfeatures.ptr());
lbp_classify_stump<<<blocks, threads>>>(stages, nstages, nodes, leaves, subsets, features, /*integ, istep,
workWidth, workHeight,*/ clWidth, clHeight, scale, step, subsetSize, objects, classified);
Classifier clr((Stage*)(mstages.ptr()), (ClNode*)(mnodes.ptr()), mleaves.ptr(), msubsets.ptr(), (uchar4*)(mfeatures.ptr()), nstages, clWidth, clHeight, scale, step, subsetSize);
int lines = divUp(threads, THREADS_BLOCK);
lbp_classify_stump<<<blocks * lines, THREADS_BLOCK>>>(clr, objects, objects.cols, classified, lines, maxX);
}
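// Host wrapper for the grouping step: one thread per candidate and enough shared memory
// for the label array plus one int4 accumulator per candidate.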
int connectedConmonents(DevMem2D_<int4> candidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
int connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
{
int threads = candidates.cols;
int threads = ncandidates;
int smem_amount = threads * sizeof(int) + threads * sizeof(int4);
disjoin<InSameComponint><<<1, threads, smem_amount>>>((int4*)candidates.ptr(), (int4*)objects.ptr(), candidates.cols, groupThreshold, grouping_eps, nclasses);
disjoin<InSameComponint><<<1, threads, smem_amount>>>((int4*)candidates.ptr(), (int4*)objects.ptr(), ncandidates, groupThreshold, grouping_eps, nclasses);
return 0;
}
}