@ -48,8 +48,102 @@ namespace cv { namespace gpu { namespace device
{
{
namespace lbp
namespace lbp
{
{
texture<int, cudaTextureType2D, cudaReadModeElementType> tintegral(false, cudaFilterModePoint, cudaAddressModeClamp);
struct LBP
{
__device__ __forceinline__ LBP(const LBP& other) {(void)other;}
__device__ __forceinline__ LBP() {}
//feature as uchar x, y - left top, z,w - right bottom
__device__ __forceinline__ int operator() (int ty, int tx, int fh, int featurez, int& shift) const
{
int anchors[9];
anchors[0] = tex2D(tintegral, tx, ty);
anchors[1] = tex2D(tintegral, tx + featurez, ty);
anchors[0] -= anchors[1];
anchors[2] = tex2D(tintegral, tx + featurez * 2, ty);
anchors[1] -= anchors[2];
anchors[2] -= tex2D(tintegral, tx + featurez * 3, ty);
ty += fh;
anchors[3] = tex2D(tintegral, tx, ty);
anchors[4] = tex2D(tintegral, tx + featurez, ty);
anchors[3] -= anchors[4];
anchors[5] = tex2D(tintegral, tx + featurez * 2, ty);
anchors[4] -= anchors[5];
anchors[5] -= tex2D(tintegral, tx + featurez * 3, ty);
anchors[0] -= anchors[3];
anchors[1] -= anchors[4];
anchors[2] -= anchors[5];
// 0 - 2 contains s0 - s2
ty += fh;
anchors[6] = tex2D(tintegral, tx, ty);
anchors[7] = tex2D(tintegral, tx + featurez, ty);
anchors[6] -= anchors[7];
anchors[8] = tex2D(tintegral, tx + featurez * 2, ty);
anchors[7] -= anchors[8];
anchors[8] -= tex2D(tintegral, tx + featurez * 3, ty);
anchors[3] -= anchors[6];
anchors[4] -= anchors[7];
anchors[5] -= anchors[8];
// 3 - 5 contains s3 - s5
anchors[0] -= anchors[4];
anchors[1] -= anchors[4];
anchors[2] -= anchors[4];
anchors[3] -= anchors[4];
anchors[5] -= anchors[4];
int response = (~(anchors[0] >> 31)) & 4;
response |= (~(anchors[1] >> 31)) & 2;;
response |= (~(anchors[2] >> 31)) & 1;
shift = (~(anchors[5] >> 31)) & 16;
shift |= (~(anchors[3] >> 31)) & 1;
ty += fh;
anchors[0] = tex2D(tintegral, tx, ty);
anchors[1] = tex2D(tintegral, tx + featurez, ty);
anchors[0] -= anchors[1];
anchors[2] = tex2D(tintegral, tx + featurez * 2, ty);
anchors[1] -= anchors[2];
anchors[2] -= tex2D(tintegral, tx + featurez * 3, ty);
anchors[6] -= anchors[0];
anchors[7] -= anchors[1];
anchors[8] -= anchors[2];
// 0 -2 contains s6 - s8
anchors[6] -= anchors[4];
anchors[7] -= anchors[4];
anchors[8] -= anchors[4];
shift |= (~(anchors[6] >> 31)) & 2;
shift |= (~(anchors[7] >> 31)) & 4;
shift |= (~(anchors[8] >> 31)) & 8;
return response;
}
};
void bindIntegral(DevMem2Di integral)
{
cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
cudaSafeCall( cudaBindTexture2D(0, &tintegral, integral.ptr(), &desc, (size_t)integral.cols, (size_t)integral.rows, (size_t)integral.step));
}
void unbindIntegral()
{
cudaSafeCall( cudaUnbindTexture(&tintegral));
}
__global__ void lbp_classify_stump(const Stage* stages, const int nstages, const ClNode* nodes, const float* leaves, const int* subsets, const uchar4* features,
__global__ void lbp_classify_stump(const Stage* stages, const int nstages, const ClNode* nodes, const float* leaves, const int* subsets, const uchar4* features,
const int* integral, const int istep, const int workWidth,const int workHeight, const int clWidth, const int clHeight, const float scale, const int step,
/* const int* integral,const int istep, const int workWidth,const int workHeight,*/ const int clWidth, const int clHeight, const float scale, const int step,
const int subsetSize, DevMem2D_<int4> objects, unsigned int* n)
const int subsetSize, DevMem2D_<int4> objects, unsigned int* n)
{
{
int x = threadIdx.x * step;
int x = threadIdx.x * step;
@ -63,21 +157,18 @@ namespace cv { namespace gpu { namespace device
{
{
float sum = 0;
float sum = 0;
Stage stage = stages[s];
Stage stage = stages[s];
for (int t = 0; t < stage.ntrees; t++)
for (int t = 0; t < stage.ntrees; t++)
{
{
ClNode node = nodes[current_node];
ClNode node = nodes[current_node];
uchar4 feature = features[node.featureIdx];
uchar4 feature = features[node.featureIdx];
int c = evaluator( (y + feature.y) * istep + x + feature.x , feature.w * istep, feature.z, integral, istep);
int shift;
const int* subsetIdx = subsets + (current_node * subsetSize);
int c = evaluator(y + feature.y, x + feature.x, feature.w, feature.z, shift);
int idx = (subsets[ current_node * subsetSize + c] & ( 1 << shift)) ? current_leave : current_leave + 1;
int idx = (subsetIdx[c >> 5] & ( 1 << (c & 31))) ? current_leave : current_leave + 1;
sum += leaves[idx];
sum += leaves[idx];
current_node += 1;
current_node += 1;
current_leave += 2;
current_leave += 2;
}
}
if (sum < stage.threshold)
if (sum < stage.threshold)
return;
return;
}
}
@ -85,8 +176,8 @@ namespace cv { namespace gpu { namespace device
int4 rect;
int4 rect;
rect.x = roundf(x * scale);
rect.x = roundf(x * scale);
rect.y = roundf(y * scale);
rect.y = roundf(y * scale);
rect.z = roundf( clWidth) ;
rect.z = clWidth;
rect.w = roundf( clHeight) ;
rect.w = clHeight;
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
int res = __atomicInc(n, 100U);
int res = __atomicInc(n, 100U);
#else
#else
@ -178,8 +269,8 @@ namespace cv { namespace gpu { namespace device
}
}
}
}
void classifyStump(const DevMem2Db mstages, const int nstages, const DevMem2Di mnodes, const DevMem2Df mleaves, const DevMem2Di msubsets, const DevMem2Db mfeatures,
void classifyStump(const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures,
const DevMem2Di integral, const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize,
/* const DevMem2Di& integral,*/ const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize,
DevMem2D_<int4> objects, unsigned int* classified)
DevMem2D_<int4> objects, unsigned int* classified)
{
{
int blocks = ceilf(workHeight / (float)step);
int blocks = ceilf(workHeight / (float)step);
@ -190,11 +281,8 @@ namespace cv { namespace gpu { namespace device
const float* leaves = mleaves.ptr();
const float* leaves = mleaves.ptr();
const int* subsets = msubsets.ptr();
const int* subsets = msubsets.ptr();
const uchar4* features = (uchar4*)(mfeatures.ptr());
const uchar4* features = (uchar4*)(mfeatures.ptr());
const int* integ = integral.ptr();
lbp_classify_stump<<<blocks, threads>>>(stages, nstages, nodes, leaves, subsets, features, /*integ, istep,
int istep = integral.step / sizeof(int);
workWidth, workHeight,*/ clWidth, clHeight, scale, step, subsetSize, objects, classified);
lbp_classify_stump<<<blocks, threads>>>(stages, nstages, nodes, leaves, subsets, features, integ, istep,
workWidth, workHeight, clWidth, clHeight, scale, step, subsetSize, objects, classified);
}
}
int connectedConmonents(DevMem2D_<int4> candidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
int connectedConmonents(DevMem2D_<int4> candidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)