LBP: switched to texture implementation

pull/2/head
Marina Kolpakova 13 years ago
parent b0606b0557
commit 86d785622b
  1. modules/gpu/include/opencv2/gpu/gpu.hpp (2 changed lines)
  2. modules/gpu/perf/perf_objdetect.cpp (8 changed lines)
  3. modules/gpu/src/cascadeclassifier.cpp (59 changed lines)
  4. modules/gpu/src/cuda/lbp.cu (120 changed lines)
  5. modules/gpu/src/opencv2/gpu/device/lbp.hpp (82 changed lines)
  6. modules/gpu/test/test_objdetect.cpp (5 changed lines)

@@ -1435,7 +1435,7 @@ public:
bool load(const std::string& filename);
void release();
int detectMultiScale(const GpuMat& image, GpuMat& scaledImageBuffer, GpuMat& objectsBuf, double scaleFactor = 1.1, int minNeighbors = 4,
int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, double scaleFactor = 1.1, int minNeighbors = 4,
cv::Size maxObjectSize = cv::Size()/*, Size minSize = Size()*/);
void preallocateIntegralBuffer(cv::Size desired);
Size getClassifierSize() const;
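For context, a minimal caller against the new two-argument entry point — the scaled-image buffer is now owned by the classifier rather than passed in. This is an editor's sketch, not part of the commit; the image and cascade paths are illustrative:

#include <opencv2/opencv.hpp>
#include <opencv2/gpu/gpu.hpp>

int main()
{
    // Load a grayscale frame and upload it; the LBP cascade expects CV_8U input.
    cv::Mat frame = cv::imread("face.jpg", CV_LOAD_IMAGE_GRAYSCALE);
    cv::gpu::GpuMat d_frame(frame), d_objects;

    cv::gpu::CascadeClassifier_GPU_LBP cascade(d_frame.size());
    if (!cascade.load("lbpcascade_frontalface.xml"))
        return 1;

    // New signature: no intermediate scaledImageBuffer argument.
    int n = cascade.detectMultiScale(d_frame, d_objects, /*scaleFactor*/ 1.1, /*minNeighbors*/ 4);
    return n >= 0 ? 0 : 1;
}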

@@ -69,16 +69,14 @@ GPU_PERF_TEST_1(LBPClassifier, cv::gpu::DeviceInfo)
cv::gpu::GpuMat img(img_host);
cv::gpu::GpuMat gpu_rects, buffer;
cv::gpu::GpuMat gpu_rects;
cv::gpu::CascadeClassifier_GPU_LBP cascade(img.size());
ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath("gpu/lbpcascade/lbpcascade_frontalface.xml")));
// cascade.detectMultiScale(img, objects_buffer);
cascade.detectMultiScale(img, buffer, gpu_rects);
cascade.detectMultiScale(img, gpu_rects);
TEST_CYCLE()
{
cascade.detectMultiScale(img, buffer, gpu_rects);
cascade.detectMultiScale(img, gpu_rects);
}
}

@@ -70,7 +70,7 @@ Size cv::gpu::CascadeClassifier_GPU_LBP::getClassifierSize() const
void cv::gpu::CascadeClassifier_GPU_LBP::preallocateIntegralBuffer(cv::Size /*desired*/) { throw_nogpu();}
void cv::gpu::CascadeClassifier_GPU_LBP::initializeBuffers(cv::Size /*frame*/) { throw_nogpu();}
int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const cv::gpu::GpuMat& /*image*/, cv::gpu::GpuMat& /*scaledImageBuffer*/, cv::gpu::GpuMat& /*objectsBuf*/,
int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const cv::gpu::GpuMat& /*image*/, cv::gpu::GpuMat& /*objectsBuf*/,
double /*scaleFactor*/, int /*minNeighbors*/, cv::Size /*maxObjectSize*/){ throw_nogpu(); return 0;}
#else
@@ -299,28 +299,29 @@ namespace cv { namespace gpu { namespace device
{
namespace lbp
{
void classifyStump(const DevMem2Db mstages,
const int nstages,
const DevMem2Di mnodes,
const DevMem2Df mleaves,
const DevMem2Di msubsets,
const DevMem2Db mfeatures,
const DevMem2Di integral,
const int workWidth,
const int workHeight,
const int clWidth,
const int clHeight,
float scale,
int step,
int subsetSize,
DevMem2D_<int4> objects,
unsigned int* classified);
int connectedConmonents(DevMem2D_<int4> candidates, DevMem2D_<int4> objects,int groupThreshold, float grouping_eps, unsigned int* nclasses);
void classifyStump(const DevMem2Db& mstages,
const int nstages,
const DevMem2Di& mnodes,
const DevMem2Df& mleaves,
const DevMem2Di& msubsets,
const DevMem2Db& mfeatures,
const int workWidth,
const int workHeight,
const int clWidth,
const int clHeight,
float scale,
int step,
int subsetSize,
DevMem2D_<int4> objects,
unsigned int* classified);
int connectedConmonents(DevMem2D_<int4> candidates, DevMem2D_<int4> objects,int groupThreshold, float grouping_eps, unsigned int* nclasses);
void bindIntegral(DevMem2Di integral);
void unbindIntegral();
}
}}}
int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, GpuMat& scaledImageBuffer, GpuMat& objects,
int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, GpuMat& objects,
double scaleFactor, int groupThreshold, cv::Size maxObjectSize /*, Size minSize=Size()*/)
{
CV_Assert( scaleFactor > 1 && image.depth() == CV_8U );
@@ -332,10 +333,12 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
if( !objects.empty() && objects.depth() == CV_32S)
objects.reshape(4, 1);
else
objects.create(1 , defaultObjSearchNum, CV_32SC4);
GpuMat candidates(1 , defaultObjSearchNum, CV_32SC4);
// GpuMat candidates(objects);
objects.create(1 , image.cols >> 4, CV_32SC4);
GpuMat candidates(1 , image.cols >> 1, CV_32SC4);
// GpuMat candidates(1 , defaultObjSearchNum, CV_32SC4);
// used for debug
// candidates.setTo(cv::Scalar::all(0));
// objects.setTo(cv::Scalar::all(0));
if (maxObjectSize == cv::Size())
maxObjectSize = image.size();
@@ -347,9 +350,11 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
cudaMalloc(&dclassified, sizeof(int));
cudaMemcpy(dclassified, classified, sizeof(int), cudaMemcpyHostToDevice);
int step;
cv::gpu::device::lbp::bindIntegral(integral);
for( double factor = 1; ; factor *= scaleFactor )
{
// if (factor > 2.0) break;
cv::Size windowSize(cvRound(NxM.width * factor), cvRound(NxM.height * factor));
cv::Size scaledImageSize(cvRound( image.cols / factor ), cvRound( image.rows / factor ));
cv::Size processingRectSize( scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );
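// Editor's note (not in the commit): worked numbers for the geometry
// above -- with a 24x24 classifier window (NxM), a 640x480 frame and
// factor = 1.1:
//   windowSize         = (26, 26)    cvRound(24 * 1.1)
//   scaledImageSize    = (582, 436)  cvRound(640 / 1.1), cvRound(480 / 1.1)
//   processingRectSize = (559, 413)  scaled size - window + 1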
@@ -365,7 +370,7 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
GpuMat scaledImg(resuzeBuffer, cv::Rect(0, 0, scaledImageSize.width, scaledImageSize.height));
GpuMat scaledIntegral(integral, cv::Rect(0, 0, scaledImageSize.width + 1, scaledImageSize.height + 1));
GpuMat currBuff = integralBuffer;//(integralBuffer, cv::Rect(0, 0, integralBuffer.width, integralBuffer.height));
GpuMat currBuff = integralBuffer;
cv::gpu::resize(image, scaledImg, scaledImageSize, 0, 0, CV_INTER_LINEAR);
cv::gpu::integralBuffered(scaledImg, scaledIntegral, currBuff);
@@ -373,8 +378,10 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
step = (factor <= 2.) + 1;
cv::gpu::device::lbp::classifyStump(stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, leaves_mat, subsets_mat, features_mat,
scaledIntegral, processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, factor, step, subsetSize, candidates, dclassified);
processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, factor, step, subsetSize, candidates, dclassified);
}
cv::gpu::device::lbp::unbindIntegral();
if (groupThreshold <= 0 || objects.empty())
return 0;
cv::gpu::device::lbp::connectedConmonents(candidates, objects, groupThreshold, grouping_eps, dclassified);
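The bindIntegral/unbindIntegral pair introduced above uses the CUDA texture-reference API (later deprecated in favor of texture objects). A self-contained sketch of the same technique — binding a pitched 2D int buffer to a texture reference and reading it with tex2D — with illustrative names, not OpenCV code:

#include <cstdio>
#include <cuda_runtime.h>

// File-scope texture reference, point-sampled and clamped, matching
// the tintegral declaration in lbp.cu.
texture<int, cudaTextureType2D, cudaReadModeElementType> tex_sum(0, cudaFilterModePoint, cudaAddressModeClamp);

__global__ void readCorners(int* out, int w, int h)
{
    // Clamped addressing makes out-of-range reads return the border
    // texel -- convenient for integral-image lookups near the edge.
    out[0] = tex2D(tex_sum, 0, 0) + tex2D(tex_sum, w - 1, h - 1);
}

int main()
{
    const int w = 8, h = 8;
    int host[h * w];
    for (int i = 0; i < w * h; ++i) host[i] = i;

    int* dev = 0; size_t pitch = 0;
    cudaMallocPitch(&dev, &pitch, w * sizeof(int), h);
    cudaMemcpy2D(dev, pitch, host, w * sizeof(int), w * sizeof(int), h, cudaMemcpyHostToDevice);

    // Width/height are given in elements, pitch in bytes -- the same
    // convention bindIntegral relies on with integral.cols and integral.step.
    cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
    cudaBindTexture2D(0, &tex_sum, dev, &desc, w, h, pitch);

    int* d_out = 0; int result = 0;
    cudaMalloc(&d_out, sizeof(int));
    readCorners<<<1, 1>>>(d_out, w, h);
    cudaMemcpy(&result, d_out, sizeof(int), cudaMemcpyDeviceToHost);
    printf("%d\n", result); // expect 0 + 63 = 63

    cudaUnbindTexture(&tex_sum);
    cudaFree(d_out); cudaFree(dev);
    return 0;
}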

@@ -48,8 +48,102 @@ namespace cv { namespace gpu { namespace device
{
namespace lbp
{
texture<int, cudaTextureType2D, cudaReadModeElementType> tintegral(false, cudaFilterModePoint, cudaAddressModeClamp);
struct LBP
{
__device__ __forceinline__ LBP(const LBP& other) {(void)other;}
__device__ __forceinline__ LBP() {}
//feature as uchar x, y - left top, z,w - right bottom
__device__ __forceinline__ int operator() (int ty, int tx, int fh, int featurez, int& shift) const
{
int anchors[9];
anchors[0] = tex2D(tintegral, tx, ty);
anchors[1] = tex2D(tintegral, tx + featurez, ty);
anchors[0] -= anchors[1];
anchors[2] = tex2D(tintegral, tx + featurez * 2, ty);
anchors[1] -= anchors[2];
anchors[2] -= tex2D(tintegral, tx + featurez * 3, ty);
ty += fh;
anchors[3] = tex2D(tintegral, tx, ty);
anchors[4] = tex2D(tintegral, tx + featurez, ty);
anchors[3] -= anchors[4];
anchors[5] = tex2D(tintegral, tx + featurez * 2, ty);
anchors[4] -= anchors[5];
anchors[5] -= tex2D(tintegral, tx + featurez * 3, ty);
anchors[0] -= anchors[3];
anchors[1] -= anchors[4];
anchors[2] -= anchors[5];
// 0 - 2 contains s0 - s2
ty += fh;
anchors[6] = tex2D(tintegral, tx, ty);
anchors[7] = tex2D(tintegral, tx + featurez, ty);
anchors[6] -= anchors[7];
anchors[8] = tex2D(tintegral, tx + featurez * 2, ty);
anchors[7] -= anchors[8];
anchors[8] -= tex2D(tintegral, tx + featurez * 3, ty);
anchors[3] -= anchors[6];
anchors[4] -= anchors[7];
anchors[5] -= anchors[8];
// 3 - 5 contains s3 - s5
anchors[0] -= anchors[4];
anchors[1] -= anchors[4];
anchors[2] -= anchors[4];
anchors[3] -= anchors[4];
anchors[5] -= anchors[4];
int response = (~(anchors[0] >> 31)) & 4;
response |= (~(anchors[1] >> 31)) & 2;;
response |= (~(anchors[2] >> 31)) & 1;
shift = (~(anchors[5] >> 31)) & 16;
shift |= (~(anchors[3] >> 31)) & 1;
ty += fh;
anchors[0] = tex2D(tintegral, tx, ty);
anchors[1] = tex2D(tintegral, tx + featurez, ty);
anchors[0] -= anchors[1];
anchors[2] = tex2D(tintegral, tx + featurez * 2, ty);
anchors[1] -= anchors[2];
anchors[2] -= tex2D(tintegral, tx + featurez * 3, ty);
anchors[6] -= anchors[0];
anchors[7] -= anchors[1];
anchors[8] -= anchors[2];
// 0 -2 contains s6 - s8
anchors[6] -= anchors[4];
anchors[7] -= anchors[4];
anchors[8] -= anchors[4];
shift |= (~(anchors[6] >> 31)) & 2;
shift |= (~(anchors[7] >> 31)) & 4;
shift |= (~(anchors[8] >> 31)) & 8;
return response;
}
};
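// Editor's sketch (not part of the commit): what operator() above
// computes, expressed against a plain row-major integral image S of
// width w. The feature is a 3x3 grid of featurez-by-fh cells sampled
// on a 4x4 lattice of integral values; each neighbor cell sum is
// compared with the center cell s4. Names below are illustrative.
static inline int cellSum(const int* S, int w, int x, int y, int cw, int ch)
{
    // Sum of the cw x ch cell with top-left corner (x, y), using the
    // same difference-of-differences the tex2D version performs.
    return S[y * w + x] - S[y * w + (x + cw)]
         - S[(y + ch) * w + x] + S[(y + ch) * w + (x + cw)];
}

static inline int lbpReference(const int* S, int w, int tx, int ty, int cw, int ch, int* shift)
{
    int s[9];
    for (int i = 0; i < 9; ++i)
        s[i] = cellSum(S, w, tx + (i % 3) * cw, ty + (i / 3) * ch, cw, ch);

    // Bit layout matches operator() above: response holds s0..s2
    // (bits 2, 1, 0); shift holds s3, s6, s7, s8, s5 (bits 0..4).
    int response = ((s[0] >= s[4]) << 2) | ((s[1] >= s[4]) << 1) | (s[2] >= s[4]);
    *shift = ((s[5] >= s[4]) << 4) | ((s[8] >= s[4]) << 3) | ((s[7] >= s[4]) << 2)
           | ((s[6] >= s[4]) << 1) | (s[3] >= s[4]);
    return response;
}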
void bindIntegral(DevMem2Di integral)
{
cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
cudaSafeCall( cudaBindTexture2D(0, &tintegral, integral.ptr(), &desc, (size_t)integral.cols, (size_t)integral.rows, (size_t)integral.step));
}
void unbindIntegral()
{
cudaSafeCall( cudaUnbindTexture(&tintegral));
}
__global__ void lbp_classify_stump(const Stage* stages, const int nstages, const ClNode* nodes, const float* leaves, const int* subsets, const uchar4* features,
const int* integral, const int istep, const int workWidth,const int workHeight, const int clWidth, const int clHeight, const float scale, const int step,
/* const int* integral,const int istep, const int workWidth,const int workHeight,*/ const int clWidth, const int clHeight, const float scale, const int step,
const int subsetSize, DevMem2D_<int4> objects, unsigned int* n)
{
int x = threadIdx.x * step;
@@ -63,21 +157,18 @@ namespace cv { namespace gpu { namespace device
{
float sum = 0;
Stage stage = stages[s];
for (int t = 0; t < stage.ntrees; t++)
{
ClNode node = nodes[current_node];
uchar4 feature = features[node.featureIdx];
int c = evaluator( (y + feature.y) * istep + x + feature.x , feature.w * istep, feature.z, integral, istep);
const int* subsetIdx = subsets + (current_node * subsetSize);
int idx = (subsetIdx[c >> 5] & ( 1 << (c & 31))) ? current_leave : current_leave + 1;
int shift;
int c = evaluator(y + feature.y, x + feature.x, feature.w, feature.z, shift);
int idx = (subsets[ current_node * subsetSize + c] & ( 1 << shift)) ? current_leave : current_leave + 1;
sum += leaves[idx];
current_node += 1;
current_leave += 2;
}
if (sum < stage.threshold)
return;
}
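The lookup refactor above splits the old 8-bit LBP code: its top three bits (the old c >> 5) become the new response and select the subset word, while its low five bits (the old c & 31) become shift and select the bit. A quick host-side equivalence check, as an editor's sketch (the per-node subsetSize offset is dropped for brevity):

#include <cassert>

// Old formulation: one 8-bit code c indexes a bit across subset words.
static bool oldLookup(const int* subsetIdx, int c)
{
    return (static_cast<unsigned>(subsetIdx[c >> 5]) & (1u << (c & 31))) != 0;
}

// New formulation: the word index (response) and bit index (shift)
// arrive separately from the texture-based LBP evaluator.
static bool newLookup(const int* subsets, int response, int shift)
{
    return (static_cast<unsigned>(subsets[response]) & (1u << shift)) != 0;
}

int main()
{
    const int subset[8] = { 5, -7, 0, -1, 42, 123, 0x7ff0, 7 };
    for (int c = 0; c < 256; ++c)
        assert(oldLookup(subset, c) == newLookup(subset, c >> 5, c & 31));
    return 0;
}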
@@ -85,8 +176,8 @@ namespace cv { namespace gpu { namespace device
int4 rect;
rect.x = roundf(x * scale);
rect.y = roundf(y * scale);
rect.z = roundf(clWidth);
rect.w = roundf(clHeight);
rect.z = clWidth;
rect.w = clHeight;
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
int res = __atomicInc(n, 100U);
#else
@@ -178,8 +269,8 @@ namespace cv { namespace gpu { namespace device
}
}
void classifyStump(const DevMem2Db mstages, const int nstages, const DevMem2Di mnodes, const DevMem2Df mleaves, const DevMem2Di msubsets, const DevMem2Db mfeatures,
const DevMem2Di integral, const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize,
void classifyStump(const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures,
/*const DevMem2Di& integral,*/ const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize,
DevMem2D_<int4> objects, unsigned int* classified)
{
int blocks = ceilf(workHeight / (float)step);
@@ -190,11 +281,8 @@ namespace cv { namespace gpu { namespace device
const float* leaves = mleaves.ptr();
const int* subsets = msubsets.ptr();
const uchar4* features = (uchar4*)(mfeatures.ptr());
const int* integ = integral.ptr();
int istep = integral.step / sizeof(int);
lbp_classify_stump<<<blocks, threads>>>(stages, nstages, nodes, leaves, subsets, features, integ, istep,
workWidth, workHeight, clWidth, clHeight, scale, step, subsetSize, objects, classified);
lbp_classify_stump<<<blocks, threads>>>(stages, nstages, nodes, leaves, subsets, features, /*integ, istep,
workWidth, workHeight,*/ clWidth, clHeight, scale, step, subsetSize, objects, classified);
}
int connectedConmonents(DevMem2D_<int4> candidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)

@@ -153,90 +153,8 @@ __device__ __forceinline__ T __atomicMin(T* address, T val)
__syncthreads();
// printf("tid %d label %d\n", tid, labels[tid]);
}
struct LBP
{
__device__ __forceinline__ LBP(const LBP& other) {(void)other;}
__device__ __forceinline__ LBP() {}
//feature as uchar x, y - left top, z,w - right bottom
__device__ __forceinline__ int operator() (unsigned int y, int featurew, int featurez, const int* integral, int step) const
{
int x_off = 2 * featurez;
int anchors[9];
anchors[0] = integral[y];
anchors[1] = integral[y + featurez];
anchors[0] -= anchors[1];
anchors[2] = integral[y + x_off];
anchors[1] -= anchors[2];
anchors[2] -= integral[y + featurez + x_off];
y += featurew;
anchors[3] = integral[y];
anchors[4] = integral[y + featurez];
anchors[3] -= anchors[4];
anchors[5] = integral[y + x_off];
anchors[4] -= anchors[5];
anchors[5] -= integral[y + featurez + x_off];
anchors[0] -= anchors[3];
anchors[1] -= anchors[4];
anchors[2] -= anchors[5];
// 0 - 2 contains s0 - s2
y += featurew;
anchors[6] = integral[y];
anchors[7] = integral[y + featurez];
anchors[6] -= anchors[7];
anchors[8] = integral[y + x_off];
anchors[7] -= anchors[8];
anchors[8] -= integral[y + x_off + featurez];
anchors[3] -= anchors[6];
anchors[4] -= anchors[7];
anchors[5] -= anchors[8];
// 3 - 5 contains s3 - s5
anchors[0] -= anchors[4];
anchors[1] -= anchors[4];
anchors[2] -= anchors[4];
anchors[3] -= anchors[4];
anchors[5] -= anchors[4];
int response = (~(anchors[0] >> 31)) & 128;
response |= (~(anchors[1] >> 31)) & 64;;
response |= (~(anchors[2] >> 31)) & 32;
response |= (~(anchors[5] >> 31)) & 16;
response |= (~(anchors[3] >> 31)) & 1;
y += featurew;
anchors[0] = integral[y];
anchors[1] = integral[y + featurez];
anchors[0] -= anchors[1];
anchors[2] = integral[y + x_off];
anchors[1] -= anchors[2];
anchors[2] -= integral[y + x_off + featurez];
anchors[6] -= anchors[0];
anchors[7] -= anchors[1];
anchors[8] -= anchors[2];
// 0 -2 contains s6 - s8
anchors[6] -= anchors[4];
anchors[7] -= anchors[4];
anchors[8] -= anchors[4];
response |= (~(anchors[6] >> 31)) & 2;
response |= (~(anchors[7] >> 31)) & 4;
response |= (~(anchors[8] >> 31)) & 8;
return response;
}
};
} // lbp
} } }// namespaces
#endif

@@ -343,15 +343,16 @@ TEST_P(LBP_classify, Accuracy)
cv::gpu::CascadeClassifier_GPU_LBP gpuClassifier;
ASSERT_TRUE(gpuClassifier.load(classifierXmlPath));
cv::gpu::GpuMat gpu_rects, buffer;
cv::gpu::GpuMat gpu_rects;
cv::gpu::GpuMat tested(grey);
int count = gpuClassifier.detectMultiScale(tested, buffer, gpu_rects);
int count = gpuClassifier.detectMultiScale(tested, gpu_rects);
cv::Mat gpu_f(gpu_rects);
int* gpu_faces = (int*)gpu_f.ptr();
for (int i = 0; i < count; i++)
{
cv::Rect r(gpu_faces[i * 4],gpu_faces[i * 4 + 1],gpu_faces[i * 4 + 2],gpu_faces[i * 4 + 3]);
std::cout << gpu_faces[i * 4]<< " " << gpu_faces[i * 4 + 1] << " " << gpu_faces[i * 4 + 2] << " " << gpu_faces[i * 4 + 3] << std::endl;
cv::rectangle(markedImage, r , cv::Scalar(0, 0, 255, 255));
}
}
