added OpenCL optimization for LBP-based face detector

pull/2042/head
Vadim Pisarevsky 11 years ago
parent 1540910542
commit d084d19779
  1. modules/objdetect/src/cascadedetect.cpp (123 changed lines)
  2. modules/objdetect/src/cascadedetect.hpp (3 changed lines)
  3. modules/objdetect/src/opencl/cascadedetect.cl (35 changed lines)

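Not part of the patch, just for context: a minimal sketch of how the new LBP path gets exercised through the transparent API. The cascade and image paths below are placeholders; any stump-based LBP cascade in the new XML format should reach runLBPClassifierStump when OpenCL is available and the use_ocl conditions in detectMultiScaleNoGrouping hold.

// Sketch only: exercises the OpenCL LBP branch added by this commit.
// "lbpcascade_frontalface.xml" and "test.jpg" are placeholder paths.
#include <opencv2/opencv.hpp>
#include <opencv2/core/ocl.hpp>
#include <cstdio>
#include <vector>

int main()
{
    cv::ocl::setUseOpenCL(true);                                  // allow the OpenCL path
    cv::CascadeClassifier cascade("lbpcascade_frontalface.xml");  // stump-based LBP cascade
    if( cascade.empty() )
        return 1;

    cv::UMat gray;                                                // UMat keeps the data on the device
    cv::imread("test.jpg", cv::IMREAD_GRAYSCALE).copyTo(gray);
    cv::equalizeHist(gray, gray);

    std::vector<cv::Rect> faces;
    cascade.detectMultiScale(gray, faces, 1.1, 3);                // may dispatch to runLBPClassifierStump
    printf("detected %d face(s)\n", (int)faces.size());
    return 0;
}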
--- a/modules/objdetect/src/cascadedetect.cpp
+++ b/modules/objdetect/src/cascadedetect.cpp
@@ -743,6 +743,14 @@ bool LBPEvaluator::setWindow( Point pt )
     pwin = &sum.at<int>(pt);
     return true;
 }
+
+void LBPEvaluator::getUMats(std::vector<UMat>& bufs)
+{
+    bufs.clear();
+    bufs.push_back(usum);
+    bufs.push_back(ufbuf);
+}
+
 //---------------------------------------------- HOGEvaluator ---------------------------------------

 bool HOGEvaluator::Feature :: read( const FileNode& node )
@@ -1162,50 +1170,84 @@ bool CascadeClassifierImpl::detectSingleScale( InputArray _image, Size processin
 bool CascadeClassifierImpl::ocl_detectSingleScale( InputArray _image, Size processingRectSize,
                                                    int yStep, double factor, Size sumSize0 )
 {
-    const int VECTOR_SIZE = 1;
-    Ptr<HaarEvaluator> haar = featureEvaluator.dynamicCast<HaarEvaluator>();
-    if( haar.empty() )
-        return false;
-
-    haar->setImage(_image, data.origWinSize, sumSize0);
-
-    if( cascadeKernel.empty() )
-    {
-        cascadeKernel.create("runHaarClassifierStump", ocl::objdetect::cascadedetect_oclsrc,
-                             format("-D VECTOR_SIZE=%d", VECTOR_SIZE));
-        if( cascadeKernel.empty() )
-            return false;
-    }
+    int featureType = getFeatureType();
+    std::vector<UMat> bufs;
+    size_t globalsize[] = { processingRectSize.width/yStep, processingRectSize.height/yStep };
+    bool ok = false;

     if( ustages.empty() )
     {
         copyVectorToUMat(data.stages, ustages);
         copyVectorToUMat(data.stumps, ustumps);
+        if( !data.subsets.empty() )
+            copyVectorToUMat(data.subsets, usubsets);
     }

-    std::vector<UMat> bufs;
-    haar->getUMats(bufs);
-    CV_Assert(bufs.size() == 3);
-
-    Rect normrect = haar->getNormRect();
-
-    //processingRectSize = Size(yStep, yStep);
-    size_t globalsize[] = { (processingRectSize.width/yStep + VECTOR_SIZE-1)/VECTOR_SIZE, processingRectSize.height/yStep };
-
-    cascadeKernel.args(ocl::KernelArg::ReadOnlyNoSize(bufs[0]), // sum
-                       ocl::KernelArg::ReadOnlyNoSize(bufs[1]), // sqsum
-                       ocl::KernelArg::PtrReadOnly(bufs[2]), // optfeatures
-
-                       // cascade classifier
-                       (int)data.stages.size(),
-                       ocl::KernelArg::PtrReadOnly(ustages),
-                       ocl::KernelArg::PtrReadOnly(ustumps),
-
-                       ocl::KernelArg::PtrWriteOnly(ufacepos), // positions
-                       processingRectSize,
-                       yStep, (float)factor,
-                       normrect, data.origWinSize, MAX_FACES);
-    bool ok = cascadeKernel.run(2, globalsize, 0, true);
+    if( featureType == FeatureEvaluator::HAAR )
+    {
+        Ptr<HaarEvaluator> haar = featureEvaluator.dynamicCast<HaarEvaluator>();
+        if( haar.empty() )
+            return false;
+
+        haar->setImage(_image, data.origWinSize, sumSize0);
+        if( haarKernel.empty() )
+        {
+            haarKernel.create("runHaarClassifierStump", ocl::objdetect::cascadedetect_oclsrc, "");
+            if( haarKernel.empty() )
+                return false;
+        }
+
+        haar->getUMats(bufs);
+        Rect normrect = haar->getNormRect();
+
+        haarKernel.args(ocl::KernelArg::ReadOnlyNoSize(bufs[0]), // sum
+                        ocl::KernelArg::ReadOnlyNoSize(bufs[1]), // sqsum
+                        ocl::KernelArg::PtrReadOnly(bufs[2]), // optfeatures
+
+                        // cascade classifier
+                        (int)data.stages.size(),
+                        ocl::KernelArg::PtrReadOnly(ustages),
+                        ocl::KernelArg::PtrReadOnly(ustumps),
+
+                        ocl::KernelArg::PtrWriteOnly(ufacepos), // positions
+                        processingRectSize,
+                        yStep, (float)factor,
+                        normrect, data.origWinSize, MAX_FACES);
+        ok = haarKernel.run(2, globalsize, 0, true);
+    }
+    else if( featureType == FeatureEvaluator::LBP )
+    {
+        Ptr<LBPEvaluator> lbp = featureEvaluator.dynamicCast<LBPEvaluator>();
+        if( lbp.empty() )
+            return false;
+
+        lbp->setImage(_image, data.origWinSize, sumSize0);
+        if( lbpKernel.empty() )
+        {
+            lbpKernel.create("runLBPClassifierStump", ocl::objdetect::cascadedetect_oclsrc, "");
+            if( lbpKernel.empty() )
+                return false;
+        }
+
+        lbp->getUMats(bufs);
+        int subsetSize = (data.ncategories + 31)/32;
+        lbpKernel.args(ocl::KernelArg::ReadOnlyNoSize(bufs[0]), // sum
+                       ocl::KernelArg::PtrReadOnly(bufs[1]), // optfeatures
+
+                       // cascade classifier
+                       (int)data.stages.size(),
+                       ocl::KernelArg::PtrReadOnly(ustages),
+                       ocl::KernelArg::PtrReadOnly(ustumps),
+                       ocl::KernelArg::PtrReadOnly(usubsets),
+                       subsetSize,
+
+                       ocl::KernelArg::PtrWriteOnly(ufacepos), // positions
+                       processingRectSize,
+                       yStep, (float)factor,
+                       data.origWinSize, MAX_FACES);
+        ok = lbpKernel.run(2, globalsize, 0, true);
+    }
     //CV_Assert(ok);
     return ok;
 }
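A note on the new subsetSize kernel argument: for LBP cascades data.ncategories is 256 (one category per 8-bit LBP code), so (data.ncategories + 31)/32 gives 8 packed 32-bit words of category bitset per stump, and the kernel advances bitsets by bitsetSize for every stump it evaluates. Below is a minimal sketch of the membership test on that packed layout; codeInSubset and its parameters are illustrative names, not OpenCV API.

#include <cstdint>

// Sketch only: membership test against the packed per-stump category bitsets
// that copyVectorToUMat(data.subsets, usubsets) uploads to the device.
// Each stump owns subsetSize = (ncategories + 31)/32 consecutive 32-bit words.
static bool codeInSubset(const int32_t* subsets, int stumpIdx, int subsetSize, int code)
{
    const int32_t* s = subsets + stumpIdx * subsetSize; // this stump's bitset
    return (s[code >> 5] & (1 << (code & 31))) != 0;    // word = code/32, bit = code%32
}

// With ncategories = 256: subsetSize = (256 + 31)/32 = 8 words (256 bits) per stump.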
@@ -1254,6 +1296,7 @@ void CascadeClassifierImpl::detectMultiScaleNoGrouping( InputArray _image, std::
                                                         double scaleFactor, Size minObjectSize, Size maxObjectSize,
                                                         bool outputRejectLevels )
 {
+    int featureType = getFeatureType();
     Size imgsz = _image.size();
     int imgtype = _image.type();
@@ -1267,7 +1310,8 @@ void CascadeClassifierImpl::detectMultiScaleNoGrouping( InputArray _image, std::
         maxObjectSize = imgsz;

     bool use_ocl = ocl::useOpenCL() &&
-                   getFeatureType() == FeatureEvaluator::HAAR &&
+                   (featureType == FeatureEvaluator::HAAR ||
+                    featureType == FeatureEvaluator::LBP) &&
                    !isOldFormatCascade() &&
                    data.isStumpBased() &&
                    maskGenerator.empty() &&
@@ -1593,7 +1637,8 @@ bool CascadeClassifierImpl::Data::read(const FileNode &root)
 bool CascadeClassifierImpl::read_(const FileNode& root)
 {
     tryOpenCL = true;
-    cascadeKernel = ocl::Kernel();
+    haarKernel = ocl::Kernel();
+    lbpKernel = ocl::Kernel();
     ustages.release();
     ustumps.release();
     if( !data.read(root) )

--- a/modules/objdetect/src/cascadedetect.hpp
+++ b/modules/objdetect/src/cascadedetect.hpp
@@ -149,7 +149,7 @@ protected:
     Ptr<MaskGenerator> maskGenerator;
     UMat ugrayImage, uimageBuffer;
     UMat ufacepos, ustages, ustumps, usubsets;
-    ocl::Kernel cascadeKernel;
+    ocl::Kernel haarKernel, lbpKernel;
     bool tryOpenCL;

     Mutex mtx;
@@ -392,6 +392,7 @@ public:
     virtual bool setImage(InputArray image, Size _origWinSize, Size);
     virtual bool setWindow(Point pt);
+    virtual void getUMats(std::vector<UMat>& bufs);

     int operator()(int featureIdx) const
     { return optfeaturesPtr[featureIdx].calc(pwin); }

--- a/modules/objdetect/src/opencl/cascadedetect.cl
+++ b/modules/objdetect/src/opencl/cascadedetect.cl
@@ -105,7 +105,7 @@ __kernel void runHaarClassifierStump(
         }
     }
 }
-#if 0
+
 __kernel void runLBPClassifierStump(
     __global const int* sum,
     int sumstep, int sumoffset,
@@ -119,45 +119,48 @@ __kernel void runLBPClassifierStump(
     volatile __global int* facepos,
     int2 imgsize, int xyscale, float factor,
-    int4 normrect, int2 windowsize, int maxFaces)
+    int2 windowsize, int maxFaces)
 {
     int ix = get_global_id(0)*xyscale;
     int iy = get_global_id(1)*xyscale;
     sumstep /= sizeof(int);
-    sqsumstep /= sizeof(int);

     if( ix < imgsize.x && iy < imgsize.y )
     {
         int stageIdx;
         __global const Stump* stump = stumps;
+        __global const int* p = sum + mad24(iy, sumstep, ix);

         for( stageIdx = 0; stageIdx < nstages; stageIdx++ )
         {
             int i, ntrees = stages[stageIdx].ntrees;
             float s = 0.f;
-            for( i = 0; i < ntrees; i++, stump++ )
+            for( i = 0; i < ntrees; i++, stump++, bitsets += bitsetSize )
             {
                 float4 st = stump->st;
                 __global const OptLBPFeature* f = optfeatures + as_int(st.x);
                 int16 ofs = f->ofs;

-                int4 ofs = f->ofs[0];
-                sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
-                ofs = f->ofs[1];
-                sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.y;
-                if( weight.z > 0 )
-                {
-                    ofs = f->ofs[2];
-                    sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.z;
-                }
-
-                s += (sval < st.y*nf) ? st.z : st.w;
+#define CALC_SUM_OFS_(p0, p1, p2, p3, ptr) \
+    ((ptr)[p0] - (ptr)[p1] - (ptr)[p2] + (ptr)[p3])
+
+                int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p );
+
+                int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0); // 0
+                idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0); // 1
+                idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0); // 2
+
+                mask =  (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0); // 5
+                mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0);  // 8
+                mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0);  // 7
+                mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0);  // 6
+                mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0);  // 3
+
+                s += (bitsets[idx] & (1 << mask)) ? st.z : st.w;
             }

             if( s < stages[stageIdx].threshold )
                 break;
         }

         if( stageIdx == nstages )
@@ -174,4 +177,4 @@ __kernel void runLBPClassifierStump(
             }
         }
     }
-#endif
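For readers decoding the new kernel body: f->ofs packs 16 integral-image sample offsets, a 4x4 point grid that covers the 3x3 LBP cells, and each cell sum is a four-point rectangle sum. Below is a hedged CPU-style sketch of the same comparisons (illustrative only; the production CPU path is LBPEvaluator::Feature::calc). The cell numbers in the comments follow the row-major 3x3 numbering used in the kernel comments above.

// Sketch: the 3x3 LBP comparisons performed by runLBPClassifierStump, written
// against the 16 integral-image samples p[0..15] of a 4x4 point grid
// (row-major, 4 points per row). Cells are numbered row-major 0..8, centre = 4.
static inline int rectSum(const int* p, int a, int b, int c, int d)
{
    return p[a] - p[b] - p[c] + p[d];            // same form as CALC_SUM_OFS_
}

static int lbpCode(const int* p)
{
    int cval = rectSum(p, 5, 6, 9, 10);          // centre cell (cell 4)

    int idx  = (rectSum(p, 0,  1,  4,  5) >= cval ? 4 : 0)    // cell 0
             | (rectSum(p, 1,  2,  5,  6) >= cval ? 2 : 0)    // cell 1
             | (rectSum(p, 2,  3,  6,  7) >= cval ? 1 : 0);   // cell 2

    int mask = (rectSum(p, 6,  7, 10, 11) >= cval ? 16 : 0)   // cell 5
             | (rectSum(p, 10, 11, 14, 15) >= cval ? 8 : 0)   // cell 8
             | (rectSum(p, 9, 10, 13, 14) >= cval ? 4 : 0)    // cell 7
             | (rectSum(p, 8,  9, 12, 13) >= cval ? 2 : 0)    // cell 6
             | (rectSum(p, 4,  5,  8,  9) >= cval ? 1 : 0);   // cell 3

    return (idx << 5) | mask;                    // 8-bit LBP code; word index = idx, bit = mask
}

With this encoding, bitsets[idx] & (1 << mask) in the kernel tests the same bit that subset[code >> 5] & (1 << (code & 31)) tests on the CPU side.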
