@ -1,6 +1,18 @@
///////////////////////////// OpenCL kernels for face detection //////////////////////////////
////////////////////////////// see the opencv/doc/license.txt ///////////////////////////////
//
// the code has been derived from the OpenCL Haar cascade kernel by
//
// Niko Li, newlife20080214@gmail.com
// Wang Weiyan, wangweiyanster@gmail.com
// Jia Haipeng, jiahaipeng95@gmail.com
// Nathan, liujun@multicorewareinc.com
// Peng Xiao, pengxiao@outlook.com
// Erping Pang, erping@multicorewareinc.com
//
typedef struct __attribute__ ( ( aligned ( 4 ) ) ) OptHaarFeature
{
int4 ofs[3] __attribute__ ( ( aligned ( 4 ) ) ) ;
@ -20,6 +32,12 @@ typedef struct __attribute__((aligned(4))) Stump
}
Stump ;
typedef struct __attribute__ ( ( aligned ( 4 ) ) ) Node
{
int4 n __attribute__ ( ( aligned ( 4 ) ) ) ;
}
Node ;
typedef struct __attribute__ ( ( aligned ( 4 ) ) ) Stage
{
int first __attribute__ ( ( aligned ( 4 ) ) ) ;
@ -28,151 +46,614 @@ typedef struct __attribute__((aligned (4))) Stage
}
Stage ;
__kernel void runHaarClassifierStump (
typedef struct __attribute__ ( ( aligned ( 4 ) ) ) ScaleData
{
float scale __attribute__ ( ( aligned ( 4 ) ) ) ;
int szi_width __attribute__ ( ( aligned ( 4 ) ) ) ;
int szi_height __attribute__ ( ( aligned ( 4 ) ) ) ;
int layer_ofs __attribute__ ( ( aligned ( 4 ) ) ) ;
int ystep __attribute__ ( ( aligned ( 4 ) ) ) ;
}
ScaleData ;
# ifndef SUM_BUF_SIZE
# define SUM_BUF_SIZE 0
# endif
# ifndef NODE_COUNT
# define NODE_COUNT 1
# endif
__kernel __attribute__ ( ( reqd_work_group_size ( LOCAL_SIZE_X,LOCAL_SIZE_Y,1 ) ) )
void runHaarClassifier (
int nscales, __global const ScaleData* scaleData,
__global const int* sum,
int sumstep, int sumoffset,
__global const int* sqsum,
int sqsumstep, int sqsumoffset,
int _sumstep, int sumoffset,
__global const OptHaarFeature* optfeatures,
int nstages,
int splitstage, int nstages,
__global const Stage* stages,
__global const Stump* stumps,
__global const Node* nodes,
__global const float* leaves0,
volatile __global int* facepos,
int2 imgsize, int xyscale, float factor,
int4 normrect, int2 windowsize, int maxFaces )
int4 normrect, int sqofs, int2 windowsize, int maxFaces )
{
int ix = get_global_id ( 0 ) *xyscale ;
int iy = get_global_id ( 1 ) *xyscale ;
sumstep /= sizeof ( int ) ;
sqsumstep /= sizeof ( int ) ;
int lx = get_local_id ( 0 ) ;
int ly = get_local_id ( 1 ) ;
int groupIdx = get_group_id ( 0 ) ;
int i, ngroups = get_global_size ( 0 ) /LOCAL_SIZE_X ;
int scaleIdx, tileIdx, stageIdx ;
int sumstep = ( int ) ( _sumstep/sizeof ( int ) ) ;
int4 nofs0 = ( int4 ) ( mad24 ( normrect.y, sumstep, normrect.x ) ,
mad24 ( normrect.y, sumstep, normrect.x + normrect.z ) ,
mad24 ( normrect.y + normrect.w, sumstep, normrect.x ) ,
mad24 ( normrect.y + normrect.w, sumstep, normrect.x + normrect.z ) ) ;
int normarea = normrect.z * normrect.w ;
float invarea = 1.f/normarea ;
int lidx = ly*LOCAL_SIZE_X + lx ;
if ( ix < imgsize.x && iy < imgsize.y )
# if SUM_BUF_SIZE > 0
int4 nofs = ( int4 ) ( mad24 ( normrect.y, SUM_BUF_STEP, normrect.x ) ,
mad24 ( normrect.y, SUM_BUF_STEP, normrect.x + normrect.z ) ,
mad24 ( normrect.y + normrect.w, SUM_BUF_STEP, normrect.x ) ,
mad24 ( normrect.y + normrect.w, SUM_BUF_STEP, normrect.x + normrect.z ) ) ;
# else
int4 nofs = nofs0 ;
# endif
# define LOCAL_SIZE ( LOCAL_SIZE_X*LOCAL_SIZE_Y )
__local int lstore[SUM_BUF_SIZE + LOCAL_SIZE*5/2+1] ;
# if SUM_BUF_SIZE > 0
__local int* ibuf = lstore ;
__local int* lcount = ibuf + SUM_BUF_SIZE ;
# else
__local int* lcount = lstore ;
# endif
__local float* lnf = ( __local float* ) ( lcount + 1 ) ;
__local float* lpartsum = lnf + LOCAL_SIZE ;
__local short* lbuf = ( __local short* ) ( lpartsum + LOCAL_SIZE ) ;
for ( scaleIdx = nscales-1 ; scaleIdx >= 0; scaleIdx-- )
{
int stageIdx ;
__global const Stump* stump = stumps ;
__global const int* psum = sum + mad24 ( iy, sumstep, ix ) ;
__global const int* pnsum = psum + mad24 ( normrect.y, sumstep, normrect.x ) ;
int normarea = normrect.z * normrect.w ;
float invarea = 1.f/normarea ;
float sval = ( pnsum[0] - pnsum[normrect.z] - pnsum[mul24 ( normrect.w, sumstep ) ] +
pnsum[mad24 ( normrect.w, sumstep, normrect.z ) ] ) *invarea ;
float sqval = ( sqsum[mad24 ( iy + normrect.y, sqsumstep, ix + normrect.x ) ] ) *invarea ;
float nf = ( float ) normarea * sqrt ( max ( sqval - sval * sval, 0.f ) ) ;
nf = nf > 0 ? nf : 1.f ;
for ( stageIdx = 0 ; stageIdx < nstages; stageIdx++ )
__global const ScaleData* s = scaleData + scaleIdx ;
int ystep = s->ystep ;
int2 worksize = ( int2 ) ( max ( s->szi_width - windowsize.x, 0 ) , max ( s->szi_height - windowsize.y, 0 ) ) ;
int2 ntiles = ( int2 ) ( ( worksize.x + LOCAL_SIZE_X-1 ) /LOCAL_SIZE_X,
( worksize.y + LOCAL_SIZE_Y-1 ) /LOCAL_SIZE_Y ) ;
int totalTiles = ntiles.x*ntiles.y ;
for ( tileIdx = groupIdx ; tileIdx < totalTiles; tileIdx += ngroups )
{
int i, ntrees = stages[stageIdx].ntrees ;
float s = 0.f ;
for ( i = 0 ; i < ntrees; i++, stump++ )
int ix0 = ( tileIdx % ntiles.x ) *LOCAL_SIZE_X ;
int iy0 = ( tileIdx / ntiles.x ) *LOCAL_SIZE_Y ;
int ix = lx, iy = ly ;
__global const int* psum0 = sum + mad24 ( iy0, sumstep, ix0 ) + s->layer_ofs ;
__global const int* psum1 = psum0 + mad24 ( iy, sumstep, ix ) ;
if ( ix0 >= worksize.x | | iy0 >= worksize.y )
continue ;
# if SUM_BUF_SIZE > 0
for ( i = lidx*4 ; i < SUM_BUF_SIZE; i += LOCAL_SIZE_X*LOCAL_SIZE_Y*4 )
{
float4 st = stump->st ;
__global const OptHaarFeature* f = optfeatures + as_int ( st.x ) ;
float4 weight = f->weight ;
int4 ofs = f->ofs[0] ;
sval = ( psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w] ) *weight.x ;
ofs = f->ofs[1] ;
sval += ( psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w] ) *weight.y ;
if ( weight.z > 0 )
int dy = i/SUM_BUF_STEP, dx = i - dy*SUM_BUF_STEP ;
vstore4 ( vload4 ( 0 , psum0 + mad24 ( dy, sumstep, dx ) ) , 0 , ibuf+i ) ;
}
barrier ( CLK_LOCAL_MEM_FENCE ) ;
# endif
if ( lidx == 0 )
lcount[0] = 0 ;
barrier ( CLK_LOCAL_MEM_FENCE ) ;
if ( ix0 + ix < worksize.x && iy0 + iy < worksize.y )
{
# if NODE_COUNT==1
__global const Stump* stump = ( __global const Stump* ) nodes ;
# else
__global const Node* node = nodes ;
__global const float* leaves = leaves0 ;
# endif
# if SUM_BUF_SIZE > 0
__local const int* psum = ibuf + mad24 ( iy, SUM_BUF_STEP, ix ) ;
# else
__global const int* psum = psum1 ;
# endif
__global const float* psqsum = ( __global const float* ) ( psum1 + sqofs ) ;
float sval = ( psum[nofs.x] - psum[nofs.y] - psum[nofs.z] + psum[nofs.w] ) *invarea ;
float sqval = ( psqsum[nofs0.x] - psqsum[nofs0.y] - psqsum[nofs0.z] + psqsum[nofs0.w] ) *invarea ;
float nf = ( float ) normarea * sqrt ( max ( sqval - sval * sval, 0.f ) ) ;
nf = nf > 0 ? nf : 1.f ;
for ( stageIdx = 0 ; stageIdx < splitstage; stageIdx++ )
{
ofs = f->ofs[2] ;
sval += ( psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w] ) *weight.z ;
int ntrees = stages[stageIdx].ntrees ;
float s = 0.f ;
# if NODE_COUNT==1
for ( i = 0 ; i < ntrees; i++ )
{
float4 st = stump[i].st ;
__global const OptHaarFeature* f = optfeatures + as_int ( st.x ) ;
float4 weight = f->weight ;
int4 ofs = f->ofs[0] ;
sval = ( psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w] ) *weight.x ;
ofs = f->ofs[1] ;
sval += ( psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w] ) *weight.y ;
if ( weight.z > 0 )
{
ofs = f->ofs[2] ;
sval += ( psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w] ) *weight.z ;
}
s += ( sval < st.y*nf ) ? st.z : st.w ;
}
stump += ntrees ;
# else
for ( i = 0 ; i < ntrees; i++, node += NODE_COUNT, leaves += NODE_COUNT+1 )
{
int idx = 0 ;
do
{
int4 n = node[idx].n ;
__global const OptHaarFeature* f = optfeatures + n.x ;
float4 weight = f->weight ;
int4 ofs = f->ofs[0] ;
sval = ( psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w] ) *weight.x ;
ofs = f->ofs[1] ;
sval += ( psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w] ) *weight.y ;
if ( weight.z > 0 )
{
ofs = f->ofs[2] ;
sval += ( psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w] ) *weight.z ;
}
idx = ( sval < as_float ( n.y ) *nf ) ? n.z : n.w ;
}
while ( idx > 0 ) ;
s += leaves[-idx] ;
}
# endif
if ( s < stages[stageIdx].threshold )
break ;
}
s += ( sval < st.y*nf ) ? st.z : st.w ;
if ( stageIdx == splitstage && ( ystep == 1 | | ((ix | iy ) & 1 ) == 0 ) )
{
int count = atomic_inc ( lcount ) ;
lbuf[count] = ( int ) ( ix | ( iy << 8 ) ) ;
lnf[count] = nf ;
}
}
if ( s < stages[stageIdx].threshold )
break ;
}
for ( stageIdx = splitstage ; stageIdx < nstages; stageIdx++ )
{
int nrects = lcount[0] ;
if ( stageIdx == nstages )
{
int nfaces = atomic_inc ( facepos ) ;
if ( nfaces < maxFaces )
barrier ( CLK_LOCAL_MEM_FENCE ) ;
if ( nrects == 0 )
break ;
if ( lidx == 0 )
lcount[0] = 0 ;
{
# if NODE_COUNT == 1
__global const Stump* stump = ( __global const Stump* ) nodes + stages[stageIdx].first ;
# else
__global const Node* node = nodes + stages[stageIdx].first*NODE_COUNT ;
__global const float* leaves = leaves0 + stages[stageIdx].first* ( NODE_COUNT+1 ) ;
# endif
int nparts = LOCAL_SIZE / nrects ;
int ntrees = stages[stageIdx].ntrees ;
int ntrees_p = ( ntrees + nparts - 1 ) /nparts ;
int nr = lidx / nparts ;
int partidx = -1 , idxval = 0 ;
float partsum = 0.f, nf = 0.f ;
if ( nr < nrects )
{
partidx = lidx % nparts ;
idxval = lbuf[nr] ;
nf = lnf[nr] ;
{
int ntrees0 = ntrees_p*partidx ;
int ntrees1 = min ( ntrees0 + ntrees_p, ntrees ) ;
int ix1 = idxval & 255 , iy1 = idxval >> 8 ;
# if SUM_BUF_SIZE > 0
__local const int* psum = ibuf + mad24 ( iy1, SUM_BUF_STEP, ix1 ) ;
# else
__global const int* psum = psum0 + mad24 ( iy1, sumstep, ix1 ) ;
# endif
# if NODE_COUNT == 1
for ( i = ntrees0 ; i < ntrees1; i++ )
{
float4 st = stump[i].st ;
__global const OptHaarFeature* f = optfeatures + as_int ( st.x ) ;
float4 weight = f->weight ;
int4 ofs = f->ofs[0] ;
float sval = ( psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w] ) *weight.x ;
ofs = f->ofs[1] ;
sval += ( psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w] ) *weight.y ;
//if ( weight.z > 0 )
{
ofs = f->ofs[2] ;
sval += ( psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w] ) *weight.z ;
}
partsum += ( sval < st.y*nf ) ? st.z : st.w ;
}
# else
for ( i = ntrees0 ; i < ntrees1; i++ )
{
int idx = 0 ;
do
{
int4 n = node[i*2 + idx].n ;
__global const OptHaarFeature* f = optfeatures + n.x ;
float4 weight = f->weight ;
int4 ofs = f->ofs[0] ;
float sval = ( psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w] ) *weight.x ;
ofs = f->ofs[1] ;
sval += ( psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w] ) *weight.y ;
if ( weight.z > 0 )
{
ofs = f->ofs[2] ;
sval += ( psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w] ) *weight.z ;
}
idx = ( sval < as_float ( n.y ) *nf ) ? n.z : n.w ;
}
while ( idx > 0 ) ;
partsum += leaves[i*3-idx] ;
}
# endif
}
}
lpartsum[lidx] = partsum ;
barrier ( CLK_LOCAL_MEM_FENCE ) ;
if ( partidx == 0 )
{
float s = lpartsum[nr*nparts] ;
for ( i = 1 ; i < nparts; i++ )
s += lpartsum[i + nr*nparts] ;
if ( s >= stages[stageIdx].threshold )
{
int count = atomic_inc ( lcount ) ;
lbuf[count] = idxval ;
lnf[count] = nf ;
}
}
}
}
barrier ( CLK_LOCAL_MEM_FENCE ) ;
if ( stageIdx == nstages )
{
volatile __global int* face = facepos + 1 + nfaces*4 ;
face[0] = convert_int_rte ( ix*factor ) ;
face[1] = convert_int_rte ( iy*factor ) ;
face[2] = convert_int_rte ( windowsize.x*factor ) ;
face[3] = convert_int_rte ( windowsize.y*factor ) ;
int nrects = lcount[0] ;
if ( lidx < nrects )
{
int nfaces = atomic_inc ( facepos ) ;
if ( nfaces < maxFaces )
{
volatile __global int* face = facepos + 1 + nfaces*3 ;
int val = lbuf[lidx] ;
face[0] = scaleIdx ;
face[1] = ix0 + ( val & 255 ) ;
face[2] = iy0 + ( val >> 8 ) ;
}
}
}
}
}
}
# undef CALC_SUM_OFS_
# define CALC_SUM_OFS_ ( p0, p1, p2, p3, ptr ) \
( ( ptr ) [p0] - ( ptr ) [p1] - ( ptr ) [p2] + ( ptr ) [p3] )
__kernel void runLBPClassifierStump (
__kernel void runLBPClassifierStumpSimple (
int nscales, __global const ScaleData* scaleData,
__global const int* sum,
int sumstep, int sumoffset,
int _ sumstep, int sumoffset,
__global const OptLBPFeature* optfeatures,
int nstages,
int splitstage, int nstages,
__global const Stage* stages,
__global const Stump* stumps,
__global const int* bitsets,
int bitsetSize,
volatile __global int* facepos,
int2 imgsize, int xyscale, float factor,
int2 windowsize, int maxFaces )
{
int ix = get_global_id ( 0 ) *xyscale ;
int iy = get_global_id ( 1 ) *xyscale ;
sumstep /= sizeof ( int ) ;
int lx = get_local_id ( 0 ) ;
int ly = get_local_id ( 1 ) ;
int local_size_x = get_local_size ( 0 ) ;
int local_size_y = get_local_size ( 1 ) ;
int groupIdx = get_group_id ( 1 ) *get_num_groups ( 0 ) + get_group_id ( 0 ) ;
int ngroups = get_num_groups ( 0 ) *get_num_groups ( 1 ) ;
int scaleIdx, tileIdx, stageIdx ;
int startStage = 0 , endStage = nstages ;
int sumstep = ( int ) ( _sumstep/sizeof ( int ) ) ;
if ( ix < imgsize.x && iy < imgsize.y )
for ( scaleIdx = nscales-1 ; scaleIdx >= 0; scaleIdx-- )
{
int stageIdx ;
__global const Stump* stump = stumps ;
__global const int* p = sum + mad24 ( iy, sumstep, ix ) ;
__global const ScaleData* s = scaleData + scaleIdx ;
int ystep = s->ystep ;
int2 worksize = ( int2 ) ( max ( s->szi_width - windowsize.x, 0 ) , max ( s->szi_height - windowsize.y, 0 ) ) ;
int2 ntiles = ( int2 ) ( ( worksize.x/ystep + local_size_x-1 ) /local_size_x,
( worksize.y/ystep + local_size_y-1 ) /local_size_y ) ;
int totalTiles = ntiles.x*ntiles.y ;
for ( stageIdx = 0 ; stageIdx < nstages; stageIdx++ )
for ( tileIdx = groupIdx ; tileIdx < totalTiles; tileIdx += ngroups )
{
int i, ntrees = stages[stageIdx].ntrees ;
float s = 0.f ;
for ( i = 0 ; i < ntrees; i++, stump++, bitsets += bitsetSize )
int iy = ( ( tileIdx / ntiles.x ) *local_size_y + ly ) *ystep ;
int ix = ( ( tileIdx % ntiles.x ) *local_size_x + lx ) *ystep ;
if ( ix < worksize.x && iy < worksize.y )
{
float4 st = stump->st ;
__global const OptLBPFeature* f = optfeatures + as_int ( st.x ) ;
int16 ofs = f->ofs ;
__global const int* p = sum + mad24 ( iy, sumstep, ix ) + s->layer_ofs ;
__global const Stump* stump = stumps ;
__global const int* bitset = bitset s;
# define CALC_SUM_OFS_ ( p0, p1, p2, p3, ptr ) \
( ( ptr ) [p0] - ( ptr ) [p1] - ( ptr ) [p2] + ( ptr ) [p3] )
for ( stageIdx = 0 ; stageIdx < endStage; stageIdx++ )
{
int i, ntrees = stages[stageIdx].ntrees ;
float s = 0.f ;
for ( i = 0 ; i < ntrees; i++, stump++, bitset += bitsetSize )
{
float4 st = stump->st ;
__global const OptLBPFeature* f = optfeatures + as_int ( st.x ) ;
int16 ofs = f->ofs ;
int cval = CALC_SUM_OFS_ ( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p ) ;
int cval = CALC_SUM_OFS_ ( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p ) ;
int mask, idx = ( CALC_SUM_OFS_ ( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0 ) ; // 0
idx | = ( CALC_SUM_OFS_ ( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0 ) ; // 1
idx | = ( CALC_SUM_OFS_ ( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0 ) ; // 2
int mask, idx = ( CALC_SUM_OFS_ ( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0 ) ; // 0
idx | = ( CALC_SUM_OFS_ ( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0 ) ; // 1
idx | = ( CALC_SUM_OFS_ ( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0 ) ; // 2
mask = ( CALC_SUM_OFS_ ( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0 ) ; // 5
mask | = ( CALC_SUM_OFS_ ( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0 ) ; // 8
mask | = ( CALC_SUM_OFS_ ( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0 ) ; // 7
mask | = ( CALC_SUM_OFS_ ( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0 ) ; // 6
mask | = ( CALC_SUM_OFS_ ( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0 ) ; // 7
mask = ( CALC_SUM_OFS_ ( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0 ) ; // 5
mask | = ( CALC_SUM_OFS_ ( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0 ) ; // 8
mask | = ( CALC_SUM_OFS_ ( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0 ) ; // 7
mask | = ( CALC_SUM_OFS_ ( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0 ) ; // 6
mask | = ( CALC_SUM_OFS_ ( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0 ) ; // 7
s += ( bitsets[idx] & ( 1 << mask ) ) ? st.z : st.w ;
}
s += ( bitset[idx] & ( 1 << mask ) ) ? st.z : st.w ;
}
if ( s < stages[stageIdx].threshold )
break ;
}
if ( s < stages[stageIdx].threshold )
break ;
if ( stageIdx == nstages )
{
int nfaces = atomic_inc ( facepos ) ;
if ( nfaces < maxFaces )
{
volatile __global int* face = facepos + 1 + nfaces*3 ;
face[0] = scaleIdx ;
face[1] = ix ;
face[2] = iy ;
}
}
}
}
}
}
__kernel __attribute__ ( ( reqd_work_group_size ( LOCAL_SIZE_X,LOCAL_SIZE_Y,1 ) ) )
void runLBPClassifierStump (
int nscales, __global const ScaleData* scaleData,
__global const int* sum,
int _sumstep, int sumoffset,
__global const OptLBPFeature* optfeatures,
int splitstage, int nstages,
__global const Stage* stages,
__global const Stump* stumps,
__global const int* bitsets,
int bitsetSize,
volatile __global int* facepos,
int2 windowsize, int maxFaces )
{
int lx = get_local_id ( 0 ) ;
int ly = get_local_id ( 1 ) ;
int groupIdx = get_group_id ( 0 ) ;
int i, ngroups = get_global_size ( 0 ) /LOCAL_SIZE_X ;
int scaleIdx, tileIdx, stageIdx ;
int sumstep = ( int ) ( _sumstep/sizeof ( int ) ) ;
int lidx = ly*LOCAL_SIZE_X + lx ;
if ( stageIdx == nstages )
# define LOCAL_SIZE ( LOCAL_SIZE_X*LOCAL_SIZE_Y )
__local int lstore[SUM_BUF_SIZE + LOCAL_SIZE*3/2+1] ;
# if SUM_BUF_SIZE > 0
__local int* ibuf = lstore ;
__local int* lcount = ibuf + SUM_BUF_SIZE ;
# else
__local int* lcount = lstore ;
# endif
__local float* lpartsum = ( __local float* ) ( lcount + 1 ) ;
__local short* lbuf = ( __local short* ) ( lpartsum + LOCAL_SIZE ) ;
for ( scaleIdx = nscales-1 ; scaleIdx >= 0; scaleIdx-- )
{
__global const ScaleData* s = scaleData + scaleIdx ;
int ystep = s->ystep ;
int2 worksize = ( int2 ) ( max ( s->szi_width - windowsize.x, 0 ) , max ( s->szi_height - windowsize.y, 0 ) ) ;
int2 ntiles = ( int2 ) ( ( worksize.x + LOCAL_SIZE_X-1 ) /LOCAL_SIZE_X,
( worksize.y + LOCAL_SIZE_Y-1 ) /LOCAL_SIZE_Y ) ;
int totalTiles = ntiles.x*ntiles.y ;
for ( tileIdx = groupIdx ; tileIdx < totalTiles; tileIdx += ngroups )
{
int nfaces = atomic_inc ( facepos ) ;
if ( nfaces < maxFaces )
int ix0 = ( tileIdx % ntiles.x ) *LOCAL_SIZE_X ;
int iy0 = ( tileIdx / ntiles.x ) *LOCAL_SIZE_Y ;
int ix = lx, iy = ly ;
__global const int* psum0 = sum + mad24 ( iy0, sumstep, ix0 ) + s->layer_ofs ;
if ( ix0 >= worksize.x | | iy0 >= worksize.y )
continue ;
# if SUM_BUF_SIZE > 0
for ( i = lidx*4 ; i < SUM_BUF_SIZE; i += LOCAL_SIZE_X*LOCAL_SIZE_Y*4 )
{
volatile __global int* face = facepos + 1 + nfaces*4 ;
face[0] = convert_int_rte ( ix*factor ) ;
face[1] = convert_int_rte ( iy*factor ) ;
face[2] = convert_int_rte ( windowsize.x*factor ) ;
face[3] = convert_int_rte ( windowsize.y*factor ) ;
int dy = i/SUM_BUF_STEP, dx = i - dy*SUM_BUF_STEP ;
vstore4 ( vload4 ( 0 , psum0 + mad24 ( dy, sumstep, dx ) ) , 0 , ibuf+i ) ;
}
barrier ( CLK_LOCAL_MEM_FENCE ) ;
# endif
if ( lidx == 0 )
lcount[0] = 0 ;
barrier ( CLK_LOCAL_MEM_FENCE ) ;
if ( ix0 + ix < worksize.x && iy0 + iy < worksize.y )
{
__global const Stump* stump = stumps ;
__global const int* bitset = bitsets ;
# if SUM_BUF_SIZE > 0
__local const int* p = ibuf + mad24 ( iy, SUM_BUF_STEP, ix ) ;
# else
__global const int* p = psum0 + mad24 ( iy, sumstep, ix ) ;
# endif
for ( stageIdx = 0 ; stageIdx < splitstage; stageIdx++ )
{
int ntrees = stages[stageIdx].ntrees ;
float s = 0.f ;
for ( i = 0 ; i < ntrees; i++, stump++, bitset += bitsetSize )
{
float4 st = stump->st ;
__global const OptLBPFeature* f = optfeatures + as_int ( st.x ) ;
int16 ofs = f->ofs ;
int cval = CALC_SUM_OFS_ ( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p ) ;
int mask, idx = ( CALC_SUM_OFS_ ( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0 ) ; // 0
idx | = ( CALC_SUM_OFS_ ( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0 ) ; // 1
idx | = ( CALC_SUM_OFS_ ( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0 ) ; // 2
mask = ( CALC_SUM_OFS_ ( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0 ) ; // 5
mask | = ( CALC_SUM_OFS_ ( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0 ) ; // 8
mask | = ( CALC_SUM_OFS_ ( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0 ) ; // 7
mask | = ( CALC_SUM_OFS_ ( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0 ) ; // 6
mask | = ( CALC_SUM_OFS_ ( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0 ) ; // 7
s += ( bitset[idx] & ( 1 << mask ) ) ? st.z : st.w ;
}
if ( s < stages[stageIdx].threshold )
break ;
}
if ( stageIdx == splitstage && ( ystep == 1 | | ((ix | iy ) & 1 ) == 0 ) )
{
int count = atomic_inc ( lcount ) ;
lbuf[count] = ( int ) ( ix | ( iy << 8 ) ) ;
}
}
for ( stageIdx = splitstage ; stageIdx < nstages; stageIdx++ )
{
int nrects = lcount[0] ;
barrier ( CLK_LOCAL_MEM_FENCE ) ;
if ( nrects == 0 )
break ;
if ( lidx == 0 )
lcount[0] = 0 ;
{
__global const Stump* stump = stumps + stages[stageIdx].first ;
__global const int* bitset = bitsets + stages[stageIdx].first*bitsetSize ;
int nparts = LOCAL_SIZE / nrects ;
int ntrees = stages[stageIdx].ntrees ;
int ntrees_p = ( ntrees + nparts - 1 ) /nparts ;
int nr = lidx / nparts ;
int partidx = -1 , idxval = 0 ;
float partsum = 0.f, nf = 0.f ;
if ( nr < nrects )
{
partidx = lidx % nparts ;
idxval = lbuf[nr] ;
{
int ntrees0 = ntrees_p*partidx ;
int ntrees1 = min ( ntrees0 + ntrees_p, ntrees ) ;
int ix1 = idxval & 255 , iy1 = idxval >> 8 ;
# if SUM_BUF_SIZE > 0
__local const int* p = ibuf + mad24 ( iy1, SUM_BUF_STEP, ix1 ) ;
# else
__global const int* p = psum0 + mad24 ( iy1, sumstep, ix1 ) ;
# endif
for ( i = ntrees0 ; i < ntrees1; i++ )
{
float4 st = stump[i].st ;
__global const OptLBPFeature* f = optfeatures + as_int ( st.x ) ;
int16 ofs = f->ofs ;
# define CALC_SUM_OFS_ ( p0, p1, p2, p3, ptr ) \
( ( ptr ) [p0] - ( ptr ) [p1] - ( ptr ) [p2] + ( ptr ) [p3] )
int cval = CALC_SUM_OFS_ ( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p ) ;
int mask, idx = ( CALC_SUM_OFS_ ( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0 ) ; // 0
idx | = ( CALC_SUM_OFS_ ( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0 ) ; // 1
idx | = ( CALC_SUM_OFS_ ( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0 ) ; // 2
mask = ( CALC_SUM_OFS_ ( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0 ) ; // 5
mask | = ( CALC_SUM_OFS_ ( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0 ) ; // 8
mask | = ( CALC_SUM_OFS_ ( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0 ) ; // 7
mask | = ( CALC_SUM_OFS_ ( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0 ) ; // 6
mask | = ( CALC_SUM_OFS_ ( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0 ) ; // 7
partsum += ( bitset[i*bitsetSize + idx] & ( 1 << mask ) ) ? st.z : st.w ;
}
}
}
lpartsum[lidx] = partsum ;
barrier ( CLK_LOCAL_MEM_FENCE ) ;
if ( partidx == 0 )
{
float s = lpartsum[nr*nparts] ;
for ( i = 1 ; i < nparts; i++ )
s += lpartsum[i + nr*nparts] ;
if ( s >= stages[stageIdx].threshold )
{
int count = atomic_inc ( lcount ) ;
lbuf[count] = idxval ;
}
}
}
}
barrier ( CLK_LOCAL_MEM_FENCE ) ;
if ( stageIdx == nstages )
{
int nrects = lcount[0] ;
if ( lidx < nrects )
{
int nfaces = atomic_inc ( facepos ) ;
if ( nfaces < maxFaces )
{
volatile __global int* face = facepos + 1 + nfaces*3 ;
int val = lbuf[lidx] ;
face[0] = scaleIdx ;
face[1] = ix0 + ( val & 255 ) ;
face[2] = iy0 + ( val >> 8 ) ;
}
}
}
}
}