added optimization

pull/2464/head
Konstantin Matskevich 11 years ago
parent b5f717b6b3
commit bfc843a5f5
  1. 2
      modules/calib3d/perf/opencl/perf_stereobm.cpp
  2. 95
      modules/calib3d/src/opencl/stereobm.cl
  3. 42
      modules/calib3d/src/stereobm.cpp
  4. 6
      modules/calib3d/test/opencl/test_stereobm.cpp

@ -68,7 +68,7 @@ OCL_PERF_TEST_P(StereoBMFixture, StereoBM, ::testing::Combine(OCL_PERF_ENUM(32,
OCL_TEST_CYCLE() bm->compute(left, right, disp); OCL_TEST_CYCLE() bm->compute(left, right, disp);
SANITY_CHECK(disp, 1e-2, ERROR_RELATIVE); SANITY_CHECK(disp, 0.05, ERROR_RELATIVE);
} }
}//ocl }//ocl

@ -45,9 +45,102 @@
////////////////////////////////////////// stereoBM ////////////////////////////////////////////// ////////////////////////////////////////// stereoBM //////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////
#ifdef csize
__kernel void stereoBM_opt(__global const uchar * left, __global const uchar * right, __global uchar * dispptr,
int disp_step, int disp_offset, int rows, int cols, int mindisp, int ndisp,
int preFilterCap, int winsize, int textureTreshold, int uniquenessRatio)
{
int total_x = get_global_id(0);
int gx = get_group_id(0), x = gx*ndisp;
int y = get_global_id(1);
int d = get_local_id(0) + mindisp;
int wsz2 = winsize/2;
short FILTERED = (mindisp - 1)<<4;
__local int cost[csize];
int textsum[tsize];
if( total_x<cols && y<rows && d<ndisp)
{
int dispIdx = mad24(y, disp_step, disp_offset + total_x*(int)sizeof(short) );
__global short * disp = (__global short*)(dispptr + dispIdx);
disp[0] = FILTERED;
if( (total_x > ndisp-1) && (y > wsz2-1) && (total_x < cols + ndisp - cols%ndisp) && (y < rows - wsz2))
{
for(; (x <= ndisp+mindisp+wsz2-2); x++)
{
cost[(d-mindisp)+ndisp*(x%(gx*ndisp))] = INT_MAX;
textsum[x%(gx*ndisp)] = INT_MAX;
}
cost[(d-mindisp)+ndisp*(x%(gx*ndisp))] = 0;
textsum[x%(gx*ndisp)] = 0;
for(int i = -wsz2; i < wsz2+1; i++)
for(int j = -wsz2; j < wsz2+1; j++)
{
cost[(d-mindisp)+ndisp*(x%(gx*ndisp))] += abs( left[min( y+i, rows-1 ) * cols + min( x+j, cols-1 )]
- right[min( y+i, rows-1 ) * cols + min( x+j-d, cols-1 )] );
textsum[x%(gx*ndisp)] += abs( left[min( y+i, rows-1 ) * cols + min( x+j, cols-1 )] - preFilterCap );
}
x++;
for(; (x < gx*ndisp + ndisp) && (x < cols-wsz2-mindisp); x++)
{
cost[(d-mindisp)+ndisp*(x%(gx*ndisp))] = cost[(d-mindisp)+ndisp*((x-1)%(gx*ndisp))];
textsum[x%(gx*ndisp)] = textsum[(x-1)%(gx*ndisp)];
for(int i = -wsz2; i < wsz2+1; i++)
{
cost[(d-mindisp)+ndisp*(x%(gx*ndisp))] += -abs( left[min( y+i, rows-1 ) * cols + min( x-wsz2-1, cols-1 )]
- right[min( y+i, rows-1 ) * cols + min( x-wsz2-1-d, cols-1 )] ) +
abs( left[min( y+i, rows-1 ) * cols + min( x+wsz2, cols-1 )]
- right[min( y+i, rows-1 ) * cols + min( x+wsz2-d, cols-1 )] );
textsum[x%(gx*ndisp)] += -abs( left[min( y+i, rows-1 ) * cols + min( x-wsz2-1, cols-1 )] - preFilterCap ) +
abs( left[min( y+i, rows-1 ) * cols + min( x+wsz2, cols-1 )] - preFilterCap );
}
}
for(; (x > cols - (cols-1)%ndisp - 1) && (x < cols + ndisp - (cols-1)%ndisp - 1); x++)
{
cost[(d-mindisp)+ndisp*(x%(gx*ndisp))] = INT_MAX;
textsum[x%(gx*ndisp)] = INT_MAX;
}
barrier(CLK_LOCAL_MEM_FENCE);
int best_disp = FILTERED, best_cost = INT_MAX-1;
for(int i = 0; (i < ndisp); i++)
{
best_cost = (cost[i + ndisp*(d-mindisp)] < best_cost) ? cost[i + ndisp*(d-mindisp)] : best_cost;
best_disp = (best_cost == cost[i + ndisp*(d-mindisp)]) ? i+mindisp : best_disp;
}
int thresh = best_cost + (best_cost * uniquenessRatio/100);
for(int i = 0; (i < ndisp) && (uniquenessRatio > 0); i++)
{
best_disp = ( (cost[i + ndisp*(d-mindisp)] <= thresh) && (i < best_disp - mindisp - 1 || i > best_disp - mindisp + 1) ) ?
FILTERED : best_disp;
}
disp[0] = textsum[d-mindisp] < textureTreshold ? (FILTERED) : (best_disp == FILTERED) ? (short)(best_disp) : (short)(best_disp);
if( best_disp != FILTERED )
{
int y1 = (best_disp > mindisp) ? cost[(best_disp-mindisp-1) + ndisp*(d-mindisp)] :
cost[(best_disp-mindisp+1) + ndisp*(d-mindisp)],
y2 = cost[(best_disp-mindisp) + ndisp*(d-mindisp)],
y3 = (best_disp < mindisp+ndisp-1) ? cost[(best_disp-mindisp+1) + ndisp*(d-mindisp)] :
cost[(best_disp-mindisp-1) + ndisp*(d-mindisp)];
float a = (y3 - ((best_disp+1)*(y2-y1) + best_disp*y1 - (best_disp-1)*y2)/(best_disp - (best_disp-1)) )/
((best_disp+1)*((best_disp+1) - (best_disp-1) - best_disp) + (best_disp-1)*best_disp);
float b = (y2 - y1)/(best_disp - (best_disp-1)) - a*((best_disp-1)+best_disp);
disp[0] = (y1 == y2 || y3 == y2) ? (short)(best_disp*16) :(short)(-b/(2*a)*16);
}
}
}
}
#endif
#ifdef SIZE #ifdef SIZE
__kernel void stereoBM(__global const uchar * left, __global const uchar * right, __global uchar * dispptr, __kernel void stereoBM_BF(__global const uchar * left, __global const uchar * right, __global uchar * dispptr,
int disp_step, int disp_offset, int rows, int cols, int mindisp, int ndisp, int disp_step, int disp_offset, int rows, int cols, int mindisp, int ndisp,
int preFilterCap, int winsize, int textureTreshold, int uniquenessRatio) int preFilterCap, int winsize, int textureTreshold, int uniquenessRatio)
{ {

@ -736,10 +736,39 @@ struct PrefilterInvoker : public ParallelLoopBody
StereoBMParams* state; StereoBMParams* state;
}; };
static bool ocl_stereo( InputArray _left, InputArray _right, static bool ocl_stereobm_opt( InputArray _left, InputArray _right,
OutputArray _disp, StereoBMParams* state) OutputArray _disp, StereoBMParams* state)
{ {
ocl::Kernel k("stereoBM", ocl::calib3d::stereobm_oclsrc, cv::format("-D SIZE=%d", state->numDisparities ) ); int ndisp = state->numDisparities;
ocl::Kernel k("stereoBM_opt", ocl::calib3d::stereobm_oclsrc, cv::format("-D csize=%d -D tsize=%d", ndisp*ndisp, ndisp) );
if(k.empty())
return false;
UMat left = _left.getUMat(), right = _right.getUMat();
_disp.create(_left.size(), CV_16S);
UMat disp = _disp.getUMat();
size_t globalThreads[3] = { left.cols, left.rows, 1 };
size_t localThreads[3] = {ndisp, 1, 1};
int idx = 0;
idx = k.set(idx, ocl::KernelArg::PtrReadOnly(left));
idx = k.set(idx, ocl::KernelArg::PtrReadOnly(right));
idx = k.set(idx, ocl::KernelArg::WriteOnly(disp));
idx = k.set(idx, state->minDisparity);
idx = k.set(idx, ndisp);
idx = k.set(idx, state->preFilterCap);
idx = k.set(idx, state->SADWindowSize);
idx = k.set(idx, state->textureThreshold);
idx = k.set(idx, state->uniquenessRatio);
return k.run(2, globalThreads, localThreads, false);
}
static bool ocl_stereobm_bf(InputArray _left, InputArray _right,
OutputArray _disp, StereoBMParams* state)
{
ocl::Kernel k("stereoBM_BF", ocl::calib3d::stereobm_oclsrc, cv::format("-D SIZE=%d", state->numDisparities ) );
if(k.empty()) if(k.empty())
return false; return false;
@ -763,6 +792,15 @@ static bool ocl_stereo( InputArray _left, InputArray _right,
return k.run(2, globalThreads, NULL, false); return k.run(2, globalThreads, NULL, false);
} }
static bool ocl_stereo(InputArray _left, InputArray _right,
OutputArray _disp, StereoBMParams* state)
{
if(ocl::Device::getDefault().localMemSize() > state->numDisparities * state->numDisparities * sizeof(int) )
return ocl_stereobm_opt(_left, _right, _disp, state);
else
return ocl_stereobm_bf(_left, _right, _disp, state);
}
struct FindStereoCorrespInvoker : public ParallelLoopBody struct FindStereoCorrespInvoker : public ParallelLoopBody
{ {
FindStereoCorrespInvoker( const Mat& _left, const Mat& _right, FindStereoCorrespInvoker( const Mat& _left, const Mat& _right,

@ -85,11 +85,11 @@ OCL_TEST_P(StereoBMFixture, StereoBM)
OCL_OFF(bm->compute(left, right, disp)); OCL_OFF(bm->compute(left, right, disp));
OCL_ON(bm->compute(uleft, uright, udisp)); OCL_ON(bm->compute(uleft, uright, udisp));
Near(1e-2); Near(0.05);
} }
OCL_INSTANTIATE_TEST_CASE_P(StereoMatcher, StereoBMFixture, testing::Combine(testing::Values(128), OCL_INSTANTIATE_TEST_CASE_P(StereoMatcher, StereoBMFixture, testing::Combine(testing::Values(32, 64, 128),
testing::Values(15))); testing::Values(11, 21)));
}//ocl }//ocl
}//cvtest }//cvtest

Loading…
Cancel
Save