From bfc843a5f53ec05277ee3fd5b7d800bcc52b8984 Mon Sep 17 00:00:00 2001 From: Konstantin Matskevich Date: Tue, 18 Feb 2014 14:08:22 +0400 Subject: [PATCH] added optimization --- modules/calib3d/perf/opencl/perf_stereobm.cpp | 2 +- modules/calib3d/src/opencl/stereobm.cl | 95 ++++++++++++++++++- modules/calib3d/src/stereobm.cpp | 42 +++++++- modules/calib3d/test/opencl/test_stereobm.cpp | 6 +- 4 files changed, 138 insertions(+), 7 deletions(-) diff --git a/modules/calib3d/perf/opencl/perf_stereobm.cpp b/modules/calib3d/perf/opencl/perf_stereobm.cpp index 3352e6b1a5..dd2bc9e0a9 100644 --- a/modules/calib3d/perf/opencl/perf_stereobm.cpp +++ b/modules/calib3d/perf/opencl/perf_stereobm.cpp @@ -68,7 +68,7 @@ OCL_PERF_TEST_P(StereoBMFixture, StereoBM, ::testing::Combine(OCL_PERF_ENUM(32, OCL_TEST_CYCLE() bm->compute(left, right, disp); - SANITY_CHECK(disp, 1e-2, ERROR_RELATIVE); + SANITY_CHECK(disp, 0.05, ERROR_RELATIVE); } }//ocl diff --git a/modules/calib3d/src/opencl/stereobm.cl b/modules/calib3d/src/opencl/stereobm.cl index d8f238b890..7ab58dfec1 100644 --- a/modules/calib3d/src/opencl/stereobm.cl +++ b/modules/calib3d/src/opencl/stereobm.cl @@ -45,9 +45,102 @@ ////////////////////////////////////////// stereoBM ////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////// +#ifdef csize + +__kernel void stereoBM_opt(__global const uchar * left, __global const uchar * right, __global uchar * dispptr, + int disp_step, int disp_offset, int rows, int cols, int mindisp, int ndisp, + int preFilterCap, int winsize, int textureTreshold, int uniquenessRatio) +{ + int total_x = get_global_id(0); + int gx = get_group_id(0), x = gx*ndisp; + int y = get_global_id(1); + int d = get_local_id(0) + mindisp; + int wsz2 = winsize/2; + short FILTERED = (mindisp - 1)<<4; + __local int cost[csize]; + int textsum[tsize]; + if( total_x ndisp-1) && (y > wsz2-1) && (total_x < cols + ndisp - cols%ndisp) && (y < rows - wsz2)) + { + for(; (x <= ndisp+mindisp+wsz2-2); x++) + { + cost[(d-mindisp)+ndisp*(x%(gx*ndisp))] = INT_MAX; + textsum[x%(gx*ndisp)] = INT_MAX; + } + cost[(d-mindisp)+ndisp*(x%(gx*ndisp))] = 0; + textsum[x%(gx*ndisp)] = 0; + for(int i = -wsz2; i < wsz2+1; i++) + for(int j = -wsz2; j < wsz2+1; j++) + { + cost[(d-mindisp)+ndisp*(x%(gx*ndisp))] += abs( left[min( y+i, rows-1 ) * cols + min( x+j, cols-1 )] + - right[min( y+i, rows-1 ) * cols + min( x+j-d, cols-1 )] ); + textsum[x%(gx*ndisp)] += abs( left[min( y+i, rows-1 ) * cols + min( x+j, cols-1 )] - preFilterCap ); + } + x++; + for(; (x < gx*ndisp + ndisp) && (x < cols-wsz2-mindisp); x++) + { + cost[(d-mindisp)+ndisp*(x%(gx*ndisp))] = cost[(d-mindisp)+ndisp*((x-1)%(gx*ndisp))]; + textsum[x%(gx*ndisp)] = textsum[(x-1)%(gx*ndisp)]; + for(int i = -wsz2; i < wsz2+1; i++) + { + cost[(d-mindisp)+ndisp*(x%(gx*ndisp))] += -abs( left[min( y+i, rows-1 ) * cols + min( x-wsz2-1, cols-1 )] + - right[min( y+i, rows-1 ) * cols + min( x-wsz2-1-d, cols-1 )] ) + + abs( left[min( y+i, rows-1 ) * cols + min( x+wsz2, cols-1 )] + - right[min( y+i, rows-1 ) * cols + min( x+wsz2-d, cols-1 )] ); + textsum[x%(gx*ndisp)] += -abs( left[min( y+i, rows-1 ) * cols + min( x-wsz2-1, cols-1 )] - preFilterCap ) + + abs( left[min( y+i, rows-1 ) * cols + min( x+wsz2, cols-1 )] - preFilterCap ); + } + } + + for(; (x > cols - (cols-1)%ndisp - 1) && (x < cols + ndisp - (cols-1)%ndisp - 1); x++) + { + cost[(d-mindisp)+ndisp*(x%(gx*ndisp))] = INT_MAX; + textsum[x%(gx*ndisp)] = INT_MAX; + } + barrier(CLK_LOCAL_MEM_FENCE); + + int best_disp = FILTERED, best_cost = INT_MAX-1; + for(int i = 0; (i < ndisp); i++) + { + best_cost = (cost[i + ndisp*(d-mindisp)] < best_cost) ? cost[i + ndisp*(d-mindisp)] : best_cost; + best_disp = (best_cost == cost[i + ndisp*(d-mindisp)]) ? i+mindisp : best_disp; + } + + int thresh = best_cost + (best_cost * uniquenessRatio/100); + for(int i = 0; (i < ndisp) && (uniquenessRatio > 0); i++) + { + best_disp = ( (cost[i + ndisp*(d-mindisp)] <= thresh) && (i < best_disp - mindisp - 1 || i > best_disp - mindisp + 1) ) ? + FILTERED : best_disp; + } + + disp[0] = textsum[d-mindisp] < textureTreshold ? (FILTERED) : (best_disp == FILTERED) ? (short)(best_disp) : (short)(best_disp); + + if( best_disp != FILTERED ) + { + int y1 = (best_disp > mindisp) ? cost[(best_disp-mindisp-1) + ndisp*(d-mindisp)] : + cost[(best_disp-mindisp+1) + ndisp*(d-mindisp)], + y2 = cost[(best_disp-mindisp) + ndisp*(d-mindisp)], + y3 = (best_disp < mindisp+ndisp-1) ? cost[(best_disp-mindisp+1) + ndisp*(d-mindisp)] : + cost[(best_disp-mindisp-1) + ndisp*(d-mindisp)]; + float a = (y3 - ((best_disp+1)*(y2-y1) + best_disp*y1 - (best_disp-1)*y2)/(best_disp - (best_disp-1)) )/ + ((best_disp+1)*((best_disp+1) - (best_disp-1) - best_disp) + (best_disp-1)*best_disp); + float b = (y2 - y1)/(best_disp - (best_disp-1)) - a*((best_disp-1)+best_disp); + disp[0] = (y1 == y2 || y3 == y2) ? (short)(best_disp*16) :(short)(-b/(2*a)*16); + } + } + } +} + +#endif + #ifdef SIZE -__kernel void stereoBM(__global const uchar * left, __global const uchar * right, __global uchar * dispptr, +__kernel void stereoBM_BF(__global const uchar * left, __global const uchar * right, __global uchar * dispptr, int disp_step, int disp_offset, int rows, int cols, int mindisp, int ndisp, int preFilterCap, int winsize, int textureTreshold, int uniquenessRatio) { diff --git a/modules/calib3d/src/stereobm.cpp b/modules/calib3d/src/stereobm.cpp index 510a457d54..05652f03d6 100644 --- a/modules/calib3d/src/stereobm.cpp +++ b/modules/calib3d/src/stereobm.cpp @@ -736,10 +736,39 @@ struct PrefilterInvoker : public ParallelLoopBody StereoBMParams* state; }; -static bool ocl_stereo( InputArray _left, InputArray _right, +static bool ocl_stereobm_opt( InputArray _left, InputArray _right, OutputArray _disp, StereoBMParams* state) { - ocl::Kernel k("stereoBM", ocl::calib3d::stereobm_oclsrc, cv::format("-D SIZE=%d", state->numDisparities ) ); + int ndisp = state->numDisparities; + ocl::Kernel k("stereoBM_opt", ocl::calib3d::stereobm_oclsrc, cv::format("-D csize=%d -D tsize=%d", ndisp*ndisp, ndisp) ); + if(k.empty()) + return false; + + UMat left = _left.getUMat(), right = _right.getUMat(); + _disp.create(_left.size(), CV_16S); + UMat disp = _disp.getUMat(); + + size_t globalThreads[3] = { left.cols, left.rows, 1 }; + size_t localThreads[3] = {ndisp, 1, 1}; + + int idx = 0; + idx = k.set(idx, ocl::KernelArg::PtrReadOnly(left)); + idx = k.set(idx, ocl::KernelArg::PtrReadOnly(right)); + idx = k.set(idx, ocl::KernelArg::WriteOnly(disp)); + idx = k.set(idx, state->minDisparity); + idx = k.set(idx, ndisp); + idx = k.set(idx, state->preFilterCap); + idx = k.set(idx, state->SADWindowSize); + idx = k.set(idx, state->textureThreshold); + idx = k.set(idx, state->uniquenessRatio); + + return k.run(2, globalThreads, localThreads, false); +} + +static bool ocl_stereobm_bf(InputArray _left, InputArray _right, + OutputArray _disp, StereoBMParams* state) +{ + ocl::Kernel k("stereoBM_BF", ocl::calib3d::stereobm_oclsrc, cv::format("-D SIZE=%d", state->numDisparities ) ); if(k.empty()) return false; @@ -763,6 +792,15 @@ static bool ocl_stereo( InputArray _left, InputArray _right, return k.run(2, globalThreads, NULL, false); } +static bool ocl_stereo(InputArray _left, InputArray _right, + OutputArray _disp, StereoBMParams* state) +{ + if(ocl::Device::getDefault().localMemSize() > state->numDisparities * state->numDisparities * sizeof(int) ) + return ocl_stereobm_opt(_left, _right, _disp, state); + else + return ocl_stereobm_bf(_left, _right, _disp, state); +} + struct FindStereoCorrespInvoker : public ParallelLoopBody { FindStereoCorrespInvoker( const Mat& _left, const Mat& _right, diff --git a/modules/calib3d/test/opencl/test_stereobm.cpp b/modules/calib3d/test/opencl/test_stereobm.cpp index c3903f6a81..15fa93aa4f 100644 --- a/modules/calib3d/test/opencl/test_stereobm.cpp +++ b/modules/calib3d/test/opencl/test_stereobm.cpp @@ -85,11 +85,11 @@ OCL_TEST_P(StereoBMFixture, StereoBM) OCL_OFF(bm->compute(left, right, disp)); OCL_ON(bm->compute(uleft, uright, udisp)); - Near(1e-2); + Near(0.05); } -OCL_INSTANTIATE_TEST_CASE_P(StereoMatcher, StereoBMFixture, testing::Combine(testing::Values(128), - testing::Values(15))); +OCL_INSTANTIATE_TEST_CASE_P(StereoMatcher, StereoBMFixture, testing::Combine(testing::Values(32, 64, 128), + testing::Values(11, 21))); }//ocl }//cvtest