From aea4157340b3136e67c02a5565b152dbe5c73f2e Mon Sep 17 00:00:00 2001 From: sbokov <sbokov01@gmail.com> Date: Tue, 7 Jul 2015 23:23:51 +0300 Subject: [PATCH] Adding new HAL-accelerated MODE_SGBM_3WAY New mode is approximately 2-3 times faster than MODE_SGBM with minimal degradation in quality and uses universal HAL intrinsics. A performance test was added. The accuracy test was updated to support the new mode. --- modules/calib3d/include/opencv2/calib3d.hpp | 3 +- modules/calib3d/perf/perf_stereosgbm.cpp | 159 ++++ modules/calib3d/src/stereosgbm.cpp | 716 ++++++++++++++++++- modules/calib3d/test/test_stereomatching.cpp | 7 +- samples/cpp/stereo_match.cpp | 16 +- 5 files changed, 854 insertions(+), 47 deletions(-) create mode 100644 modules/calib3d/perf/perf_stereosgbm.cpp diff --git a/modules/calib3d/include/opencv2/calib3d.hpp b/modules/calib3d/include/opencv2/calib3d.hpp index 66ffe2c1d3..1bbe31cda2 100644 --- a/modules/calib3d/include/opencv2/calib3d.hpp +++ b/modules/calib3d/include/opencv2/calib3d.hpp @@ -1555,7 +1555,8 @@ public: enum { MODE_SGBM = 0, - MODE_HH = 1 + MODE_HH = 1, + MODE_SGBM_3WAY = 2 }; CV_WRAP virtual int getPreFilterCap() const = 0; diff --git a/modules/calib3d/perf/perf_stereosgbm.cpp b/modules/calib3d/perf/perf_stereosgbm.cpp new file mode 100644 index 0000000000..8dc71625da --- /dev/null +++ b/modules/calib3d/perf/perf_stereosgbm.cpp @@ -0,0 +1,159 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3 - clause BSD License) + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met : + * + * *Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and / or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort(including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "perf_precomp.hpp" + +namespace cvtest +{ + +using std::tr1::tuple; +using std::tr1::get; +using namespace perf; +using namespace testing; +using namespace cv; + +void MakeArtificialExample(RNG rng, Mat& dst_left_view, Mat& dst_view); + +CV_ENUM(SGBMModes, StereoSGBM::MODE_SGBM, StereoSGBM::MODE_SGBM_3WAY); +typedef tuple<Size, int, SGBMModes> SGBMParams; +typedef TestBaseWithParam<SGBMParams> TestStereoCorresp; + +PERF_TEST_P( TestStereoCorresp, SGBM, Combine(Values(Size(1280,720),Size(640,480)), Values(256,128), SGBMModes::all()) ) +{ + RNG rng(0); + + SGBMParams params = GetParam(); + + Size sz = get<0>(params); + int num_disparities = get<1>(params); + int mode = get<2>(params); + + Mat src_left(sz, CV_8UC3); + Mat src_right(sz, CV_8UC3); + Mat dst(sz, CV_16S); + + MakeArtificialExample(rng,src_left,src_right); + + cv::setNumThreads(cv::getNumberOfCPUs()); + int wsize = 3; + int P1 = 8*src_left.channels()*wsize*wsize; + TEST_CYCLE() + { + Ptr<StereoSGBM> sgbm = StereoSGBM::create(0,num_disparities,wsize,P1,4*P1,1,63,25,0,0,mode); + sgbm->compute(src_left,src_right,dst); + } + + SANITY_CHECK(dst, .01, ERROR_RELATIVE); +} + +void MakeArtificialExample(RNG rng, Mat& dst_left_view, Mat& dst_right_view) +{ + int w = dst_left_view.cols; + int h = dst_left_view.rows; + + //params: + unsigned char bg_level = (unsigned char)rng.uniform(0.0,255.0); + unsigned char fg_level = (unsigned char)rng.uniform(0.0,255.0); + int rect_width = (int)rng.uniform(w/16,w/2); + int rect_height = (int)rng.uniform(h/16,h/2); + int rect_disparity = (int)(0.15*w); + double sigma = 3.0; + + int rect_x_offset = (w-rect_width) /2; + int rect_y_offset = (h-rect_height)/2; + + if(dst_left_view.channels()==3) + { + dst_left_view = Scalar(Vec3b(bg_level,bg_level,bg_level)); + dst_right_view = Scalar(Vec3b(bg_level,bg_level,bg_level)); + } + else + { + dst_left_view = Scalar(bg_level); + dst_right_view = Scalar(bg_level); + } + + Mat dst_left_view_rect = Mat(dst_left_view, Rect(rect_x_offset,rect_y_offset,rect_width,rect_height)); + if(dst_left_view.channels()==3) + dst_left_view_rect = Scalar(Vec3b(fg_level,fg_level,fg_level)); + else + dst_left_view_rect = Scalar(fg_level); + + rect_x_offset-=rect_disparity; + + Mat dst_right_view_rect = Mat(dst_right_view, Rect(rect_x_offset,rect_y_offset,rect_width,rect_height)); + if(dst_right_view.channels()==3) + dst_right_view_rect = Scalar(Vec3b(fg_level,fg_level,fg_level)); + else + dst_right_view_rect = Scalar(fg_level); + + //add some gaussian noise: + unsigned char *l, *r; + for(int i=0;i<h;i++) + { + l = dst_left_view.ptr(i); + r = dst_right_view.ptr(i); + + if(dst_left_view.channels()==3) + { + for(int j=0;j<w;j++) + { + l[0] = saturate_cast<unsigned char>(l[0] + rng.gaussian(sigma)); + l[1] = saturate_cast<unsigned char>(l[1] + rng.gaussian(sigma)); + l[2] = saturate_cast<unsigned char>(l[2] + rng.gaussian(sigma)); + l+=3; + + r[0] = saturate_cast<unsigned char>(r[0] + rng.gaussian(sigma)); + r[1] = saturate_cast<unsigned char>(r[1] + rng.gaussian(sigma)); + r[2] = saturate_cast<unsigned char>(r[2] + rng.gaussian(sigma)); + r+=3; + } + } + else + { + for(int j=0;j<w;j++) + { + l[0] = saturate_cast<unsigned char>(l[0] + rng.gaussian(sigma)); + l++; + + r[0] = saturate_cast<unsigned char>(r[0] + rng.gaussian(sigma)); + r++; + } + } + } +} + +} diff --git a/modules/calib3d/src/stereosgbm.cpp b/modules/calib3d/src/stereosgbm.cpp index 4b0aa5a25b..afdd05c792 100644 --- a/modules/calib3d/src/stereosgbm.cpp +++ b/modules/calib3d/src/stereosgbm.cpp @@ -52,6 +52,7 @@ #include "precomp.hpp" #include <limits.h> +#include "opencv2/hal/intrin.hpp" namespace cv { @@ -180,10 +181,6 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y, buffer -= minX2; cost -= minX1*D + minD; // simplify the cost indices inside the loop -#if CV_SSE2 - volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2); -#endif - #if 1 for( c = 0; c < cn*2; c++, prow1 += width, prow2 += width ) { @@ -211,43 +208,39 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y, int u0 = std::min(ul, ur); u0 = std::min(u0, u); int u1 = std::max(ul, ur); u1 = std::max(u1, u); - #if CV_SSE2 - if( useSIMD ) - { - __m128i _u = _mm_set1_epi8((char)u), _u0 = _mm_set1_epi8((char)u0); - __m128i _u1 = _mm_set1_epi8((char)u1), z = _mm_setzero_si128(); - __m128i ds = _mm_cvtsi32_si128(diff_scale); + #if CV_SIMD128 + v_uint8x16 _u = v_setall_u8((uchar)u), _u0 = v_setall_u8((uchar)u0); + v_uint8x16 _u1 = v_setall_u8((uchar)u1); - for( int d = minD; d < maxD; d += 16 ) - { - __m128i _v = _mm_loadu_si128((const __m128i*)(prow2 + width-x-1 + d)); - __m128i _v0 = _mm_loadu_si128((const __m128i*)(buffer + width-x-1 + d)); - __m128i _v1 = _mm_loadu_si128((const __m128i*)(buffer + width-x-1 + d + width2)); - __m128i c0 = _mm_max_epu8(_mm_subs_epu8(_u, _v1), _mm_subs_epu8(_v0, _u)); - __m128i c1 = _mm_max_epu8(_mm_subs_epu8(_v, _u1), _mm_subs_epu8(_u0, _v)); - __m128i diff = _mm_min_epu8(c0, c1); - - c0 = _mm_load_si128((__m128i*)(cost + x*D + d)); - c1 = _mm_load_si128((__m128i*)(cost + x*D + d + 8)); - - _mm_store_si128((__m128i*)(cost + x*D + d), _mm_adds_epi16(c0, _mm_srl_epi16(_mm_unpacklo_epi8(diff,z), ds))); - _mm_store_si128((__m128i*)(cost + x*D + d + 8), _mm_adds_epi16(c1, _mm_srl_epi16(_mm_unpackhi_epi8(diff,z), ds))); - } + for( int d = minD; d < maxD; d += 16 ) + { + v_uint8x16 _v = v_load(prow2 + width-x-1 + d); + v_uint8x16 _v0 = v_load(buffer + width-x-1 + d); + v_uint8x16 _v1 = v_load(buffer + width-x-1 + d + width2); + v_uint8x16 c0 = v_max(_u - _v1, _v0 - _u); + v_uint8x16 c1 = v_max(_v - _u1, _u0 - _v); + v_uint8x16 diff = v_min(c0, c1); + + v_int16x8 _c0 = v_load_aligned(cost + x*D + d); + v_int16x8 _c1 = v_load_aligned(cost + x*D + d + 8); + + v_uint16x8 diff1,diff2; + v_expand(diff,diff1,diff2); + v_store_aligned(cost + x*D + d, _c0 + v_reinterpret_as_s16(diff1 >> diff_scale)); + v_store_aligned(cost + x*D + d + 8, _c1 + v_reinterpret_as_s16(diff2 >> diff_scale)); } - else - #endif + #else + for( int d = minD; d < maxD; d++ ) { - for( int d = minD; d < maxD; d++ ) - { - int v = prow2[width-x-1 + d]; - int v0 = buffer[width-x-1 + d]; - int v1 = buffer[width-x-1 + d + width2]; - int c0 = std::max(0, u - v1); c0 = std::max(c0, v0 - u); - int c1 = std::max(0, v - u1); c1 = std::max(c1, u0 - v); + int v = prow2[width-x-1 + d]; + int v0 = buffer[width-x-1 + d]; + int v1 = buffer[width-x-1 + d + width2]; + int c0 = std::max(0, u - v1); c0 = std::max(c0, v0 - u); + int c1 = std::max(0, v - u1); c1 = std::max(c1, u0 - v); - cost[x*D + d] = (CostType)(cost[x*D+d] + (std::min(c0, c1) >> diff_scale)); - } + cost[x*D + d] = (CostType)(cost[x*D+d] + (std::min(c0, c1) >> diff_scale)); } + #endif } } #else @@ -829,6 +822,645 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, } } +////////////////////////////////////////////////////////////////////////////////////////////////////// + +void getBufferPointers(Mat& buffer, int width, int width1, int D, int num_ch, int SH2, int P2, + CostType*& curCostVolumeLine, CostType*& hsumBuf, CostType*& pixDiff, + PixType*& tmpBuf, CostType*& horPassCostVolume, + CostType*& vertPassCostVolume, CostType*& vertPassMin, CostType*& rightPassBuf, + CostType*& disp2CostBuf, short*& disp2Buf); + +struct SGBM3WayMainLoop : public ParallelLoopBody +{ + Mat* buffers; + const Mat *img1, *img2; + Mat* dst_disp; + + int nstripes, stripe_sz; + int stripe_overlap; + + int width,height; + int minD, maxD, D; + int minX1, maxX1, width1; + + int SW2, SH2; + int P1, P2; + int uniquenessRatio, disp12MaxDiff; + + int costBufSize, hsumBufNRows; + int TAB_OFS, ftzero; + + PixType* clipTab; + + SGBM3WayMainLoop(Mat *_buffers, const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, PixType* _clipTab, int _nstripes, int _stripe_overlap); + void getRawMatchingCost(CostType* C, CostType* hsumBuf, CostType* pixDiff, PixType* tmpBuf, int y, int src_start_idx) const; + void operator () (const Range& range) const; +}; + +SGBM3WayMainLoop::SGBM3WayMainLoop(Mat *_buffers, const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, PixType* _clipTab, int _nstripes, int _stripe_overlap): +buffers(_buffers), img1(&_img1), img2(&_img2), dst_disp(_dst_disp), clipTab(_clipTab) +{ + nstripes = _nstripes; + stripe_overlap = _stripe_overlap; + stripe_sz = (int)ceil(img1->rows/(double)nstripes); + + width = img1->cols; height = img1->rows; + minD = params.minDisparity; maxD = minD + params.numDisparities; D = maxD - minD; + minX1 = std::max(-maxD, 0); maxX1 = width + std::min(minD, 0); width1 = maxX1 - minX1; + CV_Assert( D % 16 == 0 ); + + SW2 = SH2 = params.SADWindowSize > 0 ? params.SADWindowSize/2 : 1; + + P1 = params.P1 > 0 ? params.P1 : 2; P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1); + uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10; + disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1; + + costBufSize = width1*D; + hsumBufNRows = SH2*2 + 2; + TAB_OFS = 256*4; + ftzero = std::max(params.preFilterCap, 15) | 1; +} + +void getBufferPointers(Mat& buffer, int width, int width1, int D, int num_ch, int SH2, int P2, + CostType*& curCostVolumeLine, CostType*& hsumBuf, CostType*& pixDiff, + PixType*& tmpBuf, CostType*& horPassCostVolume, + CostType*& vertPassCostVolume, CostType*& vertPassMin, CostType*& rightPassBuf, + CostType*& disp2CostBuf, short*& disp2Buf) +{ + // allocating all the required memory: + int costVolumeLineSize = width1*D; + int width1_ext = width1+2; + int costVolumeLineSize_ext = width1_ext*D; + int hsumBufNRows = SH2*2 + 2; + + // main buffer to store matching costs for the current line: + int curCostVolumeLineSize = costVolumeLineSize*sizeof(CostType); + + // auxiliary buffers for the raw matching cost computation: + int hsumBufSize = costVolumeLineSize*hsumBufNRows*sizeof(CostType); + int pixDiffSize = costVolumeLineSize*sizeof(CostType); + int tmpBufSize = width*16*num_ch*sizeof(PixType); + + // auxiliary buffers for the matching cost aggregation: + int horPassCostVolumeSize = costVolumeLineSize_ext*sizeof(CostType); // buffer for the 2-pass horizontal cost aggregation + int vertPassCostVolumeSize = costVolumeLineSize_ext*sizeof(CostType); // buffer for the vertical cost aggregation + int vertPassMinSize = width1_ext*sizeof(CostType); // buffer for storing minimum costs from the previous line + int rightPassBufSize = D*sizeof(CostType); // additional small buffer for the right-to-left pass + + // buffers for the pseudo-LRC check: + int disp2CostBufSize = width*sizeof(CostType); + int disp2BufSize = width*sizeof(short); + + // sum up the sizes of all the buffers: + size_t totalBufSize = curCostVolumeLineSize + + hsumBufSize + + pixDiffSize + + tmpBufSize + + horPassCostVolumeSize + + vertPassCostVolumeSize + + vertPassMinSize + + rightPassBufSize + + disp2CostBufSize + + disp2BufSize + + 16; //to compensate for the alignPtr shifts + + if( buffer.empty() || !buffer.isContinuous() || buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize ) + buffer.create(1, (int)totalBufSize, CV_8U); + + // set up all the pointers: + curCostVolumeLine = (CostType*)alignPtr(buffer.ptr(), 16); + hsumBuf = curCostVolumeLine + costVolumeLineSize; + pixDiff = hsumBuf + costVolumeLineSize*hsumBufNRows; + tmpBuf = (PixType*)(pixDiff + costVolumeLineSize); + horPassCostVolume = (CostType*)(tmpBuf + width*16*num_ch); + vertPassCostVolume = horPassCostVolume + costVolumeLineSize_ext; + rightPassBuf = vertPassCostVolume + costVolumeLineSize_ext; + vertPassMin = rightPassBuf + D; + disp2CostBuf = vertPassMin + width1_ext; + disp2Buf = disp2CostBuf + width; + + // initialize memory: + memset(buffer.ptr(),0,totalBufSize); + for(int i=0;i<costVolumeLineSize;i++) + curCostVolumeLine[i] = (CostType)P2; //such initialization simplifies the cost aggregation loops a bit +} + +// performing block matching and building raw cost-volume for the current row +void SGBM3WayMainLoop::getRawMatchingCost(CostType* C, // target cost-volume row + CostType* hsumBuf, CostType* pixDiff, PixType* tmpBuf, //buffers + int y, int src_start_idx) const +{ + int x, d; + int dy1 = (y == src_start_idx) ? src_start_idx : y + SH2, dy2 = (y == src_start_idx) ? src_start_idx+SH2 : dy1; + + for(int k = dy1; k <= dy2; k++ ) + { + CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize; + if( k < height ) + { + calcPixelCostBT( *img1, *img2, k, minD, maxD, pixDiff, tmpBuf, clipTab, TAB_OFS, ftzero ); + + memset(hsumAdd, 0, D*sizeof(CostType)); + for(x = 0; x <= SW2*D; x += D ) + { + int scale = x == 0 ? SW2 + 1 : 1; + + for( d = 0; d < D; d++ ) + hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]*scale); + } + + if( y > src_start_idx ) + { + const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, src_start_idx) % hsumBufNRows)*costBufSize; + + for( x = D; x < width1*D; x += D ) + { + const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D); + const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0); + +#if CV_SIMD128 + v_int16x8 hv_reg; + for( d = 0; d < D; d+=8 ) + { + hv_reg = v_load_aligned(hsumAdd+x-D+d) + (v_load_aligned(pixAdd+d) - v_load_aligned(pixSub+d)); + v_store_aligned(hsumAdd+x+d,hv_reg); + v_store_aligned(C+x+d,v_load_aligned(C+x+d)+(hv_reg-v_load_aligned(hsumSub+x+d))); + } +#else + for( d = 0; d < D; d++ ) + { + int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]); + C[x + d] = (CostType)(C[x + d] + hv - hsumSub[x + d]); + } +#endif + } + } + else + { + for( x = D; x < width1*D; x += D ) + { + const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D); + const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0); + + for( d = 0; d < D; d++ ) + hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]); + } + } + } + + if( y == src_start_idx ) + { + int scale = k == src_start_idx ? SH2 + 1 : 1; + for( x = 0; x < width1*D; x++ ) + C[x] = (CostType)(C[x] + hsumAdd[x]*scale); + } + } +} + +#if CV_SIMD128 && CV_SSE2 +// define some additional reduce operations: +inline short min(const v_int16x8& a) +{ + short CV_DECL_ALIGNED(16) buf[8]; + v_store_aligned(buf, a); + short s0 = std::min(buf[0], buf[1]); + short s1 = std::min(buf[2], buf[3]); + short s2 = std::min(buf[4], buf[5]); + short s3 = std::min(buf[6], buf[7]); + return std::min(std::min(s0, s1),std::min(s2, s3)); +} + +inline short min_pos(const v_int16x8& val,const v_int16x8& pos) +{ + short CV_DECL_ALIGNED(16) val_buf[8]; + v_store_aligned(val_buf, val); + short CV_DECL_ALIGNED(16) pos_buf[8]; + v_store_aligned(pos_buf, pos); + short res_pos = 0; + short min_val = SHRT_MAX; + if(val_buf[0]<min_val) {min_val=val_buf[0]; res_pos=pos_buf[0];} + if(val_buf[1]<min_val) {min_val=val_buf[1]; res_pos=pos_buf[1];} + if(val_buf[2]<min_val) {min_val=val_buf[2]; res_pos=pos_buf[2];} + if(val_buf[3]<min_val) {min_val=val_buf[3]; res_pos=pos_buf[3];} + if(val_buf[4]<min_val) {min_val=val_buf[4]; res_pos=pos_buf[4];} + if(val_buf[5]<min_val) {min_val=val_buf[5]; res_pos=pos_buf[5];} + if(val_buf[6]<min_val) {min_val=val_buf[6]; res_pos=pos_buf[6];} + if(val_buf[7]<min_val) {min_val=val_buf[7]; res_pos=pos_buf[7];} + return res_pos; +} +#endif + +// performing SGM cost accumulation from left to right (result is stored in leftBuf) and +// in-place cost accumulation from top to bottom (result is stored in topBuf) +inline void accumulateCostsLeftTop(CostType* leftBuf, CostType* leftBuf_prev, CostType* topBuf, CostType* costs, + CostType& leftMinCost, CostType& topMinCost, int D, int P1, int P2) +{ +#if CV_SIMD128 && CV_SSE2 + v_int16x8 P1_reg = v_setall_s16(cv::saturate_cast<CostType>(P1)); + + v_int16x8 leftMinCostP2_reg = v_setall_s16(cv::saturate_cast<CostType>(leftMinCost+P2)); + v_int16x8 leftMinCost_new_reg = v_setall_s16(SHRT_MAX); + v_int16x8 src0_leftBuf = v_setall_s16(SHRT_MAX); + v_int16x8 src1_leftBuf = v_load_aligned(leftBuf_prev); + + v_int16x8 topMinCostP2_reg = v_setall_s16(cv::saturate_cast<CostType>(topMinCost+P2)); + v_int16x8 topMinCost_new_reg = v_setall_s16(SHRT_MAX); + v_int16x8 src0_topBuf = v_setall_s16(SHRT_MAX); + v_int16x8 src1_topBuf = v_load_aligned(topBuf); + + v_int16x8 src2; + v_int16x8 src_shifted_left,src_shifted_right; + v_int16x8 res; + + for(int i=0;i<D-8;i+=8) + { + //process leftBuf: + //lookahead load: + src2 = v_load_aligned(leftBuf_prev+i+8); + + //get shifted versions of the current block: + src_shifted_left = v_int16x8(_mm_slli_si128(src1_leftBuf.val, 2)); + src_shifted_right = v_int16x8(_mm_srli_si128(src1_leftBuf.val, 2)); + + // replace shifted-in zeros with proper values and add P1: + src_shifted_left = (src_shifted_left | v_int16x8(_mm_srli_si128(src0_leftBuf.val, 14)))+P1_reg; + src_shifted_right = (src_shifted_right | v_int16x8(_mm_slli_si128(src2.val, 14)))+P1_reg; + + // process and save current block: + res = v_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_leftBuf,leftMinCostP2_reg))-leftMinCostP2_reg); + leftMinCost_new_reg = v_min(leftMinCost_new_reg,res); + v_store_aligned(leftBuf+i, res); + + //update src buffers: + src0_leftBuf = src1_leftBuf; + src1_leftBuf = src2; + + //process topBuf: + //lookahead load: + src2 = v_load_aligned(topBuf+i+8); + + //get shifted versions of the current block: + src_shifted_left = v_int16x8(_mm_slli_si128(src1_topBuf.val, 2)); + src_shifted_right = v_int16x8(_mm_srli_si128(src1_topBuf.val, 2)); + + // replace shifted-in zeros with proper values and add P1: + src_shifted_left = (src_shifted_left | v_int16x8(_mm_srli_si128(src0_topBuf.val, 14)))+P1_reg; + src_shifted_right = (src_shifted_right | v_int16x8(_mm_slli_si128(src2.val , 14)))+P1_reg; + + // process and save current block: + res = v_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_topBuf,topMinCostP2_reg))-topMinCostP2_reg); + topMinCost_new_reg = v_min(topMinCost_new_reg,res); + v_store_aligned(topBuf+i, res); + + //update src buffers: + src0_topBuf = src1_topBuf; + src1_topBuf = src2; + } + + // a bit different processing for the last cycle of the loop: + //process leftBuf: + src_shifted_left = v_int16x8(_mm_slli_si128(src1_leftBuf.val, 2)); + src_shifted_right = v_int16x8(_mm_srli_si128(src1_leftBuf.val, 2)); + + src2 = v_setall_s16(SHRT_MAX); + + src_shifted_left = (src_shifted_left | v_int16x8(_mm_srli_si128(src0_leftBuf.val, 14)))+P1_reg; + src_shifted_right = (src_shifted_right | v_int16x8(_mm_slli_si128(src2.val , 14)))+P1_reg; + + res = v_load_aligned(costs+D-8) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_leftBuf,leftMinCostP2_reg))-leftMinCostP2_reg); + leftMinCost = min(v_min(leftMinCost_new_reg,res)); + v_store_aligned(leftBuf+D-8, res); + + //process topBuf: + src_shifted_left = v_int16x8(_mm_slli_si128(src1_topBuf.val, 2)); + src_shifted_right = v_int16x8(_mm_srli_si128(src1_topBuf.val, 2)); + + src2 = v_setall_s16(SHRT_MAX); + + src_shifted_left = (src_shifted_left | v_int16x8(_mm_srli_si128(src0_topBuf.val, 14)))+P1_reg; + src_shifted_right = (src_shifted_right | v_int16x8(_mm_slli_si128(src2.val , 14)))+P1_reg; + + res = v_load_aligned(costs+D-8) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_topBuf,topMinCostP2_reg))-topMinCostP2_reg); + topMinCost = min(v_min(topMinCost_new_reg,res)); + v_store_aligned(topBuf+D-8, res); +#else + CostType leftMinCost_new = SHRT_MAX; + CostType topMinCost_new = SHRT_MAX; + int leftMinCost_P2 = leftMinCost + P2; + int topMinCost_P2 = topMinCost + P2; + CostType leftBuf_prev_i_minus_1 = SHRT_MAX; + CostType topBuf_i_minus_1 = SHRT_MAX; + CostType tmp; + + for(int i=0;i<D-1;i++) + { + leftBuf[i] = cv::saturate_cast<CostType>(costs[i] + std::min(std::min(leftBuf_prev_i_minus_1+P1,leftBuf_prev[i+1]+P1),std::min((int)leftBuf_prev[i],leftMinCost_P2))-leftMinCost_P2); + leftBuf_prev_i_minus_1 = leftBuf_prev[i]; + leftMinCost_new = std::min(leftMinCost_new,leftBuf[i]); + + tmp = topBuf[i]; + topBuf[i] = cv::saturate_cast<CostType>(costs[i] + std::min(std::min(topBuf_i_minus_1+P1,topBuf[i+1]+P1),std::min((int)topBuf[i],topMinCost_P2))-topMinCost_P2); + topBuf_i_minus_1 = tmp; + topMinCost_new = std::min(topMinCost_new,topBuf[i]); + } + + leftBuf[D-1] = cv::saturate_cast<CostType>(costs[D-1] + std::min(leftBuf_prev_i_minus_1+P1,std::min((int)leftBuf_prev[D-1],leftMinCost_P2))-leftMinCost_P2); + leftMinCost = std::min(leftMinCost_new,leftBuf[D-1]); + + topBuf[D-1] = cv::saturate_cast<CostType>(costs[D-1] + std::min(topBuf_i_minus_1+P1,std::min((int)topBuf[D-1],topMinCost_P2))-topMinCost_P2); + topMinCost = std::min(topMinCost_new,topBuf[D-1]); +#endif +} + +// performing in-place SGM cost accumulation from right to left (the result is stored in rightBuf) and +// summing rightBuf, topBuf, leftBuf together (the result is stored in leftBuf), as well as finding the +// optimal disparity value with minimum accumulated cost +inline void accumulateCostsRight(CostType* rightBuf, CostType* topBuf, CostType* leftBuf, CostType* costs, + CostType& rightMinCost, int D, int P1, int P2, int& optimal_disp, CostType& min_cost) +{ +#if CV_SIMD128 && CV_SSE2 + v_int16x8 P1_reg = v_setall_s16(cv::saturate_cast<CostType>(P1)); + + v_int16x8 rightMinCostP2_reg = v_setall_s16(cv::saturate_cast<CostType>(rightMinCost+P2)); + v_int16x8 rightMinCost_new_reg = v_setall_s16(SHRT_MAX); + v_int16x8 src0_rightBuf = v_setall_s16(SHRT_MAX); + v_int16x8 src1_rightBuf = v_load(rightBuf); + + v_int16x8 src2; + v_int16x8 src_shifted_left,src_shifted_right; + v_int16x8 res; + + v_int16x8 min_sum_cost_reg = v_setall_s16(SHRT_MAX); + v_int16x8 min_sum_pos_reg = v_setall_s16(0); + v_int16x8 loop_idx(0,1,2,3,4,5,6,7); + v_int16x8 eight_reg = v_setall_s16(8); + + for(int i=0;i<D-8;i+=8) + { + //lookahead load: + src2 = v_load_aligned(rightBuf+i+8); + + //get shifted versions of the current block: + src_shifted_left = v_int16x8(_mm_slli_si128(src1_rightBuf.val, 2)); + src_shifted_right = v_int16x8(_mm_srli_si128(src1_rightBuf.val, 2)); + + // replace shifted-in zeros with proper values and add P1: + src_shifted_left = (src_shifted_left | v_int16x8(_mm_srli_si128(src0_rightBuf.val, 14)))+P1_reg; + src_shifted_right = (src_shifted_right | v_int16x8(_mm_slli_si128(src2.val , 14)))+P1_reg; + + // process and save current block: + res = v_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_rightBuf,rightMinCostP2_reg))-rightMinCostP2_reg); + rightMinCost_new_reg = v_min(rightMinCost_new_reg,res); + v_store_aligned(rightBuf+i, res); + + // compute and save total cost: + res = res + v_load_aligned(leftBuf+i) + v_load_aligned(topBuf+i); + v_store_aligned(leftBuf+i, res); + + // track disparity value with the minimum cost: + min_sum_cost_reg = v_min(min_sum_cost_reg,res); + min_sum_pos_reg = min_sum_pos_reg + ((min_sum_cost_reg == res) & (loop_idx - min_sum_pos_reg)); + loop_idx = loop_idx+eight_reg; + + //update src: + src0_rightBuf = src1_rightBuf; + src1_rightBuf = src2; + } + + // a bit different processing for the last cycle of the loop: + src_shifted_left = v_int16x8(_mm_slli_si128(src1_rightBuf.val, 2)); + src_shifted_right = v_int16x8(_mm_srli_si128(src1_rightBuf.val, 2)); + + src2 = v_setall_s16(SHRT_MAX); + + src_shifted_left = (src_shifted_left | v_int16x8(_mm_srli_si128(src0_rightBuf.val, 14)))+P1_reg; + src_shifted_right = (src_shifted_right | v_int16x8(_mm_slli_si128(src2.val , 14)))+P1_reg; + + res = v_load_aligned(costs+D-8) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_rightBuf,rightMinCostP2_reg))-rightMinCostP2_reg); + rightMinCost = min(v_min(rightMinCost_new_reg,res)); + v_store_aligned(rightBuf+D-8, res); + + res = res + v_load_aligned(leftBuf+D-8) + v_load_aligned(topBuf+D-8); + v_store_aligned(leftBuf+D-8, res); + + min_sum_cost_reg = v_min(min_sum_cost_reg,res); + min_cost = min(min_sum_cost_reg); + min_sum_pos_reg = min_sum_pos_reg + ((min_sum_cost_reg == res) & (loop_idx - min_sum_pos_reg)); + optimal_disp = min_pos(min_sum_cost_reg,min_sum_pos_reg); +#else + CostType rightMinCost_new = SHRT_MAX; + int rightMinCost_P2 = rightMinCost + P2; + CostType rightBuf_i_minus_1 = SHRT_MAX; + CostType tmp; + min_cost = SHRT_MAX; + + for(int i=0;i<D-1;i++) + { + tmp = rightBuf[i]; + rightBuf[i] = cv::saturate_cast<CostType>(costs[i] + std::min(std::min(rightBuf_i_minus_1+P1,rightBuf[i+1]+P1),std::min((int)rightBuf[i],rightMinCost_P2))-rightMinCost_P2); + rightBuf_i_minus_1 = tmp; + rightMinCost_new = std::min(rightMinCost_new,rightBuf[i]); + leftBuf[i] = cv::saturate_cast<CostType>((int)leftBuf[i]+rightBuf[i]+topBuf[i]); + if(leftBuf[i]<min_cost) + { + optimal_disp = i; + min_cost = leftBuf[i]; + } + } + + rightBuf[D-1] = cv::saturate_cast<CostType>(costs[D-1] + std::min(rightBuf_i_minus_1+P1,std::min((int)rightBuf[D-1],rightMinCost_P2))-rightMinCost_P2); + rightMinCost = std::min(rightMinCost_new,rightBuf[D-1]); + leftBuf[D-1] = cv::saturate_cast<CostType>((int)leftBuf[D-1]+rightBuf[D-1]+topBuf[D-1]); + if(leftBuf[D-1]<min_cost) + { + optimal_disp = D-1; + min_cost = leftBuf[D-1]; + } +#endif +} + +void SGBM3WayMainLoop::operator () (const Range& range) const +{ + // force separate processing of stripes: + if(range.end>range.start+1) + { + for(int n=range.start;n<range.end;n++) + (*this)(Range(n,n+1)); + return; + } + + const int DISP_SCALE = (1 << StereoMatcher::DISP_SHIFT); + int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE; + + // setting up the ranges: + int src_start_idx = std::max(std::min(range.start * stripe_sz - stripe_overlap, height),0); + int src_end_idx = std::min(range.end * stripe_sz, height); + + int dst_offset; + if(range.start==0) + dst_offset=stripe_overlap; + else + dst_offset=0; + + Mat cur_buffer = buffers [range.start]; + Mat cur_disp = dst_disp[range.start]; + cur_disp = Scalar(INVALID_DISP_SCALED); + + // prepare buffers: + CostType *curCostVolumeLine, *hsumBuf, *pixDiff; + PixType* tmpBuf; + CostType *horPassCostVolume, *vertPassCostVolume, *vertPassMin, *rightPassBuf, *disp2CostBuf; + short* disp2Buf; + getBufferPointers(cur_buffer,width,width1,D,img1->channels(),SH2,P2, + curCostVolumeLine,hsumBuf,pixDiff,tmpBuf,horPassCostVolume, + vertPassCostVolume,vertPassMin,rightPassBuf,disp2CostBuf,disp2Buf); + + // start real processing: + for(int y=src_start_idx;y<src_end_idx;y++) + { + getRawMatchingCost(curCostVolumeLine,hsumBuf,pixDiff,tmpBuf,y,src_start_idx); + + short* disp_row = (short*)cur_disp.ptr(dst_offset+(y-src_start_idx)); + + // initialize the auxiliary buffers for the pseudo left-right consistency check: + for(int x=0;x<width;x++) + { + disp2Buf[x] = (short)INVALID_DISP_SCALED; + disp2CostBuf[x] = SHRT_MAX; + } + CostType* C = curCostVolumeLine - D; + CostType prev_min, min_cost; + int d, best_d; + d = best_d = 0; + + // forward pass + prev_min=0; + for (int x=D;x<(1+width1)*D;x+=D) + accumulateCostsLeftTop(horPassCostVolume+x,horPassCostVolume+x-D,vertPassCostVolume+x,C+x,prev_min,vertPassMin[x/D],D,P1,P2); + + //backward pass + memset(rightPassBuf,0,D*sizeof(CostType)); + prev_min=0; + for (int x=width1*D;x>=D;x-=D) + { + accumulateCostsRight(rightPassBuf,vertPassCostVolume+x,horPassCostVolume+x,C+x,prev_min,D,P1,P2,best_d,min_cost); + + if(uniquenessRatio>0) + { +#if CV_SIMD128 + horPassCostVolume+=x; + int thresh = (100*min_cost)/(100-uniquenessRatio); + v_int16x8 thresh_reg = v_setall_s16((short)(thresh+1)); + v_int16x8 d1 = v_setall_s16((short)(best_d-1)); + v_int16x8 d2 = v_setall_s16((short)(best_d+1)); + v_int16x8 eight_reg = v_setall_s16(8); + v_int16x8 cur_d(0,1,2,3,4,5,6,7); + v_int16x8 mask,cost1,cost2; + + for( d = 0; d < D; d+=16 ) + { + cost1 = v_load_aligned(horPassCostVolume+d); + cost2 = v_load_aligned(horPassCostVolume+d+8); + + mask = cost1 < thresh_reg; + mask = mask & ( (cur_d<d1) | (cur_d>d2) ); + if( v_signmask(mask) ) + break; + + cur_d = cur_d+eight_reg; + + mask = cost2 < thresh_reg; + mask = mask & ( (cur_d<d1) | (cur_d>d2) ); + if( v_signmask(mask) ) + break; + + cur_d = cur_d+eight_reg; + } + horPassCostVolume-=x; +#else + for( d = 0; d < D; d++ ) + { + if( horPassCostVolume[x+d]*(100 - uniquenessRatio) < min_cost*100 && std::abs(d - best_d) > 1 ) + break; + } +#endif + if( d < D ) + continue; + } + d = best_d; + + int _x2 = x/D - 1 + minX1 - d - minD; + if( _x2>=0 && _x2<width && disp2CostBuf[_x2] > min_cost ) + { + disp2CostBuf[_x2] = min_cost; + disp2Buf[_x2] = (short)(d + minD); + } + + if( 0 < d && d < D-1 ) + { + // do subpixel quadratic interpolation: + // fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1]) + // then find minimum of the parabola. + int denom2 = std::max(horPassCostVolume[x+d-1] + horPassCostVolume[x+d+1] - 2*horPassCostVolume[x+d], 1); + d = d*DISP_SCALE + ((horPassCostVolume[x+d-1] - horPassCostVolume[x+d+1])*DISP_SCALE + denom2)/(denom2*2); + } + else + d *= DISP_SCALE; + + disp_row[(x/D)-1 + minX1] = (DispType)(d + minD*DISP_SCALE); + } + + for(int x = minX1; x < maxX1; x++ ) + { + // pseudo LRC consistency check using only one disparity map; + // pixels with difference more than disp12MaxDiff are invalidated + int d1 = disp_row[x]; + if( d1 == INVALID_DISP_SCALED ) + continue; + int _d = d1 >> StereoMatcher::DISP_SHIFT; + int d_ = (d1 + DISP_SCALE-1) >> StereoMatcher::DISP_SHIFT; + int _x = x - _d, x_ = x - d_; + if( 0 <= _x && _x < width && disp2Buf[_x] >= minD && std::abs(disp2Buf[_x] - _d) > disp12MaxDiff && + 0 <= x_ && x_ < width && disp2Buf[x_] >= minD && std::abs(disp2Buf[x_] - d_) > disp12MaxDiff ) + disp_row[x] = (short)INVALID_DISP_SCALED; + } + } +} + +static void computeDisparity3WaySGBM( const Mat& img1, const Mat& img2, + Mat& disp1, const StereoSGBMParams& params, + Mat* buffers, int nstripes ) +{ + // precompute a lookup table for the raw matching cost computation: + const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2; + PixType* clipTab = new PixType[TAB_SIZE]; + int ftzero = std::max(params.preFilterCap, 15) | 1; + for(int k = 0; k < TAB_SIZE; k++ ) + clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero); + + // allocate separate dst_disp arrays to avoid conflicts due to stripe overlap: + int stripe_sz = (int)ceil(img1.rows/(double)nstripes); + int stripe_overlap = (params.SADWindowSize/2+1) + (int)ceil(0.1*stripe_sz); + Mat* dst_disp = new Mat[nstripes]; + for(int i=0;i<nstripes;i++) + dst_disp[i].create(stripe_sz+stripe_overlap,img1.cols,CV_16S); + + parallel_for_(Range(0,nstripes),SGBM3WayMainLoop(buffers,img1,img2,dst_disp,params,clipTab,nstripes,stripe_overlap)); + + //assemble disp1 from dst_disp: + short* dst_row; + short* src_row; + for(int i=0;i<disp1.rows;i++) + { + dst_row = (short*)disp1.ptr(i); + src_row = (short*)dst_disp[i/stripe_sz].ptr(stripe_overlap+i%stripe_sz); + memcpy(dst_row,src_row,disp1.cols*sizeof(short)); + } + + delete[] clipTab; + delete[] dst_disp; +} + class StereoSGBMImpl : public StereoSGBM { public: @@ -857,7 +1489,11 @@ public: disparr.create( left.size(), CV_16S ); Mat disp = disparr.getMat(); - computeDisparitySGBM( left, right, disp, params, buffer ); + if(params.mode==MODE_SGBM_3WAY) + computeDisparity3WaySGBM( left, right, disp, params, buffers, num_stripes ); + else + computeDisparitySGBM( left, right, disp, params, buffer ); + medianBlur(disp, disp, 3); if( params.speckleWindowSize > 0 ) @@ -933,6 +1569,12 @@ public: StereoSGBMParams params; Mat buffer; + + // the number of stripes is fixed, disregarding the number of threads/processors + // to make the results fully reproducible: + static const int num_stripes = 4; + Mat buffers[num_stripes]; + static const char* name_; }; diff --git a/modules/calib3d/test/test_stereomatching.cpp b/modules/calib3d/test/test_stereomatching.cpp index 41290a1c3c..0aee42acee 100644 --- a/modules/calib3d/test/test_stereomatching.cpp +++ b/modules/calib3d/test/test_stereomatching.cpp @@ -742,7 +742,7 @@ protected: { int ndisp; int winSize; - bool fullDP; + int mode; }; vector<RunParams> caseRunParams; @@ -757,7 +757,7 @@ protected: RunParams params; String ndisp = fn[i+2]; params.ndisp = atoi(ndisp.c_str()); String winSize = fn[i+3]; params.winSize = atoi(winSize.c_str()); - String fullDP = fn[i+4]; params.fullDP = atoi(fullDP.c_str()) == 0 ? false : true; + String mode = fn[i+4]; params.mode = atoi(mode.c_str()); caseNames.push_back( caseName ); caseDatasets.push_back( datasetName ); caseRunParams.push_back( params ); @@ -773,8 +773,7 @@ protected: Ptr<StereoSGBM> sgbm = StereoSGBM::create( 0, params.ndisp, params.winSize, 10*params.winSize*params.winSize, 40*params.winSize*params.winSize, - 1, 63, 10, 100, 32, params.fullDP ? - StereoSGBM::MODE_HH : StereoSGBM::MODE_SGBM ); + 1, 63, 10, 100, 32, params.mode ); sgbm->compute( leftImg, rightImg, leftDisp ); CV_Assert( leftDisp.type() == CV_16SC1 ); leftDisp/=16; diff --git a/samples/cpp/stereo_match.cpp b/samples/cpp/stereo_match.cpp index fa5ee7bad1..3a2943afe0 100644 --- a/samples/cpp/stereo_match.cpp +++ b/samples/cpp/stereo_match.cpp @@ -20,7 +20,7 @@ using namespace cv; static void print_help() { printf("\nDemo stereo matching converting L and R images into disparity and point clouds\n"); - printf("\nUsage: stereo_match <left_image> <right_image> [--algorithm=bm|sgbm|hh] [--blocksize=<block_size>]\n" + printf("\nUsage: stereo_match <left_image> <right_image> [--algorithm=bm|sgbm|hh|sgbm3way] [--blocksize=<block_size>]\n" "[--max-disparity=<max_disparity>] [--scale=scale_factor>] [-i <intrinsic_filename>] [-e <extrinsic_filename>]\n" "[--no-display] [-o <disparity_image>] [-p <point_cloud_file>]\n"); } @@ -61,7 +61,7 @@ int main(int argc, char** argv) const char* disparity_filename = 0; const char* point_cloud_filename = 0; - enum { STEREO_BM=0, STEREO_SGBM=1, STEREO_HH=2, STEREO_VAR=3 }; + enum { STEREO_BM=0, STEREO_SGBM=1, STEREO_HH=2, STEREO_VAR=3, STEREO_3WAY=4 }; int alg = STEREO_SGBM; int SADWindowSize = 0, numberOfDisparities = 0; bool no_display = false; @@ -85,7 +85,8 @@ int main(int argc, char** argv) alg = strcmp(_alg, "bm") == 0 ? STEREO_BM : strcmp(_alg, "sgbm") == 0 ? STEREO_SGBM : strcmp(_alg, "hh") == 0 ? STEREO_HH : - strcmp(_alg, "var") == 0 ? STEREO_VAR : -1; + strcmp(_alg, "var") == 0 ? STEREO_VAR : + strcmp(_alg, "sgbm3way") == 0 ? STEREO_3WAY : -1; if( alg < 0 ) { printf("Command-line parameter error: Unknown stereo algorithm\n\n"); @@ -257,7 +258,12 @@ int main(int argc, char** argv) sgbm->setSpeckleWindowSize(100); sgbm->setSpeckleRange(32); sgbm->setDisp12MaxDiff(1); - sgbm->setMode(alg == STEREO_HH ? StereoSGBM::MODE_HH : StereoSGBM::MODE_SGBM); + if(alg==STEREO_HH) + sgbm->setMode(StereoSGBM::MODE_HH); + else if(alg==STEREO_SGBM) + sgbm->setMode(StereoSGBM::MODE_SGBM); + else if(alg==STEREO_3WAY) + sgbm->setMode(StereoSGBM::MODE_SGBM_3WAY); Mat disp, disp8; //Mat img1p, img2p, dispp; @@ -267,7 +273,7 @@ int main(int argc, char** argv) int64 t = getTickCount(); if( alg == STEREO_BM ) bm->compute(img1, img2, disp); - else if( alg == STEREO_SGBM || alg == STEREO_HH ) + else if( alg == STEREO_SGBM || alg == STEREO_HH || alg == STEREO_3WAY ) sgbm->compute(img1, img2, disp); t = getTickCount() - t; printf("Time elapsed: %fms\n", t*1000/getTickFrequency());