@@ -52,6 +52,7 @@
#include "precomp.hpp"
#include <limits.h>
#include "opencv2/hal/intrin.hpp"

namespace cv
{
@@ -180,10 +181,6 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
    buffer -= minX2;
    cost -= minX1*D + minD; // simplify the cost indices inside the loop
#if 1
    for( c = 0; c < cn*2; c++, prow1 += width, prow2 += width )
    {
@@ -211,43 +208,39 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
            int u0 = std::min(ul, ur); u0 = std::min(u0, u);
            int u1 = std::max(ul, ur); u1 = std::max(u1, u);
#if CV_SIMD128
            v_uint8x16 _u  = v_setall_u8((uchar)u), _u0 = v_setall_u8((uchar)u0);
            v_uint8x16 _u1 = v_setall_u8((uchar)u1);
            for( int d = minD; d < maxD; d += 16 )
            {
                v_uint8x16 _v  = v_load(prow2  + width-x-1 + d);
                v_uint8x16 _v0 = v_load(buffer + width-x-1 + d);
                v_uint8x16 _v1 = v_load(buffer + width-x-1 + d + width2);
                v_uint8x16 c0 = v_max(_u - _v1, _v0 - _u);
                v_uint8x16 c1 = v_max(_v - _u1, _u0 - _v);
                v_uint8x16 diff = v_min(c0, c1);

                v_int16x8 _c0 = v_load_aligned(cost + x*D + d);
                v_int16x8 _c1 = v_load_aligned(cost + x*D + d + 8);

                v_uint16x8 diff1, diff2;
                v_expand(diff, diff1, diff2);
                v_store_aligned(cost + x*D + d,     _c0 + v_reinterpret_as_s16(diff1 >> diff_scale));
                v_store_aligned(cost + x*D + d + 8, _c1 + v_reinterpret_as_s16(diff2 >> diff_scale));
            }
#else
            for( int d = minD; d < maxD; d++ )
            {
                int v = prow2[width-x-1 + d];
                int v0 = buffer[width-x-1 + d];
                int v1 = buffer[width-x-1 + d + width2];
                int c0 = std::max(0, u - v1); c0 = std::max(c0, v0 - u);
                int c1 = std::max(0, v - u1); c1 = std::max(c1, u0 - v);

                cost[x*D + d] = (CostType)(cost[x*D + d] + (std::min(c0, c1) >> diff_scale));
            }
#endif
        }
    }
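    // Note (added summary, not an original comment): both branches above compute the same
    // Birchfield-Tomasi sampling-insensitive cost. u0/u1 bracket the left-image value between its
    // half-pixel neighbours, v0/v1 (precomputed in "buffer") bracket the right-image value, and
    // min(c0, c1) >> diff_scale is accumulated into the cost volume for every disparity d.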
#else
@@ -829,6 +822,645 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
}
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
void getBufferPointers(Mat& buffer, int width, int width1, int D, int num_ch, int SH2, int P2,
                       CostType*& curCostVolumeLine, CostType*& hsumBuf, CostType*& pixDiff,
                       PixType*& tmpBuf, CostType*& horPassCostVolume,
                       CostType*& vertPassCostVolume, CostType*& vertPassMin, CostType*& rightPassBuf,
                       CostType*& disp2CostBuf, short*& disp2Buf);
struct SGBM3WayMainLoop : public ParallelLoopBody
{
    Mat* buffers;
    const Mat *img1, *img2;
    Mat* dst_disp;

    int nstripes, stripe_sz;
    int stripe_overlap;

    int width, height;
    int minD, maxD, D;
    int minX1, maxX1, width1;

    int SW2, SH2;
    int P1, P2;
    int uniquenessRatio, disp12MaxDiff;

    int costBufSize, hsumBufNRows;
    int TAB_OFS, ftzero;
    PixType* clipTab;

    SGBM3WayMainLoop(Mat* _buffers, const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, PixType* _clipTab, int _nstripes, int _stripe_overlap);
    void getRawMatchingCost(CostType* C, CostType* hsumBuf, CostType* pixDiff, PixType* tmpBuf, int y, int src_start_idx) const;
    void operator () (const Range& range) const;
};
SGBM3WayMainLoop::SGBM3WayMainLoop(Mat* _buffers, const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, PixType* _clipTab, int _nstripes, int _stripe_overlap):
buffers(_buffers), img1(&_img1), img2(&_img2), dst_disp(_dst_disp), clipTab(_clipTab)
{
    nstripes = _nstripes;
    stripe_overlap = _stripe_overlap;
    stripe_sz = (int)ceil(img1->rows/(double)nstripes);

    width = img1->cols; height = img1->rows;
    minD = params.minDisparity; maxD = minD + params.numDisparities; D = maxD - minD;
    minX1 = std::max(-maxD, 0); maxX1 = width + std::min(minD, 0); width1 = maxX1 - minX1;
    CV_Assert( D % 16 == 0 );

    SW2 = SH2 = params.SADWindowSize > 0 ? params.SADWindowSize/2 : 1;

    P1 = params.P1 > 0 ? params.P1 : 2; P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1 + 1);
    uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
    disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;

    costBufSize = width1*D;
    hsumBufNRows = SH2*2 + 2;
    TAB_OFS = 256*4;
    ftzero = std::max(params.preFilterCap, 15) | 1;
}
void getBufferPointers(Mat& buffer, int width, int width1, int D, int num_ch, int SH2, int P2,
                       CostType*& curCostVolumeLine, CostType*& hsumBuf, CostType*& pixDiff,
                       PixType*& tmpBuf, CostType*& horPassCostVolume,
                       CostType*& vertPassCostVolume, CostType*& vertPassMin, CostType*& rightPassBuf,
                       CostType*& disp2CostBuf, short*& disp2Buf)
{
    // allocating all the required memory:
    int costVolumeLineSize = width1*D;
    int width1_ext = width1 + 2;
    int costVolumeLineSize_ext = width1_ext*D;
    int hsumBufNRows = SH2*2 + 2;

    // main buffer to store matching costs for the current line:
    int curCostVolumeLineSize = costVolumeLineSize*sizeof(CostType);

    // auxiliary buffers for the raw matching cost computation:
    int hsumBufSize = costVolumeLineSize*hsumBufNRows*sizeof(CostType);
    int pixDiffSize = costVolumeLineSize*sizeof(CostType);
    int tmpBufSize  = width*16*num_ch*sizeof(PixType);

    // auxiliary buffers for the matching cost aggregation:
    int horPassCostVolumeSize  = costVolumeLineSize_ext*sizeof(CostType); // buffer for the 2-pass horizontal cost aggregation
    int vertPassCostVolumeSize = costVolumeLineSize_ext*sizeof(CostType); // buffer for the vertical cost aggregation
    int vertPassMinSize        = width1_ext*sizeof(CostType);             // buffer for storing minimum costs from the previous line
    int rightPassBufSize       = D*sizeof(CostType);                      // additional small buffer for the right-to-left pass

    // buffers for the pseudo-LRC check:
    int disp2CostBufSize = width*sizeof(CostType);
    int disp2BufSize     = width*sizeof(short);

    // sum up the sizes of all the buffers:
    size_t totalBufSize = curCostVolumeLineSize +
                          hsumBufSize +
                          pixDiffSize +
                          tmpBufSize +
                          horPassCostVolumeSize +
                          vertPassCostVolumeSize +
                          vertPassMinSize +
                          rightPassBufSize +
                          disp2CostBufSize +
                          disp2BufSize +
                          16; // to compensate for the alignPtr shifts

    if( buffer.empty() || !buffer.isContinuous() || buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
        buffer.create(1, (int)totalBufSize, CV_8U);

    // set up all the pointers:
    curCostVolumeLine  = (CostType*)alignPtr(buffer.ptr(), 16);
    hsumBuf            = curCostVolumeLine + costVolumeLineSize;
    pixDiff            = hsumBuf + costVolumeLineSize*hsumBufNRows;
    tmpBuf             = (PixType*)(pixDiff + costVolumeLineSize);
    horPassCostVolume  = (CostType*)(tmpBuf + width*16*num_ch);
    vertPassCostVolume = horPassCostVolume + costVolumeLineSize_ext;
    rightPassBuf       = vertPassCostVolume + costVolumeLineSize_ext;
    vertPassMin        = rightPassBuf + D;
    disp2CostBuf       = vertPassMin + width1_ext;
    disp2Buf           = disp2CostBuf + width;

    // initialize memory:
    memset(buffer.ptr(), 0, totalBufSize);
    for( int i = 0; i < costVolumeLineSize; i++ )
        curCostVolumeLine[i] = (CostType)P2; // such initialization simplifies the cost aggregation loops a bit
}
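// For reference, a sketch of the resulting layout (my reading of the pointer arithmetic above,
// not part of the original comments): a single contiguous CV_8U buffer holds, in order,
// curCostVolumeLine, hsumBuf, pixDiff, tmpBuf, horPassCostVolume, vertPassCostVolume,
// rightPassBuf, vertPassMin, disp2CostBuf and disp2Buf, with the base pointer 16-byte aligned
// via alignPtr so that the aligned SIMD loads below are valid (D is asserted to be a multiple of 16).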
// performing block matching and building raw cost-volume for the current row
void SGBM3WayMainLoop::getRawMatchingCost(CostType* C, // target cost-volume row
                                          CostType* hsumBuf, CostType* pixDiff, PixType* tmpBuf, // buffers
                                          int y, int src_start_idx) const
{
    int x, d;
    int dy1 = (y == src_start_idx) ? src_start_idx : y + SH2, dy2 = (y == src_start_idx) ? src_start_idx + SH2 : dy1;

    for( int k = dy1; k <= dy2; k++ )
    {
        CostType* hsumAdd = hsumBuf + (std::min(k, height - 1) % hsumBufNRows)*costBufSize;
        if( k < height )
        {
            calcPixelCostBT( *img1, *img2, k, minD, maxD, pixDiff, tmpBuf, clipTab, TAB_OFS, ftzero );

            memset(hsumAdd, 0, D*sizeof(CostType));
            for( x = 0; x <= SW2*D; x += D )
            {
                int scale = x == 0 ? SW2 + 1 : 1;

                for( d = 0; d < D; d++ )
                    hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]*scale);
            }

            if( y > src_start_idx )
            {
                const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, src_start_idx) % hsumBufNRows)*costBufSize;

                for( x = D; x < width1*D; x += D )
                {
                    const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1 - 1)*D);
                    const CostType* pixSub = pixDiff + std::max(x - (SW2 + 1)*D, 0);

#if CV_SIMD128
                    v_int16x8 hv_reg;
                    for( d = 0; d < D; d += 8 )
                    {
                        hv_reg = v_load_aligned(hsumAdd + x - D + d) + (v_load_aligned(pixAdd + d) - v_load_aligned(pixSub + d));
                        v_store_aligned(hsumAdd + x + d, hv_reg);
                        v_store_aligned(C + x + d, v_load_aligned(C + x + d) + (hv_reg - v_load_aligned(hsumSub + x + d)));
                    }
#else
                    for( d = 0; d < D; d++ )
                    {
                        int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
                        C[x + d] = (CostType)(C[x + d] + hv - hsumSub[x + d]);
                    }
#endif
                }
            }
            else
            {
                for( x = D; x < width1*D; x += D )
                {
                    const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1 - 1)*D);
                    const CostType* pixSub = pixDiff + std::max(x - (SW2 + 1)*D, 0);

                    for( d = 0; d < D; d++ )
                        hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
                }
            }
        }

        if( y == src_start_idx )
        {
            int scale = k == src_start_idx ? SH2 + 1 : 1;
            for( x = 0; x < width1*D; x++ )
                C[x] = (CostType)(C[x] + hsumAdd[x]*scale);
        }
    }
}
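// A note on the scheme above (summary of the code, not an original comment): pixDiff holds the
// per-pixel BT costs of row k, hsumBuf caches horizontal sums over the (2*SW2+1)-wide window for
// the last hsumBufNRows rows, and C is updated as a running vertical sum over the (2*SH2+1)-tall
// window by adding the freshly computed hsumAdd row and subtracting the hsumSub row that just left
// the window, so each cost-volume row costs O(width1*D) instead of O(width1*D*windowArea).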
#if CV_SIMD128 && CV_SSE2
// define some additional reduce operations:
inline short min(const v_int16x8& a)
{
    short CV_DECL_ALIGNED(16) buf[8];
    v_store_aligned(buf, a);
    short s0 = std::min(buf[0], buf[1]);
    short s1 = std::min(buf[2], buf[3]);
    short s2 = std::min(buf[4], buf[5]);
    short s3 = std::min(buf[6], buf[7]);
    return std::min(std::min(s0, s1), std::min(s2, s3));
}

inline short min_pos(const v_int16x8& val, const v_int16x8& pos)
{
    short CV_DECL_ALIGNED(16) val_buf[8];
    v_store_aligned(val_buf, val);
    short CV_DECL_ALIGNED(16) pos_buf[8];
    v_store_aligned(pos_buf, pos);
    short res_pos = 0;
    short min_val = SHRT_MAX;
    if( val_buf[0] < min_val ) { min_val = val_buf[0]; res_pos = pos_buf[0]; }
    if( val_buf[1] < min_val ) { min_val = val_buf[1]; res_pos = pos_buf[1]; }
    if( val_buf[2] < min_val ) { min_val = val_buf[2]; res_pos = pos_buf[2]; }
    if( val_buf[3] < min_val ) { min_val = val_buf[3]; res_pos = pos_buf[3]; }
    if( val_buf[4] < min_val ) { min_val = val_buf[4]; res_pos = pos_buf[4]; }
    if( val_buf[5] < min_val ) { min_val = val_buf[5]; res_pos = pos_buf[5]; }
    if( val_buf[6] < min_val ) { min_val = val_buf[6]; res_pos = pos_buf[6]; }
    if( val_buf[7] < min_val ) { min_val = val_buf[7]; res_pos = pos_buf[7]; }
    return res_pos;
}
#endif
// performing SGM cost accumulation from left to right (result is stored in leftBuf) and
// in-place cost accumulation from top to bottom (result is stored in topBuf)
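// For reference (added note, standard SGM recursion): along a scan direction r,
//   L_r(p,d) = C(p,d) + min( L_r(p-r,d), L_r(p-r,d-1)+P1, L_r(p-r,d+1)+P1, min_k L_r(p-r,k)+P2 ) - min_k L_r(p-r,k).
// The code below follows this recursion (it subtracts min_k L_r(p-r,k)+P2 instead of min_k L_r(p-r,k),
// which only shifts every entry by the same constant) and evaluates it simultaneously for the
// left-to-right pass (leftBuf, with leftBuf_prev holding L_r(p-r,*)) and the in-place top-to-bottom
// pass (topBuf).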
inline void accumulateCostsLeftTop(CostType* leftBuf, CostType* leftBuf_prev, CostType* topBuf, CostType* costs,
                                   CostType& leftMinCost, CostType& topMinCost, int D, int P1, int P2)
{
#if CV_SIMD128 && CV_SSE2
    v_int16x8 P1_reg = v_setall_s16(cv::saturate_cast<CostType>(P1));

    v_int16x8 leftMinCostP2_reg   = v_setall_s16(cv::saturate_cast<CostType>(leftMinCost + P2));
    v_int16x8 leftMinCost_new_reg = v_setall_s16(SHRT_MAX);
    v_int16x8 src0_leftBuf        = v_setall_s16(SHRT_MAX);
    v_int16x8 src1_leftBuf        = v_load_aligned(leftBuf_prev);

    v_int16x8 topMinCostP2_reg   = v_setall_s16(cv::saturate_cast<CostType>(topMinCost + P2));
    v_int16x8 topMinCost_new_reg = v_setall_s16(SHRT_MAX);
    v_int16x8 src0_topBuf        = v_setall_s16(SHRT_MAX);
    v_int16x8 src1_topBuf        = v_load_aligned(topBuf);

    v_int16x8 src2;
    v_int16x8 src_shifted_left, src_shifted_right;
    v_int16x8 res;

    for( int i = 0; i < D - 8; i += 8 )
    {
        //process leftBuf:
        //lookahead load:
        src2 = v_load_aligned(leftBuf_prev + i + 8);

        //get shifted versions of the current block:
        src_shifted_left  = v_int16x8(_mm_slli_si128(src1_leftBuf.val, 2));
        src_shifted_right = v_int16x8(_mm_srli_si128(src1_leftBuf.val, 2));

        // replace shifted-in zeros with proper values and add P1:
        src_shifted_left  = (src_shifted_left  | v_int16x8(_mm_srli_si128(src0_leftBuf.val, 14))) + P1_reg;
        src_shifted_right = (src_shifted_right | v_int16x8(_mm_slli_si128(src2.val, 14)))         + P1_reg;

        // process and save current block:
        res = v_load_aligned(costs + i) + (v_min(v_min(src_shifted_left, src_shifted_right), v_min(src1_leftBuf, leftMinCostP2_reg)) - leftMinCostP2_reg);
        leftMinCost_new_reg = v_min(leftMinCost_new_reg, res);
        v_store_aligned(leftBuf + i, res);

        //update src buffers:
        src0_leftBuf = src1_leftBuf;
        src1_leftBuf = src2;

        //process topBuf:
        //lookahead load:
        src2 = v_load_aligned(topBuf + i + 8);

        //get shifted versions of the current block:
        src_shifted_left  = v_int16x8(_mm_slli_si128(src1_topBuf.val, 2));
        src_shifted_right = v_int16x8(_mm_srli_si128(src1_topBuf.val, 2));

        // replace shifted-in zeros with proper values and add P1:
        src_shifted_left  = (src_shifted_left  | v_int16x8(_mm_srli_si128(src0_topBuf.val, 14))) + P1_reg;
        src_shifted_right = (src_shifted_right | v_int16x8(_mm_slli_si128(src2.val, 14)))        + P1_reg;

        // process and save current block:
        res = v_load_aligned(costs + i) + (v_min(v_min(src_shifted_left, src_shifted_right), v_min(src1_topBuf, topMinCostP2_reg)) - topMinCostP2_reg);
        topMinCost_new_reg = v_min(topMinCost_new_reg, res);
        v_store_aligned(topBuf + i, res);

        //update src buffers:
        src0_topBuf = src1_topBuf;
        src1_topBuf = src2;
    }

    // a bit different processing for the last cycle of the loop:
    //process leftBuf:
    src_shifted_left  = v_int16x8(_mm_slli_si128(src1_leftBuf.val, 2));
    src_shifted_right = v_int16x8(_mm_srli_si128(src1_leftBuf.val, 2));

    src2 = v_setall_s16(SHRT_MAX);
    src_shifted_left  = (src_shifted_left  | v_int16x8(_mm_srli_si128(src0_leftBuf.val, 14))) + P1_reg;
    src_shifted_right = (src_shifted_right | v_int16x8(_mm_slli_si128(src2.val, 14)))         + P1_reg;

    res = v_load_aligned(costs + D - 8) + (v_min(v_min(src_shifted_left, src_shifted_right), v_min(src1_leftBuf, leftMinCostP2_reg)) - leftMinCostP2_reg);
    leftMinCost = min(v_min(leftMinCost_new_reg, res));
    v_store_aligned(leftBuf + D - 8, res);

    //process topBuf:
    src_shifted_left  = v_int16x8(_mm_slli_si128(src1_topBuf.val, 2));
    src_shifted_right = v_int16x8(_mm_srli_si128(src1_topBuf.val, 2));

    src2 = v_setall_s16(SHRT_MAX);
    src_shifted_left  = (src_shifted_left  | v_int16x8(_mm_srli_si128(src0_topBuf.val, 14))) + P1_reg;
    src_shifted_right = (src_shifted_right | v_int16x8(_mm_slli_si128(src2.val, 14)))        + P1_reg;

    res = v_load_aligned(costs + D - 8) + (v_min(v_min(src_shifted_left, src_shifted_right), v_min(src1_topBuf, topMinCostP2_reg)) - topMinCostP2_reg);
    topMinCost = min(v_min(topMinCost_new_reg, res));
    v_store_aligned(topBuf + D - 8, res);
#else
    CostType leftMinCost_new = SHRT_MAX;
    CostType topMinCost_new  = SHRT_MAX;
    int leftMinCost_P2 = leftMinCost + P2;
    int topMinCost_P2  = topMinCost  + P2;
    CostType leftBuf_prev_i_minus_1 = SHRT_MAX;
    CostType topBuf_i_minus_1       = SHRT_MAX;
    CostType tmp;
    for( int i = 0; i < D - 1; i++ )
    {
        leftBuf[i] = cv::saturate_cast<CostType>(costs[i] + std::min(std::min(leftBuf_prev_i_minus_1 + P1, leftBuf_prev[i+1] + P1), std::min((int)leftBuf_prev[i], leftMinCost_P2)) - leftMinCost_P2);
        leftBuf_prev_i_minus_1 = leftBuf_prev[i];
        leftMinCost_new = std::min(leftMinCost_new, leftBuf[i]);

        tmp = topBuf[i];
        topBuf[i] = cv::saturate_cast<CostType>(costs[i] + std::min(std::min(topBuf_i_minus_1 + P1, topBuf[i+1] + P1), std::min((int)topBuf[i], topMinCost_P2)) - topMinCost_P2);
        topBuf_i_minus_1 = tmp;
        topMinCost_new = std::min(topMinCost_new, topBuf[i]);
    }

    leftBuf[D-1] = cv::saturate_cast<CostType>(costs[D-1] + std::min(leftBuf_prev_i_minus_1 + P1, std::min((int)leftBuf_prev[D-1], leftMinCost_P2)) - leftMinCost_P2);
    leftMinCost  = std::min(leftMinCost_new, leftBuf[D-1]);

    topBuf[D-1] = cv::saturate_cast<CostType>(costs[D-1] + std::min(topBuf_i_minus_1 + P1, std::min((int)topBuf[D-1], topMinCost_P2)) - topMinCost_P2);
    topMinCost  = std::min(topMinCost_new, topBuf[D-1]);
#endif
}
// performing in-place SGM cost accumulation from right to left (the result is stored in rightBuf) and
// summing rightBuf, topBuf, leftBuf together (the result is stored in leftBuf), as well as finding the
// optimal disparity value with minimum accumulated cost
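// For reference (added note): after this pass leftBuf holds the total aggregated cost
// S(p,d) = L_left(p,d) + L_top(p,d) + L_right(p,d) over the three scanline directions used by
// MODE_SGBM_3WAY, and optimal_disp/min_cost return the winner-take-all disparity arg min_d S(p,d)
// and its cost for the current pixel.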
inline void accumulateCostsRight(CostType* rightBuf, CostType* topBuf, CostType* leftBuf, CostType* costs,
                                 CostType& rightMinCost, int D, int P1, int P2, int& optimal_disp, CostType& min_cost)
{
#if CV_SIMD128 && CV_SSE2
    v_int16x8 P1_reg = v_setall_s16(cv::saturate_cast<CostType>(P1));

    v_int16x8 rightMinCostP2_reg   = v_setall_s16(cv::saturate_cast<CostType>(rightMinCost + P2));
    v_int16x8 rightMinCost_new_reg = v_setall_s16(SHRT_MAX);
    v_int16x8 src0_rightBuf        = v_setall_s16(SHRT_MAX);
    v_int16x8 src1_rightBuf        = v_load(rightBuf);

    v_int16x8 src2;
    v_int16x8 src_shifted_left, src_shifted_right;
    v_int16x8 res;

    v_int16x8 min_sum_cost_reg = v_setall_s16(SHRT_MAX);
    v_int16x8 min_sum_pos_reg  = v_setall_s16(0);
    v_int16x8 loop_idx(0, 1, 2, 3, 4, 5, 6, 7);
    v_int16x8 eight_reg = v_setall_s16(8);

    for( int i = 0; i < D - 8; i += 8 )
    {
        //lookahead load:
        src2 = v_load_aligned(rightBuf + i + 8);

        //get shifted versions of the current block:
        src_shifted_left  = v_int16x8(_mm_slli_si128(src1_rightBuf.val, 2));
        src_shifted_right = v_int16x8(_mm_srli_si128(src1_rightBuf.val, 2));

        // replace shifted-in zeros with proper values and add P1:
        src_shifted_left  = (src_shifted_left  | v_int16x8(_mm_srli_si128(src0_rightBuf.val, 14))) + P1_reg;
        src_shifted_right = (src_shifted_right | v_int16x8(_mm_slli_si128(src2.val, 14)))          + P1_reg;

        // process and save current block:
        res = v_load_aligned(costs + i) + (v_min(v_min(src_shifted_left, src_shifted_right), v_min(src1_rightBuf, rightMinCostP2_reg)) - rightMinCostP2_reg);
        rightMinCost_new_reg = v_min(rightMinCost_new_reg, res);
        v_store_aligned(rightBuf + i, res);

        // compute and save total cost:
        res = res + v_load_aligned(leftBuf + i) + v_load_aligned(topBuf + i);
        v_store_aligned(leftBuf + i, res);

        // track disparity value with the minimum cost:
        min_sum_cost_reg = v_min(min_sum_cost_reg, res);
        min_sum_pos_reg  = min_sum_pos_reg + ((min_sum_cost_reg == res) & (loop_idx - min_sum_pos_reg));
        loop_idx = loop_idx + eight_reg;

        //update src:
        src0_rightBuf = src1_rightBuf;
        src1_rightBuf = src2;
    }

    // a bit different processing for the last cycle of the loop:
    src_shifted_left  = v_int16x8(_mm_slli_si128(src1_rightBuf.val, 2));
    src_shifted_right = v_int16x8(_mm_srli_si128(src1_rightBuf.val, 2));

    src2 = v_setall_s16(SHRT_MAX);
    src_shifted_left  = (src_shifted_left  | v_int16x8(_mm_srli_si128(src0_rightBuf.val, 14))) + P1_reg;
    src_shifted_right = (src_shifted_right | v_int16x8(_mm_slli_si128(src2.val, 14)))          + P1_reg;

    res = v_load_aligned(costs + D - 8) + (v_min(v_min(src_shifted_left, src_shifted_right), v_min(src1_rightBuf, rightMinCostP2_reg)) - rightMinCostP2_reg);
    rightMinCost = min(v_min(rightMinCost_new_reg, res));
    v_store_aligned(rightBuf + D - 8, res);

    res = res + v_load_aligned(leftBuf + D - 8) + v_load_aligned(topBuf + D - 8);
    v_store_aligned(leftBuf + D - 8, res);

    min_sum_cost_reg = v_min(min_sum_cost_reg, res);
    min_cost = min(min_sum_cost_reg);
    min_sum_pos_reg = min_sum_pos_reg + ((min_sum_cost_reg == res) & (loop_idx - min_sum_pos_reg));
    optimal_disp = min_pos(min_sum_cost_reg, min_sum_pos_reg);
#else
    CostType rightMinCost_new = SHRT_MAX;
    int rightMinCost_P2 = rightMinCost + P2;
    CostType rightBuf_i_minus_1 = SHRT_MAX;
    CostType tmp;
    min_cost = SHRT_MAX;
    for( int i = 0; i < D - 1; i++ )
    {
        tmp = rightBuf[i];
        rightBuf[i] = cv::saturate_cast<CostType>(costs[i] + std::min(std::min(rightBuf_i_minus_1 + P1, rightBuf[i+1] + P1), std::min((int)rightBuf[i], rightMinCost_P2)) - rightMinCost_P2);
        rightBuf_i_minus_1 = tmp;
        rightMinCost_new = std::min(rightMinCost_new, rightBuf[i]);

        leftBuf[i] = cv::saturate_cast<CostType>((int)leftBuf[i] + rightBuf[i] + topBuf[i]);
        if( leftBuf[i] < min_cost )
        {
            optimal_disp = i;
            min_cost = leftBuf[i];
        }
    }

    rightBuf[D-1] = cv::saturate_cast<CostType>(costs[D-1] + std::min(rightBuf_i_minus_1 + P1, std::min((int)rightBuf[D-1], rightMinCost_P2)) - rightMinCost_P2);
    rightMinCost  = std::min(rightMinCost_new, rightBuf[D-1]);
    leftBuf[D-1]  = cv::saturate_cast<CostType>((int)leftBuf[D-1] + rightBuf[D-1] + topBuf[D-1]);
    if( leftBuf[D-1] < min_cost )
    {
        optimal_disp = D - 1;
        min_cost = leftBuf[D-1];
    }
#endif
}
void SGBM3WayMainLoop::operator () (const Range& range) const
{
    // force separate processing of stripes:
    if( range.end > range.start + 1 )
    {
        for( int n = range.start; n < range.end; n++ )
            (*this)(Range(n, n + 1));
        return;
    }

    const int DISP_SCALE = (1 << StereoMatcher::DISP_SHIFT);
    int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;

    // setting up the ranges:
    int src_start_idx = std::max(std::min(range.start*stripe_sz - stripe_overlap, height), 0);
    int src_end_idx   = std::min(range.end*stripe_sz, height);

    int dst_offset;
    if( range.start == 0 )
        dst_offset = stripe_overlap;
    else
        dst_offset = 0;

    Mat cur_buffer = buffers[range.start];
    Mat cur_disp   = dst_disp[range.start];
    cur_disp = Scalar(INVALID_DISP_SCALED);

    // prepare buffers:
    CostType *curCostVolumeLine, *hsumBuf, *pixDiff;
    PixType* tmpBuf;
    CostType *horPassCostVolume, *vertPassCostVolume, *vertPassMin, *rightPassBuf, *disp2CostBuf;
    short* disp2Buf;
    getBufferPointers(cur_buffer, width, width1, D, img1->channels(), SH2, P2,
                      curCostVolumeLine, hsumBuf, pixDiff, tmpBuf, horPassCostVolume,
                      vertPassCostVolume, vertPassMin, rightPassBuf, disp2CostBuf, disp2Buf);

    // start real processing:
    for( int y = src_start_idx; y < src_end_idx; y++ )
    {
        getRawMatchingCost(curCostVolumeLine, hsumBuf, pixDiff, tmpBuf, y, src_start_idx);

        short* disp_row = (short*)cur_disp.ptr(dst_offset + (y - src_start_idx));

        // initialize the auxiliary buffers for the pseudo left-right consistency check:
        for( int x = 0; x < width; x++ )
        {
            disp2Buf[x]     = (short)INVALID_DISP_SCALED;
            disp2CostBuf[x] = SHRT_MAX;
        }

        CostType* C = curCostVolumeLine - D;
        CostType prev_min, min_cost;
        int d, best_d;
        d = best_d = 0;

        // forward pass
        prev_min = 0;
        for( int x = D; x < (1 + width1)*D; x += D )
            accumulateCostsLeftTop(horPassCostVolume + x, horPassCostVolume + x - D, vertPassCostVolume + x, C + x, prev_min, vertPassMin[x/D], D, P1, P2);

        //backward pass
        memset(rightPassBuf, 0, D*sizeof(CostType));
        prev_min = 0;
        for( int x = width1*D; x >= D; x -= D )
        {
            accumulateCostsRight(rightPassBuf, vertPassCostVolume + x, horPassCostVolume + x, C + x, prev_min, D, P1, P2, best_d, min_cost);

            if( uniquenessRatio > 0 )
            {
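                // Note (added summary, not an original comment): best_d is accepted only if it is
                // sufficiently unique - if any disparity further than 1 from best_d has an aggregated
                // cost below min_cost*100/(100 - uniquenessRatio), the scan below exits early with
                // d < D and the pixel is skipped (left invalid) for this pass.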
#if CV_SIMD128
                horPassCostVolume += x;
                int thresh = (100*min_cost)/(100 - uniquenessRatio);
                v_int16x8 thresh_reg = v_setall_s16((short)(thresh + 1));
                v_int16x8 d1 = v_setall_s16((short)(best_d - 1));
                v_int16x8 d2 = v_setall_s16((short)(best_d + 1));
                v_int16x8 eight_reg = v_setall_s16(8);
                v_int16x8 cur_d(0, 1, 2, 3, 4, 5, 6, 7);
                v_int16x8 mask, cost1, cost2;

                for( d = 0; d < D; d += 16 )
                {
                    cost1 = v_load_aligned(horPassCostVolume + d);
                    cost2 = v_load_aligned(horPassCostVolume + d + 8);

                    mask = cost1 < thresh_reg;
                    mask = mask & ((cur_d < d1) | (cur_d > d2));
                    if( v_signmask(mask) )
                        break;

                    cur_d = cur_d + eight_reg;

                    mask = cost2 < thresh_reg;
                    mask = mask & ((cur_d < d1) | (cur_d > d2));
                    if( v_signmask(mask) )
                        break;

                    cur_d = cur_d + eight_reg;
                }
                horPassCostVolume -= x;
#else
                for( d = 0; d < D; d++ )
                {
                    if( horPassCostVolume[x + d]*(100 - uniquenessRatio) < min_cost*100 && std::abs(d - best_d) > 1 )
                        break;
                }
#endif
                if( d < D )
                    continue;
            }
            d = best_d;

            int _x2 = x/D - 1 + minX1 - d - minD;
            if( _x2 >= 0 && _x2 < width && disp2CostBuf[_x2] > min_cost )
            {
                disp2CostBuf[_x2] = min_cost;
                disp2Buf[_x2] = (short)(d + minD);
            }

            if( 0 < d && d < D - 1 )
            {
                // do subpixel quadratic interpolation:
                //   fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])
                //   then find minimum of the parabola.
                int denom2 = std::max(horPassCostVolume[x + d - 1] + horPassCostVolume[x + d + 1] - 2*horPassCostVolume[x + d], 1);
                d = d*DISP_SCALE + ((horPassCostVolume[x + d - 1] - horPassCostVolume[x + d + 1])*DISP_SCALE + denom2)/(denom2*2);
            }
            else
                d *= DISP_SCALE;
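            // For reference (added note): with y1=Sp[d-1], y2=Sp[d], y3=Sp[d+1] the fitted parabola has
            // its minimum at d + (y1 - y3)/(2*(y1 + y3 - 2*y2)); the expression above evaluates this offset
            // in DISP_SCALE fixed-point arithmetic, with denom2 as the (clamped to >= 1) second difference
            // and the "+ denom2" term providing rounding to the nearest fixed-point step.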
            disp_row[(x/D) - 1 + minX1] = (DispType)(d + minD*DISP_SCALE);
        }
        for( int x = minX1; x < maxX1; x++ )
        {
            // pseudo LRC consistency check using only one disparity map;
            // pixels with difference more than disp12MaxDiff are invalidated
            int d1 = disp_row[x];
            if( d1 == INVALID_DISP_SCALED )
                continue;

            int _d = d1 >> StereoMatcher::DISP_SHIFT;
            int d_ = (d1 + DISP_SCALE - 1) >> StereoMatcher::DISP_SHIFT;
            int _x = x - _d, x_ = x - d_;
            if( 0 <= _x && _x < width && disp2Buf[_x] >= minD && std::abs(disp2Buf[_x] - _d) > disp12MaxDiff &&
                0 <= x_ && x_ < width && disp2Buf[x_] >= minD && std::abs(disp2Buf[x_] - d_) > disp12MaxDiff )
                disp_row[x] = (short)INVALID_DISP_SCALED;
        }
    }
}
static void computeDisparity3WaySGBM( const Mat& img1, const Mat& img2,
                                      Mat& disp1, const StereoSGBMParams& params,
                                      Mat* buffers, int nstripes )
{
    // precompute a lookup table for the raw matching cost computation:
    const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
    PixType* clipTab = new PixType[TAB_SIZE];
    int ftzero = std::max(params.preFilterCap, 15) | 1;
    for( int k = 0; k < TAB_SIZE; k++ )
        clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
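    // Added note: clipTab maps a TAB_OFS-biased signed pixel difference to the clipped range
    // [0, 2*ftzero], i.e. clipTab[TAB_OFS + v] = min(max(v, -ftzero), ftzero) + ftzero, which is the
    // same preFilterCap-style clipping used by the other SGBM modes in calcPixelCostBT.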
    // allocate separate dst_disp arrays to avoid conflicts due to stripe overlap:
    int stripe_sz = (int)ceil(img1.rows/(double)nstripes);
    int stripe_overlap = (params.SADWindowSize/2 + 1) + (int)ceil(0.1*stripe_sz);
    Mat* dst_disp = new Mat[nstripes];
    for( int i = 0; i < nstripes; i++ )
        dst_disp[i].create(stripe_sz + stripe_overlap, img1.cols, CV_16S);

    parallel_for_(Range(0, nstripes), SGBM3WayMainLoop(buffers, img1, img2, dst_disp, params, clipTab, nstripes, stripe_overlap));

    //assemble disp1 from dst_disp:
    short* dst_row;
    short* src_row;
    for( int i = 0; i < disp1.rows; i++ )
    {
        dst_row = (short*)disp1.ptr(i);
        src_row = (short*)dst_disp[i/stripe_sz].ptr(stripe_overlap + i % stripe_sz);
        memcpy(dst_row, src_row, disp1.cols*sizeof(short));
    }

    delete[] clipTab;
    delete[] dst_disp;
}
class StereoSGBMImpl : public StereoSGBM
{
public:
@@ -857,7 +1489,11 @@ public:
        disparr.create( left.size(), CV_16S );
        Mat disp = disparr.getMat();

        if( params.mode == MODE_SGBM_3WAY )
            computeDisparity3WaySGBM( left, right, disp, params, buffers, num_stripes );
        else
            computeDisparitySGBM( left, right, disp, params, buffer );
        medianBlur(disp, disp, 3);

        if( params.speckleWindowSize > 0 )
@@ -933,6 +1569,12 @@ public:
    StereoSGBMParams params;
    Mat buffer;

    // the number of stripes is fixed, regardless of the number of threads/processors,
    // to make the results fully reproducible:
    static const int num_stripes = 4;
    Mat buffers[num_stripes];

    static const char* name_;
};
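// Example (illustrative sketch, not part of this patch): the new mode is selected through the
// existing StereoSGBM factory and setMode, e.g.
//   Ptr<StereoSGBM> sgbm = StereoSGBM::create(0 /*minDisparity*/, 96 /*numDisparities, multiple of 16*/, 3 /*blockSize*/);
//   sgbm->setMode(StereoSGBM::MODE_SGBM_3WAY);
//   Mat disparity;
//   sgbm->compute(leftImg, rightImg, disparity); // CV_16S output, fixed-point with DISP_SHIFT fractional bits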