From 2918c3d75a34c307bc3993cb34189757d27b918a Mon Sep 17 00:00:00 2001
From: q <q@f4ekmah>
Date: Tue, 21 Mar 2017 14:51:19 +0000
Subject: [PATCH 1/6] First occurence of 4-directional version of SGBM. Even
 without any tests. Next step is parallelising it.

---
 modules/calib3d/include/opencv2/calib3d.hpp |   3 +-
 modules/calib3d/src/stereosgbm.cpp          | 798 +++++++++++++++++++-
 2 files changed, 763 insertions(+), 38 deletions(-)

diff --git a/modules/calib3d/include/opencv2/calib3d.hpp b/modules/calib3d/include/opencv2/calib3d.hpp
index 5a0e020d31..dd28d97e7f 100644
--- a/modules/calib3d/include/opencv2/calib3d.hpp
+++ b/modules/calib3d/include/opencv2/calib3d.hpp
@@ -1809,7 +1809,8 @@ public:
     {
         MODE_SGBM = 0,
         MODE_HH   = 1,
-        MODE_SGBM_3WAY = 2
+        MODE_SGBM_3WAY = 2,
+        MODE_HH4  = 1
     };
 
     CV_WRAP virtual int getPreFilterCap() const = 0;
diff --git a/modules/calib3d/src/stereosgbm.cpp b/modules/calib3d/src/stereosgbm.cpp
index 27ce62e6f9..bedc477e8b 100644
--- a/modules/calib3d/src/stereosgbm.cpp
+++ b/modules/calib3d/src/stereosgbm.cpp
@@ -301,9 +301,9 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
     const CostType MAX_COST = SHRT_MAX;
 
     int minD = params.minDisparity, maxD = minD + params.numDisparities;
-    Size SADWindowSize;
-    SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;
-    int ftzero = std::max(params.preFilterCap, 15) | 1;
+    Size SADWindowSize;                                                                                 //4e: SAD means Sum of Absolute Differences
+    SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;   //4e: and this is always square
+    int ftzero = std::max(params.preFilterCap, 15) | 1;                             //4e:ftzero clips x-derivatives. I think, this story with arrays is about non-realized SIMD method
     int uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
     int disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
     int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
@@ -313,11 +313,11 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
     int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
     int SW2 = SADWindowSize.width/2, SH2 = SADWindowSize.height/2;
     bool fullDP = params.mode == StereoSGBM::MODE_HH;
-    int npasses = fullDP ? 2 : 1;
-    const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
+    int npasses = fullDP ? 2 : 1;                                           //4e: 2 passes with different behaviour for Hirshmueller method.
+    const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;                  //4e: array is such big due to derivative could be +-8*256 in worst cases
     PixType clipTab[TAB_SIZE];
 
-    for( k = 0; k < TAB_SIZE; k++ )
+    for( k = 0; k < TAB_SIZE; k++ )         //4e: If ftzero would = 4, array containment will be = -4 -4 -4 ... -4 -3 -2 -1 0 1 2 3 4 ... 4 4 4
         clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
 
     if( minX1 >= maxX1 )
@@ -330,25 +330,25 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
 
     // NR - the number of directions. the loop on x below that computes Lr assumes that NR == 8.
     // if you change NR, please, modify the loop as well.
-    int D2 = D+16, NRD2 = NR2*D2;
+    int D2 = D+16, NRD2 = NR2*D2;               //4e: Somewhere in code we need d+1, so D+1. One of simplest solutuons is increasing D-dimension on 1. But 1 is 16, when storage should be aligned.
 
     // the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
     // for 8-way dynamic programming we need the current row and
     // the previous row, i.e. 2 rows in total
-    const int NLR = 2;
-    const int LrBorder = NLR - 1;
+    const int NLR = 2;                          //4e: We assume, that we need one or more previous steps in our linear dynamic(one right here).
+    const int LrBorder = NLR - 1;               //4e: for simplification of calculations we need border for taking previous dynamic solutions.
 
     // for each possible stereo match (img1(x,y) <=> img2(x-d,y))
     // we keep pixel difference cost (C) and the summary cost over NR directions (S).
     // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
     size_t costBufSize = width1*D;
-    size_t CSBufSize = costBufSize*(fullDP ? height : 1);
-    size_t minLrSize = (width1 + LrBorder*2)*NR2, LrSize = minLrSize*D2;
+    size_t CSBufSize = costBufSize*(fullDP ? height : 1);                           //4e: For HH mode it's better to keep whole array of costs.
+    size_t minLrSize = (width1 + LrBorder*2)*NR2, LrSize = minLrSize*D2;            //4e: TODO: Understand why NR2 per pass instead od NR2/2 (Probably, without any reason. That doesn't make code wrong)
     int hsumBufNRows = SH2*2 + 2;
     size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[]
-    costBufSize*(hsumBufNRows + 1)*sizeof(CostType) + // hsumBuf, pixdiff
-    CSBufSize*2*sizeof(CostType) + // C, S
-    width*16*img1.channels()*sizeof(PixType) + // temp buffer for computing per-pixel cost
+    costBufSize*(hsumBufNRows + 1)*sizeof(CostType) + // hsumBuf, pixdiff           //4e: TODO: Why we should increase sum window height one more time?
+    CSBufSize*2*sizeof(CostType) + // C, S                                          //4e: C is Block sum of costs, S is multidirectional dynamic sum with same size
+    width*16*img1.channels()*sizeof(PixType) + // temp buffer for computing per-pixel cost  //4e: It is needed for calcPixelCostBT function, as "buffer" value
     width*(sizeof(CostType) + sizeof(DispType)) + 1024; // disp2cost + disp2
 
     if( buffer.empty() || !buffer.isContinuous() ||
@@ -361,7 +361,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
     CostType* hsumBuf = Sbuf + CSBufSize;
     CostType* pixDiff = hsumBuf + costBufSize*hsumBufNRows;
 
-    CostType* disp2cost = pixDiff + costBufSize + (LrSize + minLrSize)*NLR;
+    CostType* disp2cost = pixDiff + costBufSize + (LrSize + minLrSize)*NLR;     //4e: It is containers for backwards disparity, made by S[d] too, but with other method
     DispType* disp2ptr = (DispType*)(disp2cost + width);
     PixType* tempBuf = (PixType*)(disp2ptr + width);
 
@@ -373,21 +373,21 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
     {
         int x1, y1, x2, y2, dx, dy;
 
-        if( pass == 1 )
+        if( pass == 1 )                     //4e: on the first pass, we work down, so calculate directions down, diagdown and right
         {
             y1 = 0; y2 = height; dy = 1;
             x1 = 0; x2 = width1; dx = 1;
         }
-        else
+        else                                //4e: on the second pass, we work up, so calculate directions up, diagup and up
         {
             y1 = height-1; y2 = -1; dy = -1;
             x1 = width1-1; x2 = -1; dx = -1;
         }
 
-        CostType *Lr[NLR]={0}, *minLr[NLR]={0};
+        CostType *Lr[NLR]={0}, *minLr[NLR]={0};             //4e: arrays for L(x,y,r,d) of previous and current rows and minimums of them
 
-        for( k = 0; k < NLR; k++ )
-        {
+        for( k = 0; k < NLR; k++ )                          //4e: One of them is needed, and one of them is stored. So, we need to swap pointer
+        {                                                   //4e: Yes, and this is done at the end of next cycle, not here.
             // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
             // and will occasionally use negative indices with the arrays
             // we need to shift Lr[k] pointers by 1, to give the space for d=-1.
@@ -408,28 +408,28 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
 
             if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
             {
-                int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1;
+                int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1;   //4e: for first line's block sum we need calculate half-window of costs and only one for other
 
                 for( k = dy1; k <= dy2; k++ )
                 {
-                    CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;
+                    CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;   //4e: Ring buffer for horizontally summed lines
 
                     if( k < height )
                     {
                         calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero );
 
                         memset(hsumAdd, 0, D*sizeof(CostType));
-                        for( x = 0; x <= SW2*D; x += D )
+                        for( x = 0; x <= SW2*D; x += D )                //4e: Calculation summed costs for all disparities in first pixel of line
                         {
                             int scale = x == 0 ? SW2 + 1 : 1;
                             for( d = 0; d < D; d++ )
                                 hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]*scale);
                         }
 
-                        if( y > 0 )
-                        {
+                        if( y > 0 )                 //4e: We calculate horizontal sums and forming full block sums for y coord by adding this horsums to previous line's sums and subtracting stored lowest
+                        {                           //4e: horsum in hsumBuf. Exception is case y=0, where we need many iterations per lines to create full blocking sum.
                             const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
-                            const CostType* Cprev = !fullDP || y == 0 ? C : C - costBufSize;
+                            const CostType* Cprev = !fullDP || y == 0 ? C : C - costBufSize;    //4e: Well, actually y>0, so we don't need this check: y==0
 
                             for( x = D; x < width1*D; x += D )
                             {
@@ -465,8 +465,8 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                         }
                         else
                         {
-                            for( x = D; x < width1*D; x += D )
-                            {
+                            for( x = D; x < width1*D; x += D )  //4e: Calcluates horizontal sums if (y==0). This piece of code is calling SH2+1 times and then result is used in different way
+                            {                                   //4e: to create full blocks sum. That's why this code is isolated from upper case.
                                 const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
                                 const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
 
@@ -476,7 +476,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                         }
                     }
 
-                    if( y == 0 )
+                    if( y == 0 )        //4e: Calculating first full block sum.
                     {
                         int scale = k == 0 ? SH2 + 1 : 1;
                         for( x = 0; x < width1*D; x++ )
@@ -485,13 +485,13 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                 }
 
                 // also, clear the S buffer
-                for( k = 0; k < width1*D; k++ )
+                for( k = 0; k < width1*D; k++ )                                         //4e: only on first pass, so it keep old information, don't be confused
                     S[k] = 0;
             }
 
             // clear the left and the right borders
-            memset( Lr[0] - NRD2*LrBorder - 8, 0, NRD2*LrBorder*sizeof(CostType) );
-            memset( Lr[0] + width1*NRD2 - 8, 0, NRD2*LrBorder*sizeof(CostType) );
+            memset( Lr[0] - NRD2*LrBorder - 8, 0, NRD2*LrBorder*sizeof(CostType) );     //4e: To understand this "8" shifts and how they could work it's simpler to imagine pixel dislocation in memory
+            memset( Lr[0] + width1*NRD2 - 8, 0, NRD2*LrBorder*sizeof(CostType) );       //4e: ...00000000|NRD2-16 of real costs value(and some of them are zeroes too)|00000000...
             memset( minLr[0] - NR2*LrBorder, 0, NR2*LrBorder*sizeof(CostType) );
             memset( minLr[0] + width1*NR2, 0, NR2*LrBorder*sizeof(CostType) );
 
@@ -613,7 +613,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
 
                     for( d = 0; d < D; d++ )
                     {
-                        int Cpd = Cp[d], L0, L1, L2, L3;
+                        int Cpd = Cp[d], L0, L1, L2, L3;   //4e: Remember, that every Cp is increased on P2 in line number 369. That's why next 4 lines are paper-like actually
 
                         L0 = Cpd + std::min((int)Lr_p0[d], std::min(Lr_p0[d-1] + P1, std::min(Lr_p0[d+1] + P1, delta0))) - delta0;
                         L1 = Cpd + std::min((int)Lr_p1[d], std::min(Lr_p1[d-1] + P1, std::min(Lr_p1[d+1] + P1, delta1))) - delta1;
@@ -654,7 +654,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                     CostType* Sp = S + x*D;
                     int minS = MAX_COST, bestDisp = -1;
 
-                    if( npasses == 1 )
+                    if( npasses == 1 )              //4e: in this case we could take fifth direction almost for free(direction "left")
                     {
                         int xm = x*NR2, xd = xm*D2;
 
@@ -804,7 +804,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                     disp1ptr[x + minX1] = (DispType)(d + minD*DISP_SCALE);
                 }
 
-                for( x = minX1; x < maxX1; x++ )
+                for( x = minX1; x < maxX1; x++ )        //4e: Left-right check itself
                 {
                     // we round the computed disparity both towards -inf and +inf and check
                     // if either of the corresponding disparities in disp2 is consistent.
@@ -815,8 +815,8 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                     int _d = d1 >> DISP_SHIFT;
                     int d_ = (d1 + DISP_SCALE-1) >> DISP_SHIFT;
                     int _x = x - _d, x_ = x - d_;
-                    if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff &&
-                       0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff )
+                    if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff &&         //4e: To dismiss disparity, we should assure, that there is no any
+                       0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff )           //4e: chance to understand this as correct.
                         disp1ptr[x] = (DispType)INVALID_DISP_SCALED;
                 }
             }
@@ -828,8 +828,732 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
     }
 }
 
+////////////////////////////////////////////////////////////////////////////////////////////
+
+/*
+ This is new experimential version of disparity calculation, which should be parralled after
+TODO: Don't forget to rewrire this commentaries after
+
+ computes disparity for "roi" in img1 w.r.t. img2 and write it to disp1buf.
+ that is, disp1buf(x, y)=d means that img1(x+roi.x, y+roi.y) ~ img2(x+roi.x-d, y+roi.y).
+ minD <= d < maxD.
+ disp2full is the reverse disparity map, that is:
+ disp2full(x+roi.x,y+roi.y)=d means that img2(x+roi.x, y+roi.y) ~ img1(x+roi.x+d, y+roi.y)
+
+ note that disp1buf will have the same size as the roi and
+ disp2full will have the same size as img1 (or img2).
+ On exit disp2buf is not the final disparity, it is an intermediate result that becomes
+ final after all the tiles are processed.
+
+ the disparity in disp1buf is written with sub-pixel accuracy
+ (4 fractional bits, see StereoSGBM::DISP_SCALE),
+ using quadratic interpolation, while the disparity in disp2buf
+ is written as is, without interpolation.
+
+ disp2cost also has the same size as img1 (or img2).
+ It contains the minimum current cost, used to find the best disparity, corresponding to the minimal cost.
+ */
+static void computeDisparitySGBMParallel( const Mat& img1, const Mat& img2,
+                                 Mat& disp1, const StereoSGBMParams& params,
+                                 Mat& buffer )
+{
+//#if CV_SIMD128
+//    // maxDisparity is supposed to multiple of 16, so we can forget doing else
+//    static const uchar LSBTab[] =
+//    {
+//        0, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+//        5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+//        6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+//        5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+//        7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+//        5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+//        6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+//        5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
+//    };
+//    static const v_uint16x8 v_LSB = v_uint16x8(0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
+//
+//    bool useSIMD = hasSIMD128();
+//#endif
+
+    const int ALIGN = 16;
+    const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
+    const int DISP_SCALE = (1 << DISP_SHIFT);
+    const CostType MAX_COST = SHRT_MAX;
+
+    int minD = params.minDisparity, maxD = minD + params.numDisparities;
+    Size SADWindowSize;                                                                                 //4e: SAD means Sum of Absolute Differences
+    SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;   //4e: and this is always square
+    int ftzero = std::max(params.preFilterCap, 15) | 1;                             //4e:ftzero clips x-derivatives. I think, this story with arrays is about non-realized SIMD method
+    int uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
+    int disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
+    int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);   //TODO: think about P1/S(x,y) Proportion
+    int k, width = disp1.cols, height = disp1.rows;
+    int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
+    int D = maxD - minD, width1 = maxX1 - minX1;
+    int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
+    int SW2 = SADWindowSize.width/2, SH2 = SADWindowSize.height/2;
+    const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;                  //4e: array is such big due to derivative could be +-8*256 in worst cases
+    PixType clipTab[TAB_SIZE];
+
+    for( k = 0; k < TAB_SIZE; k++ )         //4e: If ftzero would = 4, array containment will be = -4 -4 -4 ... -4 -3 -2 -1 0 1 2 3 4 ... 4 4 4
+        clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
+
+    if( minX1 >= maxX1 )
+    {
+        disp1 = Scalar::all(INVALID_DISP_SCALED);
+        return;
+    }
+
+    CV_Assert( D % 16 == 0 );                    //TODO: Are you sure? By the way, why not 8?
+
+    // NR - the number of directions. the loop on x below that computes Lr assumes that NR == 8.
+    // if you change NR, please, modify the loop as well.
+    int D2 = D+16;                  //4e: Somewhere in code we need d+1, so D+1. One of simplest solutuons is increasing D-dimension on 1. But 1 is 16, when storage should be aligned.
+
+    // the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
+    // for 8-way dynamic programming we need the current row and
+    // the previous row, i.e. 2 rows in total
+    const int NLR = 2;                          //4e: We assume, that we need one or more previous steps in our linear dynamic(one right here).
+    const int LrBorder = NLR - 1;               //4e: for simplification of calculations we need border for taking previous dynamic solutions.
+
+    // for each possible stereo match (img1(x,y) <=> img2(x-d,y))
+    // we keep pixel difference cost (C) and the summary cost over NR directions (S).
+    // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
+    size_t costBufSize = width1*D;
+    size_t CSBufSize = costBufSize*height;                                            //4e: For HH mode it's better to keep whole array of costs.
+    size_t minLrSize = (width1 + LrBorder*2), LrSize = minLrSize*D2;                  //4e: TODO: Understand why NR2 per pass instead od NR2/2 (Probably, without any reason. That doesn't make code wrong)
+    int hsumBufNRows = SH2*2 + 2;
+    size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[]
+    costBufSize*(hsumBufNRows + 1)*sizeof(CostType) + // hsumBuf, pixdiff           //4e: TODO: Why we should increase sum window height one more time?
+    CSBufSize*2*sizeof(CostType) + // C, S                                          //4e: C is Block sum of costs, S is multidirectional dynamic sum with same size
+    width*16*img1.channels()*sizeof(PixType) + // temp buffer for computing per-pixel cost  //4e: It is needed for calcPixelCostBT function, as "buffer" value
+    width*(sizeof(CostType) + sizeof(DispType)) + 1024; // disp2cost + disp2
+
+    if( buffer.empty() || !buffer.isContinuous() ||
+        buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
+        buffer.create(1, (int)totalBufSize, CV_8U);
+
+    // summary cost over different (nDirs) directions
+    CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), ALIGN);
+    CostType* Sbuf = Cbuf + CSBufSize;
+    CostType* hsumBuf = Sbuf + CSBufSize;
+    CostType* pixDiff = hsumBuf + costBufSize*hsumBufNRows;
+
+    CostType* disp2cost = pixDiff + costBufSize + (LrSize + minLrSize)*NLR;     //4e: It is containers for backwards disparity, made by S[d] too, but with other method
+    DispType* disp2ptr = (DispType*)(disp2cost + width);
+    PixType* tempBuf = (PixType*)(disp2ptr + width);
+
+    // add P2 to every C(x,y). it saves a few operations in the inner loops
+    for(k = 0; k < (int)CSBufSize; k++ )
+        Cbuf[k] = (CostType)P2;
+
+    // Two vertical passes - up to down and down to up
+    for( int pass = 1; pass <= 2; pass++ ) //TODO: rename this magic 2.
+    {
+        int y1, y2, dy;
+
+        if( pass == 1 )
+        {
+            y1 = 0; y2 = height; dy = 1;
+        }
+        else
+        {
+            y1 = height-1; y2 = -1; dy = -1;
+        }
+
+        CostType *Lr[NLR]={0}, *minLr[NLR]={0};             //4e: arrays for L(x,y,r,d) of previous and current rows and minimums of them
+
+        for( k = 0; k < NLR; k++ )                          //4e: One of them is needed, and one of them is stored. So, we need to swap pointer
+        {                                                   //4e: Yes, and this is done at the end of next cycle, not here.
+            // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
+            // and will occasionally use negative indices with the arrays
+            // we need to shift Lr[k] pointers by 1, to give the space for d=-1.
+            // however, then the alignment will be imperfect, i.e. bad for SSE,
+            // thus we shift the pointers by 8 (8*sizeof(short) == 16 - ideal alignment)
+            Lr[k] = pixDiff + costBufSize + LrSize*k + D2*LrBorder + 8;
+            memset( Lr[k] - LrBorder*D2 - 8, 0, LrSize*sizeof(CostType) );
+            minLr[k] = pixDiff + costBufSize + LrSize*NLR + minLrSize*k + LrBorder;
+            memset( minLr[k] - LrBorder, 0, minLrSize*sizeof(CostType) );
+        }
+
+        for( int y = y1; y != y2; y += dy )
+        {
+            int x, d;
+            CostType* C = Cbuf + y*costBufSize;
+            CostType* S = Sbuf + y*costBufSize;
+
+            if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
+            {
+                int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1;   //4e: for first line's block sum we need calculate half-window of costs and only one for other
+
+                for( k = dy1; k <= dy2; k++ )
+                {
+                    CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;   //4e: Ring buffer for horizontally summed lines
+
+                    if( k < height )
+                    {
+                        calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero );
+
+                        memset(hsumAdd, 0, D*sizeof(CostType));
+                        for( x = 0; x <= SW2*D; x += D )                //4e: Calculation summed costs for all disparities in first pixel of line
+                        {
+                            int scale = x == 0 ? SW2 + 1 : 1;
+                            for( d = 0; d < D; d++ )
+                                hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]*scale);
+                        }
+
+                        if( y > 0 )                 //4e: We calculate horizontal sums and forming full block sums for y coord by adding this horsums to previous line's sums and subtracting stored lowest
+                        {                           //4e: horsum in hsumBuf. Exception is case y=0, where we need many iterations per lines to create full blocking sum.
+                            const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
+                            const CostType* Cprev = !fullDP || y == 0 ? C : C - costBufSize;    //4e: Well, actually y>0, so we don't need this check: y==0
+
+                            for( x = D; x < width1*D; x += D )
+                            {
+                                const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
+                                const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
+
+//                            #if CV_SIMD128
+//                                if( useSIMD )
+//                                {
+//                                    for( d = 0; d < D; d += 8 )
+//                                    {
+//                                        v_int16x8 hv = v_load(hsumAdd + x - D + d);
+//                                        v_int16x8 Cx = v_load(Cprev + x + d);
+//                                        v_int16x8 psub = v_load(pixSub + d);
+//                                        v_int16x8 padd = v_load(pixAdd + d);
+//                                        hv = (hv - psub + padd);
+//                                        psub = v_load(hsumSub + x + d);
+//                                        Cx = Cx - psub + hv;
+//                                        v_store(hsumAdd + x + d, hv);
+//                                        v_store(C + x + d, Cx);
+//                                    }
+//                                }
+//                                else
+//                            #endif
+                                {
+                                    for( d = 0; d < D; d++ )
+                                    {
+                                        int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
+                                        C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]);
+                                    }
+                                }
+                            }
+                        }
+                        else
+                        {
+                            for( x = D; x < width1*D; x += D )  //4e: Calcluates horizontal sums if (y==0). This piece of code is calling SH2+1 times and then result is used in different way
+                            {                                   //4e: to create full blocks sum. That's why this code is isolated from upper case.
+                                const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
+                                const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
+
+                                for( d = 0; d < D; d++ )
+                                    hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
+                            }
+                        }
+                    }
+
+                    if( y == 0 )        //4e: Calculating first full block sum.
+                    {
+                        int scale = k == 0 ? SH2 + 1 : 1;
+                        for( x = 0; x < width1*D; x++ )
+                            C[x] = (CostType)(C[x] + hsumAdd[x]*scale);
+                    }
+                }
+
+                // also, clear the S buffer
+                for( k = 0; k < width1*D; k++ )                                         //4e: only on first pass, so it keep old information, don't be confused
+                    S[k] = 0;
+            }
+
+            // clear the left and the right borders
+            memset( Lr[0] - D2*LrBorder - 8, 0, D2*LrBorder*sizeof(CostType) );     //4e: To understand this "8" shifts and how they could work it's simpler to imagine pixel dislocation in memory
+            memset( Lr[0] + width1*D2 - 8, 0, D2*LrBorder*sizeof(CostType) );       //4e: ...00000000|D2-16 of real costs value(and some of them are zeroes too)|00000000...
+            memset( minLr[0] - LrBorder, 0, LrBorder*sizeof(CostType) );
+            memset( minLr[0] + width1, 0, LrBorder*sizeof(CostType) );
+
+            /*
+             [formula 13 in the paper]
+             compute L_r(p, d) = C(p, d) +
+             min(L_r(p-r, d),
+             L_r(p-r, d-1) + P1,
+             L_r(p-r, d+1) + P1,
+             min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
+             where p = (x,y), r is one of the directions.
+             we process all the directions at once:
+             0: r=(-dx, 0)
+             1: r=(-1, -dy)
+             2: r=(0, -dy)
+             3: r=(1, -dy)
+             4: r=(-2, -dy)
+             5: r=(-1, -dy*2)
+             6: r=(1, -dy*2)
+             7: r=(2, -dy)
+             */
+            for( x = 0; x != width; x++ )
+            {
+                int xd = x*D2;
+
+                int delta = minLr[1][x];
+
+                CostType* Lr_ppr = Lr[1] + xd;
+
+                Lr_ppr[-1] = Lr_ppr[D] = MAX_COST;
+
+                CostType* Lr_p = Lr[0] + xd;
+                const CostType* Cp = C + x*D;
+                CostType* Sp = S + x*D;
+
+//            #if CV_SIMD128
+//                if( useSIMD )
+//                {
+//                    v_int16x8 _P1 = v_setall_s16((short)P1);
+//
+//                    v_int16x8 _delta0 = v_setall_s16((short)delta0);
+//                    v_int16x8 _delta1 = v_setall_s16((short)delta1);
+//                    v_int16x8 _delta2 = v_setall_s16((short)delta2);
+//                    v_int16x8 _delta3 = v_setall_s16((short)delta3);
+//                    v_int16x8 _minL0 = v_setall_s16((short)MAX_COST);
+//
+//                    for( d = 0; d < D; d += 8 )
+//                    {
+//                        v_int16x8 Cpd = v_load(Cp + d);
+//                        v_int16x8 L0, L1, L2, L3;
+//
+//                        L0 = v_load(Lr_p0 + d);
+//                        L1 = v_load(Lr_p1 + d);
+//                        L2 = v_load(Lr_ppr + d);
+//                        L3 = v_load(Lr_p3 + d);
+//
+//                        L0 = v_min(L0, (v_load(Lr_p0 + d - 1) + _P1));
+//                        L0 = v_min(L0, (v_load(Lr_p0 + d + 1) + _P1));
+//
+//                        L1 = v_min(L1, (v_load(Lr_p1 + d - 1) + _P1));
+//                        L1 = v_min(L1, (v_load(Lr_p1 + d + 1) + _P1));
+//
+//                        L2 = v_min(L2, (v_load(Lr_ppr + d - 1) + _P1));
+//                        L2 = v_min(L2, (v_load(Lr_ppr + d + 1) + _P1));
+//
+//                        L3 = v_min(L3, (v_load(Lr_p3 + d - 1) + _P1));
+//                        L3 = v_min(L3, (v_load(Lr_p3 + d + 1) + _P1));
+//
+//                        L0 = v_min(L0, _delta0);
+//                        L0 = ((L0 - _delta0) + Cpd);
+//
+//                        L1 = v_min(L1, _delta1);
+//                        L1 = ((L1 - _delta1) + Cpd);
+//
+//                        L2 = v_min(L2, _delta2);
+//                        L2 = ((L2 - _delta2) + Cpd);
+//
+//                        L3 = v_min(L3, _delta3);
+//                        L3 = ((L3 - _delta3) + Cpd);
+//
+//                        v_store(Lr_p + d, L0);
+//                        v_store(Lr_p + d + D2, L1);
+//                        v_store(Lr_p + d + D2*2, L2);
+//                        v_store(Lr_p + d + D2*3, L3);
+//
+//                        // Get minimum from in L0-L3
+//                        v_int16x8 t02L, t02H, t13L, t13H, t0123L, t0123H;
+//                        v_zip(L0, L2, t02L, t02H);            // L0[0] L2[0] L0[1] L2[1]...
+//                        v_zip(L1, L3, t13L, t13H);            // L1[0] L3[0] L1[1] L3[1]...
+//                        v_int16x8 t02 = v_min(t02L, t02H);    // L0[i] L2[i] L0[i] L2[i]...
+//                        v_int16x8 t13 = v_min(t13L, t13H);    // L1[i] L3[i] L1[i] L3[i]...
+//                        v_zip(t02, t13, t0123L, t0123H);      // L0[i] L1[i] L2[i] L3[i]...
+//                        v_int16x8 t0 = v_min(t0123L, t0123H);
+//                        _minL0 = v_min(_minL0, t0);
+//
+//                        v_int16x8 Sval = v_load(Sp + d);
+//
+//                        L0 = L0 + L1;
+//                        L2 = L2 + L3;
+//                        Sval = Sval + L0;
+//                        Sval = Sval + L2;
+//
+//                        v_store(Sp + d, Sval);
+//                    }
+//
+//                    v_int32x4 minL, minH;
+//                    v_expand(_minL0, minL, minH);
+//                    v_pack_store(&minLr[0][x], v_min(minL, minH));
+//                }
+//                else
+//            #endif
+                {
+                    int minL = MAX_COST;
+
+                    for( d = 0; d < D; d++ )
+                    {
+                        int Cpd = Cp[d], L;   //4e: Remember, that every Cp is increased on P2 in line number 369. That's why next 4 lines are paper-like actually
+
+                        L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;
+
+                        Lr_p[d] = (CostType)L;
+                        minL0 = std::min(minL, L);
+
+                        Sp[d] = saturate_cast<CostType>(Sp[d] + L0);
+                    }
+                    minLr[0][x] = (CostType)minL;
+                }
+            }
+
+            // now shift the cyclic buffers
+            std::swap( Lr[0], Lr[1] );
+            std::swap( minLr[0], minLr[1] );
+        }
+    }
+
+//    for( int pass = 1; pass <= 2; pass++ ) //pass=1 or left-to-right pass
+    {
+
+        CostType *Lr, *minLr;
+
+        {                                                   //4e: Yes, and this is done at the end of next cycle, not here.
+            // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
+            // and will occasionally use negative indices with the arrays
+            // we need to shift Lr[k] pointers by 1, to give the space for d=-1.
+            // however, then the alignment will be imperfect, i.e. bad for SSE,
+            // thus we shift the pointers by 8 (8*sizeof(short) == 16 - ideal alignment)
+            Lr = pixDiff + costBufSize + D2*LrBorder + 8;
+            memset( Lr - LrBorder*D2 - 8, 0, LrSize*sizeof(CostType) );
+            minLr = pixDiff + costBufSize + LrSize*NLR + LrBorder;
+            memset( minLr - LrBorder, 0, minLrSize*sizeof(CostType) );
+        }
+
+        for( int y = 0; y != height; y++)
+        {
+            int x, d;
+            DispType* disp1ptr = disp1.ptr<DispType>(y);
+            CostType* C = Cbuf + costBufSize;
+            CostType* S = Sbuf + costBufSize;
+
+            for( x = 0; x < width; x++ )
+            {
+                disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED;
+                disp2cost[x] = MAX_COST;
+            }
+
+            // clear the left and the right borders                             //TODO: Well, two of that memsets we could delete, but rest of them is direction-dependent
+            memset( Lr - D2*LrBorder - 8, 0, D2*LrBorder*sizeof(CostType) );     //4e: To understand this "8" shifts and how they could work it's simpler to imagine pixel dislocation in memory
+            memset( Lr + width1*D2 - 8, 0, D2*LrBorder*sizeof(CostType) );       //4e: ...00000000|D2-16 of real costs value(and some of them are zeroes too)|00000000...
+            memset( minLr - LrBorder, 0, LrBorder*sizeof(CostType) );
+            memset( minLr + width1, 0, LrBorder*sizeof(CostType) );
+
+            /*
+             [formula 13 in the paper]
+             compute L_r(p, d) = C(p, d) +
+             min(L_r(p-r, d),
+             L_r(p-r, d-1) + P1,
+             L_r(p-r, d+1) + P1,
+             min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
+             where p = (x,y), r is one of the directions.
+             we process all the directions at once:
+             0: r=(-dx, 0)
+             1: r=(-1, -dy)
+             2: r=(0, -dy)
+             3: r=(1, -dy)
+             4: r=(-2, -dy)
+             5: r=(-1, -dy*2)
+             6: r=(1, -dy*2)
+             7: r=(2, -dy)
+             */
+            for( x = 0; x != width1; x++)
+            {
+                int xd = x*D2;
+
+                int delta = minLr[x - 1];
+
+                CostType* Lr_ppr = Lr + xd - D2;
+
+                Lr_ppr[-1] = Lr_ppr[D] = MAX_COST;
+
+                CostType* Lr_p = Lr + xd;
+                const CostType* Cp = C + x*D;
+                CostType* Sp = S + x*D;
+
+//            #if CV_SIMD128
+//                if( useSIMD )
+//                {
+//                    v_int16x8 _P1 = v_setall_s16((short)P1);
+//
+//                    v_int16x8 _delta0 = v_setall_s16((short)delta0);
+//                    v_int16x8 _delta1 = v_setall_s16((short)delta1);
+//                    v_int16x8 _delta2 = v_setall_s16((short)delta2);
+//                    v_int16x8 _delta3 = v_setall_s16((short)delta3);
+//                    v_int16x8 _minL0 = v_setall_s16((short)MAX_COST);
+//
+//                    for( d = 0; d < D; d += 8 )
+//                    {
+//                        v_int16x8 Cpd = v_load(Cp + d);
+//                        v_int16x8 L0, L1, L2, L3;
+//
+//                        L0 = v_load(Lr_ppr + d);
+//                        L1 = v_load(Lr_p1 + d);
+//                        L2 = v_load(Lr_p2 + d);
+//                        L3 = v_load(Lr_p3 + d);
+//
+//                        L0 = v_min(L0, (v_load(Lr_ppr + d - 1) + _P1));
+//                        L0 = v_min(L0, (v_load(Lr_ppr + d + 1) + _P1));
+//
+//                        L1 = v_min(L1, (v_load(Lr_p1 + d - 1) + _P1));
+//                        L1 = v_min(L1, (v_load(Lr_p1 + d + 1) + _P1));
+//
+//                        L2 = v_min(L2, (v_load(Lr_p2 + d - 1) + _P1));
+//                        L2 = v_min(L2, (v_load(Lr_p2 + d + 1) + _P1));
+//
+//                        L3 = v_min(L3, (v_load(Lr_p3 + d - 1) + _P1));
+//                        L3 = v_min(L3, (v_load(Lr_p3 + d + 1) + _P1));
+//
+//                        L0 = v_min(L0, _delta0);
+//                        L0 = ((L0 - _delta0) + Cpd);
+//
+//                        L1 = v_min(L1, _delta1);
+//                        L1 = ((L1 - _delta1) + Cpd);
+//
+//                        L2 = v_min(L2, _delta2);
+//                        L2 = ((L2 - _delta2) + Cpd);
+//
+//                        L3 = v_min(L3, _delta3);
+//                        L3 = ((L3 - _delta3) + Cpd);
+//
+//                        v_store(Lr_p + d, L0);
+//                        v_store(Lr_p + d + D2, L1);
+//                        v_store(Lr_p + d + D2*2, L2);
+//                        v_store(Lr_p + d + D2*3, L3);
+//
+//                        // Get minimum from in L0-L3
+//                        v_int16x8 t02L, t02H, t13L, t13H, t0123L, t0123H;
+//                        v_zip(L0, L2, t02L, t02H);            // L0[0] L2[0] L0[1] L2[1]...
+//                        v_zip(L1, L3, t13L, t13H);            // L1[0] L3[0] L1[1] L3[1]...
+//                        v_int16x8 t02 = v_min(t02L, t02H);    // L0[i] L2[i] L0[i] L2[i]...
+//                        v_int16x8 t13 = v_min(t13L, t13H);    // L1[i] L3[i] L1[i] L3[i]...
+//                        v_zip(t02, t13, t0123L, t0123H);      // L0[i] L1[i] L2[i] L3[i]...
+//                        v_int16x8 t0 = v_min(t0123L, t0123H);
+//                        _minL0 = v_min(_minL0, t0);
+//
+//                        v_int16x8 Sval = v_load(Sp + d);
+//
+//                        L0 = L0 + L1;
+//                        L2 = L2 + L3;
+//                        Sval = Sval + L0;
+//                        Sval = Sval + L2;
+//
+//                        v_store(Sp + d, Sval);
+//                    }
+//
+//                    v_int32x4 minL, minH;
+//                    v_expand(_minL0, minL, minH);
+//                    v_pack_store(&minLr[x], v_min(minL, minH));
+//                }
+//                else
+//            #endif
+                {
+                    int minL = MAX_COST;
+
+                    for( d = 0; d < D; d++ )
+                    {
+                        int Cpd = Cp[d], L;   //4e: Remember, that every Cp is increased on P2 in line number 369. That's why next 4 lines are paper-like actually
+
+                        L0 = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;
+
+                        Lr_p[d] = (CostType)L;
+                        minL0 = std::min(minL, L);
+
+                        Sp[d] = saturate_cast<CostType>(Sp[d] + L);
+                    }
+                    minLr[x] = (CostType)minL0;
+                }
+            }
+
+            for( x = width1-1; x != -1; x--)
+            {
+                int xd = x*D2;
+
+                int delta = minLr[x + 1];
+
+                CostType* Lr_ppr = Lr + xd + D2;
+
+                Lr_ppr[-1] = Lr_ppr[D] = MAX_COST;
+
+                CostType* Lr_p = Lr + xd;
+                const CostType* Cp = C + x*D;
+                CostType* Sp = S + x*D;
+                int minS = MAX_COST, bestDisp = -1;
+
+//            #if CV_SIMD128
+//                if( useSIMD )
+//                {
+//                    v_int16x8 _P1 = v_setall_s16((short)P1);
+//
+//                    v_int16x8 _delta0 = v_setall_s16((short)delta0);
+//                    v_int16x8 _delta1 = v_setall_s16((short)delta1);
+//                    v_int16x8 _delta2 = v_setall_s16((short)delta2);
+//                    v_int16x8 _delta3 = v_setall_s16((short)delta3);
+//                    v_int16x8 _minL0 = v_setall_s16((short)MAX_COST);
+//
+//                    for( d = 0; d < D; d += 8 )
+//                    {
+//                        v_int16x8 Cpd = v_load(Cp + d);
+//                        v_int16x8 L0, L1, L2, L3;
+//
+//                        L0 = v_load(Lr_ppr + d);
+//                        L1 = v_load(Lr_p1 + d);
+//                        L2 = v_load(Lr_p2 + d);
+//                        L3 = v_load(Lr_p3 + d);
+//
+//                        L0 = v_min(L0, (v_load(Lr_ppr + d - 1) + _P1));
+//                        L0 = v_min(L0, (v_load(Lr_ppr + d + 1) + _P1));
+//
+//                        L1 = v_min(L1, (v_load(Lr_p1 + d - 1) + _P1));
+//                        L1 = v_min(L1, (v_load(Lr_p1 + d + 1) + _P1));
+//
+//                        L2 = v_min(L2, (v_load(Lr_p2 + d - 1) + _P1));
+//                        L2 = v_min(L2, (v_load(Lr_p2 + d + 1) + _P1));
+//
+//                        L3 = v_min(L3, (v_load(Lr_p3 + d - 1) + _P1));
+//                        L3 = v_min(L3, (v_load(Lr_p3 + d + 1) + _P1));
+//
+//                        L0 = v_min(L0, _delta0);
+//                        L0 = ((L0 - _delta0) + Cpd);
+//
+//                        L1 = v_min(L1, _delta1);
+//                        L1 = ((L1 - _delta1) + Cpd);
+//
+//                        L2 = v_min(L2, _delta2);
+//                        L2 = ((L2 - _delta2) + Cpd);
+//
+//                        L3 = v_min(L3, _delta3);
+//                        L3 = ((L3 - _delta3) + Cpd);
+//
+//                        v_store(Lr_p + d, L0);
+//                        v_store(Lr_p + d + D2, L1);
+//                        v_store(Lr_p + d + D2*2, L2);
+//                        v_store(Lr_p + d + D2*3, L3);
+//
+//                        // Get minimum from in L0-L3
+//                        v_int16x8 t02L, t02H, t13L, t13H, t0123L, t0123H;
+//                        v_zip(L0, L2, t02L, t02H);            // L0[0] L2[0] L0[1] L2[1]...
+//                        v_zip(L1, L3, t13L, t13H);            // L1[0] L3[0] L1[1] L3[1]...
+//                        v_int16x8 t02 = v_min(t02L, t02H);    // L0[i] L2[i] L0[i] L2[i]...
+//                        v_int16x8 t13 = v_min(t13L, t13H);    // L1[i] L3[i] L1[i] L3[i]...
+//                        v_zip(t02, t13, t0123L, t0123H);      // L0[i] L1[i] L2[i] L3[i]...
+//                        v_int16x8 t0 = v_min(t0123L, t0123H);
+//                        _minL0 = v_min(_minL0, t0);
+//
+//                        v_int16x8 Sval = v_load(Sp + d);
+//
+//                        L0 = L0 + L1;
+//                        L2 = L2 + L3;
+//                        Sval = Sval + L0;
+//                        Sval = Sval + L2;
+//
+//                        v_store(Sp + d, Sval);
+//                    }
+//
+//                    v_int32x4 minL, minH;
+//                    v_expand(_minL0, minL, minH);
+//                    v_pack_store(&minLr[x], v_min(minL, minH));
+//                }
+//                else
+//            #endif
+//TODO:Next piece of code is came from postprocessing. Be very careful with joining them!!!
+//                #if CV_SIMD128
+//                    if( useSIMD )
+//                    {
+//                        v_int16x8 _minS = v_setall_s16(MAX_COST), _bestDisp = v_setall_s16(-1);
+//                        v_int16x8 _d8 = v_int16x8(0, 1, 2, 3, 4, 5, 6, 7), _8 = v_setall_s16(8);
+//
+//                        for( d = 0; d < D; d+= 8 )
+//                        {
+//                            v_int16x8 L0 = v_load(Sp + d);
+//                            v_int16x8 mask = L0 < _minS;
+//                            _minS = v_min( L0, _minS );
+//                            _bestDisp = _bestDisp ^ ((_bestDisp ^ _d8) & mask);
+//                            _d8 = _d8 + _8;
+//                        }
+//                        v_int32x4 _d0, _d1;
+//                        v_expand(_minS, _d0, _d1);
+//                        minS = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1));
+//                        v_int16x8 v_mask = v_setall_s16((short)minS) == _minS;
+//
+//                        _bestDisp = (_bestDisp & v_mask) | (v_setall_s16(SHRT_MAX) & ~v_mask);
+//                        v_expand(_bestDisp, _d0, _d1);
+//                        bestDisp = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1));
+//                    }
+//                    else
+//                #endif
+                {
+                    int minL = MAX_COST;
+
+                    for( d = 0; d < D; d++ )
+                    {
+                        int Cpd = Cp[d], L;   //4e: Remember, that every Cp is increased on P2 in line number 369. That's why next 4 lines are paper-like actually
+
+                        L0 = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;
+
+                        Lr_p[d] = (CostType)L;
+                        minL0 = std::min(minL, L);
+
+                        Sp[d] = saturate_cast<CostType>(Sp[d] + L);
+                        if( Sp[d] < minS )
+                        {
+                            minS = Sp[d];
+                            bestDisp = d;
+                        }
+                    }
+                    minLr[x] = (CostType)minL0;
+                }
+                //Some postprocessing procedures and saving
+                for( d = 0; d < D; d++ )
+                {
+                    if( Sp[d]*(100 - uniquenessRatio) < minS*100 && std::abs(bestDisp - d) > 1 )
+                        break;
+                }
+                if( d < D )
+                    continue;
+                d = bestDisp;
+                int _x2 = x + minX1 - d - minD;
+                if( disp2cost[_x2] > minS )
+                {
+                    disp2cost[_x2] = (CostType)minS;
+                    disp2ptr[_x2] = (DispType)(d + minD);
+                }
+
+                if( 0 < d && d < D-1 )
+                {
+                    // do subpixel quadratic interpolation:
+                    //   fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])
+                    //   then find minimum of the parabola.
+                    int denom2 = std::max(Sp[d-1] + Sp[d+1] - 2*Sp[d], 1);
+                    d = d*DISP_SCALE + ((Sp[d-1] - Sp[d+1])*DISP_SCALE + denom2)/(denom2*2);
+                }
+                else
+                    d *= DISP_SCALE;
+                disp1ptr[x + minX1] = (DispType)(d + minD*DISP_SCALE);
+            }
+            //Left-right check sanity procedure
+            for( x = minX1; x < maxX1; x++ )
+            {
+                // we round the computed disparity both towards -inf and +inf and check
+                // if either of the corresponding disparities in disp2 is consistent.
+                // This is to give the computed disparity a chance to look valid if it is.
+                int d1 = disp1ptr[x];
+                if( d1 == INVALID_DISP_SCALED )
+                    continue;
+                int _d = d1 >> DISP_SHIFT;
+                int d_ = (d1 + DISP_SCALE-1) >> DISP_SHIFT;
+                int _x = x - _d, x_ = x - d_;
+                if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff &&         //4e: To dismiss disparity, we should assure, that there is no any
+                   0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff )           //4e: chance to understand this as correct.
+                    disp1ptr[x] = (DispType)INVALID_DISP_SCALED;
+            }
+        }
+    }
+}
+
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 
+
 void getBufferPointers(Mat& buffer, int width, int width1, int D, int num_ch, int SH2, int P2,
                        CostType*& curCostVolumeLine, CostType*& hsumBuf, CostType*& pixDiff,
                        PixType*& tmpBuf, CostType*& horPassCostVolume,

From 106413a3ea1c29a5561465d843e1a5055c07eeaf Mon Sep 17 00:00:00 2001
From: Pyotr Chekmaryov <4ekmah@gmail.com>
Date: Mon, 27 Mar 2017 21:40:17 +0000
Subject: [PATCH 2/6] Simplest test added and code debuged.

---
 modules/calib3d/include/opencv2/calib3d.hpp  |  2 +-
 modules/calib3d/src/stereosgbm.cpp           | 36 +++++++++++---------
 modules/calib3d/test/test_stereomatching.cpp | 30 ++++++++++++++++
 3 files changed, 51 insertions(+), 17 deletions(-)

diff --git a/modules/calib3d/include/opencv2/calib3d.hpp b/modules/calib3d/include/opencv2/calib3d.hpp
index dd28d97e7f..46a470bc0f 100644
--- a/modules/calib3d/include/opencv2/calib3d.hpp
+++ b/modules/calib3d/include/opencv2/calib3d.hpp
@@ -1810,7 +1810,7 @@ public:
         MODE_SGBM = 0,
         MODE_HH   = 1,
         MODE_SGBM_3WAY = 2,
-        MODE_HH4  = 1
+        MODE_HH4  = 3
     };
 
     CV_WRAP virtual int getPreFilterCap() const = 0;
diff --git a/modules/calib3d/src/stereosgbm.cpp b/modules/calib3d/src/stereosgbm.cpp
index bedc477e8b..24b3c18ddc 100644
--- a/modules/calib3d/src/stereosgbm.cpp
+++ b/modules/calib3d/src/stereosgbm.cpp
@@ -513,6 +513,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
              6: r=(1, -dy*2)
              7: r=(2, -dy)
              */
+
             for( x = x1; x != x2; x += dx )
             {
                 int xm = x*NR2, xd = xm*D2;
@@ -853,6 +854,7 @@ TODO: Don't forget to rewrire this commentaries after
  disp2cost also has the same size as img1 (or img2).
  It contains the minimum current cost, used to find the best disparity, corresponding to the minimal cost.
  */
+#include<stdio.h>  //TODO: DUBUG!!!
 static void computeDisparitySGBMParallel( const Mat& img1, const Mat& img2,
                                  Mat& disp1, const StereoSGBMParams& params,
                                  Mat& buffer )
@@ -1005,7 +1007,7 @@ static void computeDisparitySGBMParallel( const Mat& img1, const Mat& img2,
                         if( y > 0 )                 //4e: We calculate horizontal sums and forming full block sums for y coord by adding this horsums to previous line's sums and subtracting stored lowest
                         {                           //4e: horsum in hsumBuf. Exception is case y=0, where we need many iterations per lines to create full blocking sum.
                             const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
-                            const CostType* Cprev = !fullDP || y == 0 ? C : C - costBufSize;    //4e: Well, actually y>0, so we don't need this check: y==0
+                            const CostType* Cprev = C - costBufSize;    //4e: Well, actually y>0, so we don't need this check: y==0
 
                             for( x = D; x < width1*D; x += D )
                             {
@@ -1089,11 +1091,11 @@ static void computeDisparitySGBMParallel( const Mat& img1, const Mat& img2,
              6: r=(1, -dy*2)
              7: r=(2, -dy)
              */
-            for( x = 0; x != width; x++ )
+            for( x = 0; x != width1; x++ )
             {
                 int xd = x*D2;
 
-                int delta = minLr[1][x];
+                int delta = minLr[1][x] + P2;
 
                 CostType* Lr_ppr = Lr[1] + xd;
 
@@ -1189,9 +1191,9 @@ static void computeDisparitySGBMParallel( const Mat& img1, const Mat& img2,
                         L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;
 
                         Lr_p[d] = (CostType)L;
-                        minL0 = std::min(minL, L);
+                        minL = std::min(minL, L);
 
-                        Sp[d] = saturate_cast<CostType>(Sp[d] + L0);
+                        Sp[d] = saturate_cast<CostType>(Sp[d] + L);
                     }
                     minLr[0][x] = (CostType)minL;
                 }
@@ -1224,8 +1226,8 @@ static void computeDisparitySGBMParallel( const Mat& img1, const Mat& img2,
         {
             int x, d;
             DispType* disp1ptr = disp1.ptr<DispType>(y);
-            CostType* C = Cbuf + costBufSize;
-            CostType* S = Sbuf + costBufSize;
+            CostType* C = Cbuf + y*costBufSize;
+            CostType* S = Sbuf + y*costBufSize;
 
             for( x = 0; x < width; x++ )
             {
@@ -1233,7 +1235,7 @@ static void computeDisparitySGBMParallel( const Mat& img1, const Mat& img2,
                 disp2cost[x] = MAX_COST;
             }
 
-            // clear the left and the right borders                             //TODO: Well, two of that memsets we could delete, but rest of them is direction-dependent
+            // clear the left and the right borders
             memset( Lr - D2*LrBorder - 8, 0, D2*LrBorder*sizeof(CostType) );     //4e: To understand this "8" shifts and how they could work it's simpler to imagine pixel dislocation in memory
             memset( Lr + width1*D2 - 8, 0, D2*LrBorder*sizeof(CostType) );       //4e: ...00000000|D2-16 of real costs value(and some of them are zeroes too)|00000000...
             memset( minLr - LrBorder, 0, LrBorder*sizeof(CostType) );
@@ -1261,7 +1263,7 @@ static void computeDisparitySGBMParallel( const Mat& img1, const Mat& img2,
             {
                 int xd = x*D2;
 
-                int delta = minLr[x - 1];
+                int delta = minLr[x - 1] + P2;
 
                 CostType* Lr_ppr = Lr + xd - D2;
 
@@ -1354,14 +1356,14 @@ static void computeDisparitySGBMParallel( const Mat& img1, const Mat& img2,
                     {
                         int Cpd = Cp[d], L;   //4e: Remember, that every Cp is increased on P2 in line number 369. That's why next 4 lines are paper-like actually
 
-                        L0 = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;
+                        L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;
 
                         Lr_p[d] = (CostType)L;
-                        minL0 = std::min(minL, L);
+                        minL = std::min(minL, L);
 
                         Sp[d] = saturate_cast<CostType>(Sp[d] + L);
                     }
-                    minLr[x] = (CostType)minL0;
+                    minLr[x] = (CostType)minL;
                 }
             }
 
@@ -1369,7 +1371,7 @@ static void computeDisparitySGBMParallel( const Mat& img1, const Mat& img2,
             {
                 int xd = x*D2;
 
-                int delta = minLr[x + 1];
+                int delta = minLr[x + 1] + P2;
 
                 CostType* Lr_ppr = Lr + xd + D2;
 
@@ -1489,10 +1491,10 @@ static void computeDisparitySGBMParallel( const Mat& img1, const Mat& img2,
                     {
                         int Cpd = Cp[d], L;   //4e: Remember, that every Cp is increased on P2 in line number 369. That's why next 4 lines are paper-like actually
 
-                        L0 = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;
+                        L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;
 
                         Lr_p[d] = (CostType)L;
-                        minL0 = std::min(minL, L);
+                        minL = std::min(minL, L);
 
                         Sp[d] = saturate_cast<CostType>(Sp[d] + L);
                         if( Sp[d] < minS )
@@ -1501,7 +1503,7 @@ static void computeDisparitySGBMParallel( const Mat& img1, const Mat& img2,
                             bestDisp = d;
                         }
                     }
-                    minLr[x] = (CostType)minL0;
+                    minLr[x] = (CostType)minL;
                 }
                 //Some postprocessing procedures and saving
                 for( d = 0; d < D; d++ )
@@ -2206,6 +2208,8 @@ public:
 
         if(params.mode==MODE_SGBM_3WAY)
             computeDisparity3WaySGBM( left, right, disp, params, buffers, num_stripes );
+        else if(params.mode==MODE_HH4)
+            computeDisparitySGBMParallel( left, right, disp, params, buffer );
         else
             computeDisparitySGBM( left, right, disp, params, buffer );
 
diff --git a/modules/calib3d/test/test_stereomatching.cpp b/modules/calib3d/test/test_stereomatching.cpp
index 0aee42acee..c51c419334 100644
--- a/modules/calib3d/test/test_stereomatching.cpp
+++ b/modules/calib3d/test/test_stereomatching.cpp
@@ -784,3 +784,33 @@ protected:
 
 TEST(Calib3d_StereoBM, regression) { CV_StereoBMTest test; test.safe_run(); }
 TEST(Calib3d_StereoSGBM, regression) { CV_StereoSGBMTest test; test.safe_run(); }
+
+TEST(Calib3d_StereoSGBMPar, idontknowhowtotesthere)
+{
+//	<!-- caseName, datasetName, numDisp, winSize, mode -->
+//	case_teddy_2 teddy "48" "3" "MODE_HH"
+
+//Ptr<StereoSGBM> StereoSGBM::create(int minDisparity, int numDisparities, int SADWindowSize,
+//                                 int P1, int P2, int disp12MaxDiff,
+//                                 int preFilterCap, int uniquenessRatio,
+//                                 int speckleWindowSize, int speckleRange,
+//                                 int mode)
+    Mat leftImg = imread("/home/q/Work/GitVault/opencv_extra/testdata/cv/stereomatching/datasets/teddy/im2.png");
+    Mat rightImg = imread("/home/q/Work/GitVault/opencv_extra/testdata/cv/stereomatching/datasets/teddy/im6.png");
+    {
+        Mat leftDisp;
+        Ptr<StereoSGBM> sgbm = StereoSGBM::create( 0, 48, 3, 90, 360, 1, 63, 10, 100, 32, StereoSGBM::MODE_HH);
+        sgbm->compute( leftImg, rightImg, leftDisp );
+        CV_Assert( leftDisp.type() == CV_16SC1 );
+        leftDisp/=8;
+        imwrite( "/home/q/Work/GitVault/modehh.jpg", leftDisp);
+    }
+    {
+        Mat leftDisp;
+        Ptr<StereoSGBM> sgbm = StereoSGBM::create( 0, 48, 3, 90, 360, 1, 63, 10, 100, 32, StereoSGBM::MODE_HH4);
+        sgbm->compute( leftImg, rightImg, leftDisp );
+        CV_Assert( leftDisp.type() == CV_16SC1 );
+        leftDisp/=8;
+        imwrite( "/home/q/Work/GitVault/modehh4.jpg", leftDisp);
+    }
+}

From 20036b82d39bac0a7cd45d6bfc2af3be41d930a2 Mon Sep 17 00:00:00 2001
From: Pyotr Chekmaryov <4ekmah@gmail.com>
Date: Sat, 15 Apr 2017 21:51:15 +0000
Subject: [PATCH 3/6] There added parallel realization of vertical passes for
 MODE_HH4.

---
 modules/calib3d/include/opencv2/calib3d.hpp  |    3 +-
 modules/calib3d/src/stereosgbm.cpp           | 1084 +++++++++++++++---
 modules/calib3d/test/test_stereomatching.cpp |   25 +-
 3 files changed, 964 insertions(+), 148 deletions(-)

diff --git a/modules/calib3d/include/opencv2/calib3d.hpp b/modules/calib3d/include/opencv2/calib3d.hpp
index 46a470bc0f..f2915850f9 100644
--- a/modules/calib3d/include/opencv2/calib3d.hpp
+++ b/modules/calib3d/include/opencv2/calib3d.hpp
@@ -1810,7 +1810,8 @@ public:
         MODE_SGBM = 0,
         MODE_HH   = 1,
         MODE_SGBM_3WAY = 2,
-        MODE_HH4  = 3
+        MODE_HH4  = 3,
+        MODE_HH4_OLD  = 4
     };
 
     CV_WRAP virtual int getPreFilterCap() const = 0;
diff --git a/modules/calib3d/src/stereosgbm.cpp b/modules/calib3d/src/stereosgbm.cpp
index 24b3c18ddc..8a6bfebc67 100644
--- a/modules/calib3d/src/stereosgbm.cpp
+++ b/modules/calib3d/src/stereosgbm.cpp
@@ -110,6 +110,7 @@ struct StereoSGBMParams
     int mode;
 };
 
+static const int DEFAULT_RIGHT_BORDER = -1;
 /*
  For each pixel row1[x], max(maxD, 0) <= minX <= x < maxX <= width - max(0, -minD),
  and for each disparity minD<=d<maxD the function
@@ -123,12 +124,21 @@ struct StereoSGBMParams
 static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
                             int minD, int maxD, CostType* cost,
                             PixType* buffer, const PixType* tab,
-                            int tabOfs, int )
+                            int tabOfs, int , int xrange_min = 0, int xrange_max = DEFAULT_RIGHT_BORDER )
+    //TODO: This function was changed and modified old function's behabior. Check they in tests
 {
     int x, c, width = img1.cols, cn = img1.channels();
     int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
+    int D = maxD - minD, width1 = maxX1 - minX1;
+    //This minX1 & maxX2 correction is defining which part of calculatable line must be calculated
+    //That is needs of parallel algorithm
+    xrange_min = (xrange_min < 0) ? 0: xrange_min;
+    xrange_max = (xrange_max == DEFAULT_RIGHT_BORDER) || (xrange_max > width1) ? width1 : xrange_max;
+    maxX1 = minX1 + xrange_max;
+    minX1 += xrange_min;
+    width1 = maxX1 - minX1;
     int minX2 = std::max(minX1 - maxD, 0), maxX2 = std::min(maxX1 - minD, width);
-    int D = maxD - minD, width1 = maxX1 - minX1, width2 = maxX2 - minX2;
+    int width2 = maxX2 - minX2;
     const PixType *row1 = img1.ptr<PixType>(y), *row2 = img2.ptr<PixType>(y);
     PixType *prow1 = buffer + width2*2, *prow2 = prow1 + width*cn*2;
 #if CV_SIMD128
@@ -179,10 +189,10 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
         }
     }
 
-    memset( cost, 0, width1*D*sizeof(cost[0]) );
+    memset( cost + xrange_min*D, 0, width1*D*sizeof(cost[0]) );
 
-    buffer -= minX2;
-    cost -= minX1*D + minD; // simplify the cost indices inside the loop
+    buffer -= width-1-maxX2;
+    cost -= (minX1-xrange_min)*D + minD; // simplify the cost indices inside the loop
 
     for( c = 0; c < cn*2; c++, prow1 += width, prow2 += width )
     {
@@ -191,7 +201,7 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
         // precompute
         //   v0 = min(row2[x-1/2], row2[x], row2[x+1/2]) and
         //   v1 = max(row2[x-1/2], row2[x], row2[x+1/2]) and
-        for( x = minX2; x < maxX2; x++ )
+        for( x = width-1-maxX2; x < width-1- minX2; x++ )
         {
             int v = prow2[x];
             int vl = x > 0 ? (v + prow2[x-1])/2 : v;
@@ -830,6 +840,335 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////
+//TODO: Assumation: Let's pretend, that we allocate memory for pixDiff and tempBuf independently in each thread, with full size, needed for original calcBT
+//TODO: Redo size of this arrays even if situation with independent allocation will still.
+struct CalcVerticalSums: public ParallelLoopBody
+{
+    CalcVerticalSums(const Mat& _img1, const Mat& _img2, const StereoSGBMParams& params,
+                     CostType* alignedBuf, PixType* _clipTab): img1(_img1), img2(_img2), clipTab(_clipTab)
+    {
+        minD = params.minDisparity;
+        maxD = minD + params.numDisparities;
+        SW2 = SH2 = (params.SADWindowSize > 0 ? params.SADWindowSize : 5)/2;
+        ftzero = std::max(params.preFilterCap, 15) | 1;
+        P1 = params.P1 > 0 ? params.P1 : 2;
+        P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);   //TODO: think about P1/S(x,y) Proportion
+        height = img1.rows;
+        width = img1.cols;
+        int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
+        D = maxD - minD;
+        width1 = maxX1 - minX1;
+        D2 = D + 16;
+        costBufSize = width1*D;
+        CSBufSize = costBufSize*height;
+        minLrSize = (width1 + LrBorder*2);
+        LrSize = minLrSize*D2;
+        hsumBufNRows = SH2*2 + 2;
+        Cbuf = alignedBuf;
+        Sbuf = Cbuf + CSBufSize;
+        hsumBuf = Sbuf + CSBufSize;
+    }
+
+    void operator()( const Range& range ) const
+    {
+        static const CostType MAX_COST = SHRT_MAX;
+        static const int ALIGN = 16;
+        static const int TAB_OFS = 256*4;
+        int x1 = range.start, x2 = range.end, k;
+        size_t pixDiffSize = ((x2 - x1) + 2*SW2)*D;
+        size_t auxBufsSize = pixDiffSize*sizeof(CostType) +                 //pixdiff size
+                             width*16*img1.channels()*sizeof(PixType) + 32; //tempBuf //TODO: Probably it's better 6 instead of 16(alignment?)
+        Mat auxBuff;
+        auxBuff.create(1, (int)auxBufsSize, CV_8U);
+        CostType* pixDiff = (CostType*)alignPtr(auxBuff.ptr(), ALIGN);
+        PixType* tempBuf = (PixType*)(pixDiff + pixDiffSize);
+
+        // Simplification of index calculation
+        pixDiff -= (x1>SW2 ? (x1 - SW2): 0)*D;
+
+        for( int pass = 1; pass <= 2; pass++ ) //TODO: rename this magic 2.
+        {
+            int y1, y2, dy;
+
+            if( pass == 1 )
+            {
+                y1 = 0; y2 = height; dy = 1;
+            }
+            else
+            {
+                y1 = height-1; y2 = -1; dy = -1;
+            }
+
+            CostType *Lr[NLR]={0}, *minLr[NLR]={0};             //4e: arrays for L(x,y,r,d) of previous and current rows and minimums of them
+
+            for( k = 0; k < NLR; k++ )                          //4e: One of them is needed, and one of them is stored. So, we need to swap pointer
+            {                                                   //4e: Yes, and this is done at the end of next cycle, not here.
+                // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
+                // and will occasionally use negative indices with the arrays
+                // we need to shift Lr[k] pointers by 1, to give the space for d=-1.
+                // however, then the alignment will be imperfect, i.e. bad for SSE,
+                // thus we shift the pointers by 8 (8*sizeof(short) == 16 - ideal alignment)
+                Lr[k] = hsumBuf + costBufSize*hsumBufNRows + LrSize*k + D2*LrBorder + 8;
+                memset( Lr[k] + x1*D2 - 8, 0, (x2-x1)*D2*sizeof(CostType) );
+                minLr[k] = hsumBuf + costBufSize*hsumBufNRows + LrSize*NLR + minLrSize*k + LrBorder;
+                memset( minLr[k] + x1, 0, (x2-x1)*sizeof(CostType) );
+            }
+
+            for( int y = y1; y != y2; y += dy )
+            {
+                int x, d;
+                CostType* C = Cbuf + y*costBufSize;
+                CostType* S = Sbuf + y*costBufSize;
+
+                if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
+                {
+                    int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1;   //4e: for first line's block sum we need calculate half-window of costs and only one for other
+
+                    for( k = dy1; k <= dy2; k++ )
+                    {
+                        CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;   //4e: Ring buffer for horizontally summed lines
+
+                        if( k < height )
+                        {
+                            calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero, x1 - SW2, x2 + SW2);
+
+                            memset(hsumAdd + x1*D, 0, D*sizeof(CostType));
+                            for( x = (x1 - SW2)*D; x <= (x1 + SW2)*D; x += D )                //4e: Calculation summed costs for all disparities in first pixel of line
+                            {
+                                int xbord = x <= 0 ? 0 : (x > (width1 - 1)*D? (width1 - 1)*D : x);
+                                for( d = 0; d < D; d++ )
+                                    hsumAdd[x1*D + d] = (CostType)(hsumAdd[x1*D + d] + pixDiff[xbord + d]);
+                            }
+
+                            if( y > 0 )             //4e: We calculate horizontal sums and forming full block sums for y coord by adding this horsums to previous line's sums and subtracting stored lowest
+                            {                       //4e: horsum in hsumBuf. Exception is case y=0, where we need many iterations per lines to create full blocking sum.
+                                const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
+                                const CostType* Cprev = C - costBufSize;
+
+                                // We need to calculate C[x1] in different way, because hsumadd is already calculated
+                                // We don't doing then for x==0, because original function has forgotten to do this  //TODO: Check: does this original still exist?
+                                if(x1!=0)
+                                {
+                                    for( d = 0; d < D; d++ )
+                                        C[x1*D + d] = (CostType)(Cprev[x1*D + d] + hsumAdd[x1*D + d] - hsumSub[x1*D + d]);
+                                }
+
+                                for( x = (x1+1)*D; x < x2*D; x += D )
+                                {
+                                    const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
+                                    const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
+
+//                                #if CV_SIMD128
+//                                    if( useSIMD )
+//                                    {
+//                                        for( d = 0; d < D; d += 8 )
+//                                        {
+//                                            v_int16x8 hv = v_load(hsumAdd + x - D + d);
+//                                            v_int16x8 Cx = v_load(Cprev + x + d);
+//                                            v_int16x8 psub = v_load(pixSub + d);
+//                                            v_int16x8 padd = v_load(pixAdd + d);
+//                                            hv = (hv - psub + padd);
+//                                            psub = v_load(hsumSub + x + d);
+//                                            Cx = Cx - psub + hv;
+//                                            v_store(hsumAdd + x + d, hv);
+//                                            v_store(C + x + d, Cx);
+//                                        }
+//                                    }
+//                                    else
+//                                #endif
+                                    {
+                                        for( d = 0; d < D; d++ )
+                                        {
+                                            int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
+                                            C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]);
+                                        }
+                                    }
+                                }
+                            }
+                            else
+                            {
+                                for( x = (x1+1)*D; x < x2*D; x += D )  //4e: Calcluates horizontal sums if (y==0). This piece of code is calling SH2+1 times and then result is used in different way
+                                {                                   //4e: to create full blocks sum. That's why this code is isolated from upper case.
+                                    const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
+                                    const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
+
+                                    for( d = 0; d < D; d++ )
+                                        hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
+                                }
+                            }
+                            // Return to coordinates, which is needed by CalcCostBT
+                        }
+
+                        if( y == 0 )        //4e: Calculating first full block sum.
+                        {
+                            int scale = k == 0 ? SH2 + 1 : 1;
+                            for( x = x1*D; x < x2*D; x++ )
+                                C[x] = (CostType)(C[x] + hsumAdd[x]*scale);
+                        }
+                    }
+
+                    // also, clear the S buffer
+                    for( k = x1*D; k < x2*D; k++ )                                         //4e: only on first pass, so it keep old information, don't be confused
+                        S[k] = 0;
+                }
+
+//                 [formula 13 in the paper]
+//                 compute L_r(p, d) = C(p, d) +
+//                 min(L_r(p-r, d),
+//                 L_r(p-r, d-1) + P1,
+//                 L_r(p-r, d+1) + P1,
+//                 min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
+//                 where p = (x,y), r is one of the directions.
+//                 we process all the directions at once:
+//                 0: r=(-dx, 0)
+//                 1: r=(-1, -dy)
+//                 2: r=(0, -dy)
+//                 3: r=(1, -dy)
+//                 4: r=(-2, -dy)
+//                 5: r=(-1, -dy*2)
+//                 6: r=(1, -dy*2)
+//                 7: r=(2, -dy)
+
+                for( x = x1; x != x2; x++ )
+                {
+                    int xd = x*D2;
+
+                    int delta = minLr[1][x] + P2;
+
+                    CostType* Lr_ppr = Lr[1] + xd;
+
+                    Lr_ppr[-1] = Lr_ppr[D] = MAX_COST;
+
+                    CostType* Lr_p = Lr[0] + xd;
+                    const CostType* Cp = C + x*D;
+                    CostType* Sp = S + x*D;
+
+//                #if CV_SIMD128
+//                    if( useSIMD )
+//                    {
+//                        v_int16x8 _P1 = v_setall_s16((short)P1);
+//
+//                        v_int16x8 _delta0 = v_setall_s16((short)delta0);
+//                        v_int16x8 _delta1 = v_setall_s16((short)delta1);
+//                        v_int16x8 _delta2 = v_setall_s16((short)delta2);
+//                        v_int16x8 _delta3 = v_setall_s16((short)delta3);
+//                        v_int16x8 _minL0 = v_setall_s16((short)MAX_COST);
+//
+//                        for( d = 0; d < D; d += 8 )
+//                        {
+//                            v_int16x8 Cpd = v_load(Cp + d);
+//                            v_int16x8 L0, L1, L2, L3;
+//
+//                            L0 = v_load(Lr_p0 + d);
+//                            L1 = v_load(Lr_p1 + d);
+//                            L2 = v_load(Lr_ppr + d);
+//                            L3 = v_load(Lr_p3 + d);
+//
+//                            L0 = v_min(L0, (v_load(Lr_p0 + d - 1) + _P1));
+//                            L0 = v_min(L0, (v_load(Lr_p0 + d + 1) + _P1));
+//
+//                            L1 = v_min(L1, (v_load(Lr_p1 + d - 1) + _P1));
+//                            L1 = v_min(L1, (v_load(Lr_p1 + d + 1) + _P1));
+//
+//                            L2 = v_min(L2, (v_load(Lr_ppr + d - 1) + _P1));
+//                            L2 = v_min(L2, (v_load(Lr_ppr + d + 1) + _P1));
+//
+//                            L3 = v_min(L3, (v_load(Lr_p3 + d - 1) + _P1));
+//                            L3 = v_min(L3, (v_load(Lr_p3 + d + 1) + _P1));
+//
+//                            L0 = v_min(L0, _delta0);
+//                            L0 = ((L0 - _delta0) + Cpd);
+//
+//                            L1 = v_min(L1, _delta1);
+//                            L1 = ((L1 - _delta1) + Cpd);
+//
+//                            L2 = v_min(L2, _delta2);
+//                            L2 = ((L2 - _delta2) + Cpd);
+//
+//                            L3 = v_min(L3, _delta3);
+//                            L3 = ((L3 - _delta3) + Cpd);
+//
+//                            v_store(Lr_p + d, L0);
+//                            v_store(Lr_p + d + D2, L1);
+//                            v_store(Lr_p + d + D2*2, L2);
+//                            v_store(Lr_p + d + D2*3, L3);
+//
+//                            // Get minimum from in L0-L3
+//                            v_int16x8 t02L, t02H, t13L, t13H, t0123L, t0123H;
+//                            v_zip(L0, L2, t02L, t02H);            // L0[0] L2[0] L0[1] L2[1]...
+//                            v_zip(L1, L3, t13L, t13H);            // L1[0] L3[0] L1[1] L3[1]...
+//                            v_int16x8 t02 = v_min(t02L, t02H);    // L0[i] L2[i] L0[i] L2[i]...
+//                            v_int16x8 t13 = v_min(t13L, t13H);    // L1[i] L3[i] L1[i] L3[i]...
+//                            v_zip(t02, t13, t0123L, t0123H);      // L0[i] L1[i] L2[i] L3[i]...
+//                            v_int16x8 t0 = v_min(t0123L, t0123H);
+//                            _minL0 = v_min(_minL0, t0);
+//
+//                            v_int16x8 Sval = v_load(Sp + d);
+//
+//                            L0 = L0 + L1;
+//                            L2 = L2 + L3;
+//                            Sval = Sval + L0;
+//                            Sval = Sval + L2;
+//
+//                            v_store(Sp + d, Sval);
+//                        }
+//
+//                        v_int32x4 minL, minH;
+//                        v_expand(_minL0, minL, minH);
+//                        v_pack_store(&minLr[0][x], v_min(minL, minH));
+//                    }
+//                    else
+//                    #endif
+                    {
+                        int minL = MAX_COST;
+
+                        for( d = 0; d < D; d++ )
+                        {
+                            int Cpd = Cp[d], L;   //4e: Remember, that every Cp is increased on P2 in line number 369. That's why next 4 lines are paper-like actually
+
+                            L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;
+
+                            Lr_p[d] = (CostType)L;
+                            minL = std::min(minL, L);
+
+                            Sp[d] = saturate_cast<CostType>(Sp[d] + L);
+                        }
+                        minLr[0][x] = (CostType)minL;
+                    }
+                }
+
+                // now shift the cyclic buffers
+                std::swap( Lr[0], Lr[1] );
+                std::swap( minLr[0], minLr[1] );
+            }
+        }
+    }
+    static const int NLR = 2;
+    static const int LrBorder = NLR - 1;
+    const Mat& img1;
+    const Mat& img2;
+    CostType* Cbuf;
+    CostType* Sbuf;
+    CostType* hsumBuf;
+    PixType* clipTab;
+    int minD;
+    int maxD;
+    int D;
+    int D2;
+    int SH2;
+    int SW2;
+    int width;
+    int width1;
+    int height;
+    int P1;
+    int P2;
+    size_t costBufSize;
+    size_t CSBufSize;
+    size_t minLrSize;
+    size_t LrSize;
+    size_t hsumBufNRows;
+    int ftzero;
+};
 
 /*
  This is new experimential version of disparity calculation, which should be parralled after
@@ -854,7 +1193,6 @@ TODO: Don't forget to rewrire this commentaries after
  disp2cost also has the same size as img1 (or img2).
  It contains the minimum current cost, used to find the best disparity, corresponding to the minimal cost.
  */
-#include<stdio.h>  //TODO: DUBUG!!!
 static void computeDisparitySGBMParallel( const Mat& img1, const Mat& img2,
                                  Mat& disp1, const StereoSGBMParams& params,
                                  Mat& buffer )
@@ -893,7 +1231,7 @@ static void computeDisparitySGBMParallel( const Mat& img1, const Mat& img2,
     int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
     int D = maxD - minD, width1 = maxX1 - minX1;
     int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
-    int SW2 = SADWindowSize.width/2, SH2 = SADWindowSize.height/2;
+    int SH2 = SADWindowSize.height/2;
     const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;                  //4e: array is such big due to derivative could be +-8*256 in worst cases
     PixType clipTab[TAB_SIZE];
 
@@ -922,14 +1260,13 @@ static void computeDisparitySGBMParallel( const Mat& img1, const Mat& img2,
     // we keep pixel difference cost (C) and the summary cost over NR directions (S).
     // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
     size_t costBufSize = width1*D;
-    size_t CSBufSize = costBufSize*height;                                            //4e: For HH mode it's better to keep whole array of costs.
-    size_t minLrSize = (width1 + LrBorder*2), LrSize = minLrSize*D2;                  //4e: TODO: Understand why NR2 per pass instead od NR2/2 (Probably, without any reason. That doesn't make code wrong)
+    size_t CSBufSize = costBufSize*height;
+    size_t minLrSize = (width1 + LrBorder*2), LrSize = minLrSize*D2;        //TODO: We don't need LrBorder for vertical passes and we don't need Lr buffer for horizontal passes.
     int hsumBufNRows = SH2*2 + 2;
     size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[]
-    costBufSize*(hsumBufNRows + 1)*sizeof(CostType) + // hsumBuf, pixdiff           //4e: TODO: Why we should increase sum window height one more time?
-    CSBufSize*2*sizeof(CostType) + // C, S                                          //4e: C is Block sum of costs, S is multidirectional dynamic sum with same size
-    width*16*img1.channels()*sizeof(PixType) + // temp buffer for computing per-pixel cost  //4e: It is needed for calcPixelCostBT function, as "buffer" value
-    width*(sizeof(CostType) + sizeof(DispType)) + 1024; // disp2cost + disp2
+    costBufSize*hsumBufNRows*sizeof(CostType) +                       // hsumBuf                    //4e: TODO: Why we should increase sum window height one more time?
+    CSBufSize*2*sizeof(CostType) +                                    // C, S                       //4e: C is Block sum of costs, S is multidirectional dynamic sum with same size
+    width*(sizeof(CostType) + sizeof(DispType)) + 1024;               // disp2cost + disp2
 
     if( buffer.empty() || !buffer.isContinuous() ||
         buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
@@ -939,139 +1276,51 @@ static void computeDisparitySGBMParallel( const Mat& img1, const Mat& img2,
     CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), ALIGN);
     CostType* Sbuf = Cbuf + CSBufSize;
     CostType* hsumBuf = Sbuf + CSBufSize;
-    CostType* pixDiff = hsumBuf + costBufSize*hsumBufNRows;
 
-    CostType* disp2cost = pixDiff + costBufSize + (LrSize + minLrSize)*NLR;     //4e: It is containers for backwards disparity, made by S[d] too, but with other method
+    CostType* disp2cost = hsumBuf + costBufSize*hsumBufNRows + (LrSize + minLrSize)*NLR;     //4e: It is containers for backwards disparity, made by S[d] too, but with other method
     DispType* disp2ptr = (DispType*)(disp2cost + width);
-    PixType* tempBuf = (PixType*)(disp2ptr + width);
 
     // add P2 to every C(x,y). it saves a few operations in the inner loops
     for(k = 0; k < (int)CSBufSize; k++ )
         Cbuf[k] = (CostType)P2;
 
-    // Two vertical passes - up to down and down to up
-    for( int pass = 1; pass <= 2; pass++ ) //TODO: rename this magic 2.
-    {
-        int y1, y2, dy;
+    parallel_for_(Range(0,width1),CalcVerticalSums(img1, img2, params, Cbuf, clipTab));
 
-        if( pass == 1 )
-        {
-            y1 = 0; y2 = height; dy = 1;
-        }
-        else
-        {
-            y1 = height-1; y2 = -1; dy = -1;
-        }
+//    for( int pass = 1; pass <= 2; pass++ ) //pass=1 or left-to-right pass
+    {
 
-        CostType *Lr[NLR]={0}, *minLr[NLR]={0};             //4e: arrays for L(x,y,r,d) of previous and current rows and minimums of them
+        CostType *Lr, *minLr;
 
-        for( k = 0; k < NLR; k++ )                          //4e: One of them is needed, and one of them is stored. So, we need to swap pointer
         {                                                   //4e: Yes, and this is done at the end of next cycle, not here.
             // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
             // and will occasionally use negative indices with the arrays
             // we need to shift Lr[k] pointers by 1, to give the space for d=-1.
             // however, then the alignment will be imperfect, i.e. bad for SSE,
             // thus we shift the pointers by 8 (8*sizeof(short) == 16 - ideal alignment)
-            Lr[k] = pixDiff + costBufSize + LrSize*k + D2*LrBorder + 8;
-            memset( Lr[k] - LrBorder*D2 - 8, 0, LrSize*sizeof(CostType) );
-            minLr[k] = pixDiff + costBufSize + LrSize*NLR + minLrSize*k + LrBorder;
-            memset( minLr[k] - LrBorder, 0, minLrSize*sizeof(CostType) );
+            Lr = hsumBuf + costBufSize*hsumBufNRows + D2*LrBorder + 8;
+            memset( Lr - LrBorder*D2 - 8, 0, LrSize*sizeof(CostType) );
+            minLr = hsumBuf + costBufSize*hsumBufNRows + LrSize*NLR + LrBorder;
+            memset( minLr - LrBorder, 0, minLrSize*sizeof(CostType) );
         }
 
-        for( int y = y1; y != y2; y += dy )
+        for( int y = 0; y != height; y++)
         {
             int x, d;
+            DispType* disp1ptr = disp1.ptr<DispType>(y);
             CostType* C = Cbuf + y*costBufSize;
             CostType* S = Sbuf + y*costBufSize;
 
-            if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
+            for( x = 0; x < width; x++ )
             {
-                int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1;   //4e: for first line's block sum we need calculate half-window of costs and only one for other
-
-                for( k = dy1; k <= dy2; k++ )
-                {
-                    CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;   //4e: Ring buffer for horizontally summed lines
-
-                    if( k < height )
-                    {
-                        calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero );
-
-                        memset(hsumAdd, 0, D*sizeof(CostType));
-                        for( x = 0; x <= SW2*D; x += D )                //4e: Calculation summed costs for all disparities in first pixel of line
-                        {
-                            int scale = x == 0 ? SW2 + 1 : 1;
-                            for( d = 0; d < D; d++ )
-                                hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]*scale);
-                        }
-
-                        if( y > 0 )                 //4e: We calculate horizontal sums and forming full block sums for y coord by adding this horsums to previous line's sums and subtracting stored lowest
-                        {                           //4e: horsum in hsumBuf. Exception is case y=0, where we need many iterations per lines to create full blocking sum.
-                            const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
-                            const CostType* Cprev = C - costBufSize;    //4e: Well, actually y>0, so we don't need this check: y==0
-
-                            for( x = D; x < width1*D; x += D )
-                            {
-                                const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
-                                const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
-
-//                            #if CV_SIMD128
-//                                if( useSIMD )
-//                                {
-//                                    for( d = 0; d < D; d += 8 )
-//                                    {
-//                                        v_int16x8 hv = v_load(hsumAdd + x - D + d);
-//                                        v_int16x8 Cx = v_load(Cprev + x + d);
-//                                        v_int16x8 psub = v_load(pixSub + d);
-//                                        v_int16x8 padd = v_load(pixAdd + d);
-//                                        hv = (hv - psub + padd);
-//                                        psub = v_load(hsumSub + x + d);
-//                                        Cx = Cx - psub + hv;
-//                                        v_store(hsumAdd + x + d, hv);
-//                                        v_store(C + x + d, Cx);
-//                                    }
-//                                }
-//                                else
-//                            #endif
-                                {
-                                    for( d = 0; d < D; d++ )
-                                    {
-                                        int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
-                                        C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]);
-                                    }
-                                }
-                            }
-                        }
-                        else
-                        {
-                            for( x = D; x < width1*D; x += D )  //4e: Calcluates horizontal sums if (y==0). This piece of code is calling SH2+1 times and then result is used in different way
-                            {                                   //4e: to create full blocks sum. That's why this code is isolated from upper case.
-                                const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
-                                const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
-
-                                for( d = 0; d < D; d++ )
-                                    hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
-                            }
-                        }
-                    }
-
-                    if( y == 0 )        //4e: Calculating first full block sum.
-                    {
-                        int scale = k == 0 ? SH2 + 1 : 1;
-                        for( x = 0; x < width1*D; x++ )
-                            C[x] = (CostType)(C[x] + hsumAdd[x]*scale);
-                    }
-                }
-
-                // also, clear the S buffer
-                for( k = 0; k < width1*D; k++ )                                         //4e: only on first pass, so it keep old information, don't be confused
-                    S[k] = 0;
+                disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED;
+                disp2cost[x] = MAX_COST;
             }
 
             // clear the left and the right borders
-            memset( Lr[0] - D2*LrBorder - 8, 0, D2*LrBorder*sizeof(CostType) );     //4e: To understand this "8" shifts and how they could work it's simpler to imagine pixel dislocation in memory
-            memset( Lr[0] + width1*D2 - 8, 0, D2*LrBorder*sizeof(CostType) );       //4e: ...00000000|D2-16 of real costs value(and some of them are zeroes too)|00000000...
-            memset( minLr[0] - LrBorder, 0, LrBorder*sizeof(CostType) );
-            memset( minLr[0] + width1, 0, LrBorder*sizeof(CostType) );
+            memset( Lr - D2*LrBorder - 8, 0, D2*LrBorder*sizeof(CostType) );     //4e: To understand this "8" shifts and how they could work it's simpler to imagine pixel dislocation in memory
+            memset( Lr + width1*D2 - 8, 0, D2*LrBorder*sizeof(CostType) );       //4e: ...00000000|D2-16 of real costs value(and some of them are zeroes too)|00000000...
+            memset( minLr - LrBorder, 0, LrBorder*sizeof(CostType) );
+            memset( minLr + width1, 0, LrBorder*sizeof(CostType) );
 
             /*
              [formula 13 in the paper]
@@ -1091,17 +1340,17 @@ static void computeDisparitySGBMParallel( const Mat& img1, const Mat& img2,
              6: r=(1, -dy*2)
              7: r=(2, -dy)
              */
-            for( x = 0; x != width1; x++ )
+            for( x = 0; x != width1; x++)
             {
                 int xd = x*D2;
 
-                int delta = minLr[1][x] + P2;
+                int delta = minLr[x - 1] + P2;
 
-                CostType* Lr_ppr = Lr[1] + xd;
+                CostType* Lr_ppr = Lr + xd - D2;
 
                 Lr_ppr[-1] = Lr_ppr[D] = MAX_COST;
 
-                CostType* Lr_p = Lr[0] + xd;
+                CostType* Lr_p = Lr + xd;
                 const CostType* Cp = C + x*D;
                 CostType* Sp = S + x*D;
 
@@ -1121,19 +1370,19 @@ static void computeDisparitySGBMParallel( const Mat& img1, const Mat& img2,
 //                        v_int16x8 Cpd = v_load(Cp + d);
 //                        v_int16x8 L0, L1, L2, L3;
 //
-//                        L0 = v_load(Lr_p0 + d);
+//                        L0 = v_load(Lr_ppr + d);
 //                        L1 = v_load(Lr_p1 + d);
-//                        L2 = v_load(Lr_ppr + d);
+//                        L2 = v_load(Lr_p2 + d);
 //                        L3 = v_load(Lr_p3 + d);
 //
-//                        L0 = v_min(L0, (v_load(Lr_p0 + d - 1) + _P1));
-//                        L0 = v_min(L0, (v_load(Lr_p0 + d + 1) + _P1));
+//                        L0 = v_min(L0, (v_load(Lr_ppr + d - 1) + _P1));
+//                        L0 = v_min(L0, (v_load(Lr_ppr + d + 1) + _P1));
 //
 //                        L1 = v_min(L1, (v_load(Lr_p1 + d - 1) + _P1));
 //                        L1 = v_min(L1, (v_load(Lr_p1 + d + 1) + _P1));
 //
-//                        L2 = v_min(L2, (v_load(Lr_ppr + d - 1) + _P1));
-//                        L2 = v_min(L2, (v_load(Lr_ppr + d + 1) + _P1));
+//                        L2 = v_min(L2, (v_load(Lr_p2 + d - 1) + _P1));
+//                        L2 = v_min(L2, (v_load(Lr_p2 + d + 1) + _P1));
 //
 //                        L3 = v_min(L3, (v_load(Lr_p3 + d - 1) + _P1));
 //                        L3 = v_min(L3, (v_load(Lr_p3 + d + 1) + _P1));
@@ -1177,7 +1426,7 @@ static void computeDisparitySGBMParallel( const Mat& img1, const Mat& img2,
 //
 //                    v_int32x4 minL, minH;
 //                    v_expand(_minL0, minL, minH);
-//                    v_pack_store(&minLr[0][x], v_min(minL, minH));
+//                    v_pack_store(&minLr[x], v_min(minL, minH));
 //                }
 //                else
 //            #endif
@@ -1195,15 +1444,578 @@ static void computeDisparitySGBMParallel( const Mat& img1, const Mat& img2,
 
                         Sp[d] = saturate_cast<CostType>(Sp[d] + L);
                     }
-                    minLr[0][x] = (CostType)minL;
+                    minLr[x] = (CostType)minL;
                 }
             }
 
-            // now shift the cyclic buffers
-            std::swap( Lr[0], Lr[1] );
+            for( x = width1-1; x != -1; x--)
+            {
+                int xd = x*D2;
+
+                int delta = minLr[x + 1] + P2;
+
+                CostType* Lr_ppr = Lr + xd + D2;
+
+                Lr_ppr[-1] = Lr_ppr[D] = MAX_COST;
+
+                CostType* Lr_p = Lr + xd;
+                const CostType* Cp = C + x*D;
+                CostType* Sp = S + x*D;
+                int minS = MAX_COST, bestDisp = -1;
+
+//            #if CV_SIMD128
+//                if( useSIMD )
+//                {
+//                    v_int16x8 _P1 = v_setall_s16((short)P1);
+//
+//                    v_int16x8 _delta0 = v_setall_s16((short)delta0);
+//                    v_int16x8 _delta1 = v_setall_s16((short)delta1);
+//                    v_int16x8 _delta2 = v_setall_s16((short)delta2);
+//                    v_int16x8 _delta3 = v_setall_s16((short)delta3);
+//                    v_int16x8 _minL0 = v_setall_s16((short)MAX_COST);
+//
+//                    for( d = 0; d < D; d += 8 )
+//                    {
+//                        v_int16x8 Cpd = v_load(Cp + d);
+//                        v_int16x8 L0, L1, L2, L3;
+//
+//                        L0 = v_load(Lr_ppr + d);
+//                        L1 = v_load(Lr_p1 + d);
+//                        L2 = v_load(Lr_p2 + d);
+//                        L3 = v_load(Lr_p3 + d);
+//
+//                        L0 = v_min(L0, (v_load(Lr_ppr + d - 1) + _P1));
+//                        L0 = v_min(L0, (v_load(Lr_ppr + d + 1) + _P1));
+//
+//                        L1 = v_min(L1, (v_load(Lr_p1 + d - 1) + _P1));
+//                        L1 = v_min(L1, (v_load(Lr_p1 + d + 1) + _P1));
+//
+//                        L2 = v_min(L2, (v_load(Lr_p2 + d - 1) + _P1));
+//                        L2 = v_min(L2, (v_load(Lr_p2 + d + 1) + _P1));
+//
+//                        L3 = v_min(L3, (v_load(Lr_p3 + d - 1) + _P1));
+//                        L3 = v_min(L3, (v_load(Lr_p3 + d + 1) + _P1));
+//
+//                        L0 = v_min(L0, _delta0);
+//                        L0 = ((L0 - _delta0) + Cpd);
+//
+//                        L1 = v_min(L1, _delta1);
+//                        L1 = ((L1 - _delta1) + Cpd);
+//
+//                        L2 = v_min(L2, _delta2);
+//                        L2 = ((L2 - _delta2) + Cpd);
+//
+//                        L3 = v_min(L3, _delta3);
+//                        L3 = ((L3 - _delta3) + Cpd);
+//
+//                        v_store(Lr_p + d, L0);
+//                        v_store(Lr_p + d + D2, L1);
+//                        v_store(Lr_p + d + D2*2, L2);
+//                        v_store(Lr_p + d + D2*3, L3);
+//
+//                        // Get minimum from in L0-L3
+//                        v_int16x8 t02L, t02H, t13L, t13H, t0123L, t0123H;
+//                        v_zip(L0, L2, t02L, t02H);            // L0[0] L2[0] L0[1] L2[1]...
+//                        v_zip(L1, L3, t13L, t13H);            // L1[0] L3[0] L1[1] L3[1]...
+//                        v_int16x8 t02 = v_min(t02L, t02H);    // L0[i] L2[i] L0[i] L2[i]...
+//                        v_int16x8 t13 = v_min(t13L, t13H);    // L1[i] L3[i] L1[i] L3[i]...
+//                        v_zip(t02, t13, t0123L, t0123H);      // L0[i] L1[i] L2[i] L3[i]...
+//                        v_int16x8 t0 = v_min(t0123L, t0123H);
+//                        _minL0 = v_min(_minL0, t0);
+//
+//                        v_int16x8 Sval = v_load(Sp + d);
+//
+//                        L0 = L0 + L1;
+//                        L2 = L2 + L3;
+//                        Sval = Sval + L0;
+//                        Sval = Sval + L2;
+//
+//                        v_store(Sp + d, Sval);
+//                    }
+//
+//                    v_int32x4 minL, minH;
+//                    v_expand(_minL0, minL, minH);
+//                    v_pack_store(&minLr[x], v_min(minL, minH));
+//                }
+//                else
+//            #endif
+//TODO:Next piece of code is came from postprocessing. Be very careful with joining them!!!
+//                #if CV_SIMD128
+//                    if( useSIMD )
+//                    {
+//                        v_int16x8 _minS = v_setall_s16(MAX_COST), _bestDisp = v_setall_s16(-1);
+//                        v_int16x8 _d8 = v_int16x8(0, 1, 2, 3, 4, 5, 6, 7), _8 = v_setall_s16(8);
+//
+//                        for( d = 0; d < D; d+= 8 )
+//                        {
+//                            v_int16x8 L0 = v_load(Sp + d);
+//                            v_int16x8 mask = L0 < _minS;
+//                            _minS = v_min( L0, _minS );
+//                            _bestDisp = _bestDisp ^ ((_bestDisp ^ _d8) & mask);
+//                            _d8 = _d8 + _8;
+//                        }
+//                        v_int32x4 _d0, _d1;
+//                        v_expand(_minS, _d0, _d1);
+//                        minS = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1));
+//                        v_int16x8 v_mask = v_setall_s16((short)minS) == _minS;
+//
+//                        _bestDisp = (_bestDisp & v_mask) | (v_setall_s16(SHRT_MAX) & ~v_mask);
+//                        v_expand(_bestDisp, _d0, _d1);
+//                        bestDisp = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1));
+//                    }
+//                    else
+//                #endif
+                {
+                    int minL = MAX_COST;
+
+                    for( d = 0; d < D; d++ )
+                    {
+                        int Cpd = Cp[d], L;   //4e: Remember, that every Cp is increased on P2 in line number 369. That's why next 4 lines are paper-like actually
+
+                        L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;
+
+                        Lr_p[d] = (CostType)L;
+                        minL = std::min(minL, L);
+
+                        Sp[d] = saturate_cast<CostType>(Sp[d] + L);
+                        if( Sp[d] < minS )
+                        {
+                            minS = Sp[d];
+                            bestDisp = d;
+                        }
+                    }
+                    minLr[x] = (CostType)minL;
+                }
+                //Some postprocessing procedures and saving
+                for( d = 0; d < D; d++ )
+                {
+                    if( Sp[d]*(100 - uniquenessRatio) < minS*100 && std::abs(bestDisp - d) > 1 )
+                        break;
+                }
+                if( d < D )
+                    continue;
+                d = bestDisp;
+                int _x2 = x + minX1 - d - minD;
+                if( disp2cost[_x2] > minS )
+                {
+                    disp2cost[_x2] = (CostType)minS;
+                    disp2ptr[_x2] = (DispType)(d + minD);
+                }
+
+                if( 0 < d && d < D-1 )
+                {
+                    // do subpixel quadratic interpolation:
+                    //   fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])
+                    //   then find minimum of the parabola.
+                    int denom2 = std::max(Sp[d-1] + Sp[d+1] - 2*Sp[d], 1);
+                    d = d*DISP_SCALE + ((Sp[d-1] - Sp[d+1])*DISP_SCALE + denom2)/(denom2*2);
+                }
+                else
+                    d *= DISP_SCALE;
+                disp1ptr[x + minX1] = (DispType)(d + minD*DISP_SCALE);
+            }
+            //Left-right check sanity procedure
+            for( x = minX1; x < maxX1; x++ )
+            {
+                // we round the computed disparity both towards -inf and +inf and check
+                // if either of the corresponding disparities in disp2 is consistent.
+                // This is to give the computed disparity a chance to look valid if it is.
+                int d1 = disp1ptr[x];
+                if( d1 == INVALID_DISP_SCALED )
+                    continue;
+                int _d = d1 >> DISP_SHIFT;
+                int d_ = (d1 + DISP_SCALE-1) >> DISP_SHIFT;
+                int _x = x - _d, x_ = x - d_;
+                if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff &&         //4e: To dismiss disparity, we should assure, that there is no any
+                   0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff )           //4e: chance to understand this as correct.
+                    disp1ptr[x] = (DispType)INVALID_DISP_SCALED;
+            }
+        }
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/*
+ This is new experimential version of disparity calculation, which should be parralled after
+TODO: Don't forget to rewrire this commentaries after
+
+ computes disparity for "roi" in img1 w.r.t. img2 and write it to disp1buf.
+ that is, disp1buf(x, y)=d means that img1(x+roi.x, y+roi.y) ~ img2(x+roi.x-d, y+roi.y).
+ minD <= d < maxD.
+ disp2full is the reverse disparity map, that is:
+ disp2full(x+roi.x,y+roi.y)=d means that img2(x+roi.x, y+roi.y) ~ img1(x+roi.x+d, y+roi.y)
+
+ note that disp1buf will have the same size as the roi and
+ disp2full will have the same size as img1 (or img2).
+ On exit disp2buf is not the final disparity, it is an intermediate result that becomes
+ final after all the tiles are processed.
+
+ the disparity in disp1buf is written with sub-pixel accuracy
+ (4 fractional bits, see StereoSGBM::DISP_SCALE),
+ using quadratic interpolation, while the disparity in disp2buf
+ is written as is, without interpolation.
+
+ disp2cost also has the same size as img1 (or img2).
+ It contains the minimum current cost, used to find the best disparity, corresponding to the minimal cost.
+ */
+static void computeDisparitySGBMParallelOld( const Mat& img1, const Mat& img2,
+                                 Mat& disp1, const StereoSGBMParams& params,
+                                 Mat& buffer )
+{
+//#if CV_SIMD128
+//    // maxDisparity is supposed to multiple of 16, so we can forget doing else
+//    static const uchar LSBTab[] =
+//    {
+//        0, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+//        5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+//        6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+//        5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+//        7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+//        5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+//        6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+//        5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
+//    };
+//    static const v_uint16x8 v_LSB = v_uint16x8(0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
+//
+//    bool useSIMD = hasSIMD128();
+//#endif
+
+    const int ALIGN = 16;
+    const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
+    const int DISP_SCALE = (1 << DISP_SHIFT);
+    const CostType MAX_COST = SHRT_MAX;
+
+    int minD = params.minDisparity, maxD = minD + params.numDisparities;
+    Size SADWindowSize;                                                                                 //4e: SAD means Sum of Absolute Differences
+    SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;   //4e: and this is always square
+    int ftzero = std::max(params.preFilterCap, 15) | 1;                             //4e:ftzero clips x-derivatives. I think, this story with arrays is about non-realized SIMD method
+    int uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
+    int disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
+    int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);   //TODO: think about P1/S(x,y) Proportion
+    int k, width = disp1.cols, height = disp1.rows;
+    int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
+    int D = maxD - minD, width1 = maxX1 - minX1;
+    int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
+    int SW2 = SADWindowSize.width/2, SH2 = SADWindowSize.height/2;
+    const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;                  //4e: array is such big due to derivative could be +-8*256 in worst cases
+    PixType clipTab[TAB_SIZE];
+
+    for( k = 0; k < TAB_SIZE; k++ )         //4e: If ftzero would = 4, array containment will be = -4 -4 -4 ... -4 -3 -2 -1 0 1 2 3 4 ... 4 4 4
+        clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
+
+    if( minX1 >= maxX1 )
+    {
+        disp1 = Scalar::all(INVALID_DISP_SCALED);
+        return;
+    }
+
+    CV_Assert( D % 16 == 0 );                    //TODO: Are you sure? By the way, why not 8?
+
+    // NR - the number of directions. the loop on x below that computes Lr assumes that NR == 8.
+    // if you change NR, please, modify the loop as well.
+    int D2 = D+16;                  //4e: Somewhere in code we need d+1, so D+1. One of simplest solutuons is increasing D-dimension on 1. But 1 is 16, when storage should be aligned.
+
+    // the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
+    // for 8-way dynamic programming we need the current row and
+    // the previous row, i.e. 2 rows in total
+    const int NLR = 2;                          //4e: We assume, that we need one or more previous steps in our linear dynamic(one right here).
+    const int LrBorder = NLR - 1;               //4e: for simplification of calculations we need border for taking previous dynamic solutions.
+
+    // for each possible stereo match (img1(x,y) <=> img2(x-d,y))
+    // we keep pixel difference cost (C) and the summary cost over NR directions (S).
+    // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
+    size_t costBufSize = width1*D;
+    size_t CSBufSize = costBufSize*height;                                            //4e: For HH mode it's better to keep whole array of costs.
+    size_t minLrSize = (width1 + LrBorder*2), LrSize = minLrSize*D2;                  //4e: TODO: Understand why NR2 per pass instead od NR2/2 (Probably, without any reason. That doesn't make code wrong)
+    int hsumBufNRows = SH2*2 + 2;
+    size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[]
+    costBufSize*(hsumBufNRows + 1)*sizeof(CostType) + // hsumBuf, pixdiff           //4e: TODO: Why we should increase sum window height one more time?
+    CSBufSize*2*sizeof(CostType) + // C, S                                          //4e: C is Block sum of costs, S is multidirectional dynamic sum with same size
+    width*16*img1.channels()*sizeof(PixType) + // temp buffer for computing per-pixel cost  //4e: It is needed for calcPixelCostBT function, as "buffer" value
+    width*(sizeof(CostType) + sizeof(DispType)) + 1024; // disp2cost + disp2
+
+    if( buffer.empty() || !buffer.isContinuous() ||
+        buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
+        buffer.create(1, (int)totalBufSize, CV_8U);
+
+    // summary cost over different (nDirs) directions
+    CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), ALIGN);
+    CostType* Sbuf = Cbuf + CSBufSize;
+    CostType* hsumBuf = Sbuf + CSBufSize;
+    CostType* pixDiff = hsumBuf + costBufSize*hsumBufNRows;
+
+    CostType* disp2cost = pixDiff + costBufSize + (LrSize + minLrSize)*NLR;     //4e: It is containers for backwards disparity, made by S[d] too, but with other method
+    DispType* disp2ptr = (DispType*)(disp2cost + width);
+    PixType* tempBuf = (PixType*)(disp2ptr + width);
+
+    // add P2 to every C(x,y). it saves a few operations in the inner loops
+    for(k = 0; k < (int)CSBufSize; k++ )
+        Cbuf[k] = (CostType)P2;
+
+    // Two vertical passes - up to down and down to up
+    for( int pass = 1; pass <= 2; pass++ ) //TODO: rename this magic 2.
+    {
+        int y1, y2, dy;
+
+        if( pass == 1 )
+        {
+            y1 = 0; y2 = height; dy = 1;
+        }
+        else
+        {
+            y1 = height-1; y2 = -1; dy = -1;
+        }
+
+        CostType *Lr[NLR]={0}, *minLr[NLR]={0};             //4e: arrays for L(x,y,r,d) of previous and current rows and minimums of them
+
+        for( k = 0; k < NLR; k++ )                          //4e: One of them is needed, and one of them is stored. So, we need to swap pointer
+        {                                                   //4e: Yes, and this is done at the end of next cycle, not here.
+            // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
+            // and will occasionally use negative indices with the arrays
+            // we need to shift Lr[k] pointers by 1, to give the space for d=-1.
+            // however, then the alignment will be imperfect, i.e. bad for SSE,
+            // thus we shift the pointers by 8 (8*sizeof(short) == 16 - ideal alignment)
+            Lr[k] = pixDiff + costBufSize + LrSize*k + D2*LrBorder + 8;
+            memset( Lr[k] - LrBorder*D2 - 8, 0, LrSize*sizeof(CostType) );
+            minLr[k] = pixDiff + costBufSize + LrSize*NLR + minLrSize*k + LrBorder;
+            memset( minLr[k] - LrBorder, 0, minLrSize*sizeof(CostType) );
+        }
+
+        for( int y = y1; y != y2; y += dy )
+        {
+            int x, d;
+            CostType* C = Cbuf + y*costBufSize;
+            CostType* S = Sbuf + y*costBufSize;
+
+            if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
+            {
+                int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1;   //4e: for first line's block sum we need calculate half-window of costs and only one for other
+
+                for( k = dy1; k <= dy2; k++ )
+                {
+                    CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;   //4e: Ring buffer for horizontally summed lines
+
+                    if( k < height )
+                    {
+                        calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero );
+
+                        memset(hsumAdd, 0, D*sizeof(CostType));
+                        for( x = 0; x <= SW2*D; x += D )                //4e: Calculation summed costs for all disparities in first pixel of line
+                        {
+                            int scale = x == 0 ? SW2 + 1 : 1;
+                            for( d = 0; d < D; d++ )
+                                hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]*scale);
+                        }
+
+                        if( y > 0 )                 //4e: We calculate horizontal sums and forming full block sums for y coord by adding this horsums to previous line's sums and subtracting stored lowest
+                        {                           //4e: horsum in hsumBuf. Exception is case y=0, where we need many iterations per lines to create full blocking sum.
+                            const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
+                            const CostType* Cprev = C - costBufSize;    //4e: Well, actually y>0, so we don't need this check: y==0
+
+                            for( x = D; x < width1*D; x += D )
+                            {
+                                const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
+                                const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
+
+//                            #if CV_SIMD128
+//                                if( useSIMD )
+//                                {
+//                                    for( d = 0; d < D; d += 8 )
+//                                    {
+//                                        v_int16x8 hv = v_load(hsumAdd + x - D + d);
+//                                        v_int16x8 Cx = v_load(Cprev + x + d);
+//                                        v_int16x8 psub = v_load(pixSub + d);
+//                                        v_int16x8 padd = v_load(pixAdd + d);
+//                                        hv = (hv - psub + padd);
+//                                        psub = v_load(hsumSub + x + d);
+//                                        Cx = Cx - psub + hv;
+//                                        v_store(hsumAdd + x + d, hv);
+//                                        v_store(C + x + d, Cx);
+//                                    }
+//                                }
+//                                else
+//                            #endif
+                                {
+                                    for( d = 0; d < D; d++ )
+                                    {
+                                        int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
+                                        C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]);
+                                    }
+                                }
+                            }
+                        }
+                        else
+                        {
+                            for( x = D; x < width1*D; x += D )  //4e: Calcluates horizontal sums if (y==0). This piece of code is calling SH2+1 times and then result is used in different way
+                            {                                   //4e: to create full blocks sum. That's why this code is isolated from upper case.
+                                const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
+                                const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
+
+                                for( d = 0; d < D; d++ )
+                                    hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
+                            }
+                        }
+                    }
+
+                    if( y == 0 )        //4e: Calculating first full block sum.
+                    {
+                        int scale = k == 0 ? SH2 + 1 : 1;
+                        for( x = 0; x < width1*D; x++ )
+                            C[x] = (CostType)(C[x] + hsumAdd[x]*scale);
+                    }
+                }
+
+                // also, clear the S buffer
+                for( k = 0; k < width1*D; k++ )                                         //4e: only on first pass, so it keep old information, don't be confused
+                    S[k] = 0;
+            }
+
+            // clear the left and the right borders
+            memset( Lr[0] - D2*LrBorder - 8, 0, D2*LrBorder*sizeof(CostType) );     //4e: To understand this "8" shifts and how they could work it's simpler to imagine pixel dislocation in memory
+            memset( Lr[0] + width1*D2 - 8, 0, D2*LrBorder*sizeof(CostType) );       //4e: ...00000000|D2-16 of real costs value(and some of them are zeroes too)|00000000...
+            memset( minLr[0] - LrBorder, 0, LrBorder*sizeof(CostType) );
+            memset( minLr[0] + width1, 0, LrBorder*sizeof(CostType) );
+
+            /*
+             [formula 13 in the paper]
+             compute L_r(p, d) = C(p, d) +
+             min(L_r(p-r, d),
+             L_r(p-r, d-1) + P1,
+             L_r(p-r, d+1) + P1,
+             min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
+             where p = (x,y), r is one of the directions.
+             we process all the directions at once:
+             0: r=(-dx, 0)
+             1: r=(-1, -dy)
+             2: r=(0, -dy)
+             3: r=(1, -dy)
+             4: r=(-2, -dy)
+             5: r=(-1, -dy*2)
+             6: r=(1, -dy*2)
+             7: r=(2, -dy)
+             */
+            for( x = 0; x != width1; x++ )
+            {
+                int xd = x*D2;
+
+                int delta = minLr[1][x] + P2;
+
+                CostType* Lr_ppr = Lr[1] + xd;
+
+                Lr_ppr[-1] = Lr_ppr[D] = MAX_COST;
+
+                CostType* Lr_p = Lr[0] + xd;
+                const CostType* Cp = C + x*D;
+                CostType* Sp = S + x*D;
+
+//            #if CV_SIMD128
+//                if( useSIMD )
+//                {
+//                    v_int16x8 _P1 = v_setall_s16((short)P1);
+//
+//                    v_int16x8 _delta0 = v_setall_s16((short)delta0);
+//                    v_int16x8 _delta1 = v_setall_s16((short)delta1);
+//                    v_int16x8 _delta2 = v_setall_s16((short)delta2);
+//                    v_int16x8 _delta3 = v_setall_s16((short)delta3);
+//                    v_int16x8 _minL0 = v_setall_s16((short)MAX_COST);
+//
+//                    for( d = 0; d < D; d += 8 )
+//                    {
+//                        v_int16x8 Cpd = v_load(Cp + d);
+//                        v_int16x8 L0, L1, L2, L3;
+//
+//                        L0 = v_load(Lr_p0 + d);
+//                        L1 = v_load(Lr_p1 + d);
+//                        L2 = v_load(Lr_ppr + d);
+//                        L3 = v_load(Lr_p3 + d);
+//
+//                        L0 = v_min(L0, (v_load(Lr_p0 + d - 1) + _P1));
+//                        L0 = v_min(L0, (v_load(Lr_p0 + d + 1) + _P1));
+//
+//                        L1 = v_min(L1, (v_load(Lr_p1 + d - 1) + _P1));
+//                        L1 = v_min(L1, (v_load(Lr_p1 + d + 1) + _P1));
+//
+//                        L2 = v_min(L2, (v_load(Lr_ppr + d - 1) + _P1));
+//                        L2 = v_min(L2, (v_load(Lr_ppr + d + 1) + _P1));
+//
+//                        L3 = v_min(L3, (v_load(Lr_p3 + d - 1) + _P1));
+//                        L3 = v_min(L3, (v_load(Lr_p3 + d + 1) + _P1));
+//
+//                        L0 = v_min(L0, _delta0);
+//                        L0 = ((L0 - _delta0) + Cpd);
+//
+//                        L1 = v_min(L1, _delta1);
+//                        L1 = ((L1 - _delta1) + Cpd);
+//
+//                        L2 = v_min(L2, _delta2);
+//                        L2 = ((L2 - _delta2) + Cpd);
+//
+//                        L3 = v_min(L3, _delta3);
+//                        L3 = ((L3 - _delta3) + Cpd);
+//
+//                        v_store(Lr_p + d, L0);
+//                        v_store(Lr_p + d + D2, L1);
+//                        v_store(Lr_p + d + D2*2, L2);
+//                        v_store(Lr_p + d + D2*3, L3);
+//
+//                        // Get minimum from in L0-L3
+//                        v_int16x8 t02L, t02H, t13L, t13H, t0123L, t0123H;
+//                        v_zip(L0, L2, t02L, t02H);            // L0[0] L2[0] L0[1] L2[1]...
+//                        v_zip(L1, L3, t13L, t13H);            // L1[0] L3[0] L1[1] L3[1]...
+//                        v_int16x8 t02 = v_min(t02L, t02H);    // L0[i] L2[i] L0[i] L2[i]...
+//                        v_int16x8 t13 = v_min(t13L, t13H);    // L1[i] L3[i] L1[i] L3[i]...
+//                        v_zip(t02, t13, t0123L, t0123H);      // L0[i] L1[i] L2[i] L3[i]...
+//                        v_int16x8 t0 = v_min(t0123L, t0123H);
+//                        _minL0 = v_min(_minL0, t0);
+//
+//                        v_int16x8 Sval = v_load(Sp + d);
+//
+//                        L0 = L0 + L1;
+//                        L2 = L2 + L3;
+//                        Sval = Sval + L0;
+//                        Sval = Sval + L2;
+//
+//                        v_store(Sp + d, Sval);
+//                    }
+//
+//                    v_int32x4 minL, minH;
+//                    v_expand(_minL0, minL, minH);
+//                    v_pack_store(&minLr[0][x], v_min(minL, minH));
+//                }
+//                else
+//            #endif
+                {
+                    int minL = MAX_COST;
+
+                    for( d = 0; d < D; d++ )
+                    {
+                        int Cpd = Cp[d], L;   //4e: Remember, that every Cp is increased on P2 in line number 369. That's why next 4 lines are paper-like actually
+
+                        L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;
+
+                        Lr_p[d] = (CostType)L;
+                        minL = std::min(minL, L);
+
+                        Sp[d] = saturate_cast<CostType>(Sp[d] + L);
+                    }
+                    minLr[0][x] = (CostType)minL;
+                }
+            }
+
+            // now shift the cyclic buffers
+            std::swap( Lr[0], Lr[1] );
             std::swap( minLr[0], minLr[1] );
         }
     }
+//    for(int y = 0; y<height;y++)            //TODO:DUBUG!!!
+//        for(int x = 0; x<width1;x++)
+//        {
+//            printf("%d %d:",y,x);
+//            for(int d = 0; d<D;d++)
+//                printf("%d ",Cbuf[y*costBufSize+x*D+d]);
+//            printf("\n");
+//        }
 
 //    for( int pass = 1; pass <= 2; pass++ ) //pass=1 or left-to-right pass
     {
@@ -1552,10 +2364,8 @@ static void computeDisparitySGBMParallel( const Mat& img1, const Mat& img2,
         }
     }
 }
-
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 
-
 void getBufferPointers(Mat& buffer, int width, int width1, int D, int num_ch, int SH2, int P2,
                        CostType*& curCostVolumeLine, CostType*& hsumBuf, CostType*& pixDiff,
                        PixType*& tmpBuf, CostType*& horPassCostVolume,
@@ -2208,6 +3018,8 @@ public:
 
         if(params.mode==MODE_SGBM_3WAY)
             computeDisparity3WaySGBM( left, right, disp, params, buffers, num_stripes );
+        else if(params.mode==MODE_HH4_OLD)
+            computeDisparitySGBMParallelOld( left, right, disp, params, buffer );
         else if(params.mode==MODE_HH4)
             computeDisparitySGBMParallel( left, right, disp, params, buffer );
         else
diff --git a/modules/calib3d/test/test_stereomatching.cpp b/modules/calib3d/test/test_stereomatching.cpp
index c51c419334..e1b5c5200e 100644
--- a/modules/calib3d/test/test_stereomatching.cpp
+++ b/modules/calib3d/test/test_stereomatching.cpp
@@ -797,20 +797,23 @@ TEST(Calib3d_StereoSGBMPar, idontknowhowtotesthere)
 //                                 int mode)
     Mat leftImg = imread("/home/q/Work/GitVault/opencv_extra/testdata/cv/stereomatching/datasets/teddy/im2.png");
     Mat rightImg = imread("/home/q/Work/GitVault/opencv_extra/testdata/cv/stereomatching/datasets/teddy/im6.png");
+    Mat leftDisp_old, leftDisp_new;
     {
         Mat leftDisp;
-        Ptr<StereoSGBM> sgbm = StereoSGBM::create( 0, 48, 3, 90, 360, 1, 63, 10, 100, 32, StereoSGBM::MODE_HH);
-        sgbm->compute( leftImg, rightImg, leftDisp );
-        CV_Assert( leftDisp.type() == CV_16SC1 );
-        leftDisp/=8;
-        imwrite( "/home/q/Work/GitVault/modehh.jpg", leftDisp);
+        Ptr<StereoSGBM> sgbm = StereoSGBM::create( 0, 48, 3, 90, 360, 1, 63, 10, 100, 32, StereoSGBM::MODE_HH4);
+        sgbm->compute( leftImg, rightImg, leftDisp_new );
+        CV_Assert( leftDisp_new.type() == CV_16SC1 );
+//        leftDisp/=8;
+//        imwrite( "/home/q/Work/GitVault/modehh4_new.jpg", leftDisp);
     }
     {
-        Mat leftDisp;
-        Ptr<StereoSGBM> sgbm = StereoSGBM::create( 0, 48, 3, 90, 360, 1, 63, 10, 100, 32, StereoSGBM::MODE_HH4);
-        sgbm->compute( leftImg, rightImg, leftDisp );
-        CV_Assert( leftDisp.type() == CV_16SC1 );
-        leftDisp/=8;
-        imwrite( "/home/q/Work/GitVault/modehh4.jpg", leftDisp);
+        Ptr<StereoSGBM> sgbm = StereoSGBM::create( 0, 48, 3, 90, 360, 1, 63, 10, 100, 32, StereoSGBM::MODE_HH4_OLD);
+        sgbm->compute( leftImg, rightImg, leftDisp_old );
+        CV_Assert( leftDisp_old.type() == CV_16SC1 );
+//        leftDisp/=8;
+//        imwrite( "/home/q/Work/GitVault/modehh4_old.jpg", leftDisp);
     }
+    Mat diff;
+    absdiff(leftDisp_old,leftDisp_new,diff);
+    CV_Assert( countNonZero(diff)==0);
 }

From 7ffa49e02ce2dbfea242a905251cb64bfc97aa6d Mon Sep 17 00:00:00 2001
From: Pyotr Chekmaryov <4ekmah@gmail.com>
Date: Sat, 22 Apr 2017 00:00:08 +0000
Subject: [PATCH 4/6] Vertical passes added and we have working parralel
 version.

---
 modules/calib3d/include/opencv2/calib3d.hpp  |   3 +-
 modules/calib3d/src/stereosgbm.cpp           | 868 ++-----------------
 modules/calib3d/test/test_stereomatching.cpp |  30 +-
 3 files changed, 106 insertions(+), 795 deletions(-)

diff --git a/modules/calib3d/include/opencv2/calib3d.hpp b/modules/calib3d/include/opencv2/calib3d.hpp
index f2915850f9..46a470bc0f 100644
--- a/modules/calib3d/include/opencv2/calib3d.hpp
+++ b/modules/calib3d/include/opencv2/calib3d.hpp
@@ -1810,8 +1810,7 @@ public:
         MODE_SGBM = 0,
         MODE_HH   = 1,
         MODE_SGBM_3WAY = 2,
-        MODE_HH4  = 3,
-        MODE_HH4_OLD  = 4
+        MODE_HH4  = 3
     };
 
     CV_WRAP virtual int getPreFilterCap() const = 0;
diff --git a/modules/calib3d/src/stereosgbm.cpp b/modules/calib3d/src/stereosgbm.cpp
index 8a6bfebc67..5f841775ec 100644
--- a/modules/calib3d/src/stereosgbm.cpp
+++ b/modules/calib3d/src/stereosgbm.cpp
@@ -1170,140 +1170,49 @@ struct CalcVerticalSums: public ParallelLoopBody
     int ftzero;
 };
 
-/*
- This is new experimential version of disparity calculation, which should be parralled after
-TODO: Don't forget to rewrire this commentaries after
-
- computes disparity for "roi" in img1 w.r.t. img2 and write it to disp1buf.
- that is, disp1buf(x, y)=d means that img1(x+roi.x, y+roi.y) ~ img2(x+roi.x-d, y+roi.y).
- minD <= d < maxD.
- disp2full is the reverse disparity map, that is:
- disp2full(x+roi.x,y+roi.y)=d means that img2(x+roi.x, y+roi.y) ~ img1(x+roi.x+d, y+roi.y)
-
- note that disp1buf will have the same size as the roi and
- disp2full will have the same size as img1 (or img2).
- On exit disp2buf is not the final disparity, it is an intermediate result that becomes
- final after all the tiles are processed.
-
- the disparity in disp1buf is written with sub-pixel accuracy
- (4 fractional bits, see StereoSGBM::DISP_SCALE),
- using quadratic interpolation, while the disparity in disp2buf
- is written as is, without interpolation.
-
- disp2cost also has the same size as img1 (or img2).
- It contains the minimum current cost, used to find the best disparity, corresponding to the minimal cost.
- */
-static void computeDisparitySGBMParallel( const Mat& img1, const Mat& img2,
-                                 Mat& disp1, const StereoSGBMParams& params,
-                                 Mat& buffer )
+struct CalcHorizontalSums: public ParallelLoopBody
 {
-//#if CV_SIMD128
-//    // maxDisparity is supposed to multiple of 16, so we can forget doing else
-//    static const uchar LSBTab[] =
-//    {
-//        0, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
-//        5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
-//        6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
-//        5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
-//        7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
-//        5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
-//        6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
-//        5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
-//    };
-//    static const v_uint16x8 v_LSB = v_uint16x8(0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
-//
-//    bool useSIMD = hasSIMD128();
-//#endif
-
-    const int ALIGN = 16;
-    const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
-    const int DISP_SCALE = (1 << DISP_SHIFT);
-    const CostType MAX_COST = SHRT_MAX;
-
-    int minD = params.minDisparity, maxD = minD + params.numDisparities;
-    Size SADWindowSize;                                                                                 //4e: SAD means Sum of Absolute Differences
-    SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;   //4e: and this is always square
-    int ftzero = std::max(params.preFilterCap, 15) | 1;                             //4e:ftzero clips x-derivatives. I think, this story with arrays is about non-realized SIMD method
-    int uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
-    int disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
-    int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);   //TODO: think about P1/S(x,y) Proportion
-    int k, width = disp1.cols, height = disp1.rows;
-    int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
-    int D = maxD - minD, width1 = maxX1 - minX1;
-    int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
-    int SH2 = SADWindowSize.height/2;
-    const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;                  //4e: array is such big due to derivative could be +-8*256 in worst cases
-    PixType clipTab[TAB_SIZE];
-
-    for( k = 0; k < TAB_SIZE; k++ )         //4e: If ftzero would = 4, array containment will be = -4 -4 -4 ... -4 -3 -2 -1 0 1 2 3 4 ... 4 4 4
-        clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
-
-    if( minX1 >= maxX1 )
+    CalcHorizontalSums(const Mat& _img1, const Mat& _img2, Mat& _disp1, const StereoSGBMParams& params,
+                     CostType* alignedBuf): img1(_img1), img2(_img2), disp1(_disp1)
     {
-        disp1 = Scalar::all(INVALID_DISP_SCALED);
-        return;
+        minD = params.minDisparity;
+        maxD = minD + params.numDisparities;
+        P1 = params.P1 > 0 ? params.P1 : 2;
+        P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);   //TODO: think about P1/S(x,y) Proportion
+        uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
+        disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
+        height = img1.rows;
+        width = img1.cols;
+        minX1 = std::max(maxD, 0);
+        maxX1 = width + std::min(minD, 0);
+        INVALID_DISP = minD - 1;
+        INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
+        D = maxD - minD;
+        width1 = maxX1 - minX1;
+        costBufSize = width1*D;
+        CSBufSize = costBufSize*height;
+        D2 = D + 16;
+        LrSize = 2 * D2;               //TODO: Check: do we need this value or not?
+        Cbuf = alignedBuf;
+        Sbuf = Cbuf + CSBufSize;
     }
 
-    CV_Assert( D % 16 == 0 );                    //TODO: Are you sure? By the way, why not 8?
-
-    // NR - the number of directions. the loop on x below that computes Lr assumes that NR == 8.
-    // if you change NR, please, modify the loop as well.
-    int D2 = D+16;                  //4e: Somewhere in code we need d+1, so D+1. One of simplest solutuons is increasing D-dimension on 1. But 1 is 16, when storage should be aligned.
-
-    // the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
-    // for 8-way dynamic programming we need the current row and
-    // the previous row, i.e. 2 rows in total
-    const int NLR = 2;                          //4e: We assume, that we need one or more previous steps in our linear dynamic(one right here).
-    const int LrBorder = NLR - 1;               //4e: for simplification of calculations we need border for taking previous dynamic solutions.
-
-    // for each possible stereo match (img1(x,y) <=> img2(x-d,y))
-    // we keep pixel difference cost (C) and the summary cost over NR directions (S).
-    // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
-    size_t costBufSize = width1*D;
-    size_t CSBufSize = costBufSize*height;
-    size_t minLrSize = (width1 + LrBorder*2), LrSize = minLrSize*D2;        //TODO: We don't need LrBorder for vertical passes and we don't need Lr buffer for horizontal passes.
-    int hsumBufNRows = SH2*2 + 2;
-    size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[]
-    costBufSize*hsumBufNRows*sizeof(CostType) +                       // hsumBuf                    //4e: TODO: Why we should increase sum window height one more time?
-    CSBufSize*2*sizeof(CostType) +                                    // C, S                       //4e: C is Block sum of costs, S is multidirectional dynamic sum with same size
-    width*(sizeof(CostType) + sizeof(DispType)) + 1024;               // disp2cost + disp2
-
-    if( buffer.empty() || !buffer.isContinuous() ||
-        buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
-        buffer.create(1, (int)totalBufSize, CV_8U);
-
-    // summary cost over different (nDirs) directions
-    CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), ALIGN);
-    CostType* Sbuf = Cbuf + CSBufSize;
-    CostType* hsumBuf = Sbuf + CSBufSize;
-
-    CostType* disp2cost = hsumBuf + costBufSize*hsumBufNRows + (LrSize + minLrSize)*NLR;     //4e: It is containers for backwards disparity, made by S[d] too, but with other method
-    DispType* disp2ptr = (DispType*)(disp2cost + width);
-
-    // add P2 to every C(x,y). it saves a few operations in the inner loops
-    for(k = 0; k < (int)CSBufSize; k++ )
-        Cbuf[k] = (CostType)P2;
-
-    parallel_for_(Range(0,width1),CalcVerticalSums(img1, img2, params, Cbuf, clipTab));
-
-//    for( int pass = 1; pass <= 2; pass++ ) //pass=1 or left-to-right pass
+    void operator()( const Range& range ) const
     {
+        int y1 = range.start, y2 = range.end;
+        static const CostType MAX_COST = SHRT_MAX;
+        static const int ALIGN = 16;
+        size_t auxBufsSize = LrSize + width*(sizeof(CostType) + sizeof(DispType)) + 32;
 
-        CostType *Lr, *minLr;
+        Mat auxBuff;
+        auxBuff.create(1, (int)auxBufsSize, CV_8U);
+        CostType *Lr = (CostType*)alignPtr(auxBuff.ptr(), ALIGN) + 8;
+        CostType* disp2cost = Lr + LrSize;
+        DispType* disp2ptr = (DispType*)(disp2cost + width);
 
-        {                                                   //4e: Yes, and this is done at the end of next cycle, not here.
-            // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
-            // and will occasionally use negative indices with the arrays
-            // we need to shift Lr[k] pointers by 1, to give the space for d=-1.
-            // however, then the alignment will be imperfect, i.e. bad for SSE,
-            // thus we shift the pointers by 8 (8*sizeof(short) == 16 - ideal alignment)
-            Lr = hsumBuf + costBufSize*hsumBufNRows + D2*LrBorder + 8;
-            memset( Lr - LrBorder*D2 - 8, 0, LrSize*sizeof(CostType) );
-            minLr = hsumBuf + costBufSize*hsumBufNRows + LrSize*NLR + LrBorder;
-            memset( minLr - LrBorder, 0, minLrSize*sizeof(CostType) );
-        }
+        CostType minLr;
 
-        for( int y = 0; y != height; y++)
+        for( int y = y1; y != y2; y++)
         {
             int x, d;
             DispType* disp1ptr = disp1.ptr<DispType>(y);
@@ -1316,12 +1225,9 @@ static void computeDisparitySGBMParallel( const Mat& img1, const Mat& img2,
                 disp2cost[x] = MAX_COST;
             }
 
-            // clear the left and the right borders
-            memset( Lr - D2*LrBorder - 8, 0, D2*LrBorder*sizeof(CostType) );     //4e: To understand this "8" shifts and how they could work it's simpler to imagine pixel dislocation in memory
-            memset( Lr + width1*D2 - 8, 0, D2*LrBorder*sizeof(CostType) );       //4e: ...00000000|D2-16 of real costs value(and some of them are zeroes too)|00000000...
-            memset( minLr - LrBorder, 0, LrBorder*sizeof(CostType) );
-            memset( minLr + width1, 0, LrBorder*sizeof(CostType) );
-
+            // clear buffers
+            memset( Lr - 8, 0, LrSize*sizeof(CostType) );
+            minLr = 0;
             /*
              [formula 13 in the paper]
              compute L_r(p, d) = C(p, d) +
@@ -1342,15 +1248,13 @@ static void computeDisparitySGBMParallel( const Mat& img1, const Mat& img2,
              */
             for( x = 0; x != width1; x++)
             {
-                int xd = x*D2;
+                int delta = minLr + P2;
 
-                int delta = minLr[x - 1] + P2;
+                CostType* Lr_ppr = Lr + ((x&1)? 0 : D2);
 
-                CostType* Lr_ppr = Lr + xd - D2;
+                Lr_ppr[-1] = Lr_ppr[D] = MAX_COST;       //TODO: Well, probably, it's better do this once before.
 
-                Lr_ppr[-1] = Lr_ppr[D] = MAX_COST;
-
-                CostType* Lr_p = Lr + xd;
+                CostType* Lr_p = Lr + ((x&1)? D2 :0);
                 const CostType* Cp = C + x*D;
                 CostType* Sp = S + x*D;
 
@@ -1444,21 +1348,22 @@ static void computeDisparitySGBMParallel( const Mat& img1, const Mat& img2,
 
                         Sp[d] = saturate_cast<CostType>(Sp[d] + L);
                     }
-                    minLr[x] = (CostType)minL;
+                    minLr = (CostType)minL;
                 }
             }
 
+            memset( Lr - 8, 0, LrSize*sizeof(CostType) );
+            minLr = 0;
+
             for( x = width1-1; x != -1; x--)
             {
-                int xd = x*D2;
-
-                int delta = minLr[x + 1] + P2;
+                int delta = minLr + P2;
 
-                CostType* Lr_ppr = Lr + xd + D2;
+                CostType* Lr_ppr = Lr + ((x&1)? 0 :D2);
 
                 Lr_ppr[-1] = Lr_ppr[D] = MAX_COST;
 
-                CostType* Lr_p = Lr + xd;
+                CostType* Lr_p = Lr + ((x&1)? D2 :0);
                 const CostType* Cp = C + x*D;
                 CostType* Sp = S + x*D;
                 int minS = MAX_COST, bestDisp = -1;
@@ -1584,7 +1489,7 @@ static void computeDisparitySGBMParallel( const Mat& img1, const Mat& img2,
                             bestDisp = d;
                         }
                     }
-                    minLr[x] = (CostType)minL;
+                    minLr = (CostType)minL;
                 }
                 //Some postprocessing procedures and saving
                 for( d = 0; d < D; d++ )
@@ -1632,10 +1537,35 @@ static void computeDisparitySGBMParallel( const Mat& img1, const Mat& img2,
             }
         }
     }
-}
-
-//////////////////////////////////////////////////////////////////////////////////////////////////////
 
+    static const int NLR = 2;
+    static const int LrBorder = NLR - 1;
+    static const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
+    static const int DISP_SCALE = (1 << DISP_SHIFT);
+    const Mat& img1;
+    const Mat& img2;
+    Mat& disp1;
+    CostType* Cbuf;
+    CostType* Sbuf;
+    int minD;
+    int maxD;
+    int D;
+    int D2;
+    int width;
+    int width1;
+    int height;
+    int P1;
+    int P2;
+    int minX1;
+    int maxX1;
+    size_t costBufSize;
+    size_t CSBufSize;
+    size_t LrSize;
+    int INVALID_DISP;
+    int INVALID_DISP_SCALED;
+    int uniquenessRatio;
+    int disp12MaxDiff;
+};
 /*
  This is new experimential version of disparity calculation, which should be parralled after
 TODO: Don't forget to rewrire this commentaries after
@@ -1659,7 +1589,7 @@ TODO: Don't forget to rewrire this commentaries after
  disp2cost also has the same size as img1 (or img2).
  It contains the minimum current cost, used to find the best disparity, corresponding to the minimal cost.
  */
-static void computeDisparitySGBMParallelOld( const Mat& img1, const Mat& img2,
+static void computeDisparitySGBMParallel( const Mat& img1, const Mat& img2,
                                  Mat& disp1, const StereoSGBMParams& params,
                                  Mat& buffer )
 {
@@ -1684,20 +1614,17 @@ static void computeDisparitySGBMParallelOld( const Mat& img1, const Mat& img2,
     const int ALIGN = 16;
     const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
     const int DISP_SCALE = (1 << DISP_SHIFT);
-    const CostType MAX_COST = SHRT_MAX;
-
     int minD = params.minDisparity, maxD = minD + params.numDisparities;
     Size SADWindowSize;                                                                                 //4e: SAD means Sum of Absolute Differences
     SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;   //4e: and this is always square
     int ftzero = std::max(params.preFilterCap, 15) | 1;                             //4e:ftzero clips x-derivatives. I think, this story with arrays is about non-realized SIMD method
-    int uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
-    int disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
     int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);   //TODO: think about P1/S(x,y) Proportion
     int k, width = disp1.cols, height = disp1.rows;
     int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
     int D = maxD - minD, width1 = maxX1 - minX1;
-    int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
-    int SW2 = SADWindowSize.width/2, SH2 = SADWindowSize.height/2;
+    int SH2 = SADWindowSize.height/2;
+    int INVALID_DISP = minD - 1;
+    int INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
     const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;                  //4e: array is such big due to derivative could be +-8*256 in worst cases
     PixType clipTab[TAB_SIZE];
 
@@ -1726,14 +1653,12 @@ static void computeDisparitySGBMParallelOld( const Mat& img1, const Mat& img2,
     // we keep pixel difference cost (C) and the summary cost over NR directions (S).
     // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
     size_t costBufSize = width1*D;
-    size_t CSBufSize = costBufSize*height;                                            //4e: For HH mode it's better to keep whole array of costs.
-    size_t minLrSize = (width1 + LrBorder*2), LrSize = minLrSize*D2;                  //4e: TODO: Understand why NR2 per pass instead od NR2/2 (Probably, without any reason. That doesn't make code wrong)
+    size_t CSBufSize = costBufSize*height;
+    size_t minLrSize = (width1 + LrBorder*2), LrSize = minLrSize*D2;        //TODO: We don't need LrBorder for vertical passes and we don't need Lr buffer for horizontal passes.
     int hsumBufNRows = SH2*2 + 2;
     size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[]
-    costBufSize*(hsumBufNRows + 1)*sizeof(CostType) + // hsumBuf, pixdiff           //4e: TODO: Why we should increase sum window height one more time?
-    CSBufSize*2*sizeof(CostType) + // C, S                                          //4e: C is Block sum of costs, S is multidirectional dynamic sum with same size
-    width*16*img1.channels()*sizeof(PixType) + // temp buffer for computing per-pixel cost  //4e: It is needed for calcPixelCostBT function, as "buffer" value
-    width*(sizeof(CostType) + sizeof(DispType)) + 1024; // disp2cost + disp2
+    costBufSize*hsumBufNRows*sizeof(CostType) +                       // hsumBuf                    //4e: TODO: Why we should increase sum window height one more time?
+    CSBufSize*2*sizeof(CostType) + 1024;                              // C, S                       //4e: C is Block sum of costs, S is multidirectional dynamic sum with same size
 
     if( buffer.empty() || !buffer.isContinuous() ||
         buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
@@ -1741,629 +1666,16 @@ static void computeDisparitySGBMParallelOld( const Mat& img1, const Mat& img2,
 
     // summary cost over different (nDirs) directions
     CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), ALIGN);
-    CostType* Sbuf = Cbuf + CSBufSize;
-    CostType* hsumBuf = Sbuf + CSBufSize;
-    CostType* pixDiff = hsumBuf + costBufSize*hsumBufNRows;
-
-    CostType* disp2cost = pixDiff + costBufSize + (LrSize + minLrSize)*NLR;     //4e: It is containers for backwards disparity, made by S[d] too, but with other method
-    DispType* disp2ptr = (DispType*)(disp2cost + width);
-    PixType* tempBuf = (PixType*)(disp2ptr + width);
 
     // add P2 to every C(x,y). it saves a few operations in the inner loops
     for(k = 0; k < (int)CSBufSize; k++ )
         Cbuf[k] = (CostType)P2;
 
-    // Two vertical passes - up to down and down to up
-    for( int pass = 1; pass <= 2; pass++ ) //TODO: rename this magic 2.
-    {
-        int y1, y2, dy;
-
-        if( pass == 1 )
-        {
-            y1 = 0; y2 = height; dy = 1;
-        }
-        else
-        {
-            y1 = height-1; y2 = -1; dy = -1;
-        }
-
-        CostType *Lr[NLR]={0}, *minLr[NLR]={0};             //4e: arrays for L(x,y,r,d) of previous and current rows and minimums of them
-
-        for( k = 0; k < NLR; k++ )                          //4e: One of them is needed, and one of them is stored. So, we need to swap pointer
-        {                                                   //4e: Yes, and this is done at the end of next cycle, not here.
-            // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
-            // and will occasionally use negative indices with the arrays
-            // we need to shift Lr[k] pointers by 1, to give the space for d=-1.
-            // however, then the alignment will be imperfect, i.e. bad for SSE,
-            // thus we shift the pointers by 8 (8*sizeof(short) == 16 - ideal alignment)
-            Lr[k] = pixDiff + costBufSize + LrSize*k + D2*LrBorder + 8;
-            memset( Lr[k] - LrBorder*D2 - 8, 0, LrSize*sizeof(CostType) );
-            minLr[k] = pixDiff + costBufSize + LrSize*NLR + minLrSize*k + LrBorder;
-            memset( minLr[k] - LrBorder, 0, minLrSize*sizeof(CostType) );
-        }
-
-        for( int y = y1; y != y2; y += dy )
-        {
-            int x, d;
-            CostType* C = Cbuf + y*costBufSize;
-            CostType* S = Sbuf + y*costBufSize;
-
-            if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
-            {
-                int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1;   //4e: for first line's block sum we need calculate half-window of costs and only one for other
-
-                for( k = dy1; k <= dy2; k++ )
-                {
-                    CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;   //4e: Ring buffer for horizontally summed lines
-
-                    if( k < height )
-                    {
-                        calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero );
-
-                        memset(hsumAdd, 0, D*sizeof(CostType));
-                        for( x = 0; x <= SW2*D; x += D )                //4e: Calculation summed costs for all disparities in first pixel of line
-                        {
-                            int scale = x == 0 ? SW2 + 1 : 1;
-                            for( d = 0; d < D; d++ )
-                                hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]*scale);
-                        }
-
-                        if( y > 0 )                 //4e: We calculate horizontal sums and forming full block sums for y coord by adding this horsums to previous line's sums and subtracting stored lowest
-                        {                           //4e: horsum in hsumBuf. Exception is case y=0, where we need many iterations per lines to create full blocking sum.
-                            const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
-                            const CostType* Cprev = C - costBufSize;    //4e: Well, actually y>0, so we don't need this check: y==0
+    parallel_for_(Range(0,width1),CalcVerticalSums(img1, img2, params, Cbuf, clipTab));
+    parallel_for_(Range(0,height),CalcHorizontalSums(img1, img2, disp1, params, Cbuf));
 
-                            for( x = D; x < width1*D; x += D )
-                            {
-                                const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
-                                const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
+}
 
-//                            #if CV_SIMD128
-//                                if( useSIMD )
-//                                {
-//                                    for( d = 0; d < D; d += 8 )
-//                                    {
-//                                        v_int16x8 hv = v_load(hsumAdd + x - D + d);
-//                                        v_int16x8 Cx = v_load(Cprev + x + d);
-//                                        v_int16x8 psub = v_load(pixSub + d);
-//                                        v_int16x8 padd = v_load(pixAdd + d);
-//                                        hv = (hv - psub + padd);
-//                                        psub = v_load(hsumSub + x + d);
-//                                        Cx = Cx - psub + hv;
-//                                        v_store(hsumAdd + x + d, hv);
-//                                        v_store(C + x + d, Cx);
-//                                    }
-//                                }
-//                                else
-//                            #endif
-                                {
-                                    for( d = 0; d < D; d++ )
-                                    {
-                                        int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
-                                        C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]);
-                                    }
-                                }
-                            }
-                        }
-                        else
-                        {
-                            for( x = D; x < width1*D; x += D )  //4e: Calcluates horizontal sums if (y==0). This piece of code is calling SH2+1 times and then result is used in different way
-                            {                                   //4e: to create full blocks sum. That's why this code is isolated from upper case.
-                                const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
-                                const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
-
-                                for( d = 0; d < D; d++ )
-                                    hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
-                            }
-                        }
-                    }
-
-                    if( y == 0 )        //4e: Calculating first full block sum.
-                    {
-                        int scale = k == 0 ? SH2 + 1 : 1;
-                        for( x = 0; x < width1*D; x++ )
-                            C[x] = (CostType)(C[x] + hsumAdd[x]*scale);
-                    }
-                }
-
-                // also, clear the S buffer
-                for( k = 0; k < width1*D; k++ )                                         //4e: only on first pass, so it keep old information, don't be confused
-                    S[k] = 0;
-            }
-
-            // clear the left and the right borders
-            memset( Lr[0] - D2*LrBorder - 8, 0, D2*LrBorder*sizeof(CostType) );     //4e: To understand this "8" shifts and how they could work it's simpler to imagine pixel dislocation in memory
-            memset( Lr[0] + width1*D2 - 8, 0, D2*LrBorder*sizeof(CostType) );       //4e: ...00000000|D2-16 of real costs value(and some of them are zeroes too)|00000000...
-            memset( minLr[0] - LrBorder, 0, LrBorder*sizeof(CostType) );
-            memset( minLr[0] + width1, 0, LrBorder*sizeof(CostType) );
-
-            /*
-             [formula 13 in the paper]
-             compute L_r(p, d) = C(p, d) +
-             min(L_r(p-r, d),
-             L_r(p-r, d-1) + P1,
-             L_r(p-r, d+1) + P1,
-             min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
-             where p = (x,y), r is one of the directions.
-             we process all the directions at once:
-             0: r=(-dx, 0)
-             1: r=(-1, -dy)
-             2: r=(0, -dy)
-             3: r=(1, -dy)
-             4: r=(-2, -dy)
-             5: r=(-1, -dy*2)
-             6: r=(1, -dy*2)
-             7: r=(2, -dy)
-             */
-            for( x = 0; x != width1; x++ )
-            {
-                int xd = x*D2;
-
-                int delta = minLr[1][x] + P2;
-
-                CostType* Lr_ppr = Lr[1] + xd;
-
-                Lr_ppr[-1] = Lr_ppr[D] = MAX_COST;
-
-                CostType* Lr_p = Lr[0] + xd;
-                const CostType* Cp = C + x*D;
-                CostType* Sp = S + x*D;
-
-//            #if CV_SIMD128
-//                if( useSIMD )
-//                {
-//                    v_int16x8 _P1 = v_setall_s16((short)P1);
-//
-//                    v_int16x8 _delta0 = v_setall_s16((short)delta0);
-//                    v_int16x8 _delta1 = v_setall_s16((short)delta1);
-//                    v_int16x8 _delta2 = v_setall_s16((short)delta2);
-//                    v_int16x8 _delta3 = v_setall_s16((short)delta3);
-//                    v_int16x8 _minL0 = v_setall_s16((short)MAX_COST);
-//
-//                    for( d = 0; d < D; d += 8 )
-//                    {
-//                        v_int16x8 Cpd = v_load(Cp + d);
-//                        v_int16x8 L0, L1, L2, L3;
-//
-//                        L0 = v_load(Lr_p0 + d);
-//                        L1 = v_load(Lr_p1 + d);
-//                        L2 = v_load(Lr_ppr + d);
-//                        L3 = v_load(Lr_p3 + d);
-//
-//                        L0 = v_min(L0, (v_load(Lr_p0 + d - 1) + _P1));
-//                        L0 = v_min(L0, (v_load(Lr_p0 + d + 1) + _P1));
-//
-//                        L1 = v_min(L1, (v_load(Lr_p1 + d - 1) + _P1));
-//                        L1 = v_min(L1, (v_load(Lr_p1 + d + 1) + _P1));
-//
-//                        L2 = v_min(L2, (v_load(Lr_ppr + d - 1) + _P1));
-//                        L2 = v_min(L2, (v_load(Lr_ppr + d + 1) + _P1));
-//
-//                        L3 = v_min(L3, (v_load(Lr_p3 + d - 1) + _P1));
-//                        L3 = v_min(L3, (v_load(Lr_p3 + d + 1) + _P1));
-//
-//                        L0 = v_min(L0, _delta0);
-//                        L0 = ((L0 - _delta0) + Cpd);
-//
-//                        L1 = v_min(L1, _delta1);
-//                        L1 = ((L1 - _delta1) + Cpd);
-//
-//                        L2 = v_min(L2, _delta2);
-//                        L2 = ((L2 - _delta2) + Cpd);
-//
-//                        L3 = v_min(L3, _delta3);
-//                        L3 = ((L3 - _delta3) + Cpd);
-//
-//                        v_store(Lr_p + d, L0);
-//                        v_store(Lr_p + d + D2, L1);
-//                        v_store(Lr_p + d + D2*2, L2);
-//                        v_store(Lr_p + d + D2*3, L3);
-//
-//                        // Get minimum from in L0-L3
-//                        v_int16x8 t02L, t02H, t13L, t13H, t0123L, t0123H;
-//                        v_zip(L0, L2, t02L, t02H);            // L0[0] L2[0] L0[1] L2[1]...
-//                        v_zip(L1, L3, t13L, t13H);            // L1[0] L3[0] L1[1] L3[1]...
-//                        v_int16x8 t02 = v_min(t02L, t02H);    // L0[i] L2[i] L0[i] L2[i]...
-//                        v_int16x8 t13 = v_min(t13L, t13H);    // L1[i] L3[i] L1[i] L3[i]...
-//                        v_zip(t02, t13, t0123L, t0123H);      // L0[i] L1[i] L2[i] L3[i]...
-//                        v_int16x8 t0 = v_min(t0123L, t0123H);
-//                        _minL0 = v_min(_minL0, t0);
-//
-//                        v_int16x8 Sval = v_load(Sp + d);
-//
-//                        L0 = L0 + L1;
-//                        L2 = L2 + L3;
-//                        Sval = Sval + L0;
-//                        Sval = Sval + L2;
-//
-//                        v_store(Sp + d, Sval);
-//                    }
-//
-//                    v_int32x4 minL, minH;
-//                    v_expand(_minL0, minL, minH);
-//                    v_pack_store(&minLr[0][x], v_min(minL, minH));
-//                }
-//                else
-//            #endif
-                {
-                    int minL = MAX_COST;
-
-                    for( d = 0; d < D; d++ )
-                    {
-                        int Cpd = Cp[d], L;   //4e: Remember, that every Cp is increased on P2 in line number 369. That's why next 4 lines are paper-like actually
-
-                        L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;
-
-                        Lr_p[d] = (CostType)L;
-                        minL = std::min(minL, L);
-
-                        Sp[d] = saturate_cast<CostType>(Sp[d] + L);
-                    }
-                    minLr[0][x] = (CostType)minL;
-                }
-            }
-
-            // now shift the cyclic buffers
-            std::swap( Lr[0], Lr[1] );
-            std::swap( minLr[0], minLr[1] );
-        }
-    }
-//    for(int y = 0; y<height;y++)            //TODO:DUBUG!!!
-//        for(int x = 0; x<width1;x++)
-//        {
-//            printf("%d %d:",y,x);
-//            for(int d = 0; d<D;d++)
-//                printf("%d ",Cbuf[y*costBufSize+x*D+d]);
-//            printf("\n");
-//        }
-
-//    for( int pass = 1; pass <= 2; pass++ ) //pass=1 or left-to-right pass
-    {
-
-        CostType *Lr, *minLr;
-
-        {                                                   //4e: Yes, and this is done at the end of next cycle, not here.
-            // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
-            // and will occasionally use negative indices with the arrays
-            // we need to shift Lr[k] pointers by 1, to give the space for d=-1.
-            // however, then the alignment will be imperfect, i.e. bad for SSE,
-            // thus we shift the pointers by 8 (8*sizeof(short) == 16 - ideal alignment)
-            Lr = pixDiff + costBufSize + D2*LrBorder + 8;
-            memset( Lr - LrBorder*D2 - 8, 0, LrSize*sizeof(CostType) );
-            minLr = pixDiff + costBufSize + LrSize*NLR + LrBorder;
-            memset( minLr - LrBorder, 0, minLrSize*sizeof(CostType) );
-        }
-
-        for( int y = 0; y != height; y++)
-        {
-            int x, d;
-            DispType* disp1ptr = disp1.ptr<DispType>(y);
-            CostType* C = Cbuf + y*costBufSize;
-            CostType* S = Sbuf + y*costBufSize;
-
-            for( x = 0; x < width; x++ )
-            {
-                disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED;
-                disp2cost[x] = MAX_COST;
-            }
-
-            // clear the left and the right borders
-            memset( Lr - D2*LrBorder - 8, 0, D2*LrBorder*sizeof(CostType) );     //4e: To understand this "8" shifts and how they could work it's simpler to imagine pixel dislocation in memory
-            memset( Lr + width1*D2 - 8, 0, D2*LrBorder*sizeof(CostType) );       //4e: ...00000000|D2-16 of real costs value(and some of them are zeroes too)|00000000...
-            memset( minLr - LrBorder, 0, LrBorder*sizeof(CostType) );
-            memset( minLr + width1, 0, LrBorder*sizeof(CostType) );
-
-            /*
-             [formula 13 in the paper]
-             compute L_r(p, d) = C(p, d) +
-             min(L_r(p-r, d),
-             L_r(p-r, d-1) + P1,
-             L_r(p-r, d+1) + P1,
-             min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
-             where p = (x,y), r is one of the directions.
-             we process all the directions at once:
-             0: r=(-dx, 0)
-             1: r=(-1, -dy)
-             2: r=(0, -dy)
-             3: r=(1, -dy)
-             4: r=(-2, -dy)
-             5: r=(-1, -dy*2)
-             6: r=(1, -dy*2)
-             7: r=(2, -dy)
-             */
-            for( x = 0; x != width1; x++)
-            {
-                int xd = x*D2;
-
-                int delta = minLr[x - 1] + P2;
-
-                CostType* Lr_ppr = Lr + xd - D2;
-
-                Lr_ppr[-1] = Lr_ppr[D] = MAX_COST;
-
-                CostType* Lr_p = Lr + xd;
-                const CostType* Cp = C + x*D;
-                CostType* Sp = S + x*D;
-
-//            #if CV_SIMD128
-//                if( useSIMD )
-//                {
-//                    v_int16x8 _P1 = v_setall_s16((short)P1);
-//
-//                    v_int16x8 _delta0 = v_setall_s16((short)delta0);
-//                    v_int16x8 _delta1 = v_setall_s16((short)delta1);
-//                    v_int16x8 _delta2 = v_setall_s16((short)delta2);
-//                    v_int16x8 _delta3 = v_setall_s16((short)delta3);
-//                    v_int16x8 _minL0 = v_setall_s16((short)MAX_COST);
-//
-//                    for( d = 0; d < D; d += 8 )
-//                    {
-//                        v_int16x8 Cpd = v_load(Cp + d);
-//                        v_int16x8 L0, L1, L2, L3;
-//
-//                        L0 = v_load(Lr_ppr + d);
-//                        L1 = v_load(Lr_p1 + d);
-//                        L2 = v_load(Lr_p2 + d);
-//                        L3 = v_load(Lr_p3 + d);
-//
-//                        L0 = v_min(L0, (v_load(Lr_ppr + d - 1) + _P1));
-//                        L0 = v_min(L0, (v_load(Lr_ppr + d + 1) + _P1));
-//
-//                        L1 = v_min(L1, (v_load(Lr_p1 + d - 1) + _P1));
-//                        L1 = v_min(L1, (v_load(Lr_p1 + d + 1) + _P1));
-//
-//                        L2 = v_min(L2, (v_load(Lr_p2 + d - 1) + _P1));
-//                        L2 = v_min(L2, (v_load(Lr_p2 + d + 1) + _P1));
-//
-//                        L3 = v_min(L3, (v_load(Lr_p3 + d - 1) + _P1));
-//                        L3 = v_min(L3, (v_load(Lr_p3 + d + 1) + _P1));
-//
-//                        L0 = v_min(L0, _delta0);
-//                        L0 = ((L0 - _delta0) + Cpd);
-//
-//                        L1 = v_min(L1, _delta1);
-//                        L1 = ((L1 - _delta1) + Cpd);
-//
-//                        L2 = v_min(L2, _delta2);
-//                        L2 = ((L2 - _delta2) + Cpd);
-//
-//                        L3 = v_min(L3, _delta3);
-//                        L3 = ((L3 - _delta3) + Cpd);
-//
-//                        v_store(Lr_p + d, L0);
-//                        v_store(Lr_p + d + D2, L1);
-//                        v_store(Lr_p + d + D2*2, L2);
-//                        v_store(Lr_p + d + D2*3, L3);
-//
-//                        // Get minimum from in L0-L3
-//                        v_int16x8 t02L, t02H, t13L, t13H, t0123L, t0123H;
-//                        v_zip(L0, L2, t02L, t02H);            // L0[0] L2[0] L0[1] L2[1]...
-//                        v_zip(L1, L3, t13L, t13H);            // L1[0] L3[0] L1[1] L3[1]...
-//                        v_int16x8 t02 = v_min(t02L, t02H);    // L0[i] L2[i] L0[i] L2[i]...
-//                        v_int16x8 t13 = v_min(t13L, t13H);    // L1[i] L3[i] L1[i] L3[i]...
-//                        v_zip(t02, t13, t0123L, t0123H);      // L0[i] L1[i] L2[i] L3[i]...
-//                        v_int16x8 t0 = v_min(t0123L, t0123H);
-//                        _minL0 = v_min(_minL0, t0);
-//
-//                        v_int16x8 Sval = v_load(Sp + d);
-//
-//                        L0 = L0 + L1;
-//                        L2 = L2 + L3;
-//                        Sval = Sval + L0;
-//                        Sval = Sval + L2;
-//
-//                        v_store(Sp + d, Sval);
-//                    }
-//
-//                    v_int32x4 minL, minH;
-//                    v_expand(_minL0, minL, minH);
-//                    v_pack_store(&minLr[x], v_min(minL, minH));
-//                }
-//                else
-//            #endif
-                {
-                    int minL = MAX_COST;
-
-                    for( d = 0; d < D; d++ )
-                    {
-                        int Cpd = Cp[d], L;   //4e: Remember, that every Cp is increased on P2 in line number 369. That's why next 4 lines are paper-like actually
-
-                        L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;
-
-                        Lr_p[d] = (CostType)L;
-                        minL = std::min(minL, L);
-
-                        Sp[d] = saturate_cast<CostType>(Sp[d] + L);
-                    }
-                    minLr[x] = (CostType)minL;
-                }
-            }
-
-            for( x = width1-1; x != -1; x--)
-            {
-                int xd = x*D2;
-
-                int delta = minLr[x + 1] + P2;
-
-                CostType* Lr_ppr = Lr + xd + D2;
-
-                Lr_ppr[-1] = Lr_ppr[D] = MAX_COST;
-
-                CostType* Lr_p = Lr + xd;
-                const CostType* Cp = C + x*D;
-                CostType* Sp = S + x*D;
-                int minS = MAX_COST, bestDisp = -1;
-
-//            #if CV_SIMD128
-//                if( useSIMD )
-//                {
-//                    v_int16x8 _P1 = v_setall_s16((short)P1);
-//
-//                    v_int16x8 _delta0 = v_setall_s16((short)delta0);
-//                    v_int16x8 _delta1 = v_setall_s16((short)delta1);
-//                    v_int16x8 _delta2 = v_setall_s16((short)delta2);
-//                    v_int16x8 _delta3 = v_setall_s16((short)delta3);
-//                    v_int16x8 _minL0 = v_setall_s16((short)MAX_COST);
-//
-//                    for( d = 0; d < D; d += 8 )
-//                    {
-//                        v_int16x8 Cpd = v_load(Cp + d);
-//                        v_int16x8 L0, L1, L2, L3;
-//
-//                        L0 = v_load(Lr_ppr + d);
-//                        L1 = v_load(Lr_p1 + d);
-//                        L2 = v_load(Lr_p2 + d);
-//                        L3 = v_load(Lr_p3 + d);
-//
-//                        L0 = v_min(L0, (v_load(Lr_ppr + d - 1) + _P1));
-//                        L0 = v_min(L0, (v_load(Lr_ppr + d + 1) + _P1));
-//
-//                        L1 = v_min(L1, (v_load(Lr_p1 + d - 1) + _P1));
-//                        L1 = v_min(L1, (v_load(Lr_p1 + d + 1) + _P1));
-//
-//                        L2 = v_min(L2, (v_load(Lr_p2 + d - 1) + _P1));
-//                        L2 = v_min(L2, (v_load(Lr_p2 + d + 1) + _P1));
-//
-//                        L3 = v_min(L3, (v_load(Lr_p3 + d - 1) + _P1));
-//                        L3 = v_min(L3, (v_load(Lr_p3 + d + 1) + _P1));
-//
-//                        L0 = v_min(L0, _delta0);
-//                        L0 = ((L0 - _delta0) + Cpd);
-//
-//                        L1 = v_min(L1, _delta1);
-//                        L1 = ((L1 - _delta1) + Cpd);
-//
-//                        L2 = v_min(L2, _delta2);
-//                        L2 = ((L2 - _delta2) + Cpd);
-//
-//                        L3 = v_min(L3, _delta3);
-//                        L3 = ((L3 - _delta3) + Cpd);
-//
-//                        v_store(Lr_p + d, L0);
-//                        v_store(Lr_p + d + D2, L1);
-//                        v_store(Lr_p + d + D2*2, L2);
-//                        v_store(Lr_p + d + D2*3, L3);
-//
-//                        // Get minimum from in L0-L3
-//                        v_int16x8 t02L, t02H, t13L, t13H, t0123L, t0123H;
-//                        v_zip(L0, L2, t02L, t02H);            // L0[0] L2[0] L0[1] L2[1]...
-//                        v_zip(L1, L3, t13L, t13H);            // L1[0] L3[0] L1[1] L3[1]...
-//                        v_int16x8 t02 = v_min(t02L, t02H);    // L0[i] L2[i] L0[i] L2[i]...
-//                        v_int16x8 t13 = v_min(t13L, t13H);    // L1[i] L3[i] L1[i] L3[i]...
-//                        v_zip(t02, t13, t0123L, t0123H);      // L0[i] L1[i] L2[i] L3[i]...
-//                        v_int16x8 t0 = v_min(t0123L, t0123H);
-//                        _minL0 = v_min(_minL0, t0);
-//
-//                        v_int16x8 Sval = v_load(Sp + d);
-//
-//                        L0 = L0 + L1;
-//                        L2 = L2 + L3;
-//                        Sval = Sval + L0;
-//                        Sval = Sval + L2;
-//
-//                        v_store(Sp + d, Sval);
-//                    }
-//
-//                    v_int32x4 minL, minH;
-//                    v_expand(_minL0, minL, minH);
-//                    v_pack_store(&minLr[x], v_min(minL, minH));
-//                }
-//                else
-//            #endif
-//TODO:Next piece of code is came from postprocessing. Be very careful with joining them!!!
-//                #if CV_SIMD128
-//                    if( useSIMD )
-//                    {
-//                        v_int16x8 _minS = v_setall_s16(MAX_COST), _bestDisp = v_setall_s16(-1);
-//                        v_int16x8 _d8 = v_int16x8(0, 1, 2, 3, 4, 5, 6, 7), _8 = v_setall_s16(8);
-//
-//                        for( d = 0; d < D; d+= 8 )
-//                        {
-//                            v_int16x8 L0 = v_load(Sp + d);
-//                            v_int16x8 mask = L0 < _minS;
-//                            _minS = v_min( L0, _minS );
-//                            _bestDisp = _bestDisp ^ ((_bestDisp ^ _d8) & mask);
-//                            _d8 = _d8 + _8;
-//                        }
-//                        v_int32x4 _d0, _d1;
-//                        v_expand(_minS, _d0, _d1);
-//                        minS = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1));
-//                        v_int16x8 v_mask = v_setall_s16((short)minS) == _minS;
-//
-//                        _bestDisp = (_bestDisp & v_mask) | (v_setall_s16(SHRT_MAX) & ~v_mask);
-//                        v_expand(_bestDisp, _d0, _d1);
-//                        bestDisp = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1));
-//                    }
-//                    else
-//                #endif
-                {
-                    int minL = MAX_COST;
-
-                    for( d = 0; d < D; d++ )
-                    {
-                        int Cpd = Cp[d], L;   //4e: Remember, that every Cp is increased on P2 in line number 369. That's why next 4 lines are paper-like actually
-
-                        L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;
-
-                        Lr_p[d] = (CostType)L;
-                        minL = std::min(minL, L);
-
-                        Sp[d] = saturate_cast<CostType>(Sp[d] + L);
-                        if( Sp[d] < minS )
-                        {
-                            minS = Sp[d];
-                            bestDisp = d;
-                        }
-                    }
-                    minLr[x] = (CostType)minL;
-                }
-                //Some postprocessing procedures and saving
-                for( d = 0; d < D; d++ )
-                {
-                    if( Sp[d]*(100 - uniquenessRatio) < minS*100 && std::abs(bestDisp - d) > 1 )
-                        break;
-                }
-                if( d < D )
-                    continue;
-                d = bestDisp;
-                int _x2 = x + minX1 - d - minD;
-                if( disp2cost[_x2] > minS )
-                {
-                    disp2cost[_x2] = (CostType)minS;
-                    disp2ptr[_x2] = (DispType)(d + minD);
-                }
-
-                if( 0 < d && d < D-1 )
-                {
-                    // do subpixel quadratic interpolation:
-                    //   fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])
-                    //   then find minimum of the parabola.
-                    int denom2 = std::max(Sp[d-1] + Sp[d+1] - 2*Sp[d], 1);
-                    d = d*DISP_SCALE + ((Sp[d-1] - Sp[d+1])*DISP_SCALE + denom2)/(denom2*2);
-                }
-                else
-                    d *= DISP_SCALE;
-                disp1ptr[x + minX1] = (DispType)(d + minD*DISP_SCALE);
-            }
-            //Left-right check sanity procedure
-            for( x = minX1; x < maxX1; x++ )
-            {
-                // we round the computed disparity both towards -inf and +inf and check
-                // if either of the corresponding disparities in disp2 is consistent.
-                // This is to give the computed disparity a chance to look valid if it is.
-                int d1 = disp1ptr[x];
-                if( d1 == INVALID_DISP_SCALED )
-                    continue;
-                int _d = d1 >> DISP_SHIFT;
-                int d_ = (d1 + DISP_SCALE-1) >> DISP_SHIFT;
-                int _x = x - _d, x_ = x - d_;
-                if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff &&         //4e: To dismiss disparity, we should assure, that there is no any
-                   0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff )           //4e: chance to understand this as correct.
-                    disp1ptr[x] = (DispType)INVALID_DISP_SCALED;
-            }
-        }
-    }
-}
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 
 void getBufferPointers(Mat& buffer, int width, int width1, int D, int num_ch, int SH2, int P2,
@@ -3018,8 +2330,6 @@ public:
 
         if(params.mode==MODE_SGBM_3WAY)
             computeDisparity3WaySGBM( left, right, disp, params, buffers, num_stripes );
-        else if(params.mode==MODE_HH4_OLD)
-            computeDisparitySGBMParallelOld( left, right, disp, params, buffer );
         else if(params.mode==MODE_HH4)
             computeDisparitySGBMParallel( left, right, disp, params, buffer );
         else
diff --git a/modules/calib3d/test/test_stereomatching.cpp b/modules/calib3d/test/test_stereomatching.cpp
index e1b5c5200e..9a338bb11f 100644
--- a/modules/calib3d/test/test_stereomatching.cpp
+++ b/modules/calib3d/test/test_stereomatching.cpp
@@ -797,23 +797,25 @@ TEST(Calib3d_StereoSGBMPar, idontknowhowtotesthere)
 //                                 int mode)
     Mat leftImg = imread("/home/q/Work/GitVault/opencv_extra/testdata/cv/stereomatching/datasets/teddy/im2.png");
     Mat rightImg = imread("/home/q/Work/GitVault/opencv_extra/testdata/cv/stereomatching/datasets/teddy/im6.png");
-    Mat leftDisp_old, leftDisp_new;
+//    Mat leftDisp_old, leftDisp_new;
     {
         Mat leftDisp;
-        Ptr<StereoSGBM> sgbm = StereoSGBM::create( 0, 48, 3, 90, 360, 1, 63, 10, 100, 32, StereoSGBM::MODE_HH4);
-        sgbm->compute( leftImg, rightImg, leftDisp_new );
-        CV_Assert( leftDisp_new.type() == CV_16SC1 );
-//        leftDisp/=8;
-//        imwrite( "/home/q/Work/GitVault/modehh4_new.jpg", leftDisp);
+        Ptr<StereoSGBM> sgbm = StereoSGBM::create( 0, 48, 3, 90, 360, 1, 63, 10, 100, 32, StereoSGBM::MODE_HH);
+        sgbm->compute( leftImg, rightImg, leftDisp);
+        CV_Assert( leftDisp.type() == CV_16SC1 );
+        leftDisp/=8;
+        imwrite( "/home/q/Work/GitVault/modehh4_new.jpg", leftDisp);
     }
     {
-        Ptr<StereoSGBM> sgbm = StereoSGBM::create( 0, 48, 3, 90, 360, 1, 63, 10, 100, 32, StereoSGBM::MODE_HH4_OLD);
-        sgbm->compute( leftImg, rightImg, leftDisp_old );
-        CV_Assert( leftDisp_old.type() == CV_16SC1 );
-//        leftDisp/=8;
-//        imwrite( "/home/q/Work/GitVault/modehh4_old.jpg", leftDisp);
+        Mat leftDisp;
+        Ptr<StereoSGBM> sgbm = StereoSGBM::create( 0, 48, 3, 90, 360, 1, 63, 10, 100, 32, StereoSGBM::MODE_HH4);
+        sgbm->compute( leftImg, rightImg, leftDisp);
+        CV_Assert( leftDisp.type() == CV_16SC1 );
+        leftDisp/=8;
+        imwrite( "/home/q/Work/GitVault/modehh4_old.jpg", leftDisp);
     }
-    Mat diff;
-    absdiff(leftDisp_old,leftDisp_new,diff);
-    CV_Assert( countNonZero(diff)==0);
+//    Mat diff;
+//    absdiff(leftDisp_old,leftDisp_new,diff);
+//    CV_Assert( countNonZero(diff)==0);
+//
 }

From 21be2aa677d8eaad48956b431260e64f626167a0 Mon Sep 17 00:00:00 2001
From: Pyotr Chekmaryov <4ekmah@gmail.com>
Date: Tue, 25 Apr 2017 21:00:31 +0000
Subject: [PATCH 5/6] Memory repaired + Cleanup.

---
 modules/calib3d/src/stereosgbm.cpp           | 594 ++++---------------
 modules/calib3d/test/test_stereomatching.cpp |  39 +-
 2 files changed, 143 insertions(+), 490 deletions(-)

diff --git a/modules/calib3d/src/stereosgbm.cpp b/modules/calib3d/src/stereosgbm.cpp
index 5f841775ec..678f937abe 100644
--- a/modules/calib3d/src/stereosgbm.cpp
+++ b/modules/calib3d/src/stereosgbm.cpp
@@ -125,7 +125,6 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
                             int minD, int maxD, CostType* cost,
                             PixType* buffer, const PixType* tab,
                             int tabOfs, int , int xrange_min = 0, int xrange_max = DEFAULT_RIGHT_BORDER )
-    //TODO: This function was changed and modified old function's behabior. Check they in tests
 {
     int x, c, width = img1.cols, cn = img1.channels();
     int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
@@ -311,9 +310,9 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
     const CostType MAX_COST = SHRT_MAX;
 
     int minD = params.minDisparity, maxD = minD + params.numDisparities;
-    Size SADWindowSize;                                                                                 //4e: SAD means Sum of Absolute Differences
-    SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;   //4e: and this is always square
-    int ftzero = std::max(params.preFilterCap, 15) | 1;                             //4e:ftzero clips x-derivatives. I think, this story with arrays is about non-realized SIMD method
+    Size SADWindowSize;
+    SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;
+    int ftzero = std::max(params.preFilterCap, 15) | 1;
     int uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
     int disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
     int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
@@ -323,11 +322,11 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
     int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
     int SW2 = SADWindowSize.width/2, SH2 = SADWindowSize.height/2;
     bool fullDP = params.mode == StereoSGBM::MODE_HH;
-    int npasses = fullDP ? 2 : 1;                                           //4e: 2 passes with different behaviour for Hirshmueller method.
-    const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;                  //4e: array is such big due to derivative could be +-8*256 in worst cases
+    int npasses = fullDP ? 2 : 1;
+    const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
     PixType clipTab[TAB_SIZE];
 
-    for( k = 0; k < TAB_SIZE; k++ )         //4e: If ftzero would = 4, array containment will be = -4 -4 -4 ... -4 -3 -2 -1 0 1 2 3 4 ... 4 4 4
+    for( k = 0; k < TAB_SIZE; k++ )
         clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
 
     if( minX1 >= maxX1 )
@@ -340,25 +339,25 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
 
     // NR - the number of directions. the loop on x below that computes Lr assumes that NR == 8.
     // if you change NR, please, modify the loop as well.
-    int D2 = D+16, NRD2 = NR2*D2;               //4e: Somewhere in code we need d+1, so D+1. One of simplest solutuons is increasing D-dimension on 1. But 1 is 16, when storage should be aligned.
+    int D2 = D+16, NRD2 = NR2*D2;
 
     // the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
     // for 8-way dynamic programming we need the current row and
     // the previous row, i.e. 2 rows in total
-    const int NLR = 2;                          //4e: We assume, that we need one or more previous steps in our linear dynamic(one right here).
-    const int LrBorder = NLR - 1;               //4e: for simplification of calculations we need border for taking previous dynamic solutions.
+    const int NLR = 2;
+    const int LrBorder = NLR - 1;
 
     // for each possible stereo match (img1(x,y) <=> img2(x-d,y))
     // we keep pixel difference cost (C) and the summary cost over NR directions (S).
     // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
     size_t costBufSize = width1*D;
-    size_t CSBufSize = costBufSize*(fullDP ? height : 1);                           //4e: For HH mode it's better to keep whole array of costs.
-    size_t minLrSize = (width1 + LrBorder*2)*NR2, LrSize = minLrSize*D2;            //4e: TODO: Understand why NR2 per pass instead od NR2/2 (Probably, without any reason. That doesn't make code wrong)
+    size_t CSBufSize = costBufSize*(fullDP ? height : 1);
+    size_t minLrSize = (width1 + LrBorder*2)*NR2, LrSize = minLrSize*D2;
     int hsumBufNRows = SH2*2 + 2;
     size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[]
-    costBufSize*(hsumBufNRows + 1)*sizeof(CostType) + // hsumBuf, pixdiff           //4e: TODO: Why we should increase sum window height one more time?
-    CSBufSize*2*sizeof(CostType) + // C, S                                          //4e: C is Block sum of costs, S is multidirectional dynamic sum with same size
-    width*16*img1.channels()*sizeof(PixType) + // temp buffer for computing per-pixel cost  //4e: It is needed for calcPixelCostBT function, as "buffer" value
+    costBufSize*(hsumBufNRows + 1)*sizeof(CostType) + // hsumBuf, pixdiff
+    CSBufSize*2*sizeof(CostType) + // C, S
+    width*16*img1.channels()*sizeof(PixType) + // temp buffer for computing per-pixel cost
     width*(sizeof(CostType) + sizeof(DispType)) + 1024; // disp2cost + disp2
 
     if( buffer.empty() || !buffer.isContinuous() ||
@@ -371,7 +370,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
     CostType* hsumBuf = Sbuf + CSBufSize;
     CostType* pixDiff = hsumBuf + costBufSize*hsumBufNRows;
 
-    CostType* disp2cost = pixDiff + costBufSize + (LrSize + minLrSize)*NLR;     //4e: It is containers for backwards disparity, made by S[d] too, but with other method
+    CostType* disp2cost = pixDiff + costBufSize + (LrSize + minLrSize)*NLR;
     DispType* disp2ptr = (DispType*)(disp2cost + width);
     PixType* tempBuf = (PixType*)(disp2ptr + width);
 
@@ -383,21 +382,21 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
     {
         int x1, y1, x2, y2, dx, dy;
 
-        if( pass == 1 )                     //4e: on the first pass, we work down, so calculate directions down, diagdown and right
+        if( pass == 1 )
         {
             y1 = 0; y2 = height; dy = 1;
             x1 = 0; x2 = width1; dx = 1;
         }
-        else                                //4e: on the second pass, we work up, so calculate directions up, diagup and up
+        else
         {
             y1 = height-1; y2 = -1; dy = -1;
             x1 = width1-1; x2 = -1; dx = -1;
         }
 
-        CostType *Lr[NLR]={0}, *minLr[NLR]={0};             //4e: arrays for L(x,y,r,d) of previous and current rows and minimums of them
+        CostType *Lr[NLR]={0}, *minLr[NLR]={0};
 
-        for( k = 0; k < NLR; k++ )                          //4e: One of them is needed, and one of them is stored. So, we need to swap pointer
-        {                                                   //4e: Yes, and this is done at the end of next cycle, not here.
+        for( k = 0; k < NLR; k++ )
+        {
             // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
             // and will occasionally use negative indices with the arrays
             // we need to shift Lr[k] pointers by 1, to give the space for d=-1.
@@ -418,28 +417,28 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
 
             if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
             {
-                int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1;   //4e: for first line's block sum we need calculate half-window of costs and only one for other
+                int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1;
 
                 for( k = dy1; k <= dy2; k++ )
                 {
-                    CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;   //4e: Ring buffer for horizontally summed lines
+                    CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;
 
                     if( k < height )
                     {
                         calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero );
 
                         memset(hsumAdd, 0, D*sizeof(CostType));
-                        for( x = 0; x <= SW2*D; x += D )                //4e: Calculation summed costs for all disparities in first pixel of line
+                        for( x = 0; x <= SW2*D; x += D )
                         {
                             int scale = x == 0 ? SW2 + 1 : 1;
                             for( d = 0; d < D; d++ )
                                 hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]*scale);
                         }
 
-                        if( y > 0 )                 //4e: We calculate horizontal sums and forming full block sums for y coord by adding this horsums to previous line's sums and subtracting stored lowest
-                        {                           //4e: horsum in hsumBuf. Exception is case y=0, where we need many iterations per lines to create full blocking sum.
+                        if( y > 0 )
+                        {
                             const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
-                            const CostType* Cprev = !fullDP || y == 0 ? C : C - costBufSize;    //4e: Well, actually y>0, so we don't need this check: y==0
+                            const CostType* Cprev = !fullDP || y == 0 ? C : C - costBufSize;
 
                             for( x = D; x < width1*D; x += D )
                             {
@@ -475,8 +474,8 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                         }
                         else
                         {
-                            for( x = D; x < width1*D; x += D )  //4e: Calcluates horizontal sums if (y==0). This piece of code is calling SH2+1 times and then result is used in different way
-                            {                                   //4e: to create full blocks sum. That's why this code is isolated from upper case.
+                            for( x = D; x < width1*D; x += D )
+                            {
                                 const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
                                 const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
 
@@ -486,7 +485,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                         }
                     }
 
-                    if( y == 0 )        //4e: Calculating first full block sum.
+                    if( y == 0 )
                     {
                         int scale = k == 0 ? SH2 + 1 : 1;
                         for( x = 0; x < width1*D; x++ )
@@ -495,13 +494,13 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                 }
 
                 // also, clear the S buffer
-                for( k = 0; k < width1*D; k++ )                                         //4e: only on first pass, so it keep old information, don't be confused
+                for( k = 0; k < width1*D; k++ )
                     S[k] = 0;
             }
 
             // clear the left and the right borders
-            memset( Lr[0] - NRD2*LrBorder - 8, 0, NRD2*LrBorder*sizeof(CostType) );     //4e: To understand this "8" shifts and how they could work it's simpler to imagine pixel dislocation in memory
-            memset( Lr[0] + width1*NRD2 - 8, 0, NRD2*LrBorder*sizeof(CostType) );       //4e: ...00000000|NRD2-16 of real costs value(and some of them are zeroes too)|00000000...
+            memset( Lr[0] - NRD2*LrBorder - 8, 0, NRD2*LrBorder*sizeof(CostType) );
+            memset( Lr[0] + width1*NRD2 - 8, 0, NRD2*LrBorder*sizeof(CostType) );
             memset( minLr[0] - NR2*LrBorder, 0, NR2*LrBorder*sizeof(CostType) );
             memset( minLr[0] + width1*NR2, 0, NR2*LrBorder*sizeof(CostType) );
 
@@ -624,7 +623,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
 
                     for( d = 0; d < D; d++ )
                     {
-                        int Cpd = Cp[d], L0, L1, L2, L3;   //4e: Remember, that every Cp is increased on P2 in line number 369. That's why next 4 lines are paper-like actually
+                        int Cpd = Cp[d], L0, L1, L2, L3;
 
                         L0 = Cpd + std::min((int)Lr_p0[d], std::min(Lr_p0[d-1] + P1, std::min(Lr_p0[d+1] + P1, delta0))) - delta0;
                         L1 = Cpd + std::min((int)Lr_p1[d], std::min(Lr_p1[d-1] + P1, std::min(Lr_p1[d+1] + P1, delta1))) - delta1;
@@ -665,7 +664,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                     CostType* Sp = S + x*D;
                     int minS = MAX_COST, bestDisp = -1;
 
-                    if( npasses == 1 )              //4e: in this case we could take fifth direction almost for free(direction "left")
+                    if( npasses == 1 )
                     {
                         int xm = x*NR2, xd = xm*D2;
 
@@ -815,7 +814,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                     disp1ptr[x + minX1] = (DispType)(d + minD*DISP_SCALE);
                 }
 
-                for( x = minX1; x < maxX1; x++ )        //4e: Left-right check itself
+                for( x = minX1; x < maxX1; x++ )
                 {
                     // we round the computed disparity both towards -inf and +inf and check
                     // if either of the corresponding disparities in disp2 is consistent.
@@ -826,8 +825,8 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                     int _d = d1 >> DISP_SHIFT;
                     int d_ = (d1 + DISP_SCALE-1) >> DISP_SHIFT;
                     int _x = x - _d, x_ = x - d_;
-                    if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff &&         //4e: To dismiss disparity, we should assure, that there is no any
-                       0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff )           //4e: chance to understand this as correct.
+                    if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff &&
+                       0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff )
                         disp1ptr[x] = (DispType)INVALID_DISP_SCALED;
                 }
             }
@@ -840,8 +839,6 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////
-//TODO: Assumation: Let's pretend, that we allocate memory for pixDiff and tempBuf independently in each thread, with full size, needed for original calcBT
-//TODO: Redo size of this arrays even if situation with independent allocation will still.
 struct CalcVerticalSums: public ParallelLoopBody
 {
     CalcVerticalSums(const Mat& _img1, const Mat& _img2, const StereoSGBMParams& params,
@@ -852,7 +849,7 @@ struct CalcVerticalSums: public ParallelLoopBody
         SW2 = SH2 = (params.SADWindowSize > 0 ? params.SADWindowSize : 5)/2;
         ftzero = std::max(params.preFilterCap, 15) | 1;
         P1 = params.P1 > 0 ? params.P1 : 2;
-        P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);   //TODO: think about P1/S(x,y) Proportion
+        P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
         height = img1.rows;
         width = img1.cols;
         int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
@@ -861,7 +858,7 @@ struct CalcVerticalSums: public ParallelLoopBody
         D2 = D + 16;
         costBufSize = width1*D;
         CSBufSize = costBufSize*height;
-        minLrSize = (width1 + LrBorder*2);
+        minLrSize = width1;
         LrSize = minLrSize*D2;
         hsumBufNRows = SH2*2 + 2;
         Cbuf = alignedBuf;
@@ -874,10 +871,11 @@ struct CalcVerticalSums: public ParallelLoopBody
         static const CostType MAX_COST = SHRT_MAX;
         static const int ALIGN = 16;
         static const int TAB_OFS = 256*4;
+        static const int npasses = 2;
         int x1 = range.start, x2 = range.end, k;
         size_t pixDiffSize = ((x2 - x1) + 2*SW2)*D;
         size_t auxBufsSize = pixDiffSize*sizeof(CostType) +                 //pixdiff size
-                             width*16*img1.channels()*sizeof(PixType) + 32; //tempBuf //TODO: Probably it's better 6 instead of 16(alignment?)
+                             width*16*img1.channels()*sizeof(PixType) + 32; //tempBuf
         Mat auxBuff;
         auxBuff.create(1, (int)auxBufsSize, CV_8U);
         CostType* pixDiff = (CostType*)alignPtr(auxBuff.ptr(), ALIGN);
@@ -886,7 +884,7 @@ struct CalcVerticalSums: public ParallelLoopBody
         // Simplification of index calculation
         pixDiff -= (x1>SW2 ? (x1 - SW2): 0)*D;
 
-        for( int pass = 1; pass <= 2; pass++ ) //TODO: rename this magic 2.
+        for( int pass = 1; pass <= npasses; pass++ )
         {
             int y1, y2, dy;
 
@@ -899,18 +897,18 @@ struct CalcVerticalSums: public ParallelLoopBody
                 y1 = height-1; y2 = -1; dy = -1;
             }
 
-            CostType *Lr[NLR]={0}, *minLr[NLR]={0};             //4e: arrays for L(x,y,r,d) of previous and current rows and minimums of them
+            CostType *Lr[NLR]={0}, *minLr[NLR]={0};
 
-            for( k = 0; k < NLR; k++ )                          //4e: One of them is needed, and one of them is stored. So, we need to swap pointer
-            {                                                   //4e: Yes, and this is done at the end of next cycle, not here.
+            for( k = 0; k < NLR; k++ )
+            {
                 // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
                 // and will occasionally use negative indices with the arrays
                 // we need to shift Lr[k] pointers by 1, to give the space for d=-1.
                 // however, then the alignment will be imperfect, i.e. bad for SSE,
                 // thus we shift the pointers by 8 (8*sizeof(short) == 16 - ideal alignment)
-                Lr[k] = hsumBuf + costBufSize*hsumBufNRows + LrSize*k + D2*LrBorder + 8;
+                Lr[k] = hsumBuf + costBufSize*hsumBufNRows + LrSize*k + 8;
                 memset( Lr[k] + x1*D2 - 8, 0, (x2-x1)*D2*sizeof(CostType) );
-                minLr[k] = hsumBuf + costBufSize*hsumBufNRows + LrSize*NLR + minLrSize*k + LrBorder;
+                minLr[k] = hsumBuf + costBufSize*hsumBufNRows + LrSize*NLR + minLrSize*k;
                 memset( minLr[k] + x1, 0, (x2-x1)*sizeof(CostType) );
             }
 
@@ -922,60 +920,37 @@ struct CalcVerticalSums: public ParallelLoopBody
 
                 if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
                 {
-                    int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1;   //4e: for first line's block sum we need calculate half-window of costs and only one for other
+                    int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1;
 
                     for( k = dy1; k <= dy2; k++ )
                     {
-                        CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;   //4e: Ring buffer for horizontally summed lines
+                        CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;
 
                         if( k < height )
                         {
                             calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero, x1 - SW2, x2 + SW2);
 
                             memset(hsumAdd + x1*D, 0, D*sizeof(CostType));
-                            for( x = (x1 - SW2)*D; x <= (x1 + SW2)*D; x += D )                //4e: Calculation summed costs for all disparities in first pixel of line
+                            for( x = (x1 - SW2)*D; x <= (x1 + SW2)*D; x += D )
                             {
                                 int xbord = x <= 0 ? 0 : (x > (width1 - 1)*D? (width1 - 1)*D : x);
                                 for( d = 0; d < D; d++ )
                                     hsumAdd[x1*D + d] = (CostType)(hsumAdd[x1*D + d] + pixDiff[xbord + d]);
                             }
 
-                            if( y > 0 )             //4e: We calculate horizontal sums and forming full block sums for y coord by adding this horsums to previous line's sums and subtracting stored lowest
-                            {                       //4e: horsum in hsumBuf. Exception is case y=0, where we need many iterations per lines to create full blocking sum.
+                            if( y > 0 )
+                            {
                                 const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
                                 const CostType* Cprev = C - costBufSize;
 
-                                // We need to calculate C[x1] in different way, because hsumadd is already calculated
-                                // We don't doing then for x==0, because original function has forgotten to do this  //TODO: Check: does this original still exist?
-                                if(x1!=0)
-                                {
-                                    for( d = 0; d < D; d++ )
-                                        C[x1*D + d] = (CostType)(Cprev[x1*D + d] + hsumAdd[x1*D + d] - hsumSub[x1*D + d]);
-                                }
+                                for( d = 0; d < D; d++ )
+                                    C[x1*D + d] = (CostType)(Cprev[x1*D + d] + hsumAdd[x1*D + d] - hsumSub[x1*D + d]);
 
                                 for( x = (x1+1)*D; x < x2*D; x += D )
                                 {
                                     const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
                                     const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
 
-//                                #if CV_SIMD128
-//                                    if( useSIMD )
-//                                    {
-//                                        for( d = 0; d < D; d += 8 )
-//                                        {
-//                                            v_int16x8 hv = v_load(hsumAdd + x - D + d);
-//                                            v_int16x8 Cx = v_load(Cprev + x + d);
-//                                            v_int16x8 psub = v_load(pixSub + d);
-//                                            v_int16x8 padd = v_load(pixAdd + d);
-//                                            hv = (hv - psub + padd);
-//                                            psub = v_load(hsumSub + x + d);
-//                                            Cx = Cx - psub + hv;
-//                                            v_store(hsumAdd + x + d, hv);
-//                                            v_store(C + x + d, Cx);
-//                                        }
-//                                    }
-//                                    else
-//                                #endif
                                     {
                                         for( d = 0; d < D; d++ )
                                         {
@@ -987,8 +962,8 @@ struct CalcVerticalSums: public ParallelLoopBody
                             }
                             else
                             {
-                                for( x = (x1+1)*D; x < x2*D; x += D )  //4e: Calcluates horizontal sums if (y==0). This piece of code is calling SH2+1 times and then result is used in different way
-                                {                                   //4e: to create full blocks sum. That's why this code is isolated from upper case.
+                                for( x = (x1+1)*D; x < x2*D; x += D )
+                                {
                                     const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
                                     const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
 
@@ -996,10 +971,9 @@ struct CalcVerticalSums: public ParallelLoopBody
                                         hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
                                 }
                             }
-                            // Return to coordinates, which is needed by CalcCostBT
                         }
 
-                        if( y == 0 )        //4e: Calculating first full block sum.
+                        if( y == 0 )
                         {
                             int scale = k == 0 ? SH2 + 1 : 1;
                             for( x = x1*D; x < x2*D; x++ )
@@ -1008,26 +982,19 @@ struct CalcVerticalSums: public ParallelLoopBody
                     }
 
                     // also, clear the S buffer
-                    for( k = x1*D; k < x2*D; k++ )                                         //4e: only on first pass, so it keep old information, don't be confused
+                    for( k = x1*D; k < x2*D; k++ )
                         S[k] = 0;
                 }
 
-//                 [formula 13 in the paper]
-//                 compute L_r(p, d) = C(p, d) +
-//                 min(L_r(p-r, d),
-//                 L_r(p-r, d-1) + P1,
-//                 L_r(p-r, d+1) + P1,
-//                 min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
-//                 where p = (x,y), r is one of the directions.
-//                 we process all the directions at once:
-//                 0: r=(-dx, 0)
-//                 1: r=(-1, -dy)
-//                 2: r=(0, -dy)
-//                 3: r=(1, -dy)
-//                 4: r=(-2, -dy)
-//                 5: r=(-1, -dy*2)
-//                 6: r=(1, -dy*2)
-//                 7: r=(2, -dy)
+//              [formula 13 in the paper]
+//              compute L_r(p, d) = C(p, d) +
+//              min(L_r(p-r, d),
+//              L_r(p-r, d-1) + P1,
+//              L_r(p-r, d+1) + P1,
+//              min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
+//              where p = (x,y), r is one of the directions.
+//              we process one directions on first pass and other on second:
+//              r=(0, dy), where dy=1 on first pass and dy=-1 on second
 
                 for( x = x1; x != x2; x++ )
                 {
@@ -1043,88 +1010,12 @@ struct CalcVerticalSums: public ParallelLoopBody
                     const CostType* Cp = C + x*D;
                     CostType* Sp = S + x*D;
 
-//                #if CV_SIMD128
-//                    if( useSIMD )
-//                    {
-//                        v_int16x8 _P1 = v_setall_s16((short)P1);
-//
-//                        v_int16x8 _delta0 = v_setall_s16((short)delta0);
-//                        v_int16x8 _delta1 = v_setall_s16((short)delta1);
-//                        v_int16x8 _delta2 = v_setall_s16((short)delta2);
-//                        v_int16x8 _delta3 = v_setall_s16((short)delta3);
-//                        v_int16x8 _minL0 = v_setall_s16((short)MAX_COST);
-//
-//                        for( d = 0; d < D; d += 8 )
-//                        {
-//                            v_int16x8 Cpd = v_load(Cp + d);
-//                            v_int16x8 L0, L1, L2, L3;
-//
-//                            L0 = v_load(Lr_p0 + d);
-//                            L1 = v_load(Lr_p1 + d);
-//                            L2 = v_load(Lr_ppr + d);
-//                            L3 = v_load(Lr_p3 + d);
-//
-//                            L0 = v_min(L0, (v_load(Lr_p0 + d - 1) + _P1));
-//                            L0 = v_min(L0, (v_load(Lr_p0 + d + 1) + _P1));
-//
-//                            L1 = v_min(L1, (v_load(Lr_p1 + d - 1) + _P1));
-//                            L1 = v_min(L1, (v_load(Lr_p1 + d + 1) + _P1));
-//
-//                            L2 = v_min(L2, (v_load(Lr_ppr + d - 1) + _P1));
-//                            L2 = v_min(L2, (v_load(Lr_ppr + d + 1) + _P1));
-//
-//                            L3 = v_min(L3, (v_load(Lr_p3 + d - 1) + _P1));
-//                            L3 = v_min(L3, (v_load(Lr_p3 + d + 1) + _P1));
-//
-//                            L0 = v_min(L0, _delta0);
-//                            L0 = ((L0 - _delta0) + Cpd);
-//
-//                            L1 = v_min(L1, _delta1);
-//                            L1 = ((L1 - _delta1) + Cpd);
-//
-//                            L2 = v_min(L2, _delta2);
-//                            L2 = ((L2 - _delta2) + Cpd);
-//
-//                            L3 = v_min(L3, _delta3);
-//                            L3 = ((L3 - _delta3) + Cpd);
-//
-//                            v_store(Lr_p + d, L0);
-//                            v_store(Lr_p + d + D2, L1);
-//                            v_store(Lr_p + d + D2*2, L2);
-//                            v_store(Lr_p + d + D2*3, L3);
-//
-//                            // Get minimum from in L0-L3
-//                            v_int16x8 t02L, t02H, t13L, t13H, t0123L, t0123H;
-//                            v_zip(L0, L2, t02L, t02H);            // L0[0] L2[0] L0[1] L2[1]...
-//                            v_zip(L1, L3, t13L, t13H);            // L1[0] L3[0] L1[1] L3[1]...
-//                            v_int16x8 t02 = v_min(t02L, t02H);    // L0[i] L2[i] L0[i] L2[i]...
-//                            v_int16x8 t13 = v_min(t13L, t13H);    // L1[i] L3[i] L1[i] L3[i]...
-//                            v_zip(t02, t13, t0123L, t0123H);      // L0[i] L1[i] L2[i] L3[i]...
-//                            v_int16x8 t0 = v_min(t0123L, t0123H);
-//                            _minL0 = v_min(_minL0, t0);
-//
-//                            v_int16x8 Sval = v_load(Sp + d);
-//
-//                            L0 = L0 + L1;
-//                            L2 = L2 + L3;
-//                            Sval = Sval + L0;
-//                            Sval = Sval + L2;
-//
-//                            v_store(Sp + d, Sval);
-//                        }
-//
-//                        v_int32x4 minL, minH;
-//                        v_expand(_minL0, minL, minH);
-//                        v_pack_store(&minLr[0][x], v_min(minL, minH));
-//                    }
-//                    else
-//                    #endif
                     {
                         int minL = MAX_COST;
 
                         for( d = 0; d < D; d++ )
                         {
-                            int Cpd = Cp[d], L;   //4e: Remember, that every Cp is increased on P2 in line number 369. That's why next 4 lines are paper-like actually
+                            int Cpd = Cp[d], L;
 
                             L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;
 
@@ -1144,7 +1035,6 @@ struct CalcVerticalSums: public ParallelLoopBody
         }
     }
     static const int NLR = 2;
-    static const int LrBorder = NLR - 1;
     const Mat& img1;
     const Mat& img2;
     CostType* Cbuf;
@@ -1178,7 +1068,7 @@ struct CalcHorizontalSums: public ParallelLoopBody
         minD = params.minDisparity;
         maxD = minD + params.numDisparities;
         P1 = params.P1 > 0 ? params.P1 : 2;
-        P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);   //TODO: think about P1/S(x,y) Proportion
+        P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
         uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
         disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
         height = img1.rows;
@@ -1192,7 +1082,7 @@ struct CalcHorizontalSums: public ParallelLoopBody
         costBufSize = width1*D;
         CSBufSize = costBufSize*height;
         D2 = D + 16;
-        LrSize = 2 * D2;               //TODO: Check: do we need this value or not?
+        LrSize = 2 * D2;
         Cbuf = alignedBuf;
         Sbuf = Cbuf + CSBufSize;
     }
@@ -1200,13 +1090,11 @@ struct CalcHorizontalSums: public ParallelLoopBody
     void operator()( const Range& range ) const
     {
         int y1 = range.start, y2 = range.end;
-        static const CostType MAX_COST = SHRT_MAX;
-        static const int ALIGN = 16;
-        size_t auxBufsSize = LrSize + width*(sizeof(CostType) + sizeof(DispType)) + 32;
+        size_t auxBufsSize = LrSize * sizeof(CostType) + width*(sizeof(CostType) + sizeof(DispType)) + 32;
 
         Mat auxBuff;
         auxBuff.create(1, (int)auxBufsSize, CV_8U);
-        CostType *Lr = (CostType*)alignPtr(auxBuff.ptr(), ALIGN) + 8;
+        CostType *Lr = ((CostType*)alignPtr(auxBuff.ptr(), ALIGN)) + 8;
         CostType* disp2cost = Lr + LrSize;
         DispType* disp2ptr = (DispType*)(disp2cost + width);
 
@@ -1227,132 +1115,48 @@ struct CalcHorizontalSums: public ParallelLoopBody
 
             // clear buffers
             memset( Lr - 8, 0, LrSize*sizeof(CostType) );
+            Lr[-1] = Lr[D] = Lr[D2 - 1] = Lr[D2 + D] = MAX_COST;
+
             minLr = 0;
-            /*
-             [formula 13 in the paper]
-             compute L_r(p, d) = C(p, d) +
-             min(L_r(p-r, d),
-             L_r(p-r, d-1) + P1,
-             L_r(p-r, d+1) + P1,
-             min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
-             where p = (x,y), r is one of the directions.
-             we process all the directions at once:
-             0: r=(-dx, 0)
-             1: r=(-1, -dy)
-             2: r=(0, -dy)
-             3: r=(1, -dy)
-             4: r=(-2, -dy)
-             5: r=(-1, -dy*2)
-             6: r=(1, -dy*2)
-             7: r=(2, -dy)
-             */
+//          [formula 13 in the paper]
+//          compute L_r(p, d) = C(p, d) +
+//          min(L_r(p-r, d),
+//          L_r(p-r, d-1) + P1,
+//          L_r(p-r, d+1) + P1,
+//          min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
+//          where p = (x,y), r is one of the directions.
+//          we process all the directions at once:
+//          we process one directions on first pass and other on second:
+//          r=(dx, 0), where dx=1 on first pass and dx=-1 on second
             for( x = 0; x != width1; x++)
             {
                 int delta = minLr + P2;
 
                 CostType* Lr_ppr = Lr + ((x&1)? 0 : D2);
 
-                Lr_ppr[-1] = Lr_ppr[D] = MAX_COST;       //TODO: Well, probably, it's better do this once before.
-
                 CostType* Lr_p = Lr + ((x&1)? D2 :0);
                 const CostType* Cp = C + x*D;
                 CostType* Sp = S + x*D;
 
-//            #if CV_SIMD128
-//                if( useSIMD )
-//                {
-//                    v_int16x8 _P1 = v_setall_s16((short)P1);
-//
-//                    v_int16x8 _delta0 = v_setall_s16((short)delta0);
-//                    v_int16x8 _delta1 = v_setall_s16((short)delta1);
-//                    v_int16x8 _delta2 = v_setall_s16((short)delta2);
-//                    v_int16x8 _delta3 = v_setall_s16((short)delta3);
-//                    v_int16x8 _minL0 = v_setall_s16((short)MAX_COST);
-//
-//                    for( d = 0; d < D; d += 8 )
-//                    {
-//                        v_int16x8 Cpd = v_load(Cp + d);
-//                        v_int16x8 L0, L1, L2, L3;
-//
-//                        L0 = v_load(Lr_ppr + d);
-//                        L1 = v_load(Lr_p1 + d);
-//                        L2 = v_load(Lr_p2 + d);
-//                        L3 = v_load(Lr_p3 + d);
-//
-//                        L0 = v_min(L0, (v_load(Lr_ppr + d - 1) + _P1));
-//                        L0 = v_min(L0, (v_load(Lr_ppr + d + 1) + _P1));
-//
-//                        L1 = v_min(L1, (v_load(Lr_p1 + d - 1) + _P1));
-//                        L1 = v_min(L1, (v_load(Lr_p1 + d + 1) + _P1));
-//
-//                        L2 = v_min(L2, (v_load(Lr_p2 + d - 1) + _P1));
-//                        L2 = v_min(L2, (v_load(Lr_p2 + d + 1) + _P1));
-//
-//                        L3 = v_min(L3, (v_load(Lr_p3 + d - 1) + _P1));
-//                        L3 = v_min(L3, (v_load(Lr_p3 + d + 1) + _P1));
-//
-//                        L0 = v_min(L0, _delta0);
-//                        L0 = ((L0 - _delta0) + Cpd);
-//
-//                        L1 = v_min(L1, _delta1);
-//                        L1 = ((L1 - _delta1) + Cpd);
-//
-//                        L2 = v_min(L2, _delta2);
-//                        L2 = ((L2 - _delta2) + Cpd);
-//
-//                        L3 = v_min(L3, _delta3);
-//                        L3 = ((L3 - _delta3) + Cpd);
-//
-//                        v_store(Lr_p + d, L0);
-//                        v_store(Lr_p + d + D2, L1);
-//                        v_store(Lr_p + d + D2*2, L2);
-//                        v_store(Lr_p + d + D2*3, L3);
-//
-//                        // Get minimum from in L0-L3
-//                        v_int16x8 t02L, t02H, t13L, t13H, t0123L, t0123H;
-//                        v_zip(L0, L2, t02L, t02H);            // L0[0] L2[0] L0[1] L2[1]...
-//                        v_zip(L1, L3, t13L, t13H);            // L1[0] L3[0] L1[1] L3[1]...
-//                        v_int16x8 t02 = v_min(t02L, t02H);    // L0[i] L2[i] L0[i] L2[i]...
-//                        v_int16x8 t13 = v_min(t13L, t13H);    // L1[i] L3[i] L1[i] L3[i]...
-//                        v_zip(t02, t13, t0123L, t0123H);      // L0[i] L1[i] L2[i] L3[i]...
-//                        v_int16x8 t0 = v_min(t0123L, t0123H);
-//                        _minL0 = v_min(_minL0, t0);
-//
-//                        v_int16x8 Sval = v_load(Sp + d);
-//
-//                        L0 = L0 + L1;
-//                        L2 = L2 + L3;
-//                        Sval = Sval + L0;
-//                        Sval = Sval + L2;
-//
-//                        v_store(Sp + d, Sval);
-//                    }
-//
-//                    v_int32x4 minL, minH;
-//                    v_expand(_minL0, minL, minH);
-//                    v_pack_store(&minLr[x], v_min(minL, minH));
-//                }
-//                else
-//            #endif
-                {
-                    int minL = MAX_COST;
+                int minL = MAX_COST;
 
-                    for( d = 0; d < D; d++ )
-                    {
-                        int Cpd = Cp[d], L;   //4e: Remember, that every Cp is increased on P2 in line number 369. That's why next 4 lines are paper-like actually
+                for( d = 0; d < D; d++ )
+                {
+                    int Cpd = Cp[d], L;
 
-                        L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;
+                    L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;
 
-                        Lr_p[d] = (CostType)L;
-                        minL = std::min(minL, L);
+                    Lr_p[d] = (CostType)L;
+                    minL = std::min(minL, L);
 
-                        Sp[d] = saturate_cast<CostType>(Sp[d] + L);
-                    }
-                    minLr = (CostType)minL;
+                    Sp[d] = saturate_cast<CostType>(Sp[d] + L);
                 }
+                minLr = (CostType)minL;
             }
 
             memset( Lr - 8, 0, LrSize*sizeof(CostType) );
+            Lr[-1] = Lr[D] = Lr[D2 - 1] = Lr[D2 + D] = MAX_COST;
+
             minLr = 0;
 
             for( x = width1-1; x != -1; x--)
@@ -1361,136 +1165,30 @@ struct CalcHorizontalSums: public ParallelLoopBody
 
                 CostType* Lr_ppr = Lr + ((x&1)? 0 :D2);
 
-                Lr_ppr[-1] = Lr_ppr[D] = MAX_COST;
-
                 CostType* Lr_p = Lr + ((x&1)? D2 :0);
                 const CostType* Cp = C + x*D;
                 CostType* Sp = S + x*D;
                 int minS = MAX_COST, bestDisp = -1;
 
-//            #if CV_SIMD128
-//                if( useSIMD )
-//                {
-//                    v_int16x8 _P1 = v_setall_s16((short)P1);
-//
-//                    v_int16x8 _delta0 = v_setall_s16((short)delta0);
-//                    v_int16x8 _delta1 = v_setall_s16((short)delta1);
-//                    v_int16x8 _delta2 = v_setall_s16((short)delta2);
-//                    v_int16x8 _delta3 = v_setall_s16((short)delta3);
-//                    v_int16x8 _minL0 = v_setall_s16((short)MAX_COST);
-//
-//                    for( d = 0; d < D; d += 8 )
-//                    {
-//                        v_int16x8 Cpd = v_load(Cp + d);
-//                        v_int16x8 L0, L1, L2, L3;
-//
-//                        L0 = v_load(Lr_ppr + d);
-//                        L1 = v_load(Lr_p1 + d);
-//                        L2 = v_load(Lr_p2 + d);
-//                        L3 = v_load(Lr_p3 + d);
-//
-//                        L0 = v_min(L0, (v_load(Lr_ppr + d - 1) + _P1));
-//                        L0 = v_min(L0, (v_load(Lr_ppr + d + 1) + _P1));
-//
-//                        L1 = v_min(L1, (v_load(Lr_p1 + d - 1) + _P1));
-//                        L1 = v_min(L1, (v_load(Lr_p1 + d + 1) + _P1));
-//
-//                        L2 = v_min(L2, (v_load(Lr_p2 + d - 1) + _P1));
-//                        L2 = v_min(L2, (v_load(Lr_p2 + d + 1) + _P1));
-//
-//                        L3 = v_min(L3, (v_load(Lr_p3 + d - 1) + _P1));
-//                        L3 = v_min(L3, (v_load(Lr_p3 + d + 1) + _P1));
-//
-//                        L0 = v_min(L0, _delta0);
-//                        L0 = ((L0 - _delta0) + Cpd);
-//
-//                        L1 = v_min(L1, _delta1);
-//                        L1 = ((L1 - _delta1) + Cpd);
-//
-//                        L2 = v_min(L2, _delta2);
-//                        L2 = ((L2 - _delta2) + Cpd);
-//
-//                        L3 = v_min(L3, _delta3);
-//                        L3 = ((L3 - _delta3) + Cpd);
-//
-//                        v_store(Lr_p + d, L0);
-//                        v_store(Lr_p + d + D2, L1);
-//                        v_store(Lr_p + d + D2*2, L2);
-//                        v_store(Lr_p + d + D2*3, L3);
-//
-//                        // Get minimum from in L0-L3
-//                        v_int16x8 t02L, t02H, t13L, t13H, t0123L, t0123H;
-//                        v_zip(L0, L2, t02L, t02H);            // L0[0] L2[0] L0[1] L2[1]...
-//                        v_zip(L1, L3, t13L, t13H);            // L1[0] L3[0] L1[1] L3[1]...
-//                        v_int16x8 t02 = v_min(t02L, t02H);    // L0[i] L2[i] L0[i] L2[i]...
-//                        v_int16x8 t13 = v_min(t13L, t13H);    // L1[i] L3[i] L1[i] L3[i]...
-//                        v_zip(t02, t13, t0123L, t0123H);      // L0[i] L1[i] L2[i] L3[i]...
-//                        v_int16x8 t0 = v_min(t0123L, t0123H);
-//                        _minL0 = v_min(_minL0, t0);
-//
-//                        v_int16x8 Sval = v_load(Sp + d);
-//
-//                        L0 = L0 + L1;
-//                        L2 = L2 + L3;
-//                        Sval = Sval + L0;
-//                        Sval = Sval + L2;
-//
-//                        v_store(Sp + d, Sval);
-//                    }
-//
-//                    v_int32x4 minL, minH;
-//                    v_expand(_minL0, minL, minH);
-//                    v_pack_store(&minLr[x], v_min(minL, minH));
-//                }
-//                else
-//            #endif
-//TODO:Next piece of code is came from postprocessing. Be very careful with joining them!!!
-//                #if CV_SIMD128
-//                    if( useSIMD )
-//                    {
-//                        v_int16x8 _minS = v_setall_s16(MAX_COST), _bestDisp = v_setall_s16(-1);
-//                        v_int16x8 _d8 = v_int16x8(0, 1, 2, 3, 4, 5, 6, 7), _8 = v_setall_s16(8);
-//
-//                        for( d = 0; d < D; d+= 8 )
-//                        {
-//                            v_int16x8 L0 = v_load(Sp + d);
-//                            v_int16x8 mask = L0 < _minS;
-//                            _minS = v_min( L0, _minS );
-//                            _bestDisp = _bestDisp ^ ((_bestDisp ^ _d8) & mask);
-//                            _d8 = _d8 + _8;
-//                        }
-//                        v_int32x4 _d0, _d1;
-//                        v_expand(_minS, _d0, _d1);
-//                        minS = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1));
-//                        v_int16x8 v_mask = v_setall_s16((short)minS) == _minS;
-//
-//                        _bestDisp = (_bestDisp & v_mask) | (v_setall_s16(SHRT_MAX) & ~v_mask);
-//                        v_expand(_bestDisp, _d0, _d1);
-//                        bestDisp = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1));
-//                    }
-//                    else
-//                #endif
-                {
-                    int minL = MAX_COST;
+                int minL = MAX_COST;
 
-                    for( d = 0; d < D; d++ )
-                    {
-                        int Cpd = Cp[d], L;   //4e: Remember, that every Cp is increased on P2 in line number 369. That's why next 4 lines are paper-like actually
+                for( d = 0; d < D; d++ )
+                {
+                    int Cpd = Cp[d], L;
 
-                        L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;
+                    L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;
 
-                        Lr_p[d] = (CostType)L;
-                        minL = std::min(minL, L);
+                    Lr_p[d] = (CostType)L;
+                    minL = std::min(minL, L);
 
-                        Sp[d] = saturate_cast<CostType>(Sp[d] + L);
-                        if( Sp[d] < minS )
-                        {
-                            minS = Sp[d];
-                            bestDisp = d;
-                        }
+                    Sp[d] = saturate_cast<CostType>(Sp[d] + L);
+                    if( Sp[d] < minS )
+                    {
+                        minS = Sp[d];
+                        bestDisp = d;
                     }
-                    minLr = (CostType)minL;
                 }
+                    minLr = (CostType)minL;
                 //Some postprocessing procedures and saving
                 for( d = 0; d < D; d++ )
                 {
@@ -1531,17 +1229,17 @@ struct CalcHorizontalSums: public ParallelLoopBody
                 int _d = d1 >> DISP_SHIFT;
                 int d_ = (d1 + DISP_SCALE-1) >> DISP_SHIFT;
                 int _x = x - _d, x_ = x - d_;
-                if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff &&         //4e: To dismiss disparity, we should assure, that there is no any
-                   0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff )           //4e: chance to understand this as correct.
+                if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff &&
+                   0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff )
                     disp1ptr[x] = (DispType)INVALID_DISP_SCALED;
             }
         }
     }
 
-    static const int NLR = 2;
-    static const int LrBorder = NLR - 1;
     static const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
     static const int DISP_SCALE = (1 << DISP_SHIFT);
+    static const CostType MAX_COST = SHRT_MAX;
+    static const int ALIGN = 16;
     const Mat& img1;
     const Mat& img2;
     Mat& disp1;
@@ -1567,68 +1265,41 @@ struct CalcHorizontalSums: public ParallelLoopBody
     int disp12MaxDiff;
 };
 /*
- This is new experimential version of disparity calculation, which should be parralled after
-TODO: Don't forget to rewrire this commentaries after
-
  computes disparity for "roi" in img1 w.r.t. img2 and write it to disp1buf.
  that is, disp1buf(x, y)=d means that img1(x+roi.x, y+roi.y) ~ img2(x+roi.x-d, y+roi.y).
  minD <= d < maxD.
- disp2full is the reverse disparity map, that is:
- disp2full(x+roi.x,y+roi.y)=d means that img2(x+roi.x, y+roi.y) ~ img1(x+roi.x+d, y+roi.y)
 
  note that disp1buf will have the same size as the roi and
- disp2full will have the same size as img1 (or img2).
- On exit disp2buf is not the final disparity, it is an intermediate result that becomes
+ On exit disp1buf is not the final disparity, it is an intermediate result that becomes
  final after all the tiles are processed.
 
  the disparity in disp1buf is written with sub-pixel accuracy
  (4 fractional bits, see StereoSGBM::DISP_SCALE),
  using quadratic interpolation, while the disparity in disp2buf
  is written as is, without interpolation.
-
- disp2cost also has the same size as img1 (or img2).
- It contains the minimum current cost, used to find the best disparity, corresponding to the minimal cost.
  */
-static void computeDisparitySGBMParallel( const Mat& img1, const Mat& img2,
+static void computeDisparitySGBM_HH4( const Mat& img1, const Mat& img2,
                                  Mat& disp1, const StereoSGBMParams& params,
                                  Mat& buffer )
 {
-//#if CV_SIMD128
-//    // maxDisparity is supposed to multiple of 16, so we can forget doing else
-//    static const uchar LSBTab[] =
-//    {
-//        0, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
-//        5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
-//        6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
-//        5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
-//        7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
-//        5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
-//        6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
-//        5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
-//    };
-//    static const v_uint16x8 v_LSB = v_uint16x8(0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
-//
-//    bool useSIMD = hasSIMD128();
-//#endif
-
     const int ALIGN = 16;
     const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
     const int DISP_SCALE = (1 << DISP_SHIFT);
     int minD = params.minDisparity, maxD = minD + params.numDisparities;
-    Size SADWindowSize;                                                                                 //4e: SAD means Sum of Absolute Differences
-    SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;   //4e: and this is always square
-    int ftzero = std::max(params.preFilterCap, 15) | 1;                             //4e:ftzero clips x-derivatives. I think, this story with arrays is about non-realized SIMD method
-    int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);   //TODO: think about P1/S(x,y) Proportion
+    Size SADWindowSize;
+    SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;
+    int ftzero = std::max(params.preFilterCap, 15) | 1;
+    int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
     int k, width = disp1.cols, height = disp1.rows;
     int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
     int D = maxD - minD, width1 = maxX1 - minX1;
     int SH2 = SADWindowSize.height/2;
     int INVALID_DISP = minD - 1;
     int INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
-    const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;                  //4e: array is such big due to derivative could be +-8*256 in worst cases
+    const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
     PixType clipTab[TAB_SIZE];
 
-    for( k = 0; k < TAB_SIZE; k++ )         //4e: If ftzero would = 4, array containment will be = -4 -4 -4 ... -4 -3 -2 -1 0 1 2 3 4 ... 4 4 4
+    for( k = 0; k < TAB_SIZE; k++ )
         clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
 
     if( minX1 >= maxX1 )
@@ -1637,28 +1308,25 @@ static void computeDisparitySGBMParallel( const Mat& img1, const Mat& img2,
         return;
     }
 
-    CV_Assert( D % 16 == 0 );                    //TODO: Are you sure? By the way, why not 8?
+    CV_Assert( D % 16 == 0 );
 
-    // NR - the number of directions. the loop on x below that computes Lr assumes that NR == 8.
-    // if you change NR, please, modify the loop as well.
-    int D2 = D+16;                  //4e: Somewhere in code we need d+1, so D+1. One of simplest solutuons is increasing D-dimension on 1. But 1 is 16, when storage should be aligned.
+    int D2 = D+16;
 
     // the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
-    // for 8-way dynamic programming we need the current row and
+    // for dynamic programming we need the current row and
     // the previous row, i.e. 2 rows in total
-    const int NLR = 2;                          //4e: We assume, that we need one or more previous steps in our linear dynamic(one right here).
-    const int LrBorder = NLR - 1;               //4e: for simplification of calculations we need border for taking previous dynamic solutions.
+    const int NLR = 2;
 
     // for each possible stereo match (img1(x,y) <=> img2(x-d,y))
-    // we keep pixel difference cost (C) and the summary cost over NR directions (S).
+    // we keep pixel difference cost (C) and the summary cost over 4 directions (S).
     // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
     size_t costBufSize = width1*D;
     size_t CSBufSize = costBufSize*height;
-    size_t minLrSize = (width1 + LrBorder*2), LrSize = minLrSize*D2;        //TODO: We don't need LrBorder for vertical passes and we don't need Lr buffer for horizontal passes.
+    size_t minLrSize = width1 , LrSize = minLrSize*D2;
     int hsumBufNRows = SH2*2 + 2;
     size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[]
-    costBufSize*hsumBufNRows*sizeof(CostType) +                       // hsumBuf                    //4e: TODO: Why we should increase sum window height one more time?
-    CSBufSize*2*sizeof(CostType) + 1024;                              // C, S                       //4e: C is Block sum of costs, S is multidirectional dynamic sum with same size
+    costBufSize*hsumBufNRows*sizeof(CostType) +                       // hsumBuf
+    CSBufSize*2*sizeof(CostType) + 1024;                              // C, S
 
     if( buffer.empty() || !buffer.isContinuous() ||
         buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
@@ -1671,8 +1339,8 @@ static void computeDisparitySGBMParallel( const Mat& img1, const Mat& img2,
     for(k = 0; k < (int)CSBufSize; k++ )
         Cbuf[k] = (CostType)P2;
 
-    parallel_for_(Range(0,width1),CalcVerticalSums(img1, img2, params, Cbuf, clipTab));
-    parallel_for_(Range(0,height),CalcHorizontalSums(img1, img2, disp1, params, Cbuf));
+    parallel_for_(Range(0,width1),CalcVerticalSums(img1, img2, params, Cbuf, clipTab),8);
+    parallel_for_(Range(0,height),CalcHorizontalSums(img1, img2, disp1, params, Cbuf),8);
 
 }
 
@@ -2331,7 +1999,7 @@ public:
         if(params.mode==MODE_SGBM_3WAY)
             computeDisparity3WaySGBM( left, right, disp, params, buffers, num_stripes );
         else if(params.mode==MODE_HH4)
-            computeDisparitySGBMParallel( left, right, disp, params, buffer );
+            computeDisparitySGBM_HH4( left, right, disp, params, buffer );
         else
             computeDisparitySGBM( left, right, disp, params, buffer );
 
diff --git a/modules/calib3d/test/test_stereomatching.cpp b/modules/calib3d/test/test_stereomatching.cpp
index 9a338bb11f..08a955bb3a 100644
--- a/modules/calib3d/test/test_stereomatching.cpp
+++ b/modules/calib3d/test/test_stereomatching.cpp
@@ -785,37 +785,22 @@ protected:
 TEST(Calib3d_StereoBM, regression) { CV_StereoBMTest test; test.safe_run(); }
 TEST(Calib3d_StereoSGBM, regression) { CV_StereoSGBMTest test; test.safe_run(); }
 
-TEST(Calib3d_StereoSGBMPar, idontknowhowtotesthere)
+TEST(Calib3d_StereoSGBM_HH4, regression)
 {
-//	<!-- caseName, datasetName, numDisp, winSize, mode -->
-//	case_teddy_2 teddy "48" "3" "MODE_HH"
-
-//Ptr<StereoSGBM> StereoSGBM::create(int minDisparity, int numDisparities, int SADWindowSize,
-//                                 int P1, int P2, int disp12MaxDiff,
-//                                 int preFilterCap, int uniquenessRatio,
-//                                 int speckleWindowSize, int speckleRange,
-//                                 int mode)
-    Mat leftImg = imread("/home/q/Work/GitVault/opencv_extra/testdata/cv/stereomatching/datasets/teddy/im2.png");
-    Mat rightImg = imread("/home/q/Work/GitVault/opencv_extra/testdata/cv/stereomatching/datasets/teddy/im6.png");
-//    Mat leftDisp_old, leftDisp_new;
-    {
-        Mat leftDisp;
-        Ptr<StereoSGBM> sgbm = StereoSGBM::create( 0, 48, 3, 90, 360, 1, 63, 10, 100, 32, StereoSGBM::MODE_HH);
-        sgbm->compute( leftImg, rightImg, leftDisp);
-        CV_Assert( leftDisp.type() == CV_16SC1 );
-        leftDisp/=8;
-        imwrite( "/home/q/Work/GitVault/modehh4_new.jpg", leftDisp);
-    }
+    String path = cvtest::TS::ptr()->get_data_path() + "cv/stereomatching/datasets/teddy/";
+    Mat leftImg = imread(path + "im2.png", 0);
+    Mat rightImg = imread(path + "im6.png", 0);
+    Mat testData = imread(path + "disp2_hh4.png",-1);
+    Mat leftDisp;
+    Mat toCheck;
     {
-        Mat leftDisp;
         Ptr<StereoSGBM> sgbm = StereoSGBM::create( 0, 48, 3, 90, 360, 1, 63, 10, 100, 32, StereoSGBM::MODE_HH4);
         sgbm->compute( leftImg, rightImg, leftDisp);
         CV_Assert( leftDisp.type() == CV_16SC1 );
-        leftDisp/=8;
-        imwrite( "/home/q/Work/GitVault/modehh4_old.jpg", leftDisp);
+        leftDisp.convertTo(toCheck, CV_16UC1);
+//        imwrite("/home/q/Work/GitVault/disp2_hh4.png",toCheck);
     }
-//    Mat diff;
-//    absdiff(leftDisp_old,leftDisp_new,diff);
-//    CV_Assert( countNonZero(diff)==0);
-//
+    Mat diff;
+    absdiff(toCheck, testData,diff);
+    CV_Assert( countNonZero(diff)==0);
 }

From d6bc6895a6ea2c5958bc9722e0adaeab4e37b4f1 Mon Sep 17 00:00:00 2001
From: Pyotr Chekmaryov <4ekmah@gmail.com>
Date: Thu, 27 Apr 2017 10:05:55 +0000
Subject: [PATCH 6/6] Test data correction.

---
 modules/calib3d/test/test_stereomatching.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/modules/calib3d/test/test_stereomatching.cpp b/modules/calib3d/test/test_stereomatching.cpp
index 08a955bb3a..d4f20b163d 100644
--- a/modules/calib3d/test/test_stereomatching.cpp
+++ b/modules/calib3d/test/test_stereomatching.cpp
@@ -797,8 +797,7 @@ TEST(Calib3d_StereoSGBM_HH4, regression)
         Ptr<StereoSGBM> sgbm = StereoSGBM::create( 0, 48, 3, 90, 360, 1, 63, 10, 100, 32, StereoSGBM::MODE_HH4);
         sgbm->compute( leftImg, rightImg, leftDisp);
         CV_Assert( leftDisp.type() == CV_16SC1 );
-        leftDisp.convertTo(toCheck, CV_16UC1);
-//        imwrite("/home/q/Work/GitVault/disp2_hh4.png",toCheck);
+        leftDisp.convertTo(toCheck, CV_16UC1,1,16);
     }
     Mat diff;
     absdiff(toCheck, testData,diff);