diff --git a/3rdparty/ippicv/ippicv.cmake b/3rdparty/ippicv/ippicv.cmake
index ae8748c283..257af6fcc6 100644
--- a/3rdparty/ippicv/ippicv.cmake
+++ b/3rdparty/ippicv/ippicv.cmake
@@ -2,37 +2,32 @@ function(download_ippicv root_var)
   set(${root_var} "" PARENT_SCOPE)
 
   # Commit SHA in the opencv_3rdparty repo
-  set(IPPICV_COMMIT "32e315a5b106a7b89dbed51c28f8120a48b368b4")
+  set(IPPICV_COMMIT "a56b6ac6f030c312b2dce17430eef13aed9af274")
   # Define actual ICV versions
   if(APPLE)
     set(OPENCV_ICV_PLATFORM "macosx")
     set(OPENCV_ICV_PACKAGE_SUBDIR "ippicv_mac")
-    if(X86_64)
-      set(OPENCV_ICV_NAME "ippicv_2019_mac_intel64_general_20180723.tgz")
-      set(OPENCV_ICV_HASH "fe6b2bb75ae0e3f19ad3ae1a31dfa4a2")
-    else()
-      set(OPENCV_ICV_NAME "ippicv_2019_mac_ia32_general_20180723.tgz")
-      set(OPENCV_ICV_HASH "b5dfa78c87eb75c64470cbe5ec876f4f")
-    endif()
+    set(OPENCV_ICV_NAME "ippicv_2020_mac_intel64_20191018_general.tgz")
+    set(OPENCV_ICV_HASH "1c3d675c2a2395d094d523024896e01b")
   elseif((UNIX AND NOT ANDROID) OR (UNIX AND ANDROID_ABI MATCHES "x86"))
     set(OPENCV_ICV_PLATFORM "linux")
     set(OPENCV_ICV_PACKAGE_SUBDIR "ippicv_lnx")
     if(X86_64)
-      set(OPENCV_ICV_NAME "ippicv_2019_lnx_intel64_general_20180723.tgz")
-      set(OPENCV_ICV_HASH "c0bd78adb4156bbf552c1dfe90599607")
+      set(OPENCV_ICV_NAME "ippicv_2020_lnx_intel64_20191018_general.tgz")
+      set(OPENCV_ICV_HASH "7421de0095c7a39162ae13a6098782f9")
     else()
-      set(OPENCV_ICV_NAME "ippicv_2019_lnx_ia32_general_20180723.tgz")
-      set(OPENCV_ICV_HASH "4f38432c30bfd6423164b7a24bbc98a0")
+      set(OPENCV_ICV_NAME "ippicv_2020_lnx_ia32_20191018_general.tgz")
+      set(OPENCV_ICV_HASH "ad189a940fb60eb71f291321322fe3e8")
     endif()
   elseif(WIN32 AND NOT ARM)
     set(OPENCV_ICV_PLATFORM "windows")
     set(OPENCV_ICV_PACKAGE_SUBDIR "ippicv_win")
     if(X86_64)
-      set(OPENCV_ICV_NAME "ippicv_2019_win_intel64_20180723_general.zip")
-      set(OPENCV_ICV_HASH "1d222685246896fe089f88b8858e4b2f")
+      set(OPENCV_ICV_NAME "ippicv_2020_win_intel64_20191018_general.zip")
+      set(OPENCV_ICV_HASH "879741a7946b814455eee6c6ffde2984")
     else()
-      set(OPENCV_ICV_NAME "ippicv_2019_win_ia32_20180723_general.zip")
-      set(OPENCV_ICV_HASH "0157251a2eb9cd63a3ebc7eed0f3e59e")
+      set(OPENCV_ICV_NAME "ippicv_2020_win_ia32_20191018_general.zip")
+      set(OPENCV_ICV_HASH "cd39bdf0c2e1cac9a61101dad7a2413e")
     endif()
   else()
     return()
diff --git a/3rdparty/openexr/Imath/ImathQuat.h b/3rdparty/openexr/Imath/ImathQuat.h
index e95e356d59..e01d10b7c3 100644
--- a/3rdparty/openexr/Imath/ImathQuat.h
+++ b/3rdparty/openexr/Imath/ImathQuat.h
@@ -60,6 +60,7 @@
 #include "ImathNamespace.h"
 
 #include <iostream>
+#include <algorithm>
 
 IMATH_INTERNAL_NAMESPACE_HEADER_ENTER
 
diff --git a/cmake/OpenCVFindIPP.cmake b/cmake/OpenCVFindIPP.cmake
index f938e21a57..79555f60d9 100644
--- a/cmake/OpenCVFindIPP.cmake
+++ b/cmake/OpenCVFindIPP.cmake
@@ -236,6 +236,10 @@ if(DEFINED ENV{OPENCV_IPP_PATH} AND NOT DEFINED IPPROOT)
 endif()
 
 if(NOT DEFINED IPPROOT)
+  if(APPLE AND NOT IPP_X64)
+    message(STATUS "IPPICV: 32-bit binaries are not supported on Apple platform (MacOSX)")
+    return()
+  endif()
   include("${OpenCV_SOURCE_DIR}/3rdparty/ippicv/ippicv.cmake")
   download_ippicv(ICV_PACKAGE_ROOT)
   if(NOT ICV_PACKAGE_ROOT)
diff --git a/doc/tutorials/videoio/video-input-psnr-ssim/video_input_psnr_ssim.markdown b/doc/tutorials/videoio/video-input-psnr-ssim/video_input_psnr_ssim.markdown
index 4005bbff3e..f43790c297 100644
--- a/doc/tutorials/videoio/video-input-psnr-ssim/video_input_psnr_ssim.markdown
+++ b/doc/tutorials/videoio/video-input-psnr-ssim/video_input_psnr_ssim.markdown
@@ -25,7 +25,13 @@ version of it ](https://github.com/opencv/opencv/tree/master/samples/data/Megami
 You may also find the source code and these video file in the
 `samples/data` folder of the OpenCV source library.
 
+@add_toggle_cpp
 @include cpp/tutorial_code/videoio/video-input-psnr-ssim/video-input-psnr-ssim.cpp
+@end_toggle
+
+@add_toggle_python
+@include samples/python/tutorial_code/videoio/video-input-psnr-ssim.py
+@end_toggle
 
 How to read a video stream (online-camera or offline-file)?
 -----------------------------------------------------------
@@ -139,28 +145,15 @@ an invalid divide by zero operation in the PSNR formula. In this case the PSNR i
 we'll need to handle this case separately. The transition to a logarithmic scale is made because the
 pixel values have a very wide dynamic range. All this translated to OpenCV and a C++ function looks
 like:
-@code{.cpp}
-double getPSNR(const Mat& I1, const Mat& I2)
-{
- Mat s1;
- absdiff(I1, I2, s1);       // |I1 - I2|
- s1.convertTo(s1, CV_32F);  // cannot make a square on 8 bits
- s1 = s1.mul(s1);           // |I1 - I2|^2
-
- Scalar s = sum(s1);        // sum elements per channel
-
- double sse = s.val[0] + s.val[1] + s.val[2]; // sum channels
-
- if( sse <= 1e-10) // for small values return zero
-     return 0;
- else
- {
-     double  mse =sse /(double)(I1.channels() * I1.total());
-     double psnr = 10.0*log10((255*255)/mse);
-     return psnr;
- }
-}
-@endcode
+
+@add_toggle_cpp
+@snippet cpp/tutorial_code/videoio/video-input-psnr-ssim/video-input-psnr-ssim.cpp get-psnr
+@end_toggle
+
+@add_toggle_python
+@snippet samples/python/tutorial_code/videoio/video-input-psnr-ssim.py get-psnr
+@end_toggle
+
 Typically result values are anywhere between 30 and 50 for video compression, where higher is
 better. If the images significantly differ you'll get much lower ones like 15 and so. This
 similarity check is easy and fast to calculate, however in practice it may turn out somewhat
@@ -176,60 +169,14 @@ implementation below.
     Simoncelli, "Image quality assessment: From error visibility to structural similarity," IEEE
     Transactions on Image Processing, vol. 13, no. 4, pp. 600-612, Apr. 2004." article.
 
-@code{.cpp}
-Scalar getMSSIM( const Mat& i1, const Mat& i2)
-{
- const double C1 = 6.5025, C2 = 58.5225;
- /***************************** INITS **********************************/
- int d     = CV_32F;
-
- Mat I1, I2;
- i1.convertTo(I1, d);           // cannot calculate on one byte large values
- i2.convertTo(I2, d);
-
- Mat I2_2   = I2.mul(I2);        // I2^2
- Mat I1_2   = I1.mul(I1);        // I1^2
- Mat I1_I2  = I1.mul(I2);        // I1 * I2
+@add_toggle_cpp
+@snippet cpp/tutorial_code/videoio/video-input-psnr-ssim/video-input-psnr-ssim.cpp get-mssim
+@end_toggle
 
- /***********************PRELIMINARY COMPUTING ******************************/
+@add_toggle_python
+@snippet samples/python/tutorial_code/videoio/video-input-psnr-ssim.py get-mssim
+@end_toggle
 
- Mat mu1, mu2;   //
- GaussianBlur(I1, mu1, Size(11, 11), 1.5);
- GaussianBlur(I2, mu2, Size(11, 11), 1.5);
-
- Mat mu1_2   =   mu1.mul(mu1);
- Mat mu2_2   =   mu2.mul(mu2);
- Mat mu1_mu2 =   mu1.mul(mu2);
-
- Mat sigma1_2, sigma2_2, sigma12;
-
- GaussianBlur(I1_2, sigma1_2, Size(11, 11), 1.5);
- sigma1_2 -= mu1_2;
-
- GaussianBlur(I2_2, sigma2_2, Size(11, 11), 1.5);
- sigma2_2 -= mu2_2;
-
- GaussianBlur(I1_I2, sigma12, Size(11, 11), 1.5);
- sigma12 -= mu1_mu2;
-
- ///////////////////////////////// FORMULA ////////////////////////////////
- Mat t1, t2, t3;
-
- t1 = 2 * mu1_mu2 + C1;
- t2 = 2 * sigma12 + C2;
- t3 = t1.mul(t2);              // t3 = ((2*mu1_mu2 + C1).*(2*sigma12 + C2))
-
- t1 = mu1_2 + mu2_2 + C1;
- t2 = sigma1_2 + sigma2_2 + C2;
- t1 = t1.mul(t2);               // t1 =((mu1_2 + mu2_2 + C1).*(sigma1_2 + sigma2_2 + C2))
-
- Mat ssim_map;
- divide(t3, t1, ssim_map);      // ssim_map =  t3./t1;
-
- Scalar mssim = mean( ssim_map ); // mssim = average of ssim map
- return mssim;
-}
-@endcode
 This will return a similarity index for each channel of the image. This value is between zero and
 one, where one corresponds to perfect fit. Unfortunately, the many Gaussian blurring is quite
 costly, so while the PSNR may work in a real time like environment (24 frame per second) this will
diff --git a/modules/calib3d/src/rho.cpp b/modules/calib3d/src/rho.cpp
index 3cfa6b19e8..341b6b9063 100644
--- a/modules/calib3d/src/rho.cpp
+++ b/modules/calib3d/src/rho.cpp
@@ -55,7 +55,7 @@
 #include <math.h>
 #include <vector>
 #include "rho.h"
-
+#include "opencv2/core/utils/buffer_area.private.hpp"
 
 
 
@@ -65,7 +65,6 @@ namespace cv{/* For C support, replace with extern "C" { */
 
 
 /* Constants */
-const int    MEM_ALIGN              = 32;
 const size_t HSIZE                  = (3*3*sizeof(float));
 const double MIN_DELTA_CHNG         = 0.1;
 // const double CHI_STAT               = 2.706;
@@ -312,16 +311,14 @@ struct RHO_HEST_REFC : RHO_HEST{
 
     /* Levenberg-Marquardt Refinement */
     struct{
-        float  (* JtJ)[8];         /* JtJ matrix */
-        float  (* tmp1)[8];        /* Temporary 1 */
+        float*    JtJ;             /* JtJ matrix */
+        float*    tmp1;            /* Temporary 1 */
         float*    Jte;             /* Jte vector */
     } lm;
 
     /* Memory Management */
-    struct{
-        cv::Mat perObj;
-        cv::Mat perRun;
-    } mem;
+    utils::BufferArea runArea;
+    utils::BufferArea objArea;
 
     /* Initialized? */
     int initialized;
@@ -659,16 +656,9 @@ inline int    RHO_HEST_REFC::initialize(void){
 
     fastSeed((uint64_t)~0);
 
+    initialized = 1;
 
-    int areAllAllocsSuccessful = !mem.perObj.empty();
-
-    if(!areAllAllocsSuccessful){
-        finalize();
-    }else{
-        initialized = 1;
-    }
-
-    return areAllAllocsSuccessful;
+    return true;
 }
 
 /**
@@ -835,45 +825,14 @@ unsigned RHO_HEST_REFC::rhoHest(const float*   src,     /* Source points */
  */
 
 inline void   RHO_HEST_REFC::allocatePerObj(void){
-    /* We have known sizes */
-    size_t ctrl_smpl_sz   = SMPL_SIZE*sizeof(*ctrl.smpl);
-    size_t curr_pkdPts_sz = SMPL_SIZE*2*2*sizeof(*curr.pkdPts);
-    size_t curr_H_sz      = HSIZE;
-    size_t best_H_sz      = HSIZE;
-    size_t lm_JtJ_sz      = 8*8*sizeof(float);
-    size_t lm_tmp1_sz     = 8*8*sizeof(float);
-    size_t lm_Jte_sz      = 1*8*sizeof(float);
-
-    /* We compute offsets */
-    size_t total = 0;
-#define MK_OFFSET(v)                                     \
-    size_t v ## _of = total;                             \
-    total = alignSize(v ## _of  +  v ## _sz, MEM_ALIGN)
-
-    MK_OFFSET(ctrl_smpl);
-    MK_OFFSET(curr_pkdPts);
-    MK_OFFSET(curr_H);
-    MK_OFFSET(best_H);
-    MK_OFFSET(lm_JtJ);
-    MK_OFFSET(lm_tmp1);
-    MK_OFFSET(lm_Jte);
-
-#undef MK_OFFSET
-
-    /* Allocate dynamic memory managed by cv::Mat */
-    mem.perObj.create(1, (int)(total + MEM_ALIGN), CV_8UC1);
-
-    /* Extract aligned pointer */
-    unsigned char* ptr = alignPtr(mem.perObj.data, MEM_ALIGN);
-
-    /* Assign pointers */
-    ctrl.smpl   = (unsigned*)  (ptr + ctrl_smpl_of);
-    curr.pkdPts = (float*)     (ptr + curr_pkdPts_of);
-    curr.H      = (float*)     (ptr + curr_H_of);
-    best.H      = (float*)     (ptr + best_H_of);
-    lm.JtJ      = (float(*)[8])(ptr + lm_JtJ_of);
-    lm.tmp1     = (float(*)[8])(ptr + lm_tmp1_of);
-    lm.Jte      = (float*)     (ptr + lm_Jte_of);
+    objArea.allocate(ctrl.smpl, SMPL_SIZE);              /* Per-object buffers; counts are in elements, not bytes */
+    objArea.allocate(curr.pkdPts, SMPL_SIZE*2*2);
+    objArea.allocate(curr.H, HSIZE/sizeof(float));       /* HSIZE is a byte count (3*3*sizeof(float)); convert to 9 floats */
+    objArea.allocate(best.H, HSIZE/sizeof(float));       /* Same conversion as curr.H */
+    objArea.allocate(lm.JtJ, 8*8);                       /* 8x8 matrix stored flat; cast to float(*)[8] at use sites */
+    objArea.allocate(lm.tmp1, 8*8);
+    objArea.allocate(lm.Jte, 1*8);
+    objArea.commit();
 }
 
 
@@ -885,30 +844,9 @@ inline void   RHO_HEST_REFC::allocatePerObj(void){
  */
 
 inline void   RHO_HEST_REFC::allocatePerRun(void){
-    /* We have known sizes */
-    size_t best_inl_sz = arg.N;
-    size_t curr_inl_sz = arg.N;
-
-    /* We compute offsets */
-    size_t total = 0;
-#define MK_OFFSET(v)                                     \
-    size_t v ## _of = total;                             \
-    total = alignSize(v ## _of  +  v ## _sz, MEM_ALIGN)
-
-    MK_OFFSET(best_inl);
-    MK_OFFSET(curr_inl);
-
-#undef MK_OFFSET
-
-    /* Allocate dynamic memory managed by cv::Mat */
-    mem.perRun.create(1, (int)(total + MEM_ALIGN), CV_8UC1);
-
-    /* Extract aligned pointer */
-    unsigned char* ptr = alignPtr(mem.perRun.data, MEM_ALIGN);
-
-    /* Assign pointers */
-    best.inl  = (char*)(ptr + best_inl_of);
-    curr.inl  = (char*)(ptr + curr_inl_of);
+    runArea.allocate(best.inl, arg.N);
+    runArea.allocate(curr.inl, arg.N);
+    runArea.commit();
 }
 
 
@@ -919,10 +857,7 @@ inline void   RHO_HEST_REFC::allocatePerRun(void){
  */
 
 inline void   RHO_HEST_REFC::deallocatePerRun(void){
-    best.inl  = NULL;
-    curr.inl  = NULL;
-
-    mem.perRun.release();
+    runArea.release();
 }
 
 
@@ -933,15 +868,7 @@ inline void   RHO_HEST_REFC::deallocatePerRun(void){
  */
 
 inline void   RHO_HEST_REFC::deallocatePerObj(void){
-    ctrl.smpl   = NULL;
-    curr.pkdPts = NULL;
-    curr.H      = NULL;
-    best.H      = NULL;
-    lm.JtJ      = NULL;
-    lm.tmp1     = NULL;
-    lm.Jte      = NULL;
-
-    mem.perObj.release();
+    objArea.release();
 }
 
 
@@ -2144,7 +2071,7 @@ inline void   RHO_HEST_REFC::refine(void){
      */
     /* Find initial conditions */
     sacCalcJacobianErrors(best.H, arg.src, arg.dst, best.inl, arg.N,
-                          lm.JtJ, lm.Jte,  &S);
+                          (float(*)[8])lm.JtJ, lm.Jte,  &S);
 
     /*Levenberg-Marquardt Loop.*/
     for(i=0;i<MAXLEVMARQITERS;i++){
@@ -2169,11 +2096,11 @@ inline void   RHO_HEST_REFC::refine(void){
          * transpose) then multiply Jte in order to find dH.
          */
 
-        while(!sacChol8x8Damped(lm.JtJ, L, lm.tmp1)){
+        while(!sacChol8x8Damped((float(*)[8])lm.JtJ, L, (float(*)[8])lm.tmp1)){
             L *= 2.0f;
         }
-        sacTRInv8x8   (lm.tmp1, lm.tmp1);
-        sacTRISolve8x8(lm.tmp1, lm.Jte,  dH);
+        sacTRInv8x8   ((float(*)[8])lm.tmp1, (float(*)[8])lm.tmp1);
+        sacTRISolve8x8((float(*)[8])lm.tmp1, lm.Jte,  dH);
         sacSub8x1     (newH,       best.H,  dH);
         sacCalcJacobianErrors(newH, arg.src, arg.dst, best.inl, arg.N,
                               NULL, NULL, &newS);
@@ -2204,7 +2131,7 @@ inline void   RHO_HEST_REFC::refine(void){
             S = newS;
             memcpy(best.H, newH, sizeof(newH));
             sacCalcJacobianErrors(best.H, arg.src, arg.dst, best.inl, arg.N,
-                                  lm.JtJ, lm.Jte,  &S);
+                                  (float(*)[8])lm.JtJ, lm.Jte,  &S);
         }
     }
 }
diff --git a/modules/calib3d/src/stereosgbm.cpp b/modules/calib3d/src/stereosgbm.cpp
index 3b721ccf66..7d5d23c18d 100644
--- a/modules/calib3d/src/stereosgbm.cpp
+++ b/modules/calib3d/src/stereosgbm.cpp
@@ -53,6 +53,7 @@
 #include "precomp.hpp"
 #include <limits.h>
 #include "opencv2/core/hal/intrin.hpp"
+#include "opencv2/core/utils/buffer_area.private.hpp"
 
 namespace cv
 {
@@ -99,6 +100,16 @@ struct StereoSGBMParams
         mode = _mode;
     }
 
+    inline bool isFullDP() const
+    {
+        return mode == StereoSGBM::MODE_HH || mode == StereoSGBM::MODE_HH4;
+    }
+    inline Size calcSADWindowSize() const
+    {
+        const int dim = SADWindowSize > 0 ? SADWindowSize : 5;
+        return Size(dim, dim);
+    }
+
     int minDisparity;
     int numDisparities;
     int SADWindowSize;
@@ -148,6 +159,7 @@ static inline void min_pos(const v_int16& val, const v_int16& pos, short &min_va
 #endif
 
 static const int DEFAULT_RIGHT_BORDER = -1;
+
 /*
  For each pixel row1[x], max(maxD, 0) <= minX <= x < maxX <= width - max(0, -minD),
  and for each disparity minD<=d<maxD the function
@@ -161,7 +173,7 @@ static const int DEFAULT_RIGHT_BORDER = -1;
 static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
                             int minD, int maxD, CostType* cost,
                             PixType* buffer, const PixType* tab,
-                            int tabOfs, int , int xrange_min = 0, int xrange_max = DEFAULT_RIGHT_BORDER )
+                            int xrange_min = 0, int xrange_max = DEFAULT_RIGHT_BORDER )
 {
     int x, c, width = img1.cols, cn = img1.channels();
     int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
@@ -178,8 +190,6 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
     const PixType *row1 = img1.ptr<PixType>(y), *row2 = img2.ptr<PixType>(y);
     PixType *prow1 = buffer + width2*2, *prow2 = prow1 + width*cn*2;
 
-    tab += tabOfs;
-
     for( c = 0; c < cn*2; c++ )
     {
         prow1[width*c] = prow1[width*c + width-1] =
@@ -297,6 +307,166 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
 }
 
 
+
+class BufferSGBM
+{
+private:
+    size_t width1;
+    size_t Da;
+    size_t Dlra;
+    size_t costWidth;
+    size_t costHeight;
+    size_t hsumRows;
+    bool fullDP;
+    uchar dirs;
+    uchar dirs2;
+    static const size_t TAB_OFS = 256*4;
+
+public:
+    CostType* Cbuf;
+    CostType* Sbuf;
+    CostType* hsumBuf;
+    CostType* pixDiff;
+    CostType* disp2cost;
+    DispType* disp2ptr;
+    PixType* tempBuf;
+    std::vector<CostType*> Lr;
+    std::vector<CostType*> minLr;
+    PixType * clipTab;
+
+private:
+    utils::BufferArea area;
+
+public:
+    BufferSGBM(size_t width1_,
+               size_t Da_,
+               size_t Dlra_,
+               size_t cn,
+               size_t width,
+               size_t height,
+               const StereoSGBMParams &params)
+        : width1(width1_),
+        Da(Da_),
+        Dlra(Dlra_),
+        Cbuf(NULL),
+        Sbuf(NULL),
+        hsumBuf(NULL),
+        pixDiff(NULL),
+        disp2cost(NULL),
+        disp2ptr(NULL),
+        tempBuf(NULL),
+        Lr(2, (CostType*)NULL),
+        minLr(2, (CostType*)NULL),
+        clipTab(NULL)
+    {
+        const size_t TAB_SIZE = 256 + TAB_OFS*2;
+        fullDP = params.isFullDP();
+        costWidth = width1 * Da;
+        costHeight = fullDP ? height : 1;
+        hsumRows = params.calcSADWindowSize().height + 2;
+        dirs = params.mode == StereoSGBM::MODE_HH4 ? 1 : NR;
+        dirs2 = params.mode == StereoSGBM::MODE_HH4 ? 1 : NR2;
+        // for each possible stereo match (img1(x,y) <=> img2(x-d,y))
+        // we keep pixel difference cost (C) and the summary cost over NR directions (S).
+        // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
+        area.allocate(Cbuf, costWidth * costHeight, CV_SIMD_WIDTH); // summary cost over different (nDirs) directions
+        area.allocate(Sbuf, costWidth * costHeight, CV_SIMD_WIDTH);
+        area.allocate(hsumBuf, costWidth * hsumRows, CV_SIMD_WIDTH);
+        area.allocate(pixDiff, costWidth, CV_SIMD_WIDTH);
+        area.allocate(disp2cost,    width, CV_SIMD_WIDTH);
+        area.allocate(disp2ptr,     width, CV_SIMD_WIDTH);
+        area.allocate(tempBuf,      width * (4 * cn + 2), CV_SIMD_WIDTH);
+        // the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
+        // for 8-way dynamic programming we need the current row and
+        // the previous row, i.e. 2 rows in total
+        for (size_t i = 0; i < 2; ++i)
+        {
+            // 2D: [ NR ][ w1 * NR2 ][ NR ] * [ Dlra ]
+            area.allocate(Lr[i], calcLrCount() * Dlra, CV_SIMD_WIDTH);
+            // 1D: [ NR ][ w1 * NR2 ][ NR ]
+            area.allocate(minLr[i], calcLrCount(), CV_SIMD_WIDTH);
+        }
+        area.allocate(clipTab, TAB_SIZE, CV_SIMD_WIDTH);
+        area.commit();
+
+        // init clipTab
+        const int ftzero = std::max(params.preFilterCap, 15) | 1;
+        for(int i = 0; i < (int)TAB_SIZE; i++ )
+            clipTab[i] = (PixType)(std::min(std::max(i - (int)TAB_OFS, -ftzero), ftzero) + ftzero);
+    }
+    inline const PixType * getClipTab() const
+    {
+        return clipTab + TAB_OFS;
+    }
+    inline void initCBuf(CostType val) const
+    {
+        for (size_t i = 0; i < costWidth * costHeight; ++i)
+            Cbuf[i] = val;
+    }
+    inline void clearLr(const Range & range = Range::all()) const
+    {
+            for (uchar i = 0; i < 2; ++i)
+            {
+                if (range == Range::all())
+                {
+                    memset(Lr[i],    0, calcLrCount() * Dlra * sizeof(CostType));
+                    memset(minLr[i], 0, calcLrCount()        * sizeof(CostType));
+                }
+                else
+                {
+                    memset(getLr(i, range.start), 0, range.size() * sizeof(CostType) * Dlra);
+                    memset(getMinLr(i, range.start), 0, range.size() * sizeof(CostType));
+                }
+            }
+    }
+    inline size_t calcLrCount() const
+    {
+        return width1 * dirs2 + 2 * dirs;
+    }
+    inline void swapLr()
+    {
+        std::swap(Lr[0], Lr[1]);
+        std::swap(minLr[0], minLr[1]);
+    }
+    inline CostType * getHSumBuf(int row) const
+    {
+        return hsumBuf + (row % hsumRows) * costWidth;
+    }
+    inline CostType * getCBuf(int row) const
+    {
+        CV_Assert(row >= 0);
+        return Cbuf + (!fullDP ? 0 : (row * costWidth));
+    }
+    inline CostType * getSBuf(int row) const
+    {
+        CV_Assert(row >= 0);
+        return Sbuf + (!fullDP ? 0 : (row * costWidth));
+    }
+    inline void clearSBuf(int row, const Range & range = Range::all()) const
+    {
+        if (range == Range::all())
+            memset(getSBuf(row), 0, costWidth * sizeof(CostType));
+        else
+            memset(getSBuf(row) + range.start * Da, 0, range.size() * Da * sizeof(CostType));
+    }
+
+    // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
+    // and will occasionally use negative indices with the arrays
+    // we need to shift Lr[k] pointers by 1, to give the space for d=-1.
+    inline CostType * getLr(uchar id, int idx, uchar shift = 0) const
+    {
+        CV_Assert(id < 2);
+        const size_t fixed_offset = dirs * Dlra;
+        return Lr[id] + fixed_offset + (idx * (int)dirs2 + (int)shift) * (int)Dlra;
+    }
+    inline CostType * getMinLr(uchar id, int idx, uchar shift = 0) const
+    {
+        CV_Assert(id < 2);
+        const size_t fixed_offset = dirs;
+        return minLr[id] + fixed_offset + (idx * dirs2 + shift);
+    }
+};
+
 /*
  computes disparity for "roi" in img1 w.r.t. img2 and write it to disp1buf.
  that is, disp1buf(x, y)=d means that img1(x+roi.x, y+roi.y) ~ img2(x+roi.x-d, y+roi.y).
@@ -318,34 +488,25 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
  It contains the minimum current cost, used to find the best disparity, corresponding to the minimal cost.
  */
 static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
-                                 Mat& disp1, const StereoSGBMParams& params,
-                                 Mat& buffer )
+                                 Mat& disp1, const StereoSGBMParams& params )
 {
     const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
     const int DISP_SCALE = (1 << DISP_SHIFT);
     const CostType MAX_COST = SHRT_MAX;
 
     int minD = params.minDisparity, maxD = minD + params.numDisparities;
-    Size SADWindowSize;
-    SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;
-    int ftzero = std::max(params.preFilterCap, 15) | 1;
     int uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
     int disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
     int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
     int k, width = disp1.cols, height = disp1.rows;
     int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
-    int D = maxD - minD, width1 = maxX1 - minX1;
+    const int D = params.numDisparities;
+    int width1 = maxX1 - minX1;
     int Da = (int)alignSize(D, v_int16::nlanes);
     int Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
     int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
-    int SW2 = SADWindowSize.width/2, SH2 = SADWindowSize.height/2;
-    bool fullDP = params.mode == StereoSGBM::MODE_HH;
-    int npasses = fullDP ? 2 : 1;
-    const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
-    PixType clipTab[TAB_SIZE];
-
-    for( k = 0; k < TAB_SIZE; k++ )
-        clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
+    int SW2 = params.calcSADWindowSize().width/2, SH2 = params.calcSADWindowSize().height/2;
+    int npasses = params.isFullDP() ? 2 : 1;
 
     if( minX1 >= maxX1 )
     {
@@ -353,39 +514,8 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
         return;
     }
 
-    // for each possible stereo match (img1(x,y) <=> img2(x-d,y))
-    // we keep pixel difference cost (C) and the summary cost over NR directions (S).
-    // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
-    size_t costBufSize = width1*Da;
-    size_t CSBufSize = costBufSize*(fullDP ? height : 1);
-    size_t minLrSize = (width1 + 2)*NR2, LrSize = minLrSize*Dlra;
-    int hsumBufNRows = SH2*2 + 2;
-    // the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
-    // for 8-way dynamic programming we need the current row and
-    // the previous row, i.e. 2 rows in total
-    size_t totalBufSize = CV_SIMD_WIDTH + CSBufSize * 2 * sizeof(CostType) + // alignment, C, S
-    costBufSize*(hsumBufNRows + 1)*sizeof(CostType) + // hsumBuf, pixdiff
-    ((LrSize + minLrSize)*2 + v_int16::nlanes) * sizeof(CostType) + // minLr[] and Lr[]
-    width*(sizeof(CostType) + sizeof(DispType)) + // disp2cost + disp2
-    width * (4*img1.channels() + 2) * sizeof(PixType); // temp buffer for computing per-pixel cost
-
-    if( buffer.empty() || !buffer.isContinuous() ||
-        buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
-        buffer.reserveBuffer(totalBufSize);
-
-    // summary cost over different (nDirs) directions
-    CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), CV_SIMD_WIDTH);
-    CostType* Sbuf = Cbuf + CSBufSize;
-    CostType* hsumBuf = Sbuf + CSBufSize;
-    CostType* pixDiff = hsumBuf + costBufSize*hsumBufNRows;
-
-    CostType* disp2cost = pixDiff + costBufSize + ((LrSize + minLrSize)*2 + v_int16::nlanes);
-    DispType* disp2ptr = (DispType*)(disp2cost + width);
-    PixType* tempBuf = (PixType*)(disp2ptr + width);
-
-    // add P2 to every C(x,y). it saves a few operations in the inner loops
-    for(k = 0; k < (int)CSBufSize; k++ )
-        Cbuf[k] = (CostType)P2;
+    BufferSGBM mem(width1, Da, Dlra, img1.channels(), width, height, params);
+    mem.initCBuf((CostType)P2); // add P2 to every C(x,y). it saves a few operations in the inner loops
 
     for( int pass = 1; pass <= npasses; pass++ )
     {
@@ -402,27 +532,15 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
             x1 = width1-1; x2 = -1; dx = -1;
         }
 
-        CostType *Lr[2]={0}, *minLr[2]={0};
-
-        for( k = 0; k < 2; k++ )
-        {
-            // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
-            // and will occasionally use negative indices with the arrays
-            // we need to shift Lr[k] pointers by 1, to give the space for d=-1.
-            // however, then the alignment will be imperfect, i.e. bad for SSE,
-            // thus we shift the pointers by SIMD vector size
-            Lr[k] = pixDiff + costBufSize + v_int16::nlanes + LrSize*k + NR2*Dlra;
-            memset( Lr[k] - NR2*Dlra, 0, LrSize*sizeof(CostType) );
-            minLr[k] = pixDiff + costBufSize + v_int16::nlanes + LrSize*2 + minLrSize*k + NR2;
-            memset( minLr[k] - NR2, 0, minLrSize*sizeof(CostType) );
-        }
+        uchar lrID = 0;
+        mem.clearLr();
 
         for( int y = y1; y != y2; y += dy )
         {
             int x, d;
             DispType* disp1ptr = disp1.ptr<DispType>(y);
-            CostType* C = Cbuf + (!fullDP ? 0 : y*costBufSize);
-            CostType* S = Sbuf + (!fullDP ? 0 : y*costBufSize);
+            CostType* const C = mem.getCBuf(y);
+            CostType* const S = mem.getSBuf(y);
 
             if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
             {
@@ -430,35 +548,35 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
 
                 for( k = dy1; k <= dy2; k++ )
                 {
-                    CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;
+                    CostType* hsumAdd = mem.getHSumBuf(std::min(k, height-1));
 
                     if( k < height )
                     {
-                        calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero );
+                        calcPixelCostBT( img1, img2, k, minD, maxD, mem.pixDiff, mem.tempBuf, mem.getClipTab() );
 
                         memset(hsumAdd, 0, Da*sizeof(CostType));
 #if CV_SIMD
                         v_int16 h_scale = vx_setall_s16((short)SW2 + 1);
                         for( d = 0; d < Da; d += v_int16::nlanes )
                         {
-                            v_int16 v_hsumAdd = vx_load_aligned(pixDiff + d) * h_scale;
+                            v_int16 v_hsumAdd = vx_load_aligned(mem.pixDiff + d) * h_scale;
                             for( x = Da; x <= SW2*Da; x += Da )
-                                v_hsumAdd += vx_load_aligned(pixDiff + x + d);
+                                v_hsumAdd += vx_load_aligned(mem.pixDiff + x + d);
                             v_store_aligned(hsumAdd + d, v_hsumAdd);
                         }
 #else
                         for (d = 0; d < D; d++)
                         {
-                            hsumAdd[d] = (CostType)(pixDiff[d] * (SW2 + 1));
+                            hsumAdd[d] = (CostType)(mem.pixDiff[d] * (SW2 + 1));
                             for( x = Da; x <= SW2*Da; x += Da )
-                                hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]);
+                                hsumAdd[d] = (CostType)(hsumAdd[d] + mem.pixDiff[x + d]);
                         }
 #endif
 
                         if( y > 0 )
                         {
-                            const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
-                            const CostType* Cprev = !fullDP || y == 0 ? C : C - costBufSize;
+                            const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0));
+                            const CostType* Cprev =  mem.getCBuf(y - 1);
 
 #if CV_SIMD
                             for (d = 0; d < Da; d += v_int16::nlanes)
@@ -470,8 +588,8 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
 
                             for( x = Da; x < width1*Da; x += Da )
                             {
-                                const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
-                                const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0);
+                                const CostType* pixAdd = mem.pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
+                                const CostType* pixSub = mem.pixDiff + std::max(x - (SW2+1)*Da, 0);
 #if CV_SIMD
                                 for( d = 0; d < Da; d += v_int16::nlanes )
                                 {
@@ -501,8 +619,8 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
 #endif
                             for( x = Da; x < width1*Da; x += Da )
                             {
-                                const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
-                                const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0);
+                                const CostType* pixAdd = mem.pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
+                                const CostType* pixSub = mem.pixDiff + std::max(x - (SW2+1)*Da, 0);
 
 #if CV_SIMD
                                 for (d = 0; d < Da; d += v_int16::nlanes)
@@ -526,8 +644,8 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                     {
                         if( y > 0 )
                         {
-                            const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
-                            const CostType* Cprev = !fullDP || y == 0 ? C : C - costBufSize;
+                            const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0));
+                            const CostType* Cprev = mem.getCBuf(y - 1);
 #if CV_SIMD
                             for (x = 0; x < width1*Da; x += v_int16::nlanes)
                                 v_store_aligned(C + x, vx_load_aligned(Cprev + x) - vx_load_aligned(hsumSub + x) + vx_load_aligned(hsumAdd + x));
@@ -551,7 +669,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                 }
 
                 // also, clear the S buffer
-                memset(S, 0, width1*Da * sizeof(CostType));
+                mem.clearSBuf(y);
             }
 
             /*
@@ -575,24 +693,26 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
 
             for( x = x1; x != x2; x += dx )
             {
-                int xm = x*NR2, xd = xm*Dlra;
-
-                int delta0 = minLr[0][xm - dx*NR2] + P2, delta1 = minLr[1][xm - NR2 + 1] + P2;
-                int delta2 = minLr[1][xm + 2] + P2, delta3 = minLr[1][xm + NR2 + 3] + P2;
-
-                CostType* Lr_p0 = Lr[0] + xd - dx*NR2*Dlra;
-                CostType* Lr_p1 = Lr[1] + xd - NR2*Dlra + Dlra;
-                CostType* Lr_p2 = Lr[1] + xd + Dlra*2;
-                CostType* Lr_p3 = Lr[1] + xd + NR2*Dlra + Dlra*3;
-
-                Lr_p0[-1] = Lr_p0[D] = Lr_p1[-1] = Lr_p1[D] =
-                Lr_p2[-1] = Lr_p2[D] = Lr_p3[-1] = Lr_p3[D] = MAX_COST;
-
-                CostType* Lr_p = Lr[0] + xd;
+                int delta0 = P2 + *mem.getMinLr(lrID, x - dx);
+                int delta1 = P2 + *mem.getMinLr(1 - lrID, x - 1, 1);
+                int delta2 = P2 + *mem.getMinLr(1 - lrID, x,     2);
+                int delta3 = P2 + *mem.getMinLr(1 - lrID, x + 1, 3);
+
+                CostType* Lr_p0 = mem.getLr(lrID, x - dx);
+                CostType* Lr_p1 = mem.getLr(1 - lrID, x - 1, 1);
+                CostType* Lr_p2 = mem.getLr(1 - lrID, x,     2);
+                CostType* Lr_p3 = mem.getLr(1 - lrID, x + 1, 3);
+
+                Lr_p0[-1] = Lr_p0[D] = MAX_COST;
+                Lr_p1[-1] = Lr_p1[D] = MAX_COST;
+                Lr_p2[-1] = Lr_p2[D] = MAX_COST;
+                Lr_p3[-1] = Lr_p3[D] = MAX_COST;
+
+                CostType* Lr_p = mem.getLr(lrID, x);
                 const CostType* Cp = C + x*Da;
                 CostType* Sp = S + x*Da;
 
-                CostType* minL = minLr[0] + xm;
+                CostType* minL = mem.getMinLr(lrID, x);
                 d = 0;
 #if CV_SIMD
                 v_int16 _P1 = vx_setall_s16((short)P1);
@@ -703,14 +823,14 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                 for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes )
                 {
                     v_store(disp1ptr + x, v_inv_dist);
-                    v_store(disp2ptr + x, v_inv_dist);
-                    v_store(disp2cost + x, v_max_cost);
+                    v_store(mem.disp2ptr + x, v_inv_dist);
+                    v_store(mem.disp2cost + x, v_max_cost);
                 }
 #endif
                 for( ; x < width; x++ )
                 {
-                    disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED;
-                    disp2cost[x] = MAX_COST;
+                    disp1ptr[x] = mem.disp2ptr[x] = (DispType)INVALID_DISP_SCALED;
+                    mem.disp2cost[x] = MAX_COST;
                 }
 
                 for( x = width1 - 1; x >= 0; x-- )
@@ -721,16 +841,14 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
 
                     if( npasses == 1 )
                     {
-                        int xm = x*NR2, xd = xm*Dlra;
-
-                        CostType* Lr_p0 = Lr[0] + xd + NR2*Dlra;
+                        CostType* Lr_p0 = mem.getLr(lrID, x + 1);
                         Lr_p0[-1] = Lr_p0[D] = MAX_COST;
-                        CostType* Lr_p = Lr[0] + xd;
+                        CostType* Lr_p = mem.getLr(lrID, x);
 
                         const CostType* Cp = C + x*Da;
 
                         d = 0;
-                        int delta0 = minLr[0][xm + NR2] + P2;
+                        int delta0 = P2 + *mem.getMinLr(lrID, x + 1);
                         int minL0 = MAX_COST;
 #if CV_SIMD
                         v_int16 _P1 = vx_setall_s16((short)P1);
@@ -768,7 +886,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                                 bestDisp = (short)d;
                             }
                         }
-                        minLr[0][xm] = (CostType)minL0;
+                        *mem.getMinLr(lrID, x) = (CostType)minL0;
                     }
                     else
                     {
@@ -803,10 +921,10 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                         continue;
                     d = bestDisp;
                     int _x2 = x + minX1 - d - minD;
-                    if( disp2cost[_x2] > minS )
+                    if( mem.disp2cost[_x2] > minS )
                     {
-                        disp2cost[_x2] = (CostType)minS;
-                        disp2ptr[_x2] = (DispType)(d + minD);
+                        mem.disp2cost[_x2] = (CostType)minS;
+                        mem.disp2ptr[_x2] = (DispType)(d + minD);
                     }
 
                     if( 0 < d && d < D-1 )
@@ -833,15 +951,13 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                     int _d = d1 >> DISP_SHIFT;
                     int d_ = (d1 + DISP_SCALE-1) >> DISP_SHIFT;
                     int _x = x - _d, x_ = x - d_;
-                    if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff &&
-                       0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff )
+                    if( 0 <= _x && _x < width && mem.disp2ptr[_x] >= minD && std::abs(mem.disp2ptr[_x] - _d) > disp12MaxDiff &&
+                       0 <= x_ && x_ < width && mem.disp2ptr[x_] >= minD && std::abs(mem.disp2ptr[x_] - d_) > disp12MaxDiff )
                         disp1ptr[x] = (DispType)INVALID_DISP_SCALED;
                 }
             }
 
-            // now shift the cyclic buffers
-            std::swap( Lr[0], Lr[1] );
-            std::swap( minLr[0], minLr[1] );
+            lrID = 1 - lrID; // now shift the cyclic buffers
         }
     }
 }
@@ -849,13 +965,12 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
 ////////////////////////////////////////////////////////////////////////////////////////////
 struct CalcVerticalSums: public ParallelLoopBody
 {
-    CalcVerticalSums(const Mat& _img1, const Mat& _img2, const StereoSGBMParams& params,
-                     CostType* alignedBuf, PixType* _clipTab): img1(_img1), img2(_img2), clipTab(_clipTab)
+    CalcVerticalSums(const Mat& _img1, const Mat& _img2, const StereoSGBMParams& params, const BufferSGBM &mem_)
+        : img1(_img1), img2(_img2), mem(mem_)
     {
         minD = params.minDisparity;
         maxD = minD + params.numDisparities;
-        SW2 = SH2 = (params.SADWindowSize > 0 ? params.SADWindowSize : 5)/2;
-        ftzero = std::max(params.preFilterCap, 15) | 1;
+        SW2 = SH2 = params.calcSADWindowSize().height/2;
         P1 = params.P1 > 0 ? params.P1 : 2;
         P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
         height = img1.rows;
@@ -865,32 +980,27 @@ struct CalcVerticalSums: public ParallelLoopBody
         Da = (int)alignSize(D, v_int16::nlanes);
         Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
         width1 = maxX1 - minX1;
-        costBufSize = width1*Da;
-        CSBufSize = costBufSize*height;
-        minLrSize = width1;
-        LrSize = minLrSize*Dlra;
-        hsumBufNRows = SH2*2 + 2;
-        Cbuf = alignedBuf;
-        Sbuf = Cbuf + CSBufSize;
-        hsumBuf = Sbuf + CSBufSize;
+        D = params.numDisparities;
+        Da = (int)alignSize(D, v_int16::nlanes);
     }
 
     void operator()(const Range& range) const CV_OVERRIDE
     {
-        static const CostType MAX_COST = SHRT_MAX;
-        static const int TAB_OFS = 256*4;
-        static const int npasses = 2;
-        int x1 = range.start, x2 = range.end, k;
-        size_t pixDiffSize = ((x2 - x1) + 2*SW2)*Da;
-        size_t auxBufsSize = CV_SIMD_WIDTH + pixDiffSize*sizeof(CostType) + //alignment and pixdiff size
-                             width*(4*img1.channels()+2)*sizeof(PixType);   //tempBuf
-        Mat auxBuff;
-        auxBuff.create(1, (int)auxBufsSize, CV_8U);
-        CostType* pixDiff = (CostType*)alignPtr(auxBuff.ptr(), CV_SIMD_WIDTH);
-        PixType* tempBuf = (PixType*)(pixDiff + pixDiffSize);
+        const CostType MAX_COST = SHRT_MAX;
+        const int npasses = 2;
+        const int x1 = range.start, x2 = range.end;
+        int k;
+
+        CostType* pixDiff = 0;
+        PixType* tempBuf = 0;
+        utils::BufferArea aux_area;
+        aux_area.allocate(pixDiff, ((x2 - x1) + 2 * SW2) * Da, CV_SIMD_WIDTH);
+        aux_area.allocate(tempBuf, width * (4 * img1.channels() + 2) * sizeof(PixType), CV_SIMD_WIDTH);
+        aux_area.commit();
 
         // Simplification of index calculation
-        pixDiff -= (x1>SW2 ? (x1 - SW2): 0)*Da;
+        if (x1 > SW2)
+            pixDiff -= (x1 - SW2) * Da;
 
         for( int pass = 1; pass <= npasses; pass++ )
         {
@@ -905,26 +1015,14 @@ struct CalcVerticalSums: public ParallelLoopBody
                 y1 = height-1; y2 = -1; dy = -1;
             }
 
-            CostType *Lr[2]={0}, *minLr[2]={0};
-
-            for( k = 0; k < 2; k++ )
-            {
-                // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
-                // and will occasionally use negative indices with the arrays
-                // we need to shift Lr[k] pointers by 1, to give the space for d=-1.
-                // however, then the alignment will be imperfect, i.e. bad for SSE,
-                // thus we shift the pointers by SIMD vector size
-                Lr[k] = hsumBuf + costBufSize*hsumBufNRows + v_int16::nlanes + LrSize*k;
-                memset( Lr[k] + x1*Dlra, 0, (x2-x1)*Dlra*sizeof(CostType) );
-                minLr[k] = hsumBuf + costBufSize*hsumBufNRows + v_int16::nlanes + LrSize*2 + minLrSize*k;
-                memset( minLr[k] + x1, 0, (x2-x1)*sizeof(CostType) );
-            }
+            uchar lrID = 0;
+            mem.clearLr(range);
 
             for( int y = y1; y != y2; y += dy )
             {
                 int x, d;
-                CostType* C = Cbuf + y*costBufSize;
-                CostType* S = Sbuf + y*costBufSize;
+                CostType* C = mem.getCBuf(y);
+                CostType* S = mem.getSBuf(y);
 
                 if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
                 {
@@ -932,11 +1030,11 @@ struct CalcVerticalSums: public ParallelLoopBody
 
                     for( k = dy1; k <= dy2; k++ )
                     {
-                        CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;
+                        CostType* hsumAdd = mem.getHSumBuf(std::min(k, height-1));
 
                         if( k < height )
                         {
-                            calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero, x1 - SW2, x2 + SW2);
+                            calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, mem.getClipTab(), x1 - SW2, x2 + SW2);
 
                             memset(hsumAdd + x1*Da, 0, Da*sizeof(CostType));
                             for( x = (x1 - SW2)*Da; x <= (x1 + SW2)*Da; x += Da )
@@ -953,8 +1051,8 @@ struct CalcVerticalSums: public ParallelLoopBody
 
                             if( y > 0 )
                             {
-                                const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
-                                const CostType* Cprev = C - costBufSize;
+                                const CostType* hsumSub =  mem.getHSumBuf(std::max(y - SH2 - 1, 0));
+                                const CostType* Cprev = mem.getCBuf(y - 1);
 #if CV_SIMD
                                 for( d = 0; d < Da; d += v_int16::nlanes )
                                     v_store_aligned(C + x1*Da + d, vx_load_aligned(Cprev + x1*Da + d) + vx_load_aligned(hsumAdd + x1*Da + d) - vx_load_aligned(hsumSub + x1*Da + d));
@@ -1020,8 +1118,8 @@ struct CalcVerticalSums: public ParallelLoopBody
                         {
 /*                            if (y > 0)
                             {
-                                const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
-                                const CostType* Cprev = C - costBufSize;
+                                const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0));
+                                const CostType* Cprev = mem.getCBuf(y - 1);
 
 #if CV_SIMD
                                 for( x = x1*Da; x < x2*Da; x += v_int16::nlanes )
@@ -1044,9 +1142,7 @@ struct CalcVerticalSums: public ParallelLoopBody
                             }
                         }
                     }
-
-                    // also, clear the S buffer
-                    memset(S + x1*Da, 0, (x2-x1)*Da*sizeof(CostType));
+                    mem.clearSBuf(y, range);
                 }
 
 //              [formula 13 in the paper]
@@ -1061,19 +1157,16 @@ struct CalcVerticalSums: public ParallelLoopBody
 
                 for( x = x1; x != x2; x++ )
                 {
-                    int xd = x*Dlra;
-
-                    int delta = minLr[1][x] + P2;
-
-                    CostType* Lr_ppr = Lr[1] + xd;
+                    int delta = P2 + *mem.getMinLr(1 - lrID, x);
+                    CostType* Lr_ppr = mem.getLr(1 - lrID, x);
 
                     Lr_ppr[-1] = Lr_ppr[D] = MAX_COST;
 
-                    CostType* Lr_p = Lr[0] + xd;
+                    CostType* Lr_p = mem.getLr(lrID, x);
                     const CostType* Cp = C + x*Da;
                     CostType* Sp = S + x*Da;
 
-                    CostType& minL = minLr[0][x];
+                    CostType& minL = *(mem.getMinLr(lrID, x));
                     d = 0;
 #if CV_SIMD
                     v_int16 _P1 = vx_setall_s16((short)P1);
@@ -1105,19 +1198,13 @@ struct CalcVerticalSums: public ParallelLoopBody
                         Sp[d] = saturate_cast<CostType>(Sp[d] + L);
                     }
                 }
-
-                // now shift the cyclic buffers
-                std::swap( Lr[0], Lr[1] );
-                std::swap( minLr[0], minLr[1] );
+                lrID = 1 - lrID; // now shift the cyclic buffers
             }
         }
     }
     const Mat& img1;
     const Mat& img2;
-    CostType* Cbuf;
-    CostType* Sbuf;
-    CostType* hsumBuf;
-    PixType* clipTab;
+    const BufferSGBM & mem;
     int minD;
     int maxD;
     int D, Da, Dlra;
@@ -1128,18 +1215,12 @@ struct CalcVerticalSums: public ParallelLoopBody
     int height;
     int P1;
     int P2;
-    size_t costBufSize;
-    size_t CSBufSize;
-    size_t minLrSize;
-    size_t LrSize;
-    size_t hsumBufNRows;
-    int ftzero;
 };
 
 struct CalcHorizontalSums: public ParallelLoopBody
 {
-    CalcHorizontalSums(const Mat& _img1, const Mat& _img2, Mat& _disp1, const StereoSGBMParams& params,
-                     CostType* alignedBuf): img1(_img1), img2(_img2), disp1(_disp1)
+    CalcHorizontalSums(const Mat& _img1, const Mat& _img2, Mat& _disp1, const StereoSGBMParams& params, const BufferSGBM &mem_)
+        : img1(_img1), img2(_img2), disp1(_disp1), mem(mem_)
     {
         minD = params.minDisparity;
         maxD = minD + params.numDisparities;
@@ -1157,23 +1238,22 @@ struct CalcHorizontalSums: public ParallelLoopBody
         Da = (int)alignSize(D, v_int16::nlanes);
         Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
         width1 = maxX1 - minX1;
-        costBufSize = width1*Da;
-        CSBufSize = costBufSize*height;
-        LrSize = 2 * Dlra;
-        Cbuf = alignedBuf;
-        Sbuf = Cbuf + CSBufSize;
     }
 
     void operator()(const Range& range) const CV_OVERRIDE
     {
         int y1 = range.start, y2 = range.end;
-        size_t auxBufsSize = CV_SIMD_WIDTH + (v_int16::nlanes + LrSize) * sizeof(CostType) + width*(sizeof(CostType) + sizeof(DispType));
 
-        Mat auxBuff;
-        auxBuff.create(1, (int)auxBufsSize, CV_8U);
-        CostType *Lr = ((CostType*)alignPtr(auxBuff.ptr(), CV_SIMD_WIDTH)) + v_int16::nlanes;
-        CostType* disp2cost = Lr + LrSize;
-        DispType* disp2ptr = (DispType*)(disp2cost + width);
+        const size_t LrSize = 2 * (1 + Dlra + 1);
+
+        CostType * Lr = 0;
+        CostType * disp2cost = 0;
+        DispType * disp2ptr = 0;
+        utils::BufferArea aux_area;
+        aux_area.allocate(Lr, LrSize);
+        aux_area.allocate(disp2cost, width, CV_SIMD_WIDTH);
+        aux_area.allocate(disp2ptr, width, CV_SIMD_WIDTH);
+        aux_area.commit();
 
         CostType minLr;
 
@@ -1181,8 +1261,8 @@ struct CalcHorizontalSums: public ParallelLoopBody
         {
             int x, d;
             DispType* disp1ptr = disp1.ptr<DispType>(y);
-            CostType* C = Cbuf + y*costBufSize;
-            CostType* S = Sbuf + y*costBufSize;
+            CostType* C = mem.getCBuf(y);
+            CostType* S = mem.getSBuf(y);
 
             x = 0;
 #if CV_SIMD
@@ -1202,8 +1282,8 @@ struct CalcHorizontalSums: public ParallelLoopBody
             }
 
             // clear buffers
-            memset( Lr, 0, LrSize*sizeof(CostType) );
-            Lr[-1] = Lr[D] = Lr[Dlra - 1] = Lr[Dlra + D] = MAX_COST;
+            aux_area.zeroFill(Lr);
+            Lr[0] = Lr[1 + D] = Lr[3 + Dlra - 1] = Lr[3 + Dlra + D] = MAX_COST;
 
             minLr = 0;
 //          [formula 13 in the paper]
@@ -1219,10 +1299,8 @@ struct CalcHorizontalSums: public ParallelLoopBody
             for( x = 0; x != width1; x++)
             {
                 int delta = minLr + P2;
-
-                CostType* Lr_ppr = Lr + ((x&1)? 0 : Dlra);
-
-                CostType* Lr_p = Lr + ((x&1)? Dlra :0);
+                CostType* Lr_ppr = Lr + ((x&1)? 1 : 3 + Dlra);
+                CostType* Lr_p = Lr + ((x&1)? 3 + Dlra : 1);
                 const CostType* Cp = C + x*Da;
                 CostType* Sp = S + x*Da;
 
@@ -1236,8 +1314,8 @@ struct CalcHorizontalSums: public ParallelLoopBody
                 for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes)
                 {
                     v_int16 Cpd = vx_load_aligned(Cp + d);
-                    v_int16 L = v_min(v_min(v_min(vx_load_aligned(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd;
-                    v_store_aligned(Lr_p + d, L);
+                    v_int16 L = v_min(v_min(v_min(vx_load(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd;
+                    v_store(Lr_p + d, L);
                     _minL = v_min(_minL, L);
                     v_store_aligned(Sp + d, vx_load_aligned(Sp + d) + L);
                 }
@@ -1255,18 +1333,16 @@ struct CalcHorizontalSums: public ParallelLoopBody
                 }
             }
 
-            memset( Lr, 0, LrSize*sizeof(CostType) );
-            Lr[-1] = Lr[D] = Lr[Dlra - 1] = Lr[Dlra + D] = MAX_COST;
+            aux_area.zeroFill(Lr);
+            Lr[0] = Lr[1 + D] = Lr[3 + Dlra - 1] = Lr[3 + Dlra + D] = MAX_COST;
 
             minLr = 0;
 
             for( x = width1-1; x != -1; x--)
             {
                 int delta = minLr + P2;
-
-                CostType* Lr_ppr = Lr + ((x&1)? 0 :Dlra);
-
-                CostType* Lr_p = Lr + ((x&1)? Dlra :0);
+                CostType* Lr_ppr = Lr + ((x&1)? 1 : 3 + Dlra);
+                CostType* Lr_p = Lr + ((x&1)? 3 + Dlra : 1);
                 const CostType* Cp = C + x*Da;
                 CostType* Sp = S + x*Da;
                 CostType minS = MAX_COST;
@@ -1283,8 +1359,8 @@ struct CalcHorizontalSums: public ParallelLoopBody
                 for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes )
                 {
                     v_int16 Cpd = vx_load_aligned(Cp + d);
-                    v_int16 L = v_min(v_min(v_min(vx_load_aligned(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd;
-                    v_store_aligned(Lr_p + d, L);
+                    v_int16 L = v_min(v_min(v_min(vx_load(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd;
+                    v_store(Lr_p + d, L);
                     _minL = v_min(_minL, L);
                     L += vx_load_aligned(Sp + d);
                     v_store_aligned(Sp + d, L);
@@ -1366,8 +1442,7 @@ struct CalcHorizontalSums: public ParallelLoopBody
     const Mat& img1;
     const Mat& img2;
     Mat& disp1;
-    CostType* Cbuf;
-    CostType* Sbuf;
+    const BufferSGBM & mem;
     int minD;
     int maxD;
     int D, Da, Dlra;
@@ -1378,9 +1453,6 @@ struct CalcHorizontalSums: public ParallelLoopBody
     int P2;
     int minX1;
     int maxX1;
-    size_t costBufSize;
-    size_t CSBufSize;
-    size_t LrSize;
     int INVALID_DISP;
     int INVALID_DISP_SCALED;
     int uniquenessRatio;
@@ -1401,28 +1473,21 @@ struct CalcHorizontalSums: public ParallelLoopBody
  is written as is, without interpolation.
  */
 static void computeDisparitySGBM_HH4( const Mat& img1, const Mat& img2,
-                                 Mat& disp1, const StereoSGBMParams& params,
-                                 Mat& buffer )
+                                 Mat& disp1, const StereoSGBMParams& params)
 {
     const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
     const int DISP_SCALE = (1 << DISP_SHIFT);
     int minD = params.minDisparity, maxD = minD + params.numDisparities;
     Size SADWindowSize;
     SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;
-    int ftzero = std::max(params.preFilterCap, 15) | 1;
     int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
-    int k, width = disp1.cols, height = disp1.rows;
+    int width = disp1.cols, height = disp1.rows;
     int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
-    int D = (int)alignSize(maxD - minD, v_int16::nlanes), width1 = maxX1 - minX1;
-    int Dlra = D + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
-    int SH2 = SADWindowSize.height/2;
+    int width1 = maxX1 - minX1;
+    int Da = (int)alignSize(params.numDisparities, v_int16::nlanes);
+    int Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
     int INVALID_DISP = minD - 1;
     int INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
-    const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
-    PixType clipTab[TAB_SIZE];
-
-    for( k = 0; k < TAB_SIZE; k++ )
-        clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
 
     if( minX1 >= maxX1 )
     {
@@ -1430,54 +1495,79 @@ static void computeDisparitySGBM_HH4( const Mat& img1, const Mat& img2,
         return;
     }
 
-    // for each possible stereo match (img1(x,y) <=> img2(x-d,y))
-    // we keep pixel difference cost (C) and the summary cost over 4 directions (S).
-    // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
-
-    // the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
-    // for dynamic programming we need the current row and
-    // the previous row, i.e. 2 rows in total
-    size_t costBufSize = width1*D;
-    size_t CSBufSize = costBufSize*height;
-    size_t minLrSize = width1 , LrSize = minLrSize*Dlra;
-    int hsumBufNRows = SH2*2 + 2;
-    size_t totalBufSize = CV_SIMD_WIDTH + CSBufSize * 2 * sizeof(CostType) + // Alignment, C, S
-                          costBufSize*hsumBufNRows * sizeof(CostType) + // hsumBuf
-                          ((LrSize + minLrSize)*2 + v_int16::nlanes) * sizeof(CostType); // minLr[] and Lr[]
-
-    if( buffer.empty() || !buffer.isContinuous() ||
-        buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
-    {
-        buffer.reserveBuffer(totalBufSize);
-    }
-
-    // summary cost over different (nDirs) directions
-    CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), CV_SIMD_WIDTH);
-
-    // add P2 to every C(x,y). it saves a few operations in the inner loops
-    for(k = 0; k < (int)CSBufSize; k++ )
-        Cbuf[k] = (CostType)P2;
-
-    parallel_for_(Range(0,width1),CalcVerticalSums(img1, img2, params, Cbuf, clipTab),8);
-    parallel_for_(Range(0,height),CalcHorizontalSums(img1, img2, disp1, params, Cbuf),8);
+    BufferSGBM mem(width1, Da, Dlra, img1.channels(), width, height, params);
+    mem.initCBuf((CostType)P2); // add P2 to every C(x,y). it saves a few operations in the inner loops
 
+    parallel_for_(Range(0,width1),CalcVerticalSums(img1, img2, params, mem),8);
+    parallel_for_(Range(0,height),CalcHorizontalSums(img1, img2, disp1, params, mem),8);
 }
 
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 
-void getBufferPointers(Mat& buffer, int width, int width1, int Da, int num_ch, int SH2, int P2,
-                       CostType*& curCostVolumeLine, CostType*& hsumBuf, CostType*& pixDiff,
-                       PixType*& tmpBuf, CostType*& horPassCostVolume,
-                       CostType*& vertPassCostVolume, CostType*& vertPassMin, CostType*& rightPassBuf,
-                       CostType*& disp2CostBuf, short*& disp2Buf);
+class BufferSGBM3Way // working buffers handed to SGBM3WayMainLoop helpers; every pointer below is set up through the single utils::BufferArea member
+{
+private:
+    size_t hsumCols; // elements per row of hsumBuf: width1 * Da
+    size_t hsumRows; // number of rows in hsumBuf: SH2*2 + 2; getHSumBuf() wraps its index modulo this count
+public:
+    CostType *curCostVolumeLine; // hsumCols elements; matching costs for the current line, every element starts at P2
+    CostType *hsumBuf; // hsumCols * hsumRows elements; address rows through getHSumBuf()
+    CostType *pixDiff; // hsumCols elements; auxiliary buffer for the raw matching cost computation
+    PixType *tmpBuf; // width * (4 * num_ch + 2) elements; auxiliary buffer for the raw matching cost computation
+    CostType *horPassCostVolume; // (width1 + 2) * Da elements; horizontal cost aggregation
+    CostType *vertPassCostVolume; // (width1 + 2) * Da elements; vertical cost aggregation
+    CostType *vertPassMin; // width1 + 2 elements; minimum costs from the previous line
+    CostType *rightPassBuf; // Da elements; small buffer for the right-to-left pass
+    CostType *disp2CostBuf; // width elements
+    short *disp2Buf; // width elements
+private:
+    utils::BufferArea area; // NOTE(review): presumably owns the memory behind all pointers above for the lifetime of this object - confirm against utils::BufferArea
+public:
+    BufferSGBM3Way(int width1, int width, int num_ch, int Da, int SH2, int P2) : // sizes every buffer from these dimensions, zero-fills them, then seeds curCostVolumeLine with P2
+        curCostVolumeLine(0),
+        hsumBuf(0),
+        pixDiff(0),
+        tmpBuf(0),
+        horPassCostVolume(0),
+        vertPassCostVolume(0),
+        vertPassMin(0),
+        rightPassBuf(0),
+        disp2CostBuf(0),
+        disp2Buf(0)
+    {
+        hsumCols = width1 * Da;
+        hsumRows = SH2*2 + 2;
+        area.allocate(curCostVolumeLine, hsumCols, CV_SIMD_WIDTH); // NOTE(review): second argument looks like an element count and third an alignment - confirm against utils::BufferArea
+        area.allocate(hsumBuf, hsumCols * hsumRows, CV_SIMD_WIDTH);
+        area.allocate(pixDiff,hsumCols, CV_SIMD_WIDTH);
+        area.allocate(tmpBuf, width * (4 * num_ch + 2), CV_SIMD_WIDTH);
+        area.allocate(horPassCostVolume, (width1 + 2) * Da, CV_SIMD_WIDTH);
+        area.allocate(vertPassCostVolume, (width1 + 2) * Da, CV_SIMD_WIDTH);
+        area.allocate(vertPassMin, width1 + 2, CV_SIMD_WIDTH);
+        area.allocate(rightPassBuf, Da, CV_SIMD_WIDTH);
+        area.allocate(disp2CostBuf, width, CV_SIMD_WIDTH);
+        area.allocate(disp2Buf, width, CV_SIMD_WIDTH);
+        area.commit(); // NOTE(review): presumably the point where memory is really obtained and the pointers become usable - confirm
+        area.zeroFill(); // presumably clears every buffer requested above - confirm
+        for(size_t i = 0; i < hsumCols; i++) // must run after the zero fill, otherwise the P2 seed would be wiped
+            curCostVolumeLine[i] = (CostType)P2;
+    }
+    inline void clearRightPassBuf() // resets rightPassBuf only; the other buffers are left untouched
+    {
+        area.zeroFill(rightPassBuf);
+    }
+    CostType *getHSumBuf(int x) const // returns the start of row (x % hsumRows) inside hsumBuf
+    {
+        return hsumBuf + (x % hsumRows) * hsumCols; // x is converted to size_t for the modulo, so callers must pass x >= 0
+    }
+};
 
 struct SGBM3WayMainLoop : public ParallelLoopBody
 {
-    Mat* buffers;
     const Mat *img1, *img2;
     Mat* dst_disp;
 
-    int nstripes, stripe_sz;
+    int stripe_sz;
     int stripe_overlap;
 
     int width,height;
@@ -1488,25 +1578,54 @@ struct SGBM3WayMainLoop : public ParallelLoopBody
     int P1, P2;
     int uniquenessRatio, disp12MaxDiff;
 
-    int costBufSize, hsumBufNRows;
-    int TAB_OFS, ftzero;
+    int TAB_OFS;
 
+    utils::BufferArea aux_area;
     PixType* clipTab;
 #if CV_SIMD
     short idx_row[v_int16::nlanes];
 #endif
-    SGBM3WayMainLoop(Mat *_buffers, const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, PixType* _clipTab, int _nstripes, int _stripe_overlap);
-    void getRawMatchingCost(CostType* C, CostType* hsumBuf, CostType* pixDiff, PixType* tmpBuf, int y, int src_start_idx) const;
+    SGBM3WayMainLoop(const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, int stripe_size, int _stripe_overlap);
     void operator () (const Range& range) const CV_OVERRIDE;
     template<bool x_nlanes> void impl(const Range& range) const;
+
+private:
+    void getRawMatchingCost(const BufferSGBM3Way &mem, int y, int src_start_idx) const;
+
+    template<bool x_nlanes>
+    void accumulateCostsLeftTop(const BufferSGBM3Way &mem,
+                                int x,
+                                CostType &leftMinCost) const;
+
+    template<bool x_nlanes>
+    void accumulateCostsRight(const BufferSGBM3Way &mem,
+                              int x,
+                              CostType &rightMinCost,
+                              short &optimal_disp,
+                              CostType &min_cost) const;
 };
 
-SGBM3WayMainLoop::SGBM3WayMainLoop(Mat *_buffers, const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, PixType* _clipTab, int _nstripes, int _stripe_overlap):
-buffers(_buffers), img1(&_img1), img2(&_img2), dst_disp(_dst_disp), clipTab(_clipTab)
+SGBM3WayMainLoop::SGBM3WayMainLoop(const Mat& _img1,
+                                   const Mat& _img2,
+                                   Mat* _dst_disp,
+                                   const StereoSGBMParams& params,
+                                   int _stripe_sz,
+                                   int _stripe_overlap)
+    : img1(&_img1),
+    img2(&_img2),
+    dst_disp(_dst_disp),
+    stripe_sz(_stripe_sz),
+    stripe_overlap(_stripe_overlap),
+    clipTab(0)
 {
-    nstripes = _nstripes;
-    stripe_overlap = _stripe_overlap;
-    stripe_sz = (int)ceil(img1->rows/(double)nstripes);
+    // precompute a lookup table for the raw matching cost computation:
+    TAB_OFS = 256*4;
+    const int TAB_SIZE = 256 + TAB_OFS*2;
+    aux_area.allocate(clipTab, TAB_SIZE, CV_SIMD_WIDTH);
+    aux_area.commit();
+    const int ftzero = std::max(params.preFilterCap, 15) | 1;
+    for(int k = 0; k < TAB_SIZE; k++ )
+        clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
 
     width = img1->cols; height = img1->rows;
     minD = params.minDisparity; maxD = minD + params.numDisparities; D = maxD - minD;
@@ -1519,100 +1638,27 @@ buffers(_buffers), img1(&_img1), img2(&_img2), dst_disp(_dst_disp), clipTab(_cli
     uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
     disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
 
-    costBufSize = width1*Da;
-    hsumBufNRows = SH2*2 + 2;
-    TAB_OFS = 256*4;
-    ftzero = std::max(params.preFilterCap, 15) | 1;
 #if CV_SIMD
     for(short i = 0; i < v_int16::nlanes; ++i)
         idx_row[i] = i;
 #endif
 }
 
-void getBufferPointers(Mat& buffer, int width, int width1, int Da, int num_ch, int SH2, int P2,
-                       CostType*& curCostVolumeLine, CostType*& hsumBuf, CostType*& pixDiff,
-                       PixType*& tmpBuf, CostType*& horPassCostVolume,
-                       CostType*& vertPassCostVolume, CostType*& vertPassMin, CostType*& rightPassBuf,
-                       CostType*& disp2CostBuf, short*& disp2Buf)
-{
-    // allocating all the required memory:
-    int costVolumeLineSize = width1*Da;
-    int width1_ext = width1+2;
-    int costVolumeLineSize_ext = width1_ext*Da;
-    int hsumBufNRows = SH2*2 + 2;
-
-    // main buffer to store matching costs for the current line:
-    int curCostVolumeLineSize = costVolumeLineSize*sizeof(CostType);
-
-    // auxiliary buffers for the raw matching cost computation:
-    int hsumBufSize  = costVolumeLineSize*hsumBufNRows*sizeof(CostType);
-    int pixDiffSize  = costVolumeLineSize*sizeof(CostType);
-    int tmpBufSize = width * (4 * num_ch + 2) * sizeof(PixType);
-
-    // auxiliary buffers for the matching cost aggregation:
-    int horPassCostVolumeSize  = costVolumeLineSize_ext*sizeof(CostType); // buffer for the 2-pass horizontal cost aggregation
-    int vertPassCostVolumeSize = costVolumeLineSize_ext*sizeof(CostType); // buffer for the vertical cost aggregation
-    int rightPassBufSize = Da * sizeof(CostType);                     // additional small buffer for the right-to-left pass
-    int vertPassMinSize        = width1_ext*sizeof(CostType);             // buffer for storing minimum costs from the previous line
-
-    // buffers for the pseudo-LRC check:
-    int disp2CostBufSize = width*sizeof(CostType);
-    int disp2BufSize     = width*sizeof(short);
-
-    // sum up the sizes of all the buffers:
-    size_t totalBufSize = CV_SIMD_WIDTH + curCostVolumeLineSize +
-                          hsumBufSize +
-                          pixDiffSize +
-                          horPassCostVolumeSize +
-                          vertPassCostVolumeSize +
-                          rightPassBufSize +
-                          vertPassMinSize +
-                          disp2CostBufSize +
-                          disp2BufSize +
-                          tmpBufSize;
-
-    if( buffer.empty() || !buffer.isContinuous() || buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
-        buffer.reserveBuffer(totalBufSize);
-
-    // set up all the pointers:
-    curCostVolumeLine  = (CostType*)alignPtr(buffer.ptr(), CV_SIMD_WIDTH);
-    hsumBuf            = curCostVolumeLine + costVolumeLineSize;
-    pixDiff            = hsumBuf + costVolumeLineSize*hsumBufNRows;
-    horPassCostVolume  = pixDiff + costVolumeLineSize;
-    vertPassCostVolume = horPassCostVolume + costVolumeLineSize_ext;
-    rightPassBuf       = vertPassCostVolume + costVolumeLineSize_ext;
-    vertPassMin        = rightPassBuf + Da;
-
-    disp2CostBuf       = vertPassMin + width1_ext;
-    disp2Buf           = disp2CostBuf + width;
-    tmpBuf = (PixType*)(disp2Buf + width);
-
-    // initialize memory:
-    memset(buffer.ptr(),0,totalBufSize);
-    int i = 0;
-#if CV_SIMD
-    v_int16 _P2 = vx_setall_s16((CostType)P2);
-    for (; i<=costVolumeLineSize-v_int16::nlanes; i+=v_int16::nlanes)
-        v_store_aligned(curCostVolumeLine + i, _P2);
-#endif
-    for(;i<costVolumeLineSize;i++)
-        curCostVolumeLine[i] = (CostType)P2; //such initialization simplifies the cost aggregation loops a bit
-}
-
 // performing block matching and building raw cost-volume for the current row
-void SGBM3WayMainLoop::getRawMatchingCost(CostType* C, // target cost-volume row
-                                          CostType* hsumBuf, CostType* pixDiff, PixType* tmpBuf, //buffers
-                                          int y, int src_start_idx) const
+void SGBM3WayMainLoop::getRawMatchingCost(const BufferSGBM3Way &mem, int y, int src_start_idx) const
 {
+    CostType* C = mem.curCostVolumeLine;
+    CostType* pixDiff = mem.pixDiff;
+    PixType* tmpBuf = mem.tmpBuf;
     int x, d;
     int dy1 = (y == src_start_idx) ? src_start_idx : y + SH2, dy2 = (y == src_start_idx) ? src_start_idx+SH2 : dy1;
 
     for(int k = dy1; k <= dy2; k++ )
     {
-        CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;
+        CostType* hsumAdd = mem.getHSumBuf(std::min(k, height-1));
         if( k < height )
         {
-            calcPixelCostBT( *img1, *img2, k, minD, maxD, pixDiff, tmpBuf, clipTab, TAB_OFS, ftzero );
+            calcPixelCostBT( *img1, *img2, k, minD, maxD, pixDiff, tmpBuf, clipTab + TAB_OFS );
 
 #if CV_SIMD
             v_int16 sw2_1 = vx_setall_s16((short)SW2 + 1);
@@ -1634,7 +1680,7 @@ void SGBM3WayMainLoop::getRawMatchingCost(CostType* C, // target cost-volume row
 #endif
             if( y > src_start_idx )
             {
-                const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, src_start_idx) % hsumBufNRows)*costBufSize;
+                const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, src_start_idx));
 
 #if CV_SIMD
                 for (d = 0; d < Da; d += v_int16::nlanes)
@@ -1702,7 +1748,7 @@ void SGBM3WayMainLoop::getRawMatchingCost(CostType* C, // target cost-volume row
         {
             if( y > src_start_idx )
             {
-                const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, src_start_idx) % hsumBufNRows)*costBufSize;
+                const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, src_start_idx));
 #if CV_SIMD
                 for( x = 0; x < width1*Da; x += v_int16::nlanes)
                     v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x) - vx_load_aligned(hsumSub + x));
@@ -1728,12 +1774,15 @@ void SGBM3WayMainLoop::getRawMatchingCost(CostType* C, // target cost-volume row
 // performing SGM cost accumulation from left to right (result is stored in leftBuf) and
 // in-place cost accumulation from top to bottom (result is stored in topBuf)
 template<bool x_nlanes>
-inline void accumulateCostsLeftTop(CostType* leftBuf, CostType* leftBuf_prev, CostType* topBuf, CostType* costs,
-                                   CostType& leftMinCost, CostType& topMinCost, int D, int P1, int P2)
+void SGBM3WayMainLoop::accumulateCostsLeftTop(const BufferSGBM3Way &mem, int x, CostType& leftMinCost) const
 {
+    CostType *leftBuf = mem.horPassCostVolume + x;
+    CostType *leftBuf_prev = mem.horPassCostVolume + x - Da;
+    CostType *topBuf = mem.vertPassCostVolume + x;
+    CostType *costs = mem.curCostVolumeLine - Da + x;
+    CostType& topMinCost = mem.vertPassMin[x/Da];
     int i = 0;
 #if CV_SIMD
-    int Da = (int)alignSize(D, v_int16::nlanes);
     v_int16 P1_reg = vx_setall_s16(cv::saturate_cast<CostType>(P1));
 
     v_int16 leftMinCostP2_reg   = vx_setall_s16(cv::saturate_cast<CostType>(leftMinCost+P2));
@@ -1847,12 +1896,16 @@ inline void accumulateCostsLeftTop(CostType* leftBuf, CostType* leftBuf_prev, Co
 // summing rightBuf, topBuf, leftBuf together (the result is stored in leftBuf), as well as finding the
 // optimal disparity value with minimum accumulated cost
 template<bool x_nlanes>
-inline void accumulateCostsRight(CostType* rightBuf, CostType* topBuf, CostType* leftBuf, CostType* costs,
-                                 CostType& rightMinCost, int D, int P1, int P2, short& optimal_disp, CostType& min_cost)
+void SGBM3WayMainLoop::accumulateCostsRight(const BufferSGBM3Way &mem, int x,
+                                            CostType& rightMinCost, short& optimal_disp, CostType& min_cost) const
 {
+    CostType* costs = mem.curCostVolumeLine - Da + x;
+    CostType* rightBuf = mem.rightPassBuf;
+    CostType* topBuf = mem.vertPassCostVolume + x;
+    CostType* leftBuf = mem.horPassCostVolume + x;
+
     int i = 0;
 #if CV_SIMD
-    int Da = (int)alignSize(D, v_int16::nlanes);
     v_int16 P1_reg = vx_setall_s16(cv::saturate_cast<CostType>(P1));
 
     v_int16 rightMinCostP2_reg   = vx_setall_s16(cv::saturate_cast<CostType>(rightMinCost+P2));
@@ -1955,6 +2008,7 @@ void SGBM3WayMainLoop::operator () (const Range& range) const
     if (D == Da) impl<true>(range);
     else impl<false>(range);
 }
+
 template<bool x_nlanes>
 void SGBM3WayMainLoop::impl(const Range& range) const
 {
@@ -1979,33 +2033,24 @@ void SGBM3WayMainLoop::impl(const Range& range) const
     else
         dst_offset=0;
 
-    Mat cur_buffer = buffers [range.start];
     Mat cur_disp   = dst_disp[range.start];
     cur_disp = Scalar(INVALID_DISP_SCALED);
 
-    // prepare buffers:
-    CostType *curCostVolumeLine, *hsumBuf, *pixDiff;
-    PixType* tmpBuf;
-    CostType *horPassCostVolume, *vertPassCostVolume, *vertPassMin, *rightPassBuf, *disp2CostBuf;
-    short* disp2Buf;
-    getBufferPointers(cur_buffer,width,width1,Da,img1->channels(),SH2,P2,
-                      curCostVolumeLine,hsumBuf,pixDiff,tmpBuf,horPassCostVolume,
-                      vertPassCostVolume,vertPassMin,rightPassBuf,disp2CostBuf,disp2Buf);
-
+    BufferSGBM3Way mem(width1, width, img1->channels(), Da, SH2, P2);
+    CostType *horPassCostVolume = mem.horPassCostVolume;
     // start real processing:
     for(int y=src_start_idx;y<src_end_idx;y++)
     {
-        getRawMatchingCost(curCostVolumeLine,hsumBuf,pixDiff,tmpBuf,y,src_start_idx);
+        getRawMatchingCost(mem, y, src_start_idx);
 
         short* disp_row = (short*)cur_disp.ptr(dst_offset+(y-src_start_idx));
 
         // initialize the auxiliary buffers for the pseudo left-right consistency check:
         for(int x=0;x<width;x++)
         {
-            disp2Buf[x] = (short)INVALID_DISP_SCALED;
-            disp2CostBuf[x] = SHRT_MAX;
+            mem.disp2Buf[x] = (short)INVALID_DISP_SCALED;
+            mem.disp2CostBuf[x] = SHRT_MAX;
         }
-        CostType* C = curCostVolumeLine - Da;
         CostType prev_min, min_cost;
         int d;
         short best_d;
@@ -2014,14 +2059,14 @@ void SGBM3WayMainLoop::impl(const Range& range) const
         // forward pass
         prev_min=0;
         for (int x=Da;x<(1+width1)*Da;x+=Da)
-            accumulateCostsLeftTop<x_nlanes>(horPassCostVolume+x,horPassCostVolume+x-Da,vertPassCostVolume+x,C+x,prev_min,vertPassMin[x/Da],D,P1,P2);
+            accumulateCostsLeftTop<x_nlanes>(mem, x, prev_min);
 
         //backward pass
-        memset(rightPassBuf,0,Da*sizeof(CostType));
+        mem.clearRightPassBuf();
         prev_min=0;
         for (int x=width1*Da;x>=Da;x-=Da)
         {
-            accumulateCostsRight<x_nlanes>(rightPassBuf,vertPassCostVolume+x,horPassCostVolume+x,C+x,prev_min,D,P1,P2,best_d,min_cost);
+            accumulateCostsRight<x_nlanes>(mem, x, prev_min, best_d, min_cost);
 
             if(uniquenessRatio>0)
             {
@@ -2074,10 +2119,10 @@ void SGBM3WayMainLoop::impl(const Range& range) const
             d = best_d;
 
             int _x2 = x/Da - 1 + minX1 - d - minD;
-            if( _x2>=0 && _x2<width && disp2CostBuf[_x2] > min_cost )
+            if( _x2>=0 && _x2<width && mem.disp2CostBuf[_x2] > min_cost )
             {
-                disp2CostBuf[_x2] = min_cost;
-                disp2Buf[_x2] = (short)(d + minD);
+                mem.disp2CostBuf[_x2] = min_cost;
+                mem.disp2Buf[_x2] = (short)(d + minD);
             }
 
             if( 0 < d && d < D-1 )
@@ -2104,32 +2149,27 @@ void SGBM3WayMainLoop::impl(const Range& range) const
             int _d = d1 >> StereoMatcher::DISP_SHIFT;
             int d_ = (d1 + DISP_SCALE-1) >> StereoMatcher::DISP_SHIFT;
             int _x = x - _d, x_ = x - d_;
-            if( 0 <= _x && _x < width && disp2Buf[_x] >= minD && std::abs(disp2Buf[_x] - _d) > disp12MaxDiff &&
-                0 <= x_ && x_ < width && disp2Buf[x_] >= minD && std::abs(disp2Buf[x_] - d_) > disp12MaxDiff )
+            if( 0 <= _x && _x < width && mem.disp2Buf[_x] >= minD && std::abs(mem.disp2Buf[_x] - _d) > disp12MaxDiff &&
+                0 <= x_ && x_ < width && mem.disp2Buf[x_] >= minD && std::abs(mem.disp2Buf[x_] - d_) > disp12MaxDiff )
                 disp_row[x] = (short)INVALID_DISP_SCALED;
         }
     }
 }
 
-static void computeDisparity3WaySGBM( const Mat& img1, const Mat& img2,
-                                      Mat& disp1, const StereoSGBMParams& params,
-                                      Mat* buffers, int nstripes )
+template <uchar nstripes>
+static void computeDisparity3WaySGBM(const Mat& img1, const Mat& img2, Mat& disp1, const StereoSGBMParams& params)
 {
-    // precompute a lookup table for the raw matching cost computation:
-    const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
-    PixType* clipTab = new PixType[TAB_SIZE];
-    int ftzero = std::max(params.preFilterCap, 15) | 1;
-    for(int k = 0; k < TAB_SIZE; k++ )
-        clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
-
     // allocate separate dst_disp arrays to avoid conflicts due to stripe overlap:
     int stripe_sz = (int)ceil(img1.rows/(double)nstripes);
     int stripe_overlap = (params.SADWindowSize/2+1) + (int)ceil(0.1*stripe_sz);
-    Mat* dst_disp = new Mat[nstripes];
+    Mat dst_disp[nstripes];
     for(int i=0;i<nstripes;i++)
         dst_disp[i].create(stripe_sz+stripe_overlap,img1.cols,CV_16S);
 
-    parallel_for_(Range(0,nstripes),SGBM3WayMainLoop(buffers,img1,img2,dst_disp,params,clipTab,nstripes,stripe_overlap));
+    parallel_for_(
+        Range(0,nstripes),
+        SGBM3WayMainLoop(img1,img2,dst_disp,params,stripe_sz,stripe_overlap)
+    );
 
     //assemble disp1 from dst_disp:
     short* dst_row;
@@ -2140,9 +2180,6 @@ static void computeDisparity3WaySGBM( const Mat& img1, const Mat& img2,
         src_row = (short*)dst_disp[i/stripe_sz].ptr(stripe_overlap+i%stripe_sz);
         memcpy(dst_row,src_row,disp1.cols*sizeof(short));
     }
-
-    delete[] clipTab;
-    delete[] dst_disp;
 }
 
 class StereoSGBMImpl CV_FINAL : public StereoSGBM
@@ -2176,11 +2213,13 @@ public:
         Mat disp = disparr.getMat();
 
         if(params.mode==MODE_SGBM_3WAY)
-            computeDisparity3WaySGBM( left, right, disp, params, buffers, num_stripes );
+            // the number of stripes is fixed, disregarding the number of threads/processors
+            // to make the results fully reproducible
+            computeDisparity3WaySGBM<4>( left, right, disp, params );
         else if(params.mode==MODE_HH4)
-            computeDisparitySGBM_HH4( left, right, disp, params, buffer );
+            computeDisparitySGBM_HH4( left, right, disp, params );
         else
-            computeDisparitySGBM( left, right, disp, params, buffer );
+            computeDisparitySGBM( left, right, disp, params );
 
         medianBlur(disp, disp, 3);
 
@@ -2259,11 +2298,6 @@ public:
     StereoSGBMParams params;
     Mat buffer;
 
-    // the number of stripes is fixed, disregarding the number of threads/processors
-    // to make the results fully reproducible:
-    static const int num_stripes = 4;
-    Mat buffers[num_stripes];
-
     static const char* name_;
 };
 
diff --git a/modules/core/include/opencv2/core/check.hpp b/modules/core/include/opencv2/core/check.hpp
index 604447e8d7..0e0c7cbf31 100644
--- a/modules/core/include/opencv2/core/check.hpp
+++ b/modules/core/include/opencv2/core/check.hpp
@@ -79,6 +79,7 @@ CV_EXPORTS void CV_NORETURN check_failed_auto(const size_t v, const CheckContext
 CV_EXPORTS void CV_NORETURN check_failed_auto(const float v, const CheckContext& ctx);
 CV_EXPORTS void CV_NORETURN check_failed_auto(const double v, const CheckContext& ctx);
 CV_EXPORTS void CV_NORETURN check_failed_auto(const Size_<int> v, const CheckContext& ctx);
+CV_EXPORTS void CV_NORETURN check_failed_auto(const std::string& v, const CheckContext& ctx);  // overload for std::string values
 CV_EXPORTS void CV_NORETURN check_failed_MatDepth(const int v, const CheckContext& ctx);
 CV_EXPORTS void CV_NORETURN check_failed_MatType(const int v, const CheckContext& ctx);
 CV_EXPORTS void CV_NORETURN check_failed_MatChannels(const int v, const CheckContext& ctx);
diff --git a/modules/core/include/opencv2/core/hal/intrin_avx512.hpp b/modules/core/include/opencv2/core/hal/intrin_avx512.hpp
index 3fa9027c04..e189582daa 100644
--- a/modules/core/include/opencv2/core/hal/intrin_avx512.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_avx512.hpp
@@ -1553,13 +1553,13 @@ inline v_float64x8 v_cvt_f64(const v_int64x8& v)
     return v_float64x8(_mm512_cvtepi64_pd(v.val));
 #else
     // constants encoded as floating-point
-    __m512i magic_i_lo   = _mm512_set1_epi64x(0x4330000000000000); // 2^52
-    __m512i magic_i_hi32 = _mm512_set1_epi64x(0x4530000080000000); // 2^84 + 2^63
-    __m512i magic_i_all  = _mm512_set1_epi64x(0x4530000080100000); // 2^84 + 2^63 + 2^52
+    __m512i magic_i_lo   = _mm512_set1_epi64(0x4330000000000000); // 2^52
+    __m512i magic_i_hi32 = _mm512_set1_epi64(0x4530000080000000); // 2^84 + 2^63
+    __m512i magic_i_all  = _mm512_set1_epi64(0x4530000080100000); // 2^84 + 2^63 + 2^52
     __m512d magic_d_all  = _mm512_castsi512_pd(magic_i_all);
 
     // Blend the 32 lowest significant bits of v with magic_int_lo
-    __m512i v_lo         = _mm512_blend_epi32(magic_i_lo, v.val, 0x55);
+    __m512i v_lo         = _mm512_mask_blend_epi32(0x5555, magic_i_lo, v.val);
     // Extract the 32 most significant bits of v
     __m512i v_hi         = _mm512_srli_epi64(v.val, 32);
     // Flip the msb of v_hi and blend with 0x45300000
diff --git a/modules/core/include/opencv2/core/matx.hpp b/modules/core/include/opencv2/core/matx.hpp
index de34da45a4..6a22b1788e 100644
--- a/modules/core/include/opencv2/core/matx.hpp
+++ b/modules/core/include/opencv2/core/matx.hpp
@@ -1269,6 +1269,34 @@ Matx<_Tp, m, n> operator * (double alpha, const Matx<_Tp, m, n>& a)
     return Matx<_Tp, m, n>(a, alpha, Matx_ScaleOp());
 }
 
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n>& operator /= (Matx<_Tp, m, n>& a, float alpha)  // divides a by alpha in place and returns a
+{
+    for( int i = 0; i < m*n; i++ )  // i runs over [0, m*n)
+        a.val[i] = a.val[i] / alpha;  // per-element division; the quotient is assigned back without an explicit cast
+    return a;
+}
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n>& operator /= (Matx<_Tp, m, n>& a, double alpha)  // double-precision variant: divides a by alpha in place and returns a
+{
+    for( int i = 0; i < m*n; i++ )  // i runs over [0, m*n)
+        a.val[i] = a.val[i] / alpha;  // per-element division; the quotient is assigned back without an explicit cast
+    return a;
+}
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator / (const Matx<_Tp, m, n>& a, float alpha)  // returns a new matrix; a itself is not modified
+{
+    return Matx<_Tp, m, n>(a, 1.f/alpha, Matx_ScaleOp());  // passes the reciprocal to the Matx_ScaleOp constructor (the one scalar operator* uses); NOTE(review): operator /= divides instead, so results may differ slightly - confirm
+}
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator / (const Matx<_Tp, m, n>& a, double alpha)  // double-precision variant; returns a new matrix, a is not modified
+{
+    return Matx<_Tp, m, n>(a, 1./alpha, Matx_ScaleOp());  // passes the reciprocal to the Matx_ScaleOp constructor (the one scalar operator* uses); NOTE(review): operator /= divides instead, so results may differ slightly - confirm
+}
+
 template<typename _Tp, int m, int n> static inline
 Matx<_Tp, m, n> operator - (const Matx<_Tp, m, n>& a)
 {
diff --git a/modules/core/include/opencv2/core/utils/buffer_area.private.hpp b/modules/core/include/opencv2/core/utils/buffer_area.private.hpp
index 141ad2c502..ab19da6416 100644
--- a/modules/core/include/opencv2/core/utils/buffer_area.private.hpp
+++ b/modules/core/include/opencv2/core/utils/buffer_area.private.hpp
@@ -74,6 +74,25 @@ public:
         allocate_((void**)(&ptr), static_cast<ushort>(sizeof(T)), count, alignment);
     }
 
+    /** @brief Fill one of the buffers with zeroes
+
+    @param ptr pointer to memory block previously added using BufferArea::allocate
+
+    BufferArea::commit must be called before using this method
+    */
+    template <typename T>
+    void zeroFill(T*&ptr)
+    {
+        CV_Assert(ptr);
+        zeroFill_((void**)&ptr);
+    }
+
+    /** @brief Fill all buffers with zeroes
+
+    BufferArea::commit must be called before using this method
+    */
+    void zeroFill();
+
     /** @brief Allocate memory and initialize all bound pointers
 
     Each pointer bound to the area with the BufferArea::allocate will be initialized and will be set
@@ -83,10 +102,18 @@ public:
     */
     void commit();
 
+    /** @brief Release all memory and unbind all pointers
+
+    All memory will be freed, and all pointers will be reset to NULL and untied from the area, so that
+    `allocate` and `commit` can be called again.
+    */
+    void release();
+
 private:
     BufferArea(const BufferArea &); // = delete
     BufferArea &operator=(const BufferArea &); // = delete
     void allocate_(void **ptr, ushort type_size, size_t count, ushort alignment);
+    void zeroFill_(void **ptr);
 
 private:
     class Block;
diff --git a/modules/core/src/buffer_area.cpp b/modules/core/src/buffer_area.cpp
index 2a41c72f45..b6bb321bba 100644
--- a/modules/core/src/buffer_area.cpp
+++ b/modules/core/src/buffer_area.cpp
@@ -66,6 +66,16 @@ public:
         *ptr = buf;
         return static_cast<void*>(static_cast<uchar*>(*ptr) + type_size * count);
     }
+    bool operator==(void **other) const  // true when this block's bound pointer and *other hold the same address
+    {
+        CV_Assert(ptr && other);  // both pointer-to-pointer values must be non-null before they are dereferenced
+        return *ptr == *other;  // compares the stored addresses, not the void** locations themselves
+    }
+    void zeroFill() const  // zeroes the memory that this block's bound pointer refers to
+    {
+        CV_Assert(ptr && *ptr);  // the bound pointer must already hold an address
+        memset(static_cast<uchar*>(*ptr), 0, count * type_size);  // clears count * type_size bytes starting at *ptr
+    }
 private:
     void **ptr;
     void * raw_mem;
@@ -85,10 +95,7 @@ BufferArea::BufferArea(bool safe_) :
 
 BufferArea::~BufferArea()
 {
-    for(std::vector<Block>::const_iterator i = blocks.begin(); i != blocks.end(); ++i)
-        i->cleanup();
-    if (oneBuf)
-        fastFree(oneBuf);
+    release();
 }
 
 void BufferArea::allocate_(void **ptr, ushort type_size, size_t count, ushort alignment)
@@ -100,6 +107,26 @@ void BufferArea::allocate_(void **ptr, ushort type_size, size_t count, ushort al
         totalSize += blocks.back().getByteCount();
 }
 
+void BufferArea::zeroFill_(void **ptr)  // zero-fills the block whose bound pointer holds the same address as *ptr
+{
+    for(std::vector<Block>::const_iterator i = blocks.begin(); i != blocks.end(); ++i)
+    {
+        if (*i == ptr)  // Block::operator==(void**): compares the block's pointer value with *ptr
+        {
+            i->zeroFill();
+            break;  // only the first matching block is cleared
+        }
+    }  // when nothing matches, the loop ends without clearing anything and without reporting an error
+}
+
+void BufferArea::zeroFill()  // zero-fills every block currently registered in the area
+{
+    for(std::vector<Block>::const_iterator i = blocks.begin(); i != blocks.end(); ++i)
+    {
+        i->zeroFill();  // Block::zeroFill asserts that the block's pointer has been set
+    }
+}
+
 void BufferArea::commit()
 {
     if (!safe)
@@ -116,6 +143,20 @@ void BufferArea::commit()
     }
 }
 
+// Frees everything the area owns so that allocate()/commit() can start over on the same object.
+void BufferArea::release()
+{
+    for(std::vector<Block>::const_iterator i = blocks.begin(); i != blocks.end(); ++i)
+        i->cleanup();
+    blocks.clear();
+    totalSize = 0;  // allocate_() keeps adding to totalSize; without this reset a reused area would still count the released blocks
+    if (oneBuf)
+    {
+        fastFree(oneBuf);
+        oneBuf = 0;  // makes a repeated release() (e.g. from the destructor) a no-op for the shared buffer
+    }
+}
+
 //==================================================================================================
 
 }} // cv::utils::
diff --git a/modules/core/src/check.cpp b/modules/core/src/check.cpp
index 26efbb7541..90df841e63 100644
--- a/modules/core/src/check.cpp
+++ b/modules/core/src/check.cpp
@@ -171,6 +171,10 @@ void check_failed_auto(const Size_<int> v, const CheckContext& ctx)
 {
     check_failed_auto_< Size_<int> >(v, ctx);
 }
+void check_failed_auto(const std::string& v, const CheckContext& ctx)  // std::string overload of the check-failure reporter
+{
+    check_failed_auto_< std::string >(v, ctx);  // forwards to the shared template, like the Size_<int> overload above
+}
 
 
 }} // namespace
diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp
index 1f981ee871..48440ef265 100644
--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@@ -916,9 +916,9 @@ static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode )
     kercn = (cn!=3 || flipType == FLIP_ROWS) ? std::max(kercn, cn) : cn;
 
     ocl::Kernel k(kernelName, ocl::core::flip_oclsrc,
-        format( "-D T=%s -D T1=%s -D cn=%d -D PIX_PER_WI_Y=%d -D kercn=%d",
+        format( "-D T=%s -D T1=%s -D DEPTH=%d -D cn=%d -D PIX_PER_WI_Y=%d -D kercn=%d",
                 kercn != cn ? ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)) : ocl::vecopTypeToStr(CV_MAKE_TYPE(depth, kercn)),
-                kercn != cn ? ocl::typeToStr(depth) : ocl::vecopTypeToStr(depth), cn, pxPerWIy, kercn));
+                kercn != cn ? ocl::typeToStr(depth) : ocl::vecopTypeToStr(depth), depth, cn, pxPerWIy, kercn));
     if (k.empty())
         return false;
 
diff --git a/modules/core/src/matrix_expressions.cpp b/modules/core/src/matrix_expressions.cpp
index ea431336f0..5ac1fafbd6 100644
--- a/modules/core/src/matrix_expressions.cpp
+++ b/modules/core/src/matrix_expressions.cpp
@@ -1257,7 +1257,7 @@ int MatExpr::type() const
     if( isInitializer(*this) )
         return a.type();
     if( isCmp(*this) )
-        return CV_8U;
+        return CV_MAKETYPE(CV_8U, a.channels());
     return op ? op->type(*this) : -1;
 }
 
diff --git a/modules/core/src/matrix_wrap.cpp b/modules/core/src/matrix_wrap.cpp
index de9be8514f..b129f243f2 100644
--- a/modules/core/src/matrix_wrap.cpp
+++ b/modules/core/src/matrix_wrap.cpp
@@ -570,6 +570,7 @@ int _InputArray::sizend(int* arrsz, int i) const
     }
     else
     {
+        CV_CheckLE(dims(i), 2, "Not supported");  // TODO Support EXPR with 3+ dims
         Size sz2d = size(i);
         d = 2;
         if(arrsz)
diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp
index 5f89130ef9..0025e37095 100644
--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@@ -4702,6 +4702,8 @@ public:
             UMatData::MemoryFlag flags0 = static_cast<UMatData::MemoryFlag>(0);
             getBestFlags(ctx, accessFlags, usageFlags, createFlags, flags0);
 
+            bool copyOnMap = (flags0 & UMatData::COPY_ON_MAP) != 0;
+
             cl_context ctx_handle = (cl_context)ctx.ptr();
             int allocatorFlags = 0;
             UMatData::MemoryFlag tempUMatFlags = static_cast<UMatData::MemoryFlag>(0);
@@ -4761,8 +4763,15 @@ public:
             else
 #endif
             {
+                if( copyOnMap )
+                    accessFlags &= ~ACCESS_FAST;
+
                 tempUMatFlags = UMatData::TEMP_UMAT;
-                if (CV_OPENCL_ENABLE_MEM_USE_HOST_PTR
+                if (
+                #ifdef __APPLE__
+                    !copyOnMap &&
+                #endif
+                    CV_OPENCL_ENABLE_MEM_USE_HOST_PTR
                     // There are OpenCL runtime issues for less aligned data
                     && (CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR != 0
                         && u->origdata == cv::alignPtr(u->origdata, (int)CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR))
@@ -4790,7 +4799,7 @@ public:
             u->handle = handle;
             u->prevAllocator = u->currAllocator;
             u->currAllocator = this;
-            u->flags |= tempUMatFlags;
+            u->flags |= tempUMatFlags | flags0;
             u->allocatorFlags_ = allocatorFlags;
         }
         if (!!(accessFlags & ACCESS_WRITE))
diff --git a/modules/core/src/opencl/flip.cl b/modules/core/src/opencl/flip.cl
index bd670a5b72..afd14e4e1f 100644
--- a/modules/core/src/opencl/flip.cl
+++ b/modules/core/src/opencl/flip.cl
@@ -42,10 +42,25 @@
 #if kercn != 3
 #define loadpix(addr) *(__global const T *)(addr)
 #define storepix(val, addr)  *(__global T *)(addr) = val
+#define storepix_2(val0, val1, addr0, addr1) \
+    *(__global T *)(addr0) = val0; *(__global T *)(addr1) = val1
 #define TSIZE (int)sizeof(T)
 #else
 #define loadpix(addr) vload3(0, (__global const T1 *)(addr))
 #define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))
+#if DEPTH == 2 || DEPTH == 3
+#define storepix_2(val0, val1, addr0, addr1) \
+    ((__global T1 *)(addr0))[0] = val0.x; \
+    ((__global T1 *)(addr1))[0] = val1.x; \
+    ((__global T1 *)(addr0))[1] = val0.y; \
+    ((__global T1 *)(addr1))[1] = val1.y; \
+    ((__global T1 *)(addr0))[2] = val0.z; \
+    ((__global T1 *)(addr1))[2] = val1.z
+#else
+#define storepix_2(val0, val1, addr0, addr1) \
+    storepix(val0, addr0); \
+    storepix(val1, addr1)
+#endif
 #define TSIZE ((int)sizeof(T1)*3)
 #endif
 
@@ -69,8 +84,7 @@ __kernel void arithm_flip_rows(__global const uchar * srcptr, int src_step, int
             T src0 = loadpix(srcptr + src_index0);
             T src1 = loadpix(srcptr + src_index1);
 
-            storepix(src1, dstptr + dst_index0);
-            storepix(src0, dstptr + dst_index1);
+            storepix_2(src1, src0, dstptr + dst_index0, dstptr + dst_index1);
 
             src_index0 += src_step;
             src_index1 -= src_step;
@@ -115,8 +129,7 @@ __kernel void arithm_flip_rows_cols(__global const uchar * srcptr, int src_step,
 #endif
 #endif
 
-            storepix(src1, dstptr + dst_index0);
-            storepix(src0, dstptr + dst_index1);
+            storepix_2(src1, src0, dstptr + dst_index0, dstptr + dst_index1);
 
             src_index0 += src_step;
             src_index1 -= src_step;
@@ -161,8 +174,7 @@ __kernel void arithm_flip_cols(__global const uchar * srcptr, int src_step, int
 #endif
 #endif
 
-            storepix(src1, dstptr + dst_index0);
-            storepix(src0, dstptr + dst_index1);
+            storepix_2(src1, src0, dstptr + dst_index0, dstptr + dst_index1);
 
             src_index0 += src_step;
             src_index1 += src_step;
diff --git a/modules/core/src/parallel.cpp b/modules/core/src/parallel.cpp
index 82c0fb88c5..db2a6cae88 100644
--- a/modules/core/src/parallel.cpp
+++ b/modules/core/src/parallel.cpp
@@ -58,13 +58,20 @@
     #include <unistd.h>
     #include <stdio.h>
     #include <sys/types.h>
+    #include <fstream>
     #if defined __ANDROID__
         #include <sys/sysconf.h>
+        #include <sys/syscall.h>
+        #include <sched.h>
     #elif defined __APPLE__
         #include <sys/sysctl.h>
     #endif
 #endif
 
+#if defined CV_CXX11
+    #include <thread>
+#endif
+
 #ifdef _OPENMP
     #define HAVE_OPENMP
 #endif
@@ -758,19 +765,40 @@ int cv::getThreadNum(void)
 #endif
 }
 
-#ifdef __ANDROID__
-static inline int getNumberOfCPUsImpl()
+
+#if defined __linux__ || defined __GLIBC__ || defined __HAIKU__ || defined __ANDROID__
+  #define CV_CPU_GROUPS_1
+#endif
+
+#if defined __linux__ || defined __ANDROID__
+  #define CV_HAVE_CGROUPS 1
+#endif
+
+#if defined CV_CPU_GROUPS_1
+static inline
+std::string getFileContents(const char *filename)  // returns the whole file as a string; empty when it cannot be opened
 {
-   FILE* cpuPossible = fopen("/sys/devices/system/cpu/possible", "r");
-   if(!cpuPossible)
-       return 1;
+    std::ifstream ifs(filename);
+    if (!ifs.is_open())
+        return std::string();
+
+    std::string content( (std::istreambuf_iterator<char>(ifs) ),  // extra parentheses keep this from being parsed as a function declaration
+                         (std::istreambuf_iterator<char>()    ) );
+
+    if (ifs.fail())  // NOTE(review): istreambuf_iterator reads through the stream buffer, so the read above may never set this flag - confirm
+        return std::string();
 
-   char buf[2000]; //big enough for 1000 CPUs in worst possible configuration
-   char* pbuf = fgets(buf, sizeof(buf), cpuPossible);
-   fclose(cpuPossible);
-   if(!pbuf)
-      return 1;
+    return content;
+}
+
+static inline
+int getNumberOfCPUsImpl(const char *filename)
+{
+   std::string file_contents = getFileContents(filename);
+   if(file_contents.empty())
+       return 0;
 
+   char *pbuf = const_cast<char*>(file_contents.c_str());
    //parse string of form "0-1,3,5-7,10,13-15"
    int cpusAvailable = 0;
 
@@ -794,27 +822,76 @@ static inline int getNumberOfCPUsImpl()
       }
 
    }
-   return cpusAvailable ? cpusAvailable : 1;
+   return cpusAvailable;
 }
 #endif
 
+#if defined CV_HAVE_CGROUPS
+// CPU limit derived from the cgroup CFS bandwidth files (quota / period); 0 means "no limit known".
+static inline unsigned getNumberOfCPUsCFS()
+{
+    int cfs_quota = 0;
+    {
+        std::ifstream ss_quota("/sys/fs/cgroup/cpu/cpu.cfs_quota_us", std::ios::in | std::ios::binary);
+        ss_quota >> cfs_quota;
+
+        if (ss_quota.fail() || cfs_quota < 1) /* unreadable file, or cfs_quota is 0 or negative */
+            return 0;
+    }
+
+    int cfs_period = 0;
+    {
+        std::ifstream ss_period("/sys/fs/cgroup/cpu/cpu.cfs_period_us", std::ios::in | std::ios::binary);
+        ss_period >> cfs_period;
+
+        if (ss_period.fail() || cfs_period < 1) /* also guards the division below */
+            return 0;
+    }
+
+    return (unsigned)max(1, cfs_quota/cfs_period); /* integer division rounds down, but never below 1 */
+}
+#endif
+
+// Smaller of two values where 0 means "unknown": a zero operand yields the other operand.
+template <typename T> static inline
+T minNonZero(const T& val_1, const T& val_2)
+{
+    if (val_1 == 0) return val_2;
+    return (val_2 == 0) ? val_1 : std::min(val_1, val_2);
+}
+
 int cv::getNumberOfCPUs(void)
 {
+    /*
+     * The logic here is to try different methods of getting the CPU count and return
+     * the smallest value, as it has a high probability of being right and safe.
+     * Return 1 if all methods return 0 or find nothing.
+    */
+#if defined CV_CXX11
+    /*
+     * Try the standard C++11 way first. We do not return its result directly because
+     * when running in a Docker or K8s environment this is the host
+     * machine's config, not the container's or pod's, and as per the docs this value
+     * must be "considered only a hint".
+    */
+    unsigned ncpus = std::thread::hardware_concurrency(); /* If the value is not well defined or not computable, returns 0 */
+#else
+    unsigned ncpus = 0; /* 0 means we have to find out some other way */
+#endif
+
 #if defined _WIN32
+
     SYSTEM_INFO sysinfo;
 #if (defined(_M_ARM) || defined(_M_ARM64) || defined(_M_X64) || defined(WINRT)) && _WIN32_WINNT >= 0x501
     GetNativeSystemInfo( &sysinfo );
 #else
     GetSystemInfo( &sysinfo );
 #endif
+    unsigned ncpus_sysinfo = sysinfo.dwNumberOfProcessors < 0 ? 1 : sysinfo.dwNumberOfProcessors; /* Just a fail safe */
+    ncpus = minNonZero(ncpus, ncpus_sysinfo);
 
-    return (int)sysinfo.dwNumberOfProcessors;
-#elif defined __ANDROID__
-    static int ncpus = getNumberOfCPUsImpl();
-    return ncpus;
-#elif defined __linux__ || defined __GLIBC__ || defined __HAIKU__ || defined __EMSCRIPTEN__
-    return (int)sysconf( _SC_NPROCESSORS_ONLN );
 #elif defined __APPLE__
+
     int numCPU=0;
     int mib[4];
     size_t len = sizeof(numCPU);
@@ -835,10 +912,44 @@ int cv::getNumberOfCPUs(void)
             numCPU = 1;
     }
 
-    return (int)numCPU;
-#else
-    return 1;
+    ncpus = minNonZero(ncpus, (unsigned)numCPU);
+
+#elif defined CV_CPU_GROUPS_1
+
+#if defined CV_HAVE_CGROUPS
+    static unsigned ncpus_impl_cpuset = (unsigned)getNumberOfCPUsImpl("/sys/fs/cgroup/cpuset/cpuset.cpus");
+    ncpus = minNonZero(ncpus, ncpus_impl_cpuset);
+
+    static unsigned ncpus_impl_cfs = getNumberOfCPUsCFS();
+    ncpus = minNonZero(ncpus, ncpus_impl_cfs);
+#endif
+
+    static unsigned ncpus_impl_devices = (unsigned)getNumberOfCPUsImpl("/sys/devices/system/cpu/online");
+    ncpus = minNonZero(ncpus, ncpus_impl_devices);
+
 #endif
+
+#if defined _GNU_SOURCE \
+    && !defined(__EMSCRIPTEN__) \
+    && !defined(__ANDROID__)  // TODO: add check for modern Android NDK
+
+    cpu_set_t cpu_set;
+    if (0 == sched_getaffinity(0, sizeof(cpu_set), &cpu_set))
+    {
+        unsigned cpu_count_cpu_set = CPU_COUNT(&cpu_set);
+        ncpus = minNonZero(ncpus, cpu_count_cpu_set);
+    }
+
+#endif
+
+#if !defined(_WIN32) && !defined(__APPLE__)
+
+    static long cpu_count_sysconf = sysconf( _SC_NPROCESSORS_ONLN ); /* -1 on error: must not be cast to unsigned as-is */
+    ncpus = minNonZero(ncpus, cpu_count_sysconf > 0 ? (unsigned)cpu_count_sysconf : 0u);
+
+#endif
+
+    return ncpus != 0 ? ncpus : 1;
 }
 
 const char* cv::currentParallelFramework() {
diff --git a/modules/core/test/test_mat.cpp b/modules/core/test/test_mat.cpp
index d2328771d9..76d8c5f038 100644
--- a/modules/core/test/test_mat.cpp
+++ b/modules/core/test/test_mat.cpp
@@ -2007,6 +2007,17 @@ TEST(Core_MatExpr, issue_13926)
     EXPECT_GE(1e-6, cvtest::norm(M2*M1, M2*M2, NORM_INF)) << Mat(M2*M1) << std::endl << Mat(M2*M2);
 }
 
+TEST(Core_MatExpr, issue_16655)
+{
+    Mat a(Size(5, 5), CV_32FC3, Scalar::all(1));
+    Mat b(Size(5, 5), CV_32FC3, Scalar::all(2));
+    MatExpr ab_expr = a != b;
+    Mat ab_mat = ab_expr;
+    EXPECT_EQ(CV_8UC3, ab_expr.type())
+        << "MatExpr: CV_8UC3 != " << typeToString(ab_expr.type());
+    EXPECT_EQ(CV_8UC3, ab_mat.type())
+        << "Mat: CV_8UC3 != " << typeToString(ab_mat.type());
+}
 
 #ifdef HAVE_EIGEN
 TEST(Core_Eigen, eigen2cv_check_Mat_type)
diff --git a/modules/core/test/test_operations.cpp b/modules/core/test/test_operations.cpp
index dd91617537..caef417883 100644
--- a/modules/core/test/test_operations.cpp
+++ b/modules/core/test/test_operations.cpp
@@ -69,6 +69,8 @@ protected:
     bool TestVec();
     bool TestMatxMultiplication();
     bool TestMatxElementwiseDivison();
+    bool TestDivisionByValue();
+    bool TestInplaceDivisionByValue();
     bool TestMatMatxCastSum();
     bool TestSubMatAccess();
     bool TestExp();
@@ -976,6 +978,50 @@ bool CV_OperationsTest::TestMatxElementwiseDivison()
     return true;
 }
 
+bool CV_OperationsTest::TestDivisionByValue()
+{
+    try
+    {
+        Matx22f mat(2, 4, 6, 8);
+        float alpha = 2.f;
+
+        Matx22f res = mat / alpha;
+
+        if(res(0, 0) != 1.0) throw test_excep();
+        if(res(0, 1) != 2.0) throw test_excep();
+        if(res(1, 0) != 3.0) throw test_excep();
+        if(res(1, 1) != 4.0) throw test_excep();
+    }
+    catch(const test_excep&)
+    {
+        ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_OUTPUT);
+        return false;
+    }
+    return true;
+}
+
+
+bool CV_OperationsTest::TestInplaceDivisionByValue()
+{
+    try
+    {
+        Matx22f mat(2, 4, 6, 8);
+        float alpha = 2.f;
+
+        mat /= alpha;
+
+        if(mat(0, 0) != 1.0) throw test_excep();
+        if(mat(0, 1) != 2.0) throw test_excep();
+        if(mat(1, 0) != 3.0) throw test_excep();
+        if(mat(1, 1) != 4.0) throw test_excep();
+    }
+    catch(const test_excep&)
+    {
+        ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_OUTPUT);
+        return false;
+    }
+    return true;
+}
 
 bool CV_OperationsTest::TestVec()
 {
@@ -1204,6 +1250,12 @@ void CV_OperationsTest::run( int /* start_from */)
     if (!TestMatxElementwiseDivison())
         return;
 
+    if (!TestDivisionByValue())
+        return;
+
+    if (!TestInplaceDivisionByValue())
+        return;
+
     if (!TestMatMatxCastSum())
         return;
 
diff --git a/modules/core/test/test_utils.cpp b/modules/core/test/test_utils.cpp
index 87891488ec..1a23e01fb9 100644
--- a/modules/core/test/test_utils.cpp
+++ b/modules/core/test/test_utils.cpp
@@ -337,6 +337,21 @@ TEST_P(BufferArea, basic)
         ASSERT_TRUE(dbl_ptr != NULL);
         EXPECT_EQ((size_t)0, (size_t)int_ptr % sizeof(int));
         EXPECT_EQ((size_t)0, (size_t)dbl_ptr % sizeof(double));
+        for (size_t i = 0; i < SZ; ++i)
+        {
+            int_ptr[i] = (int)i + 1;
+            uchar_ptr[i] = (uchar)i + 1;
+            dbl_ptr[i] = (double)i + 1;
+        }
+        area.zeroFill(int_ptr);
+        area.zeroFill(uchar_ptr);
+        area.zeroFill(dbl_ptr);
+        for (size_t i = 0; i < SZ; ++i)
+        {
+            EXPECT_EQ((int)0, int_ptr[i]);
+            EXPECT_EQ((uchar)0, uchar_ptr[i]);
+            EXPECT_EQ((double)0, dbl_ptr[i]);
+        }
     }
     EXPECT_TRUE(int_ptr == NULL);
     EXPECT_TRUE(uchar_ptr == NULL);
diff --git a/modules/dnn/include/opencv2/dnn/dnn.hpp b/modules/dnn/include/opencv2/dnn/dnn.hpp
index 4699dac9ca..a28c98483d 100644
--- a/modules/dnn/include/opencv2/dnn/dnn.hpp
+++ b/modules/dnn/include/opencv2/dnn/dnn.hpp
@@ -494,6 +494,10 @@ CV__DNN_INLINE_NS_BEGIN
          */
         CV_WRAP void setInputsNames(const std::vector<String> &inputBlobNames);
 
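+        /** @brief Specify shape of network input.
+         *  @p inputName must be a known input name; an already specified non-empty shape can't be redefined. */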
+        CV_WRAP void setInputShape(const String &inputName, const MatShape& shape);
+
         /** @brief Runs forward pass to compute output of layer with name @p outputName.
          *  @param outputName name for layer which output is needed to get
          *  @return blob for first output of specified layer.
diff --git a/modules/dnn/include/opencv2/dnn/shape_utils.hpp b/modules/dnn/include/opencv2/dnn/shape_utils.hpp
index 98adcc382d..5b8d953c1a 100644
--- a/modules/dnn/include/opencv2/dnn/shape_utils.hpp
+++ b/modules/dnn/include/opencv2/dnn/shape_utils.hpp
@@ -138,6 +138,16 @@ static inline MatShape shape(const UMat& mat)
     return shape(mat.size.p, mat.dims);
 }
 
+#if 0  // issues with MatExpr wrapped into InputArray
+static inline
+MatShape shape(InputArray input)
+{
+    int sz[CV_MAX_DIM];
+    int ndims = input.sizend(sz);
+    return shape(sz, ndims);
+}
+#endif
+
 namespace {inline bool is_neg(int i) { return i < 0; }}
 
 static inline MatShape shape(int a0, int a1=-1, int a2=-1, int a3=-1)
diff --git a/modules/dnn/src/caffe/caffe_importer.cpp b/modules/dnn/src/caffe/caffe_importer.cpp
index 4a5a1f1c0a..8ceee2e587 100644
--- a/modules/dnn/src/caffe/caffe_importer.cpp
+++ b/modules/dnn/src/caffe/caffe_importer.cpp
@@ -484,10 +484,7 @@ public:
         {
             CV_CheckEQ(inp_shapes.size(), netInputs.size(), "");
             for (int inp_id = 0; inp_id < inp_shapes.size(); inp_id++)
-            {
-                if (!inp_shapes[inp_id].empty())
-                    dstNet.setInput(Mat(inp_shapes[inp_id], CV_32F), netInputs[inp_id]);
-            }
+                dstNet.setInputShape(netInputs[inp_id], inp_shapes[inp_id]);
         }
 
         addedBlobs.clear();
diff --git a/modules/dnn/src/darknet/darknet_io.cpp b/modules/dnn/src/darknet/darknet_io.cpp
index 5e1b125a0c..b93d740109 100644
--- a/modules/dnn/src/darknet/darknet_io.cpp
+++ b/modules/dnn/src/darknet/darknet_io.cpp
@@ -556,7 +556,7 @@ namespace cv {
                             const size_t layer_type_size = line.find("]") - 1;
                             CV_Assert(layer_type_size < line.size());
                             std::string layer_type = line.substr(1, layer_type_size);
-                            net->layers_cfg[layers_counter]["type"] = layer_type;
+                            net->layers_cfg[layers_counter]["layer_type"] = layer_type;
                         }
                         break;
                     default:
@@ -599,7 +599,7 @@ namespace cv {
                 for (it_type i = net->layers_cfg.begin(); i != net->layers_cfg.end(); ++i) {
                     ++layers_counter;
                     std::map<std::string, std::string> &layer_params = i->second;
-                    std::string layer_type = layer_params["type"];
+                    std::string layer_type = layer_params["layer_type"];
 
                     if (layer_type == "convolutional")
                     {
@@ -682,7 +682,7 @@ namespace cv {
                         else
                             setParams.setConcat(layers_vec.size(), layers_vec.data());
                     }
-                    else if (layer_type == "dropout")
+                    else if (layer_type == "dropout" || layer_type == "cost")
                     {
                         setParams.setIdentity(layers_counter-1);
                     }
@@ -806,7 +806,7 @@ namespace cv {
                     ++darknet_layers_counter;
                     ++cv_layers_counter;
                     std::map<std::string, std::string> &layer_params = i->second;
-                    std::string layer_type = layer_params["type"];
+                    std::string layer_type = layer_params["layer_type"];
 
                     if (layer_type == "convolutional" || layer_type == "connected")
                     {
diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp
index 0577e2c834..2dd06c50f5 100644
--- a/modules/dnn/src/dnn.cpp
+++ b/modules/dnn/src/dnn.cpp
@@ -758,6 +758,18 @@ struct DataLayer : public Layer
     void setNames(const std::vector<String> &names)
     {
         outNames.assign(names.begin(), names.end());
+        shapes.clear(); shapes.resize(outNames.size());
+    }
+
+    void setInputShape(const String& tgtName, const MatShape& shape)
+    {
+        std::vector<String>::const_iterator it = std::find(outNames.begin(), outNames.end(), tgtName);
+        CV_Check(tgtName, it != outNames.end(), "Unknown input");
+        int idx = (int)(it - outNames.begin());
+
+        CV_Assert(idx < (int)shapes.size());
+        CV_Check(tgtName, shapes[idx].empty(), "Input shape redefinition is not allowed");
+        shapes[idx] = shape;
     }
 
     bool getMemoryShapes(const std::vector<MatShape> &inputs,
@@ -820,6 +832,7 @@ struct DataLayer : public Layer
 #endif  // HAVE_INF_ENGINE
 
     std::vector<String> outNames;
+    std::vector<MatShape> shapes;
     // Preprocessing parameters for each network's input.
     std::vector<double> scaleFactors;
     std::vector<Scalar> means;
@@ -2012,7 +2025,9 @@ struct Net::Impl
                 for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
                 {
                     InferenceEngine::DataPtr dataPtr = ngraphDataNode(ld.outputBlobsWrappers[i]);
-                    dataPtr->setName(netInputLayer->outNames.empty() ? ld.name : netInputLayer->outNames[i]);
+                    std::string outputName = netInputLayer->outNames.empty() ? ld.name : netInputLayer->outNames[i];
+                    outputName = ld.outputBlobsWrappers.size() > 1 ? (outputName + "." + std::to_string(i)) : outputName;
+                    dataPtr->setName(outputName);
                 }
             }
             else
@@ -2020,7 +2035,8 @@ struct Net::Impl
                 for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
                 {
                     InferenceEngine::DataPtr dataPtr = ngraphDataNode(ld.outputBlobsWrappers[i]);
-                    dataPtr->setName(ld.name);
+                    std::string outputName = ld.outputBlobsWrappers.size() > 1 ? (ld.name + "." + std::to_string(i)) : ld.name;
+                    dataPtr->setName(outputName);
                 }
             }
         }
@@ -2061,6 +2077,9 @@ struct Net::Impl
             return;
         }
 
+        bool supportsCPUFallback = preferableTarget == DNN_TARGET_CPU ||
+                                   BackendRegistry::checkIETarget(DNN_TARGET_CPU);
+
         // Build Inference Engine networks from sets of layers that support this
         // backend. Split a whole model on several Inference Engine networks if
         // some of layers are not implemented.
@@ -2075,20 +2094,47 @@ struct Net::Impl
             Ptr<Layer> layer = ld.layerInstance;
             if (!fused && !layer->supportBackend(preferableBackend))
             {
-                addNgraphOutputs(ld);
-                net = Ptr<InfEngineNgraphNet>();
-                layer->preferableTarget = DNN_TARGET_CPU;
+                bool customizable = ld.id != 0 && supportsCPUFallback;
 
-                for (int i = 0; i < ld.inputBlobsId.size(); ++i)
+                // TODO: there is a bug in Myriad plugin with custom layers shape infer.
+                if (preferableTarget == DNN_TARGET_MYRIAD)
                 {
-                    LayerData &inpLd = layers[ld.inputBlobsId[i].lid];
-                    Ptr<BackendNode> inpNode = inpLd.backendNodes[preferableBackend];
-                    if (!inpNode.empty()) {
-                        Ptr<InfEngineNgraphNode> ieNode = inpNode.dynamicCast<InfEngineNgraphNode>();
-                        ieNode->net->setUnconnectedNodes(ieNode);
+                    for (int i = 0; customizable && i < ld.inputBlobs.size(); ++i)
+                    {
+                        customizable = ld.inputBlobs[i]->size[0] == 1;
                     }
                 }
-                continue;
+
+                // TODO: fix these workarounds
+                if (preferableTarget == DNN_TARGET_MYRIAD ||
+                    preferableTarget == DNN_TARGET_OPENCL ||
+                    preferableTarget == DNN_TARGET_OPENCL_FP16)
+                    customizable &= ld.type != "Concat";
+
+                if (preferableTarget == DNN_TARGET_OPENCL ||
+                    preferableTarget == DNN_TARGET_OPENCL_FP16)
+                    customizable &= ld.type != "Power";
+
+                if (preferableTarget == DNN_TARGET_OPENCL)
+                    customizable &= ld.type != "Eltwise";
+
+                if (!customizable)
+                {
+                    addNgraphOutputs(ld);
+                    net = Ptr<InfEngineNgraphNet>();
+                    layer->preferableTarget = DNN_TARGET_CPU;
+
+                    for (int i = 0; i < ld.inputBlobsId.size(); ++i)
+                    {
+                        LayerData &inpLd = layers[ld.inputBlobsId[i].lid];
+                        Ptr<BackendNode> inpNode = inpLd.backendNodes[preferableBackend];
+                        if (!inpNode.empty()) {
+                            Ptr<InfEngineNgraphNode> ieNode = inpNode.dynamicCast<InfEngineNgraphNode>();
+                            ieNode->net->setUnconnectedNodes(ieNode);
+                        }
+                    }
+                    continue;
+                }
             }
             ld.skip = true;  // Initially skip all Inference Engine supported layers.
 
@@ -2162,12 +2208,32 @@ struct Net::Impl
 
             if (!fused)
             {
-                CV_Assert(!inputNodes.empty());
-                node = layer->initNgraph(ld.inputBlobsWrappers, inputNodes);
-                for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
+                CV_Assert(ld.inputBlobsId.size() == inputNodes.size());
+                for (int i = 0; i < ld.inputBlobsId.size(); ++i)
                 {
-                    InferenceEngine::DataPtr dataPtr = ngraphDataNode(ld.outputBlobsWrappers[i]);
-                    node.dynamicCast<InfEngineNgraphNode>()->setName(dataPtr->getName());
+                    int lid = ld.inputBlobsId[i].lid;
+                    int oid = ld.inputBlobsId[i].oid;
+                    if (oid == 0 || lid == 0)
+                        continue;
+
+                    auto ieInpNode = inputNodes[i].dynamicCast<InfEngineNgraphNode>();
+                    CV_Assert(oid < ieInpNode->node->get_output_size());
+                    inputNodes[i] = Ptr<BackendNode>(new InfEngineNgraphNode(ieInpNode->node->get_output_as_single_output_node(oid, false)));
+                }
+
+                if (layer->supportBackend(preferableBackend))
+                {
+                    node = layer->initNgraph(ld.inputBlobsWrappers, inputNodes);
+                    for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
+                    {
+                        InferenceEngine::DataPtr dataPtr = ngraphDataNode(ld.outputBlobsWrappers[i]);
+                        node.dynamicCast<InfEngineNgraphNode>()->setName(dataPtr->getName());
+                    }
+                }
+                else
+                {
+                    node = Ptr<BackendNode>(new InfEngineNgraphNode(inputNodes,
+                        ld.layerInstance, ld.inputBlobs, ld.outputBlobs, ld.internals));
                 }
             }
             else if (node.empty())
@@ -3145,8 +3211,25 @@ struct Net::Impl
             }
             else
             {
-                inOutShapes[0].out.clear();
-                return;
+                const std::vector<MatShape>& inputShapes = netInputLayer->shapes;
+                bool none = true;
+                for (size_t i = 0; i < inputShapes.size(); i++)
+                {
+                    if (!inputShapes[i].empty())
+                    {
+                        none = false;
+                        break;
+                    }
+                }
+                if (none)
+                {
+                    inOutShapes[0].out.clear();
+                    return;
+                }
+                else
+                {
+                    inOutShapes[0].in = inputShapes;
+                }
             }
         }
 
@@ -3372,7 +3455,7 @@ Net Net::Impl::createNetworkFromModelOptimizer(InferenceEngine::CNNNetwork& ieNe
     // set empty input to determine input shapes
     for (int inp_id = 0; inp_id < inputsNames.size(); ++inp_id)
     {
-        cvNet.setInput(Mat(inp_shapes[inp_id], CV_32F), inputsNames[inp_id]);
+        cvNet.setInputShape(inputsNames[inp_id], inp_shapes[inp_id]);
     }
 
     Ptr<BackendNode> backendNode;
@@ -3798,6 +3881,13 @@ void Net::setInputsNames(const std::vector<String> &inputBlobNames)
     impl->netInputLayer->setNames(inputBlobNames);
 }
 
+void Net::setInputShape(const String &inputName, const MatShape& shape)
+{
+    CV_TRACE_FUNCTION();
+
+    impl->netInputLayer->setInputShape(inputName, shape);
+}
+
 void Net::setInput(InputArray blob, const String& name, double scalefactor, const Scalar& mean)
 {
     CV_TRACE_FUNCTION();
@@ -3810,6 +3900,33 @@ void Net::setInput(InputArray blob, const String& name, double scalefactor, cons
     if (!pin.valid())
         CV_Error(Error::StsObjectNotFound, "Requested blob \"" + name + "\" not found");
 
+    Mat blob_ = blob.getMat();  // can't use InputArray directly due MatExpr stuff
+    MatShape blobShape = shape(blob_);
+
+    if (pin.lid == 0)
+    {
+        CV_Assert(!impl->netInputLayer.empty());
+        const DataLayer& netInputLayer = *impl->netInputLayer.get();
+        if (!netInputLayer.shapes.empty())
+        {
+            CV_CheckLT(pin.oid, (int)netInputLayer.shapes.size(), "");
+            const MatShape& inputShapeLimitation = netInputLayer.shapes[pin.oid];
+            if (!inputShapeLimitation.empty())
+            {
+                CV_CheckEQ(inputShapeLimitation.size(), blobShape.size(), "");
+#if 0  // TODO: DNNTestNetwork.MobileNet_SSD_Caffe_Different_Width_Height/0
+                const size_t dims = inputShapeLimitation.size();
+                for (size_t dim = 0; dim < dims; dim++)
+                {
+                    if (dims >= 3 && dim == 0 && inputShapeLimitation[0] == 1)
+                        continue;  // don't limit batch
+                    CV_CheckEQ(inputShapeLimitation[dim], blobShape[dim], "");
+                }
+#endif
+            }
+        }
+    }
+
     LayerData &ld = impl->layers[pin.lid];
     const int numInputs = std::max(pin.oid+1, (int)ld.requiredOutputs.size());
     ld.outputBlobs.resize(numInputs);
@@ -3819,17 +3936,11 @@ void Net::setInput(InputArray blob, const String& name, double scalefactor, cons
     impl->netInputLayer->means.resize(numInputs);
 
     MatShape prevShape = shape(impl->netInputLayer->inputsData[pin.oid]);
-    Mat blob_ = blob.getMat();
-    bool oldShape = prevShape == shape(blob_);
-    if (oldShape)
-    {
-        blob_.copyTo(impl->netInputLayer->inputsData[pin.oid]);
-    }
-    else
-    {
-        ld.outputBlobs[pin.oid] = blob_.clone();
-        impl->netInputLayer->inputsData[pin.oid] = ld.outputBlobs[pin.oid];
-    }
+    bool oldShape = prevShape == blobShape;
+
+    blob_.copyTo(impl->netInputLayer->inputsData[pin.oid]);
+    if (!oldShape)
+        ld.outputBlobs[pin.oid] = impl->netInputLayer->inputsData[pin.oid];
 
     if (!ld.outputBlobsWrappers[pin.oid].empty())
     {
diff --git a/modules/dnn/src/ie_ngraph.cpp b/modules/dnn/src/ie_ngraph.cpp
index d7df547412..e8cfd1265e 100644
--- a/modules/dnn/src/ie_ngraph.cpp
+++ b/modules/dnn/src/ie_ngraph.cpp
@@ -26,6 +26,35 @@ namespace cv { namespace dnn {
 // OpenCV lets users use an empty input name and to prevent unexpected naming,
 // we can use some predefined name.
 static std::string kDefaultInpLayerName = "empty_inp_layer_name";
+static constexpr const char* kOpenCVLayersType = "OpenCVLayer";
+
+static std::string shapesToStr(const std::vector<Mat>& mats)
+{
+    std::ostringstream shapes;
+    shapes << mats.size() << " ";
+    for (const Mat& m : mats)
+    {
+        shapes << m.dims << " ";
+        for (int i = 0; i < m.dims; ++i)
+            shapes << m.size[i] << " ";
+    }
+    return shapes.str();
+}
+
+static void strToShapes(const std::string& str, std::vector<std::vector<size_t> >& shapes)
+{
+    std::istringstream ss(str);
+    int num, dims;
+    ss >> num;
+    shapes.resize(num);
+    for (int i = 0; i < num; ++i)
+    {
+        ss >> dims;
+        shapes[i].resize(dims);
+        for (int j = 0; j < dims; ++j)
+            ss >> shapes[i][j];
+    }
+}
 
 static std::vector<Ptr<NgraphBackendWrapper> >
 ngraphWrappers(const std::vector<Ptr<BackendWrapper> >& ptrs)
@@ -40,12 +69,82 @@ ngraphWrappers(const std::vector<Ptr<BackendWrapper> >& ptrs)
     return wrappers;
 }
 
+class NgraphCustomOp: public ngraph::op::Op {
+public:
+    const ngraph::NodeTypeInfo& get_type_info() const override
+    {
+        static constexpr ngraph::NodeTypeInfo type_info{kOpenCVLayersType, 0};
+        return type_info;
+    }
+
+    NgraphCustomOp() {};
+    NgraphCustomOp(const ngraph::NodeVector& inputs,
+                   const std::map<std::string, InferenceEngine::Parameter>& params = {}):
+        Op(inputs), params(params)
+    {
+        constructor_validate_and_infer_types();
+    }
+
+    void validate_and_infer_types() override
+    {
+        std::vector<std::vector<size_t> > shapes;
+        strToShapes(params["outputs"], shapes);
+        set_output_size(shapes.size());
+        for (size_t i = 0; i < shapes.size(); ++i)
+        {
+            ngraph::Shape output_shape(shapes[i]);
+            set_output_type(i, get_input_element_type(0), output_shape);
+        }
+    }
+
+    std::shared_ptr<ngraph::Node> copy_with_new_args(const ngraph::NodeVector& new_args) const override
+    {
+        return std::make_shared<NgraphCustomOp>(new_args, params);
+    }
+
+    bool visit_attributes(ngraph::AttributeVisitor& visitor) override
+    {
+        for (auto& attr : params)
+        {
+            if (attr.second.is<std::string>())
+                visitor.on_attribute(attr.first, attr.second.as<std::string>());
+        }
+        return true;
+    }
+
+private:
+    std::map<std::string, InferenceEngine::Parameter> params;
+};
+
 InfEngineNgraphNode::InfEngineNgraphNode(std::shared_ptr<ngraph::Node>&& _node)
     : BackendNode(DNN_BACKEND_INFERENCE_ENGINE_NGRAPH), node(std::move(_node)) {}
 
 InfEngineNgraphNode::InfEngineNgraphNode(std::shared_ptr<ngraph::Node>& _node)
     : BackendNode(DNN_BACKEND_INFERENCE_ENGINE_NGRAPH), node(_node) {}
 
+InfEngineNgraphNode::InfEngineNgraphNode(const std::vector<Ptr<BackendNode> >& nodes,
+                                         Ptr<Layer>& cvLayer_, std::vector<Mat*>& inputs,
+                                         std::vector<Mat>& outputs, std::vector<Mat>& internals)
+    : BackendNode(DNN_BACKEND_INFERENCE_ENGINE_NGRAPH), cvLayer(cvLayer_)
+{
+    std::ostringstream oss;
+    oss << (size_t)cvLayer.get();
+
+    std::map<std::string, InferenceEngine::Parameter> params = {
+        {"impl", oss.str()},
+        {"outputs", shapesToStr(outputs)},
+        {"internals", shapesToStr(internals)}
+    };
+
+    ngraph::NodeVector inp_nodes;
+    for (const auto& node : nodes)
+        inp_nodes.emplace_back(node.dynamicCast<InfEngineNgraphNode>()->node);
+    node = std::make_shared<NgraphCustomOp>(inp_nodes, params);
+
+    CV_Assert(!cvLayer->name.empty());
+    setName(cvLayer->name);
+}
+
 void InfEngineNgraphNode::setName(const std::string& name) {
     node->set_friendly_name(name);
 }
@@ -342,7 +441,24 @@ void InfEngineNgraphNet::initPlugin(InferenceEngine::CNNNetwork& net)
         if (device_name == "MYRIAD") {
             config.emplace("VPU_DETECT_NETWORK_BATCH", CONFIG_VALUE(NO));
         }
-        netExec = ie.LoadNetwork(net, device_name, config);
+
+        bool isHetero = false;
+        if (device_name != "CPU")
+        {
+            isHetero = device_name == "FPGA";
+            for (auto& layer : net)
+            {
+                if (layer->type == kOpenCVLayersType)
+                {
+                    isHetero = true;
+                    break;
+                }
+            }
+        }
+        if (isHetero)
+            netExec = ie.LoadNetwork(net, "HETERO:" + device_name + ",CPU", config);
+        else
+            netExec = ie.LoadNetwork(net, device_name, config);
     }
     catch (const std::exception& ex)
     {
diff --git a/modules/dnn/src/ie_ngraph.hpp b/modules/dnn/src/ie_ngraph.hpp
index c24839dc67..3058178cbe 100644
--- a/modules/dnn/src/ie_ngraph.hpp
+++ b/modules/dnn/src/ie_ngraph.hpp
@@ -90,6 +90,10 @@ private:
 class InfEngineNgraphNode : public BackendNode
 {
 public:
+    InfEngineNgraphNode(const std::vector<Ptr<BackendNode> >& nodes, Ptr<Layer>& layer,
+                        std::vector<Mat*>& inputs, std::vector<Mat>& outputs,
+                        std::vector<Mat>& internals);
+
     InfEngineNgraphNode(std::shared_ptr<ngraph::Node>&& _node);
     InfEngineNgraphNode(std::shared_ptr<ngraph::Node>& _node);
 
@@ -98,6 +102,7 @@ public:
     // Inference Engine network object that allows to obtain the outputs of this layer.
     std::shared_ptr<ngraph::Node> node;
     Ptr<InfEngineNgraphNet> net;
+    Ptr<dnn::Layer> cvLayer;
 };
 
 class NgraphBackendWrapper : public BackendWrapper
diff --git a/modules/dnn/src/layers/const_layer.cpp b/modules/dnn/src/layers/const_layer.cpp
index e8435c67b0..e72b87e917 100644
--- a/modules/dnn/src/layers/const_layer.cpp
+++ b/modules/dnn/src/layers/const_layer.cpp
@@ -9,6 +9,7 @@
 #include "../op_inf_engine.hpp"
 #include "../op_cuda.hpp"
 #include "layers_common.hpp"
+#include "../ie_ngraph.hpp"
 
 #ifdef HAVE_OPENCL
 #include "opencl_kernels_dnn.hpp"
@@ -34,6 +35,7 @@ public:
     {
         return backendId == DNN_BACKEND_OPENCV ||
                backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 ||
+               backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH ||
                backendId == DNN_BACKEND_CUDA;
     }
 
@@ -82,6 +84,17 @@ public:
     }
 #endif  // HAVE_INF_ENGINE
 
+#ifdef HAVE_DNN_NGRAPH
+    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
+                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
+    {
+        auto node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32,
+                                                           getShape<size_t>(blobs[0]),
+                                                           blobs[0].data);
+        return Ptr<BackendNode>(new InfEngineNgraphNode(node));
+    }
+#endif  // HAVE_DNN_NGRAPH
+
 #ifdef HAVE_CUDA
     Ptr<BackendNode> initCUDA(
         void *context_,
diff --git a/modules/dnn/test/test_backends.cpp b/modules/dnn/test/test_backends.cpp
index c94093a550..23a804c92a 100644
--- a/modules/dnn/test/test_backends.cpp
+++ b/modules/dnn/test/test_backends.cpp
@@ -263,7 +263,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_v1_TensorFlow_Different_Width_Height)
     float scoreDiff = 0.0, iouDiff = 0.0;
     if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
     {
-        scoreDiff = 0.012;
+        scoreDiff = 0.013;
         iouDiff = 0.06;
     }
     else if (target == DNN_TARGET_CUDA_FP16)
diff --git a/modules/dnn/test/test_caffe_importer.cpp b/modules/dnn/test/test_caffe_importer.cpp
index c030a92952..8207584837 100644
--- a/modules/dnn/test/test_caffe_importer.cpp
+++ b/modules/dnn/test/test_caffe_importer.cpp
@@ -168,7 +168,11 @@ typedef testing::TestWithParam<tuple<bool, Target> > Reproducibility_AlexNet;
 TEST_P(Reproducibility_AlexNet, Accuracy)
 {
     Target targetId = get<1>(GetParam());
+#if defined(OPENCV_32BIT_CONFIGURATION) && defined(HAVE_OPENCL)
+    applyTestTag(CV_TEST_TAG_MEMORY_2GB);
+#else
     applyTestTag(targetId == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB);
+#endif
     ASSERT_TRUE(ocl::useOpenCL() || targetId == DNN_TARGET_CPU);
 
     bool readFromMemory = get<0>(GetParam());
@@ -668,7 +672,11 @@ INSTANTIATE_TEST_CASE_P(Test_Caffe, opencv_face_detector,
 TEST_P(Test_Caffe_nets, FasterRCNN_vgg16)
 {
     applyTestTag(
+#if defined(OPENCV_32BIT_CONFIGURATION) && defined(HAVE_OPENCL)
+        CV_TEST_TAG_MEMORY_2GB,  // utilizes ~1GB, but huge blobs may not be allocated on 32-bit systems due to memory fragmentation
+#else
         (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB),
+#endif
         CV_TEST_TAG_LONG,
         CV_TEST_TAG_DEBUG_VERYLONG
     );
@@ -693,7 +701,11 @@ TEST_P(Test_Caffe_nets, FasterRCNN_vgg16)
 TEST_P(Test_Caffe_nets, FasterRCNN_zf)
 {
     applyTestTag(
+#if defined(OPENCV_32BIT_CONFIGURATION) && defined(HAVE_OPENCL)
+        CV_TEST_TAG_MEMORY_2GB,
+#else
         (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB),
+#endif
         CV_TEST_TAG_DEBUG_LONG
     );
     if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 ||
diff --git a/modules/dnn/test/test_darknet_importer.cpp b/modules/dnn/test/test_darknet_importer.cpp
index 71b2a433ea..69306ab059 100644
--- a/modules/dnn/test/test_darknet_importer.cpp
+++ b/modules/dnn/test/test_darknet_importer.cpp
@@ -300,7 +300,14 @@ public:
 
 TEST_P(Test_Darknet_nets, YoloVoc)
 {
-    applyTestTag(CV_TEST_TAG_LONG, CV_TEST_TAG_MEMORY_1GB);
+    applyTestTag(
+#if defined(OPENCV_32BIT_CONFIGURATION) && defined(HAVE_OPENCL)
+        CV_TEST_TAG_MEMORY_2GB,
+#else
+        CV_TEST_TAG_MEMORY_1GB,
+#endif
+        CV_TEST_TAG_LONG
+    );
 
 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2019010000)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_OPENCL_FP16)
diff --git a/modules/dnn/test/test_misc.cpp b/modules/dnn/test/test_misc.cpp
index 72a9f80aeb..922a44b891 100644
--- a/modules/dnn/test/test_misc.cpp
+++ b/modules/dnn/test/test_misc.cpp
@@ -56,7 +56,10 @@ TEST(imagesFromBlob, Regression)
 
     for (int i = 0; i < nbOfImages; i++)
     {
-        ASSERT_EQ(cv::countNonZero(inputImgs[i] != outputImgs[i]), 0);
+        EXPECT_EQ(0, cvtest::norm(inputImgs[i], outputImgs[i], NORM_INF))
+            << "i=" << i
+            << " inputImgs[i]=" << inputImgs[i].size
+            << " outputImgs[i]=" << outputImgs[i].size;
     }
 }
 
@@ -78,6 +81,65 @@ TEST(readNet, Regression)
     EXPECT_FALSE(net.empty());
 }
 
+TEST(readNet, do_not_call_setInput)  // https://github.com/opencv/opencv/issues/16618
+{
+    // 1. load network
+    const string proto = findDataFile("dnn/squeezenet_v1.1.prototxt");
+    const string model = findDataFile("dnn/squeezenet_v1.1.caffemodel", false);
+    Net net = readNetFromCaffe(proto, model);
+
+    // 2. mistake: no inputs are specified through .setInput()
+
+    // 3. try inference
+    Mat res;
+    EXPECT_THROW(
+    {
+        res = net.forward();  // no inputs after loading => should fail
+    }, cv::Exception);
+    EXPECT_TRUE(res.empty()) << res.size;
+}
+
+#ifdef HAVE_INF_ENGINE
+static
+void test_readNet_IE_do_not_call_setInput(Backend backendId)
+{
+    const Target targetId = DNN_TARGET_CPU;
+
+    const std::string& model = findDataFile("dnn/layers/layer_convolution.bin");
+    const std::string& proto = findDataFile("dnn/layers/layer_convolution.xml");
+
+    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
+        setInferenceEngineBackendType(CV_DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_API);
+    else if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
+        setInferenceEngineBackendType(CV_DNN_BACKEND_INFERENCE_ENGINE_NGRAPH);
+    else
+        FAIL() << "Unknown backendId";
+
+    Net net = readNet(model, proto);
+    net.setPreferableBackend(backendId);
+    net.setPreferableTarget(targetId);
+
+    // 2. mistake: no inputs are specified through .setInput()
+
+    // 3. try inference
+    Mat res;
+    EXPECT_THROW(
+    {
+        res = net.forward();  // no inputs after loading => should fail
+    }, cv::Exception);
+    EXPECT_TRUE(res.empty()) << res.size;
+}
+
+TEST(readNet, do_not_call_setInput_IE_NN_BUILDER_2019)
+{
+    test_readNet_IE_do_not_call_setInput(DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019);
+}
+TEST(readNet, do_not_call_setInput_IE_NGRAPH)
+{
+    test_readNet_IE_do_not_call_setInput(DNN_BACKEND_INFERENCE_ENGINE_NGRAPH);
+}
+#endif  // HAVE_INF_ENGINE
+
 typedef testing::TestWithParam<tuple<Backend, Target> > dump;
 TEST_P(dump, Regression)
 {
diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp
index df953ca79d..3262a6799d 100644
--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@@ -461,7 +461,12 @@ public:
 
 TEST_P(Test_ONNX_nets, Alexnet)
 {
+#if defined(OPENCV_32BIT_CONFIGURATION) && defined(HAVE_OPENCL)
+    applyTestTag(CV_TEST_TAG_MEMORY_2GB);
+#else
     applyTestTag(target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB);
+#endif
+
     const String model =  _tf("models/alexnet.onnx", false);
 
     Net net = readNetFromONNX(model);
@@ -520,7 +525,12 @@ TEST_P(Test_ONNX_nets, Googlenet)
 
 TEST_P(Test_ONNX_nets, CaffeNet)
 {
+#if defined(OPENCV_32BIT_CONFIGURATION) && defined(HAVE_OPENCL)
+    applyTestTag(CV_TEST_TAG_MEMORY_2GB);
+#else
     applyTestTag(target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB);
+#endif
+
 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2019030000)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD
         && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
@@ -531,7 +541,12 @@ TEST_P(Test_ONNX_nets, CaffeNet)
 
 TEST_P(Test_ONNX_nets, RCNN_ILSVRC13)
 {
+#if defined(OPENCV_32BIT_CONFIGURATION) && defined(HAVE_OPENCL)
+    applyTestTag(CV_TEST_TAG_MEMORY_2GB);
+#else
     applyTestTag(target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB);
+#endif
+
 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2019030000)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD
         && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
@@ -643,7 +658,11 @@ TEST_P(Test_ONNX_nets, MobileNet_v2)
 TEST_P(Test_ONNX_nets, LResNet100E_IR)
 {
     applyTestTag(
+#if defined(OPENCV_32BIT_CONFIGURATION) && defined(HAVE_OPENCL)
+        CV_TEST_TAG_MEMORY_2GB,
+#else
         (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB),
+#endif
         CV_TEST_TAG_DEBUG_LONG
     );
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp
index e25243b52d..4f96bda13c 100644
--- a/modules/dnn/test/test_tf_importer.cpp
+++ b/modules/dnn/test/test_tf_importer.cpp
@@ -163,12 +163,12 @@ TEST_P(Test_TensorFlow_layers, padding)
     runTensorFlowNet("spatial_padding");
     runTensorFlowNet("mirror_pad");
 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2019020000)
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
+    if (target == DNN_TARGET_MYRIAD)
     {
-        if (target == DNN_TARGET_MYRIAD)
+        if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
             applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
-        if (target == DNN_TARGET_OPENCL_FP16)
-            applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+        if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
+            applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
     }
 #endif
     runTensorFlowNet("keras_pad_concat");
@@ -864,6 +864,8 @@ TEST_P(Test_TensorFlow_layers, split)
 {
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
     runTensorFlowNet("split");
 }
 
@@ -1002,7 +1004,7 @@ TEST_P(Test_TensorFlow_nets, Mask_RCNN)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
 
     if (target == DNN_TARGET_MYRIAD && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X);
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
 
     applyTestTag(CV_TEST_TAG_MEMORY_1GB, CV_TEST_TAG_DEBUG_VERYLONG);
     Mat img = imread(findDataFile("dnn/street.png"));
diff --git a/modules/dnn/test/test_torch_importer.cpp b/modules/dnn/test/test_torch_importer.cpp
index 682ecebea5..b52141913f 100644
--- a/modules/dnn/test/test_torch_importer.cpp
+++ b/modules/dnn/test/test_torch_importer.cpp
@@ -410,9 +410,12 @@ TEST_P(Test_Torch_nets, ENet_accuracy)
         throw SkipTestException("");
     }
 #endif
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target != DNN_TARGET_CPU)
     {
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
+        if (target == DNN_TARGET_OPENCL_FP16) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
+        if (target == DNN_TARGET_OPENCL)      applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
+        if (target == DNN_TARGET_MYRIAD)      applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
+        throw SkipTestException("");
     }
 
     Net net;
diff --git a/modules/features2d/src/fast.cpp b/modules/features2d/src/fast.cpp
index 99d4e59bd9..e373926a14 100644
--- a/modules/features2d/src/fast.cpp
+++ b/modules/features2d/src/fast.cpp
@@ -47,6 +47,7 @@ The references are:
 #include "opencl_kernels_features2d.hpp"
 #include "hal_replacement.hpp"
 #include "opencv2/core/hal/intrin.hpp"
+#include "opencv2/core/utils/buffer_area.private.hpp"
 
 #include "opencv2/core/openvx/ovx_defs.hpp"
 
@@ -80,20 +81,26 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
     for( i = -255; i <= 255; i++ )
         threshold_tab[i+255] = (uchar)(i < -threshold ? 1 : i > threshold ? 2 : 0);
 
-    AutoBuffer<uchar> _buf((img.cols+16)*3*(sizeof(int) + sizeof(uchar)) + 128);
-    uchar* buf[3];
-    buf[0] = _buf.data(); buf[1] = buf[0] + img.cols; buf[2] = buf[1] + img.cols;
-    int* cpbuf[3];
-    cpbuf[0] = (int*)alignPtr(buf[2] + img.cols, sizeof(int)) + 1;
-    cpbuf[1] = cpbuf[0] + img.cols + 1;
-    cpbuf[2] = cpbuf[1] + img.cols + 1;
-    memset(buf[0], 0, img.cols*3);
+    uchar* buf[3] = { 0 };
+    int* cpbuf[3] = { 0 };
+    utils::BufferArea area;
+    for (unsigned idx = 0; idx < 3; ++idx)
+    {
+        area.allocate(buf[idx], img.cols);
+        area.allocate(cpbuf[idx], img.cols + 1);
+    }
+    area.commit();
+
+    for (unsigned idx = 0; idx < 3; ++idx)
+    {
+        memset(buf[idx], 0, img.cols);
+    }
 
     for(i = 3; i < img.rows-2; i++)
     {
         const uchar* ptr = img.ptr<uchar>(i) + 3;
         uchar* curr = buf[(i - 3)%3];
-        int* cornerpos = cpbuf[(i - 3)%3];
+        int* cornerpos = cpbuf[(i - 3)%3] + 1; // cornerpos[-1] is used to store a value
         memset(curr, 0, img.cols);
         int ncorners = 0;
 
@@ -266,7 +273,7 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
 
         const uchar* prev = buf[(i - 4 + 3)%3];
         const uchar* pprev = buf[(i - 5 + 3)%3];
-        cornerpos = cpbuf[(i - 4 + 3)%3];
+        cornerpos = cpbuf[(i - 4 + 3)%3] + 1; // cornerpos[-1] is used to store a value
         ncorners = cornerpos[-1];
 
         for( k = 0; k < ncorners; k++ )
diff --git a/modules/imgproc/misc/java/test/ImgprocTest.java b/modules/imgproc/misc/java/test/ImgprocTest.java
index c5873a7bc4..f7cd1811df 100644
--- a/modules/imgproc/misc/java/test/ImgprocTest.java
+++ b/modules/imgproc/misc/java/test/ImgprocTest.java
@@ -427,7 +427,7 @@ public class ImgprocTest extends OpenCVTestCase {
         Imgproc.convexHull(points, hull);
 
         MatOfInt expHull = new MatOfInt(
-                1, 2, 3, 0
+                0, 1, 2, 3
         );
         assertMatEqual(expHull, hull, EPS);
     }
diff --git a/modules/imgproc/src/convhull.cpp b/modules/imgproc/src/convhull.cpp
index e288f6a626..b964ca3f62 100644
--- a/modules/imgproc/src/convhull.cpp
+++ b/modules/imgproc/src/convhull.cpp
@@ -45,7 +45,7 @@
 namespace cv
 {
 
-template<typename _Tp>
+template<typename _Tp, typename _DotTp>
 static int Sklansky_( Point_<_Tp>** array, int start, int end, int* stack, int nsign, int sign2 )
 {
     int incr = end > start ? 1 : -1;
@@ -79,7 +79,7 @@ static int Sklansky_( Point_<_Tp>** array, int start, int end, int* stack, int n
             _Tp ax = array[pcur]->x - array[pprev]->x;
             _Tp bx = array[pnext]->x - array[pcur]->x;
             _Tp ay = cury - array[pprev]->y;
-            _Tp convexity = ay*bx - ax*by; // if >0 then convex angle
+            _DotTp convexity = (_DotTp)ay*bx - (_DotTp)ax*by; // if >0 then convex angle
 
             if( CV_SIGN( convexity ) == sign2 && (ax != 0 || ay != 0) )
             {
@@ -122,7 +122,13 @@ template<typename _Tp>
 struct CHullCmpPoints
 {
     bool operator()(const Point_<_Tp>* p1, const Point_<_Tp>* p2) const
-    { return p1->x < p2->x || (p1->x == p2->x && p1->y < p2->y); }
+    {
+        if( p1->x != p2->x )
+            return p1->x < p2->x;  // primary key: x coordinate
+        if( p1->y != p2->y )
+            return p1->y < p2->y;  // secondary key: y coordinate
+        return p1 < p2;  // coincident points: fall back to comparing the pointers themselves
+    }
 };
 
 
@@ -194,12 +200,12 @@ void convexHull( InputArray _points, OutputArray _hull, bool clockwise, bool ret
         // upper half
         int *tl_stack = stack;
         int tl_count = !is_float ?
-            Sklansky_( pointer, 0, maxy_ind, tl_stack, -1, 1) :
-            Sklansky_( pointerf, 0, maxy_ind, tl_stack, -1, 1);
+            Sklansky_<int, int64>( pointer, 0, maxy_ind, tl_stack, -1, 1) :
+            Sklansky_<float, double>( pointerf, 0, maxy_ind, tl_stack, -1, 1);
         int *tr_stack = stack + tl_count;
         int tr_count = !is_float ?
-            Sklansky_( pointer, total-1, maxy_ind, tr_stack, -1, -1) :
-            Sklansky_( pointerf, total-1, maxy_ind, tr_stack, -1, -1);
+            Sklansky_<int, int64>( pointer, total-1, maxy_ind, tr_stack, -1, -1) :
+            Sklansky_<float, double>( pointerf, total-1, maxy_ind, tr_stack, -1, -1);
 
         // gather upper part of convex hull to output
         if( !clockwise )
@@ -217,12 +223,12 @@ void convexHull( InputArray _points, OutputArray _hull, bool clockwise, bool ret
         // lower half
         int *bl_stack = stack;
         int bl_count = !is_float ?
-            Sklansky_( pointer, 0, miny_ind, bl_stack, 1, -1) :
-            Sklansky_( pointerf, 0, miny_ind, bl_stack, 1, -1);
+            Sklansky_<int, int64>( pointer, 0, miny_ind, bl_stack, 1, -1) :
+            Sklansky_<float, double>( pointerf, 0, miny_ind, bl_stack, 1, -1);
         int *br_stack = stack + bl_count;
         int br_count = !is_float ?
-            Sklansky_( pointer, total-1, miny_ind, br_stack, 1, 1) :
-            Sklansky_( pointerf, total-1, miny_ind, br_stack, 1, 1);
+            Sklansky_<int, int64>( pointer, total-1, miny_ind, br_stack, 1, 1) :
+            Sklansky_<float, double>( pointerf, total-1, miny_ind, br_stack, 1, 1);
 
         if( clockwise )
         {
@@ -250,6 +256,45 @@ void convexHull( InputArray _points, OutputArray _hull, bool clockwise, bool ret
             hullbuf[nout++] = int(pointer[bl_stack[i]] - data0);
         for( i = br_count-1; i > 0; i-- )
             hullbuf[nout++] = int(pointer[br_stack[i]] - data0);
+
+        // try to make the convex hull indices form
+        // an ascending or descending sequence by the cyclic
+        // shift of the output sequence.
+        if( nout >= 3 )
+        {
+            int min_idx = 0, max_idx = 0, lt = 0;
+            for( i = 1; i < nout; i++ )
+            {
+                int idx = hullbuf[i];
+                lt += hullbuf[i-1] < idx;
+                if( lt > 1 && lt <= i-2 )
+                    break;
+                if( idx < hullbuf[min_idx] )
+                    min_idx = i;
+                if( idx > hullbuf[max_idx] )
+                    max_idx = i;
+            }
+            int mmdist = std::abs(max_idx - min_idx);
+            if( (mmdist == 1 || mmdist == nout-1) && (lt <= 1 || lt >= nout-2) )
+            {
+                int ascending = (max_idx + 1) % nout == min_idx;
+                int i0 = ascending ? min_idx : max_idx, j = i0;
+                if( i0 > 0 )
+                {
+                    for( i = 0; i < nout; i++ )
+                    {
+                        int curr_idx = stack[i] = hullbuf[j];
+                        int next_j = j+1 < nout ? j+1 : 0;
+                        int next_idx = hullbuf[next_j];
+                        if( i < nout-1 && (ascending != (curr_idx < next_idx)) )
+                            break;
+                        j = next_j;
+                    }
+                    if( i == nout )
+                        memcpy(hullbuf, stack, nout*sizeof(hullbuf[0]));
+                }
+            }
+        }
     }
 
     if( !returnPoints )
@@ -299,12 +344,22 @@ void convexityDefects( InputArray _points, InputArray _hull, OutputArray _defect
     int hcurr = hptr[rev_orientation ? 0 : hpoints-1];
     CV_Assert( 0 <= hcurr && hcurr < npoints );
 
+    int increasing_idx = -1;
+
     for( i = 0; i < hpoints; i++ )
     {
         int hnext = hptr[rev_orientation ? hpoints - i - 1 : i];
         CV_Assert( 0 <= hnext && hnext < npoints );
 
         Point pt0 = ptr[hcurr], pt1 = ptr[hnext];
+        if( increasing_idx < 0 ) // direction not determined yet (still the initial -1)
+            increasing_idx = !(hcurr < hnext); // negated on purpose: hcurr starts at the opposite end of hptr, so this first pair is the wrap-around one
+        else if( increasing_idx != (hcurr < hnext)) // any later pair that runs in the other direction is rejected
+        {
+            CV_Error(Error::StsBadArg,
+            "The convex hull indices are not monotonous, which can be in the case when the input contour contains self-intersections");
+        }
+
         double dx0 = pt1.x - pt0.x;
         double dy0 = pt1.y - pt0.y;
         double scale = dx0 == 0 && dy0 == 0 ? 0. : 1./std::sqrt(dx0*dx0 + dy0*dy0);
diff --git a/modules/imgproc/src/pyramids.cpp b/modules/imgproc/src/pyramids.cpp
index deb8bbc02c..2fa8537c36 100644
--- a/modules/imgproc/src/pyramids.cpp
+++ b/modules/imgproc/src/pyramids.cpp
@@ -1078,7 +1078,7 @@ static bool ocl_pyrUp( InputArray _src, OutputArray _dst, const Size& _dsz, int
     UMat dst = _dst.getUMat();
 
     int float_depth = depth == CV_64F ? CV_64F : CV_32F;
-    const int local_size = 16;
+    const int local_size = channels == 1 ? 16 : 8;
     char cvt[2][50];
     String buildOptions = format(
             "-D T=%s -D FT=%s -D convertToT=%s -D convertToFT=%s%s "
@@ -1092,22 +1092,17 @@ static bool ocl_pyrUp( InputArray _src, OutputArray _dst, const Size& _dsz, int
     size_t globalThreads[2] = { (size_t)dst.cols, (size_t)dst.rows };
     size_t localThreads[2] = { (size_t)local_size, (size_t)local_size };
     ocl::Kernel k;
-    if (ocl::Device::getDefault().isIntel() && channels == 1)
+    if (type == CV_8UC1 && src.cols % 2 == 0)
     {
-        if (type == CV_8UC1 && src.cols % 2 == 0)
-        {
-            buildOptions.clear();
-            k.create("pyrUp_cols2", ocl::imgproc::pyramid_up_oclsrc, buildOptions);
-            globalThreads[0] = dst.cols/4; globalThreads[1] = dst.rows/2;
-        }
-        else
-        {
-            k.create("pyrUp_unrolled", ocl::imgproc::pyr_up_oclsrc, buildOptions);
-            globalThreads[0] = dst.cols/2; globalThreads[1] = dst.rows/2;
-        }
+        buildOptions.clear();
+        k.create("pyrUp_cols2", ocl::imgproc::pyramid_up_oclsrc, buildOptions);
+        globalThreads[0] = dst.cols/4; globalThreads[1] = dst.rows/2;
     }
     else
-        k.create("pyrUp", ocl::imgproc::pyr_up_oclsrc, buildOptions);
+    {
+        k.create("pyrUp_unrolled", ocl::imgproc::pyr_up_oclsrc, buildOptions);
+        globalThreads[0] = dst.cols/2; globalThreads[1] = dst.rows/2;
+    }
 
     if (k.empty())
         return false;
diff --git a/modules/imgproc/test/test_convhull.cpp b/modules/imgproc/test/test_convhull.cpp
index 4de81f1f57..68f2f8e8fa 100644
--- a/modules/imgproc/test/test_convhull.cpp
+++ b/modules/imgproc/test/test_convhull.cpp
@@ -2154,5 +2154,157 @@ TEST(Imgproc_FitLine, regression_4903)
     EXPECT_GE(fabs(lineParam[1]), fabs(lineParam[0]) * 4) << lineParam;
 }
 
+#if 0
+#define DRAW(x) x
+#else
+#define DRAW(x)
+#endif
+
+// the Python test by @hannarud is converted to C++; see the issue #4539
+TEST(Imgproc_ConvexityDefects, ordering_4539)
+{
+    int contour[][2] =
+    {
+        {26,  9}, {25, 10}, {24, 10}, {23, 10}, {22, 10}, {21, 10}, {20, 11}, {19, 11}, {18, 11}, {17, 12},
+        {17, 13}, {18, 14}, {18, 15}, {18, 16}, {18, 17}, {19, 18}, {19, 19}, {20, 20}, {21, 21}, {21, 22},
+        {22, 23}, {22, 24}, {23, 25}, {23, 26}, {24, 27}, {25, 28}, {26, 29}, {27, 30}, {27, 31}, {28, 32},
+        {29, 32}, {30, 33}, {31, 34}, {30, 35}, {29, 35}, {30, 35}, {31, 34}, {32, 34}, {33, 34}, {34, 33},
+        {35, 32}, {35, 31}, {35, 30}, {36, 29}, {37, 28}, {37, 27}, {38, 26}, {39, 25}, {40, 24}, {40, 23},
+        {41, 22}, {42, 21}, {42, 20}, {42, 19}, {43, 18}, {43, 17}, {44, 16}, {45, 15}, {45, 14}, {46, 13},
+        {46, 12}, {45, 11}, {44, 11}, {43, 11}, {42, 10}, {41, 10}, {40,  9}, {39,  9}, {38,  9}, {37,  9},
+        {36,  9}, {35,  9}, {34,  9}, {33,  9}, {32,  9}, {31,  9}, {30,  9}, {29,  9}, {28,  9}, {27,  9}
+    };
+    int npoints = (int)(sizeof(contour)/sizeof(contour[0][0])/2);
+    Mat contour_(1, npoints, CV_32SC2, contour);
+    vector<Point> hull;
+    vector<int> hull_ind;
+    vector<Vec4i> defects;
+
+    // first, check the original contour as-is, without intermediate fillPoly/drawContours.
+    convexHull(contour_, hull_ind, false, false);
+    EXPECT_THROW( convexityDefects(contour_, hull_ind, defects), cv::Exception );
+
+    int scale = 20;
+    contour_ *= (double)scale;
+
+    Mat canvas_gray(Size(60*scale, 45*scale), CV_8U, Scalar::all(0));
+    const Point* ptptr = contour_.ptr<Point>();
+    fillPoly(canvas_gray, &ptptr, &npoints, 1, Scalar(255, 255, 255));
+
+    vector<vector<Point> > contours;
+    findContours(canvas_gray, contours, noArray(), RETR_LIST, CHAIN_APPROX_SIMPLE);
+    convexHull(contours[0], hull_ind, false, false);
+
+    // the original contour contains self-intersections,
+    // therefore convexHull does not return a monotonic sequence of point indices
+    // and therefore convexityDefects throws an exception
+    EXPECT_THROW( convexityDefects(contours[0], hull_ind, defects), cv::Exception );
+
+#if 1
+    // one way to eliminate the contour self-intersection in this particular case is to apply dilate(),
+    // so that the self-repeating points are not self-repeating anymore
+    dilate(canvas_gray, canvas_gray, Mat());
+#else
+    // another popular technique to eliminate such thin "hair" is to use morphological "close" operation,
+    // which is erode() + dilate()
+    erode(canvas_gray, canvas_gray, Mat());
+    dilate(canvas_gray, canvas_gray, Mat());
+#endif
+
+    // after the "fix", the newly retrieved contour should not have self-intersections,
+    // and everything should work well
+    findContours(canvas_gray, contours, noArray(), RETR_LIST, CHAIN_APPROX_SIMPLE);
+    convexHull(contours[0], hull, false, true);
+    convexHull(contours[0], hull_ind, false, false);
+
+    DRAW(Mat canvas(Size(60*scale, 45*scale), CV_8UC3, Scalar::all(0));
+        drawContours(canvas, contours, -1, Scalar(255, 255, 255), -1));
+
+    size_t nhull = hull.size();
+    ASSERT_EQ( nhull, hull_ind.size() );
+
+    if( nhull > 2 )
+    {
+        bool initial_lt = hull_ind[0] < hull_ind[1];
+        for( size_t i = 0; i < nhull; i++ )
+        {
+            int ind = hull_ind[i];
+            Point pt = contours[0][ind];
+
+            ASSERT_EQ(pt, hull[i]);
+            if( i > 0 )
+            {
+                // check that the convex hull indices are monotone
+                if( initial_lt )
+                {
+                    ASSERT_LT(hull_ind[i-1], hull_ind[i]);
+                }
+                else
+                {
+                    ASSERT_GT(hull_ind[i-1], hull_ind[i]);
+                }
+            }
+            DRAW(circle(canvas, pt, 7, Scalar(180, 0, 180), -1, LINE_AA);
+                putText(canvas, format("%d (%d)", (int)i, ind), pt+Point(15, 0), FONT_HERSHEY_SIMPLEX, 0.4, Scalar(200, 0, 200), 1, LINE_AA));
+            //printf("%d. ind=%d, pt=(%d, %d)\n", (int)i, ind, pt.x, pt.y);
+        }
+    }
+
+    convexityDefects(contours[0], hull_ind, defects);
+
+    for(size_t i = 0; i < defects.size(); i++ )
+    {
+        Vec4i d = defects[i];
+        //printf("defect %d. start=%d, end=%d, farthest=%d, depth=%d\n", (int)i, d[0], d[1], d[2], d[3]);
+        EXPECT_LT(d[0], d[1]);
+        EXPECT_LE(d[0], d[2]);
+        EXPECT_LE(d[2], d[1]);
+
+        DRAW(Point start = contours[0][d[0]];
+             Point end = contours[0][d[1]];
+             Point far = contours[0][d[2]];
+             line(canvas, start, end, Scalar(255, 255, 128), 3, LINE_AA);
+             line(canvas, start, far, Scalar(255, 150, 255), 3, LINE_AA);
+             line(canvas, end, far, Scalar(255, 150, 255), 3, LINE_AA);
+             circle(canvas, start, 7, Scalar(0, 0, 255), -1, LINE_AA);
+             circle(canvas, end, 7, Scalar(0, 0, 255), -1, LINE_AA);
+             circle(canvas, far, 7, Scalar(255, 0, 0), -1, LINE_AA));
+    }
+
+    DRAW(imshow("defects", canvas);
+         waitKey());
+}
+
+#undef DRAW
+
+TEST(Imgproc_ConvexHull, overflow)
+{
+    std::vector<Point> points;
+    std::vector<Point2f> pointsf;
+
+    points.push_back(Point(14763, 2890));
+    points.push_back(Point(14388, 72088));
+    points.push_back(Point(62810, 72274));
+    points.push_back(Point(63166, 3945));
+    points.push_back(Point(56782, 3945));
+    points.push_back(Point(56763, 3077));
+    points.push_back(Point(34666, 2965));
+    points.push_back(Point(34547, 2953));
+    points.push_back(Point(34508, 2866));
+    points.push_back(Point(34429, 2965));
+
+    size_t i, n = points.size();
+    for( i = 0; i < n; i++ )
+        pointsf.push_back(Point2f(points[i]));
+
+    std::vector<int> hull;
+    std::vector<int> hullf;
+
+    convexHull(points, hull, false, false);
+    convexHull(pointsf, hullf, false, false);
+
+    ASSERT_EQ(hull, hullf);
+}
+
 }} // namespace
 /* End of file. */
diff --git a/modules/imgproc/test/test_histograms.cpp b/modules/imgproc/test/test_histograms.cpp
index b21acd5f03..a6c75a318d 100644
--- a/modules/imgproc/test/test_histograms.cpp
+++ b/modules/imgproc/test/test_histograms.cpp
@@ -1966,7 +1966,7 @@ TEST(Imgproc_Hist_Calc, badarg)
     Mat img = cv::Mat::zeros(10, 10, CV_8UC1);
     Mat imgInt = cv::Mat::zeros(10, 10, CV_32SC1);
     Mat hist;
-    const int hist_size[] = { 100 };
+    const int hist_size[] = { 100, 100 };
     // base run
     EXPECT_NO_THROW(cv::calcHist(&img, 1, channels, noArray(), hist, 1, hist_size, ranges, true));
     // bad parameters
diff --git a/platforms/scripts/valgrind.supp b/platforms/scripts/valgrind.supp
index 3fdacb737f..1fa1fff688 100644
--- a/platforms/scripts/valgrind.supp
+++ b/platforms/scripts/valgrind.supp
@@ -43,7 +43,7 @@
 {
    OpenCV-getCoreTlsData
    Memcheck:Leak
-   fun:_Znwm
+   ...
    fun:_ZN2cv14getCoreTlsDataEv
 }
 
diff --git a/samples/cpp/tutorial_code/videoio/video-input-psnr-ssim/video-input-psnr-ssim.cpp b/samples/cpp/tutorial_code/videoio/video-input-psnr-ssim/video-input-psnr-ssim.cpp
index be0b1a8a21..8d567b2f5e 100644
--- a/samples/cpp/tutorial_code/videoio/video-input-psnr-ssim/video-input-psnr-ssim.cpp
+++ b/samples/cpp/tutorial_code/videoio/video-input-psnr-ssim/video-input-psnr-ssim.cpp
@@ -132,6 +132,7 @@ int main(int argc, char *argv[])
     return 0;
 }
 
+// ![get-psnr]
 double getPSNR(const Mat& I1, const Mat& I2)
 {
     Mat s1;
@@ -152,6 +153,9 @@ double getPSNR(const Mat& I1, const Mat& I2)
         return psnr;
     }
 }
+// ![get-psnr]
+
+// ![get-mssim]
 
 Scalar getMSSIM( const Mat& i1, const Mat& i2)
 {
@@ -205,3 +209,4 @@ Scalar getMSSIM( const Mat& i1, const Mat& i2)
     Scalar mssim = mean(ssim_map);   // mssim = average of ssim map
     return mssim;
 }
+// ![get-mssim]
diff --git a/samples/python/tutorial_code/videoio/video-input-psnr-ssim.py b/samples/python/tutorial_code/videoio/video-input-psnr-ssim.py
new file mode 100644
index 0000000000..84610d4768
--- /dev/null
+++ b/samples/python/tutorial_code/videoio/video-input-psnr-ssim.py
@@ -0,0 +1,148 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Python 2/3 compatibility
+from __future__ import print_function
+
+import numpy as np
+import cv2 as cv
+import argparse
+import sys
+
+# [get-psnr]
+def getPSNR(I1, I2):
+    """Return the PSNR in dB between two same-shaped frames, or 0 when they are (almost) identical."""
+    # Square the absolute difference in float32; 8-bit storage cannot hold the squares.
+    diff = np.float32(cv.absdiff(I1, I2))
+    sse = (diff * diff).sum()   # squared error summed over every pixel and channel
+    if sse <= 1e-10:
+        return 0                # negligible error: return zero rather than divide by ~0
+    # Mean over all elements. I1.size equals shape[0] * shape[1] * shape[2] for a
+    # 3-channel frame and, unlike that product, is also defined for 2-D
+    # (single-channel) frames whose shape has no third entry.
+    mse = 1.0 * sse / I1.size
+    return 10.0 * np.log10((255 * 255) / mse)   # the peak value of 255 is hard-coded
+# [get-psnr]
+
+# [get-mssim]
+def getMSSISM(i1, i2):
+    """Compute the mean SSIM between two frames.
+
+    i1, i2 -- frames to compare; both are converted to float32 first.
+    Returns the result of cv.mean() applied to the per-pixel SSIM map
+    (the caller in this script reads indices 0, 1 and 2 of it).
+    """
+    # Constants of the SSIM formula; numerically they equal
+    # (0.01 * 255)^2 and (0.03 * 255)^2.
+    C1 = 6.5025
+    C2 = 58.5225
+
+    def blur(image):
+        # every local statistic below uses the same 11x11 Gaussian window, sigma 1.5
+        return cv.GaussianBlur(image, (11, 11), 1.5)
+
+    # float32 copies: products of pixel values do not fit into one byte
+    ref = np.float32(i1)
+    tst = np.float32(i2)
+
+    # local means and their products
+    mean_ref = blur(ref)
+    mean_tst = blur(tst)
+    mean_ref_sq = mean_ref * mean_ref
+    mean_tst_sq = mean_tst * mean_tst
+    mean_cross = mean_ref * mean_tst
+
+    # local (co)variances: blurred product minus product of the means
+    var_ref = blur(ref * ref) - mean_ref_sq
+    var_tst = blur(tst * tst) - mean_tst_sq
+    covar = blur(ref * tst) - mean_cross
+
+    # numerator = (2*mu1*mu2 + C1) .* (2*sigma12 + C2)
+    numerator = (2 * mean_cross + C1) * (2 * covar + C2)
+
+    # denominator = (mu1^2 + mu2^2 + C1) .* (sigma1^2 + sigma2^2 + C2)
+    denominator = (mean_ref_sq + mean_tst_sq + C1) * (var_ref + var_tst + C2)
+
+    # element-wise ratio, then the average of the resulting map
+    ssim_map = cv.divide(numerator, denominator)
+    mssim = cv.mean(ssim_map)
+    return mssim
+# [get-mssim]
+
+
+def main():
+    """Play two videos side by side and print a per-frame quality report.
+
+    PSNR is printed for every frame pair; the SSIM values are appended when the
+    PSNR is non-zero and below the trigger value. Ends with sys.exit().
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-d", "--delay", type=int, default=30, help=" Time delay")
+    parser.add_argument("-v", "--psnrtriggervalue", type=int, default=30, help="PSNR Trigger Value")
+    parser.add_argument("-r", "--ref", type=str, default="Megamind.avi", help="Path to reference video")
+    parser.add_argument("-t", "--undertest", type=str, default="Megamind_bugy.avi",
+                        help="Path to the video to be tested")
+    options = parser.parse_args()
+    ref_path, test_path = options.ref, options.undertest
+    key_delay, psnr_threshold = options.delay, options.psnrtriggervalue
+
+    # Create both captures first, then bail out on the first one that did not open.
+    ref_capture = cv.VideoCapture(ref_path)
+    test_capture = cv.VideoCapture(test_path)
+    if not ref_capture.isOpened():
+        print("Could not open the reference " + ref_path)
+        sys.exit(-1)
+    if not test_capture.isOpened():
+        print("Could not open case test " + test_path)
+        sys.exit(-1)
+
+    def frame_size(capture):
+        # (width, height) truncated to int, read from the capture properties
+        return (int(capture.get(cv.CAP_PROP_FRAME_WIDTH)), int(capture.get(cv.CAP_PROP_FRAME_HEIGHT)))
+
+    ref_size = frame_size(ref_capture)
+    if ref_size != frame_size(test_capture):
+        print("Inputs have different size!!! Closing.")
+        sys.exit(-1)
+    width, height = ref_size
+
+    ref_window = "Reference"
+    test_window = "Under Test"
+    cv.namedWindow(ref_window, cv.WINDOW_AUTOSIZE)
+    cv.namedWindow(test_window, cv.WINDOW_AUTOSIZE)
+    cv.moveWindow(ref_window, 400, 0)
+    cv.moveWindow(test_window, width, 0)    # x position is the reference frame width
+
+    frame_total = ref_capture.get(cv.CAP_PROP_FRAME_COUNT)
+    print("Reference frame resolution: Width={} Height={} of nr#: {}".format(width, height, frame_total))
+    print("PSNR trigger value {}".format(psnr_threshold))
+
+    frame_index = 0
+    while True:
+        ref_frame = ref_capture.read()[1]
+        test_frame = test_capture.read()[1]
+        if ref_frame is None or test_frame is None:     # either capture yielded no frame
+            print(" < < <  Game over!  > > > ")
+            break
+
+        psnr = getPSNR(ref_frame, test_frame)
+        print("Frame: {}# {}dB".format(frame_index, round(psnr, 3)), end=" ")
+        frame_index += 1
+
+        # SSIM is only computed when the PSNR is non-zero and below the trigger
+        if psnr and psnr < psnr_threshold:
+            ssim = getMSSISM(ref_frame, test_frame)
+            red, green, blue = (round(ssim[c] * 100, 2) for c in (2, 1, 0))
+            print("MSSISM: R {}% G {}% B {}%".format(red, green, blue), end=" ")
+        print()
+
+        cv.imshow(ref_window, ref_frame)
+        cv.imshow(test_window, test_frame)
+        if cv.waitKey(key_delay) == 27:     # key code 27 (ASCII ESC) ends the loop
+            break
+
+    sys.exit(0)
+
+
+if __name__ == "__main__":  # run the comparison only when executed as a script
+    main()