Merge remote-tracking branch 'upstream/3.4' into merge-3.4

pull/16548/head
Alexander Alekhin 5 years ago
commit aa2777ed61
  1. 10
      cmake/OpenCVFindLibsPerf.cmake
  2. 270
      modules/calib3d/src/stereobm.cpp
  3. 7
      modules/core/include/opencv2/core/cvdef.h
  4. 52
      modules/core/include/opencv2/core/hal/intrin_cpp.hpp
  5. 2
      modules/core/include/opencv2/core/utility.hpp
  6. 103
      modules/core/include/opencv2/core/utils/buffer_area.private.hpp
  7. 121
      modules/core/src/buffer_area.cpp
  8. 26
      modules/core/src/copy.cpp
  9. 129
      modules/core/test/test_utils.cpp
  10. 199
      modules/dnn/src/darknet/darknet_io.cpp
  11. 8
      modules/dnn/src/ie_ngraph.hpp
  12. 2
      modules/dnn/src/onnx/onnx_importer.cpp
  13. 2
      modules/dnn/src/tensorflow/tf_importer.cpp
  14. 4
      modules/dnn/test/test_backends.cpp
  15. 9
      modules/dnn/test/test_caffe_importer.cpp
  16. 15
      modules/dnn/test/test_darknet_importer.cpp
  17. 3
      modules/dnn/test/test_misc.cpp
  18. 51
      modules/dnn/test/test_tf_importer.cpp
  19. 19
      modules/dnn/test/test_torch_importer.cpp
  20. 7
      modules/imgcodecs/include/opencv2/imgcodecs.hpp
  21. 19
      modules/imgcodecs/src/loadsave.cpp
  22. 8
      modules/python/src2/cv2.cpp
  23. 6
      modules/videoio/include/opencv2/videoio.hpp
  24. 1746
      modules/videoio/src/cap_msmf.cpp
  25. 10
      modules/videoio/test/test_video_io.cpp
  26. 2
      samples/CMakeLists.txt

@ -25,6 +25,16 @@ if(WITH_IPP)
elseif(ANDROID AND NOT OPENCV_SKIP_ANDROID_IPP_FIX_2)
set(CMAKE_SHARED_LINKER_FLAGS "-Wl,-Bsymbolic ${CMAKE_SHARED_LINKER_FLAGS}")
endif()
if(OPENCV_FORCE_IPP_EXCLUDE_LIBS
OR (HAVE_IPP_ICV
AND UNIX AND NOT ANDROID AND NOT APPLE
AND (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
)
AND NOT OPENCV_SKIP_IPP_EXCLUDE_LIBS
)
set(CMAKE_SHARED_LINKER_FLAGS "-Wl,--exclude-libs,libippicv.a -Wl,--exclude-libs,libippiw.a ${CMAKE_SHARED_LINKER_FLAGS}")
endif()
endif()
endif()

@ -48,8 +48,10 @@
#include "precomp.hpp"
#include <stdio.h>
#include <limits>
#include <vector>
#include "opencl_kernels_calib3d.hpp"
#include "opencv2/core/hal/intrin.hpp"
#include "opencv2/core/utils/buffer_area.private.hpp"
namespace cv
{
@ -85,6 +87,19 @@ struct StereoBMParams
Rect roi1, roi2;
int disp12MaxDiff;
int dispType;
// Whether the 16-bit SIMD accumulation path may be used.
// The SIMD variant stores SAD sums in ushort buffers (see sad_short/hsad_short),
// so it is only enabled for small preFilterCap and SAD window sizes —
// presumably to keep the accumulated sums within 16-bit range (TODO confirm bound).
inline bool useShorts() const
{
return preFilterCap <= 31 && SADWindowSize <= 21;
}
// Speckle post-filtering is enabled by a non-negative range together with a
// positive window size; either parameter outside that range disables it.
inline bool useFilterSpeckles() const
{
return speckleRange >= 0 && speckleWindowSize > 0;
}
// True when the normalized-response prefilter was requested
// (the alternative branch in callers runs the x-Sobel prefilter).
inline bool useNormPrefilter() const
{
return preFilterType == StereoBM::PREFILTER_NORMALIZED_RESPONSE;
}
};
#ifdef HAVE_OPENCL
@ -110,10 +125,10 @@ static bool ocl_prefilter_norm(InputArray _input, OutputArray _output, int winsi
}
#endif
static void prefilterNorm( const Mat& src, Mat& dst, int winsize, int ftzero, uchar* buf )
static void prefilterNorm( const Mat& src, Mat& dst, int winsize, int ftzero, int *buf )
{
int x, y, wsz2 = winsize/2;
int* vsum = (int*)alignPtr(buf + (wsz2 + 1)*sizeof(vsum[0]), 32);
int* vsum = buf + (wsz2 + 1);
int scale_g = winsize*winsize/8, scale_s = (1024 + scale_g)/(scale_g*2);
const int OFS = 256*5, TABSZ = OFS*2 + 256;
uchar tab[TABSZ];
@ -309,13 +324,77 @@ inline int dispDescale(int v1, int v2, int d)
return (int)(v1*256 + (d != 0 ? v2*256/d : 0)); // no need to add 127, this will be converted to float
}
// Pre-allocated working buffers for the block-matching kernels.
// All per-stripe scratch arrays and the two prefilter row buffers are carved
// out of a single utils::BufferArea allocation committed in the constructor,
// so no allocation happens inside the parallel loops.
class BufferBM
{
    static const int TABSZ = 256;
public:
    std::vector<int*> sad;          // per-stripe SAD accumulator (int path)
    std::vector<int*> hsad;         // per-stripe horizontal SAD sums (int path)
    std::vector<int*> htext;        // per-stripe texture sums
    std::vector<uchar*> cbuf0;      // per-stripe cost buffer
    std::vector<ushort*> sad_short; // SIMD (16-bit) variants of sad/hsad
    std::vector<ushort*> hsad_short;
    int *prefilter[2];              // one row buffer per image (left/right) for prefilterNorm
    uchar tab[TABSZ];               // |x - preFilterCap| clipping table shared by all stripes
private:
    utils::BufferArea area;
public:
    // nstripes — number of parallel stripes (one buffer set each);
    // width/height — image dimensions; params — algorithm configuration.
    BufferBM(size_t nstripes, size_t width, size_t height, const StereoBMParams& params)
        : sad(nstripes, NULL),
          hsad(nstripes, NULL),
          htext(nstripes, NULL),
          cbuf0(nstripes, NULL),
          sad_short(nstripes, NULL),
          hsad_short(nstripes, NULL),
          prefilter() // BUGFIX: value-initialize to NULL — BufferArea::allocate asserts the pointer is NULL
    {
        const int wsz = params.SADWindowSize;
        const int ndisp = params.numDisparities;
        const int ftzero = params.preFilterCap;
        for (size_t i = 0; i < nstripes; ++i)
        {
            // 1D: [1][ ndisp ][1]
#if CV_SIMD
            if (params.useShorts())
                area.allocate(sad_short[i], ndisp + 2);
            else
#endif
                area.allocate(sad[i], ndisp + 2);

            // 2D: [ wsz/2 + 1 ][ height ][ wsz/2 + 1 ] * [ ndisp ]
#if CV_SIMD
            if (params.useShorts())
                area.allocate(hsad_short[i], (height + wsz + 2) * ndisp);
            else
#endif
                area.allocate(hsad[i], (height + wsz + 2) * ndisp);

            // 1D: [ wsz/2 + 1 ][ height ][ wsz/2 + 1 ]
            area.allocate(htext[i], (height + wsz + 2));

            // 3D: [ wsz/2 + 1 ][ height ][ wsz/2 + 1 ] * [ ndisp ] * [ wsz/2 + 1 ][ wsz/2 + 1 ]
            area.allocate(cbuf0[i], ((height + wsz + 2) * ndisp * (wsz + 2) + 256));
        }
        if (params.useNormPrefilter())
        {
            // BUGFIX: allocate a distinct buffer per image — the original loop
            // allocated prefilter[0] twice and left prefilter[1] uninitialized,
            // which PrefilterInvoker then dereferences for the right image.
            for (size_t i = 0; i < 2; ++i)
                area.allocate(prefilter[i], width + params.preFilterSize + 2);
        }
        area.commit();

        // static table
        for (int x = 0; x < TABSZ; x++)
            tab[x] = (uchar)std::abs(x - ftzero);
    }
};
#if CV_SIMD
template <typename dType>
static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
Mat& disp, Mat& cost, StereoBMParams& state,
uchar* buf, int _dy0, int _dy1 )
Mat& disp, Mat& cost, const StereoBMParams& state,
int _dy0, int _dy1, const BufferBM & bufX, size_t bufNum )
{
const int ALIGN = CV_SIMD_WIDTH;
int x, y, d;
int wsz = state.SADWindowSize, wsz2 = wsz/2;
int dy0 = MIN(_dy0, wsz2+1), dy1 = MIN(_dy1, wsz2+1);
@ -325,15 +404,13 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
int rofs = -MIN(ndisp - 1 + mindisp, 0);
int width = left.cols, height = left.rows;
int width1 = width - rofs - ndisp + 1;
int ftzero = state.preFilterCap;
int textureThreshold = state.textureThreshold;
int uniquenessRatio = state.uniquenessRatio;
const int disp_shift = dispShiftTemplate<dType>::value;
dType FILTERED = (dType)((mindisp - 1) << disp_shift);
ushort *sad, *hsad0, *hsad, *hsad_sub;
int *htext;
uchar *cbuf0, *cbuf;
ushort *hsad, *hsad_sub;
uchar *cbuf;
const uchar* lptr0 = left.ptr() + lofs;
const uchar* rptr0 = right.ptr() + rofs;
const uchar *lptr, *lptr_sub, *rptr;
@ -343,23 +420,20 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
int cstep = (height + dy0 + dy1)*ndisp;
short costbuf = 0;
int coststep = cost.data ? (int)(cost.step/sizeof(costbuf)) : 0;
const int TABSZ = 256;
uchar tab[TABSZ];
const uchar * tab = bufX.tab;
short v_seq[v_int16::nlanes];
for (short i = 0; i < v_int16::nlanes; ++i)
v_seq[i] = i;
sad = (ushort*)alignPtr(buf + sizeof(sad[0]), ALIGN);
hsad0 = (ushort*)alignPtr(sad + ndisp + 1 + dy0*ndisp, ALIGN);
htext = (int*)alignPtr((int*)(hsad0 + (height+dy1)*ndisp) + wsz2 + 2, ALIGN);
cbuf0 = (uchar*)alignPtr((uchar*)(htext + height + wsz2 + 2) + dy0*ndisp, ALIGN);
for( x = 0; x < TABSZ; x++ )
tab[x] = (uchar)std::abs(x - ftzero);
ushort *sad = bufX.sad_short[bufNum] + 1;
ushort *hsad0 = bufX.hsad_short[bufNum] + (wsz2 + 1) * ndisp;
int *htext = bufX.htext[bufNum] + (wsz2 + 1);
uchar *cbuf0 = bufX.cbuf0[bufNum] + (wsz2 + 1) * ndisp;
// initialize buffers
memset( hsad0 - dy0*ndisp, 0, (height + dy0 + dy1)*ndisp*sizeof(hsad0[0]) );
memset( htext - wsz2 - 1, 0, (height + wsz + 1)*sizeof(htext[0]) );
memset(sad - 1, 0, (ndisp + 2) * sizeof(sad[0]));
memset(hsad0 - dy0 * ndisp, 0, (height + wsz + 2) * ndisp * sizeof(hsad[0]));
memset(htext - dy0, 0, (height + wsz + 2) * sizeof(htext[0]));
for( x = -wsz2-1; x < wsz2; x++ )
{
@ -594,10 +668,9 @@ template <typename mType>
static void
findStereoCorrespondenceBM( const Mat& left, const Mat& right,
Mat& disp, Mat& cost, const StereoBMParams& state,
uchar* buf, int _dy0, int _dy1 )
int _dy0, int _dy1, const BufferBM & bufX, size_t bufNum )
{
const int ALIGN = CV_SIMD_WIDTH;
int x, y, d;
int wsz = state.SADWindowSize, wsz2 = wsz/2;
int dy0 = MIN(_dy0, wsz2+1), dy1 = MIN(_dy1, wsz2+1);
@ -607,14 +680,13 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
int rofs = -MIN(ndisp - 1 + mindisp, 0);
int width = left.cols, height = left.rows;
int width1 = width - rofs - ndisp + 1;
int ftzero = state.preFilterCap;
int textureThreshold = state.textureThreshold;
int uniquenessRatio = state.uniquenessRatio;
const int disp_shift = dispShiftTemplate<mType>::value;
mType FILTERED = (mType)((mindisp - 1) << disp_shift);
int *sad, *hsad0, *hsad, *hsad_sub, *htext;
uchar *cbuf0, *cbuf;
int *hsad, *hsad_sub;
uchar *cbuf;
const uchar* lptr0 = left.ptr() + lofs;
const uchar* rptr0 = right.ptr() + rofs;
const uchar *lptr, *lptr_sub, *rptr;
@ -624,8 +696,7 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
int cstep = (height+dy0+dy1)*ndisp;
int costbuf = 0;
int coststep = cost.data ? (int)(cost.step/sizeof(costbuf)) : 0;
const int TABSZ = 256;
uchar tab[TABSZ];
const uchar * tab = bufX.tab;
#if CV_SIMD
int v_seq[v_int32::nlanes];
@ -634,17 +705,15 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
v_int32 d0_4 = vx_load(v_seq), dd_4 = vx_setall_s32(v_int32::nlanes);
#endif
sad = (int*)alignPtr(buf + sizeof(sad[0]), ALIGN);
hsad0 = (int*)alignPtr(sad + ndisp + 1 + dy0*ndisp, ALIGN);
htext = (int*)alignPtr((int*)(hsad0 + (height+dy1)*ndisp) + wsz2 + 2, ALIGN);
cbuf0 = (uchar*)alignPtr((uchar*)(htext + height + wsz2 + 2) + dy0*ndisp, ALIGN);
for( x = 0; x < TABSZ; x++ )
tab[x] = (uchar)std::abs(x - ftzero);
int *sad = bufX.sad[bufNum] + 1;
int *hsad0 = bufX.hsad[bufNum] + (wsz2 + 1) * ndisp;
int *htext = bufX.htext[bufNum] + (wsz2 + 1);
uchar *cbuf0 = bufX.cbuf0[bufNum] + (wsz2 + 1) * ndisp;
// initialize buffers
memset( hsad0 - dy0*ndisp, 0, (height + dy0 + dy1)*ndisp*sizeof(hsad0[0]) );
memset( htext - wsz2 - 1, 0, (height + wsz + 1)*sizeof(htext[0]) );
memset(sad - 1, 0, (ndisp + 2) * sizeof(sad[0]));
memset(hsad0 - dy0 * ndisp, 0, (height + wsz + 2) * ndisp * sizeof(hsad[0]));
memset(htext - dy0, 0, (height + wsz + 2) * sizeof(htext[0]));
for( x = -wsz2-1; x < wsz2; x++ )
{
@ -890,7 +959,7 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
#ifdef HAVE_OPENCL
static bool ocl_prefiltering(InputArray left0, InputArray right0, OutputArray left, OutputArray right, StereoBMParams* state)
{
if( state->preFilterType == StereoBM::PREFILTER_NORMALIZED_RESPONSE )
if (state->useNormPrefilter())
{
if(!ocl_prefilter_norm( left0, left, state->preFilterSize, state->preFilterCap))
return false;
@ -911,29 +980,28 @@ static bool ocl_prefiltering(InputArray left0, InputArray right0, OutputArray le
struct PrefilterInvoker : public ParallelLoopBody
{
PrefilterInvoker(const Mat& left0, const Mat& right0, Mat& left, Mat& right,
uchar* buf0, uchar* buf1, StereoBMParams* _state)
const BufferBM &bufX_, const StereoBMParams &state_)
: bufX(bufX_), state(state_)
{
imgs0[0] = &left0; imgs0[1] = &right0;
imgs[0] = &left; imgs[1] = &right;
buf[0] = buf0; buf[1] = buf1;
state = _state;
}
void operator()(const Range& range) const CV_OVERRIDE
{
for( int i = range.start; i < range.end; i++ )
{
if( state->preFilterType == StereoBM::PREFILTER_NORMALIZED_RESPONSE )
prefilterNorm( *imgs0[i], *imgs[i], state->preFilterSize, state->preFilterCap, buf[i] );
if (state.useNormPrefilter())
prefilterNorm( *imgs0[i], *imgs[i], state.preFilterSize, state.preFilterCap, bufX.prefilter[i] );
else
prefilterXSobel( *imgs0[i], *imgs[i], state->preFilterCap );
prefilterXSobel( *imgs0[i], *imgs[i], state.preFilterCap );
}
}
const Mat* imgs0[2];
Mat* imgs[2];
uchar* buf[2];
StereoBMParams* state;
const BufferBM &bufX;
const StereoBMParams &state;
};
#ifdef HAVE_OPENCL
@ -986,18 +1054,17 @@ static bool ocl_stereobm( InputArray _left, InputArray _right,
struct FindStereoCorrespInvoker : public ParallelLoopBody
{
FindStereoCorrespInvoker( const Mat& _left, const Mat& _right,
Mat& _disp, StereoBMParams* _state,
int _nstripes, size_t _stripeBufSize,
bool _useShorts, Rect _validDisparityRect,
Mat& _slidingSumBuf, Mat& _cost )
Mat& _disp, const StereoBMParams &_state,
int _nstripes,
Rect _validDisparityRect,
Mat& _cost, const BufferBM & buf_ )
: state(_state), buf(buf_)
{
CV_Assert( _disp.type() == CV_16S || _disp.type() == CV_32S );
left = &_left; right = &_right;
disp = &_disp; state = _state;
nstripes = _nstripes; stripeBufSize = _stripeBufSize;
useShorts = _useShorts;
disp = &_disp;
nstripes = _nstripes;
validDisparityRect = _validDisparityRect;
slidingSumBuf = &_slidingSumBuf;
cost = &_cost;
}
@ -1006,11 +1073,10 @@ struct FindStereoCorrespInvoker : public ParallelLoopBody
int cols = left->cols, rows = left->rows;
int _row0 = std::min(cvRound(range.start * rows / nstripes), rows);
int _row1 = std::min(cvRound(range.end * rows / nstripes), rows);
uchar *ptr = slidingSumBuf->ptr() + range.start * stripeBufSize;
int dispShift = disp->type() == CV_16S ? DISPARITY_SHIFT_16S :
DISPARITY_SHIFT_32S;
int FILTERED = (state->minDisparity - 1) << dispShift;
int FILTERED = (state.minDisparity - 1) << dispShift;
Rect roi = validDisparityRect & Rect(0, _row0, cols, _row1 - _row0);
if( roi.height == 0 )
@ -1033,27 +1099,27 @@ struct FindStereoCorrespInvoker : public ParallelLoopBody
Mat left_i = left->rowRange(row0, row1);
Mat right_i = right->rowRange(row0, row1);
Mat disp_i = disp->rowRange(row0, row1);
Mat cost_i = state->disp12MaxDiff >= 0 ? cost->rowRange(row0, row1) : Mat();
Mat cost_i = state.disp12MaxDiff >= 0 ? cost->rowRange(row0, row1) : Mat();
#if CV_SIMD
if (useShorts)
if (state.useShorts())
{
if( disp_i.type() == CV_16S)
findStereoCorrespondenceBM_SIMD<short>( left_i, right_i, disp_i, cost_i, *state, ptr, row0, rows - row1 );
findStereoCorrespondenceBM_SIMD<short>( left_i, right_i, disp_i, cost_i, state, row0, rows - row1, buf, range.start );
else
findStereoCorrespondenceBM_SIMD<int>( left_i, right_i, disp_i, cost_i, *state, ptr, row0, rows - row1);
findStereoCorrespondenceBM_SIMD<int>( left_i, right_i, disp_i, cost_i, state, row0, rows - row1, buf, range.start);
}
else
#endif
{
if( disp_i.type() == CV_16S )
findStereoCorrespondenceBM<short>( left_i, right_i, disp_i, cost_i, *state, ptr, row0, rows - row1 );
findStereoCorrespondenceBM<short>( left_i, right_i, disp_i, cost_i, state, row0, rows - row1, buf, range.start );
else
findStereoCorrespondenceBM<int>( left_i, right_i, disp_i, cost_i, *state, ptr, row0, rows - row1 );
findStereoCorrespondenceBM<int>( left_i, right_i, disp_i, cost_i, state, row0, rows - row1, buf, range.start );
}
if( state->disp12MaxDiff >= 0 )
validateDisparity( disp_i, cost_i, state->minDisparity, state->numDisparities, state->disp12MaxDiff );
if( state.disp12MaxDiff >= 0 )
validateDisparity( disp_i, cost_i, state.minDisparity, state.numDisparities, state.disp12MaxDiff );
if( roi.x > 0 )
{
@ -1069,13 +1135,12 @@ struct FindStereoCorrespInvoker : public ParallelLoopBody
protected:
const Mat *left, *right;
Mat* disp, *slidingSumBuf, *cost;
StereoBMParams *state;
Mat* disp, *cost;
const StereoBMParams &state;
int nstripes;
size_t stripeBufSize;
bool useShorts;
Rect validDisparityRect;
const BufferBM & buf;
};
class StereoBMImpl CV_FINAL : public StereoBM
@ -1149,7 +1214,7 @@ public:
disp_shift = DISPARITY_SHIFT_16S;
FILTERED = (params.minDisparity - 1) << disp_shift;
if( params.speckleRange >= 0 && params.speckleWindowSize > 0 )
if (params.useFilterSpeckles())
filterSpeckles(disparr.getMat(), FILTERED, params.speckleWindowSize, params.speckleRange, slidingSumBuf);
if (dtype == CV_32F)
disparr.getUMat().convertTo(disparr, CV_32FC1, 1./(1 << disp_shift), 0);
@ -1192,44 +1257,39 @@ public:
disp = dispbuf;
}
int wsz = params.SADWindowSize;
int bufSize0 = (int)((ndisp + 2)*sizeof(int));
bufSize0 += (int)((height+wsz+2)*ndisp*sizeof(int));
bufSize0 += (int)((height + wsz + 2)*sizeof(int));
bufSize0 += (int)((height+wsz+2)*ndisp*(wsz+2)*sizeof(uchar) + 256);
int bufSize1 = (int)((width + params.preFilterSize + 2) * sizeof(int) + 256);
int bufSize2 = 0;
if( params.speckleRange >= 0 && params.speckleWindowSize > 0 )
bufSize2 = width*height*(sizeof(Point_<short>) + sizeof(int) + sizeof(uchar));
bool useShorts = params.preFilterCap <= 31 && params.SADWindowSize <= 21;
const double SAD_overhead_coeff = 10.0;
double N0 = 8000000 / (useShorts ? 1 : 4); // approx tbb's min number instructions reasonable for one thread
double maxStripeSize = std::min(std::max(N0 / (width * ndisp), (wsz-1) * SAD_overhead_coeff), (double)height);
int nstripes = cvCeil(height / maxStripeSize);
int bufSize = std::max(bufSize0 * nstripes, std::max(bufSize1 * 2, bufSize2));
if( slidingSumBuf.cols < bufSize )
slidingSumBuf.create( 1, bufSize, CV_8U );
uchar *_buf = slidingSumBuf.ptr();
parallel_for_(Range(0, 2), PrefilterInvoker(left0, right0, left, right, _buf, _buf + bufSize1, &params), 1);
Rect validDisparityRect(0, 0, width, height), R1 = params.roi1, R2 = params.roi2;
validDisparityRect = getValidDisparityROI(!R1.empty() ? R1 : validDisparityRect,
!R2.empty() ? R2 : validDisparityRect,
params.minDisparity, params.numDisparities,
params.SADWindowSize);
parallel_for_(Range(0, nstripes),
FindStereoCorrespInvoker(left, right, disp, &params, nstripes,
bufSize0, useShorts, validDisparityRect,
slidingSumBuf, cost));
{
const double SAD_overhead_coeff = 10.0;
const double N0 = 8000000 / (params.useShorts() ? 1 : 4); // approx tbb's min number instructions reasonable for one thread
const double maxStripeSize = std::min(
std::max(
N0 / (width * ndisp),
(params.SADWindowSize-1) * SAD_overhead_coeff
),
(double)height
);
const int nstripes = cvCeil(height / maxStripeSize);
BufferBM localBuf(nstripes, width, height, params);
// Prefiltering
parallel_for_(Range(0, 2), PrefilterInvoker(left0, right0, left, right, localBuf, params), 1);
Rect validDisparityRect(0, 0, width, height), R1 = params.roi1, R2 = params.roi2;
validDisparityRect = getValidDisparityROI(!R1.empty() ? R1 : validDisparityRect,
!R2.empty() ? R2 : validDisparityRect,
params.minDisparity, params.numDisparities,
params.SADWindowSize);
FindStereoCorrespInvoker invoker(left, right, disp, params, nstripes, validDisparityRect, cost, localBuf);
parallel_for_(Range(0, nstripes), invoker);
if (params.useFilterSpeckles())
{
slidingSumBuf.create( 1, width * height * (sizeof(Point_<short>) + sizeof(int) + sizeof(uchar)), CV_8U );
filterSpeckles(disp, FILTERED, params.speckleWindowSize, params.speckleRange, slidingSumBuf);
}
if( params.speckleRange >= 0 && params.speckleWindowSize > 0 )
filterSpeckles(disp, FILTERED, params.speckleWindowSize, params.speckleRange, slidingSumBuf);
}
if (disp0.data != disp.data)
disp.convertTo(disp0, disp0.type(), 1./(1 << disp_shift), 0);

@ -326,6 +326,13 @@ enum CpuFeatures {
#include "cv_cpu_dispatch.h"
#if !defined(CV_STRONG_ALIGNMENT) && defined(__arm__) && !(defined(__aarch64__) || defined(_M_ARM64))
// int*, int64* should be properly aligned pointers on ARMv7
#define CV_STRONG_ALIGNMENT 1
#endif
#if !defined(CV_STRONG_ALIGNMENT)
#define CV_STRONG_ALIGNMENT 0
#endif
/* fundamental constants */
#define CV_PI 3.1415926535897932384626433832795

@ -1458,16 +1458,23 @@ template<typename _Tp, int n> inline void v_zip( const v_reg<_Tp, n>& a0, const
@return register object
@note Returned type will be detected from passed pointer type, for example uchar ==> cv::v_uint8x16, int ==> cv::v_int32x4, etc.
@note Alignment requirement:
if CV_STRONG_ALIGNMENT=1 then passed pointer must be aligned (`sizeof(lane type)` should be enough).
Do not cast pointer types without runtime check for pointer alignment (like `uchar*` => `int*`).
*/
template<typename _Tp>
inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load(const _Tp* ptr)
{
// On targets that fault on misaligned element access (CV_STRONG_ALIGNMENT=1,
// e.g. 32-bit ARM) the pointer must be aligned to the lane type.
#if CV_STRONG_ALIGNMENT
CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
return v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>(ptr);
}
/** @brief Load register contents from memory (aligned)
similar to cv::v_load, but source memory block should be aligned (to 16-byte boundary)
similar to cv::v_load, but source memory block should be aligned (to 16-byte boundary in case of SIMD128, 32-byte - SIMD256, etc)
*/
template<typename _Tp>
inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_aligned(const _Tp* ptr)
@ -1488,6 +1495,9 @@ v_int32x4 r = v_load_low(lo);
template<typename _Tp>
inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_low(const _Tp* ptr)
{
#if CV_STRONG_ALIGNMENT
CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
for( int i = 0; i < c.nlanes/2; i++ )
{
@ -1509,6 +1519,10 @@ v_int32x4 r = v_load_halves(lo, hi);
template<typename _Tp>
inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_halves(const _Tp* loptr, const _Tp* hiptr)
{
#if CV_STRONG_ALIGNMENT
CV_Assert(isAligned<sizeof(_Tp)>(loptr));
CV_Assert(isAligned<sizeof(_Tp)>(hiptr));
#endif
v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
for( int i = 0; i < c.nlanes/2; i++ )
{
@ -1531,6 +1545,9 @@ template<typename _Tp>
inline v_reg<typename V_TypeTraits<_Tp>::w_type, V_TypeTraits<_Tp>::nlanes128 / 2>
v_load_expand(const _Tp* ptr)
{
#if CV_STRONG_ALIGNMENT
CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
typedef typename V_TypeTraits<_Tp>::w_type w_type;
v_reg<w_type, V_TypeTraits<w_type>::nlanes128> c;
for( int i = 0; i < c.nlanes; i++ )
@ -1552,6 +1569,9 @@ template<typename _Tp>
inline v_reg<typename V_TypeTraits<_Tp>::q_type, V_TypeTraits<_Tp>::nlanes128 / 4>
v_load_expand_q(const _Tp* ptr)
{
#if CV_STRONG_ALIGNMENT
CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
typedef typename V_TypeTraits<_Tp>::q_type q_type;
v_reg<q_type, V_TypeTraits<q_type>::nlanes128> c;
for( int i = 0; i < c.nlanes; i++ )
@ -1572,6 +1592,9 @@ For all types except 64-bit. */
template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
v_reg<_Tp, n>& b)
{
#if CV_STRONG_ALIGNMENT
CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
int i, i2;
for( i = i2 = 0; i < n; i++, i2 += 2 )
{
@ -1591,6 +1614,9 @@ For all types except 64-bit. */
template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
v_reg<_Tp, n>& b, v_reg<_Tp, n>& c)
{
#if CV_STRONG_ALIGNMENT
CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
int i, i3;
for( i = i3 = 0; i < n; i++, i3 += 3 )
{
@ -1613,6 +1639,9 @@ inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
v_reg<_Tp, n>& b, v_reg<_Tp, n>& c,
v_reg<_Tp, n>& d)
{
#if CV_STRONG_ALIGNMENT
CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
int i, i4;
for( i = i4 = 0; i < n; i++, i4 += 4 )
{
@ -1636,6 +1665,9 @@ inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
const v_reg<_Tp, n>& b,
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
{
#if CV_STRONG_ALIGNMENT
CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
int i, i2;
for( i = i2 = 0; i < n; i++, i2 += 2 )
{
@ -1657,6 +1689,9 @@ inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
{
#if CV_STRONG_ALIGNMENT
CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
int i, i3;
for( i = i3 = 0; i < n; i++, i3 += 3 )
{
@ -1679,6 +1714,9 @@ template<typename _Tp, int n> inline void v_store_interleave( _Tp* ptr, const v_
const v_reg<_Tp, n>& d,
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
{
#if CV_STRONG_ALIGNMENT
CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
int i, i4;
for( i = i4 = 0; i < n; i++, i4 += 4 )
{
@ -1700,6 +1738,9 @@ Pointer can be unaligned. */
template<typename _Tp, int n>
inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a)
{
// Scalar reference implementation: copy each lane to memory. On targets with
// strict alignment requirements (CV_STRONG_ALIGNMENT=1) the destination must
// be aligned to the lane type.
#if CV_STRONG_ALIGNMENT
CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
for( int i = 0; i < n; i++ )
ptr[i] = a.s[i];
}
@ -1707,6 +1748,9 @@ inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a)
template<typename _Tp, int n>
inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/)
{
// StoreMode is ignored by the scalar fallback; delegates to plain v_store.
#if CV_STRONG_ALIGNMENT
CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
v_store(ptr, a);
}
@ -1720,6 +1764,9 @@ Scheme:
template<typename _Tp, int n>
inline void v_store_low(_Tp* ptr, const v_reg<_Tp, n>& a)
{
// Store only the lower half of the register (lanes [0, n/2)).
#if CV_STRONG_ALIGNMENT
CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
for( int i = 0; i < (n/2); i++ )
ptr[i] = a.s[i];
}
@ -1734,6 +1781,9 @@ Scheme:
template<typename _Tp, int n>
inline void v_store_high(_Tp* ptr, const v_reg<_Tp, n>& a)
{
// Store only the upper half of the register (lanes [n/2, n)).
#if CV_STRONG_ALIGNMENT
CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
for( int i = 0; i < (n/2); i++ )
ptr[i] = a.s[i+(n/2)];
}

@ -449,7 +449,7 @@ Returned value is a string containing space separated list of CPU features with
Example: `SSE SSE2 SSE3 *SSE4.1 *SSE4.2 *FP16 *AVX *AVX2 *AVX512-SKX?`
*/
CV_EXPORTS std::string getCPUFeaturesLine();
CV_EXPORTS_W std::string getCPUFeaturesLine();
/** @brief Returns the number of logical CPUs available for the process.
*/

@ -0,0 +1,103 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_UTILS_BUFFER_AREA_HPP
#define OPENCV_UTILS_BUFFER_AREA_HPP
#include <opencv2/core/base.hpp>
#include <opencv2/core/private.hpp>
#include <opencv2/core/utility.hpp>
#include <vector>
namespace cv { namespace utils {
//! @addtogroup core_utils
//! @{
/** @brief Manages a memory block shared by multiple buffers.
This class allows allocating one large memory block and splitting it into several smaller
non-overlapping buffers. In safe mode each buffer allocation will be performed independently;
this mode allows dynamic memory access instrumentation using valgrind or memory sanitizer.
Safe mode can be explicitly switched ON in the constructor. It will also be enabled when compiling with
memory sanitizer support or at runtime with the environment variable `OPENCV_BUFFER_AREA_ALWAYS_SAFE`.
Example of usage:
@code
int * buf1 = 0;
double * buf2 = 0;
cv::utils::BufferArea area;
area.allocate(buf1, 200); // buf1 = new int[200];
area.allocate(buf2, 1000, 64); // buf2 = new double[1000]; - aligned by 64
area.commit();
@endcode
@note This class is considered private and should be used only in OpenCV itself. API can be changed.
*/
class CV_EXPORTS BufferArea
{
public:
/** @brief Class constructor.
@param safe Enable _safe_ operation mode, each allocation will be performed independently.
*/
BufferArea(bool safe = false);
/** @brief Class destructor
All allocated memory will be freed. Each bound pointer will be reset to NULL.
*/
~BufferArea();
/** @brief Bind a pointer to local area.
BufferArea will store reference to the pointer and allocation parameters effectively owning the
pointer and allocated memory. This operation has the same parameters and does the same job
as the operator `new`, except allocation can be performed later during the BufferArea::commit call.
@param ptr Reference to a pointer of type T. Must be NULL
@param count Count of objects to be allocated, it has the same meaning as in the operator `new`.
@param alignment Alignment of allocated memory. same meaning as in the operator `new` (C++17).
Must be divisible by sizeof(T). Must be power of two.
@note In safe mode allocation will be performed immediately.
*/
template <typename T>
void allocate(T*&ptr, size_t count, ushort alignment = sizeof(T))
{
CV_Assert(ptr == NULL);
CV_Assert(count > 0);
CV_Assert(alignment > 0);
CV_Assert(alignment % sizeof(T) == 0);
CV_Assert((alignment & (alignment - 1)) == 0); // power-of-two check
allocate_((void**)(&ptr), static_cast<ushort>(sizeof(T)), count, alignment);
}
/** @brief Allocate memory and initialize all bound pointers
Each pointer bound to the area with the BufferArea::allocate will be initialized and will be set
to point to a memory block with requested size and alignment.
@note Does nothing in safe mode as all allocations will be performed by BufferArea::allocate
*/
void commit();
private:
// Non-copyable: the object owns raw memory and raw pointer bindings.
BufferArea(const BufferArea &); // = delete
BufferArea &operator=(const BufferArea &); // = delete
void allocate_(void **ptr, ushort type_size, size_t count, ushort alignment);
private:
class Block;
std::vector<Block> blocks; // registered allocation requests
void * oneBuf;             // single backing allocation (fast mode only)
size_t totalSize;          // total bytes required by all blocks (fast mode only)
const bool safe;           // true: every block allocated independently
};
//! @}
}} // cv::utils::
#endif

@ -0,0 +1,121 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "opencv2/core/utils/buffer_area.private.hpp"
#include "opencv2/core/utils/configuration.private.hpp"
#ifdef OPENCV_ENABLE_MEMORY_SANITIZER
#define BUFFER_AREA_DEFAULT_MODE true
#else
#define BUFFER_AREA_DEFAULT_MODE false
#endif
static bool CV_BUFFER_AREA_OVERRIDE_SAFE_MODE =
cv::utils::getConfigurationParameterBool("OPENCV_BUFFER_AREA_ALWAYS_SAFE", BUFFER_AREA_DEFAULT_MODE);
namespace cv { namespace utils {
//==================================================================================================
// One registered allocation request: the bound pointer, its element size,
// element count and required alignment. The pointer-carving arithmetic below
// is order-sensitive; documented in place rather than restructured.
class BufferArea::Block
{
private:
// Extra elements reserved as slack so the start of the block can be shifted
// forward to the requested alignment (alignment is a multiple of type_size,
// so (alignment / type_size - 1) elements always suffice).
inline size_t reserve_count() const
{
return alignment / type_size - 1;
}
public:
// The bound pointer must be non-null and point to a NULL value — the block
// takes responsibility for setting and later resetting it.
Block(void **ptr_, ushort type_size_, size_t count_, ushort alignment_)
: ptr(ptr_), raw_mem(0), count(count_), type_size(type_size_), alignment(alignment_)
{
CV_Assert(ptr && *ptr == NULL);
}
// Reset the bound pointer and free per-block memory (raw_mem is only set in
// safe mode; in fast mode the shared buffer is freed by BufferArea itself).
void cleanup() const
{
CV_Assert(ptr && *ptr);
*ptr = 0;
if (raw_mem)
fastFree(raw_mem);
}
// Bytes this block needs inside the shared buffer, including alignment slack.
size_t getByteCount() const
{
return type_size * (count + reserve_count());
}
// Safe mode: allocate this block independently and align within it if needed.
void real_allocate()
{
CV_Assert(ptr && *ptr == NULL);
const size_t allocated_count = count + reserve_count();
raw_mem = fastMalloc(type_size * allocated_count);
if (alignment != type_size)
{
*ptr = alignPtr(raw_mem, alignment);
CV_Assert(reinterpret_cast<size_t>(*ptr) % alignment == 0);
CV_Assert(static_cast<uchar*>(*ptr) + type_size * count <= static_cast<uchar*>(raw_mem) + type_size * allocated_count);
}
else
{
*ptr = raw_mem;
}
}
// Fast mode: carve this block out of the shared buffer starting at `buf`;
// returns the first byte past this block for the next carve.
void * fast_allocate(void * buf) const
{
CV_Assert(ptr && *ptr == NULL);
buf = alignPtr(buf, alignment);
CV_Assert(reinterpret_cast<size_t>(buf) % alignment == 0);
*ptr = buf;
return static_cast<void*>(static_cast<uchar*>(*ptr) + type_size * count);
}
private:
void **ptr;       // bound user pointer (owned binding, not owned memory)
void * raw_mem;   // per-block allocation (safe mode only), else NULL
size_t count;     // number of elements requested
ushort type_size; // sizeof(element)
ushort alignment; // required alignment in bytes
};
//==================================================================================================
// Safe mode is forced on when requested by the caller OR globally via the
// OPENCV_BUFFER_AREA_ALWAYS_SAFE environment variable / memory-sanitizer build.
BufferArea::BufferArea(bool safe_) :
oneBuf(0),
totalSize(0),
safe(safe_ || CV_BUFFER_AREA_OVERRIDE_SAFE_MODE)
{
}
BufferArea::~BufferArea()
{
    // Reset every bound pointer (freeing per-block memory in safe mode),
    // then release the shared backing buffer if one was committed.
    for (size_t idx = 0; idx < blocks.size(); ++idx)
        blocks[idx].cleanup();
    if (oneBuf)
        fastFree(oneBuf);
}
void BufferArea::allocate_(void **ptr, ushort type_size, size_t count, ushort alignment)
{
blocks.push_back(Block(ptr, type_size, count, alignment));
if (safe)
blocks.back().real_allocate();
else
totalSize += blocks.back().getByteCount();
}
// Perform the single shared allocation and carve one aligned sub-buffer per
// registered block. No-op in safe mode (blocks were allocated individually).
void BufferArea::commit()
{
    if (safe)
        return;
    CV_Assert(totalSize > 0);
    CV_Assert(oneBuf == NULL);
    CV_Assert(!blocks.empty());
    oneBuf = fastMalloc(totalSize);
    void * ptr = oneBuf;
    for (size_t idx = 0; idx < blocks.size(); ++idx)
        ptr = blocks[idx].fast_allocate(ptr); // returns start of the next block
}
//==================================================================================================
}} // cv::utils::

@ -563,12 +563,6 @@ Mat& Mat::setTo(InputArray _value, InputArray _mask)
return *this;
}
#if CV_NEON && !defined(__aarch64__)
#define CV_CHECK_ALIGNMENT 1
#else
#define CV_CHECK_ALIGNMENT 0
#endif
#if CV_SIMD128
template<typename V> CV_ALWAYS_INLINE void flipHoriz_single( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
{
@ -578,7 +572,7 @@ template<typename V> CV_ALWAYS_INLINE void flipHoriz_single( const uchar* src, s
int width_1 = width & -v_uint8x16::nlanes;
int i, j;
#if CV_CHECK_ALIGNMENT
#if CV_STRONG_ALIGNMENT
CV_Assert(isAligned<sizeof(T)>(src, dst));
#endif
@ -630,7 +624,7 @@ template<typename T1, typename T2> CV_ALWAYS_INLINE void flipHoriz_double( const
int end = (int)(size.width*esz);
int width = (end + 1)/2;
#if CV_CHECK_ALIGNMENT
#if CV_STRONG_ALIGNMENT
CV_Assert(isAligned<sizeof(T1)>(src, dst));
CV_Assert(isAligned<sizeof(T2)>(src, dst));
#endif
@ -659,7 +653,7 @@ static void
flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
{
#if CV_SIMD
#if CV_CHECK_ALIGNMENT
#if CV_STRONG_ALIGNMENT
size_t alignmentMark = ((size_t)src)|((size_t)dst)|sstep|dstep;
#endif
if (esz == 2 * v_uint8x16::nlanes)
@ -712,7 +706,7 @@ flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size,
}
}
else if (esz == 8
#if CV_CHECK_ALIGNMENT
#if CV_STRONG_ALIGNMENT
&& isAligned<sizeof(uint64)>(alignmentMark)
#endif
)
@ -720,7 +714,7 @@ flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size,
flipHoriz_single<v_uint64x2>(src, sstep, dst, dstep, size, esz);
}
else if (esz == 4
#if CV_CHECK_ALIGNMENT
#if CV_STRONG_ALIGNMENT
&& isAligned<sizeof(unsigned)>(alignmentMark)
#endif
)
@ -728,7 +722,7 @@ flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size,
flipHoriz_single<v_uint32x4>(src, sstep, dst, dstep, size, esz);
}
else if (esz == 2
#if CV_CHECK_ALIGNMENT
#if CV_STRONG_ALIGNMENT
&& isAligned<sizeof(ushort)>(alignmentMark)
#endif
)
@ -740,7 +734,7 @@ flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size,
flipHoriz_single<v_uint8x16>(src, sstep, dst, dstep, size, esz);
}
else if (esz == 24
#if CV_CHECK_ALIGNMENT
#if CV_STRONG_ALIGNMENT
&& isAligned<sizeof(uint64_t)>(alignmentMark)
#endif
)
@ -766,7 +760,7 @@ flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size,
}
}
}
#if !CV_CHECK_ALIGNMENT
#if !CV_STRONG_ALIGNMENT
else if (esz == 12)
{
flipHoriz_double<uint64_t,uint>(src, sstep, dst, dstep, size, esz);
@ -815,7 +809,7 @@ flipVert( const uchar* src0, size_t sstep, uchar* dst0, size_t dstep, Size size,
{
int i = 0;
#if CV_SIMD
#if CV_CHECK_ALIGNMENT
#if CV_STRONG_ALIGNMENT
if (isAligned<sizeof(int)>(src0, src1, dst0, dst1))
#endif
{
@ -827,7 +821,7 @@ flipVert( const uchar* src0, size_t sstep, uchar* dst0, size_t dstep, Size size,
vx_store((int*)(dst1 + i), t0);
}
}
#if CV_CHECK_ALIGNMENT
#if CV_STRONG_ALIGNMENT
else
{
for (; i <= size.width - CV_SIMD_WIDTH; i += CV_SIMD_WIDTH)

@ -3,6 +3,7 @@
// of this distribution and at http://opencv.org/license.html.
#include "test_precomp.hpp"
#include "opencv2/core/utils/logger.hpp"
#include "opencv2/core/utils/buffer_area.private.hpp"
#include "test_utils_tls.impl.hpp"
@ -303,4 +304,132 @@ TEST(Samples, findFile_missing)
cv::utils::logging::setLogLevel(prev);
}
// Check whether the half-open ranges [first, first + first_num) and
// [second, second + second_num) share at least one element.
// Empty or merely adjacent ranges do not overlap.
//
// Fix: the previous implementation only tested whether an endpoint of one
// range fell strictly inside the other, so it missed the case where one
// buffer fully contains the other. The symmetric interval test below
// (start of each range before the end of the other) covers all cases.
template <typename T>
inline bool buffers_overlap(T * first, size_t first_num, T * second, size_t second_num)
{
    if (first_num == 0 || second_num == 0)
        return false; // an empty range overlaps nothing
    return (first < second + second_num) && (second < first + first_num);
}
typedef testing::TestWithParam<bool> BufferArea;
// Smoke test for cv::utils::BufferArea: allocate three buffers of different
// element types (parameterized over safe/fast mode), then verify that
// commit() filled in every pointer with a naturally aligned address, and
// that destroying the area resets the user pointers back to NULL.
TEST_P(BufferArea, basic)
{
    const bool safe = GetParam();
    const size_t SZ = 3;
    int * int_ptr = NULL;
    uchar * uchar_ptr = NULL;
    double * dbl_ptr = NULL;
    {
        cv::utils::BufferArea area(safe);
        area.allocate(int_ptr, SZ);
        area.allocate(uchar_ptr, SZ);
        area.allocate(dbl_ptr, SZ);
        area.commit();
        // commit() must populate every registered pointer
        ASSERT_TRUE(int_ptr != NULL);
        ASSERT_TRUE(uchar_ptr != NULL);
        ASSERT_TRUE(dbl_ptr != NULL);
        // each pointer should be at least naturally aligned for its element type
        EXPECT_EQ((size_t)0, (size_t)int_ptr % sizeof(int));
        EXPECT_EQ((size_t)0, (size_t)dbl_ptr % sizeof(double));
    }
    // after the area goes out of scope the managed pointers must be NULL again
    EXPECT_TRUE(int_ptr == NULL);
    EXPECT_TRUE(uchar_ptr == NULL);
    EXPECT_TRUE(dbl_ptr == NULL);
}
// Allocate several small int buffers, each requesting a larger power-of-two
// alignment than the last, and check that (a) every returned pointer honours
// its requested alignment and (b) neighbouring buffers do not overlap.
TEST_P(BufferArea, align)
{
    const bool safe = GetParam();
    const size_t SZ = 3;
    const size_t CNT = 5;
    typedef int T;
    T * buffers[CNT] = {0};
    {
        cv::utils::BufferArea area(safe);
        // allocate buffers with 3 elements with growing alignment (power of two)
        for (size_t i = 0; i < CNT; ++i)
        {
            const ushort ALIGN = static_cast<ushort>(sizeof(T) << i);
            EXPECT_TRUE(buffers[i] == NULL);
            area.allocate(buffers[i], SZ, ALIGN);
        }
        area.commit();
        for (size_t i = 0; i < CNT; ++i)
        {
            const ushort ALIGN = static_cast<ushort>(sizeof(T) << i);
            EXPECT_TRUE(buffers[i] != NULL);
            // pointer must be a multiple of the alignment it was requested with
            EXPECT_EQ((size_t)0, reinterpret_cast<size_t>(buffers[i]) % ALIGN);
            if (i < CNT - 1)
            {
                SCOPED_TRACE(i);
                // consecutive buffers must occupy disjoint memory ranges
                EXPECT_FALSE(buffers_overlap(buffers[i], SZ, buffers[i + 1], SZ))
                    << "Buffers overlap: "
                    << buffers[i] << " (" << SZ << " elems)"
                    << " and "
                    << buffers[i + 1] << " (" << SZ << " elems)"
                    << " (element size: " << sizeof(T) << ")";
            }
        }
    }
    // destruction of the area must reset all managed pointers to NULL
    for (size_t i = 0; i < CNT; ++i)
    {
        EXPECT_TRUE(buffers[i] == NULL);
    }
}
// Allocate many buffers of growing size (1..100 elements), all with the same
// explicit 64-byte alignment, and verify alignment and pairwise disjointness
// of consecutive buffers.
TEST_P(BufferArea, default_align)
{
    const bool safe = GetParam();
    const size_t CNT = 100;
    const ushort ALIGN = 64;
    typedef int T;
    T * buffers[CNT] = {0};
    {
        cv::utils::BufferArea area(safe);
        // allocate buffers with 1-99 elements with default alignment
        for (size_t i = 0; i < CNT; ++ i)
        {
            EXPECT_TRUE(buffers[i] == NULL);
            area.allocate(buffers[i], i + 1, ALIGN);
        }
        area.commit();
        for (size_t i = 0; i < CNT; ++i)
        {
            EXPECT_TRUE(buffers[i] != NULL);
            // every buffer must start on a 64-byte boundary
            EXPECT_EQ((size_t)0, reinterpret_cast<size_t>(buffers[i]) % ALIGN);
            if (i < CNT - 1)
            {
                SCOPED_TRACE(i);
                // buffer i has i+1 elements, buffer i+1 has i+2 — ranges must not intersect
                EXPECT_FALSE(buffers_overlap(buffers[i], i + 1, buffers[i + 1], i + 2))
                    << "Buffers overlap: "
                    << buffers[i] << " (" << i + 1 << " elems)"
                    << " and "
                    << buffers[i + 1] << " (" << i + 2 << " elems)"
                    << " (element size: " << sizeof(T) << ")";
            }
        }
    }
}
// Negative cases: allocate() must throw on a zero element count, on an
// invalid alignment (zero or not a power of two), and on a pointer that is
// already non-NULL.
TEST_P(BufferArea, bad)
{
    const bool safe = GetParam();
    int * ptr = 0;
    cv::utils::BufferArea area(safe);
    EXPECT_ANY_THROW(area.allocate(ptr, 0)); // bad size
    EXPECT_ANY_THROW(area.allocate(ptr, 1, 0)); // bad alignment
    EXPECT_ANY_THROW(area.allocate(ptr, 1, 3)); // bad alignment
    ptr = (int*)1;
    EXPECT_ANY_THROW(area.allocate(ptr, 1)); // non-zero pointer
}
INSTANTIATE_TEST_CASE_P(/**/, BufferArea, testing::Values(true, false));
}} // namespace

@ -67,6 +67,7 @@
//M*/
#include "../precomp.hpp"
#include <opencv2/dnn/shape_utils.hpp>
#include <iostream>
#include <fstream>
@ -109,6 +110,26 @@ namespace cv {
params.blobs = blobs;
}
void setBatchNorm()
{
cv::dnn::LayerParams bn_param;
bn_param.name = "BatchNorm-name";
bn_param.type = "BatchNorm";
bn_param.set<bool>("has_weight", true);
bn_param.set<bool>("has_bias", true);
bn_param.set<float>("eps", 1E-6); // .000001f in Darknet Yolo
darknet::LayerParameter lp;
std::string layer_name = cv::format("bn_%d", layer_id);
lp.layer_name = layer_name;
lp.layer_type = bn_param.type;
lp.layerParams = bn_param;
lp.bottom_indexes.push_back(last_layer);
last_layer = layer_name;
net->layers.push_back(lp);
}
cv::dnn::LayerParams getParamConvolution(int kernel, int pad,
int stride, int filters_num)
{
@ -149,25 +170,47 @@ namespace cv {
net->layers.push_back(lp);
if (use_batch_normalize)
{
cv::dnn::LayerParams bn_param;
bn_param.name = "BatchNorm-name";
bn_param.type = "BatchNorm";
bn_param.set<bool>("has_weight", true);
bn_param.set<bool>("has_bias", true);
bn_param.set<float>("eps", 1E-6); // .000001f in Darknet Yolo
darknet::LayerParameter lp;
std::string layer_name = cv::format("bn_%d", layer_id);
lp.layer_name = layer_name;
lp.layer_type = bn_param.type;
lp.layerParams = bn_param;
lp.bottom_indexes.push_back(last_layer);
last_layer = layer_name;
net->layers.push_back(lp);
setBatchNorm();
layer_id++;
fused_layer_names.push_back(last_layer);
}
// Build the base parameter set for a fully connected (InnerProduct) layer
// with `output` outputs. The bias term starts disabled; the caller enables
// it when the layer is not followed by batch normalization.
cv::dnn::LayerParams getParamFullyConnected(int output)
{
    cv::dnn::LayerParams fc;
    fc.name = "FullyConnected-name";
    fc.type = "InnerProduct";
    fc.set<int>("num_output", output);
    fc.set<bool>("bias_term", false); // flipped to true when no BatchNorm follows
    return fc;
}
void setFullyConnected(int output, int use_batch_normalize)
{
cv::dnn::LayerParams fullyconnected_param =
getParamFullyConnected(output);
darknet::LayerParameter lp;
std::string layer_name = cv::format("fullyConnected_%d", layer_id);
// use BIAS in any case
if (!use_batch_normalize) {
fullyconnected_param.set<bool>("bias_term", true);
}
lp.layer_name = layer_name;
lp.layer_type = fullyconnected_param.type;
lp.layerParams = fullyconnected_param;
lp.bottom_indexes.push_back(last_layer);
last_layer = layer_name;
net->layers.push_back(lp);
if (use_batch_normalize)
setBatchNorm();
layer_id++;
fused_layer_names.push_back(last_layer);
}
@ -191,18 +234,21 @@ namespace cv {
fused_layer_names.back() = last_layer;
}
void setMaxpool(size_t kernel, size_t pad, size_t stride)
void setMaxpool(int kernel, int pad, int stride)
{
cv::dnn::LayerParams maxpool_param;
maxpool_param.set<cv::String>("pool", "max");
maxpool_param.set<int>("kernel_size", kernel);
maxpool_param.set<int>("pad", pad);
maxpool_param.set<int>("pad_l", floor((float)pad / 2));
maxpool_param.set<int>("pad_r", ceil((float)pad / 2));
maxpool_param.set<int>("pad_t", floor((float)pad / 2));
maxpool_param.set<int>("pad_b", ceil((float)pad / 2));
maxpool_param.set<bool>("ceil_mode", false);
maxpool_param.set<int>("stride", stride);
maxpool_param.set<cv::String>("pad_mode", "SAME");
maxpool_param.name = "Pooling-name";
maxpool_param.type = "Pooling";
darknet::LayerParameter lp;
darknet::LayerParameter lp;
std::string layer_name = cv::format("pool_%d", layer_id);
lp.layer_name = layer_name;
lp.layer_type = maxpool_param.type;
@ -539,7 +585,10 @@ namespace cv {
net->channels = getParam(net_params, "channels", 3);
CV_Assert(net->width > 0 && net->height > 0 && net->channels > 0);
int current_channels = net->channels;
MatShape tensor_shape(3);
tensor_shape[0] = net->channels;
tensor_shape[1] = net->width;
tensor_shape[2] = net->height;
net->out_channels_vec.resize(net->layers_cfg.size());
layers_counter = -1;
@ -568,23 +617,46 @@ namespace cv {
padding = kernel_size / 2;
CV_Assert(kernel_size > 0 && filters > 0);
CV_Assert(current_channels > 0);
CV_Assert(tensor_shape[0] > 0);
setParams.setConvolution(kernel_size, padding, stride, filters, current_channels,
setParams.setConvolution(kernel_size, padding, stride, filters, tensor_shape[0],
batch_normalize);
current_channels = filters;
tensor_shape[0] = filters;
tensor_shape[1] = (tensor_shape[1] - kernel_size + 2 * padding) / stride + 1;
tensor_shape[2] = (tensor_shape[2] - kernel_size + 2 * padding) / stride + 1;
}
else if (layer_type == "connected")
{
int output = getParam<int>(layer_params, "output", 1);
bool batch_normalize = getParam<int>(layer_params, "batch_normalize", 0) == 1;
CV_Assert(output > 0);
setParams.setFullyConnected(output, batch_normalize);
if(layers_counter && tensor_shape[1] > 1)
net->out_channels_vec[layers_counter-1] = total(tensor_shape);
tensor_shape[0] = output;
tensor_shape[1] = 1;
tensor_shape[2] = 1;
}
else if (layer_type == "maxpool")
{
int kernel_size = getParam<int>(layer_params, "size", 2);
int stride = getParam<int>(layer_params, "stride", 2);
int pad = getParam<int>(layer_params, "pad", 0);
setParams.setMaxpool(kernel_size, pad, stride);
int padding = getParam<int>(layer_params, "padding", kernel_size - 1);
setParams.setMaxpool(kernel_size, padding, stride);
tensor_shape[1] = (tensor_shape[1] - kernel_size + padding) / stride + 1;
tensor_shape[2] = (tensor_shape[2] - kernel_size + padding) / stride + 1;
}
else if (layer_type == "avgpool")
{
setParams.setAvgpool();
tensor_shape[1] = 1;
tensor_shape[2] = 1;
}
else if (layer_type == "softmax")
{
@ -599,10 +671,10 @@ namespace cv {
CV_Assert(!bottom_layers.empty());
std::vector<int> layers_vec = getNumbers<int>(bottom_layers);
current_channels = 0;
tensor_shape[0] = 0;
for (size_t k = 0; k < layers_vec.size(); ++k) {
layers_vec[k] = layers_vec[k] >= 0 ? layers_vec[k] : (layers_vec[k] + layers_counter);
current_channels += net->out_channels_vec[layers_vec[k]];
tensor_shape[0] += net->out_channels_vec[layers_vec[k]];
}
if (layers_vec.size() == 1)
@ -610,10 +682,16 @@ namespace cv {
else
setParams.setConcat(layers_vec.size(), layers_vec.data());
}
else if (layer_type == "dropout")
{
setParams.setIdentity(layers_counter-1);
}
else if (layer_type == "reorg")
{
int stride = getParam<int>(layer_params, "stride", 2);
current_channels = current_channels * (stride*stride);
tensor_shape[0] = tensor_shape[0] * (stride * stride);
tensor_shape[1] = tensor_shape[1] / stride;
tensor_shape[2] = tensor_shape[2] / stride;
setParams.setReorg(stride);
}
@ -653,6 +731,8 @@ namespace cv {
{
int scaleFactor = getParam<int>(layer_params, "stride", 1);
setParams.setUpsample(scaleFactor);
tensor_shape[1] = tensor_shape[1] * scaleFactor;
tensor_shape[2] = tensor_shape[2] * scaleFactor;
}
else if (layer_type == "yolo")
{
@ -686,7 +766,7 @@ namespace cv {
else if (activation != "linear")
CV_Error(cv::Error::StsParseError, "Unsupported activation: " + activation);
net->out_channels_vec[layers_counter] = current_channels;
net->out_channels_vec[layers_counter] = tensor_shape[0];
}
return true;
@ -712,7 +792,10 @@ namespace cv {
if(transpose)
CV_Error(cv::Error::StsNotImplemented, "Transpose the weights (except for convolutional) is not implemented");
int current_channels = net->channels;
MatShape tensor_shape(3);
tensor_shape[0] = net->channels;
tensor_shape[1] = net->width;
tensor_shape[2] = net->height;
int cv_layers_counter = -1;
int darknet_layers_counter = -1;
@ -725,19 +808,36 @@ namespace cv {
std::map<std::string, std::string> &layer_params = i->second;
std::string layer_type = layer_params["type"];
if (layer_type == "convolutional")
if (layer_type == "convolutional" || layer_type == "connected")
{
int kernel_size = getParam<int>(layer_params, "size", -1);
int filters = getParam<int>(layer_params, "filters", -1);
bool use_batch_normalize = getParam<int>(layer_params, "batch_normalize", 0) == 1;
size_t weights_size;
int filters;
bool use_batch_normalize;
cv::Mat weightsBlob;
if(layer_type == "convolutional")
{
int kernel_size = getParam<int>(layer_params, "size", -1);
filters = getParam<int>(layer_params, "filters", -1);
use_batch_normalize = getParam<int>(layer_params, "batch_normalize", 0) == 1;
CV_Assert(kernel_size > 0 && filters > 0);
CV_Assert(tensor_shape[0] > 0);
weights_size = filters * tensor_shape[0] * kernel_size * kernel_size;
int sizes_weights[] = { filters, tensor_shape[0], kernel_size, kernel_size };
weightsBlob.create(4, sizes_weights, CV_32F);
}
else
{
filters = getParam<int>(layer_params, "output", 1);
use_batch_normalize = getParam<int>(layer_params, "batch_normalize", 0) == 1;
CV_Assert(kernel_size > 0 && filters > 0);
CV_Assert(current_channels > 0);
CV_Assert(filters>0);
size_t const weights_size = filters * current_channels * kernel_size * kernel_size;
int sizes_weights[] = { filters, current_channels, kernel_size, kernel_size };
cv::Mat weightsBlob;
weightsBlob.create(4, sizes_weights, CV_32F);
weights_size = total(tensor_shape) * filters;
int sizes_weights[] = { filters, total(tensor_shape) };
weightsBlob.create(2, sizes_weights, CV_32F);
}
CV_Assert(weightsBlob.isContinuous());
cv::Mat meanData_mat(1, filters, CV_32F); // mean
@ -753,14 +853,14 @@ namespace cv {
}
ifile.read(reinterpret_cast<char *>(weightsBlob.ptr<float>()), sizeof(float)*weights_size);
// set convolutional weights
std::vector<cv::Mat> conv_blobs;
conv_blobs.push_back(weightsBlob);
// set conv/connected weights
std::vector<cv::Mat> layer_blobs;
layer_blobs.push_back(weightsBlob);
if (!use_batch_normalize) {
// use BIAS in any case
conv_blobs.push_back(biasData_mat);
layer_blobs.push_back(biasData_mat);
}
setParams.setLayerBlobs(cv_layers_counter, conv_blobs);
setParams.setLayerBlobs(cv_layers_counter, layer_blobs);
// set batch normalize (mean, variance, scale, bias)
if (use_batch_normalize) {
@ -782,7 +882,10 @@ namespace cv {
if(activation == "leaky")
++cv_layers_counter; // For ReLU
current_channels = net->out_channels_vec[darknet_layers_counter];
if(!darknet_layers_counter)
tensor_shape.resize(1);
tensor_shape[0] = net->out_channels_vec[darknet_layers_counter];
}
return true;
}

@ -12,7 +12,15 @@
#ifdef HAVE_DNN_NGRAPH
#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable : 4245)
#pragma warning(disable : 4268)
#endif
#include <ngraph/ngraph.hpp>
#ifdef _MSC_VER
#pragma warning(pop)
#endif
#endif // HAVE_DNN_NGRAPH

@ -148,6 +148,7 @@ Mat getMatFromTensor(opencv_onnx::TensorProto& tensor_proto)
else
{
const char* val = tensor_proto.raw_data().c_str();
#if CV_STRONG_ALIGNMENT
// Aligned pointer is required: https://github.com/opencv/opencv/issues/16373
// this doesn't work: typedef int64_t CV_DECL_ALIGNED(1) unaligned_int64_t;
AutoBuffer<int64_t, 16> aligned_val;
@ -158,6 +159,7 @@ Mat getMatFromTensor(opencv_onnx::TensorProto& tensor_proto)
memcpy(aligned_val.data(), val, sz);
val = (const char*)aligned_val.data();
}
#endif
const int64_t* src = reinterpret_cast<const int64_t*>(val);
convertInt64ToInt32(src, dst, blob.total());
}

@ -1468,6 +1468,8 @@ void TFImporter::populateNet(Net dstNet)
int end_mask = getLayerAttr(layer, "end_mask").i();
for (int i = 0; i < num; ++i)
{
if (ends.at<int>(i) < 0)
ends.at<int>(i) -= 1;
if (end_mask & (1 << i))
ends.at<int>(i) = -1;
if (strides.at<int>(i) != 1)

@ -486,7 +486,9 @@ TEST_P(DNNTestNetwork, FastNeuralStyle_eccv16)
if (backend == DNN_BACKEND_HALIDE)
applyTestTag(CV_TEST_TAG_DNN_SKIP_HALIDE);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD);
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_MYRIAD)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
#if defined(INF_ENGINE_RELEASE)
#if INF_ENGINE_VER_MAJOR_LE(2018050000)

@ -330,7 +330,9 @@ TEST_P(Reproducibility_MobileNet_SSD, Accuracy)
}
// There is something wrong with Reshape layer in Myriad plugin.
if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019
|| backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH
)
{
if (targetId == DNN_TARGET_MYRIAD || targetId == DNN_TARGET_OPENCL_FP16)
return;
@ -675,7 +677,10 @@ TEST_P(Test_Caffe_nets, FasterRCNN_vgg16)
if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && (target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16))
applyTestTag(target == DNN_TARGET_OPENCL ? CV_TEST_TAG_DNN_SKIP_IE_OPENCL : CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16);
if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && target == DNN_TARGET_MYRIAD)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD);
#endif

@ -460,6 +460,9 @@ TEST_P(Test_Darknet_nets, YOLOv3)
{
applyTestTag(CV_TEST_TAG_LONG, (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB));
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_MYRIAD)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
// batchId, classId, confidence, left, top, right, bottom
Mat ref = (Mat_<float>(9, 7) << 0, 7, 0.952983f, 0.614622f, 0.150257f, 0.901369f, 0.289251f, // a truck
0, 1, 0.987908f, 0.150913f, 0.221933f, 0.742255f, 0.74626f, // a bicycle
@ -554,6 +557,11 @@ TEST_P(Test_Darknet_layers, reorg)
testDarknetLayer("reorg");
}
// Regression test for importing the Darknet "maxpool" layer.
TEST_P(Test_Darknet_layers, maxpool)
{
    testDarknetLayer("maxpool");
}
TEST_P(Test_Darknet_layers, convolutional)
{
if (target == DNN_TARGET_MYRIAD)
@ -563,6 +571,13 @@ TEST_P(Test_Darknet_layers, convolutional)
testDarknetLayer("convolutional", true);
}
// Regression test for importing the Darknet "connected" (fully connected)
// layer. Skipped on OpenCL FP16 with the OpenCV backend.
// NOTE(review): the second argument presumably loads trained weights as well
// — confirm against testDarknetLayer()'s signature.
TEST_P(Test_Darknet_layers, connected)
{
    if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
        applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
    testDarknetLayer("connected", true);
}
INSTANTIATE_TEST_CASE_P(/**/, Test_Darknet_layers, dnnBackendsAndTargets());
}} // namespace

@ -503,6 +503,9 @@ TEST_P(Async, create_layer_pipeline_set_and_forward_all)
if (backendId != DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && backendId != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
throw SkipTestException("No support for async forward");
if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
setInferenceEngineBackendType(CV_DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_API);
else if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)

@ -196,20 +196,58 @@ TEST_P(Test_TensorFlow_layers, concat_axis_1)
runTensorFlowNet("concat_axis_1");
}
TEST_P(Test_TensorFlow_layers, batch_norm)
// Imports the "batch_norm" TensorFlow graph with default options.
TEST_P(Test_TensorFlow_layers, batch_norm_1)
{
    runTensorFlowNet("batch_norm");
}
// Same "batch_norm" graph; the trailing `true` presumably switches to
// loading the model from memory — confirm against runTensorFlowNet().
TEST_P(Test_TensorFlow_layers, batch_norm_2)
{
    runTensorFlowNet("batch_norm", false, 0.0, 0.0, true);
}
// Fused batch normalization variant ("fused_batch_norm" graph).
TEST_P(Test_TensorFlow_layers, batch_norm_3)
{
    runTensorFlowNet("fused_batch_norm");
}
// "fused_batch_norm" with the trailing flag set (presumably in-memory model
// load — confirm against runTensorFlowNet()).
TEST_P(Test_TensorFlow_layers, batch_norm_4)
{
    runTensorFlowNet("fused_batch_norm", false, 0.0, 0.0, true);
}
// "batch_norm_text" graph; the `true` presumably selects a text-format
// (.pbtxt) model — confirm against runTensorFlowNet().
TEST_P(Test_TensorFlow_layers, batch_norm_5)
{
    runTensorFlowNet("batch_norm_text", true);
}
// "batch_norm_text" with both the text-format flag and the trailing flag set
// (presumably in-memory load — confirm against runTensorFlowNet()).
TEST_P(Test_TensorFlow_layers, batch_norm_6)
{
    runTensorFlowNet("batch_norm_text", true, 0.0, 0.0, true);
}
// "unfused_batch_norm" graph (batch norm expressed via primitive ops).
TEST_P(Test_TensorFlow_layers, batch_norm_7)
{
    runTensorFlowNet("unfused_batch_norm");
}
// Fused batch norm without a gamma (scale) input.
TEST_P(Test_TensorFlow_layers, batch_norm_8)
{
    runTensorFlowNet("fused_batch_norm_no_gamma");
}
// Unfused batch norm without a gamma (scale) input.
TEST_P(Test_TensorFlow_layers, batch_norm_9)
{
    runTensorFlowNet("unfused_batch_norm_no_gamma");
}
// "mvn_batch_norm" graph (batch norm built on an MVN operation).
TEST_P(Test_TensorFlow_layers, batch_norm_10)
{
    runTensorFlowNet("mvn_batch_norm");
}
// MVN-based batch norm on 1x1 spatial input; skipped on Myriad with the
// Inference Engine nGraph backend.
TEST_P(Test_TensorFlow_layers, batch_norm_11)
{
    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_MYRIAD)
        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
    runTensorFlowNet("mvn_batch_norm_1x1");
}
// "switch_identity" graph.
TEST_P(Test_TensorFlow_layers, batch_norm_12)
{
    runTensorFlowNet("switch_identity");
}
// "keras_batch_norm_training" graph (Keras batch norm exported in training mode).
TEST_P(Test_TensorFlow_layers, batch_norm_13)
{
    runTensorFlowNet("keras_batch_norm_training");
}
@ -431,6 +469,8 @@ TEST_P(Test_TensorFlow_nets, MobileNet_SSD)
CV_TEST_TAG_DNN_SKIP_IE_NGRAPH,
CV_TEST_TAG_DNN_SKIP_IE_VERSION);
#endif
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
}
#endif
@ -703,10 +743,15 @@ TEST_P(Test_TensorFlow_nets, EAST_text_detection)
#if defined(INF_ENGINE_RELEASE)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_MYRIAD)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_OPENCL_FP16 &&
INF_ENGINE_VER_MAJOR_EQ(2019020000))
(INF_ENGINE_VER_MAJOR_EQ(2019020000) || INF_ENGINE_VER_MAJOR_GE(2020010000))
)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
#endif
checkBackend();
@ -843,6 +888,8 @@ TEST_P(Test_TensorFlow_layers, slice)
(target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16))
applyTestTag(target == DNN_TARGET_OPENCL ? CV_TEST_TAG_DNN_SKIP_IE_OPENCL : CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16,
CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
double l1 = target == DNN_TARGET_MYRIAD ? 4.9e-3 : default_l1;
runTensorFlowNet("crop2d", false, l1);
runTensorFlowNet("slice_4d");
runTensorFlowNet("strided_slice");
}

@ -229,9 +229,14 @@ TEST_P(Test_Torch_layers, net_logsoftmax)
runTorchNet("net_logsoftmax_spatial");
}
TEST_P(Test_Torch_layers, net_lp_pooling)
// Torch Lp-pooling model, "square" variant.
TEST_P(Test_Torch_layers, net_lp_pooling_square)
{
    runTorchNet("net_lp_pooling_square", "", false, true);
}
// Torch Lp-pooling model, "power" variant; skipped on Myriad with the
// Inference Engine nGraph backend.
TEST_P(Test_Torch_layers, net_lp_pooling_power)
{
    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_MYRIAD)
        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
    runTorchNet("net_lp_pooling_power", "", false, true);
}
@ -393,6 +398,10 @@ TEST_P(Test_Torch_nets, ENet_accuracy)
throw SkipTestException("");
if (backend == DNN_BACKEND_CUDA && target == DNN_TARGET_CUDA_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16);
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020010000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
#else
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target != DNN_TARGET_CPU)
{
if (target == DNN_TARGET_OPENCL_FP16) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
@ -400,12 +409,10 @@ TEST_P(Test_Torch_nets, ENet_accuracy)
if (target == DNN_TARGET_MYRIAD) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
throw SkipTestException("");
}
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target != DNN_TARGET_CPU)
#endif
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
{
if (target == DNN_TARGET_OPENCL_FP16) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
if (target == DNN_TARGET_OPENCL) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
if (target == DNN_TARGET_MYRIAD) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
throw SkipTestException("");
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
}
Net net;

@ -62,7 +62,7 @@ namespace cv
//! Imread flags
enum ImreadModes {
IMREAD_UNCHANGED = -1, //!< If set, return the loaded image as is (with alpha channel, otherwise it gets cropped).
IMREAD_UNCHANGED = -1, //!< If set, return the loaded image as is (with alpha channel, otherwise it gets cropped). Ignore EXIF orientation.
IMREAD_GRAYSCALE = 0, //!< If set, always convert image to the single channel grayscale image (codec internal conversion).
IMREAD_COLOR = 1, //!< If set, always convert image to the 3 channel BGR color image.
IMREAD_ANYDEPTH = 2, //!< If set, return 16-bit/32-bit image when the input has the corresponding depth, otherwise convert it to 8-bit.
@ -172,8 +172,9 @@ Currently, the following file formats are supported:
then the [GDAL](http://www.gdal.org) driver will be used in order to decode the image, supporting
the following formats: [Raster](http://www.gdal.org/formats_list.html),
[Vector](http://www.gdal.org/ogr_formats.html).
- If EXIF information are embedded in the image file, the EXIF orientation will be taken into account
and thus the image will be rotated accordingly except if the flag @ref IMREAD_IGNORE_ORIENTATION is passed.
- If EXIF information is embedded in the image file, the EXIF orientation will be taken into account
and thus the image will be rotated accordingly except if the flags @ref IMREAD_IGNORE_ORIENTATION
or @ref IMREAD_UNCHANGED are passed.
- Use the IMREAD_UNCHANGED flag to keep the floating point values from PFM image.
- By default number of pixels must be less than 2^30. Limit can be set using system
variable OPENCV_IO_MAX_IMAGE_PIXELS

@ -51,6 +51,8 @@
#undef max
#include <iostream>
#include <fstream>
#include <cerrno>
#include <opencv2/core/utils/logger.hpp>
#include <opencv2/core/utils/configuration.private.hpp>
@ -693,6 +695,23 @@ static bool imwrite_( const String& filename, const std::vector<Mat>& img_vec,
code = encoder->write( write_vec[0], params );
else
code = encoder->writemulti( write_vec, params ); //to be implemented
if (!code)
{
FILE* f = fopen( filename.c_str(), "wb" );
if ( !f )
{
if (errno == EACCES)
{
CV_LOG_WARNING(NULL, "imwrite_('" << filename << "'): can't open file for writing: permission denied");
}
}
else
{
fclose(f);
remove(filename.c_str());
}
}
}
catch (const cv::Exception& e)
{

@ -982,6 +982,14 @@ PyObject* pyopencv_from(const String& value)
return PyString_FromString(value.empty() ? "" : value.c_str());
}
#if CV_VERSION_MAJOR == 3
// Convert a std::string to a Python string object; an empty input yields an
// empty Python string.
template<>
PyObject* pyopencv_from(const std::string& value)
{
    if (value.empty())
        return PyString_FromString("");
    return PyString_FromString(value.c_str());
}
#endif
template<>
bool pyopencv_to(PyObject* obj, String &value, const ArgInfo& info)
{

@ -146,7 +146,8 @@ enum VideoCaptureProperties {
CAP_PROP_HUE =13, //!< Hue of the image (only for cameras).
CAP_PROP_GAIN =14, //!< Gain of the image (only for those cameras that support).
CAP_PROP_EXPOSURE =15, //!< Exposure (only for those cameras that support).
CAP_PROP_CONVERT_RGB =16, //!< Boolean flags indicating whether images should be converted to RGB.
CAP_PROP_CONVERT_RGB =16, //!< Boolean flags indicating whether images should be converted to RGB. <br/>
//!< *GStreamer note*: The flag is ignored in case if custom pipeline is used. It's user responsibility to interpret pipeline output.
CAP_PROP_WHITE_BALANCE_BLUE_U =17, //!< Currently unsupported.
CAP_PROP_RECTIFICATION =18, //!< Rectification flag for stereo cameras (note: only supported by DC1394 v 2.x backend currently).
CAP_PROP_MONOCHROME =19,
@ -631,7 +632,8 @@ public:
@param filename it can be:
- name of video file (eg. `video.avi`)
- or image sequence (eg. `img_%02d.jpg`, which will read samples like `img_00.jpg, img_01.jpg, img_02.jpg, ...`)
- or URL of video stream (eg. `protocol://host:port/script_name?script_params|auth`).
- or URL of video stream (eg. `protocol://host:port/script_name?script_params|auth`)
- or GStreamer pipeline string in gst-launch tool format in case if GStreamer is used as backend
Note that each video stream or IP camera feed has its own URL scheme. Please refer to the
documentation of source stream to know the right URL.
@param apiPreference preferred Capture API backends to use. Can be used to enforce a specific reader

File diff suppressed because it is too large Load Diff

@ -119,6 +119,8 @@ public:
for (int k = 0; k < n_frames; ++k)
{
checkFrameRead(k, cap);
if (::testing::Test::HasFailure() && k % 10 == 0)
break;
}
}
bool canSeek = false;
@ -138,6 +140,8 @@ public:
for (int k = 0; k < n_frames; k += 20)
{
checkFrameSeek(k, cap);
if (::testing::Test::HasFailure() && k % 10 == 0)
break;
}
}
@ -150,6 +154,8 @@ public:
for (int k = 0; k < 10; ++k)
{
checkFrameSeek(cvtest::TS::ptr()->get_rng().uniform(0, n_frames), cap);
if (::testing::Test::HasFailure() && k % 10 == 0)
break;
}
}
}
@ -217,6 +223,8 @@ public:
EXPECT_EQ(bunny_param.getWidth(), frame.cols);
EXPECT_EQ(bunny_param.getHeight(), frame.rows);
count_actual += 1;
if (::testing::Test::HasFailure() && count_actual % 10 == 0)
break;
}
if (count_prop > 0)
{
@ -272,6 +280,8 @@ public:
{
generateFrame(i, frame_count, img);
EXPECT_NO_THROW(writer << img);
if (::testing::Test::HasFailure() && i % 10 == 0)
break;
}
EXPECT_NO_THROW(writer.release());
}

@ -45,7 +45,7 @@ if(INSTALL_PYTHON_EXAMPLES)
add_subdirectory(python)
endif()
ocv_install_example_src("." CMakeLists.txt)
ocv_install_example_src("." CMakeLists.txt samples_utils.cmake)
if(INSTALL_C_EXAMPLES)
install(DIRECTORY data DESTINATION "${OPENCV_SAMPLES_SRC_INSTALL_PATH}" COMPONENT samples_data)
endif()

Loading…
Cancel
Save