diff --git a/CMakeLists.txt b/CMakeLists.txt
index f8a3be499e..9b4ac4c351 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1047,7 +1047,9 @@ endif()
 if(CMAKE_GENERATOR MATCHES Xcode)
   status("    Xcode:"          ${XCODE_VERSION})
 endif()
-if(NOT CMAKE_GENERATOR MATCHES "Xcode|Visual Studio")
+if(CMAKE_GENERATOR MATCHES "Xcode|Visual Studio|Multi-Config")
+  status("    Configuration:"  ${CMAKE_CONFIGURATION_TYPES})
+else()
   status("    Configuration:"  ${CMAKE_BUILD_TYPE})
 endif()
 
diff --git a/README.md b/README.md
index 0653a9e73e..b9897205ba 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,8 @@
 * Homepage: <https://opencv.org>
   * Courses: <https://opencv.org/courses>
 * Docs: <https://docs.opencv.org/master/>
-* Q&A forum: <http://answers.opencv.org>
+* Q&A forum: <https://forum.opencv.org>
+  * previous forum (read only): <http://answers.opencv.org>
 * Issue tracking: <https://github.com/opencv/opencv/issues>
 * Additional OpenCV functionality: <https://github.com/opencv/opencv_contrib> 
 
diff --git a/doc/js_tutorials/js_assets/js_template_matching_matchTemplate.html b/doc/js_tutorials/js_assets/js_template_matching_matchTemplate.html
index ad2bb54c48..b9f6871ec0 100644
--- a/doc/js_tutorials/js_assets/js_template_matching_matchTemplate.html
+++ b/doc/js_tutorials/js_assets/js_template_matching_matchTemplate.html
@@ -74,7 +74,8 @@ let utils = new Utils('errorMessage');
 utils.loadCode('codeSnippet', 'codeEditor');
 utils.loadImageToCanvas('lena.jpg', 'imageCanvasInput');
 utils.loadImageToCanvas('lenaFace.png', 'templateCanvasInput');
-utils.addFileInputHandler('fileInput', 'canvasInput');
+utils.addFileInputHandler('fileInput', 'imageCanvasInput');
+utils.addFileInputHandler('templateFileInput', 'templateCanvasInput');
 
 let tryIt = document.getElementById('tryIt');
 tryIt.addEventListener('click', () => {
diff --git a/doc/js_tutorials/js_setup/js_usage/js_usage.markdown b/doc/js_tutorials/js_setup/js_usage/js_usage.markdown
index e2191e6d41..5a8c3b87fa 100644
--- a/doc/js_tutorials/js_setup/js_usage/js_usage.markdown
+++ b/doc/js_tutorials/js_setup/js_usage/js_usage.markdown
@@ -82,7 +82,7 @@ In this tutorial, we just show a cv.Mat on screen. To show a cv.Mat, you need a
 
 You can use cv.imshow to show cv.Mat on the canvas.
 @code{.js}
-cv.imshow(mat, "outputCanvas");
+cv.imshow("outputCanvas", mat);
 @endcode
 
 Putting all of the steps together, the final index.html is shown below.
diff --git a/doc/py_tutorials/py_feature2d/py_sift_intro/py_sift_intro.markdown b/doc/py_tutorials/py_feature2d/py_sift_intro/py_sift_intro.markdown
index 656f5423c5..dee4df774a 100644
--- a/doc/py_tutorials/py_feature2d/py_sift_intro/py_sift_intro.markdown
+++ b/doc/py_tutorials/py_feature2d/py_sift_intro/py_sift_intro.markdown
@@ -20,10 +20,10 @@ scale invariant.
 
 ![image](images/sift_scale_invariant.jpg)
 
-So, in 2004, **D.Lowe**, University of British Columbia, came up with a new algorithm, Scale
+In 2004, **D.Lowe**, University of British Columbia, came up with a new algorithm, Scale
 Invariant Feature Transform (SIFT) in his paper, **Distinctive Image Features from Scale-Invariant
 Keypoints**, which extract keypoints and compute its descriptors. *(This paper is easy to understand
-and considered to be best material available on SIFT. So this explanation is just a short summary of
+and considered to be the best material available on SIFT. This explanation is just a short summary of
 this paper)*.
 
 There are mainly four steps involved in SIFT algorithm. We will see them one-by-one.
@@ -102,16 +102,17 @@ reasons. In that case, ratio of closest-distance to second-closest distance is t
 greater than 0.8, they are rejected. It eliminates around 90% of false matches while discards only
 5% correct matches, as per the paper.
 
-So this is a summary of SIFT algorithm. For more details and understanding, reading the original
-paper is highly recommended. Remember one thing, this algorithm is patented. So this algorithm is
-included in [the opencv contrib repo](https://github.com/opencv/opencv_contrib)
+This is a summary of the SIFT algorithm. For more details and understanding, reading the original
+paper is highly recommended.
 
 SIFT in OpenCV
 --------------
 
-So now let's see SIFT functionalities available in OpenCV. Let's start with keypoint detection and
-draw them. First we have to construct a SIFT object. We can pass different parameters to it which
-are optional and they are well explained in docs.
+Now let's see the SIFT functionality available in OpenCV. Note that it was previously only
+available in [the opencv contrib repo](https://github.com/opencv/opencv_contrib), but the patent
+expired in 2020, so it is now included in the main repo. Let's start by detecting keypoints
+and drawing them. First we have to construct a SIFT object. We can pass different optional
+parameters to it; they are well explained in the docs.
 @code{.py}
 import numpy as np
 import cv2 as cv
diff --git a/doc/tutorials/imgproc/imgtrans/hough_lines/hough_lines.markdown b/doc/tutorials/imgproc/imgtrans/hough_lines/hough_lines.markdown
index 496b956aed..5edff16879 100644
--- a/doc/tutorials/imgproc/imgtrans/hough_lines/hough_lines.markdown
+++ b/doc/tutorials/imgproc/imgtrans/hough_lines/hough_lines.markdown
@@ -224,7 +224,7 @@ First you apply the transform:
         -   *theta*: The resolution of the parameter \f$\theta\f$ in radians. We use **1 degree**
             (CV_PI/180)
         -   *threshold*: The minimum number of intersections to "*detect*" a line
-        -   *minLinLength*: The minimum number of points that can form a line. Lines with less than
+        -   *minLineLength*: The minimum number of points that can form a line. Lines with fewer than
             this number of points are disregarded.
         -   *maxLineGap*: The maximum gap between two points to be considered in the same line.
 
diff --git a/modules/core/include/opencv2/core/hal/intrin_wasm.hpp b/modules/core/include/opencv2/core/hal/intrin_wasm.hpp
index 22c4a34e52..2f835bb9f8 100644
--- a/modules/core/include/opencv2/core/hal/intrin_wasm.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_wasm.hpp
@@ -257,221 +257,20 @@ struct v_float64x2
     v128_t val;
 };
 
-namespace fallback
-{
-
-template<typename _Tp, int n> struct v_reg
-{
-    typedef _Tp lane_type;
-    enum { nlanes = n };
-
-    explicit v_reg(const _Tp* ptr) { for( int i = 0; i < n; i++ ) s[i] = ptr[i]; }
-
-    v_reg(_Tp s0, _Tp s1) { s[0] = s0; s[1] = s1; }
-
-    v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3) { s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; }
-
-    v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
-           _Tp s4, _Tp s5, _Tp s6, _Tp s7)
-    {
-        s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
-        s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
-    }
-
-    v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
-           _Tp s4, _Tp s5, _Tp s6, _Tp s7,
-           _Tp s8, _Tp s9, _Tp s10, _Tp s11,
-           _Tp s12, _Tp s13, _Tp s14, _Tp s15)
-    {
-        s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
-        s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
-        s[8] = s8; s[9] = s9; s[10] = s10; s[11] = s11;
-        s[12] = s12; s[13] = s13; s[14] = s14; s[15] = s15;
-    }
-
-    v_reg() {}
-
-    v_reg(const v_reg<_Tp, n> & r)
-    {
-        for( int i = 0; i < n; i++ )
-            s[i] = r.s[i];
-    }
-
-    _Tp get0() const { return s[0]; }
-
-    _Tp get(const int i) const { return s[i]; }
-    v_reg<_Tp, n> high() const
-    {
-        v_reg<_Tp, n> c;
-        int i;
-        for( i = 0; i < n/2; i++ )
-        {
-            c.s[i] = s[i+(n/2)];
-            c.s[i+(n/2)] = 0;
-        }
-        return c;
-    }
-
-    static v_reg<_Tp, n> zero()
-    {
-        v_reg<_Tp, n> c;
-        for( int i = 0; i < n; i++ )
-            c.s[i] = (_Tp)0;
-        return c;
-    }
-
-    static v_reg<_Tp, n> all(_Tp s)
-    {
-        v_reg<_Tp, n> c;
-        for( int i = 0; i < n; i++ )
-            c.s[i] = s;
-        return c;
-    }
-
-    template<typename _Tp2, int n2> v_reg<_Tp2, n2> reinterpret_as() const
-    {
-        size_t bytes = std::min(sizeof(_Tp2)*n2, sizeof(_Tp)*n);
-        v_reg<_Tp2, n2> c;
-        std::memcpy(&c.s[0], &s[0], bytes);
-        return c;
-    }
-
-    v_reg(const cv::v_uint8x16& v) { wasm_v128_store(&s, v.val); }
-    v_reg(const cv::v_int8x16& v) { wasm_v128_store(&s, v.val); }
-    v_reg(const cv::v_uint16x8& v) { wasm_v128_store(&s, v.val); }
-    v_reg(const cv::v_int16x8& v) { wasm_v128_store(&s, v.val); }
-    v_reg(const cv::v_uint32x4& v) { wasm_v128_store(&s, v.val); }
-    v_reg(const cv::v_int32x4& v) { wasm_v128_store(&s, v.val); }
-    v_reg(const cv::v_float32x4& v) { wasm_v128_store(&s, v.val); }
-    v_reg(const cv::v_float64x2& v) { wasm_v128_store(&s, v.val); }
-    v_reg(const cv::v_uint64x2& v) { wasm_v128_store(&s, v.val); }
-    v_reg(const cv::v_int64x2& v) { wasm_v128_store(&s, v.val); }
-
-    operator cv::v_uint8x16() const { return cv::v_uint8x16(wasm_v128_load(&s)); }
-    operator cv::v_int8x16() const { return cv::v_int8x16(wasm_v128_load(&s)); }
-    operator cv::v_uint16x8() const { return cv::v_uint16x8(wasm_v128_load(&s)); }
-    operator cv::v_int16x8() const { return cv::v_int16x8(wasm_v128_load(&s)); }
-    operator cv::v_uint32x4() const { return cv::v_uint32x4(wasm_v128_load(&s)); }
-    operator cv::v_int32x4() const { return cv::v_int32x4(wasm_v128_load(&s)); }
-    operator cv::v_float32x4() const { return cv::v_float32x4(wasm_v128_load(&s)); }
-    operator cv::v_float64x2() const { return cv::v_float64x2(wasm_v128_load(&s)); }
-    operator cv::v_uint64x2() const { return cv::v_uint64x2(wasm_v128_load(&s)); }
-    operator cv::v_int64x2() const { return cv::v_int64x2(wasm_v128_load(&s)); }
-
-    _Tp s[n];
-};
-
-typedef v_reg<uchar, 16> v_uint8x16;
-typedef v_reg<schar, 16> v_int8x16;
-typedef v_reg<ushort, 8> v_uint16x8;
-typedef v_reg<short, 8> v_int16x8;
-typedef v_reg<unsigned, 4> v_uint32x4;
-typedef v_reg<int, 4> v_int32x4;
-typedef v_reg<float, 4> v_float32x4;
-typedef v_reg<double, 2> v_float64x2;
-typedef v_reg<uint64, 2> v_uint64x2;
-typedef v_reg<int64, 2> v_int64x2;
-
-#define OPENCV_HAL_IMPL_BIN_OP(bin_op) \
-template<typename _Tp, int n> inline v_reg<_Tp, n> \
-    operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
-{ \
-    v_reg<_Tp, n> c; \
-    for( int i = 0; i < n; i++ ) \
-        c.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
-    return c; \
-} \
-template<typename _Tp, int n> inline v_reg<_Tp, n>& \
-    operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
-{ \
-    for( int i = 0; i < n; i++ ) \
-        a.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
-    return a; \
-}
-
-OPENCV_HAL_IMPL_BIN_OP(+)
-OPENCV_HAL_IMPL_BIN_OP(-)
-OPENCV_HAL_IMPL_BIN_OP(*)
-OPENCV_HAL_IMPL_BIN_OP(/)
-
-#define OPENCV_HAL_IMPL_BIT_OP(bit_op) \
-template<typename _Tp, int n> inline v_reg<_Tp, n> operator bit_op \
-    (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
-{ \
-    v_reg<_Tp, n> c; \
-    typedef typename V_TypeTraits<_Tp>::int_type itype; \
-    for( int i = 0; i < n; i++ ) \
-        c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
-                                                        V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
-    return c; \
-} \
-template<typename _Tp, int n> inline v_reg<_Tp, n>& operator \
-    bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
-{ \
-    typedef typename V_TypeTraits<_Tp>::int_type itype; \
-    for( int i = 0; i < n; i++ ) \
-        a.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
-                                                        V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
-    return a; \
-}
-
-OPENCV_HAL_IMPL_BIT_OP(&)
-OPENCV_HAL_IMPL_BIT_OP(|)
-OPENCV_HAL_IMPL_BIT_OP(^)
-
-template<typename _Tp, int n> inline v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a)
-{
-    v_reg<_Tp, n> c;
-    for( int i = 0; i < n; i++ )
-    {
-        c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int(~V_TypeTraits<_Tp>::reinterpret_int(a.s[i]));
-    }
-    return c;
-}
-
-#define OPENCV_HAL_IMPL_MATH_FUNC(func, cfunc, _Tp2) \
-template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a) \
-{ \
-    v_reg<_Tp2, n> c; \
-    for( int i = 0; i < n; i++ ) \
-        c.s[i] = cfunc(a.s[i]); \
-    return c; \
-}
-
-OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp)
-OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp)
-OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp)
-OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)
-OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)
-OPENCV_HAL_IMPL_MATH_FUNC(v_abs, (typename V_TypeTraits<_Tp>::abs_type)std::abs,
-                          typename V_TypeTraits<_Tp>::abs_type)
-OPENCV_HAL_IMPL_MATH_FUNC(v_round, cvRound, int)
-OPENCV_HAL_IMPL_MATH_FUNC(v_floor, cvFloor, int)
-OPENCV_HAL_IMPL_MATH_FUNC(v_ceil, cvCeil, int)
-OPENCV_HAL_IMPL_MATH_FUNC(v_trunc, int, int)
-
-#define OPENCV_HAL_IMPL_MINMAX_FUNC(func, cfunc) \
-template<typename _Tp, int n> inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
-{ \
-    v_reg<_Tp, n> c; \
-    for( int i = 0; i < n; i++ ) \
-        c.s[i] = cfunc(a.s[i], b.s[i]); \
-    return c; \
-}
-
-#define OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(func, cfunc) \
-template<typename _Tp, int n> inline _Tp func(const v_reg<_Tp, n>& a) \
-{ \
-    _Tp c = a.s[0]; \
-    for( int i = 1; i < n; i++ ) \
-        c = cfunc(c, a.s[i]); \
-    return c; \
-}
-
-OPENCV_HAL_IMPL_MINMAX_FUNC(v_min, std::min)
-OPENCV_HAL_IMPL_MINMAX_FUNC(v_max, std::max)
-OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_min, std::min)
-OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_max, std::max)
+namespace
+{
+#define OPENCV_HAL_IMPL_REINTERPRET_INT(ft, tt) \
+inline tt reinterpret_int(ft x) { union { ft l; tt i; } v; v.l = x; return v.i; }
+OPENCV_HAL_IMPL_REINTERPRET_INT(uchar, schar)
+OPENCV_HAL_IMPL_REINTERPRET_INT(schar, schar)
+OPENCV_HAL_IMPL_REINTERPRET_INT(ushort, short)
+OPENCV_HAL_IMPL_REINTERPRET_INT(short, short)
+OPENCV_HAL_IMPL_REINTERPRET_INT(unsigned, int)
+OPENCV_HAL_IMPL_REINTERPRET_INT(int, int)
+OPENCV_HAL_IMPL_REINTERPRET_INT(float, int)
+OPENCV_HAL_IMPL_REINTERPRET_INT(uint64, int64)
+OPENCV_HAL_IMPL_REINTERPRET_INT(int64, int64)
+OPENCV_HAL_IMPL_REINTERPRET_INT(double, int64)
 
 static const unsigned char popCountTable[] =
 {
@@ -492,1184 +291,7 @@ static const unsigned char popCountTable[] =
     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
     4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
 };
-
-template<typename _Tp, int n>
-inline v_reg<typename V_TypeTraits<_Tp>::abs_type, n> v_popcount(const v_reg<_Tp, n>& a)
-{
-    v_reg<typename V_TypeTraits<_Tp>::abs_type, n> b = v_reg<typename V_TypeTraits<_Tp>::abs_type, n>::zero();
-    for (int i = 0; i < (int)(n*sizeof(_Tp)); i++)
-        b.s[i/sizeof(_Tp)] += popCountTable[v_reinterpret_as_u8(a).s[i]];
-    return b;
-}
-
-template<typename _Tp, int n>
-inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
-                      v_reg<_Tp, n>& minval, v_reg<_Tp, n>& maxval )
-{
-    for( int i = 0; i < n; i++ )
-    {
-        minval.s[i] = std::min(a.s[i], b.s[i]);
-        maxval.s[i] = std::max(a.s[i], b.s[i]);
-    }
-}
-
-#define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \
-template<typename _Tp, int n> \
-inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
-{ \
-    typedef typename V_TypeTraits<_Tp>::int_type itype; \
-    v_reg<_Tp, n> c; \
-    for( int i = 0; i < n; i++ ) \
-        c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)-(int)(a.s[i] cmp_op b.s[i])); \
-    return c; \
-}
-
-OPENCV_HAL_IMPL_CMP_OP(<)
-OPENCV_HAL_IMPL_CMP_OP(>)
-OPENCV_HAL_IMPL_CMP_OP(<=)
-OPENCV_HAL_IMPL_CMP_OP(>=)
-OPENCV_HAL_IMPL_CMP_OP(==)
-OPENCV_HAL_IMPL_CMP_OP(!=)
-
-template<int n>
-inline v_reg<float, n> v_not_nan(const v_reg<float, n>& a)
-{
-    typedef typename V_TypeTraits<float>::int_type itype;
-    v_reg<float, n> c;
-    for (int i = 0; i < n; i++)
-        c.s[i] = V_TypeTraits<float>::reinterpret_from_int((itype)-(int)(a.s[i] == a.s[i]));
-    return c;
-}
-template<int n>
-inline v_reg<double, n> v_not_nan(const v_reg<double, n>& a)
-{
-    typedef typename V_TypeTraits<double>::int_type itype;
-    v_reg<double, n> c;
-    for (int i = 0; i < n; i++)
-        c.s[i] = V_TypeTraits<double>::reinterpret_from_int((itype)-(int)(a.s[i] == a.s[i]));
-    return c;
-}
-
-#define OPENCV_HAL_IMPL_ARITHM_OP(func, bin_op, cast_op, _Tp2) \
-template<typename _Tp, int n> \
-inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
-{ \
-    typedef _Tp2 rtype; \
-    v_reg<rtype, n> c; \
-    for( int i = 0; i < n; i++ ) \
-        c.s[i] = cast_op(a.s[i] bin_op b.s[i]); \
-    return c; \
-}
-
-OPENCV_HAL_IMPL_ARITHM_OP(v_add_wrap, +, (_Tp), _Tp)
-OPENCV_HAL_IMPL_ARITHM_OP(v_sub_wrap, -, (_Tp), _Tp)
-OPENCV_HAL_IMPL_ARITHM_OP(v_mul_wrap, *, (_Tp), _Tp)
-
-template<typename T> inline T _absdiff(T a, T b)
-{
-    return a > b ? a - b : b - a;
-}
-
-template<typename _Tp, int n>
-inline v_reg<typename V_TypeTraits<_Tp>::abs_type, n> v_absdiff(const v_reg<_Tp, n>& a, const v_reg<_Tp, n> & b)
-{
-    typedef typename V_TypeTraits<_Tp>::abs_type rtype;
-    v_reg<rtype, n> c;
-    const rtype mask = (rtype)(std::numeric_limits<_Tp>::is_signed ? (1 << (sizeof(rtype)*8 - 1)) : 0);
-    for( int i = 0; i < n; i++ )
-    {
-        rtype ua = a.s[i] ^ mask;
-        rtype ub = b.s[i] ^ mask;
-        c.s[i] = _absdiff(ua, ub);
-    }
-    return c;
-}
-
-inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
-{
-    v_float32x4 c;
-    for( int i = 0; i < c.nlanes; i++ )
-        c.s[i] = _absdiff(a.s[i], b.s[i]);
-    return c;
-}
-
-inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
-{
-    v_float64x2 c;
-    for( int i = 0; i < c.nlanes; i++ )
-        c.s[i] = _absdiff(a.s[i], b.s[i]);
-    return c;
-}
-
-template<typename _Tp, int n>
-inline v_reg<_Tp, n> v_absdiffs(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
-{
-    v_reg<_Tp, n> c;
-    for( int i = 0; i < n; i++)
-        c.s[i] = saturate_cast<_Tp>(std::abs(a.s[i] - b.s[i]));
-    return c;
-}
-
-template<typename _Tp, int n>
-inline v_reg<_Tp, n> v_invsqrt(const v_reg<_Tp, n>& a)
-{
-    v_reg<_Tp, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = 1.f/std::sqrt(a.s[i]);
-    return c;
-}
-
-template<typename _Tp, int n>
-inline v_reg<_Tp, n> v_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
-{
-    v_reg<_Tp, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = std::sqrt(a.s[i]*a.s[i] + b.s[i]*b.s[i]);
-    return c;
-}
-
-template<typename _Tp, int n>
-inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
-{
-    v_reg<_Tp, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = a.s[i]*a.s[i] + b.s[i]*b.s[i];
-    return c;
-}
-
-template<typename _Tp, int n>
-inline v_reg<_Tp, n> v_fma(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
-                           const v_reg<_Tp, n>& c)
-{
-    v_reg<_Tp, n> d;
-    for( int i = 0; i < n; i++ )
-        d.s[i] = a.s[i]*b.s[i] + c.s[i];
-    return d;
-}
-
-template<typename _Tp, int n>
-inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
-                              const v_reg<_Tp, n>& c)
-{
-    return v_fma(a, b, c);
-}
-
-template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
-    v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
-{
-    typedef typename V_TypeTraits<_Tp>::w_type w_type;
-    v_reg<w_type, n/2> c;
-    for( int i = 0; i < (n/2); i++ )
-        c.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1];
-    return c;
-}
-
-template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
-    v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, const v_reg<typename V_TypeTraits<_Tp>::w_type, n / 2>& c)
-{
-    typedef typename V_TypeTraits<_Tp>::w_type w_type;
-    v_reg<w_type, n/2> s;
-    for( int i = 0; i < (n/2); i++ )
-        s.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1] + c.s[i];
-    return s;
-}
-
-template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
-    v_dotprod_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
-{
-    typedef typename V_TypeTraits<_Tp>::q_type q_type;
-    v_reg<q_type, n/4> s;
-    for( int i = 0; i < (n/4); i++ )
-        s.s[i] = (q_type)a.s[i*4    ]*b.s[i*4    ] + (q_type)a.s[i*4 + 1]*b.s[i*4 + 1] +
-                 (q_type)a.s[i*4 + 2]*b.s[i*4 + 2] + (q_type)a.s[i*4 + 3]*b.s[i*4 + 3];
-    return s;
-}
-
-template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
-    v_dotprod_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
-                     const v_reg<typename V_TypeTraits<_Tp>::q_type, n / 4>& c)
-{
-    typedef typename V_TypeTraits<_Tp>::q_type q_type;
-    v_reg<q_type, n/4> s;
-    for( int i = 0; i < (n/4); i++ )
-        s.s[i] = (q_type)a.s[i*4    ]*b.s[i*4    ] + (q_type)a.s[i*4 + 1]*b.s[i*4 + 1] +
-                 (q_type)a.s[i*4 + 2]*b.s[i*4 + 2] + (q_type)a.s[i*4 + 3]*b.s[i*4 + 3] + c.s[i];
-    return s;
-}
-
-template<typename _Tp, int n> inline void v_mul_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
-                                                       v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c,
-                                                       v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& d)
-{
-    typedef typename V_TypeTraits<_Tp>::w_type w_type;
-    for( int i = 0; i < (n/2); i++ )
-    {
-        c.s[i] = (w_type)a.s[i]*b.s[i];
-        d.s[i] = (w_type)a.s[i+(n/2)]*b.s[i+(n/2)];
-    }
-}
-
-template<typename _Tp, int n> inline v_reg<_Tp, n> v_mul_hi(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
-{
-    typedef typename V_TypeTraits<_Tp>::w_type w_type;
-    v_reg<_Tp, n> c;
-    for (int i = 0; i < n; i++)
-        c.s[i] = (_Tp)(((w_type)a.s[i] * b.s[i]) >> sizeof(_Tp)*8);
-    return c;
-}
-
-template<typename _Tp, int n> inline void v_hsum(const v_reg<_Tp, n>& a,
-                                                 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c)
-{
-    typedef typename V_TypeTraits<_Tp>::w_type w_type;
-    for( int i = 0; i < (n/2); i++ )
-    {
-        c.s[i] = (w_type)a.s[i*2] + a.s[i*2+1];
-    }
-}
-
-#define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \
-template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \
-{ \
-    v_reg<_Tp, n> c; \
-    for( int i = 0; i < n; i++ ) \
-        c.s[i] = (_Tp)(a.s[i] shift_op imm); \
-    return c; \
-}
-
-OPENCV_HAL_IMPL_SHIFT_OP(<< )
-OPENCV_HAL_IMPL_SHIFT_OP(>> )
-
-#define OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(suffix,opA,opB) \
-template<int imm, typename _Tp, int n> inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a) \
-{ \
-    v_reg<_Tp, n> b; \
-    for (int i = 0; i < n; i++) \
-    { \
-        int sIndex = i opA imm; \
-        if (0 <= sIndex && sIndex < n) \
-        { \
-            b.s[i] = a.s[sIndex]; \
-        } \
-        else \
-        { \
-            b.s[i] = 0; \
-        } \
-    } \
-    return b; \
-} \
-template<int imm, typename _Tp, int n> inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
-{ \
-    v_reg<_Tp, n> c; \
-    for (int i = 0; i < n; i++) \
-    { \
-        int aIndex = i opA imm; \
-        int bIndex = i opA imm opB n; \
-        if (0 <= bIndex && bIndex < n) \
-        { \
-            c.s[i] = b.s[bIndex]; \
-        } \
-        else if (0 <= aIndex && aIndex < n) \
-        { \
-            c.s[i] = a.s[aIndex]; \
-        } \
-        else \
-        { \
-            c.s[i] = 0; \
-        } \
-    } \
-    return c; \
-}
-
-OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(left,  -, +)
-OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(right, +, -)
-
-template<typename _Tp, int n> inline typename V_TypeTraits<_Tp>::sum_type v_reduce_sum(const v_reg<_Tp, n>& a)
-{
-    typename V_TypeTraits<_Tp>::sum_type c = a.s[0];
-    for( int i = 1; i < n; i++ )
-        c += a.s[i];
-    return c;
-}
-
-inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
-                                 const v_float32x4& c, const v_float32x4& d)
-{
-    v_float32x4 r;
-    r.s[0] = a.s[0] + a.s[1] + a.s[2] + a.s[3];
-    r.s[1] = b.s[0] + b.s[1] + b.s[2] + b.s[3];
-    r.s[2] = c.s[0] + c.s[1] + c.s[2] + c.s[3];
-    r.s[3] = d.s[0] + d.s[1] + d.s[2] + d.s[3];
-    return r;
-}
-
-template<typename _Tp, int n> inline typename V_TypeTraits< typename V_TypeTraits<_Tp>::abs_type >::sum_type v_reduce_sad(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
-{
-    typename V_TypeTraits< typename V_TypeTraits<_Tp>::abs_type >::sum_type c = _absdiff(a.s[0], b.s[0]);
-    for (int i = 1; i < n; i++)
-        c += _absdiff(a.s[i], b.s[i]);
-    return c;
-}
-
-template<typename _Tp, int n> inline int v_signmask(const v_reg<_Tp, n>& a)
-{
-    int mask = 0;
-    for( int i = 0; i < n; i++ )
-        mask |= (V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0) << i;
-    return mask;
-}
-
-template<typename _Tp, int n> inline bool v_check_all(const v_reg<_Tp, n>& a)
-{
-    for( int i = 0; i < n; i++ )
-        if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) >= 0 )
-            return false;
-    return true;
-}
-
-template<typename _Tp, int n> inline bool v_check_any(const v_reg<_Tp, n>& a)
-{
-    for( int i = 0; i < n; i++ )
-        if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0 )
-            return true;
-    return false;
-}
-
-template<typename _Tp, int n> inline v_reg<_Tp, n> v_select(const v_reg<_Tp, n>& mask,
-                                                           const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
-{
-    typedef V_TypeTraits<_Tp> Traits;
-    typedef typename Traits::int_type int_type;
-    v_reg<_Tp, n> c;
-    for( int i = 0; i < n; i++ )
-    {
-        int_type m = Traits::reinterpret_int(mask.s[i]);
-        CV_DbgAssert(m == 0 || m == (~(int_type)0));  // restrict mask values: 0 or 0xff/0xffff/etc
-        c.s[i] = m ? a.s[i] : b.s[i];
-    }
-    return c;
-}
-
-template<typename _Tp, int n> inline void v_expand(const v_reg<_Tp, n>& a,
-                            v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b0,
-                            v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b1)
-{
-    for( int i = 0; i < (n/2); i++ )
-    {
-        b0.s[i] = a.s[i];
-        b1.s[i] = a.s[i+(n/2)];
-    }
-}
-
-template<typename _Tp, int n>
-inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
-v_expand_low(const v_reg<_Tp, n>& a)
-{
-    v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> b;
-    for( int i = 0; i < (n/2); i++ )
-        b.s[i] = a.s[i];
-    return b;
-}
-
-template<typename _Tp, int n>
-inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
-v_expand_high(const v_reg<_Tp, n>& a)
-{
-    v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> b;
-    for( int i = 0; i < (n/2); i++ )
-        b.s[i] = a.s[i+(n/2)];
-    return b;
-}
-
-template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::int_type, n>
-    v_reinterpret_as_int(const v_reg<_Tp, n>& a)
-{
-    v_reg<typename V_TypeTraits<_Tp>::int_type, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = V_TypeTraits<_Tp>::reinterpret_int(a.s[i]);
-    return c;
-}
-
-template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::uint_type, n>
-    v_reinterpret_as_uint(const v_reg<_Tp, n>& a)
-{
-    v_reg<typename V_TypeTraits<_Tp>::uint_type, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = V_TypeTraits<_Tp>::reinterpret_uint(a.s[i]);
-    return c;
-}
-
-template<typename _Tp, int n> inline void v_zip( const v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1,
-                                               v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1 )
-{
-    int i;
-    for( i = 0; i < n/2; i++ )
-    {
-        b0.s[i*2] = a0.s[i];
-        b0.s[i*2+1] = a1.s[i];
-    }
-    for( ; i < n; i++ )
-    {
-        b1.s[i*2-n] = a0.s[i];
-        b1.s[i*2-n+1] = a1.s[i];
-    }
-}
-
-template<typename _Tp>
-inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load(const _Tp* ptr)
-{
-    return v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>(ptr);
-}
-
-template<typename _Tp>
-inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_aligned(const _Tp* ptr)
-{
-    return v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>(ptr);
-}
-
-template<typename _Tp>
-inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_low(const _Tp* ptr)
-{
-    v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
-    for( int i = 0; i < c.nlanes/2; i++ )
-    {
-        c.s[i] = ptr[i];
-    }
-    return c;
-}
-
-template<typename _Tp>
-inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_halves(const _Tp* loptr, const _Tp* hiptr)
-{
-    v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
-    for( int i = 0; i < c.nlanes/2; i++ )
-    {
-        c.s[i] = loptr[i];
-        c.s[i+c.nlanes/2] = hiptr[i];
-    }
-    return c;
-}
-
-template<typename _Tp>
-inline v_reg<typename V_TypeTraits<_Tp>::w_type, V_TypeTraits<_Tp>::nlanes128 / 2>
-v_load_expand(const _Tp* ptr)
-{
-    typedef typename V_TypeTraits<_Tp>::w_type w_type;
-    v_reg<w_type, V_TypeTraits<w_type>::nlanes128> c;
-    for( int i = 0; i < c.nlanes; i++ )
-    {
-        c.s[i] = ptr[i];
-    }
-    return c;
-}
-
-template<typename _Tp>
-inline v_reg<typename V_TypeTraits<_Tp>::q_type, V_TypeTraits<_Tp>::nlanes128 / 4>
-v_load_expand_q(const _Tp* ptr)
-{
-    typedef typename V_TypeTraits<_Tp>::q_type q_type;
-    v_reg<q_type, V_TypeTraits<q_type>::nlanes128> c;
-    for( int i = 0; i < c.nlanes; i++ )
-    {
-        c.s[i] = ptr[i];
-    }
-    return c;
-}
-
-template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
-                                                            v_reg<_Tp, n>& b)
-{
-    int i, i2;
-    for( i = i2 = 0; i < n; i++, i2 += 2 )
-    {
-        a.s[i] = ptr[i2];
-        b.s[i] = ptr[i2+1];
-    }
-}
-
-template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
-                                                            v_reg<_Tp, n>& b, v_reg<_Tp, n>& c)
-{
-    int i, i3;
-    for( i = i3 = 0; i < n; i++, i3 += 3 )
-    {
-        a.s[i] = ptr[i3];
-        b.s[i] = ptr[i3+1];
-        c.s[i] = ptr[i3+2];
-    }
-}
-
-template<typename _Tp, int n>
-inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
-                                v_reg<_Tp, n>& b, v_reg<_Tp, n>& c,
-                                v_reg<_Tp, n>& d)
-{
-    int i, i4;
-    for( i = i4 = 0; i < n; i++, i4 += 4 )
-    {
-        a.s[i] = ptr[i4];
-        b.s[i] = ptr[i4+1];
-        c.s[i] = ptr[i4+2];
-        d.s[i] = ptr[i4+3];
-    }
-}
-
-template<typename _Tp, int n>
-inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
-                               const v_reg<_Tp, n>& b,
-                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
-{
-    int i, i2;
-    for( i = i2 = 0; i < n; i++, i2 += 2 )
-    {
-        ptr[i2] = a.s[i];
-        ptr[i2+1] = b.s[i];
-    }
-}
-
-template<typename _Tp, int n>
-inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
-                                const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
-                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
-{
-    int i, i3;
-    for( i = i3 = 0; i < n; i++, i3 += 3 )
-    {
-        ptr[i3] = a.s[i];
-        ptr[i3+1] = b.s[i];
-        ptr[i3+2] = c.s[i];
-    }
-}
-
-template<typename _Tp, int n> inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
-                                                            const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
-                                                            const v_reg<_Tp, n>& d,
-                                                            hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
-{
-    int i, i4;
-    for( i = i4 = 0; i < n; i++, i4 += 4 )
-    {
-        ptr[i4] = a.s[i];
-        ptr[i4+1] = b.s[i];
-        ptr[i4+2] = c.s[i];
-        ptr[i4+3] = d.s[i];
-    }
-}
-
-template<typename _Tp, int n>
-inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
-{
-    for( int i = 0; i < n; i++ )
-        ptr[i] = a.s[i];
-}
-
-template<typename _Tp, int n>
-inline void v_store_low(_Tp* ptr, const v_reg<_Tp, n>& a)
-{
-    for( int i = 0; i < (n/2); i++ )
-        ptr[i] = a.s[i];
-}
-
-template<typename _Tp, int n>
-inline void v_store_high(_Tp* ptr, const v_reg<_Tp, n>& a)
-{
-    for( int i = 0; i < (n/2); i++ )
-        ptr[i] = a.s[i+(n/2)];
-}
-
-template<typename _Tp, int n>
-inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a)
-{
-    for( int i = 0; i < n; i++ )
-        ptr[i] = a.s[i];
-}
-
-template<typename _Tp, int n>
-inline void v_store_aligned_nocache(_Tp* ptr, const v_reg<_Tp, n>& a)
-{
-    for( int i = 0; i < n; i++ )
-        ptr[i] = a.s[i];
-}
-
-template<typename _Tp, int n>
-inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/)
-{
-    for( int i = 0; i < n; i++ )
-        ptr[i] = a.s[i];
-}
-
-template<typename _Tp, int n>
-inline v_reg<_Tp, n> v_combine_low(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
-{
-    v_reg<_Tp, n> c;
-    for( int i = 0; i < (n/2); i++ )
-    {
-        c.s[i] = a.s[i];
-        c.s[i+(n/2)] = b.s[i];
-    }
-    return c;
-}
-
-template<typename _Tp, int n>
-inline v_reg<_Tp, n> v_combine_high(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
-{
-    v_reg<_Tp, n> c;
-    for( int i = 0; i < (n/2); i++ )
-    {
-        c.s[i] = a.s[i+(n/2)];
-        c.s[i+(n/2)] = b.s[i+(n/2)];
-    }
-    return c;
-}
-
-template<typename _Tp, int n>
-inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
-                        v_reg<_Tp, n>& low, v_reg<_Tp, n>& high)
-{
-    for( int i = 0; i < (n/2); i++ )
-    {
-        low.s[i] = a.s[i];
-        low.s[i+(n/2)] = b.s[i];
-        high.s[i] = a.s[i+(n/2)];
-        high.s[i+(n/2)] = b.s[i+(n/2)];
-    }
-}
-
-template<int s, typename _Tp, int n>
-inline v_reg<_Tp, n> v_extract(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
-{
-    v_reg<_Tp, n> r;
-    const int shift = n - s;
-    int i = 0;
-    for (; i < shift; ++i)
-        r.s[i] = a.s[i+s];
-    for (; i < n; ++i)
-        r.s[i] = b.s[i-shift];
-    return r;
-}
-
-template<int n> inline v_reg<int, n> v_round(const v_reg<float, n>& a)
-{
-    v_reg<int, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = cvRound(a.s[i]);
-    return c;
-}
-
-template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a, const v_reg<double, n>& b)
-{
-    v_reg<int, n*2> c;
-    for( int i = 0; i < n; i++ )
-    {
-        c.s[i] = cvRound(a.s[i]);
-        c.s[i+n] = cvRound(b.s[i]);
-    }
-    return c;
-}
-
-template<int n> inline v_reg<int, n> v_floor(const v_reg<float, n>& a)
-{
-    v_reg<int, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = cvFloor(a.s[i]);
-    return c;
-}
-
-template<int n> inline v_reg<int, n> v_ceil(const v_reg<float, n>& a)
-{
-    v_reg<int, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = cvCeil(a.s[i]);
-    return c;
-}
-
-template<int n> inline v_reg<int, n> v_trunc(const v_reg<float, n>& a)
-{
-    v_reg<int, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = (int)(a.s[i]);
-    return c;
-}
-
-template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a)
-{
-    v_reg<int, n*2> c;
-    for( int i = 0; i < n; i++ )
-    {
-        c.s[i] = cvRound(a.s[i]);
-        c.s[i+n] = 0;
-    }
-    return c;
-}
-
-template<int n> inline v_reg<int, n*2> v_floor(const v_reg<double, n>& a)
-{
-    v_reg<int, n*2> c;
-    for( int i = 0; i < n; i++ )
-    {
-        c.s[i] = cvFloor(a.s[i]);
-        c.s[i+n] = 0;
-    }
-    return c;
-}
-
-template<int n> inline v_reg<int, n*2> v_ceil(const v_reg<double, n>& a)
-{
-    v_reg<int, n*2> c;
-    for( int i = 0; i < n; i++ )
-    {
-        c.s[i] = cvCeil(a.s[i]);
-        c.s[i+n] = 0;
-    }
-    return c;
-}
-
-template<int n> inline v_reg<int, n*2> v_trunc(const v_reg<double, n>& a)
-{
-    v_reg<int, n*2> c;
-    for( int i = 0; i < n; i++ )
-    {
-        c.s[i] = (int)(a.s[i]);
-        c.s[i+n] = 0;
-    }
-    return c;
-}
-
-template<int n> inline v_reg<float, n> v_cvt_f32(const v_reg<int, n>& a)
-{
-    v_reg<float, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = (float)a.s[i];
-    return c;
-}
-
-template<int n> inline v_reg<float, n*2> v_cvt_f32(const v_reg<double, n>& a)
-{
-    v_reg<float, n*2> c;
-    for( int i = 0; i < n; i++ )
-    {
-        c.s[i] = (float)a.s[i];
-        c.s[i+n] = 0;
-    }
-    return c;
-}
-
-template<int n> inline v_reg<float, n*2> v_cvt_f32(const v_reg<double, n>& a, const v_reg<double, n>& b)
-{
-    v_reg<float, n*2> c;
-    for( int i = 0; i < n; i++ )
-    {
-        c.s[i] = (float)a.s[i];
-        c.s[i+n] = (float)b.s[i];
-    }
-    return c;
-}
-
-inline v_float64x2 v_cvt_f64(const v_int32x4& a)
-{
-    v_float64x2 c;
-    for( int i = 0; i < 2; i++ )
-        c.s[i] = (double)a.s[i];
-    return c;
-}
-
-inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
-{
-    v_float64x2 c;
-    for( int i = 0; i < 2; i++ )
-        c.s[i] = (double)a.s[i+2];
-    return c;
-}
-
-inline v_float64x2 v_cvt_f64(const v_float32x4& a)
-{
-    v_float64x2 c;
-    for( int i = 0; i < 2; i++ )
-        c.s[i] = (double)a.s[i];
-    return c;
-}
-
-inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
-{
-    v_float64x2 c;
-    for( int i = 0; i < 2; i++ )
-        c.s[i] = (double)a.s[i+2];
-    return c;
-}
-
-inline v_float64x2 v_cvt_f64(const v_int64x2& a)
-{
-    v_float64x2 c;
-    for( int i = 0; i < 2; i++ )
-        c.s[i] = (double)a.s[i];
-    return c;
-}
-
-template<typename _Tp> inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut(const _Tp* tab, const int* idx)
-{
-    v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
-    for (int i = 0; i < V_TypeTraits<_Tp>::nlanes128; i++)
-        c.s[i] = tab[idx[i]];
-    return c;
-}
-template<typename _Tp> inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut_pairs(const _Tp* tab, const int* idx)
-{
-    v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
-    for (int i = 0; i < V_TypeTraits<_Tp>::nlanes128; i++)
-        c.s[i] = tab[idx[i / 2] + i % 2];
-    return c;
-}
-template<typename _Tp> inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut_quads(const _Tp* tab, const int* idx)
-{
-    v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
-    for (int i = 0; i < V_TypeTraits<_Tp>::nlanes128; i++)
-        c.s[i] = tab[idx[i / 4] + i % 4];
-    return c;
-}
-
-template<int n> inline v_reg<int, n> v_lut(const int* tab, const v_reg<int, n>& idx)
-{
-    v_reg<int, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = tab[idx.s[i]];
-    return c;
-}
-
-template<int n> inline v_reg<unsigned, n> v_lut(const unsigned* tab, const v_reg<int, n>& idx)
-{
-    v_reg<int, n> c;
-    for (int i = 0; i < n; i++)
-        c.s[i] = tab[idx.s[i]];
-    return c;
-}
-
-template<int n> inline v_reg<float, n> v_lut(const float* tab, const v_reg<int, n>& idx)
-{
-    v_reg<float, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = tab[idx.s[i]];
-    return c;
-}
-
-template<int n> inline v_reg<double, n> v_lut(const double* tab, const v_reg<int, n*2>& idx)
-{
-    v_reg<double, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = tab[idx.s[i]];
-    return c;
-}
-
-template<int n> inline void v_lut_deinterleave(const float* tab, const v_reg<int, n>& idx,
-                                               v_reg<float, n>& x, v_reg<float, n>& y)
-{
-    for( int i = 0; i < n; i++ )
-    {
-        int j = idx.s[i];
-        x.s[i] = tab[j];
-        y.s[i] = tab[j+1];
-    }
-}
-
-template<int n> inline void v_lut_deinterleave(const double* tab, const v_reg<int, n*2>& idx,
-                                               v_reg<double, n>& x, v_reg<double, n>& y)
-{
-    for( int i = 0; i < n; i++ )
-    {
-        int j = idx.s[i];
-        x.s[i] = tab[j];
-        y.s[i] = tab[j+1];
-    }
-}
-
-template<typename _Tp, int n> inline v_reg<_Tp, n> v_interleave_pairs(const v_reg<_Tp, n>& vec)
-{
-    v_reg<_Tp, n> c;
-    for (int i = 0; i < n/4; i++)
-    {
-        c.s[4*i  ] = vec.s[4*i  ];
-        c.s[4*i+1] = vec.s[4*i+2];
-        c.s[4*i+2] = vec.s[4*i+1];
-        c.s[4*i+3] = vec.s[4*i+3];
-    }
-    return c;
-}
-
-template<typename _Tp, int n> inline v_reg<_Tp, n> v_interleave_quads(const v_reg<_Tp, n>& vec)
-{
-    v_reg<_Tp, n> c;
-    for (int i = 0; i < n/8; i++)
-    {
-        c.s[8*i  ] = vec.s[8*i  ];
-        c.s[8*i+1] = vec.s[8*i+4];
-        c.s[8*i+2] = vec.s[8*i+1];
-        c.s[8*i+3] = vec.s[8*i+5];
-        c.s[8*i+4] = vec.s[8*i+2];
-        c.s[8*i+5] = vec.s[8*i+6];
-        c.s[8*i+6] = vec.s[8*i+3];
-        c.s[8*i+7] = vec.s[8*i+7];
-    }
-    return c;
-}
-
-template<typename _Tp, int n> inline v_reg<_Tp, n> v_pack_triplets(const v_reg<_Tp, n>& vec)
-{
-    v_reg<_Tp, n> c;
-    for (int i = 0; i < n/4; i++)
-    {
-        c.s[3*i  ] = vec.s[4*i  ];
-        c.s[3*i+1] = vec.s[4*i+1];
-        c.s[3*i+2] = vec.s[4*i+2];
-    }
-    return c;
-}
-
-template<typename _Tp>
-inline void v_transpose4x4( v_reg<_Tp, 4>& a0, const v_reg<_Tp, 4>& a1,
-                            const v_reg<_Tp, 4>& a2, const v_reg<_Tp, 4>& a3,
-                            v_reg<_Tp, 4>& b0, v_reg<_Tp, 4>& b1,
-                            v_reg<_Tp, 4>& b2, v_reg<_Tp, 4>& b3 )
-{
-    b0 = v_reg<_Tp, 4>(a0.s[0], a1.s[0], a2.s[0], a3.s[0]);
-    b1 = v_reg<_Tp, 4>(a0.s[1], a1.s[1], a2.s[1], a3.s[1]);
-    b2 = v_reg<_Tp, 4>(a0.s[2], a1.s[2], a2.s[2], a3.s[2]);
-    b3 = v_reg<_Tp, 4>(a0.s[3], a1.s[3], a2.s[3], a3.s[3]);
-}
-
-#define OPENCV_HAL_IMPL_C_INIT_ZERO(_Tpvec, _Tp, suffix) \
-inline _Tpvec v_setzero_##suffix() { return _Tpvec::zero(); }
-
-OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x16, uchar, u8)
-OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x16, schar, s8)
-OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x8, ushort, u16)
-OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x8, short, s16)
-OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x4, unsigned, u32)
-OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x4, int, s32)
-OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x4, float, f32)
-OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x2, double, f64)
-OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x2, uint64, u64)
-OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x2, int64, s64)
-
-#define OPENCV_HAL_IMPL_C_INIT_VAL(_Tpvec, _Tp, suffix) \
-inline _Tpvec v_setall_##suffix(_Tp val) { return _Tpvec::all(val); }
-
-OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x16, schar, s8)
-OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x8, ushort, u16)
-OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x8, short, s16)
-OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x4, unsigned, u32)
-OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x4, int, s32)
-OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x4, float, f32)
-OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x2, double, f64)
-OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x2, uint64, u64)
-OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x2, int64, s64)
-
-#define OPENCV_HAL_IMPL_C_REINTERPRET(_Tpvec, _Tp, suffix) \
-template<typename _Tp0, int n0> inline _Tpvec \
-    v_reinterpret_as_##suffix(const v_reg<_Tp0, n0>& a) \
-{ return a.template reinterpret_as<_Tp, _Tpvec::nlanes>(); }
-
-OPENCV_HAL_IMPL_C_REINTERPRET(v_uint8x16, uchar, u8)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_int8x16, schar, s8)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_uint16x8, ushort, u16)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_int16x8, short, s16)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_uint32x4, unsigned, u32)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_int32x4, int, s32)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_float32x4, float, f32)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_float64x2, double, f64)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_uint64x2, uint64, u64)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_int64x2, int64, s64)
-
-#define OPENCV_HAL_IMPL_C_SHIFTL(_Tpvec, _Tp) \
-template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
-{ return a << n; }
-
-OPENCV_HAL_IMPL_C_SHIFTL(v_uint16x8, ushort)
-OPENCV_HAL_IMPL_C_SHIFTL(v_int16x8, short)
-OPENCV_HAL_IMPL_C_SHIFTL(v_uint32x4, unsigned)
-OPENCV_HAL_IMPL_C_SHIFTL(v_int32x4, int)
-OPENCV_HAL_IMPL_C_SHIFTL(v_uint64x2, uint64)
-OPENCV_HAL_IMPL_C_SHIFTL(v_int64x2, int64)
-
-#define OPENCV_HAL_IMPL_C_SHIFTR(_Tpvec, _Tp) \
-template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
-{ return a >> n; }
-
-OPENCV_HAL_IMPL_C_SHIFTR(v_uint16x8, ushort)
-OPENCV_HAL_IMPL_C_SHIFTR(v_int16x8, short)
-OPENCV_HAL_IMPL_C_SHIFTR(v_uint32x4, unsigned)
-OPENCV_HAL_IMPL_C_SHIFTR(v_int32x4, int)
-OPENCV_HAL_IMPL_C_SHIFTR(v_uint64x2, uint64)
-OPENCV_HAL_IMPL_C_SHIFTR(v_int64x2, int64)
-
-#define OPENCV_HAL_IMPL_C_RSHIFTR(_Tpvec, _Tp) \
-template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
-{ \
-    _Tpvec c; \
-    for( int i = 0; i < _Tpvec::nlanes; i++ ) \
-        c.s[i] = (_Tp)((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
-    return c; \
-}
-
-OPENCV_HAL_IMPL_C_RSHIFTR(v_uint16x8, ushort)
-OPENCV_HAL_IMPL_C_RSHIFTR(v_int16x8, short)
-OPENCV_HAL_IMPL_C_RSHIFTR(v_uint32x4, unsigned)
-OPENCV_HAL_IMPL_C_RSHIFTR(v_int32x4, int)
-OPENCV_HAL_IMPL_C_RSHIFTR(v_uint64x2, uint64)
-OPENCV_HAL_IMPL_C_RSHIFTR(v_int64x2, int64)
-
-#define OPENCV_HAL_IMPL_C_PACK(_Tpvec, _Tpnvec, _Tpn, pack_suffix, cast) \
-inline _Tpnvec v_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
-{ \
-    _Tpnvec c; \
-    for( int i = 0; i < _Tpvec::nlanes; i++ ) \
-    { \
-        c.s[i] = cast<_Tpn>(a.s[i]); \
-        c.s[i+_Tpvec::nlanes] = cast<_Tpn>(b.s[i]); \
-    } \
-    return c; \
-}
-
-OPENCV_HAL_IMPL_C_PACK(v_uint16x8, v_uint8x16, uchar, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_int8x16, schar, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK(v_uint32x4, v_uint16x8, ushort, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_int16x8, short, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK(v_uint64x2, v_uint32x4, unsigned, pack, static_cast)
-OPENCV_HAL_IMPL_C_PACK(v_int64x2, v_int32x4, int, pack, static_cast)
-OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_uint8x16, uchar, pack_u, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_uint16x8, ushort, pack_u, saturate_cast)
-
-#define OPENCV_HAL_IMPL_C_RSHR_PACK(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \
-template<int n> inline _Tpnvec v_rshr_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
-{ \
-    _Tpnvec c; \
-    for( int i = 0; i < _Tpvec::nlanes; i++ ) \
-    { \
-        c.s[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
-        c.s[i+_Tpvec::nlanes] = cast<_Tpn>((b.s[i] + ((_Tp)1 << (n - 1))) >> n); \
-    } \
-    return c; \
-}
-
-OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint16x8, ushort, v_uint8x16, uchar, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_int8x16, schar, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint32x4, unsigned, v_uint16x8, ushort, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_int16x8, short, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint64x2, uint64, v_uint32x4, unsigned, pack, static_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK(v_int64x2, int64, v_int32x4, int, pack, static_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_uint8x16, uchar, pack_u, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast)
-
-#define OPENCV_HAL_IMPL_C_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \
-inline void v_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
-{ \
-    for( int i = 0; i < _Tpvec::nlanes; i++ ) \
-        ptr[i] = cast<_Tpn>(a.s[i]); \
-}
-
-OPENCV_HAL_IMPL_C_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_int16x8, short, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack, static_cast)
-OPENCV_HAL_IMPL_C_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack, static_cast)
-OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast)
-
-#define OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \
-template<int n> inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
-{ \
-    for( int i = 0; i < _Tpvec::nlanes; i++ ) \
-        ptr[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
-}
-
-OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_int16x8, short, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack, static_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack, static_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast)
-
-template<typename _Tpm, typename _Tp, int n>
-inline void _pack_b(_Tpm* mptr, const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
-{
-    for (int i = 0; i < n; ++i)
-    {
-        mptr[i] = (_Tpm)a.s[i];
-        mptr[i + n] = (_Tpm)b.s[i];
-    }
-}
-
-inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
-{
-    v_uint8x16 mask;
-    _pack_b(mask.s, a, b);
-    return mask;
-}
-
-inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
-                           const v_uint32x4& c, const v_uint32x4& d)
-{
-    v_uint8x16 mask;
-    _pack_b(mask.s, a, b);
-    _pack_b(mask.s + 8, c, d);
-    return mask;
-}
-
-inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
-                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
-                           const v_uint64x2& g, const v_uint64x2& h)
-{
-    v_uint8x16 mask;
-    _pack_b(mask.s, a, b);
-    _pack_b(mask.s + 4, c, d);
-    _pack_b(mask.s + 8, e, f);
-    _pack_b(mask.s + 12, g, h);
-    return mask;
-}
-
-inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
-                            const v_float32x4& m1, const v_float32x4& m2,
-                            const v_float32x4& m3)
-{
-    return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + v.s[3]*m3.s[0],
-                       v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + v.s[3]*m3.s[1],
-                       v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + v.s[3]*m3.s[2],
-                       v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + v.s[3]*m3.s[3]);
-}
-
-inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
-                               const v_float32x4& m1, const v_float32x4& m2,
-                               const v_float32x4& m3)
-{
-    return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + m3.s[0],
-                       v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + m3.s[1],
-                       v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + m3.s[2],
-                       v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + m3.s[3]);
-}
-
-inline v_reg<float, V_TypeTraits<float>::nlanes128>
-v_load_expand(const float16_t* ptr)
-{
-    v_reg<float, V_TypeTraits<float>::nlanes128> v;
-    for( int i = 0; i < v.nlanes; i++ )
-    {
-        v.s[i] = ptr[i];
-    }
-    return v;
-}
-
-inline void
-v_pack_store(float16_t* ptr, const v_reg<float, V_TypeTraits<float>::nlanes128>& v)
-{
-    for( int i = 0; i < v.nlanes; i++ )
-    {
-        ptr[i] = float16_t(v.s[i]);
-    }
-}
-
-inline void v_cleanup() {}
-}  // namespace fallback
+}  // namespace
 
 static v128_t wasm_unpacklo_i8x16(v128_t a, v128_t b) {
     return wasm_v8x16_shuffle(a, b, 0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23);
@@ -2554,14 +1176,6 @@ inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
     return _Tpvec(intrin(a.val, b.val)); \
 }
 
-#define OPENCV_HAL_IMPL_WASM_BIN_FUNC_FALLBACK(_Tpvec, func, intrin) \
-inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
-{ \
-    fallback::_Tpvec a_(a); \
-    fallback::_Tpvec b_(b); \
-    return _Tpvec(fallback::func(a_, b_)); \
-}
-
 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float32x4, v_min, wasm_f32x4_min)
 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float32x4, v_max, wasm_f32x4_max)
 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float64x2, v_min, wasm_f64x2_min)
@@ -2654,8 +1268,24 @@ OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint16x8, v_sub_wrap, wasm_i16x8_sub)
 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int16x8, v_sub_wrap, wasm_i16x8_sub)
 #if (__EMSCRIPTEN_major__ * 1000000 + __EMSCRIPTEN_minor__ * 1000 + __EMSCRIPTEN_tiny__) >= (2000000)
 // details: https://github.com/opencv/opencv/issues/18097 ( https://github.com/emscripten-core/emscripten/issues/12018 )
-OPENCV_HAL_IMPL_WASM_BIN_FUNC_FALLBACK(v_uint8x16, v_mul_wrap, wasm_i8x16_mul)
-OPENCV_HAL_IMPL_WASM_BIN_FUNC_FALLBACK(v_int8x16, v_mul_wrap, wasm_i8x16_mul)
+inline v_uint8x16 v_mul_wrap(const v_uint8x16& a, const v_uint8x16& b)
+{
+    uchar a_[16], b_[16];
+    wasm_v128_store(a_, a.val);
+    wasm_v128_store(b_, b.val);
+    for (int i = 0; i < 16; i++)
+        a_[i] = (uchar)(a_[i] * b_[i]);
+    return wasm_v128_load(a_);
+}
+inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b)
+{
+    schar a_[16], b_[16];
+    wasm_v128_store(a_, a.val);
+    wasm_v128_store(b_, b.val);
+    for (int i = 0; i < 16; i++)
+        a_[i] = (schar)(a_[i] * b_[i]);
+    return wasm_v128_load(a_);
+}
 #else
 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint8x16, v_mul_wrap, wasm_i8x16_mul)
 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int8x16, v_mul_wrap, wasm_i8x16_mul)
@@ -2919,13 +1549,17 @@ inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
 } \
 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
 { \
-    fallback::_Tpvec a_(a); \
-    fallback::v_store_low(ptr, a_); \
+    _Tpvec::lane_type a_[_Tpvec::nlanes]; \
+    wasm_v128_store(a_, a.val); \
+    for (int i = 0; i < (_Tpvec::nlanes / 2); i++) \
+        ptr[i] = a_[i]; \
 } \
 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
 { \
-    fallback::_Tpvec a_(a); \
-    fallback::v_store_high(ptr, a_); \
+    _Tpvec::lane_type a_[_Tpvec::nlanes]; \
+    wasm_v128_store(a_, a.val); \
+    for (int i = 0; i < (_Tpvec::nlanes / 2); i++) \
+        ptr[i] = a_[i + (_Tpvec::nlanes / 2)]; \
 }
 
 OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint8x16, uchar)
@@ -2991,8 +1625,12 @@ OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(v_float32x4, float, v128_t, f32x4, f32x4)
 #define OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(_Tpvec, scalartype) \
 inline scalartype v_reduce_sum(const _Tpvec& a) \
 { \
-    fallback::_Tpvec a_(a); \
-    return fallback::v_reduce_sum(a_); \
+    _Tpvec::lane_type a_[_Tpvec::nlanes]; \
+    wasm_v128_store(a_, a.val); \
+    scalartype c = a_[0]; \
+    for (int i = 1; i < _Tpvec::nlanes; i++) \
+        c += a_[i]; \
+    return c; \
 }
 
 OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_uint8x16, unsigned)
@@ -3116,8 +1754,11 @@ inline v_uint32x4 v_popcount(const v_uint32x4& a)
 }
 inline v_uint64x2 v_popcount(const v_uint64x2& a)
 {
-    fallback::v_uint64x2 a_(a);
-    return fallback::v_popcount(a_);
+    uint64 a_[2], b_[2] = { 0 };  // a_: spilled input lanes, b_: per-lane bit counts (zero-initialised)
+    wasm_v128_store(a_, a.val);
+    for (int i = 0; i < 16; i++)  // walk all 16 bytes; bytes 0-7 accumulate into lane 0, 8-15 into lane 1
+        b_[i / 8] += popCountTable[((uint8_t*)a_)[i]];
+    return wasm_v128_load(b_);
 }
 inline v_uint8x16 v_popcount(const v_int8x16& a)
 { return v_popcount(v_reinterpret_as_u8(a)); }
@@ -3131,8 +1772,12 @@ inline v_uint64x2 v_popcount(const v_int64x2& a)
 #define OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(_Tpvec, suffix, scalarType) \
 inline int v_signmask(const _Tpvec& a) \
 { \
-    fallback::_Tpvec a_(a); \
-    return fallback::v_signmask(a_); \
+    _Tpvec::lane_type a_[_Tpvec::nlanes]; \
+    wasm_v128_store(a_, a.val); \
+    int mask = 0; \
+    for (int i = 0; i < _Tpvec::nlanes; i++) \
+        mask |= (reinterpret_int(a_[i]) < 0) << i; \
+    return mask; \
 } \
 inline bool v_check_all(const _Tpvec& a) \
 { return wasm_i8x16_all_true(wasm_##suffix##_lt(a.val, wasm_##suffix##_splat(0))); } \
@@ -3287,22 +1932,35 @@ inline v_int32x4 v_ceil(const v_float32x4& a)
 inline v_int32x4 v_trunc(const v_float32x4& a)
 { return v_int32x4(wasm_i32x4_trunc_saturate_f32x4(a.val)); }
 
-#define OPENCV_HAL_IMPL_WASM_MATH_FUNC(func, cfunc, _Tpvec, _Tpnvec, _Tp, _Tpn) \
-inline _Tpnvec func(const _Tpvec& a) \
+#define OPENCV_HAL_IMPL_WASM_MATH_FUNC(func, cfunc) \
+inline v_int32x4 func(const v_float64x2& a) \
 { \
-    fallback::_Tpvec a_(a); \
-    return fallback::func(a_); \
+    double a_[2]; /* scalar copy of the two f64 lanes */ \
+    wasm_v128_store(a_, a.val); \
+    int c_[4]; \
+    c_[0] = cfunc(a_[0]); /* lane 0 converted with the scalar helper */ \
+    c_[1] = cfunc(a_[1]); /* lane 1 converted with the scalar helper */ \
+    c_[2] = 0; /* upper two int lanes are zero-filled */ \
+    c_[3] = 0; \
+    return wasm_v128_load(c_); \
 }
 
-OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_round, cvRound, v_float64x2, v_int32x4, double, int)
-OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_floor, cvFloor, v_float64x2, v_int32x4, double, int)
-OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_ceil, cvCeil, v_float64x2, v_int32x4, double, int)
-OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_trunc, int, v_float64x2, v_int32x4, double, int)
+OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_round, cvRound)
+OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_floor, cvFloor)
+OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_ceil, cvCeil)
+OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_trunc, int)
 
 inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
 {
-    fallback::v_float64x2 a_(a), b_(b);
-    return fallback::v_round(a_, b_);
+    double a_[2], b_[2];
+    wasm_v128_store(a_, a.val);
+    wasm_v128_store(b_, b.val);
+    int c_[4];
+    c_[0] = cvRound(a_[0]);
+    c_[1] = cvRound(a_[1]);
+    c_[2] = cvRound(b_[0]);
+    c_[3] = cvRound(b_[1]);
+    return wasm_v128_load(c_);
 }
 
 #define OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(_Tpvec, suffix) \
@@ -3796,14 +2454,27 @@ inline v_float32x4 v_cvt_f32(const v_int32x4& a)
 
 inline v_float32x4 v_cvt_f32(const v_float64x2& a)
 {
-    fallback::v_float64x2 a_(a);
-    return fallback::v_cvt_f32(a_);
+    double a_[2];
+    wasm_v128_store(a_, a.val);
+    float c_[4];
+    c_[0] = (float)(a_[0]);
+    c_[1] = (float)(a_[1]);
+    c_[2] = 0;
+    c_[3] = 0;
+    return wasm_v128_load(c_);
 }
 
 inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
 {
-    fallback::v_float64x2 a_(a), b_(b);
-    return fallback::v_cvt_f32(a_, b_);
+    double a_[2], b_[2];
+    wasm_v128_store(a_, a.val);
+    wasm_v128_store(b_, b.val);
+    float c_[4];
+    c_[0] = (float)(a_[0]);
+    c_[1] = (float)(a_[1]);
+    c_[2] = (float)(b_[0]);
+    c_[3] = (float)(b_[1]);
+    return wasm_v128_load(c_);
 }
 
 inline v_float64x2 v_cvt_f64(const v_int32x4& a)
@@ -3812,8 +2483,12 @@ inline v_float64x2 v_cvt_f64(const v_int32x4& a)
     v128_t p = v128_cvti32x4_i64x2(a.val);
     return v_float64x2(wasm_f64x2_convert_i64x2(p));
 #else
-    fallback::v_int32x4 a_(a);
-    return fallback::v_cvt_f64(a_);
+    int a_[4];
+    wasm_v128_store(a_, a.val);
+    double c_[2];
+    c_[0] = (double)(a_[0]);
+    c_[1] = (double)(a_[1]);
+    return wasm_v128_load(c_);
 #endif
 }
 
@@ -3823,21 +2498,33 @@ inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
     v128_t p = v128_cvti32x4_i64x2_high(a.val);
     return v_float64x2(wasm_f64x2_convert_i64x2(p));
 #else
-    fallback::v_int32x4 a_(a);
-    return fallback::v_cvt_f64_high(a_);
+    int a_[4];
+    wasm_v128_store(a_, a.val);
+    double c_[2];
+    c_[0] = (double)(a_[2]);
+    c_[1] = (double)(a_[3]);
+    return wasm_v128_load(c_);
 #endif
 }
 
 inline v_float64x2 v_cvt_f64(const v_float32x4& a)
 {
-    fallback::v_float32x4 a_(a);
-    return fallback::v_cvt_f64(a_);
+    float a_[4];
+    wasm_v128_store(a_, a.val);
+    double c_[2];
+    c_[0] = (double)(a_[0]);
+    c_[1] = (double)(a_[1]);
+    return wasm_v128_load(c_);
 }
 
 inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
 {
-    fallback::v_float32x4 a_(a);
-    return fallback::v_cvt_f64_high(a_);
+    float a_[4];
+    wasm_v128_store(a_, a.val);
+    double c_[2];
+    c_[0] = (double)(a_[2]);
+    c_[1] = (double)(a_[3]);
+    return wasm_v128_load(c_);
 }
 
 inline v_float64x2 v_cvt_f64(const v_int64x2& a)
@@ -3845,8 +2532,12 @@ inline v_float64x2 v_cvt_f64(const v_int64x2& a)
 #ifdef __wasm_unimplemented_simd128__
     return v_float64x2(wasm_f64x2_convert_i64x2(a.val));
 #else
-    fallback::v_int64x2 a_(a);
-    return fallback::v_cvt_f64(a_);
+    int64 a_[2];
+    wasm_v128_store(a_, a.val);
+    double c_[2];
+    c_[0] = (double)(a_[0]);
+    c_[1] = (double)(a_[1]);
+    return wasm_v128_load(c_);
 #endif
 }
 
@@ -4063,13 +2754,20 @@ inline v_float32x4 v_broadcast_element(const v_float32x4& a)
 
 inline v_float32x4 v_load_expand(const float16_t* ptr)
 {
-    return fallback::v_load_expand(ptr);
+    float a[4];
+    for (int i = 0; i < 4; i++)
+        a[i] = ptr[i];
+    return wasm_v128_load(a);
 }
 
 inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
 {
-    fallback::v_float32x4 v_(v);
-    fallback::v_pack_store(ptr, v_);
+    float v_[4];  // must be float: the 128-bit store below writes four f32 lanes (16 bytes)
+    wasm_v128_store(v_, v.val);
+    ptr[0] = float16_t(v_[0]);
+    ptr[1] = float16_t(v_[1]);
+    ptr[2] = float16_t(v_[2]);
+    ptr[3] = float16_t(v_[3]);
 }
 
 inline void v_cleanup() {}
diff --git a/modules/dnn/src/tensorflow/tf_importer.cpp b/modules/dnn/src/tensorflow/tf_importer.cpp
index 679055de4a..e704911c12 100644
--- a/modules/dnn/src/tensorflow/tf_importer.cpp
+++ b/modules/dnn/src/tensorflow/tf_importer.cpp
@@ -11,6 +11,11 @@ Implementation of Tensorflow models parser
 
 #include "../precomp.hpp"
 
+#include <opencv2/core/utils/logger.defines.hpp>
+#undef CV_LOG_STRIP_LEVEL
+#define CV_LOG_STRIP_LEVEL CV_LOG_LEVEL_DEBUG + 1
+#include <opencv2/core/utils/logger.hpp>
+
 #ifdef HAVE_PROTOBUF
 #include "tf_io.hpp"
 
@@ -93,7 +98,7 @@ void blobShapeFromTensor(const tensorflow::TensorProto &tensor, MatShape& shape)
                 shape[i] = (int)_shape.dim(i).size();
         }
         else
-            shape.resize(1, 1);  // Scalar.
+            shape.resize(1, 1);  // Scalar. // FIXIT: should be empty
     }
     else
     {
@@ -258,7 +263,7 @@ const tensorflow::AttrValue& getLayerAttr(const tensorflow::NodeDef &layer, cons
     return layer.attr().at(name);
 }
 
-static int getDataLayout(const tensorflow::NodeDef& layer)
+static DataLayout getDataLayout(const tensorflow::NodeDef& layer)
 {
     if (hasLayerAttr(layer, "data_format"))
     {
@@ -280,10 +285,13 @@ static inline std::string getNodeName(const std::string& tensorName)
     return tensorName.substr(0, tensorName.rfind(':'));
 }
 
-static inline int getDataLayout(const std::string& layerName,
-                                const std::map<String, int>& data_layouts)
+static inline
+DataLayout getDataLayout(
+        const std::string& layerName,
+        const std::map<String, DataLayout>& data_layouts
+)
 {
-    std::map<String, int>::const_iterator it = data_layouts.find(getNodeName(layerName));
+    std::map<String, DataLayout>::const_iterator it = data_layouts.find(getNodeName(layerName));
     return it != data_layouts.end() ? it->second : DATA_LAYOUT_UNKNOWN;
 }
 
@@ -439,15 +447,20 @@ void ExcludeLayer(tensorflow::GraphDef& net, const int layer_index, const int in
         net.mutable_node()->DeleteSubrange(layer_index, 1);
 }
 
-class TFImporter {
+class TFImporter
+{
 public:
-    TFImporter(const char *model, const char *config = NULL);
-    TFImporter(const char *dataModel, size_t lenModel,
+    TFImporter(Net& net, const char *model, const char *config = NULL);
+    TFImporter(Net& net, const char *dataModel, size_t lenModel,
                const char *dataConfig = NULL, size_t lenConfig = 0);
+protected:
+    Net& dstNet;
+    void populateNet();
 
-    void populateNet(Net dstNet);
+    void parseNode(const tensorflow::NodeDef& layer);
+
+    DataLayout predictOutputDataLayout(const tensorflow::NodeDef& layer);
 
-private:
     void kernelFromTensor(const tensorflow::TensorProto &tensor, Mat &dstBlob);
 
     void connect(const std::map<String, int>& layers_name_id_map, Net& network, const Pin& outPin,
@@ -467,23 +480,53 @@ private:
 
     std::vector<String> netInputsNames;
     std::vector<MatShape> netInputShapes;
+
+    std::set<String> layers_to_ignore;
+    std::map<String, DataLayout> data_layouts;
+
+    // Const (and dequantized) parameter nodes: node name -> node index, filled by addConstNodes()
+    std::map<String, int> value_id;
+    // A map with constant blobs which are shared between multiple layers.
+    std::map<String, Mat> sharedWeights;
+
+    std::map<String, int> layer_id;
 };
 
-TFImporter::TFImporter(const char *model, const char *config)
+TFImporter::TFImporter(Net& net, const char *model, const char *config)
+    : dstNet(net)
 {
     if (model && model[0])
+    {
+        CV_LOG_DEBUG(NULL, "DNN/TF: processing TensorFlow model from file: " << model);
         ReadTFNetParamsFromBinaryFileOrDie(model, &netBin);
+    }
     if (config && config[0])
+    {
+        CV_LOG_DEBUG(NULL, "DNN/TF: processing TensorFlow config from file: " << config);
         ReadTFNetParamsFromTextFileOrDie(config, &netTxt);
+    }
+
+    populateNet();
 }
 
-TFImporter::TFImporter(const char *dataModel, size_t lenModel,
-                       const char *dataConfig, size_t lenConfig)
+TFImporter::TFImporter(
+        Net& net,
+        const char *dataModel, size_t lenModel,
+        const char *dataConfig, size_t lenConfig
+)
+    : dstNet(net)
 {
     if (dataModel != NULL && lenModel > 0)
+    {
+        CV_LOG_DEBUG(NULL, "DNN/TF: processing TensorFlow model from memory (" << lenModel << " bytes)");
         ReadTFNetParamsFromBinaryBufferOrDie(dataModel, lenModel, &netBin);
+    }
     if (dataConfig != NULL && lenConfig > 0)
+    {
+        CV_LOG_DEBUG(NULL, "DNN/TF: processing TensorFlow config from memory (" << lenConfig << " bytes)");
         ReadTFNetParamsFromTextBufferOrDie(dataConfig, lenConfig, &netTxt);
+    }
+    populateNet();
 }
 
 void TFImporter::kernelFromTensor(const tensorflow::TensorProto &tensor, Mat &dstBlob)
@@ -612,84 +655,98 @@ const tensorflow::TensorProto& TFImporter::getConstBlob(const tensorflow::NodeDe
 static void addConstNodes(tensorflow::GraphDef& net, std::map<String, int>& const_layers,
                           std::set<String>& layers_to_ignore)
 {
+    CV_LOG_DEBUG(NULL, "DNN/TF: addConstNodes(): handling " << net.node_size() << " nodes...");
     for (int li = 0; li < net.node_size(); li++)
     {
         const tensorflow::NodeDef &layer = net.node(li);
         String name = layer.name();
         String type = layer.op();
 
-        if (type == "Dequantize")
+        //CV_LOG_DEBUG(NULL, "DNN/TF: layer_id=" << li << " - '" << name << "' @ " << type);
+
+        try
         {
-            // Example of Dequantize node:
-            //   name: "conv2d_1/bias"
-            //   op: "Dequantize"
-            //   input: "conv2d_1/bias_quantized_const" (tensor of dtype DT_QUINT8)
-            //   input: "conv2d_1/bias_quantized_min"
-            //   input: "conv2d_1/bias_quantized_max"
-            //   attr { key: "T" value { type: DT_QUINT8 } }   (quantized type)
-            //   attr { key: "mode" value { s: "MIN_FIRST" } } (quantization technique)
-            CV_Assert(layer.input_size() == 3);
-            for (int i = 0; i < 3; ++i)
-                CV_Assert(const_layers.find(layer.input(i)) != const_layers.end());
-            CV_Assert(hasLayerAttr(layer, "mode") &&
-                      getLayerAttr(layer, "mode").s() == "MIN_FIRST");
-
-            int tensorId = const_layers[layer.input(0)];
-            int minId = const_layers[layer.input(1)];
-            int maxId = const_layers[layer.input(2)];
-
-            tensorflow::TensorProto* tensor = net.mutable_node(tensorId)
-                                                ->mutable_attr()->at("value")
-                                                 .mutable_tensor();
-            CV_Assert(tensor->dtype() == tensorflow::DT_QUINT8);
-
-            Mat qMin = getTensorContent(net.node(minId).attr().at("value").tensor());
-            Mat qMax = getTensorContent(net.node(maxId).attr().at("value").tensor());
-            CV_Assert_N(qMin.total() == 1, qMin.type() == CV_32FC1,
-                        qMax.total() == 1, qMax.type() == CV_32FC1);
-
-            Mat content = getTensorContent(*tensor);
-
-            float minVal = qMin.at<float>(0);
-            float rangeScale = (qMax.at<float>(0) - minVal) / 255;
-            CV_Assert(rangeScale >= 0);
-            content.convertTo(content, CV_32FC1, rangeScale,
-                              rangeScale * cvRound(minVal / rangeScale));
-
-            tensor->set_dtype(tensorflow::DT_FLOAT);
-            tensor->set_tensor_content(content.data, content.total() * content.elemSize1());
-
-            net.mutable_node(tensorId)->set_name(name);
-            CV_Assert(const_layers.insert(std::make_pair(name, tensorId)).second);
+            if (type == "Dequantize")
+            {
+                // Example of Dequantize node:
+                //   name: "conv2d_1/bias"
+                //   op: "Dequantize"
+                //   input: "conv2d_1/bias_quantized_const" (tensor of dtype DT_QUINT8)
+                //   input: "conv2d_1/bias_quantized_min"
+                //   input: "conv2d_1/bias_quantized_max"
+                //   attr { key: "T" value { type: DT_QUINT8 } }   (quantized type)
+                //   attr { key: "mode" value { s: "MIN_FIRST" } } (quantization technique)
+                CV_CheckEQ(layer.input_size(), 3, "Dequantize: 3 inputs is supported only");
+                for (int i = 0; i < 3; ++i)
+                    CV_Assert(const_layers.find(layer.input(i)) != const_layers.end());
+                CV_Assert(hasLayerAttr(layer, "mode") &&
+                          getLayerAttr(layer, "mode").s() == "MIN_FIRST");
+
+                int tensorId = const_layers[layer.input(0)];
+                int minId = const_layers[layer.input(1)];
+                int maxId = const_layers[layer.input(2)];
+
+                tensorflow::TensorProto* tensor = net.mutable_node(tensorId)
+                                                    ->mutable_attr()->at("value")
+                                                     .mutable_tensor();
+                CV_CheckEQ((int)tensor->dtype(), (int)tensorflow::DT_QUINT8, "");
+
+                Mat qMin = getTensorContent(net.node(minId).attr().at("value").tensor());
+                Mat qMax = getTensorContent(net.node(maxId).attr().at("value").tensor());
+                CV_CheckEQ(qMin.total(), (size_t)1, "");
+                CV_CheckTypeEQ(qMin.type(), CV_32FC1, "");
+                CV_CheckEQ(qMax.total(), (size_t)1, "");
+                CV_CheckTypeEQ(qMax.type(), CV_32FC1, "");
+
+                Mat content = getTensorContent(*tensor);
+
+                float minVal = qMin.at<float>(0);
+                float rangeScale = (qMax.at<float>(0) - minVal) / 255;
+                CV_Assert(rangeScale >= 0);
+                content.convertTo(content, CV_32FC1, rangeScale,
+                                  rangeScale * cvRound(minVal / rangeScale));
+
+                tensor->set_dtype(tensorflow::DT_FLOAT);
+                tensor->set_tensor_content(content.data, content.total() * content.elemSize1());
+
+                net.mutable_node(tensorId)->set_name(name);
+                CV_Assert(const_layers.insert(std::make_pair(name, tensorId)).second);
+                layers_to_ignore.insert(name);
+                continue;
+            }
+            else if (type != "Const")
+                continue;  // only Const parameters are supported
+
+            if (layer.attr().find("value") != layer.attr().end())
+            {
+                CV_Assert(const_layers.insert(std::make_pair(name, li)).second);
+            }
             layers_to_ignore.insert(name);
-            continue;
         }
-        else if (type != "Const")
-            continue;  // only Const parameters are supported
-
-        if (layer.attr().find("value") != layer.attr().end())
+        catch (const std::exception& e)
         {
-            CV_Assert(const_layers.insert(std::make_pair(name, li)).second);
+            CV_LOG_ERROR(NULL, "DNN/TF: Can't handle node='" << name << "'. Exception: " << e.what());
+            throw;
         }
-        layers_to_ignore.insert(name);
     }
+    CV_LOG_DEBUG(NULL, "DNN/TF: layers_to_ignore.size() = " << layers_to_ignore.size());
 }
 
 // If all inputs of specific layer have the same data layout we can say that
 // this layer's output has this data layout too. Returns DATA_LAYOUT_UNKNOWN otherwise.
-static int predictOutputDataLayout(const tensorflow::GraphDef& net,
-                                   const tensorflow::NodeDef& layer,
-                                   const std::map<String, int>& data_layouts)
+DataLayout TFImporter::predictOutputDataLayout(const tensorflow::NodeDef& layer)
 {
-    int layout = getDataLayout(layer);
+    DataLayout layout = getDataLayout(layer);
     if (layout != DATA_LAYOUT_UNKNOWN)
+    {
+        CV_LOG_DEBUG(NULL, "DNN/TF: predictOutputDataLayout(" << layer.name() << " @ " << layer.op() << ") => " << (int)layout << " (from attrs)");
         return layout;
+    }
 
     // Determine layout by layer's inputs
-    std::map<String, int>::const_iterator it;
     for (int i = 0, n = layer.input_size(); i < n; ++i)
     {
-        it = data_layouts.find(getNodeName(layer.input(i)));
+        std::map<String, DataLayout>::const_iterator it = data_layouts.find(getNodeName(layer.input(i)));
         if (it != data_layouts.end())
         {
             if (layout != DATA_LAYOUT_UNKNOWN)
@@ -703,71 +760,72 @@ static int predictOutputDataLayout(const tensorflow::GraphDef& net,
     }
 
     if (layout != DATA_LAYOUT_UNKNOWN)
+    {
+        CV_LOG_DEBUG(NULL, "DNN/TF: predictOutputDataLayout(" << layer.name() << " @ " << layer.op() << ") => " << (int)layout << " (from inputs)");
         return layout;
+    }
 
     // Determine layout by layer's consumers recursively.
-    it = data_layouts.find(layer.name());
+    std::map<String, DataLayout>::const_iterator it = data_layouts.find(layer.name());
     CV_Assert(it != data_layouts.end());
     return it->second;
 }
 
-void TFImporter::populateNet(Net dstNet)
+void TFImporter::populateNet()
 {
-    if (!netTxt.ByteSize())
-        removePhaseSwitches(netBin);
+    CV_Assert(netBin.ByteSize() || netTxt.ByteSize());
 
-    RemoveIdentityOps(netBin);
-    RemoveIdentityOps(netTxt);
+    CV_LOG_INFO(NULL, "DNN/TF: parsing model"
+        << (netBin.has_versions() ? cv::format(" produced by TF v%d (min_consumer=%d)", (int)netBin.versions().producer(), (int)netBin.versions().min_consumer()) : cv::String(" (N/A version info)"))
+        << ". Number of nodes = " << netBin.node_size()
+    );
 
-    if (!netTxt.ByteSize())
+    if (netTxt.ByteSize())
     {
-        simplifySubgraphs(netBin);
-        sortByExecutionOrder(netBin);
+        CV_LOG_INFO(NULL, "DNN/TF: parsing config"
+            << (netTxt.has_versions() ? cv::format(" produced by TF v%d (min_consumer=%d)", (int)netTxt.versions().producer(), (int)netTxt.versions().min_consumer()) : cv::String(" (N/A version info)"))
+            << ". Number of nodes = " << netTxt.node_size()
+        );
+
+        RemoveIdentityOps(netBin);
+        CV_LOG_DEBUG(NULL, "DNN/TF: RemoveIdentityOps(model) => " << netBin.node_size() << " nodes");
+        RemoveIdentityOps(netTxt);
+        CV_LOG_DEBUG(NULL, "DNN/TF: RemoveIdentityOps(config) => " << netTxt.node_size() << " nodes");
+
+        sortByExecutionOrder(netTxt);
+        CV_LOG_DEBUG(NULL, "DNN/TF: sortByExecutionOrder(config) => " << netTxt.node_size() << " nodes");
     }
     else
     {
-        sortByExecutionOrder(netTxt);
-    }
+        removePhaseSwitches(netBin);
+        CV_LOG_DEBUG(NULL, "DNN/TF: removePhaseSwitches(model) => " << netBin.node_size() << " nodes");
 
-    std::set<String> layers_to_ignore;
+        RemoveIdentityOps(netBin);
+        CV_LOG_DEBUG(NULL, "DNN/TF: RemoveIdentityOps(model) => " << netBin.node_size() << " nodes");
+
+        simplifySubgraphs(netBin);
+        CV_LOG_DEBUG(NULL, "DNN/TF: simplifySubgraphs(model) => " << netBin.node_size() << " nodes");
+        sortByExecutionOrder(netBin);
+        CV_LOG_DEBUG(NULL, "DNN/TF: sortByExecutionOrder(model) => " << netBin.node_size() << " nodes");
+    }
 
     tensorflow::GraphDef& net = netTxt.ByteSize() != 0 ? netTxt : netBin;
 
     int layersSize = net.node_size();
 
-    std::map<String, int> data_layouts;
     // Pre-fill data layouts where they are set explicitly.
     // Assuming that nodes are in topological order
-    for (int i = net.node_size() - 1; i >= 0; --i)
+    for (int i = layersSize - 1; i >= 0; --i)
     {
         const tensorflow::NodeDef& layer = net.node(i);
         std::string name = layer.name();
 
-        int layout = getDataLayout(layer);
-        std::map<String, int>::iterator it = data_layouts.find(name);
-        if (it != data_layouts.end())
-        {
-            if (layout != DATA_LAYOUT_UNKNOWN)
-            {
-                if (it->second == DATA_LAYOUT_UNKNOWN)
-                    it->second = layout;
-                else if (it->second != layout)
-                {
-                    it->second = DATA_LAYOUT_UNKNOWN;
-                    layout = DATA_LAYOUT_UNKNOWN;
-                }
-            }
-            else
-                layout = it->second;
-        }
-        else
-            data_layouts[name] = layout;
+        CV_LOG_DEBUG(NULL, "DNN/TF: node(" << i << " - '" << name << "') propagating layout...");
 
-        // Specify input layers to have the same data layout.
-        for (int j = 0; j < layer.input_size(); ++j)
+        try
         {
-            name = getNodeName(layer.input(j));
-            it = data_layouts.find(name);
+            DataLayout layout = getDataLayout(layer);
+            std::map<String, DataLayout>::iterator it = data_layouts.find(name);
             if (it != data_layouts.end())
             {
                 if (layout != DATA_LAYOUT_UNKNOWN)
@@ -775,38 +833,94 @@ void TFImporter::populateNet(Net dstNet)
                     if (it->second == DATA_LAYOUT_UNKNOWN)
                         it->second = layout;
                     else if (it->second != layout)
+                    {
                         it->second = DATA_LAYOUT_UNKNOWN;
+                        layout = DATA_LAYOUT_UNKNOWN;
+                    }
                 }
+                else
+                    layout = it->second;
             }
             else
                 data_layouts[name] = layout;
+
+            // Specify input layers to have the same data layout.
+            for (int j = 0; j < layer.input_size(); ++j)
+            {
+                name = getNodeName(layer.input(j));
+                it = data_layouts.find(name);
+                if (it != data_layouts.end())
+                {
+                    if (layout != DATA_LAYOUT_UNKNOWN)
+                    {
+                        if (it->second == DATA_LAYOUT_UNKNOWN)
+                            it->second = layout;
+                        else if (it->second != layout)
+                            it->second = DATA_LAYOUT_UNKNOWN;
+                    }
+                }
+                else
+                    data_layouts[name] = layout;
+            }
+        }
+        catch (const std::exception& e)
+        {
+            CV_LOG_ERROR(NULL, "DNN/TF: Can't propagate layout for node='" << name << "'. Exception: " << e.what());
+            throw;
         }
     }
 
-    // find all Const layers for params
-    std::map<String, int> value_id;
-    // A map with constant blobs which are shared between multiple layers.
-    std::map<String, Mat> sharedWeights;
     addConstNodes(netBin, value_id, layers_to_ignore);
     addConstNodes(netTxt, value_id, layers_to_ignore);
 
-    std::map<String, int> layer_id;
 
     for (int li = 0; li < layersSize; li++)
     {
-        tensorflow::NodeDef layer = net.node(li);
-        String name = layer.name();
-        String type = layer.op();
+        const tensorflow::NodeDef& layer = net.node(li);
+
+        const std::string name = layer.name();
+        const std::string type = layer.op();
+        const int ninputs = layer.input_size();
+        CV_LOG_DEBUG(NULL, "DNN/TF: (" << li << "/" << layersSize << ") Parse layer " << name << " @ " << type << " with " << ninputs << " inputs");
+
+        parseNode(layer);
+    }
+
+    for (size_t i = 0; i < netInputsNames.size(); i++)
+    {
+        CV_LOG_DEBUG(NULL, "DNN/TF: Model input: " << i << " - '" << netInputsNames[i] << "'");
+        CV_Assert(!netInputsNames[i].empty());
+    }
+    dstNet.setInputsNames(netInputsNames);
+    CV_LOG_DEBUG(NULL, "DNN/TF: ===================== Import completed =====================");
+}
+
+void TFImporter::parseNode(const tensorflow::NodeDef& layer_)
+{
+    tensorflow::NodeDef layer = layer_;
+
+    tensorflow::GraphDef& net = netTxt.ByteSize() != 0 ? netTxt : netBin;
+
+    /*const*/ std::string name = layer.name();
+    /*const*/ std::string type = layer.op();
+    /*const*/ int num_inputs = layer.input_size();
+
+    try
+    {
         LayerParams layerParams;
 
-        if(layers_to_ignore.find(name) != layers_to_ignore.end())
-            continue;
+        if (layers_to_ignore.find(name) != layers_to_ignore.end())
+        {
+            CV_LOG_DEBUG(NULL, "DNN/TF:     ignored");
+            return;
+        }
 
-        int predictedLayout = predictOutputDataLayout(net, layer, data_layouts);
+        DataLayout predictedLayout = predictOutputDataLayout(layer);
         data_layouts[name] = predictedLayout;
 
         if (type == "Conv2D" || type == "SpaceToBatchND" || type == "DepthwiseConv2dNative" || type == "Pad" || type == "MirrorPad" || type == "Conv3D")
         {
+            CV_CheckGT(num_inputs, 0, "");
             // The first node of dilated convolution subgraph.
             // Extract input node, dilation rate and paddings.
             std::string input = layer.input(0);
@@ -824,7 +938,7 @@ void TFImporter::populateNet(Net dstNet)
                 // input: "input"
                 // input: "SpaceToBatchND/block_shape"
                 // input: "SpaceToBatchND/paddings"
-                CV_Assert(layer.input_size() == 3);
+                CV_CheckEQ(num_inputs, 3, "");
 
                 DictValue dilation = parseDims(getConstBlob(layer, value_id, 1));
                 CV_Assert(dilation.size() == 2);
@@ -839,10 +953,14 @@ void TFImporter::populateNet(Net dstNet)
                 layerParams.set("pad_w", paddings.at<float>(2));
 
                 CV_Assert(next_layers.size() == 1);
-                layer = net.node(next_layers[0].second);
                 layers_to_ignore.insert(next_layers[0].first);
+
+                // FIXIT don't override, rewrite this code
+                layer = net.node(next_layers[0].second);
                 name = layer.name();
                 type = layer.op();
+                num_inputs = layer.input_size();
+                CV_LOG_DEBUG(NULL, "DNN/TF:     switched to layer " << name << " @ " << type << " with " << num_inputs << " inputs");
             }
             else if (type == "Pad" || type == "MirrorPad")
             {
@@ -876,7 +994,7 @@ void TFImporter::populateNet(Net dstNet)
                     layer_id[name] = id;
 
                     connect(layer_id, dstNet, parsePin(input), id, 0);
-                    continue;
+                    return;
                 }
                 else
                 {
@@ -886,10 +1004,14 @@ void TFImporter::populateNet(Net dstNet)
                     layerParams.set("pad_h", paddings.at<int32_t>(4));
                     layerParams.set("pad_w", paddings.at<int32_t>(6));
 
-                    layer = net.node(next_layers[0].second);
                     layers_to_ignore.insert(next_layers[0].first);
+
+                    // FIXIT don't override, rewrite this code
+                    layer = net.node(next_layers[0].second);
                     name = layer.name();
                     type = layer.op();
+                    num_inputs = layer.input_size();
+                    CV_LOG_DEBUG(NULL, "DNN/TF:     switched to layer " << name << " @ " << type << " with " << num_inputs << " inputs");
                 }
             }
 
@@ -1011,13 +1133,14 @@ void TFImporter::populateNet(Net dstNet)
         }
         else if (type == "BiasAdd" || type == "Add" || type == "AddV2" || type == "Sub" || type=="AddN")
         {
+            CV_CheckGT(num_inputs, 0, "");
             bool haveConst = false;
-            for(int ii = 0; !haveConst && ii < layer.input_size(); ++ii)
+            for(int ii = 0; !haveConst && ii < num_inputs; ++ii)
             {
                 Pin input = parsePin(layer.input(ii));
                 haveConst = value_id.find(input.name) != value_id.end();
             }
-            CV_Assert(!haveConst || layer.input_size() == 2);
+            CV_Assert(!haveConst || num_inputs == 2);
 
             if (haveConst)
             {
@@ -1054,7 +1177,7 @@ void TFImporter::populateNet(Net dstNet)
                 int id = dstNet.addLayer(name, "Eltwise", layerParams);
                 layer_id[name] = id;
 
-                for (int ii = 0; ii < layer.input_size(); ii++)
+                for (int ii = 0; ii < num_inputs; ii++)
                 {
                     Pin inp = parsePin(layer.input(ii));
                     if (layer_id.find(inp.name) == layer_id.end())
@@ -1065,7 +1188,7 @@ void TFImporter::populateNet(Net dstNet)
         }
         else if (type == "MatMul")
         {
-            CV_Assert(layer.input_size() == 2);
+            CV_CheckEQ(num_inputs, 2, "");
 
             // For the object detection networks, TensorFlow Object Detection API
             // predicts deltas for bounding boxes in yxYX (ymin, xmin, ymax, xmax)
@@ -1077,7 +1200,7 @@ void TFImporter::populateNet(Net dstNet)
             layerParams.set("bias_term", false);
             layerParams.blobs.resize(1);
 
-            StrIntVector next_layers = getNextLayers(net, name, "BiasAdd");
+            StrIntVector next_layers = getNextLayers(net, name, "BiasAdd");  // FIXIT Use layers fusion instead
             if (next_layers.empty())
             {
                 next_layers = getNextLayers(net, name, "Add");
@@ -1135,8 +1258,9 @@ void TFImporter::populateNet(Net dstNet)
         }
         else if (type == "Reshape")
         {
+            CV_CheckGT(num_inputs, 1, "Reshape: expected data and shape inputs");
             Pin inpId = parsePin(layer.input(0));
-            int inpLayout = getDataLayout(layer.input(0), data_layouts);
+            DataLayout inpLayout = getDataLayout(layer.input(0), data_layouts);
             // There are two possible implementations: reshape an input using
             // predefined sizes or use a second input blob as a source of new shape.
             if (value_id.find(layer.input(1)) != value_id.end())
@@ -1185,6 +1309,7 @@ void TFImporter::populateNet(Net dstNet)
         }
         else if (type == "Flatten" || type == "Squeeze")
         {
+            CV_CheckGT(num_inputs, 0, "");
             Pin inpId = parsePin(layer.input(0));
             int inpLayout = getDataLayout(layer.input(0), data_layouts);
             if (type == "Squeeze")
@@ -1231,6 +1356,7 @@ void TFImporter::populateNet(Net dstNet)
         }
         else if (type == "Transpose")
         {
+            CV_CheckGT(num_inputs, 0, "");
             Mat perm = getTensorContent(getConstBlob(layer, value_id, 1));
             CV_Assert(perm.type() == CV_32SC1);
             int* permData = (int*)perm.data;
@@ -1304,6 +1430,7 @@ void TFImporter::populateNet(Net dstNet)
         }
         else if (type == "LRN")
         {
+            CV_CheckGT(num_inputs, 0, "");
             if(hasLayerAttr(layer, "alpha")) {
                 layerParams.set("alpha", getLayerAttr(layer, "alpha").f());
             }
@@ -1322,11 +1449,12 @@ void TFImporter::populateNet(Net dstNet)
             int id = dstNet.addLayer(name, "LRN", layerParams);
             layer_id[name] = id;
 
-            connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, layer.input_size());
+            connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, num_inputs);
         }
         else if (type == "Concat" || type == "ConcatV2")
         {
-            int axisId = (type == "Concat" ? 0 : layer.input_size() - 1);
+            CV_CheckGT(num_inputs, 0, "");
+            int axisId = (type == "Concat" ? 0 : num_inputs - 1);
             int axis = getConstBlob(layer, value_id, axisId).int_val().Get(0);
 
             if (getDataLayout(name, data_layouts) == DATA_LAYOUT_NHWC)
@@ -1337,7 +1465,7 @@ void TFImporter::populateNet(Net dstNet)
 
             // input(0) or input(n-1) is concat_dim
             int from = (type == "Concat" ? 1 : 0);
-            int to = (type == "Concat" ? layer.input_size() : layer.input_size() - 1);
+            int to = (type == "Concat" ? num_inputs : num_inputs - 1);
 
             for (int ii = from; ii < to; ii++)
             {
@@ -1370,6 +1498,7 @@ void TFImporter::populateNet(Net dstNet)
         }
         else if (type == "MaxPool" || type == "MaxPool3D")
         {
+            CV_CheckGT(num_inputs, 0, "");
             layerParams.set("pool", "max");
 
             setKSize(layerParams, layer);
@@ -1381,10 +1510,11 @@ void TFImporter::populateNet(Net dstNet)
             int id = dstNet.addLayer(name, "Pooling", layerParams);
             layer_id[name] = id;
 
-            connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, layer.input_size());
+            connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, num_inputs);
         }
         else if (type == "AvgPool" || type == "AvgPool3D")
         {
+            CV_CheckGT(num_inputs, 0, "");
             layerParams.set("pool", "ave");
             layerParams.set("ave_pool_padded_area", false);
             setKSize(layerParams, layer);
@@ -1394,11 +1524,11 @@ void TFImporter::populateNet(Net dstNet)
             int id = dstNet.addLayer(name, "Pooling", layerParams);
             layer_id[name] = id;
 
-            connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, layer.input_size());
+            connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, num_inputs);
         }
         else if (type == "MaxPoolGrad")
         {
-            CV_Assert(layer.input_size() == 3);
+            CV_CheckEQ(num_inputs, 3, "");
 
             layerParams.set("pool_k_h", 0);
             layerParams.set("pool_k_w", 0);
@@ -1457,7 +1587,7 @@ void TFImporter::populateNet(Net dstNet)
             // TODO: slicing input may be Const op
             // TODO: slicing kernels for convolutions - in current implementation it is impossible
             // TODO: add parsing num of slices parameter
-            CV_Assert(layer.input_size() == 2);
+            CV_CheckEQ(num_inputs, 2, "");
             // num_split
             // 1st blob is dims tensor
             int axis = getConstBlob(layer, value_id, 0).int_val().Get(0);
@@ -1480,7 +1610,7 @@ void TFImporter::populateNet(Net dstNet)
             // input: "input_node"
             // input: "Slice/begin"
             // input: "Slice/size"
-            CV_Assert(layer.input_size() == 3);
+            CV_CheckEQ(num_inputs, 3, "");
             Mat begins = getTensorContent(getConstBlob(layer, value_id, 1));
             Mat sizes = getTensorContent(getConstBlob(layer, value_id, 2));
             CV_Assert_N(!begins.empty(), !sizes.empty());
@@ -1505,7 +1635,7 @@ void TFImporter::populateNet(Net dstNet)
         }
         else if (type == "StridedSlice")
         {
-            CV_Assert(layer.input_size() == 4);
+            CV_CheckEQ(num_inputs, 4, "");
             Mat begins = getTensorContent(getConstBlob(layer, value_id, 1));
             Mat ends = getTensorContent(getConstBlob(layer, value_id, 2));
             Mat strides = getTensorContent(getConstBlob(layer, value_id, 3));
@@ -1544,8 +1674,9 @@ void TFImporter::populateNet(Net dstNet)
         }
         else if (type == "Mul" || type == "RealDiv")
         {
+            CV_CheckGT(num_inputs, 0, "");
             int constId = -1;
-            for(int ii = 0; ii < layer.input_size(); ++ii)
+            for(int ii = 0; ii < num_inputs; ++ii)
             {
                 Pin input = parsePin(layer.input(ii));
                 if (value_id.find(input.name) != value_id.end())
@@ -1554,12 +1685,12 @@ void TFImporter::populateNet(Net dstNet)
                     break;
                 }
             }
-            CV_Assert((constId != -1) || (layer.input_size() == 2));
+            CV_Assert((constId != -1) || (num_inputs == 2));
 
             if (constId != -1)
             {
                 // Multiplication by constant.
-                CV_Assert(layer.input_size() == 2);
+                CV_CheckEQ(num_inputs, 2, "");
                 Mat scaleMat = getTensorContent(getConstBlob(layer, value_id));
                 CV_Assert(scaleMat.type() == CV_32FC1);
                 if (type == "RealDiv")
@@ -1643,7 +1774,7 @@ void TFImporter::populateNet(Net dstNet)
                 // Check if all the inputs have the same shape.
                 bool equalInpShapes = true;
                 MatShape outShape0;
-                for (int ii = 0; ii < layer.input_size() && !netInputShapes.empty(); ii++)
+                for (int ii = 0; ii < num_inputs && !netInputShapes.empty(); ii++)
                 {
                     Pin pin = parsePin(layer.input(ii));
                     int inpId = layer_id.find(pin.name)->second;
@@ -1681,7 +1812,7 @@ void TFImporter::populateNet(Net dstNet)
 
                 layer_id[name] = id;
 
-                for (int ii = 0; ii < layer.input_size(); ii++)
+                for (int ii = 0; ii < num_inputs; ii++)
                 {
                     Pin inp = parsePin(layer.input(ii));
                     if (layer_id.find(inp.name) == layer_id.end())
@@ -1698,9 +1829,7 @@ void TFImporter::populateNet(Net dstNet)
             // input: "BatchNorm/beta"
             // input: "BatchNorm/moving_mean"
             // input: "BatchNorm/moving_variance"
-            if (layer.input_size() != 5)
-                CV_Error(Error::StsNotImplemented,
-                         "Expected gamma, beta, mean and std");
+            CV_CheckEQ(num_inputs, 5, "Expected gamma, beta, mean and std");
             Pin inpId = parsePin(layer.input(0));
 
             bool isTraining = hasLayerAttr(layer, "is_training") && getLayerAttr(layer, "is_training").b();
@@ -1768,9 +1897,7 @@ void TFImporter::populateNet(Net dstNet)
             // input: "conv2d_transpose/output_shape"
             // input: "weights"
             // input: "input"
-            if (layer.input_size() != 3)
-                CV_Error(Error::StsNotImplemented,
-                         "Expected output shape, weights and input nodes");
+            CV_CheckEQ(num_inputs, 3, "Expected output shape, weights and input nodes");
 
             layerParams.set("bias_term", false);
             layerParams.blobs.resize(1);
@@ -1845,8 +1972,7 @@ void TFImporter::populateNet(Net dstNet)
             // input: "lstm_block_wrapper/w_f_diag"
             // input: "lstm_block_wrapper/w_o_diag"
             // input: "lstm_block_wrapper/bias"
-            if (layer.input_size() != 9)
-                CV_Error(Error::StsNotImplemented, "Unexpected number of input nodes");
+            CV_CheckEQ(num_inputs, 9, "Unexpected number of input nodes");
 
             if (hasLayerAttr(layer, "forget_bias"))
                 layerParams.set("forget_bias", getLayerAttr(layer, "forget_bias").f());
@@ -1912,6 +2038,7 @@ void TFImporter::populateNet(Net dstNet)
         }
         else if (type == "ResizeNearestNeighbor" || type == "ResizeBilinear" || type == "FusedResizeAndPadConv2D")
         {
+            CV_CheckGT(num_inputs, 0, "");
             std::string convWeights = "";
             if (type == "FusedResizeAndPadConv2D")
             {
@@ -1919,30 +2046,32 @@ void TFImporter::populateNet(Net dstNet)
                 // input: "decoder/ResizeBilinear/size"
                 // input: "decoder/decoder_conv0/Conv2D_dummy_paddings"
                 // input: "decoder/decoder_conv0/weights"
-                CV_CheckEQ(layer.input_size(), 4, "Number of input for FusedResizeAndPadConv2D");
+                CV_CheckEQ(num_inputs, 4, "Number of input for FusedResizeAndPadConv2D");
 
                 Mat paddings = getTensorContent(getConstBlob(layer, value_id, 2));
                 CV_CheckEQ(countNonZero(paddings), 0, "Unsupported mode");
 
                 convWeights = layer.input(3);
-                layer.mutable_input()->DeleteSubrange(2, 2);
+                layer.mutable_input()->DeleteSubrange(2, 2);  // FIXIT do NOT modify input model
+                num_inputs = layer.input_size();
                 name = name + "/resize";
 
                 if (hasLayerAttr(layer, "resize_align_corners"))
                 {
+                    // FIXIT do NOT modify input model
                     layer.mutable_attr()->insert(
                         ::google::protobuf::MapPair<std::string, tensorflow::AttrValue>("align_corners",
                                                                                         getLayerAttr(layer, "resize_align_corners")));
                 }
             }
-            if (layer.input_size() == 2)
+            if (num_inputs == 2)
             {
                 Mat outSize = getTensorContent(getConstBlob(layer, value_id, 1));
                 CV_CheckTypeEQ(outSize.type(), CV_32SC1, ""); CV_CheckEQ(outSize.total(), (size_t)2, "");
                 layerParams.set("height", outSize.at<int>(0, 0));
                 layerParams.set("width", outSize.at<int>(0, 1));
             }
-            else if (layer.input_size() == 3)
+            else if (num_inputs == 3)
             {
                 Mat factorHeight = getTensorContent(getConstBlob(layer, value_id, 1));
                 Mat factorWidth = getTensorContent(getConstBlob(layer, value_id, 2));
@@ -1952,7 +2081,7 @@ void TFImporter::populateNet(Net dstNet)
                 layerParams.set("zoom_factor_y", factorHeight.at<float>(0));
             }
             else
-                CV_Assert(layer.input_size() == 2 || layer.input_size() == 3);
+                CV_Check(num_inputs, num_inputs == 2 || num_inputs == 3, "");
 
             if (type == "ResizeNearestNeighbor")
                 layerParams.set("interpolation", "nearest");
@@ -1973,12 +2102,12 @@ void TFImporter::populateNet(Net dstNet)
             // Step back to add convolution
             if (type == "FusedResizeAndPadConv2D")
             {
-                tensorflow::NodeDef* conv = net.mutable_node(li);
-                conv->clear_input();
-                conv->add_input(name);
-                conv->add_input(convWeights);
-                conv->set_op("Conv2D");
-                li -= 1;
+                tensorflow::NodeDef conv = layer_;
+                conv.clear_input();
+                conv.add_input(name);
+                conv.add_input(convWeights);
+                conv.set_op("Conv2D");
+                parseNode(conv);
             }
         }
         else if (type == "L2Normalize")
@@ -1986,7 +2115,7 @@ void TFImporter::populateNet(Net dstNet)
             // op: "L2Normalize"
             // input: "input"
             // input: "reduction_indices" (axis)
-            CV_Assert(layer.input_size() == 2);
+            CV_CheckEQ(num_inputs, 2, "");
             Mat reductionIndices = getTensorContent(getConstBlob(layer, value_id, 1));
             CV_Assert(reductionIndices.type() == CV_32SC1);
 
@@ -2011,6 +2140,7 @@ void TFImporter::populateNet(Net dstNet)
         }
         else if (type == "PriorBox")
         {
+            CV_CheckEQ(num_inputs, 2, "");
             if (hasLayerAttr(layer, "min_size"))
                 layerParams.set("min_size", getLayerAttr(layer, "min_size").i());
             if (hasLayerAttr(layer, "max_size"))
@@ -2043,12 +2173,13 @@ void TFImporter::populateNet(Net dstNet)
         }
         else if (type == "Softmax")
         {
+            CV_CheckGT(num_inputs, 0, "");
             if (hasLayerAttr(layer, "axis"))
                 layerParams.set("axis", getLayerAttr(layer, "axis").i());
 
             int id = dstNet.addLayer(name, "Softmax", layerParams);
             layer_id[name] = id;
-            connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, layer.input_size());
+            connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, num_inputs);
         }
         else if (type == "CropAndResize")
         {
@@ -2056,7 +2187,7 @@ void TFImporter::populateNet(Net dstNet)
             // input: "input"
             // input: "boxes"
             // input: "sizes"
-            CV_Assert(layer.input_size() == 3);
+            CV_CheckEQ(num_inputs, 3, "");
 
             Mat cropSize = getTensorContent(getConstBlob(layer, value_id, 2));
             CV_CheckTypeEQ(cropSize.type(), CV_32SC1, ""); CV_CheckEQ(cropSize.total(), (size_t)2, "");
@@ -2084,6 +2215,7 @@ void TFImporter::populateNet(Net dstNet)
             // determine out shape: NxCxHxW --Slice--> 1xCxHxW
             //                      out_shape = 1xCxHxW if keepDims else (1xCxHxW --Flatten--> CxHxW)
             // global pool: NxCxHxW --Flatten--> Nx(C*H*W) --Reshape--> 1x1xNx(C*H*W) --Pooling--> 1x1x1x(C*H*W) --Reshape--> out_shape
+            CV_CheckGT(num_inputs, 0, "");
 
             Mat indices = getTensorContent(getConstBlob(layer, value_id, 1));
             CV_Assert(indices.type() == CV_32SC1);
@@ -2218,6 +2350,7 @@ void TFImporter::populateNet(Net dstNet)
             // Example: given a list with "N" tensors of shape (C, H, W):
             // if axis == 0 then the output tensor will have the shape (N, C, H, W),
             // if axis == 1 then the output tensor will have the shape (C, N, H, W).
+            CV_CheckGT(num_inputs, 0, "");
             CV_Assert(hasLayerAttr(layer, "axis"));
             int dim = (int)getLayerAttr(layer, "axis").i();
             if (dim != 0)
@@ -2225,7 +2358,7 @@ void TFImporter::populateNet(Net dstNet)
 
             CV_Assert(hasLayerAttr(layer, "N"));
             int num = (int)getLayerAttr(layer, "N").i();
-            CV_Assert(layer.input_size() == num);
+            CV_CheckEQ(num_inputs, num, "");
             std::string base_name = name + "/reshape_";
             std::vector<int> reshape_ids;
             for (int i = 0; i < num; i++) {
@@ -2256,7 +2389,7 @@ void TFImporter::populateNet(Net dstNet)
             // input: "input"
             // input: "mix"
             // input: "max"
-            CV_Assert(layer.input_size() == 3);
+            CV_CheckEQ(num_inputs, 3, "");
 
             Mat minValue = getTensorContent(getConstBlob(layer, value_id, 1));
             Mat maxValue = getTensorContent(getConstBlob(layer, value_id, 2));
@@ -2275,6 +2408,7 @@ void TFImporter::populateNet(Net dstNet)
                  type == "Relu" || type == "Elu" ||
                  type == "Identity" || type == "Relu6")
         {
+            CV_CheckGT(num_inputs, 0, "");
             std::string dnnType = type;
             if (type == "Abs") dnnType = "AbsVal";
             else if (type == "Tanh") dnnType = "TanH";
@@ -2284,7 +2418,7 @@ void TFImporter::populateNet(Net dstNet)
 
             int id = dstNet.addLayer(name, dnnType, layerParams);
             layer_id[name] = id;
-            connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, layer.input_size());
+            connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, num_inputs);
         }
         else
         {
@@ -2308,7 +2442,7 @@ void TFImporter::populateNet(Net dstNet)
 
             // All the Const input nodes are added to layer's blobs.
             std::vector<std::string> inputsNames;
-            for (int i = 0; i < layer.input_size(); ++i)
+            for (int i = 0; i < num_inputs; ++i)
             {
                 // Check if input is a Const node.
                 if (value_id.find(layer.input(i)) != value_id.end())
@@ -2328,7 +2462,11 @@ void TFImporter::populateNet(Net dstNet)
             }
         }
     }
-    dstNet.setInputsNames(netInputsNames);
+    catch (const std::exception& e)
+    {
+        CV_LOG_ERROR(NULL, "DNN/TF: Can't parse layer for node='" << name << "'. Exception: " << e.what());
+        throw;
+    }
 }
 
 } // namespace
@@ -2337,18 +2475,16 @@ void TFImporter::populateNet(Net dstNet)
 
 Net readNetFromTensorflow(const String &model, const String &config)
 {
-    TFImporter importer(model.c_str(), config.c_str());
     Net net;
-    importer.populateNet(net);
+    TFImporter importer(net, model.c_str(), config.c_str());
     return net;
 }
 
 Net readNetFromTensorflow(const char* bufferModel, size_t lenModel,
                           const char* bufferConfig, size_t lenConfig)
 {
-    TFImporter importer(bufferModel, lenModel, bufferConfig, lenConfig);
     Net net;
-    importer.populateNet(net);
+    TFImporter importer(net, bufferModel, lenModel, bufferConfig, lenConfig);
     return net;
 }
 
diff --git a/modules/imgproc/include/opencv2/imgproc.hpp b/modules/imgproc/include/opencv2/imgproc.hpp
index 533c2234c5..337c12826d 100644
--- a/modules/imgproc/include/opencv2/imgproc.hpp
+++ b/modules/imgproc/include/opencv2/imgproc.hpp
@@ -587,7 +587,7 @@ enum ColorConversionCodes {
     COLOR_YCrCb2BGR    = 38,
     COLOR_YCrCb2RGB    = 39,
 
-    COLOR_BGR2HSV      = 40, //!< convert RGB/BGR to HSV (hue saturation value), @ref color_convert_rgb_hsv "color conversions"
+    COLOR_BGR2HSV      = 40, //!< convert RGB/BGR to HSV (hue, saturation, value) with H range 0..180 for 8-bit images, @ref color_convert_rgb_hsv "color conversions"
     COLOR_RGB2HSV      = 41,
 
     COLOR_BGR2Lab      = 44, //!< convert RGB/BGR to CIE Lab, @ref color_convert_rgb_lab "color conversions"
@@ -595,27 +595,27 @@ enum ColorConversionCodes {
 
     COLOR_BGR2Luv      = 50, //!< convert RGB/BGR to CIE Luv, @ref color_convert_rgb_luv "color conversions"
     COLOR_RGB2Luv      = 51,
-    COLOR_BGR2HLS      = 52, //!< convert RGB/BGR to HLS (hue lightness saturation), @ref color_convert_rgb_hls "color conversions"
+    COLOR_BGR2HLS      = 52, //!< convert RGB/BGR to HLS (hue, lightness, saturation) with H range 0..180 for 8-bit images, @ref color_convert_rgb_hls "color conversions"
     COLOR_RGB2HLS      = 53,
 
-    COLOR_HSV2BGR      = 54, //!< backward conversions to RGB/BGR
+    COLOR_HSV2BGR      = 54, //!< backward conversion from HSV to RGB/BGR with H range 0..180 for 8-bit images
     COLOR_HSV2RGB      = 55,
 
     COLOR_Lab2BGR      = 56,
     COLOR_Lab2RGB      = 57,
     COLOR_Luv2BGR      = 58,
     COLOR_Luv2RGB      = 59,
-    COLOR_HLS2BGR      = 60,
+    COLOR_HLS2BGR      = 60, //!< backward conversion from HLS to RGB/BGR with H range 0..180 for 8-bit images
     COLOR_HLS2RGB      = 61,
 
-    COLOR_BGR2HSV_FULL = 66,
+    COLOR_BGR2HSV_FULL = 66, //!< convert RGB/BGR to HSV (hue, saturation, value) with H range 0..255 for 8-bit images, @ref color_convert_rgb_hsv "color conversions"
     COLOR_RGB2HSV_FULL = 67,
-    COLOR_BGR2HLS_FULL = 68,
+    COLOR_BGR2HLS_FULL = 68, //!< convert RGB/BGR to HLS (hue, lightness, saturation) with H range 0..255 for 8-bit images, @ref color_convert_rgb_hls "color conversions"
     COLOR_RGB2HLS_FULL = 69,
 
-    COLOR_HSV2BGR_FULL = 70,
+    COLOR_HSV2BGR_FULL = 70, //!< backward conversion from HSV to RGB/BGR with H range 0..255 for 8-bit images
     COLOR_HSV2RGB_FULL = 71,
-    COLOR_HLS2BGR_FULL = 72,
+    COLOR_HLS2BGR_FULL = 72, //!< backward conversion from HLS to RGB/BGR with H range 0..255 for 8-bit images
     COLOR_HLS2RGB_FULL = 73,
 
     COLOR_LBGR2Lab     = 74,
diff --git a/samples/python/tutorial_code/video/background_subtraction/bg_sub.py b/samples/python/tutorial_code/video/background_subtraction/bg_sub.py
index 15330fc8b0..1bf3d2fdd8 100644
--- a/samples/python/tutorial_code/video/background_subtraction/bg_sub.py
+++ b/samples/python/tutorial_code/video/background_subtraction/bg_sub.py
@@ -18,7 +18,7 @@ else:
 
 ## [capture]
 capture = cv.VideoCapture(cv.samples.findFileOrKeep(args.input))
-if not capture.isOpened:
+if not capture.isOpened():
     print('Unable to open: ' + args.input)
     exit(0)
 ## [capture]