@@ -559,27 +559,6 @@ template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a)
     return c; \
 }

-//! @brief Helper macro
-//! @ingroup core_hal_intrin_impl
-#define OPENCV_HAL_IMPL_MATH_FUNC_FLOAT(func, cfunc) \
-inline v_reg<int, 4> func(const v_reg<float, 4>& a) \
-{ \
-    v_reg<int, 4> c; \
-    for( int i = 0; i < 4; i++ ) \
-        c.s[i] = cfunc(a.s[i]); \
-    return c; \
-} \
-inline v_reg<int, 4> func(const v_reg<double, 2>& a) \
-{ \
-    v_reg<int, 4> c; \
-    for( int i = 0; i < 2; i++ ) \
-    { \
-        c.s[i] = cfunc(a.s[i]); \
-        c.s[i+2] = 0; \
-    } \
-    return c; \
-}
-
 /** @brief Square root of elements

 Only for floating point types.*/
@@ -598,26 +577,6 @@ Only for floating point types.*/
 OPENCV_HAL_IMPL_MATH_FUNC(v_abs, (typename V_TypeTraits<_Tp>::abs_type)std::abs,
                           typename V_TypeTraits<_Tp>::abs_type)

-/** @brief Round elements
-
-Only for floating point types.*/
-OPENCV_HAL_IMPL_MATH_FUNC_FLOAT(v_round, cvRound)
-
-/** @brief Floor elements
-
-Only for floating point types.*/
-OPENCV_HAL_IMPL_MATH_FUNC_FLOAT(v_floor, cvFloor)
-
-/** @brief Ceil elements
-
-Only for floating point types.*/
-OPENCV_HAL_IMPL_MATH_FUNC_FLOAT(v_ceil, cvCeil)
-
-/** @brief Truncate elements
-
-Only for floating point types.*/
-OPENCV_HAL_IMPL_MATH_FUNC_FLOAT(v_trunc, int)
-
 //! @brief Helper macro
 //! @ingroup core_hal_intrin_impl
 #define OPENCV_HAL_IMPL_MINMAX_FUNC(func, cfunc) \
@@ -855,9 +814,9 @@ inline v_reg<typename V_TypeTraits<_Tp>::abs_type, n> v_absdiff(const v_reg<_Tp,

 /** @overload
 For 32-bit floating point values */
-inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
+template<int n> inline v_reg<float, n> v_absdiff(const v_reg<float, n>& a, const v_reg<float, n>& b)
 {
-    v_float32x4 c;
+    v_reg<float, n> c;
     for( int i = 0; i < c.nlanes; i++ )
         c.s[i] = _absdiff(a.s[i], b.s[i]);
     return c;
@@ -866,9 +825,9 @@ inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)

 /** @overload
 For 64-bit floating point values */
-inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
+template<int n> inline v_reg<double, n> v_absdiff(const v_reg<double, n>& a, const v_reg<double, n>& b)
 {
-    v_float64x2 c;
+    v_reg<double, n> c;
     for( int i = 0; i < c.nlanes; i++ )
         c.s[i] = _absdiff(a.s[i], b.s[i]);
     return c;
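
The two fixed-width v_absdiff overloads become a single template per element
type, so the same call compiles for 128-, 256- or 512-bit registers. A minimal
usage sketch (illustrative only, not part of the patch; assumes
<opencv2/core/hal/intrin.hpp> and the 128-bit typedefs):

@code
#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

// Sum of absolute differences over one 4-lane block.
float sad4(const float* p, const float* q)
{
    v_float32x4 a = v_load(p), b = v_load(q);
    return v_reduce_sum(v_absdiff(a, b));
}
@endcode
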
@@ -1238,14 +1197,17 @@ template<typename _Tp, int n> inline typename V_TypeTraits<_Tp>::sum_type v_redu
 result[3] = d[0] + d[1] + d[2] + d[3]
 @endcode
 */
-inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
-                                 const v_float32x4& c, const v_float32x4& d)
-{
-    v_float32x4 r;
-    r.s[0] = a.s[0] + a.s[1] + a.s[2] + a.s[3];
-    r.s[1] = b.s[0] + b.s[1] + b.s[2] + b.s[3];
-    r.s[2] = c.s[0] + c.s[1] + c.s[2] + c.s[3];
-    r.s[3] = d.s[0] + d.s[1] + d.s[2] + d.s[3];
+template<int n> inline v_reg<float, n> v_reduce_sum4(const v_reg<float, n>& a, const v_reg<float, n>& b,
+                                                     const v_reg<float, n>& c, const v_reg<float, n>& d)
+{
+    v_reg<float, n> r;
+    for( int i = 0; i < (n/4); i++ )
+    {
+        r.s[i*4 + 0] = a.s[i*4 + 0] + a.s[i*4 + 1] + a.s[i*4 + 2] + a.s[i*4 + 3];
+        r.s[i*4 + 1] = b.s[i*4 + 0] + b.s[i*4 + 1] + b.s[i*4 + 2] + b.s[i*4 + 3];
+        r.s[i*4 + 2] = c.s[i*4 + 0] + c.s[i*4 + 1] + c.s[i*4 + 2] + c.s[i*4 + 3];
+        r.s[i*4 + 3] = d.s[i*4 + 0] + d.s[i*4 + 1] + d.s[i*4 + 2] + d.s[i*4 + 3];
+    }
     return r;
 }
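
The templated v_reduce_sum4 sums each aligned group of four lanes, so on a
4-lane register it behaves exactly as before. An illustrative sketch (not part
of the patch):

@code
#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

// Returns {sum(row0), sum(row1), sum(row2), sum(row3)} of a 4x4 row-major matrix.
v_float32x4 row_sums(const float* m)
{
    v_float32x4 a = v_load(m),     b = v_load(m + 4),
                c = v_load(m + 8), d = v_load(m + 12);
    return v_reduce_sum4(a, b, c, d);
}
@endcode
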
@@ -1965,9 +1927,11 @@ inline v_reg<_Tp, n> v_broadcast_element(const v_reg<_Tp, n>& a)
     return v_reg<_Tp, n>::all(a.s[i]);
 }

-/** @brief Round
+/** @brief Round elements

-Rounds each value. Input type is float vector ==> output type is int vector.*/
+Rounds each value. Input type is float vector ==> output type is int vector.
+@note Only for floating point types.
+*/
 template<int n> inline v_reg<int, n> v_round(const v_reg<float, n>& a)
 {
     v_reg<int, n> c;
@@ -1988,9 +1952,11 @@ template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a, const
     return c;
 }

-/** @brief Floor
+/** @brief Floor elements

-Floor each value. Input type is float vector ==> output type is int vector.*/
+Floor each value. Input type is float vector ==> output type is int vector.
+@note Only for floating point types.
+*/
 template<int n> inline v_reg<int, n> v_floor(const v_reg<float, n>& a)
 {
     v_reg<int, n> c;
@@ -1999,9 +1965,11 @@ template<int n> inline v_reg<int, n> v_floor(const v_reg<float, n>& a)
     return c;
 }

-/** @brief Ceil
+/** @brief Ceil elements

-Ceil each value. Input type is float vector ==> output type is int vector.*/
+Ceil each value. Input type is float vector ==> output type is int vector.
+@note Only for floating point types.
+*/
 template<int n> inline v_reg<int, n> v_ceil(const v_reg<float, n>& a)
 {
     v_reg<int, n> c;
@@ -2010,9 +1978,11 @@ template<int n> inline v_reg<int, n> v_ceil(const v_reg<float, n>& a)
     return c;
 }

-/** @brief Trunc
+/** @brief Truncate elements

-Truncate each value. Input type is float vector ==> output type is int vector.*/
+Truncate each value. Input type is float vector ==> output type is int vector.
+@note Only for floating point types.
+*/
 template<int n> inline v_reg<int, n> v_trunc(const v_reg<float, n>& a)
 {
     v_reg<int, n> c;
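
The four conversions differ only in the scalar rounding helper applied per
lane (cvRound, cvFloor, cvCeil, or a plain int cast). An illustrative sketch
of the whole family (not part of the patch):

@code
#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

void rounding_demo()
{
    v_float32x4 x(1.5f, -1.5f, 2.7f, -2.7f);
    v_int32x4 r = v_round(x); // nearest:     { 2, -2,  3, -3}
    v_int32x4 f = v_floor(x); // downward:    { 1, -2,  2, -3}
    v_int32x4 c = v_ceil(x);  // upward:      { 2, -1,  3, -2}
    v_int32x4 t = v_trunc(x); // toward zero: { 1, -1,  2, -2}
    CV_UNUSED(r); CV_UNUSED(f); CV_UNUSED(c); CV_UNUSED(t);
}
@endcode
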
@@ -2036,7 +2006,7 @@ template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a)
 /** @overload */
 template<int n> inline v_reg<int, n*2> v_floor(const v_reg<double, n>& a)
 {
-    v_reg<int, n> c;
+    v_reg<int, n*2> c;
     for( int i = 0; i < n; i++ )
     {
         c.s[i] = cvFloor(a.s[i]);
@@ -2048,7 +2018,7 @@ template<int n> inline v_reg<int, n*2> v_floor(const v_reg<double, n>& a)
 /** @overload */
 template<int n> inline v_reg<int, n*2> v_ceil(const v_reg<double, n>& a)
 {
-    v_reg<int, n> c;
+    v_reg<int, n*2> c;
     for( int i = 0; i < n; i++ )
     {
         c.s[i] = cvCeil(a.s[i]);
@@ -2060,10 +2030,10 @@ template<int n> inline v_reg<int, n*2> v_ceil(const v_reg<double, n>& a)
 /** @overload */
 template<int n> inline v_reg<int, n*2> v_trunc(const v_reg<double, n>& a)
 {
-    v_reg<int, n> c;
+    v_reg<int, n*2> c;
     for( int i = 0; i < n; i++ )
     {
-        c.s[i] = cvCeil(a.s[i]);
+        c.s[i] = (int)(a.s[i]);
         c.s[i+n] = 0;
     }
     return c;
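
Two genuine bugs are fixed here besides the templating: the result register
was declared with n lanes instead of n*2, and v_trunc on doubles called
cvCeil, so it rounded upward instead of truncating. With the fix
(illustrative sketch, not part of the patch):

@code
#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

void trunc_f64_demo()
{
    v_float64x2 x(-1.9, 2.9);
    v_int32x4 t = v_trunc(x); // {-1, 2, 0, 0}; the upper half is zero-filled
    CV_UNUSED(t);             // (the old code would have produced {-1, 3, ...})
}
@endcode
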
@@ -2105,11 +2075,10 @@ template<int n> inline v_reg<float, n*2> v_cvt_f32(const v_reg<double, n>& a, co
 /** @brief Convert to double

 Supported input type is cv::v_int32x4. */
-CV_INLINE v_reg<double, 2> v_cvt_f64(const v_reg<int, 4>& a)
+template<int n> CV_INLINE v_reg<double, n/2> v_cvt_f64(const v_reg<int, n>& a)
 {
-    enum { n = 2 };
-    v_reg<double, n> c;
-    for( int i = 0; i < n; i++ )
+    v_reg<double, (n/2)> c;
+    for( int i = 0; i < (n/2); i++ )
         c.s[i] = (double)a.s[i];
     return c;
 }
@@ -2117,23 +2086,21 @@ CV_INLINE v_reg<double, 2> v_cvt_f64(const v_reg<int, 4>& a)
 /** @brief Convert to double high part of vector

 Supported input type is cv::v_int32x4. */
-CV_INLINE v_reg<double, 2> v_cvt_f64_high(const v_reg<int, 4>& a)
+template<int n> CV_INLINE v_reg<double, (n/2)> v_cvt_f64_high(const v_reg<int, n>& a)
 {
-    enum { n = 2 };
-    v_reg<double, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = (double)a.s[i + 2];
+    v_reg<double, (n/2)> c;
+    for( int i = 0; i < (n/2); i++ )
+        c.s[i] = (double)a.s[i + (n/2)];
     return c;
 }

 /** @brief Convert to double

 Supported input type is cv::v_float32x4. */
-CV_INLINE v_reg<double, 2> v_cvt_f64(const v_reg<float, 4>& a)
+template<int n> CV_INLINE v_reg<double, (n/2)> v_cvt_f64(const v_reg<float, n>& a)
 {
-    enum { n = 2 };
-    v_reg<double, n> c;
-    for( int i = 0; i < n; i++ )
+    v_reg<double, (n/2)> c;
+    for( int i = 0; i < (n/2); i++ )
         c.s[i] = (double)a.s[i];
     return c;
 }
@@ -2141,33 +2108,19 @@ CV_INLINE v_reg<double, 2> v_cvt_f64(const v_reg<float, 4>& a)
 /** @brief Convert to double high part of vector

 Supported input type is cv::v_float32x4. */
-CV_INLINE v_reg<double, 2> v_cvt_f64_high(const v_reg<float, 4>& a)
+template<int n> CV_INLINE v_reg<double, (n/2)> v_cvt_f64_high(const v_reg<float, n>& a)
 {
-    enum { n = 2 };
-    v_reg<double, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = (double)a.s[i + 2];
+    v_reg<double, (n/2)> c;
+    for( int i = 0; i < (n/2); i++ )
+        c.s[i] = (double)a.s[i + (n/2)];
     return c;
 }

 /** @brief Convert to double

 Supported input type is cv::v_int64x2. */
-CV_INLINE v_reg<double, 2> v_cvt_f64(const v_reg<int64, 2>& a)
-{
-    enum { n = 2 };
-    v_reg<double, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = (double)a.s[i];
-    return c;
-}
-
-/** @brief Convert to double high part of vector
-
-Supported input type is cv::v_int64x2. */
-CV_INLINE v_reg<double, 2> v_cvt_f64_high(const v_reg<int64, 2>& a)
+template<int n> CV_INLINE v_reg<double, n> v_cvt_f64(const v_reg<int64, n>& a)
 {
-    enum { n = 2 };
     v_reg<double, n> c;
     for( int i = 0; i < n; i++ )
         c.s[i] = (double)a.s[i];
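
The enum-based local n is dropped because n is now the template lane count,
and the output vector is half as wide as the 32-bit input. Illustrative
sketch (not part of the patch):

@code
#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

void cvt_f64_demo()
{
    v_int32x4 a(1, 2, 3, 4);
    v_float64x2 lo = v_cvt_f64(a);      // {1.0, 2.0} - lower half of a
    v_float64x2 hi = v_cvt_f64_high(a); // {3.0, 4.0} - upper half of a
    CV_UNUSED(lo); CV_UNUSED(hi);
}
@endcode
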
@@ -2221,36 +2174,15 @@ template<int n> inline v_reg<float, n> v_lut(const float* tab, const v_reg<int,
     return c;
 }

-template<int n> inline v_reg<double, n> v_lut(const double* tab, const v_reg<int, n*2>& idx)
+template<int n> inline v_reg<double, n/2> v_lut(const double* tab, const v_reg<int, n>& idx)
 {
-    v_reg<double, n> c;
-    for( int i = 0; i < n; i++ )
+    v_reg<double, n/2> c;
+    for( int i = 0; i < n/2; i++ )
         c.s[i] = tab[idx.s[i]];
     return c;
 }

-inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
-{
-    return v_lut(tab, idxvec.s);
-}
-
-inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
-{
-    return v_lut(tab, idxvec.s);
-}
-
-inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
-{
-    return v_lut(tab, idxvec.s);
-}
-
-inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
-{
-    return v_lut(tab, idxvec.s);
-}
-
 template<int n> inline void v_lut_deinterleave(const float* tab, const v_reg<int, n>& idx,
                                                v_reg<float, n>& x, v_reg<float, n>& y)
 {
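
The four fixed-width wrappers are redundant once v_lut itself is templated on
the index vector. Illustrative sketch of a gather through the remaining
template (not part of the patch):

@code
#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

// Gathers {tab[0], tab[2], tab[4], tab[6]}.
v_float32x4 gather_even(const float* tab)
{
    v_int32x4 idx(0, 2, 4, 6);
    return v_lut(tab, idx);
}
@endcode
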
@@ -2330,16 +2262,23 @@ b2 {A3 B3 C3 D3}
 b3 {A4 B4 C4 D4}
 @endcode
 */
-template<typename _Tp>
-inline void v_transpose4x4( v_reg<_Tp, 4>& a0, const v_reg<_Tp, 4>& a1,
-                            const v_reg<_Tp, 4>& a2, const v_reg<_Tp, 4>& a3,
-                            v_reg<_Tp, 4>& b0, v_reg<_Tp, 4>& b1,
-                            v_reg<_Tp, 4>& b2, v_reg<_Tp, 4>& b3 )
+template<typename _Tp, int n>
+inline void v_transpose4x4( v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1,
+                            const v_reg<_Tp, n>& a2, const v_reg<_Tp, n>& a3,
+                            v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1,
+                            v_reg<_Tp, n>& b2, v_reg<_Tp, n>& b3 )
 {
-    b0 = v_reg<_Tp, 4>(a0.s[0], a1.s[0], a2.s[0], a3.s[0]);
-    b1 = v_reg<_Tp, 4>(a0.s[1], a1.s[1], a2.s[1], a3.s[1]);
-    b2 = v_reg<_Tp, 4>(a0.s[2], a1.s[2], a2.s[2], a3.s[2]);
-    b3 = v_reg<_Tp, 4>(a0.s[3], a1.s[3], a2.s[3], a3.s[3]);
+    for( int i = 0; i < n/4; i++ )
+    {
+        b0.s[0 + i*4] = a0.s[0 + i*4]; b0.s[1 + i*4] = a1.s[0 + i*4];
+        b0.s[2 + i*4] = a2.s[0 + i*4]; b0.s[3 + i*4] = a3.s[0 + i*4];
+        b1.s[0 + i*4] = a0.s[1 + i*4]; b1.s[1 + i*4] = a1.s[1 + i*4];
+        b1.s[2 + i*4] = a2.s[1 + i*4]; b1.s[3 + i*4] = a3.s[1 + i*4];
+        b2.s[0 + i*4] = a0.s[2 + i*4]; b2.s[1 + i*4] = a1.s[2 + i*4];
+        b2.s[2 + i*4] = a2.s[2 + i*4]; b2.s[3 + i*4] = a3.s[2 + i*4];
+        b3.s[0 + i*4] = a0.s[3 + i*4]; b3.s[1 + i*4] = a1.s[3 + i*4];
+        b3.s[2 + i*4] = a2.s[3 + i*4]; b3.s[3 + i*4] = a3.s[3 + i*4];
+    }
 }

 //! @brief Helper macro
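
For n > 4 the templated v_transpose4x4 transposes each aligned 4x4 block
independently. Illustrative 128-bit sketch (not part of the patch):

@code
#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

void transpose_demo(const float* m, float* mt) // 4x4 row-major in, transposed out
{
    v_float32x4 a0 = v_load(m),     a1 = v_load(m + 4),
                a2 = v_load(m + 8), a3 = v_load(m + 12);
    v_float32x4 b0, b1, b2, b3;
    v_transpose4x4(a0, a1, a2, a3, b0, b1, b2, b3);
    v_store(mt, b0); v_store(mt + 4, b1); v_store(mt + 8, b2); v_store(mt + 12, b3);
}
@endcode
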
@@ -2384,92 +2323,92 @@ OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x2, int64, s64)
 //! @brief Helper macro
 //! @ingroup core_hal_intrin_impl
-#define OPENCV_HAL_IMPL_C_REINTERPRET(_Tpvec, _Tp, suffix) \
-template<typename _Tp0, int n0> inline _Tpvec \
+#define OPENCV_HAL_IMPL_C_REINTERPRET(_Tp, suffix) \
+template<typename _Tp0, int n0> inline v_reg<_Tp, n0*sizeof(_Tp0)/sizeof(_Tp)> \
 v_reinterpret_as_##suffix(const v_reg<_Tp0, n0>& a) \
-{ return a.template reinterpret_as<_Tp, _Tpvec::nlanes>(); }
+{ return a.template reinterpret_as<_Tp, n0*sizeof(_Tp0)/sizeof(_Tp)>(); }

 //! @name Reinterpret
 //! @{
 //! @brief Convert vector to different type without modifying underlying data.
-OPENCV_HAL_IMPL_C_REINTERPRET(v_uint8x16, uchar, u8)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_int8x16, schar, s8)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_uint16x8, ushort, u16)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_int16x8, short, s16)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_uint32x4, unsigned, u32)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_int32x4, int, s32)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_float32x4, float, f32)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_float64x2, double, f64)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_uint64x2, uint64, u64)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_int64x2, int64, s64)
+OPENCV_HAL_IMPL_C_REINTERPRET(uchar, u8)
+OPENCV_HAL_IMPL_C_REINTERPRET(schar, s8)
+OPENCV_HAL_IMPL_C_REINTERPRET(ushort, u16)
+OPENCV_HAL_IMPL_C_REINTERPRET(short, s16)
+OPENCV_HAL_IMPL_C_REINTERPRET(unsigned, u32)
+OPENCV_HAL_IMPL_C_REINTERPRET(int, s32)
+OPENCV_HAL_IMPL_C_REINTERPRET(float, f32)
+OPENCV_HAL_IMPL_C_REINTERPRET(double, f64)
+OPENCV_HAL_IMPL_C_REINTERPRET(uint64, u64)
+OPENCV_HAL_IMPL_C_REINTERPRET(int64, s64)
 //! @}

 //! @brief Helper macro
 //! @ingroup core_hal_intrin_impl
-#define OPENCV_HAL_IMPL_C_SHIFTL(_Tpvec, _Tp) \
-template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
-{ return a << n; }
+#define OPENCV_HAL_IMPL_C_SHIFTL(_Tp) \
+template<int shift, int n> inline v_reg<_Tp, n> v_shl(const v_reg<_Tp, n>& a) \
+{ return a << shift; }

 //! @name Left shift
 //! @{
 //! @brief Shift left
-OPENCV_HAL_IMPL_C_SHIFTL(v_uint16x8, ushort)
-OPENCV_HAL_IMPL_C_SHIFTL(v_int16x8, short)
-OPENCV_HAL_IMPL_C_SHIFTL(v_uint32x4, unsigned)
-OPENCV_HAL_IMPL_C_SHIFTL(v_int32x4, int)
-OPENCV_HAL_IMPL_C_SHIFTL(v_uint64x2, uint64)
-OPENCV_HAL_IMPL_C_SHIFTL(v_int64x2, int64)
+OPENCV_HAL_IMPL_C_SHIFTL(ushort)
+OPENCV_HAL_IMPL_C_SHIFTL(short)
+OPENCV_HAL_IMPL_C_SHIFTL(unsigned)
+OPENCV_HAL_IMPL_C_SHIFTL(int)
+OPENCV_HAL_IMPL_C_SHIFTL(uint64)
+OPENCV_HAL_IMPL_C_SHIFTL(int64)
 //! @}

 //! @brief Helper macro
 //! @ingroup core_hal_intrin_impl
-#define OPENCV_HAL_IMPL_C_SHIFTR(_Tpvec, _Tp) \
-template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
-{ return a >> n; }
+#define OPENCV_HAL_IMPL_C_SHIFTR(_Tp) \
+template<int shift, int n> inline v_reg<_Tp, n> v_shr(const v_reg<_Tp, n>& a) \
+{ return a >> shift; }

 //! @name Right shift
 //! @{
 //! @brief Shift right
-OPENCV_HAL_IMPL_C_SHIFTR(v_uint16x8, ushort)
-OPENCV_HAL_IMPL_C_SHIFTR(v_int16x8, short)
-OPENCV_HAL_IMPL_C_SHIFTR(v_uint32x4, unsigned)
-OPENCV_HAL_IMPL_C_SHIFTR(v_int32x4, int)
-OPENCV_HAL_IMPL_C_SHIFTR(v_uint64x2, uint64)
-OPENCV_HAL_IMPL_C_SHIFTR(v_int64x2, int64)
+OPENCV_HAL_IMPL_C_SHIFTR(ushort)
+OPENCV_HAL_IMPL_C_SHIFTR(short)
+OPENCV_HAL_IMPL_C_SHIFTR(unsigned)
+OPENCV_HAL_IMPL_C_SHIFTR(int)
+OPENCV_HAL_IMPL_C_SHIFTR(uint64)
+OPENCV_HAL_IMPL_C_SHIFTR(int64)
 //! @}

 //! @brief Helper macro
 //! @ingroup core_hal_intrin_impl
-#define OPENCV_HAL_IMPL_C_RSHIFTR(_Tpvec, _Tp) \
-template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
+#define OPENCV_HAL_IMPL_C_RSHIFTR(_Tp) \
+template<int shift, int n> inline v_reg<_Tp, n> v_rshr(const v_reg<_Tp, n>& a) \
 { \
-    _Tpvec c; \
-    for( int i = 0; i < _Tpvec::nlanes; i++ ) \
-        c.s[i] = (_Tp)((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
+    v_reg<_Tp, n> c; \
+    for( int i = 0; i < n; i++ ) \
+        c.s[i] = (_Tp)((a.s[i] + ((_Tp)1 << (shift - 1))) >> shift); \
     return c; \
 }

 //! @name Rounding shift
 //! @{
 //! @brief Rounding shift right
-OPENCV_HAL_IMPL_C_RSHIFTR(v_uint16x8, ushort)
-OPENCV_HAL_IMPL_C_RSHIFTR(v_int16x8, short)
-OPENCV_HAL_IMPL_C_RSHIFTR(v_uint32x4, unsigned)
-OPENCV_HAL_IMPL_C_RSHIFTR(v_int32x4, int)
-OPENCV_HAL_IMPL_C_RSHIFTR(v_uint64x2, uint64)
-OPENCV_HAL_IMPL_C_RSHIFTR(v_int64x2, int64)
+OPENCV_HAL_IMPL_C_RSHIFTR(ushort)
+OPENCV_HAL_IMPL_C_RSHIFTR(short)
+OPENCV_HAL_IMPL_C_RSHIFTR(unsigned)
+OPENCV_HAL_IMPL_C_RSHIFTR(int)
+OPENCV_HAL_IMPL_C_RSHIFTR(uint64)
+OPENCV_HAL_IMPL_C_RSHIFTR(int64)
 //! @}

 //! @brief Helper macro
 //! @ingroup core_hal_intrin_impl
-#define OPENCV_HAL_IMPL_C_PACK(_Tpvec, _Tpnvec, _Tpn, pack_suffix, cast) \
-inline _Tpnvec v_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
+#define OPENCV_HAL_IMPL_C_PACK(_Tp, _Tpn, pack_suffix, cast) \
+template<int n> inline v_reg<_Tpn, 2*n> v_##pack_suffix(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
 { \
-    _Tpnvec c; \
-    for( int i = 0; i < _Tpvec::nlanes; i++ ) \
+    v_reg<_Tpn, 2*n> c; \
+    for( int i = 0; i < n; i++ ) \
     { \
         c.s[i] = cast<_Tpn>(a.s[i]); \
-        c.s[i+_Tpvec::nlanes] = cast<_Tpn>(b.s[i]); \
+        c.s[i+n] = cast<_Tpn>(b.s[i]); \
     } \
     return c; \
 }
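
Across these shift macros the immediate template parameter is renamed from n
to shift, since n now consistently means the lane count. The rounding variant
adds half of the divisor before shifting. Illustrative sketch (not part of
the patch):

@code
#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

void shift_demo()
{
    v_int32x4 a(7, 8, 9, -9);
    v_int32x4 l  = v_shl<2>(a);  // {28, 32, 36, -36}
    v_int32x4 r  = v_shr<2>(a);  // arithmetic shift: {1, 2, 2, -3}
    v_int32x4 rr = v_rshr<2>(a); // rounding, (x + 2) >> 2: {2, 2, 2, -2}
    CV_UNUSED(l); CV_UNUSED(r); CV_UNUSED(rr);
}
@endcode
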
@@ -2485,26 +2424,26 @@ inline _Tpnvec v_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
 //! - pack_u: for 16- and 32-bit signed integer input types
 //!
 //! @note All variants except 64-bit use saturation.
-OPENCV_HAL_IMPL_C_PACK(v_uint16x8, v_uint8x16, uchar, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_int8x16, schar, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK(v_uint32x4, v_uint16x8, ushort, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_int16x8, short, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK(v_uint64x2, v_uint32x4, unsigned, pack, static_cast)
-OPENCV_HAL_IMPL_C_PACK(v_int64x2, v_int32x4, int, pack, static_cast)
-OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_uint8x16, uchar, pack_u, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_uint16x8, ushort, pack_u, saturate_cast)
+OPENCV_HAL_IMPL_C_PACK(ushort, uchar, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_PACK(short, schar, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_PACK(unsigned, ushort, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_PACK(int, short, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_PACK(uint64, unsigned, pack, static_cast)
+OPENCV_HAL_IMPL_C_PACK(int64, int, pack, static_cast)
+OPENCV_HAL_IMPL_C_PACK(short, uchar, pack_u, saturate_cast)
+OPENCV_HAL_IMPL_C_PACK(int, ushort, pack_u, saturate_cast)
 //! @}

 //! @brief Helper macro
 //! @ingroup core_hal_intrin_impl
-#define OPENCV_HAL_IMPL_C_RSHR_PACK(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \
-template<int n> inline _Tpnvec v_rshr_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
+#define OPENCV_HAL_IMPL_C_RSHR_PACK(_Tp, _Tpn, pack_suffix, cast) \
+template<int shift, int n> inline v_reg<_Tpn, 2*n> v_rshr_##pack_suffix(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
 { \
-    _Tpnvec c; \
-    for( int i = 0; i < _Tpvec::nlanes; i++ ) \
+    v_reg<_Tpn, 2*n> c; \
+    for( int i = 0; i < n; i++ ) \
     { \
-        c.s[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
-        c.s[i+_Tpvec::nlanes] = cast<_Tpn>((b.s[i] + ((_Tp)1 << (n - 1))) >> n); \
+        c.s[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (shift - 1))) >> shift); \
+        c.s[i+n] = cast<_Tpn>((b.s[i] + ((_Tp)1 << (shift - 1))) >> shift); \
     } \
     return c; \
 }
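
v_pack narrows two input registers into one register of the half-width
element type; the rshr variant applies the rounding shift first. Illustrative
sketch (not part of the patch):

@code
#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

void pack_demo()
{
    v_int32x4 a(100000, -100000, 42, -1);
    v_int32x4 b(1, 2, 3, 4);
    v_int16x8 p = v_pack(a, b);         // saturated: {32767, -32768, 42, -1, 1, 2, 3, 4}
    v_int16x8 q = v_rshr_pack<8>(a, b); // (x + 128) >> 8, then saturated
    CV_UNUSED(p); CV_UNUSED(q);
}
@endcode
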
@@ -2520,22 +2459,22 @@ template<int n> inline _Tpnvec v_rshr_##pack_suffix(const _Tpvec& a, const _Tpve
 //! - pack_u: for 16- and 32-bit signed integer input types
 //!
 //! @note All variants except 64-bit use saturation.
-OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint16x8, ushort, v_uint8x16, uchar, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_int8x16, schar, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint32x4, unsigned, v_uint16x8, ushort, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_int16x8, short, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint64x2, uint64, v_uint32x4, unsigned, pack, static_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK(v_int64x2, int64, v_int32x4, int, pack, static_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_uint8x16, uchar, pack_u, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK(ushort, uchar, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK(short, schar, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK(unsigned, ushort, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK(int, short, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK(uint64, unsigned, pack, static_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK(int64, int, pack, static_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK(short, uchar, pack_u, saturate_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK(int, ushort, pack_u, saturate_cast)
 //! @}

 //! @brief Helper macro
 //! @ingroup core_hal_intrin_impl
-#define OPENCV_HAL_IMPL_C_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \
-inline void v_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
+#define OPENCV_HAL_IMPL_C_PACK_STORE(_Tp, _Tpn, pack_suffix, cast) \
+template<int n> inline void v_##pack_suffix##_store(_Tpn* ptr, const v_reg<_Tp, n>& a) \
 { \
-    for( int i = 0; i < _Tpvec::nlanes; i++ ) \
+    for( int i = 0; i < n; i++ ) \
         ptr[i] = cast<_Tpn>(a.s[i]); \
 }
@@ -2550,23 +2489,23 @@ inline void v_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
 //! - pack_u: for 16- and 32-bit signed integer input types
 //!
 //! @note All variants except 64-bit use saturation.
-OPENCV_HAL_IMPL_C_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_int16x8, short, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack, static_cast)
-OPENCV_HAL_IMPL_C_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack, static_cast)
-OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast)
+OPENCV_HAL_IMPL_C_PACK_STORE(ushort, uchar, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_PACK_STORE(short, schar, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_PACK_STORE(unsigned, ushort, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_PACK_STORE(int, short, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_PACK_STORE(uint64, unsigned, pack, static_cast)
+OPENCV_HAL_IMPL_C_PACK_STORE(int64, int, pack, static_cast)
+OPENCV_HAL_IMPL_C_PACK_STORE(short, uchar, pack_u, saturate_cast)
+OPENCV_HAL_IMPL_C_PACK_STORE(int, ushort, pack_u, saturate_cast)
 //! @}

 //! @brief Helper macro
 //! @ingroup core_hal_intrin_impl
-#define OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \
-template<int n> inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
+#define OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(_Tp, _Tpn, pack_suffix, cast) \
+template<int shift, int n> inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const v_reg<_Tp, n>& a) \
 { \
-    for( int i = 0; i < _Tpvec::nlanes; i++ ) \
-        ptr[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
+    for( int i = 0; i < n; i++ ) \
+        ptr[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (shift - 1))) >> shift); \
 }
@@ -2580,14 +2519,14 @@ template<int n> inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const _Tpvec
 //! - pack_u: for 16- and 32-bit signed integer input types
 //!
 //! @note All variants except 64-bit use saturation.
-OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_int16x8, short, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack, static_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack, static_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(ushort, uchar, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(short, schar, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(unsigned, ushort, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(int, short, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(uint64, unsigned, pack, static_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(int64, int, pack, static_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(short, uchar, pack_u, saturate_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(int, ushort, pack_u, saturate_cast)
 //! @}

 //! @cond IGNORED
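
The *_store variants write the narrowed result straight to memory instead of
returning a register. Illustrative sketch (not part of the patch):

@code
#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

void pack_store_demo(const int* src, short* dst)
{
    v_int32x4 a = v_load(src);
    v_pack_store(dst, a);         // dst[i] = saturate_cast<short>(src[i])
    v_rshr_pack_store<4>(dst, a); // dst[i] = saturate_cast<short>((src[i] + 8) >> 4)
}
@endcode
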
@@ -2622,9 +2561,9 @@ b {0xFFFF 0 0xFFFF 0 0 0xFFFF 0 0xFFFF}
 }
 @endcode */
-inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
+template<int n> inline v_reg<uchar, 2*n> v_pack_b(const v_reg<ushort, n>& a, const v_reg<ushort, n>& b)
 {
-    v_uint8x16 mask;
+    v_reg<uchar, 2*n> mask;
     _pack_b(mask.s, a, b);
     return mask;
 }
@@ -2645,12 +2584,12 @@ d {0 0xFFFF.. 0 0xFFFF..}
 }
 @endcode */
-inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
-                           const v_uint32x4& c, const v_uint32x4& d)
+template<int n> inline v_reg<uchar, 4*n> v_pack_b(const v_reg<unsigned, n>& a, const v_reg<unsigned, n>& b,
+                                                  const v_reg<unsigned, n>& c, const v_reg<unsigned, n>& d)
 {
-    v_uint8x16 mask;
+    v_reg<uchar, 4*n> mask;
     _pack_b(mask.s, a, b);
-    _pack_b(mask.s + 8, c, d);
+    _pack_b(mask.s + 2*n, c, d);
     return mask;
 }
@@ -2674,15 +2613,16 @@ h {0 0xFFFF..}
 0xFF 0 0xFF 0 0 0xFF 0 0xFF
 }
 @endcode */
-inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
-                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
-                           const v_uint64x2& g, const v_uint64x2& h)
+template<int n> inline v_reg<uchar, 8*n> v_pack_b(const v_reg<uint64, n>& a, const v_reg<uint64, n>& b,
+                                                  const v_reg<uint64, n>& c, const v_reg<uint64, n>& d,
+                                                  const v_reg<uint64, n>& e, const v_reg<uint64, n>& f,
+                                                  const v_reg<uint64, n>& g, const v_reg<uint64, n>& h)
 {
-    v_uint8x16 mask;
+    v_reg<uchar, 8*n> mask;
     _pack_b(mask.s, a, b);
-    _pack_b(mask.s + 4, c, d);
-    _pack_b(mask.s + 8, e, f);
-    _pack_b(mask.s + 12, g, h);
+    _pack_b(mask.s + 2*n, c, d);
+    _pack_b(mask.s + 4*n, e, f);
+    _pack_b(mask.s + 6*n, g, h);
     return mask;
 }
 //! @}
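
The byte offsets into mask.s become multiples of n so the variants stay
correct for wider registers. v_pack_b is typically fed comparison results, as
in this illustrative sketch (not part of the patch):

@code
#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

// One byte mask lane per 16-bit comparison, 16 results in one register.
v_uint8x16 greater_mask(const short* p, const short* q)
{
    v_int16x8 a = v_load(p), b = v_load(p + 8);
    v_int16x8 x = v_load(q), y = v_load(q + 8);
    return v_pack_b(v_reinterpret_as_u16(a > x), v_reinterpret_as_u16(b > y));
}
@endcode
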
@@ -2697,54 +2637,68 @@ Scheme:
 {D0 D1 D2 D3} x |V3|
 ====================
 {R0 R1 R2 R3}, where:
-R0 = A0V0 + A1V1 + A2V2 + A3V3,
-R1 = B0V0 + B1V1 + B2V2 + B3V3
+R0 = A0V0 + B0V1 + C0V2 + D0V3,
+R1 = A1V0 + B1V1 + C1V2 + D1V3
 ...
 @endcode
 */
-inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
-                            const v_float32x4& m1, const v_float32x4& m2,
-                            const v_float32x4& m3)
+template<int n>
+inline v_reg<float, n> v_matmul(const v_reg<float, n>& v,
+                                const v_reg<float, n>& a, const v_reg<float, n>& b,
+                                const v_reg<float, n>& c, const v_reg<float, n>& d)
 {
-    return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + v.s[3]*m3.s[0],
-                       v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + v.s[3]*m3.s[1],
-                       v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + v.s[3]*m3.s[2],
-                       v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + v.s[3]*m3.s[3]);
+    v_reg<float, n> res;
+    for( int i = 0; i < n/4; i++ )
+    {
+        res.s[0 + i*4] = v.s[0 + i*4]*a.s[0 + i*4] + v.s[1 + i*4]*b.s[0 + i*4] + v.s[2 + i*4]*c.s[0 + i*4] + v.s[3 + i*4]*d.s[0 + i*4];
+        res.s[1 + i*4] = v.s[0 + i*4]*a.s[1 + i*4] + v.s[1 + i*4]*b.s[1 + i*4] + v.s[2 + i*4]*c.s[1 + i*4] + v.s[3 + i*4]*d.s[1 + i*4];
+        res.s[2 + i*4] = v.s[0 + i*4]*a.s[2 + i*4] + v.s[1 + i*4]*b.s[2 + i*4] + v.s[2 + i*4]*c.s[2 + i*4] + v.s[3 + i*4]*d.s[2 + i*4];
+        res.s[3 + i*4] = v.s[0 + i*4]*a.s[3 + i*4] + v.s[1 + i*4]*b.s[3 + i*4] + v.s[2 + i*4]*c.s[3 + i*4] + v.s[3 + i*4]*d.s[3 + i*4];
+    }
+    return res;
 }

 /** @brief Matrix multiplication and add

 Scheme:
 @code
-{A0 A1 A2   }   |V0|   |D0|
-{B0 B1 B2   }   |V1|   |D1|
-{C0 C1 C2   } x |V2| + |D2|
-====================
+{A0 A1 A2 A3}   |V0|   |D0|
+{B0 B1 B2 B3}   |V1|   |D1|
+{C0 C1 C2 C3} x |V2| + |D2|
+====================   |D3|
 {R0 R1 R2 R3}, where:
-R0 = A0V0 + A1V1 + A2V2 + D0,
-R1 = B0V0 + B1V1 + B2V2 + D1
+R0 = A0V0 + B0V1 + C0V2 + D0,
+R1 = A1V0 + B1V1 + C1V2 + D1
 ...
 @endcode
 */
-inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
-                               const v_float32x4& m1, const v_float32x4& m2,
-                               const v_float32x4& m3)
+template<int n>
+inline v_reg<float, n> v_matmuladd(const v_reg<float, n>& v,
+                                   const v_reg<float, n>& a, const v_reg<float, n>& b,
+                                   const v_reg<float, n>& c, const v_reg<float, n>& d)
 {
-    return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + m3.s[0],
-                       v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + m3.s[1],
-                       v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + m3.s[2],
-                       v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + m3.s[3]);
+    v_reg<float, n> res;
+    for( int i = 0; i < n/4; i++ )
+    {
+        res.s[0 + i*4] = v.s[0 + i*4]*a.s[0 + i*4] + v.s[1 + i*4]*b.s[0 + i*4] + v.s[2 + i*4]*c.s[0 + i*4] + d.s[0 + i*4];
+        res.s[1 + i*4] = v.s[0 + i*4]*a.s[1 + i*4] + v.s[1 + i*4]*b.s[1 + i*4] + v.s[2 + i*4]*c.s[1 + i*4] + d.s[1 + i*4];
+        res.s[2 + i*4] = v.s[0 + i*4]*a.s[2 + i*4] + v.s[1 + i*4]*b.s[2 + i*4] + v.s[2 + i*4]*c.s[2 + i*4] + d.s[2 + i*4];
+        res.s[3 + i*4] = v.s[0 + i*4]*a.s[3 + i*4] + v.s[1 + i*4]*b.s[3 + i*4] + v.s[2 + i*4]*c.s[3 + i*4] + d.s[3 + i*4];
+    }
+    return res;
 }

-inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
+template<int n> inline v_reg<double, n/2> v_dotprod_expand(const v_reg<int, n>& a, const v_reg<int, n>& b)
 { return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_cvt_f64_high(a) * v_cvt_f64_high(b)); }
-inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
+template<int n> inline v_reg<double, n/2> v_dotprod_expand(const v_reg<int, n>& a, const v_reg<int, n>& b,
+                                                           const v_reg<double, n/2>& c)
 { return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c)); }

-inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
+template<int n> inline v_reg<double, n/2> v_dotprod_expand_fast(const v_reg<int, n>& a, const v_reg<int, n>& b)
 { return v_dotprod_expand(a, b); }
-inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
+template<int n> inline v_reg<double, n/2> v_dotprod_expand_fast(const v_reg<int, n>& a, const v_reg<int, n>& b,
+                                                                const v_reg<double, n/2>& c)
 { return v_dotprod_expand(a, b, c); }

 ////// FP16 support ///////
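
The corrected scheme also clarifies the layout: each register argument is one
column of the matrix, so v_matmul(v, a, b, c, d) computes M*v for a
column-major M. Illustrative sketch (not part of the patch):

@code
#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

// r = M * v, with M stored column-major as four registers.
v_float32x4 transform(const v_float32x4& v, const float* M)
{
    v_float32x4 c0 = v_load(M),     c1 = v_load(M + 4),
                c2 = v_load(M + 8), c3 = v_load(M + 12);
    return v_matmul(v, c0, c1, c2, c3);
}
@endcode
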
@@ -2760,8 +2714,8 @@ v_load_expand(const float16_t* ptr)
     return v;
 }

-inline void
-v_pack_store(float16_t* ptr, const v_reg<float, V_TypeTraits<float>::nlanes128>& v)
+template<int n> inline void
+v_pack_store(float16_t* ptr, const v_reg<float, n>& v)
 {
     for( int i = 0; i < v.nlanes; i++ )
     {
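
With v_pack_store templated on the lane count, an FP16 round trip works for
any register width. Illustrative sketch (not part of the patch; assumes FP16
support is compiled in):

@code
#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

void fp16_roundtrip(const float16_t* src, float16_t* dst)
{
    v_float32x4 v = v_load_expand(src); // fp16 -> fp32
    v_pack_store(dst, v);               // fp32 -> fp16
}
@endcode
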