From 53652a6194a18715ba9bad4099541d1261900870 Mon Sep 17 00:00:00 2001 From: Aaron Greig Date: Fri, 26 Mar 2021 12:11:49 +0000 Subject: [PATCH 01/11] Relax accuracy requirement on OpenCL MinEigenVal corner kernel test. The MinEigenVal path through the corner.cl kernel makes use of native_sqrt, a math builtin function which has implementation defined accuracy. Partially addresses issue #9821 --- modules/imgproc/test/ocl/test_imgproc.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/modules/imgproc/test/ocl/test_imgproc.cpp b/modules/imgproc/test/ocl/test_imgproc.cpp index f5a3fef89c..35185d339f 100644 --- a/modules/imgproc/test/ocl/test_imgproc.cpp +++ b/modules/imgproc/test/ocl/test_imgproc.cpp @@ -234,7 +234,10 @@ OCL_TEST_P(CornerMinEigenVal, Mat) OCL_OFF(cv::cornerMinEigenVal(src_roi, dst_roi, blockSize, apertureSize, borderType)); OCL_ON(cv::cornerMinEigenVal(usrc_roi, udst_roi, blockSize, apertureSize, borderType)); - Near(1e-5, true); + if (ocl::Device::getDefault().isIntel()) + Near(1e-5, true); + else + Near(0.1, true); // using native_* OpenCL functions may lose accuracy } } From aab62aa6dd70c7ad9aae8d2c5fc8cc6f6e079492 Mon Sep 17 00:00:00 2001 From: Vitaly Tuzov Date: Tue, 30 Mar 2021 19:18:03 +0300 Subject: [PATCH 02/11] Merge pull request #18952 from terfendail:wui_doc * Updated UI documentation to address WUI * Added documentation for vx_ calls * Removed vx_store operation overload * Doxyfile updated to enable wide UI * Enable doxygen documentation for vx_ WUI functions * Wide intrinsics definition rework * core: fix SIMD C++ emulator build (supports 128-bit only) --- doc/Doxyfile.in | 6 + .../core/include/opencv2/core/hal/intrin.hpp | 310 ++++++-- .../include/opencv2/core/hal/intrin_cpp.hpp | 741 ++++++++++++++++-- modules/core/src/matrix_transform.cpp | 8 +- modules/core/test/test_intrin_utils.hpp | 2 +- modules/dnn/src/layers/convolution_layer.cpp | 2 +- 6 files changed, 912 insertions(+), 157 deletions(-) diff --git 
a/doc/Doxyfile.in b/doc/Doxyfile.in index b6a7c7a82f..02dd372660 100644 --- a/doc/Doxyfile.in +++ b/doc/Doxyfile.in @@ -249,6 +249,12 @@ PREDEFINED = __cplusplus=1 \ CV_DEFAULT(x)=" = x" \ CV_NEON=1 \ CV_SSE2=1 \ + CV_SIMD128=1 \ + CV_SIMD256=1 \ + CV_SIMD512=1 \ + CV_SIMD128_64F=1 \ + CV_SIMD256_64F=1 \ + CV_SIMD512_64F=1 \ CV__DEBUG_NS_BEGIN= \ CV__DEBUG_NS_END= \ CV_DEPRECATED_EXTERNAL= \ diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp index 52f6b5d552..13228380ce 100644 --- a/modules/core/include/opencv2/core/hal/intrin.hpp +++ b/modules/core/include/opencv2/core/hal/intrin.hpp @@ -104,7 +104,7 @@ template struct V_TypeTraits { }; -#define CV_INTRIN_DEF_TYPE_TRAITS(type, int_type_, uint_type_, abs_type_, w_type_, q_type_, sum_type_, nlanes128_) \ +#define CV_INTRIN_DEF_TYPE_TRAITS(type, int_type_, uint_type_, abs_type_, w_type_, q_type_, sum_type_) \ template<> struct V_TypeTraits \ { \ typedef type value_type; \ @@ -114,7 +114,6 @@ template struct V_TypeTraits typedef w_type_ w_type; \ typedef q_type_ q_type; \ typedef sum_type_ sum_type; \ - enum { nlanes128 = nlanes128_ }; \ \ static inline int_type reinterpret_int(type x) \ { \ @@ -131,7 +130,7 @@ template struct V_TypeTraits } \ } -#define CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(type, int_type_, uint_type_, abs_type_, w_type_, sum_type_, nlanes128_) \ +#define CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(type, int_type_, uint_type_, abs_type_, w_type_, sum_type_) \ template<> struct V_TypeTraits \ { \ typedef type value_type; \ @@ -140,7 +139,6 @@ template struct V_TypeTraits typedef uint_type_ uint_type; \ typedef w_type_ w_type; \ typedef sum_type_ sum_type; \ - enum { nlanes128 = nlanes128_ }; \ \ static inline int_type reinterpret_int(type x) \ { \ @@ -157,16 +155,16 @@ template struct V_TypeTraits } \ } -CV_INTRIN_DEF_TYPE_TRAITS(uchar, schar, uchar, uchar, ushort, unsigned, unsigned, 16); -CV_INTRIN_DEF_TYPE_TRAITS(schar, schar, uchar, uchar, short, 
int, int, 16); -CV_INTRIN_DEF_TYPE_TRAITS(ushort, short, ushort, ushort, unsigned, uint64, unsigned, 8); -CV_INTRIN_DEF_TYPE_TRAITS(short, short, ushort, ushort, int, int64, int, 8); -CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(unsigned, int, unsigned, unsigned, uint64, unsigned, 4); -CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(int, int, unsigned, unsigned, int64, int, 4); -CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(float, int, unsigned, float, double, float, 4); -CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(uint64, int64, uint64, uint64, void, uint64, 2); -CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(int64, int64, uint64, uint64, void, int64, 2); -CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(double, int64, uint64, double, void, double, 2); +CV_INTRIN_DEF_TYPE_TRAITS(uchar, schar, uchar, uchar, ushort, unsigned, unsigned); +CV_INTRIN_DEF_TYPE_TRAITS(schar, schar, uchar, uchar, short, int, int); +CV_INTRIN_DEF_TYPE_TRAITS(ushort, short, ushort, ushort, unsigned, uint64, unsigned); +CV_INTRIN_DEF_TYPE_TRAITS(short, short, ushort, ushort, int, int64, int); +CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(unsigned, int, unsigned, unsigned, uint64, unsigned); +CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(int, int, unsigned, unsigned, int64, int); +CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(float, int, unsigned, float, double, float); +CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(uint64, int64, uint64, uint64, void, uint64); +CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(int64, int64, uint64, uint64, void, int64); +CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(double, int64, uint64, double, void, double); #ifndef CV_DOXYGEN @@ -310,54 +308,6 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN //================================================================================================== -#define CV_INTRIN_DEFINE_WIDE_INTRIN(typ, vtyp, short_typ, prefix, loadsfx) \ - inline vtyp vx_setall_##short_typ(typ v) { return prefix##_setall_##short_typ(v); } \ - inline vtyp vx_setzero_##short_typ() { return prefix##_setzero_##short_typ(); } \ - inline vtyp vx_##loadsfx(const typ* ptr) { 
return prefix##_##loadsfx(ptr); } \ - inline vtyp vx_##loadsfx##_aligned(const typ* ptr) { return prefix##_##loadsfx##_aligned(ptr); } \ - inline vtyp vx_##loadsfx##_low(const typ* ptr) { return prefix##_##loadsfx##_low(ptr); } \ - inline vtyp vx_##loadsfx##_halves(const typ* ptr0, const typ* ptr1) { return prefix##_##loadsfx##_halves(ptr0, ptr1); } \ - inline void vx_store(typ* ptr, const vtyp& v) { return v_store(ptr, v); } \ - inline void vx_store_aligned(typ* ptr, const vtyp& v) { return v_store_aligned(ptr, v); } \ - inline vtyp vx_lut(const typ* ptr, const int* idx) { return prefix##_lut(ptr, idx); } \ - inline vtyp vx_lut_pairs(const typ* ptr, const int* idx) { return prefix##_lut_pairs(ptr, idx); } - -#define CV_INTRIN_DEFINE_WIDE_LUT_QUAD(typ, vtyp, prefix) \ - inline vtyp vx_lut_quads(const typ* ptr, const int* idx) { return prefix##_lut_quads(ptr, idx); } - -#define CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(typ, wtyp, prefix) \ - inline wtyp vx_load_expand(const typ* ptr) { return prefix##_load_expand(ptr); } - -#define CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND_Q(typ, qtyp, prefix) \ - inline qtyp vx_load_expand_q(const typ* ptr) { return prefix##_load_expand_q(ptr); } - -#define CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(typ, vtyp, short_typ, wtyp, qtyp, prefix, loadsfx) \ - CV_INTRIN_DEFINE_WIDE_INTRIN(typ, vtyp, short_typ, prefix, loadsfx) \ - CV_INTRIN_DEFINE_WIDE_LUT_QUAD(typ, vtyp, prefix) \ - CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(typ, wtyp, prefix) \ - CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND_Q(typ, qtyp, prefix) - -#define CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(prefix) \ - CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(uchar, v_uint8, u8, v_uint16, v_uint32, prefix, load) \ - CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(schar, v_int8, s8, v_int16, v_int32, prefix, load) \ - CV_INTRIN_DEFINE_WIDE_INTRIN(ushort, v_uint16, u16, prefix, load) \ - CV_INTRIN_DEFINE_WIDE_LUT_QUAD(ushort, v_uint16, prefix) \ - CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(ushort, v_uint32, prefix) \ - 
CV_INTRIN_DEFINE_WIDE_INTRIN(short, v_int16, s16, prefix, load) \ - CV_INTRIN_DEFINE_WIDE_LUT_QUAD(short, v_int16, prefix) \ - CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(short, v_int32, prefix) \ - CV_INTRIN_DEFINE_WIDE_INTRIN(int, v_int32, s32, prefix, load) \ - CV_INTRIN_DEFINE_WIDE_LUT_QUAD(int, v_int32, prefix) \ - CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(int, v_int64, prefix) \ - CV_INTRIN_DEFINE_WIDE_INTRIN(unsigned, v_uint32, u32, prefix, load) \ - CV_INTRIN_DEFINE_WIDE_LUT_QUAD(unsigned, v_uint32, prefix) \ - CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(unsigned, v_uint64, prefix) \ - CV_INTRIN_DEFINE_WIDE_INTRIN(float, v_float32, f32, prefix, load) \ - CV_INTRIN_DEFINE_WIDE_LUT_QUAD(float, v_float32, prefix) \ - CV_INTRIN_DEFINE_WIDE_INTRIN(int64, v_int64, s64, prefix, load) \ - CV_INTRIN_DEFINE_WIDE_INTRIN(uint64, v_uint64, u64, prefix, load) \ - CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(float16_t, v_float32, prefix) - template struct V_RegTraits { }; @@ -417,6 +367,7 @@ template struct V_RegTraits CV_DEF_REG_TRAITS(v512, v_int64x8, int64, s64, v_uint64x8, void, void, v_int64x8, void); CV_DEF_REG_TRAITS(v512, v_float64x8, double, f64, v_float64x8, void, void, v_int64x8, v_int32x16); #endif +//! @endcond #if CV_SIMD512 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 512) #define CV__SIMD_NAMESPACE simd512 @@ -425,21 +376,33 @@ namespace CV__SIMD_NAMESPACE { #define CV_SIMD_64F CV_SIMD512_64F #define CV_SIMD_FP16 CV_SIMD512_FP16 #define CV_SIMD_WIDTH 64 +//! @addtogroup core_hal_intrin +//! @{ + //! @brief Maximum available vector register capacity 8-bit unsigned integer values typedef v_uint8x64 v_uint8; + //! @brief Maximum available vector register capacity 8-bit signed integer values typedef v_int8x64 v_int8; + //! @brief Maximum available vector register capacity 16-bit unsigned integer values typedef v_uint16x32 v_uint16; + //! @brief Maximum available vector register capacity 16-bit signed integer values typedef v_int16x32 v_int16; + //! 
@brief Maximum available vector register capacity 32-bit unsigned integer values typedef v_uint32x16 v_uint32; + //! @brief Maximum available vector register capacity 32-bit signed integer values typedef v_int32x16 v_int32; + //! @brief Maximum available vector register capacity 64-bit unsigned integer values typedef v_uint64x8 v_uint64; + //! @brief Maximum available vector register capacity 64-bit signed integer values typedef v_int64x8 v_int64; + //! @brief Maximum available vector register capacity 32-bit floating point values (single precision) typedef v_float32x16 v_float32; - CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v512) -#if CV_SIMD512_64F + #if CV_SIMD512_64F + //! @brief Maximum available vector register capacity 64-bit floating point values (double precision) typedef v_float64x8 v_float64; - CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v512, load) -#endif - inline void vx_cleanup() { v512_cleanup(); } + #endif +//! @} + + #define VXPREFIX(func) v512##func } // namespace using namespace CV__SIMD_NAMESPACE; #elif CV_SIMD256 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 256) @@ -449,21 +412,33 @@ namespace CV__SIMD_NAMESPACE { #define CV_SIMD_64F CV_SIMD256_64F #define CV_SIMD_FP16 CV_SIMD256_FP16 #define CV_SIMD_WIDTH 32 +//! @addtogroup core_hal_intrin +//! @{ + //! @brief Maximum available vector register capacity 8-bit unsigned integer values typedef v_uint8x32 v_uint8; + //! @brief Maximum available vector register capacity 8-bit signed integer values typedef v_int8x32 v_int8; + //! @brief Maximum available vector register capacity 16-bit unsigned integer values typedef v_uint16x16 v_uint16; + //! @brief Maximum available vector register capacity 16-bit signed integer values typedef v_int16x16 v_int16; + //! @brief Maximum available vector register capacity 32-bit unsigned integer values typedef v_uint32x8 v_uint32; + //! 
@brief Maximum available vector register capacity 32-bit signed integer values typedef v_int32x8 v_int32; + //! @brief Maximum available vector register capacity 64-bit unsigned integer values typedef v_uint64x4 v_uint64; + //! @brief Maximum available vector register capacity 64-bit signed integer values typedef v_int64x4 v_int64; + //! @brief Maximum available vector register capacity 32-bit floating point values (single precision) typedef v_float32x8 v_float32; - CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v256) #if CV_SIMD256_64F + //! @brief Maximum available vector register capacity 64-bit floating point values (double precision) typedef v_float64x4 v_float64; - CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v256, load) #endif - inline void vx_cleanup() { v256_cleanup(); } +//! @} + + #define VXPREFIX(func) v256##func } // namespace using namespace CV__SIMD_NAMESPACE; #elif (CV_SIMD128 || CV_SIMD128_CPP) && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 128) @@ -476,25 +451,214 @@ namespace CV__SIMD_NAMESPACE { #define CV_SIMD CV_SIMD128 #define CV_SIMD_64F CV_SIMD128_64F #define CV_SIMD_WIDTH 16 +//! @addtogroup core_hal_intrin +//! @{ + //! @brief Maximum available vector register capacity 8-bit unsigned integer values typedef v_uint8x16 v_uint8; + //! @brief Maximum available vector register capacity 8-bit signed integer values typedef v_int8x16 v_int8; + //! @brief Maximum available vector register capacity 16-bit unsigned integer values typedef v_uint16x8 v_uint16; + //! @brief Maximum available vector register capacity 16-bit signed integer values typedef v_int16x8 v_int16; + //! @brief Maximum available vector register capacity 32-bit unsigned integer values typedef v_uint32x4 v_uint32; + //! @brief Maximum available vector register capacity 32-bit signed integer values typedef v_int32x4 v_int32; + //! @brief Maximum available vector register capacity 64-bit unsigned integer values typedef v_uint64x2 v_uint64; + //! 
@brief Maximum available vector register capacity 64-bit signed integer values typedef v_int64x2 v_int64; + //! @brief Maximum available vector register capacity 32-bit floating point values (single precision) typedef v_float32x4 v_float32; - CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v) #if CV_SIMD128_64F + //! @brief Maximum available vector register capacity 64-bit floating point values (double precision) typedef v_float64x2 v_float64; - CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v, load) #endif - inline void vx_cleanup() { v_cleanup(); } +//! @} + + #define VXPREFIX(func) v##func } // namespace using namespace CV__SIMD_NAMESPACE; #endif +namespace CV__SIMD_NAMESPACE { +//! @addtogroup core_hal_intrin +//! @{ + //! @name Wide init with value + //! @{ + //! @brief Create maximum available capacity vector with elements set to a specific value + inline v_uint8 vx_setall_u8(uchar v) { return VXPREFIX(_setall_u8)(v); } + inline v_int8 vx_setall_s8(schar v) { return VXPREFIX(_setall_s8)(v); } + inline v_uint16 vx_setall_u16(ushort v) { return VXPREFIX(_setall_u16)(v); } + inline v_int16 vx_setall_s16(short v) { return VXPREFIX(_setall_s16)(v); } + inline v_int32 vx_setall_s32(int v) { return VXPREFIX(_setall_s32)(v); } + inline v_uint32 vx_setall_u32(unsigned v) { return VXPREFIX(_setall_u32)(v); } + inline v_float32 vx_setall_f32(float v) { return VXPREFIX(_setall_f32)(v); } + inline v_int64 vx_setall_s64(int64 v) { return VXPREFIX(_setall_s64)(v); } + inline v_uint64 vx_setall_u64(uint64 v) { return VXPREFIX(_setall_u64)(v); } +#if CV_SIMD_64F + inline v_float64 vx_setall_f64(double v) { return VXPREFIX(_setall_f64)(v); } +#endif + //! @} + + //! @name Wide init with zero + //! @{ + //! 
@brief Create maximum available capacity vector with elements set to zero + inline v_uint8 vx_setzero_u8() { return VXPREFIX(_setzero_u8)(); } + inline v_int8 vx_setzero_s8() { return VXPREFIX(_setzero_s8)(); } + inline v_uint16 vx_setzero_u16() { return VXPREFIX(_setzero_u16)(); } + inline v_int16 vx_setzero_s16() { return VXPREFIX(_setzero_s16)(); } + inline v_int32 vx_setzero_s32() { return VXPREFIX(_setzero_s32)(); } + inline v_uint32 vx_setzero_u32() { return VXPREFIX(_setzero_u32)(); } + inline v_float32 vx_setzero_f32() { return VXPREFIX(_setzero_f32)(); } + inline v_int64 vx_setzero_s64() { return VXPREFIX(_setzero_s64)(); } + inline v_uint64 vx_setzero_u64() { return VXPREFIX(_setzero_u64)(); } +#if CV_SIMD_64F + inline v_float64 vx_setzero_f64() { return VXPREFIX(_setzero_f64)(); } +#endif + //! @} + + //! @name Wide load from memory + //! @{ + //! @brief Load maximum available capacity register contents from memory + inline v_uint8 vx_load(const uchar * ptr) { return VXPREFIX(_load)(ptr); } + inline v_int8 vx_load(const schar * ptr) { return VXPREFIX(_load)(ptr); } + inline v_uint16 vx_load(const ushort * ptr) { return VXPREFIX(_load)(ptr); } + inline v_int16 vx_load(const short * ptr) { return VXPREFIX(_load)(ptr); } + inline v_int32 vx_load(const int * ptr) { return VXPREFIX(_load)(ptr); } + inline v_uint32 vx_load(const unsigned * ptr) { return VXPREFIX(_load)(ptr); } + inline v_float32 vx_load(const float * ptr) { return VXPREFIX(_load)(ptr); } + inline v_int64 vx_load(const int64 * ptr) { return VXPREFIX(_load)(ptr); } + inline v_uint64 vx_load(const uint64 * ptr) { return VXPREFIX(_load)(ptr); } +#if CV_SIMD_64F + inline v_float64 vx_load(const double * ptr) { return VXPREFIX(_load)(ptr); } +#endif + //! @} + + //! @name Wide load from memory(aligned) + //! @{ + //! 
@brief Load maximum available capacity register contents from memory(aligned) + inline v_uint8 vx_load_aligned(const uchar * ptr) { return VXPREFIX(_load_aligned)(ptr); } + inline v_int8 vx_load_aligned(const schar * ptr) { return VXPREFIX(_load_aligned)(ptr); } + inline v_uint16 vx_load_aligned(const ushort * ptr) { return VXPREFIX(_load_aligned)(ptr); } + inline v_int16 vx_load_aligned(const short * ptr) { return VXPREFIX(_load_aligned)(ptr); } + inline v_int32 vx_load_aligned(const int * ptr) { return VXPREFIX(_load_aligned)(ptr); } + inline v_uint32 vx_load_aligned(const unsigned * ptr) { return VXPREFIX(_load_aligned)(ptr); } + inline v_float32 vx_load_aligned(const float * ptr) { return VXPREFIX(_load_aligned)(ptr); } + inline v_int64 vx_load_aligned(const int64 * ptr) { return VXPREFIX(_load_aligned)(ptr); } + inline v_uint64 vx_load_aligned(const uint64 * ptr) { return VXPREFIX(_load_aligned)(ptr); } +#if CV_SIMD_64F + inline v_float64 vx_load_aligned(const double * ptr) { return VXPREFIX(_load_aligned)(ptr); } +#endif + //! @} + + //! @name Wide load lower half from memory + //! @{ + //! 
@brief Load lower half of maximum available capacity register from memory + inline v_uint8 vx_load_low(const uchar * ptr) { return VXPREFIX(_load_low)(ptr); } + inline v_int8 vx_load_low(const schar * ptr) { return VXPREFIX(_load_low)(ptr); } + inline v_uint16 vx_load_low(const ushort * ptr) { return VXPREFIX(_load_low)(ptr); } + inline v_int16 vx_load_low(const short * ptr) { return VXPREFIX(_load_low)(ptr); } + inline v_int32 vx_load_low(const int * ptr) { return VXPREFIX(_load_low)(ptr); } + inline v_uint32 vx_load_low(const unsigned * ptr) { return VXPREFIX(_load_low)(ptr); } + inline v_float32 vx_load_low(const float * ptr) { return VXPREFIX(_load_low)(ptr); } + inline v_int64 vx_load_low(const int64 * ptr) { return VXPREFIX(_load_low)(ptr); } + inline v_uint64 vx_load_low(const uint64 * ptr) { return VXPREFIX(_load_low)(ptr); } +#if CV_SIMD_64F + inline v_float64 vx_load_low(const double * ptr) { return VXPREFIX(_load_low)(ptr); } +#endif + //! @} + + //! @name Wide load halfs from memory + //! @{ + //! 
@brief Load maximum available capacity register contents from two memory blocks + inline v_uint8 vx_load_halves(const uchar * ptr0, const uchar * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } + inline v_int8 vx_load_halves(const schar * ptr0, const schar * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } + inline v_uint16 vx_load_halves(const ushort * ptr0, const ushort * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } + inline v_int16 vx_load_halves(const short * ptr0, const short * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } + inline v_int32 vx_load_halves(const int * ptr0, const int * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } + inline v_uint32 vx_load_halves(const unsigned * ptr0, const unsigned * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } + inline v_float32 vx_load_halves(const float * ptr0, const float * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } + inline v_int64 vx_load_halves(const int64 * ptr0, const int64 * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } + inline v_uint64 vx_load_halves(const uint64 * ptr0, const uint64 * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } +#if CV_SIMD_64F + inline v_float64 vx_load_halves(const double * ptr0, const double * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } +#endif + //! @} + + //! @name Wide LUT of elements + //! @{ + //! 
@brief Load maximum available capacity register contents with array elements by provided indexes + inline v_uint8 vx_lut(const uchar * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } + inline v_int8 vx_lut(const schar * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } + inline v_uint16 vx_lut(const ushort * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } + inline v_int16 vx_lut(const short* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } + inline v_int32 vx_lut(const int* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } + inline v_uint32 vx_lut(const unsigned* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } + inline v_float32 vx_lut(const float* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } + inline v_int64 vx_lut(const int64 * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } + inline v_uint64 vx_lut(const uint64 * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } +#if CV_SIMD_64F + inline v_float64 vx_lut(const double* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } +#endif + //! @} + + //! @name Wide LUT of element pairs + //! @{ + //! 
@brief Load maximum available capacity register contents with array element pairs by provided indexes + inline v_uint8 vx_lut_pairs(const uchar * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } + inline v_int8 vx_lut_pairs(const schar * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } + inline v_uint16 vx_lut_pairs(const ushort * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } + inline v_int16 vx_lut_pairs(const short* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } + inline v_int32 vx_lut_pairs(const int* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } + inline v_uint32 vx_lut_pairs(const unsigned* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } + inline v_float32 vx_lut_pairs(const float* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } + inline v_int64 vx_lut_pairs(const int64 * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } + inline v_uint64 vx_lut_pairs(const uint64 * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } +#if CV_SIMD_64F + inline v_float64 vx_lut_pairs(const double* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } +#endif + //! @} + + //! @name Wide LUT of element quads + //! @{ + //! 
@brief Load maximum available capacity register contents with array element quads by provided indexes + inline v_uint8 vx_lut_quads(const uchar* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); } + inline v_int8 vx_lut_quads(const schar* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); } + inline v_uint16 vx_lut_quads(const ushort* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); } + inline v_int16 vx_lut_quads(const short* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); } + inline v_int32 vx_lut_quads(const int* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); } + inline v_uint32 vx_lut_quads(const unsigned* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); } + inline v_float32 vx_lut_quads(const float* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); } + //! @} + + //! @name Wide load with double expansion + //! @{ + //! @brief Load maximum available capacity register contents from memory with double expand + inline v_uint16 vx_load_expand(const uchar * ptr) { return VXPREFIX(_load_expand)(ptr); } + inline v_int16 vx_load_expand(const schar * ptr) { return VXPREFIX(_load_expand)(ptr); } + inline v_uint32 vx_load_expand(const ushort * ptr) { return VXPREFIX(_load_expand)(ptr); } + inline v_int32 vx_load_expand(const short* ptr) { return VXPREFIX(_load_expand)(ptr); } + inline v_int64 vx_load_expand(const int* ptr) { return VXPREFIX(_load_expand)(ptr); } + inline v_uint64 vx_load_expand(const unsigned* ptr) { return VXPREFIX(_load_expand)(ptr); } + inline v_float32 vx_load_expand(const float16_t * ptr) { return VXPREFIX(_load_expand)(ptr); } + //! @} + + //! @name Wide load with quad expansion + //! @{ + //! 
@brief Load maximum available capacity register contents from memory with quad expand + inline v_uint32 vx_load_expand_q(const uchar * ptr) { return VXPREFIX(_load_expand_q)(ptr); } + inline v_int32 vx_load_expand_q(const schar * ptr) { return VXPREFIX(_load_expand_q)(ptr); } + //! @} + + /** @brief SIMD processing state cleanup call */ + inline void vx_cleanup() { VXPREFIX(_cleanup)(); } +//! @} + #undef VXPREFIX +} // namespace + +//! @cond IGNORED #ifndef CV_SIMD_64F #define CV_SIMD_64F 0 #endif diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index 5878dced7f..46222140e6 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -52,10 +52,21 @@ //! @cond IGNORED #define CV_SIMD128_CPP 1 -#if defined(CV_FORCE_SIMD128_CPP) || defined(CV_DOXYGEN) +#if defined(CV_FORCE_SIMD128_CPP) #define CV_SIMD128 1 #define CV_SIMD128_64F 1 #endif +#if defined(CV_DOXYGEN) +#define CV_SIMD128 1 +#define CV_SIMD128_64F 1 +#define CV_SIMD256 1 +#define CV_SIMD256_64F 1 +#define CV_SIMD512 1 +#define CV_SIMD512_64F 1 +#else +#define CV_SIMD256 0 // Explicitly disable SIMD256 and SIMD512 support for scalar intrinsic implementation +#define CV_SIMD512 0 // to avoid warnings during compilation +#endif //! @endcond namespace cv @@ -68,17 +79,33 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN /** @addtogroup core_hal_intrin "Universal intrinsics" is a types and functions set intended to simplify vectorization of code on -different platforms. Currently there are two supported SIMD extensions: __SSE/SSE2__ on x86 -architectures and __NEON__ on ARM architectures, both allow working with 128 bit registers -containing packed values of different types. In case when there is no SIMD extension available -during compilation, fallback C++ implementation of intrinsics will be chosen and code will work as -expected although it could be slower. +different platforms. 
Currently a few different SIMD extensions on different architectures are supported. +128 bit registers of various types support is implemented for a wide range of architectures +including x86(__SSE/SSE2/SSE4.2__), ARM(__NEON__), PowerPC(__VSX__), MIPS(__MSA__). +256 bit long registers are supported on x86(__AVX2__) and 512 bit long registers are supported on x86(__AVX512__). +In case when there is no SIMD extension available during compilation, fallback C++ implementation of intrinsics +will be chosen and code will work as expected although it could be slower. ### Types -There are several types representing 128-bit register as a vector of packed values, each type is +There are several types representing packed values vector registers, each type is implemented as a structure based on a one SIMD register. +- cv::v_uint8 and cv::v_int8: 8-bit integer values (unsigned/signed) - char +- cv::v_uint16 and cv::v_int16: 16-bit integer values (unsigned/signed) - short +- cv::v_uint32 and cv::v_int32: 32-bit integer values (unsigned/signed) - int +- cv::v_uint64 and cv::v_int64: 64-bit integer values (unsigned/signed) - int64 +- cv::v_float32: 32-bit floating point values (signed) - float +- cv::v_float64: 64-bit floating point values (signed) - double + +Exact bit length(and value quantity) of listed types is compile time deduced and depends on architecture SIMD +capabilities chosen as available during compilation of the library. All the types contains __nlanes__ enumeration +to check for exact value quantity of the type. + +In case the exact bit length of the type is important it is possible to use specific fixed length register types. + +There are several types representing 128-bit registers. 
+ - cv::v_uint8x16 and cv::v_int8x16: sixteen 8-bit integer values (unsigned/signed) - char - cv::v_uint16x8 and cv::v_int16x8: eight 16-bit integer values (unsigned/signed) - short - cv::v_uint32x4 and cv::v_int32x4: four 32-bit integer values (unsigned/signed) - int @@ -86,28 +113,96 @@ implemented as a structure based on a one SIMD register. - cv::v_float32x4: four 32-bit floating point values (signed) - float - cv::v_float64x2: two 64-bit floating point values (signed) - double +There are several types representing 256-bit registers. + +- cv::v_uint8x32 and cv::v_int8x32: thirty two 8-bit integer values (unsigned/signed) - char +- cv::v_uint16x16 and cv::v_int16x16: sixteen 16-bit integer values (unsigned/signed) - short +- cv::v_uint32x8 and cv::v_int32x8: eight 32-bit integer values (unsigned/signed) - int +- cv::v_uint64x4 and cv::v_int64x4: four 64-bit integer values (unsigned/signed) - int64 +- cv::v_float32x8: eight 32-bit floating point values (signed) - float +- cv::v_float64x4: four 64-bit floating point values (signed) - double + @note -cv::v_float64x2 is not implemented in NEON variant, if you want to use this type, don't forget to -check the CV_SIMD128_64F preprocessor definition: +256 bit registers at the moment implemented for AVX2 SIMD extension only, if you want to use this type directly, +don't forget to check the CV_SIMD256 preprocessor definition: @code -#if CV_SIMD128_64F +#if CV_SIMD256 //... #endif @endcode +There are several types representing 512-bit registers. 
+ +- cv::v_uint8x64 and cv::v_int8x64: sixty four 8-bit integer values (unsigned/signed) - char +- cv::v_uint16x32 and cv::v_int16x32: thirty two 16-bit integer values (unsigned/signed) - short +- cv::v_uint32x16 and cv::v_int32x16: sixteen 32-bit integer values (unsigned/signed) - int +- cv::v_uint64x8 and cv::v_int64x8: eight 64-bit integer values (unsigned/signed) - int64 +- cv::v_float32x16: sixteen 32-bit floating point values (signed) - float +- cv::v_float64x8: eight 64-bit floating point values (signed) - double +@note +512 bit registers at the moment implemented for AVX512 SIMD extension only, if you want to use this type directly, +don't forget to check the CV_SIMD512 preprocessor definition. + +@note +cv::v_float64x2 is not implemented in NEON variant, if you want to use this type, don't forget to +check the CV_SIMD128_64F preprocessor definition. + ### Load and store operations These operations allow to set contents of the register explicitly or by loading it from some memory block and to save contents of the register to memory block. +There are variable size register load operations that provide result of maximum available size +depending on chosen platform capabilities. +- Constructors: +@ref v_reg::v_reg(const _Tp *ptr) "from memory", +- Other create methods: +vx_setall_s8, vx_setall_u8, ..., +vx_setzero_u8, vx_setzero_s8, ... +- Memory load operations: +vx_load, vx_load_aligned, vx_load_low, vx_load_halves, +- Memory operations with expansion of values: +vx_load_expand, vx_load_expand_q + +Also there are fixed size register load/store operations. + +For 128 bit registers - Constructors: @ref v_reg::v_reg(const _Tp *ptr) "from memory", @ref v_reg::v_reg(_Tp s0, _Tp s1) "from two values", ... - Other create methods: @ref v_setall_s8, @ref v_setall_u8, ..., @ref v_setzero_u8, @ref v_setzero_s8, ... 
-- Memory operations: +- Memory load operations: @ref v_load, @ref v_load_aligned, @ref v_load_low, @ref v_load_halves, +- Memory operations with expansion of values: +@ref v_load_expand, @ref v_load_expand_q + +For 256 bit registers(check CV_SIMD256 preprocessor definition) +- Constructors: +@ref v_reg::v_reg(const _Tp *ptr) "from memory", +@ref v_reg::v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3) "from four values", ... +- Other create methods: +@ref v256_setall_s8, @ref v256_setall_u8, ..., +@ref v256_setzero_u8, @ref v256_setzero_s8, ... +- Memory load operations: +@ref v256_load, @ref v256_load_aligned, @ref v256_load_low, @ref v256_load_halves, +- Memory operations with expansion of values: +@ref v256_load_expand, @ref v256_load_expand_q + +For 512 bit registers(check CV_SIMD512 preprocessor definition) +- Constructors: +@ref v_reg::v_reg(const _Tp *ptr) "from memory", +@ref v_reg::v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3, _Tp s4, _Tp s5, _Tp s6, _Tp s7) "from eight values", ... +- Other create methods: +@ref v512_setall_s8, @ref v512_setall_u8, ..., +@ref v512_setzero_u8, @ref v512_setzero_s8, ... +- Memory load operations: +@ref v512_load, @ref v512_load_aligned, @ref v512_load_low, @ref v512_load_halves, +- Memory operations with expansion of values: +@ref v512_load_expand, @ref v512_load_expand_q + +Store to memory operations are similar across different platform capabilities: @ref v_store, @ref v_store_aligned, @ref v_store_high, @ref v_store_low @@ -116,7 +211,7 @@ block and to save contents of the register to memory block. These operations allow to reorder or recombine elements in one or multiple vectors. 
- Interleave, deinterleave (2, 3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave -- Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand, @ref v_expand_low, @ref v_expand_high +- Expand: @ref v_expand, @ref v_expand_low, @ref v_expand_high - Pack: @ref v_pack, @ref v_pack_u, @ref v_pack_b, @ref v_rshr_pack, @ref v_rshr_pack_u, @ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store - Recombine: @ref v_zip, @ref v_recombine, @ref v_combine_low, @ref v_combine_high @@ -153,7 +248,7 @@ Element-wise binary and unary operations. @ref operator >=(const v_reg &a, const v_reg &b) ">=", @ref operator <(const v_reg &a, const v_reg &b) "<", @ref operator <=(const v_reg &a, const v_reg &b) "<=", -@ref operator==(const v_reg &a, const v_reg &b) "==", +@ref operator ==(const v_reg &a, const v_reg &b) "==", @ref operator !=(const v_reg &a, const v_reg &b) "!=" - min/max: @ref v_min, @ref v_max @@ -190,7 +285,7 @@ shows the applicability of different operations to the types. 
Regular integers: -| Operations\\Types | uint 8x16 | int 8x16 | uint 16x8 | int 16x8 | uint 32x4 | int 32x4 | +| Operations\\Types | uint 8 | int 8 | uint 16 | int 16 | uint 32 | int 32 | |-------------------|:-:|:-:|:-:|:-:|:-:|:-:| |load, store | x | x | x | x | x | x | |interleave | x | x | x | x | x | x | @@ -230,7 +325,7 @@ Regular integers: Big integers: -| Operations\\Types | uint 64x2 | int 64x2 | +| Operations\\Types | uint 64 | int 64 | |-------------------|:-:|:-:| |load, store | x | x | |add, sub | x | x | @@ -244,7 +339,7 @@ Big integers: Floating point: -| Operations\\Types | float 32x4 | float 64x2 | +| Operations\\Types | float 32 | float 64 | |-------------------|:-:|:-:| |load, store | x | x | |interleave | x | | @@ -410,6 +505,67 @@ typedef v_reg v_uint64x2; /** @brief Two 64-bit signed integer values */ typedef v_reg v_int64x2; +#if CV_SIMD256 +/** @brief Thirty two 8-bit unsigned integer values */ +typedef v_reg v_uint8x32; +/** @brief Thirty two 8-bit signed integer values */ +typedef v_reg v_int8x32; +/** @brief Sixteen 16-bit unsigned integer values */ +typedef v_reg v_uint16x16; +/** @brief Sixteen 16-bit signed integer values */ +typedef v_reg v_int16x16; +/** @brief Eight 32-bit unsigned integer values */ +typedef v_reg v_uint32x8; +/** @brief Eight 32-bit signed integer values */ +typedef v_reg v_int32x8; +/** @brief Eight 32-bit floating point values (single precision) */ +typedef v_reg v_float32x8; +/** @brief Four 64-bit floating point values (double precision) */ +typedef v_reg v_float64x4; +/** @brief Four 64-bit unsigned integer values */ +typedef v_reg v_uint64x4; +/** @brief Four 64-bit signed integer values */ +typedef v_reg v_int64x4; +#endif + +#if CV_SIMD512 +/** @brief Sixty four 8-bit unsigned integer values */ +typedef v_reg v_uint8x64; +/** @brief Sixty four 8-bit signed integer values */ +typedef v_reg v_int8x64; +/** @brief Thirty two 16-bit unsigned integer values */ +typedef v_reg v_uint16x32; +/** @brief Thirty two 
16-bit signed integer values */ +typedef v_reg v_int16x32; +/** @brief Sixteen 32-bit unsigned integer values */ +typedef v_reg v_uint32x16; +/** @brief Sixteen 32-bit signed integer values */ +typedef v_reg v_int32x16; +/** @brief Sixteen 32-bit floating point values (single precision) */ +typedef v_reg v_float32x16; +/** @brief Eight 64-bit floating point values (double precision) */ +typedef v_reg v_float64x8; +/** @brief Eight 64-bit unsigned integer values */ +typedef v_reg v_uint64x8; +/** @brief Eight 64-bit signed integer values */ +typedef v_reg v_int64x8; +#endif + +enum { + simd128_width = 16, +#if CV_SIMD256 + simd256_width = 32, +#endif +#if CV_SIMD512 + simd512_width = 64, + simdmax_width = simd512_width +#elif CV_SIMD256 + simdmax_width = simd256_width +#else + simdmax_width = simd128_width +#endif +}; + /** @brief Add values For all types. */ @@ -1421,30 +1577,116 @@ template inline void v_zip( const v_reg<_Tp, n>& a0, const @note Returned type will be detected from passed pointer type, for example uchar ==> cv::v_uint8x16, int ==> cv::v_int32x4, etc. +@note Use vx_load version to get maximum available register length result + @note Alignment requirement: if CV_STRONG_ALIGNMENT=1 then passed pointer must be aligned (`sizeof(lane type)` should be enough). Do not cast pointer types without runtime check for pointer alignment (like `uchar*` => `int*`). */ template -inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load(const _Tp* ptr) +inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_load(const _Tp* ptr) { #if CV_STRONG_ALIGNMENT CV_Assert(isAligned(ptr)); #endif - return v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>(ptr); + return v_reg<_Tp, simd128_width / sizeof(_Tp)>(ptr); } +#if CV_SIMD256 +/** @brief Load 256-bit length register contents from memory + +@param ptr pointer to memory block with data +@return register object + +@note Returned type will be detected from passed pointer type, for example uchar ==> cv::v_uint8x32, int ==> cv::v_int32x8, etc. 
+ +@note Check CV_SIMD256 preprocessor definition prior to use. +Use vx_load version to get maximum available register length result + +@note Alignment requirement: +if CV_STRONG_ALIGNMENT=1 then passed pointer must be aligned (`sizeof(lane type)` should be enough). +Do not cast pointer types without runtime check for pointer alignment (like `uchar*` => `int*`). + */ +template +inline v_reg<_Tp, simd256_width / sizeof(_Tp)> v256_load(const _Tp* ptr) +{ +#if CV_STRONG_ALIGNMENT + CV_Assert(isAligned(ptr)); +#endif + return v_reg<_Tp, simd256_width / sizeof(_Tp)>(ptr); +} +#endif + +#if CV_SIMD512 +/** @brief Load 512-bit length register contents from memory + +@param ptr pointer to memory block with data +@return register object + +@note Returned type will be detected from passed pointer type, for example uchar ==> cv::v_uint8x64, int ==> cv::v_int32x16, etc. + +@note Check CV_SIMD512 preprocessor definition prior to use. +Use vx_load version to get maximum available register length result + +@note Alignment requirement: +if CV_STRONG_ALIGNMENT=1 then passed pointer must be aligned (`sizeof(lane type)` should be enough). +Do not cast pointer types without runtime check for pointer alignment (like `uchar*` => `int*`). 
+ */ +template +inline v_reg<_Tp, simd512_width / sizeof(_Tp)> v512_load(const _Tp* ptr) +{ +#if CV_STRONG_ALIGNMENT + CV_Assert(isAligned(ptr)); +#endif + return v_reg<_Tp, simd512_width / sizeof(_Tp)>(ptr); +} +#endif + /** @brief Load register contents from memory (aligned) similar to cv::v_load, but source memory block should be aligned (to 16-byte boundary in case of SIMD128, 32-byte - SIMD256, etc) - */ + +@note Use vx_load_aligned version to get maximum available register length result +*/ template -inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_aligned(const _Tp* ptr) +inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_load_aligned(const _Tp* ptr) { - CV_Assert(isAligned::nlanes128>)>(ptr)); - return v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>(ptr); + CV_Assert(isAligned)>(ptr)); + return v_reg<_Tp, simd128_width / sizeof(_Tp)>(ptr); } +#if CV_SIMD256 +/** @brief Load register contents from memory (aligned) + +similar to cv::v256_load, but source memory block should be aligned (to 32-byte boundary in case of SIMD256, 64-byte - SIMD512, etc) + +@note Check CV_SIMD256 preprocessor definition prior to use. +Use vx_load_aligned version to get maximum available register length result +*/ +template +inline v_reg<_Tp, simd256_width / sizeof(_Tp)> v256_load_aligned(const _Tp* ptr) +{ + CV_Assert(isAligned)>(ptr)); + return v_reg<_Tp, simd256_width / sizeof(_Tp)>(ptr); +} +#endif + +#if CV_SIMD512 +/** @brief Load register contents from memory (aligned) + +similar to cv::v512_load, but source memory block should be aligned (to 64-byte boundary in case of SIMD512, etc) + +@note Check CV_SIMD512 preprocessor definition prior to use. 
+Use vx_load_aligned version to get maximum available register length result +*/ +template +inline v_reg<_Tp, simd512_width / sizeof(_Tp)> v512_load_aligned(const _Tp* ptr) +{ + CV_Assert(isAligned)>(ptr)); + return v_reg<_Tp, simd512_width / sizeof(_Tp)>(ptr); +} +#endif + /** @brief Load 64-bits of data to lower part (high part is undefined). @param ptr memory block containing data for first half (0..n/2) @@ -1453,14 +1695,16 @@ inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_aligned(const _Tp* ptr) int lo[2] = { 1, 2 }; v_int32x4 r = v_load_low(lo); @endcode - */ + +@note Use vx_load_low version to get maximum available register length result +*/ template -inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_low(const _Tp* ptr) +inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_load_low(const _Tp* ptr) { #if CV_STRONG_ALIGNMENT CV_Assert(isAligned(ptr)); #endif - v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c; + v_reg<_Tp, simd128_width / sizeof(_Tp)> c; for( int i = 0; i < c.nlanes/2; i++ ) { c.s[i] = ptr[i]; @@ -1468,6 +1712,62 @@ inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_low(const _Tp* ptr) return c; } +#if CV_SIMD256 +/** @brief Load 128-bits of data to lower part (high part is undefined). + +@param ptr memory block containing data for first half (0..n/2) + +@code{.cpp} +int lo[4] = { 1, 2, 3, 4 }; +v_int32x8 r = v256_load_low(lo); +@endcode + +@note Check CV_SIMD256 preprocessor definition prior to use. +Use vx_load_low version to get maximum available register length result +*/ +template +inline v_reg<_Tp, simd256_width / sizeof(_Tp)> v256_load_low(const _Tp* ptr) +{ +#if CV_STRONG_ALIGNMENT + CV_Assert(isAligned(ptr)); +#endif + v_reg<_Tp, simd256_width / sizeof(_Tp)> c; + for (int i = 0; i < c.nlanes / 2; i++) + { + c.s[i] = ptr[i]; + } + return c; +} +#endif + +#if CV_SIMD512 +/** @brief Load 256-bits of data to lower part (high part is undefined). 
+ +@param ptr memory block containing data for first half (0..n/2) + +@code{.cpp} +int lo[8] = { 1, 2, 3, 4, 5, 6, 7, 8 }; +v_int32x16 r = v512_load_low(lo); +@endcode + +@note Check CV_SIMD512 preprocessor definition prior to use. +Use vx_load_low version to get maximum available register length result +*/ +template +inline v_reg<_Tp, simd512_width / sizeof(_Tp)> v512_load_low(const _Tp* ptr) +{ +#if CV_STRONG_ALIGNMENT + CV_Assert(isAligned(ptr)); +#endif + v_reg<_Tp, simd512_width / sizeof(_Tp)> c; + for (int i = 0; i < c.nlanes / 2; i++) + { + c.s[i] = ptr[i]; + } + return c; +} +#endif + /** @brief Load register contents from two memory blocks @param loptr memory block containing data for first half (0..n/2) @@ -1477,15 +1777,17 @@ inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_low(const _Tp* ptr) int lo[2] = { 1, 2 }, hi[2] = { 3, 4 }; v_int32x4 r = v_load_halves(lo, hi); @endcode - */ + +@note Use vx_load_halves version to get maximum available register length result +*/ template -inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_halves(const _Tp* loptr, const _Tp* hiptr) +inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_load_halves(const _Tp* loptr, const _Tp* hiptr) { #if CV_STRONG_ALIGNMENT CV_Assert(isAligned(loptr)); CV_Assert(isAligned(hiptr)); #endif - v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c; + v_reg<_Tp, simd128_width / sizeof(_Tp)> c; for( int i = 0; i < c.nlanes/2; i++ ) { c.s[i] = loptr[i]; @@ -1494,6 +1796,68 @@ inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_halves(const _Tp* loptr, return c; } +#if CV_SIMD256 +/** @brief Load register contents from two memory blocks + +@param loptr memory block containing data for first half (0..n/2) +@param hiptr memory block containing data for second half (n/2..n) + +@code{.cpp} +int lo[4] = { 1, 2, 3, 4 }, hi[4] = { 5, 6, 7, 8 }; +v_int32x8 r = v256_load_halves(lo, hi); +@endcode + +@note Check CV_SIMD256 preprocessor definition prior to use. 
+Use vx_load_halves version to get maximum available register length result +*/ +template +inline v_reg<_Tp, simd256_width / sizeof(_Tp)> v256_load_halves(const _Tp* loptr, const _Tp* hiptr) +{ +#if CV_STRONG_ALIGNMENT + CV_Assert(isAligned(loptr)); + CV_Assert(isAligned(hiptr)); +#endif + v_reg<_Tp, simd256_width / sizeof(_Tp)> c; + for (int i = 0; i < c.nlanes / 2; i++) + { + c.s[i] = loptr[i]; + c.s[i + c.nlanes / 2] = hiptr[i]; + } + return c; +} +#endif + +#if CV_SIMD512 +/** @brief Load register contents from two memory blocks + +@param loptr memory block containing data for first half (0..n/2) +@param hiptr memory block containing data for second half (n/2..n) + +@code{.cpp} +int lo[8] = { 1, 2, 3, 4, 5, 6, 7, 8 }, hi[8] = { 9, 10, 11, 12, 13, 14, 15, 16 }; +v_int32x16 r = v512_load_halves(lo, hi); +@endcode + +@note Check CV_SIMD512 preprocessor definition prior to use. +Use vx_load_halves version to get maximum available register length result +*/ +template +inline v_reg<_Tp, simd512_width / sizeof(_Tp)> v512_load_halves(const _Tp* loptr, const _Tp* hiptr) +{ +#if CV_STRONG_ALIGNMENT + CV_Assert(isAligned(loptr)); + CV_Assert(isAligned(hiptr)); +#endif + v_reg<_Tp, simd512_width / sizeof(_Tp)> c; + for (int i = 0; i < c.nlanes / 2; i++) + { + c.s[i] = loptr[i]; + c.s[i + c.nlanes / 2] = hiptr[i]; + } + return c; +} +#endif + /** @brief Load register contents from memory with double expand Same as cv::v_load, but result pack type will be 2x wider than memory type. @@ -1502,16 +1866,19 @@ Same as cv::v_load, but result pack type will be 2x wider than memory type. short buf[4] = {1, 2, 3, 4}; // type is int16 v_int32x4 r = v_load_expand(buf); // r = {1, 2, 3, 4} - type is int32 @endcode -For 8-, 16-, 32-bit integer source types. */ +For 8-, 16-, 32-bit integer source types. 
+ +@note Use vx_load_expand version to get maximum available register length result +*/ template -inline v_reg::w_type, V_TypeTraits<_Tp>::nlanes128 / 2> +inline v_reg::w_type, simd128_width / sizeof(typename V_TypeTraits<_Tp>::w_type)> v_load_expand(const _Tp* ptr) { #if CV_STRONG_ALIGNMENT CV_Assert(isAligned(ptr)); #endif typedef typename V_TypeTraits<_Tp>::w_type w_type; - v_reg::nlanes128> c; + v_reg c; for( int i = 0; i < c.nlanes; i++ ) { c.s[i] = ptr[i]; @@ -1519,23 +1886,88 @@ v_load_expand(const _Tp* ptr) return c; } +#if CV_SIMD256 +/** @brief Load register contents from memory with double expand + +Same as cv::v256_load, but result pack type will be 2x wider than memory type. + +@code{.cpp} +short buf[8] = {1, 2, 3, 4, 5, 6, 7, 8}; // type is int16 +v_int32x8 r = v256_load_expand(buf); // r = {1, 2, 3, 4, 5, 6, 7, 8} - type is int32 +@endcode +For 8-, 16-, 32-bit integer source types. + +@note Check CV_SIMD256 preprocessor definition prior to use. +Use vx_load_expand version to get maximum available register length result +*/ +template +inline v_reg::w_type, simd256_width / sizeof(typename V_TypeTraits<_Tp>::w_type)> +v256_load_expand(const _Tp* ptr) +{ +#if CV_STRONG_ALIGNMENT + CV_Assert(isAligned(ptr)); +#endif + typedef typename V_TypeTraits<_Tp>::w_type w_type; + v_reg c; + for (int i = 0; i < c.nlanes; i++) + { + c.s[i] = ptr[i]; + } + return c; +} +#endif + +#if CV_SIMD512 +/** @brief Load register contents from memory with double expand + +Same as cv::v512_load, but result pack type will be 2x wider than memory type. + +@code{.cpp} +short buf[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; // type is int16 +v_int32x16 r = v512_load_expand(buf); // r = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} - type is int32 +@endcode +For 8-, 16-, 32-bit integer source types. + +@note Check CV_SIMD512 preprocessor definition prior to use. 
+Use vx_load_expand version to get maximum available register length result +*/ +template +inline v_reg::w_type, simd512_width / sizeof(typename V_TypeTraits<_Tp>::w_type)> +v512_load_expand(const _Tp* ptr) +{ +#if CV_STRONG_ALIGNMENT + CV_Assert(isAligned(ptr)); +#endif + typedef typename V_TypeTraits<_Tp>::w_type w_type; + v_reg c; + for (int i = 0; i < c.nlanes; i++) + { + c.s[i] = ptr[i]; + } + return c; +} +#endif + /** @brief Load register contents from memory with quad expand Same as cv::v_load_expand, but result type is 4 times wider than source. @code{.cpp} char buf[4] = {1, 2, 3, 4}; // type is int8 -v_int32x4 r = v_load_q(buf); // r = {1, 2, 3, 4} - type is int32 +v_int32x4 r = v_load_expand_q(buf); // r = {1, 2, 3, 4} - type is int32 @endcode -For 8-bit integer source types. */ +For 8-bit integer source types. + +@note Use vx_load_expand_q version to get maximum available register length result +*/ template -inline v_reg::q_type, V_TypeTraits<_Tp>::nlanes128 / 4> +inline v_reg::q_type, simd128_width / sizeof(typename V_TypeTraits<_Tp>::q_type)> v_load_expand_q(const _Tp* ptr) { #if CV_STRONG_ALIGNMENT CV_Assert(isAligned(ptr)); #endif typedef typename V_TypeTraits<_Tp>::q_type q_type; - v_reg::nlanes128> c; + v_reg c; for( int i = 0; i < c.nlanes; i++ ) { c.s[i] = ptr[i]; @@ -1543,6 +1975,66 @@ v_load_expand_q(const _Tp* ptr) return c; } +#if CV_SIMD256 +/** @brief Load register contents from memory with quad expand + +Same as cv::v256_load_expand, but result type is 4 times wider than source. +@code{.cpp} +char buf[8] = {1, 2, 3, 4, 5, 6, 7, 8}; // type is int8 +v_int32x8 r = v256_load_expand_q(buf); // r = {1, 2, 3, 4, 5, 6, 7, 8} - type is int32 +@endcode +For 8-bit integer source types. + +@note Check CV_SIMD256 preprocessor definition prior to use. 
+Use vx_load_expand_q version to get maximum available register length result +*/ +template +inline v_reg::q_type, simd256_width / sizeof(typename V_TypeTraits<_Tp>::q_type)> +v256_load_expand_q(const _Tp* ptr) +{ +#if CV_STRONG_ALIGNMENT + CV_Assert(isAligned(ptr)); +#endif + typedef typename V_TypeTraits<_Tp>::q_type q_type; + v_reg c; + for (int i = 0; i < c.nlanes; i++) + { + c.s[i] = ptr[i]; + } + return c; +} +#endif + +#if CV_SIMD512 +/** @brief Load register contents from memory with quad expand + +Same as cv::v512_load_expand, but result type is 4 times wider than source. +@code{.cpp} +char buf[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; // type is int8 +v_int32x16 r = v512_load_expand_q(buf); // r = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} - type is int32 +@endcode +For 8-bit integer source types. + +@note Check CV_SIMD512 preprocessor definition prior to use. +Use vx_load_expand_q version to get maximum available register length result +*/ +template +inline v_reg::q_type, simd512_width / sizeof(typename V_TypeTraits<_Tp>::q_type)> +v512_load_expand_q(const _Tp* ptr) +{ +#if CV_STRONG_ALIGNMENT + CV_Assert(isAligned(ptr)); +#endif + typedef typename V_TypeTraits<_Tp>::q_type q_type; + v_reg c; + for (int i = 0; i < c.nlanes; i++) + { + c.s[i] = ptr[i]; + } + return c; +} +#endif + /** @brief Load and deinterleave (2 channels) Load data from memory deinterleave and store to 2 registers. @@ -2041,7 +2533,7 @@ template inline v_reg v_trunc(const v_reg& a) /** @brief Convert to float -Supported input type is cv::v_int32x4. */ +Supported input type is cv::v_int32. */ template inline v_reg v_cvt_f32(const v_reg& a) { v_reg c; @@ -2050,6 +2542,9 @@ template inline v_reg v_cvt_f32(const v_reg& a) return c; } +/** @brief Convert lower half to float + +Supported input type is cv::v_float64. 
*/ template inline v_reg v_cvt_f32(const v_reg& a) { v_reg c; @@ -2061,6 +2556,9 @@ template inline v_reg v_cvt_f32(const v_reg& a) return c; } +/** @brief Convert to float + +Supported input type is cv::v_float64. */ template inline v_reg v_cvt_f32(const v_reg& a, const v_reg& b) { v_reg c; @@ -2072,9 +2570,9 @@ template inline v_reg v_cvt_f32(const v_reg& a, co return c; } -/** @brief Convert to double +/** @brief Convert lower half to double -Supported input type is cv::v_int32x4. */ +Supported input type is cv::v_int32. */ template CV_INLINE v_reg v_cvt_f64(const v_reg& a) { v_reg c; @@ -2085,7 +2583,7 @@ template CV_INLINE v_reg v_cvt_f64(const v_reg& a) /** @brief Convert to double high part of vector -Supported input type is cv::v_int32x4. */ +Supported input type is cv::v_int32. */ template CV_INLINE v_reg v_cvt_f64_high(const v_reg& a) { v_reg c; @@ -2094,9 +2592,9 @@ template CV_INLINE v_reg v_cvt_f64_high(const v_reg CV_INLINE v_reg v_cvt_f64(const v_reg& a) { v_reg c; @@ -2107,7 +2605,7 @@ template CV_INLINE v_reg v_cvt_f64(const v_reg& /** @brief Convert to double high part of vector -Supported input type is cv::v_float32x4. */ +Supported input type is cv::v_float32. 
*/ template CV_INLINE v_reg v_cvt_f64_high(const v_reg& a) { v_reg c; @@ -2118,7 +2616,7 @@ template CV_INLINE v_reg v_cvt_f64_high(const v_reg CV_INLINE v_reg v_cvt_f64(const v_reg& a) { v_reg c; @@ -2128,24 +2626,24 @@ template CV_INLINE v_reg v_cvt_f64(const v_reg& a) } -template inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut(const _Tp* tab, const int* idx) +template inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_lut(const _Tp* tab, const int* idx) { - v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c; - for (int i = 0; i < V_TypeTraits<_Tp>::nlanes128; i++) + v_reg<_Tp, simd128_width / sizeof(_Tp)> c; + for (int i = 0; i < c.nlanes; i++) c.s[i] = tab[idx[i]]; return c; } -template inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut_pairs(const _Tp* tab, const int* idx) +template inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_lut_pairs(const _Tp* tab, const int* idx) { - v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c; - for (int i = 0; i < V_TypeTraits<_Tp>::nlanes128; i++) + v_reg<_Tp, simd128_width / sizeof(_Tp)> c; + for (int i = 0; i < c.nlanes; i++) c.s[i] = tab[idx[i / 2] + i % 2]; return c; } -template inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut_quads(const _Tp* tab, const int* idx) +template inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_lut_quads(const _Tp* tab, const int* idx) { - v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c; - for (int i = 0; i < V_TypeTraits<_Tp>::nlanes128; i++) + v_reg<_Tp, simd128_width / sizeof(_Tp)> c; + for (int i = 0; i < c.nlanes; i++) c.s[i] = tab[idx[i / 4] + i % 4]; return c; } @@ -2283,42 +2781,94 @@ inline void v_transpose4x4( v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1, //! @brief Helper macro //! @ingroup core_hal_intrin_impl -#define OPENCV_HAL_IMPL_C_INIT_ZERO(_Tpvec, _Tp, suffix) \ -inline _Tpvec v_setzero_##suffix() { return _Tpvec::zero(); } +#define OPENCV_HAL_IMPL_C_INIT_ZERO(_Tpvec, prefix, suffix) \ +inline _Tpvec prefix##_setzero_##suffix() { return _Tpvec::zero(); } //! @name Init with zero //! 
@{ //! @brief Create new vector with zero elements -OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x16, uchar, u8) -OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x16, schar, s8) -OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x8, ushort, u16) -OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x8, short, s16) -OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x4, unsigned, u32) -OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x4, int, s32) -OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x4, float, f32) -OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x2, double, f64) -OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x2, uint64, u64) -OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x2, int64, s64) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x16, v, u8) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x16, v, s8) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x8, v, u16) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x8, v, s16) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x4, v, u32) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x4, v, s32) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x4, v, f32) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x2, v, f64) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x2, v, u64) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x2, v, s64) + +#if CV_SIMD256 +OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x32, v256, u8) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x32, v256, s8) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x16, v256, u16) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x16, v256, s16) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x8, v256, u32) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x8, v256, s32) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x8, v256, f32) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x4, v256, f64) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x4, v256, u64) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x4, v256, s64) +#endif + +#if CV_SIMD512 +OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x64, v512, u8) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x64, v512, s8) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x32, v512, u16) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x32, v512, s16) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x16, v512, u32) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x16, v512, s32) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x16, v512, f32) 
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x8, v512, f64) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x8, v512, u64) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x8, v512, s64) +#endif //! @} //! @brief Helper macro //! @ingroup core_hal_intrin_impl -#define OPENCV_HAL_IMPL_C_INIT_VAL(_Tpvec, _Tp, suffix) \ -inline _Tpvec v_setall_##suffix(_Tp val) { return _Tpvec::all(val); } +#define OPENCV_HAL_IMPL_C_INIT_VAL(_Tpvec, _Tp, prefix, suffix) \ +inline _Tpvec prefix##_setall_##suffix(_Tp val) { return _Tpvec::all(val); } //! @name Init with value //! @{ //! @brief Create new vector with elements set to a specific value -OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x16, uchar, u8) -OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x16, schar, s8) -OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x8, ushort, u16) -OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x8, short, s16) -OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x4, unsigned, u32) -OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x4, int, s32) -OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x4, float, f32) -OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x2, double, f64) -OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x2, uint64, u64) -OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x2, int64, s64) +OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x16, uchar, v, u8) +OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x16, schar, v, s8) +OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x8, ushort, v, u16) +OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x8, short, v, s16) +OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x4, unsigned, v, u32) +OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x4, int, v, s32) +OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x4, float, v, f32) +OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x2, double, v, f64) +OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x2, uint64, v, u64) +OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x2, int64, v, s64) + +#if CV_SIMD256 +OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x32, uchar, v256, u8) +OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x32, schar, v256, s8) +OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x16, ushort, v256, u16) +OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x16, short, v256, s16) +OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x8, unsigned, v256, u32) 
+OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x8, int, v256, s32) +OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x8, float, v256, f32) +OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x4, double, v256, f64) +OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x4, uint64, v256, u64) +OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x4, int64, v256, s64) +#endif + +#if CV_SIMD512 +OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x64, uchar, v512, u8) +OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x64, schar, v512, s8) +OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x32, ushort, v512, u16) +OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x32, short, v512, s16) +OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x16, unsigned, v512, u32) +OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x16, int, v512, s32) +OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x16, float, v512, f32) +OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x8, double, v512, f64) +OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x8, uint64, v512, u64) +OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x8, int64, v512, s64) +#endif //! @} //! @brief Helper macro @@ -2703,16 +3253,40 @@ template inline v_reg v_dotprod_expand_fast(const v_reg::nlanes128> +inline v_reg v_load_expand(const float16_t* ptr) { - v_reg::nlanes128> v; + v_reg v; for( int i = 0; i < v.nlanes; i++ ) { v.s[i] = ptr[i]; } return v; } +#if CV_SIMD256 +inline v_reg +v256_load_expand(const float16_t* ptr) +{ + v_reg v; + for (int i = 0; i < v.nlanes; i++) + { + v.s[i] = ptr[i]; + } + return v; +} +#endif +#if CV_SIMD512 +inline v_reg +v512_load_expand(const float16_t* ptr) +{ + v_reg v; + for (int i = 0; i < v.nlanes; i++) + { + v.s[i] = ptr[i]; + } + return v; +} +#endif template inline void v_pack_store(float16_t* ptr, const v_reg& v) @@ -2724,6 +3298,12 @@ v_pack_store(float16_t* ptr, const v_reg& v) } inline void v_cleanup() {} +#if CV_SIMD256 +inline void v256_cleanup() {} +#endif +#if CV_SIMD512 +inline void v512_cleanup() {} +#endif //! 
@} @@ -2732,4 +3312,9 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END #endif } +#if !defined(CV_DOXYGEN) +#undef CV_SIMD256 +#undef CV_SIMD512 +#endif + #endif diff --git a/modules/core/src/matrix_transform.cpp b/modules/core/src/matrix_transform.cpp index 37bc273b4d..727eaf7fee 100644 --- a/modules/core/src/matrix_transform.cpp +++ b/modules/core/src/matrix_transform.cpp @@ -536,8 +536,8 @@ flipVert( const uchar* src0, size_t sstep, uchar* dst0, size_t dstep, Size size, { v_int32 t0 = vx_load((int*)(src0 + i)); v_int32 t1 = vx_load((int*)(src1 + i)); - vx_store((int*)(dst0 + i), t1); - vx_store((int*)(dst1 + i), t0); + v_store((int*)(dst0 + i), t1); + v_store((int*)(dst1 + i), t0); } } #if CV_STRONG_ALIGNMENT @@ -547,8 +547,8 @@ flipVert( const uchar* src0, size_t sstep, uchar* dst0, size_t dstep, Size size, { v_uint8 t0 = vx_load(src0 + i); v_uint8 t1 = vx_load(src1 + i); - vx_store(dst0 + i, t1); - vx_store(dst1 + i, t0); + v_store(dst0 + i, t1); + v_store(dst1 + i, t0); } } #endif diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index 84da496b42..269ebe0f2a 100644 --- a/modules/core/test/test_intrin_utils.hpp +++ b/modules/core/test/test_intrin_utils.hpp @@ -1466,7 +1466,7 @@ template struct TheTest R r1 = vx_load_expand((const cv::float16_t*)data.a.d); R r2(r1); EXPECT_EQ(1.0f, r1.get0()); - vx_store(data_f32.a.d, r2); + v_store(data_f32.a.d, r2); EXPECT_EQ(-2.0f, data_f32.a.d[R::nlanes - 1]); out.a.clear(); diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index eeb9f73f5d..edbd2baefb 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -1023,7 +1023,7 @@ public: v20*vw20 + v21*vw21 + v22*vw22 + vbias; if (relu) vout = v_select(vout > z, vout, vout*vrc); - vx_store(outptr + out_j, vout); + v_store(outptr + out_j, vout); } } #endif From cc6d48959eba1ee99a702ae34aab05835fa48e12 Mon Sep 17 00:00:00 2001 From: 
Anastasia Murzova Date: Sun, 28 Mar 2021 16:53:44 +0300 Subject: [PATCH 03/11] Added reduce sum by channel support --- modules/dnn/src/tensorflow/tf_importer.cpp | 39 +++++++++++++++++++--- modules/dnn/test/test_tf_importer.cpp | 10 ++++++ 2 files changed, 44 insertions(+), 5 deletions(-) diff --git a/modules/dnn/src/tensorflow/tf_importer.cpp b/modules/dnn/src/tensorflow/tf_importer.cpp index a0fcffff2e..084e4ac6da 100644 --- a/modules/dnn/src/tensorflow/tf_importer.cpp +++ b/modules/dnn/src/tensorflow/tf_importer.cpp @@ -2360,12 +2360,9 @@ void TFImporter::parseNode(const tensorflow::NodeDef& layer_) // To keep correct order after squeeze dims we first need to change layout from NCHW to NHWC LayerParams permLP; int order[] = {0, 2, 3, 1}; // From OpenCV's NCHW to NHWC. - permLP.set("order", DictValue::arrayInt(order, 4)); std::string permName = name + "/nchw"; - CV_Assert(layer_id.find(permName) == layer_id.end()); - int permId = dstNet.addLayer(permName, "Permute", permLP); - layer_id[permName] = permId; - connect(layer_id, dstNet, Pin(name), permId, 0); + Pin inpId = Pin(name); + addPermuteLayer(order, permName, inpId); LayerParams squeezeLp; std::string squeezeName = name + "/squeeze"; @@ -2377,6 +2374,38 @@ void TFImporter::parseNode(const tensorflow::NodeDef& layer_) connect(layer_id, dstNet, Pin(permName), squeezeId, 0); } } + else if (axis == 1) + { + int order[] = {0, 2, 3, 1}; // From OpenCV's NCHW to NHWC. + Pin inpId = parsePin(layer.input(0)); + addPermuteLayer(order, name + "/nhwc", inpId); + + layerParams.set("pool", type == "Mean" ? 
"ave" : "sum"); + layerParams.set("kernel_h", 1); + layerParams.set("global_pooling_w", true); + int id = dstNet.addLayer(name, "Pooling", layerParams); + layer_id[name] = id; + connect(layer_id, dstNet, inpId, id, 0); + + if (!keepDims) + { + LayerParams squeezeLp; + std::string squeezeName = name + "/squeeze"; + CV_Assert(layer_id.find(squeezeName) == layer_id.end()); + int channel_id = 3; // TF NHWC layout + squeezeLp.set("axis", channel_id - 1); + squeezeLp.set("end_axis", channel_id); + int squeezeId = dstNet.addLayer(squeezeName, "Flatten", squeezeLp); + layer_id[squeezeName] = squeezeId; + connect(layer_id, dstNet, Pin(name), squeezeId, 0); + } + else + { + int order[] = {0, 3, 1, 2}; // From NHWC to OpenCV's NCHW. + Pin inpId = parsePin(name); + addPermuteLayer(order, name + "/nchw", inpId); + } + } } else { if (indices.total() != 2 || indices.at(0) != 1 || indices.at(1) != 2) CV_Error(Error::StsNotImplemented, "Unsupported mode of reduce_mean or reduce_sum operation."); diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp index ff9360c600..4ba4f29322 100644 --- a/modules/dnn/test/test_tf_importer.cpp +++ b/modules/dnn/test/test_tf_importer.cpp @@ -135,6 +135,16 @@ TEST_P(Test_TensorFlow_layers, reduce_sum) runTensorFlowNet("sum_pool_by_axis"); } +TEST_P(Test_TensorFlow_layers, reduce_sum_channel) +{ + runTensorFlowNet("reduce_sum_channel"); +} + +TEST_P(Test_TensorFlow_layers, reduce_sum_channel_keep_dims) +{ + runTensorFlowNet("reduce_sum_channel", false, 0.0, 0.0, false, "_keep_dims"); +} + TEST_P(Test_TensorFlow_layers, conv_single_conv) { runTensorFlowNet("single_conv"); From bb6e15f2c09dab952404793ad51e2afdf677f254 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Tue, 30 Mar 2021 20:54:11 +0000 Subject: [PATCH 04/11] python: fix CV_WRAP_AS handling --- modules/python/src2/cv2.cpp | 8 +++---- modules/python/src2/gen2.py | 35 ++++++++++++++++++++++++------- modules/python/src2/hdr_parser.py | 8 ++++++- 
modules/python/src2/pycompat.hpp | 20 +++++++++--------- 4 files changed, 48 insertions(+), 23 deletions(-) diff --git a/modules/python/src2/cv2.cpp b/modules/python/src2/cv2.cpp index 8bb15cd43f..f96bcc4458 100644 --- a/modules/python/src2/cv2.cpp +++ b/modules/python/src2/cv2.cpp @@ -2065,9 +2065,9 @@ static int convert_to_char(PyObject *o, char *dst, const ArgInfo& info) #include "pyopencv_custom_headers.h" #ifdef CVPY_DYNAMIC_INIT -#define CVPY_TYPE(NAME, STORAGE, SNAME, _1, _2) CVPY_TYPE_DECLARE_DYNAMIC(NAME, STORAGE, SNAME) +#define CVPY_TYPE(WNAME, NAME, STORAGE, SNAME, _1, _2) CVPY_TYPE_DECLARE_DYNAMIC(WNAME, NAME, STORAGE, SNAME) #else -#define CVPY_TYPE(NAME, STORAGE, SNAME, _1, _2) CVPY_TYPE_DECLARE(NAME, STORAGE, SNAME) +#define CVPY_TYPE(WNAME, NAME, STORAGE, SNAME, _1, _2) CVPY_TYPE_DECLARE(WNAME, NAME, STORAGE, SNAME) #endif #include "pyopencv_generated_types.h" #undef CVPY_TYPE @@ -2150,10 +2150,10 @@ static bool init_body(PyObject * m) #undef CVPY_MODULE #ifdef CVPY_DYNAMIC_INIT -#define CVPY_TYPE(NAME, _1, _2, BASE, CONSTRUCTOR) CVPY_TYPE_INIT_DYNAMIC(NAME, return false, BASE, CONSTRUCTOR) +#define CVPY_TYPE(WNAME, NAME, _1, _2, BASE, CONSTRUCTOR) CVPY_TYPE_INIT_DYNAMIC(WNAME, NAME, return false, BASE, CONSTRUCTOR) PyObject * pyopencv_NoBase_TypePtr = NULL; #else -#define CVPY_TYPE(NAME, _1, _2, BASE, CONSTRUCTOR) CVPY_TYPE_INIT_STATIC(NAME, return false, BASE, CONSTRUCTOR) +#define CVPY_TYPE(WNAME, NAME, _1, _2, BASE, CONSTRUCTOR) CVPY_TYPE_INIT_STATIC(WNAME, NAME, return false, BASE, CONSTRUCTOR) PyTypeObject * pyopencv_NoBase_TypePtr = NULL; #endif #include "pyopencv_generated_types.h" diff --git a/modules/python/src2/gen2.py b/modules/python/src2/gen2.py index 4acca07ada..0579b42a5f 100755 --- a/modules/python/src2/gen2.py +++ b/modules/python/src2/gen2.py @@ -265,7 +265,12 @@ class ClassInfo(object): for m in decl[2]: if m.startswith("="): - self.wname = m[1:] + wname = m[1:] + npos = name.rfind('.') + if npos >= 0: + self.wname = 
normalize_class_name(name[:npos] + '.' + wname) + else: + self.wname = wname customname = True elif m == "/Map": self.ismap = True @@ -344,7 +349,8 @@ class ClassInfo(object): if self.constructor is not None: constructor_name = self.constructor.get_wrapper_name() - return "CVPY_TYPE({}, {}, {}, {}, {});\n".format( + return "CVPY_TYPE({}, {}, {}, {}, {}, {});\n".format( + self.wname, self.name, self.cname if self.issimple else "Ptr<{}>".format(self.cname), self.sname if self.issimple else "Ptr", @@ -912,7 +918,7 @@ class PythonWrapperGenerator(object): if classes: classname = normalize_class_name('.'.join(namespace+classes)) bareclassname = classes[-1] - namespace = '.'.join(namespace) + namespace_str = '.'.join(namespace) isconstructor = name == bareclassname is_static = False @@ -937,23 +943,36 @@ class PythonWrapperGenerator(object): if is_static: # Add it as a method to the class func_map = self.classes[classname].methods - func = func_map.setdefault(name, FuncInfo(classname, name, cname, isconstructor, namespace, is_static)) + func = func_map.setdefault(name, FuncInfo(classname, name, cname, isconstructor, namespace_str, is_static)) func.add_variant(decl, isphantom) # Add it as global function g_name = "_".join(classes+[name]) - func_map = self.namespaces.setdefault(namespace, Namespace()).funcs - func = func_map.setdefault(g_name, FuncInfo("", g_name, cname, isconstructor, namespace, False)) + w_classes = [] + for i in range(0, len(classes)): + classes_i = classes[:i+1] + classname_i = normalize_class_name('.'.join(namespace+classes_i)) + w_classname = self.classes[classname_i].wname + namespace_prefix = normalize_class_name('.'.join(namespace)) + '_' + if w_classname.startswith(namespace_prefix): + w_classname = w_classname[len(namespace_prefix):] + w_classes.append(w_classname) + g_wname = "_".join(w_classes+[name]) + func_map = self.namespaces.setdefault(namespace_str, Namespace()).funcs + func = func_map.setdefault(g_name, FuncInfo("", g_name, cname, 
isconstructor, namespace_str, False)) func.add_variant(decl, isphantom) + if g_wname != g_name: # TODO OpenCV 5.0 + wfunc = func_map.setdefault(g_wname, FuncInfo("", g_wname, cname, isconstructor, namespace_str, False)) + wfunc.add_variant(decl, isphantom) else: if classname and not isconstructor: if not isphantom: cname = barename func_map = self.classes[classname].methods else: - func_map = self.namespaces.setdefault(namespace, Namespace()).funcs + func_map = self.namespaces.setdefault(namespace_str, Namespace()).funcs - func = func_map.setdefault(name, FuncInfo(classname, name, cname, isconstructor, namespace, is_static)) + func = func_map.setdefault(name, FuncInfo(classname, name, cname, isconstructor, namespace_str, is_static)) func.add_variant(decl, isphantom) if classname and isconstructor: diff --git a/modules/python/src2/hdr_parser.py b/modules/python/src2/hdr_parser.py index d8b04b43ce..ac3f383dc8 100755 --- a/modules/python/src2/hdr_parser.py +++ b/modules/python/src2/hdr_parser.py @@ -255,6 +255,8 @@ class CppHeaderParser(object): l = l.replace("CV_EXPORTS_W_SIMPLE", "") modlist.append("/Simple") npos = l.find("CV_EXPORTS_AS") + if npos < 0: + npos = l.find('CV_WRAP_AS') if npos >= 0: macro_arg, npos3 = self.get_macro_arg(l, npos) modlist.append("=" + macro_arg) @@ -825,7 +827,11 @@ class CppHeaderParser(object): continue state = SCAN l = re.sub(r'//(.+)?', '', l).strip() # drop // comment - if l == '#if 0' or l == '#if defined(__OPENCV_BUILD)' or l == '#ifdef __OPENCV_BUILD': + if l in [ + '#if 0', + '#if defined(__OPENCV_BUILD)', '#ifdef __OPENCV_BUILD', + '#if !defined(OPENCV_BINDING_PARSER)', '#ifndef OPENCV_BINDING_PARSER', + ]: state = DIRECTIVE_IF_0 depth_if_0 = 1 continue diff --git a/modules/python/src2/pycompat.hpp b/modules/python/src2/pycompat.hpp index 054117d625..2650554b3f 100644 --- a/modules/python/src2/pycompat.hpp +++ b/modules/python/src2/pycompat.hpp @@ -172,7 +172,7 @@ PyObject* pyopencv_from(const TYPE& src) #endif -#define 
CVPY_TYPE_DECLARE(NAME, STORAGE, SNAME) \ +#define CVPY_TYPE_DECLARE(WNAME, NAME, STORAGE, SNAME) \ struct pyopencv_##NAME##_t \ { \ PyObject_HEAD \ @@ -181,7 +181,7 @@ PyObject* pyopencv_from(const TYPE& src) static PyTypeObject pyopencv_##NAME##_TypeXXX = \ { \ CVPY_TYPE_HEAD \ - MODULESTR"."#NAME, \ + MODULESTR"."#WNAME, \ sizeof(pyopencv_##NAME##_t), \ }; \ static PyTypeObject * pyopencv_##NAME##_TypePtr = &pyopencv_##NAME##_TypeXXX; \ @@ -208,12 +208,12 @@ PyObject* pyopencv_from(const TYPE& src) static PyObject* pyopencv_##NAME##_repr(PyObject* self) \ { \ char str[1000]; \ - sprintf(str, "<"#NAME" %p>", self); \ + sprintf(str, "<"#WNAME" %p>", self); \ return PyString_FromString(str); \ } -#define CVPY_TYPE_INIT_STATIC(NAME, ERROR_HANDLER, BASE, CONSTRUCTOR) \ +#define CVPY_TYPE_INIT_STATIC(WNAME, NAME, ERROR_HANDLER, BASE, CONSTRUCTOR) \ { \ pyopencv_##NAME##_TypePtr->tp_base = pyopencv_##BASE##_TypePtr; \ pyopencv_##NAME##_TypePtr->tp_dealloc = pyopencv_##NAME##_dealloc; \ @@ -229,12 +229,12 @@ PyObject* pyopencv_from(const TYPE& src) ERROR_HANDLER; \ } \ CVPY_TYPE_INCREF(pyopencv_##NAME##_TypePtr); \ - PyModule_AddObject(m, #NAME, (PyObject *)pyopencv_##NAME##_TypePtr); \ + PyModule_AddObject(m, #WNAME, (PyObject *)pyopencv_##NAME##_TypePtr); \ } //================================================================================================== -#define CVPY_TYPE_DECLARE_DYNAMIC(NAME, STORAGE, SNAME) \ +#define CVPY_TYPE_DECLARE_DYNAMIC(WNAME, NAME, STORAGE, SNAME) \ struct pyopencv_##NAME##_t \ { \ PyObject_HEAD \ @@ -264,7 +264,7 @@ PyObject* pyopencv_from(const TYPE& src) static PyObject* pyopencv_##NAME##_repr(PyObject* self) \ { \ char str[1000]; \ - sprintf(str, "<"#NAME" %p>", self); \ + sprintf(str, "<"#WNAME" %p>", self); \ return PyString_FromString(str); \ } \ static PyType_Slot pyopencv_##NAME##_Slots[] = \ @@ -280,14 +280,14 @@ PyObject* pyopencv_from(const TYPE& src) }; \ static PyType_Spec pyopencv_##NAME##_Spec = \ { \ - 
MODULESTR"."#NAME, \ + MODULESTR"."#WNAME, \ sizeof(pyopencv_##NAME##_t), \ 0, \ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, \ pyopencv_##NAME##_Slots \ }; -#define CVPY_TYPE_INIT_DYNAMIC(NAME, ERROR_HANDLER, BASE, CONSTRUCTOR) \ +#define CVPY_TYPE_INIT_DYNAMIC(WNAME, NAME, ERROR_HANDLER, BASE, CONSTRUCTOR) \ { \ pyopencv_##NAME##_Slots[0].pfunc /*tp_dealloc*/ = (void*)pyopencv_##NAME##_dealloc; \ pyopencv_##NAME##_Slots[1].pfunc /*tp_repr*/ = (void*)pyopencv_##NAME##_repr; \ @@ -302,7 +302,7 @@ PyObject* pyopencv_from(const TYPE& src) pyopencv_##NAME##_TypePtr = PyType_FromSpecWithBases(&pyopencv_##NAME##_Spec, bases); \ if (!pyopencv_##NAME##_TypePtr) \ { \ - printf("Failed to init: " #NAME ", base (" #BASE ")" "\n"); \ + printf("Failed to init: " #WNAME ", base (" #BASE ")" "\n"); \ ERROR_HANDLER; \ } \ PyModule_AddObject(m, #NAME, (PyObject *)pyopencv_##NAME##_TypePtr); \ From 6f1eefec69b3d0c2e5e63276d20fd15b5f888d50 Mon Sep 17 00:00:00 2001 From: eplankin Date: Wed, 31 Mar 2021 12:24:37 +0300 Subject: [PATCH 05/11] Merge pull request #19681 from eplankin:link_problem * Workaround for IPP linking problem * Apply -Bsymbolic to all cases when IPP is on * Tried to hide symbols on MacOS * Tried on --exclude-libs option * Fixed macos and win warnings * Fixed win build * cmake(IPP): move --exclude-libs,libippcore.a to IPP CMake file Co-authored-by: Alexander Alekhin --- cmake/OpenCVFindIPP.cmake | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/cmake/OpenCVFindIPP.cmake b/cmake/OpenCVFindIPP.cmake index 9bc215f415..b4f3a78f2c 100644 --- a/cmake/OpenCVFindIPP.cmake +++ b/cmake/OpenCVFindIPP.cmake @@ -143,10 +143,25 @@ macro(ipp_detect_version) list(APPEND IPP_LIBRARIES ${IPP_LIBRARY_DIR}/${IPP_LIB_PREFIX}${IPP_PREFIX}${name}${IPP_SUFFIX}${IPP_LIB_SUFFIX}) else () add_library(ipp${name} STATIC IMPORTED) + set(_filename "${IPP_LIB_PREFIX}${IPP_PREFIX}${name}${IPP_SUFFIX}${IPP_LIB_SUFFIX}") set_target_properties(ipp${name} PROPERTIES 
IMPORTED_LINK_INTERFACE_LIBRARIES "" - IMPORTED_LOCATION ${IPP_LIBRARY_DIR}/${IPP_LIB_PREFIX}${IPP_PREFIX}${name}${IPP_SUFFIX}${IPP_LIB_SUFFIX} + IMPORTED_LOCATION ${IPP_LIBRARY_DIR}/${_filename} ) + if("${name}" STREQUAL "core") # https://github.com/opencv/opencv/pull/19681 + if(OPENCV_FORCE_IPP_EXCLUDE_LIBS OR OPENCV_FORCE_IPP_EXCLUDE_LIBS_CORE + OR (UNIX AND NOT ANDROID AND NOT APPLE + AND (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") + ) + AND NOT OPENCV_SKIP_IPP_EXCLUDE_LIBS_CORE + ) + if(CMAKE_VERSION VERSION_LESS "3.13.0") + set(CMAKE_SHARED_LINKER_FLAGS "-Wl,--exclude-libs,${_filename} ${CMAKE_SHARED_LINKER_FLAGS}") + else() + target_link_options(ipp${name} INTERFACE "LINKER:--exclude-libs,${_filename}") + endif() + endif() + endif() list(APPEND IPP_LIBRARIES ipp${name}) if (NOT BUILD_SHARED_LIBS AND (HAVE_IPP_ICV OR ";${OPENCV_INSTALL_EXTERNAL_DEPENDENCIES};" MATCHES ";ipp;")) # CMake doesn't support "install(TARGETS ${IPP_PREFIX}${name} " command with imported targets From 40c0830b633b2eaf751a4766a0b6533152d30473 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Wed, 31 Mar 2021 09:35:11 +0000 Subject: [PATCH 06/11] videoio(avfoundation): add getCaptureDomain() --- modules/videoio/src/cap_avfoundation.mm | 3 ++- modules/videoio/src/cap_avfoundation_mac.mm | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/modules/videoio/src/cap_avfoundation.mm b/modules/videoio/src/cap_avfoundation.mm index 063e38bb54..1ff612e231 100644 --- a/modules/videoio/src/cap_avfoundation.mm +++ b/modules/videoio/src/cap_avfoundation.mm @@ -96,7 +96,7 @@ class CvCaptureCAM : public CvCapture { virtual double getProperty(int property_id) const; virtual bool setProperty(int property_id, double value); virtual int didStart(); - + int getCaptureDomain() /*const*/ CV_OVERRIDE { return cv::CAP_AVFOUNDATION; } private: AVCaptureSession *mCaptureSession; AVCaptureDeviceInput *mCaptureDeviceInput; @@ -137,6 +137,7 @@ public: virtual 
double getProperty(int property_id) const; virtual bool setProperty(int property_id, double value); virtual int didStart(); + int getCaptureDomain() /*const*/ CV_OVERRIDE { return cv::CAP_AVFOUNDATION; } private: AVAsset *mAsset; AVAssetTrack *mAssetTrack; diff --git a/modules/videoio/src/cap_avfoundation_mac.mm b/modules/videoio/src/cap_avfoundation_mac.mm index 6a52727d27..55872539ad 100644 --- a/modules/videoio/src/cap_avfoundation_mac.mm +++ b/modules/videoio/src/cap_avfoundation_mac.mm @@ -99,7 +99,7 @@ public: virtual double getProperty(int property_id) const; virtual bool setProperty(int property_id, double value); virtual int didStart(); - + int getCaptureDomain() /*const*/ CV_OVERRIDE { return cv::CAP_AVFOUNDATION; } private: AVCaptureSession *mCaptureSession; @@ -141,7 +141,7 @@ public: virtual double getProperty(int property_id) const; virtual bool setProperty(int property_id, double value); virtual int didStart(); - + int getCaptureDomain() /*const*/ CV_OVERRIDE { return cv::CAP_AVFOUNDATION; } private: AVAsset *mAsset; From a2a92999beef3eb6a922a369e1fdbd15d1a51a24 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Wed, 31 Mar 2021 10:16:51 +0000 Subject: [PATCH 07/11] core(arithm_op): workaround problem with scalars handling --- modules/core/src/arithm.cpp | 3 ++- modules/core/test/test_operations.cpp | 10 ++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index 41b281c8de..a329219be2 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -623,7 +623,8 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, (kind1 == _InputArray::MATX && (sz1 == Size(1,4) || sz1 == Size(1,1))) || (kind2 == _InputArray::MATX && (sz2 == Size(1,4) || sz2 == Size(1,1))) ) { - if( checkScalar(*psrc1, type2, kind1, kind2) ) + if ((type1 == CV_64F && (sz1.height == 1 || sz1.height == 4)) && + checkScalar(*psrc1, type2, kind1, kind2)) { // src1 is a 
scalar; swap it with src2 swap(psrc1, psrc2); diff --git a/modules/core/test/test_operations.cpp b/modules/core/test/test_operations.cpp index 645045674a..934028f3ae 100644 --- a/modules/core/test/test_operations.cpp +++ b/modules/core/test/test_operations.cpp @@ -1551,4 +1551,14 @@ TEST(Core_MatExpr, empty_check_15760) EXPECT_THROW(Mat c = Mat().cross(Mat()), cv::Exception); } +TEST(Core_Arithm, scalar_handling_19599) // https://github.com/opencv/opencv/issues/19599 (OpenCV 4.x+ only) +{ + Mat a(1, 1, CV_32F, Scalar::all(1)); + Mat b(4, 1, CV_64F, Scalar::all(1)); // MatExpr may convert Scalar to Mat + Mat c; + EXPECT_NO_THROW(cv::multiply(a, b, c)); + EXPECT_EQ(1, c.cols); + EXPECT_EQ(1, c.rows); +} + }} // namespace From b697b3162f3d5076ee0bb7e96ad1268d98259ff5 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Wed, 31 Mar 2021 12:16:42 +0000 Subject: [PATCH 08/11] videoio(mjpeg): disable parallel encoder --- modules/videoio/src/cap_mjpeg_encoder.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/modules/videoio/src/cap_mjpeg_encoder.cpp b/modules/videoio/src/cap_mjpeg_encoder.cpp index 7f4c8a6e9d..96ec488bc0 100644 --- a/modules/videoio/src/cap_mjpeg_encoder.cpp +++ b/modules/videoio/src/cap_mjpeg_encoder.cpp @@ -1167,6 +1167,8 @@ public: fdct_qtab(_fdct_qtab), cat_table(_cat_table) { +#if 0 // disable parallel processing due to buffer overrun bug: https://github.com/opencv/opencv/issues/19634 + //empirically found value. if number of pixels is less than that value there is no sense to parallelize it. 
const int min_pixels_count = 96*96; @@ -1176,6 +1178,7 @@ public: { if(height*width > min_pixels_count) { + const int default_stripes_count = 4; stripes_count = default_stripes_count; } } @@ -1191,6 +1194,12 @@ public: stripes_count = std::min(stripes_count, max_stripes); +#else + if (nstripes > 1) + CV_LOG_ONCE_WARNING(NULL, "VIDEOIO/MJPEG: parallel processing is disabled: https://github.com/opencv/opencv/issues/19634"); + stripes_count = 1; +#endif + m_buffer_list.allocate_buffers(stripes_count, (height*width*2)/stripes_count); } @@ -1370,11 +1379,8 @@ private: const short (&fdct_qtab)[2][64]; const uchar* cat_table; int stripes_count; - static const int default_stripes_count; }; -const int MjpegEncoder::default_stripes_count = 4; - void MotionJpegWriter::writeFrameData( const uchar* data, int step, int colorspace, int input_channels ) { //double total_cvt = 0, total_dct = 0; From 8069a6b4f878b93fe49f6719b4cef8a155d8a401 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Wed, 31 Mar 2021 13:37:19 +0000 Subject: [PATCH 09/11] core(IPP): disable some ippsMagnitude_32f calls --- modules/core/src/mathfuncs_core.dispatch.cpp | 23 +++++++++++++++++++- modules/core/test/test_arithm.cpp | 12 ++++++++++ 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/modules/core/src/mathfuncs_core.dispatch.cpp b/modules/core/src/mathfuncs_core.dispatch.cpp index e48f84ebbe..3c53ab1c38 100644 --- a/modules/core/src/mathfuncs_core.dispatch.cpp +++ b/modules/core/src/mathfuncs_core.dispatch.cpp @@ -7,6 +7,10 @@ #include "mathfuncs_core.simd.hpp" #include "mathfuncs_core.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content + +#define IPP_DISABLE_MAGNITUDE_32F 1 // accuracy: https://github.com/opencv/opencv/issues/19506 + + namespace cv { namespace hal { ///////////////////////////////////// ATAN2 //////////////////////////////////// @@ -44,8 +48,25 @@ void magnitude32f(const float* x, const float* y, float* mag, int 
len) CV_INSTRUMENT_REGION(); CALL_HAL(magnitude32f, cv_hal_magnitude32f, x, y, mag, len); + +#ifdef HAVE_IPP + bool allowIPP = true; +#ifdef IPP_DISABLE_MAGNITUDE_32F + if (cv::ipp::getIppTopFeatures() & ( +#if IPP_VERSION_X100 >= 201700 + ippCPUID_AVX512F | +#endif + ippCPUID_AVX2) + ) + { + allowIPP = (len & 7) == 0; + } +#endif + // SSE42 performance issues - CV_IPP_RUN(IPP_VERSION_X100 > 201800 || cv::ipp::getIppTopFeatures() != ippCPUID_SSE42, CV_INSTRUMENT_FUN_IPP(ippsMagnitude_32f, x, y, mag, len) >= 0); + CV_IPP_RUN((IPP_VERSION_X100 > 201800 || cv::ipp::getIppTopFeatures() != ippCPUID_SSE42) && allowIPP, + CV_INSTRUMENT_FUN_IPP(ippsMagnitude_32f, x, y, mag, len) >= 0); +#endif CV_CPU_DISPATCH(magnitude32f, (x, y, mag, len), CV_CPU_DISPATCH_MODES_ALL); diff --git a/modules/core/test/test_arithm.cpp b/modules/core/test/test_arithm.cpp index 78ad907603..2746feb2f2 100644 --- a/modules/core/test/test_arithm.cpp +++ b/modules/core/test/test_arithm.cpp @@ -2376,4 +2376,16 @@ TEST(Core_MinMaxIdx, rows_overflow) } +TEST(Core_Magnitude, regression_19506) +{ + for (int N = 1; N <= 64; ++N) + { + Mat a(1, N, CV_32FC1, Scalar::all(1e-20)); + Mat res; + magnitude(a, a, res); + EXPECT_LE(cvtest::norm(res, NORM_L1), 1e-15) << N; + } +} + + }} // namespace From d651ff8d6b9669b8214876568694c1dca7c2ef8a Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Wed, 31 Mar 2021 14:17:45 +0000 Subject: [PATCH 10/11] python: exception-free pyopencv_to() wrapper --- modules/python/src2/cv2.cpp | 28 ++++++++++++++++++++++++++-- modules/python/src2/gen2.py | 12 ++++++------ 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/modules/python/src2/cv2.cpp b/modules/python/src2/cv2.cpp index 8bb15cd43f..0806e03552 100644 --- a/modules/python/src2/cv2.cpp +++ b/modules/python/src2/cv2.cpp @@ -44,6 +44,7 @@ #define CV_HAS_CONVERSION_ERROR(x) (((x) == -1) && PyErr_Occurred()) +static PyObject* opencv_error = NULL; class ArgInfo { @@ -66,14 +67,32 @@ struct PyOpenCV_Converter 
//static inline PyObject* from(const T& src); }; +// exception-safe pyopencv_to +template static +bool pyopencv_to_safe(PyObject* obj, _Tp& value, const ArgInfo& info) +{ + try + { + return pyopencv_to(obj, value, info); + } + catch (const std::exception &e) + { + PyErr_SetString(opencv_error, cv::format("Conversion error: %s, what: %s", info.name, e.what()).c_str()); + return false; + } + catch (...) + { + PyErr_SetString(opencv_error, cv::format("Conversion error: %s", info.name).c_str()); + return false; + } +} + template static bool pyopencv_to(PyObject* obj, T& p, const ArgInfo& info) { return PyOpenCV_Converter::to(obj, p, info); } template static PyObject* pyopencv_from(const T& src) { return PyOpenCV_Converter::from(src); } -static PyObject* opencv_error = NULL; - static bool isPythonBindingsDebugEnabled() { static bool param_debug = cv::utils::getConfigurationParameterBool("OPENCV_PYTHON_DEBUG", false); @@ -211,6 +230,11 @@ catch (const std::exception &e) \ { \ PyErr_SetString(opencv_error, e.what()); \ return 0; \ +} \ +catch (...) 
\ +{ \ + PyErr_SetString(opencv_error, "Unknown C++ exception from OpenCV code"); \ + return 0; \ } using namespace cv; diff --git a/modules/python/src2/gen2.py b/modules/python/src2/gen2.py index 4acca07ada..b25647c4b3 100755 --- a/modules/python/src2/gen2.py +++ b/modules/python/src2/gen2.py @@ -47,7 +47,7 @@ gen_template_func_body = Template("""$code_decl gen_template_mappable = Template(""" { ${mappable} _src; - if (pyopencv_to(src, _src, info)) + if (pyopencv_to_safe(src, _src, info)) { return cv_mappable_to(_src, dst); } @@ -91,7 +91,7 @@ gen_template_set_prop_from_map = Template(""" if( PyMapping_HasKeyString(src, (char*)"$propname") ) { tmp = PyMapping_GetItemString(src, (char*)"$propname"); - ok = tmp && pyopencv_to(tmp, dst.$propname, ArgInfo("$propname", false)); + ok = tmp && pyopencv_to_safe(tmp, dst.$propname, ArgInfo("$propname", false)); Py_DECREF(tmp); if(!ok) return false; }""") @@ -145,7 +145,7 @@ static int pyopencv_${name}_set_${member}(pyopencv_${name}_t* p, PyObject *value PyErr_SetString(PyExc_TypeError, "Cannot delete the ${member} attribute"); return -1; } - return pyopencv_to(value, p->v${access}${member}, ArgInfo("value", false)) ? 0 : -1; + return pyopencv_to_safe(value, p->v${access}${member}, ArgInfo("value", false)) ? 0 : -1; } """) @@ -163,7 +163,7 @@ static int pyopencv_${name}_set_${member}(pyopencv_${name}_t* p, PyObject *value failmsgp("Incorrect type of object (must be '${name}' or its derivative)"); return -1; } - return pyopencv_to(value, _self_${access}${member}, ArgInfo("value", false)) ? 0 : -1; + return pyopencv_to_safe(value, _self_${access}${member}, ArgInfo("value", false)) ? 
0 : -1; } """) @@ -281,7 +281,7 @@ class ClassInfo(object): code = "static bool pyopencv_to(PyObject* src, %s& dst, const ArgInfo& info)\n{\n PyObject* tmp;\n bool ok;\n" % (self.cname) code += "".join([gen_template_set_prop_from_map.substitute(propname=p.name,proptype=p.tp) for p in self.props]) if self.base: - code += "\n return pyopencv_to(src, (%s&)dst, info);\n}\n" % all_classes[self.base].cname + code += "\n return pyopencv_to_safe(src, (%s&)dst, info);\n}\n" % all_classes[self.base].cname else: code += "\n return true;\n}\n" return code @@ -665,7 +665,7 @@ class FuncInfo(object): if a.tp == 'char': code_cvt_list.append("convert_to_char(pyobj_%s, &%s, %s)" % (a.name, a.name, a.crepr())) else: - code_cvt_list.append("pyopencv_to(pyobj_%s, %s, %s)" % (a.name, a.name, a.crepr())) + code_cvt_list.append("pyopencv_to_safe(pyobj_%s, %s, %s)" % (a.name, a.name, a.crepr())) all_cargs.append([arg_type_info, parse_name]) From 2b86de217a7dabe56888ede236dd7229127eb33e Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Wed, 31 Mar 2021 23:14:45 +0000 Subject: [PATCH 11/11] cmake: fix order of headers - cmake uses filesystem's order which may vary - unpredictable headers order may cause build failures (primary bindings) --- cmake/OpenCVModule.cmake | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cmake/OpenCVModule.cmake b/cmake/OpenCVModule.cmake index efb6ca8fa9..bcbca833f2 100644 --- a/cmake/OpenCVModule.cmake +++ b/cmake/OpenCVModule.cmake @@ -876,7 +876,9 @@ endmacro() macro(_ocv_create_module) ocv_compiler_optimization_process_sources(OPENCV_MODULE_${the_module}_SOURCES OPENCV_MODULE_${the_module}_DEPS_EXT ${the_module}) - set(OPENCV_MODULE_${the_module}_HEADERS ${OPENCV_MODULE_${the_module}_HEADERS} CACHE INTERNAL "List of header files for ${the_module}") + set(__module_headers ${OPENCV_MODULE_${the_module}_HEADERS}) + list(SORT __module_headers) # fix headers order, useful for bindings + set(OPENCV_MODULE_${the_module}_HEADERS 
${__module_headers} CACHE INTERNAL "List of header files for ${the_module}") set(OPENCV_MODULE_${the_module}_SOURCES ${OPENCV_MODULE_${the_module}_SOURCES} CACHE INTERNAL "List of source files for ${the_module}") # The condition we ought to be testing here is whether ocv_add_precompiled_headers will