Merge pull request #18952 from terfendail:wui_doc

* Updated UI documentation to address WUI * Added documentation for vx_ calls * Removed vx_store operation overload * Doxyfile updated to enable wide UI * Enable doxygen documentation for vx_ WUI functions * Wide intrinsics definition rework * core: fix SIMD C++ emulator build (supports 128-bit only)
4 years ago · aab62aa6dd
parent ac9f3a1242
commit aab62aa6dd
6 changed files with 912 additions and 157 deletions
--- a/doc/Doxyfile.in
+++ b/doc/Doxyfile.in
@ -249,6 +249,12 @@ PREDEFINED             = __cplusplus=1 \
                         CV_DEFAULT(x)=" = x" \
                         CV_NEON=1 \
                         CV_SSE2=1 \
+                         CV_SIMD128=1 \
+                         CV_SIMD256=1 \
+                         CV_SIMD512=1 \
+                         CV_SIMD128_64F=1 \
+                         CV_SIMD256_64F=1 \
+                         CV_SIMD512_64F=1 \
                         CV__DEBUG_NS_BEGIN= \
                         CV__DEBUG_NS_END= \
                         CV_DEPRECATED_EXTERNAL= \
--- a/modules/core/include/opencv2/core/hal/intrin.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin.hpp
@ -104,7 +104,7 @@ template<typename _Tp> struct V_TypeTraits
 {
 };

-#define CV_INTRIN_DEF_TYPE_TRAITS(type, int_type_, uint_type_, abs_type_, w_type_, q_type_, sum_type_, nlanes128_) \
+#define CV_INTRIN_DEF_TYPE_TRAITS(type, int_type_, uint_type_, abs_type_, w_type_, q_type_, sum_type_) \
    template<> struct V_TypeTraits<type> \
    { \
        typedef type value_type; \
@ -114,7 +114,6 @@ template<typename _Tp> struct V_TypeTraits
        typedef w_type_ w_type; \
        typedef q_type_ q_type; \
        typedef sum_type_ sum_type; \
-        enum { nlanes128 = nlanes128_ }; \
    \
        static inline int_type reinterpret_int(type x) \
        { \
@ -131,7 +130,7 @@ template<typename _Tp> struct V_TypeTraits
        } \
    }

-#define CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(type, int_type_, uint_type_, abs_type_, w_type_, sum_type_, nlanes128_) \
+#define CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(type, int_type_, uint_type_, abs_type_, w_type_, sum_type_) \
    template<> struct V_TypeTraits<type> \
    { \
        typedef type value_type; \
@ -140,7 +139,6 @@ template<typename _Tp> struct V_TypeTraits
        typedef uint_type_ uint_type; \
        typedef w_type_ w_type; \
        typedef sum_type_ sum_type; \
-        enum { nlanes128 = nlanes128_ }; \
    \
        static inline int_type reinterpret_int(type x) \
        { \
@ -157,16 +155,16 @@ template<typename _Tp> struct V_TypeTraits
        } \
    }

-CV_INTRIN_DEF_TYPE_TRAITS(uchar, schar, uchar, uchar, ushort, unsigned, unsigned, 16);
-CV_INTRIN_DEF_TYPE_TRAITS(schar, schar, uchar, uchar, short, int, int, 16);
-CV_INTRIN_DEF_TYPE_TRAITS(ushort, short, ushort, ushort, unsigned, uint64, unsigned, 8);
-CV_INTRIN_DEF_TYPE_TRAITS(short, short, ushort, ushort, int, int64, int, 8);
-CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(unsigned, int, unsigned, unsigned, uint64, unsigned, 4);
-CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(int, int, unsigned, unsigned, int64, int, 4);
-CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(float, int, unsigned, float, double, float, 4);
-CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(uint64, int64, uint64, uint64, void, uint64, 2);
-CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(int64, int64, uint64, uint64, void, int64, 2);
-CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(double, int64, uint64, double, void, double, 2);
+CV_INTRIN_DEF_TYPE_TRAITS(uchar, schar, uchar, uchar, ushort, unsigned, unsigned);
+CV_INTRIN_DEF_TYPE_TRAITS(schar, schar, uchar, uchar, short, int, int);
+CV_INTRIN_DEF_TYPE_TRAITS(ushort, short, ushort, ushort, unsigned, uint64, unsigned);
+CV_INTRIN_DEF_TYPE_TRAITS(short, short, ushort, ushort, int, int64, int);
+CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(unsigned, int, unsigned, unsigned, uint64, unsigned);
+CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(int, int, unsigned, unsigned, int64, int);
+CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(float, int, unsigned, float, double, float);
+CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(uint64, int64, uint64, uint64, void, uint64);
+CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(int64, int64, uint64, uint64, void, int64);
+CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(double, int64, uint64, double, void, double);

 #ifndef CV_DOXYGEN

@ -310,54 +308,6 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN

 //==================================================================================================

-#define CV_INTRIN_DEFINE_WIDE_INTRIN(typ, vtyp, short_typ, prefix, loadsfx) \
-    inline vtyp vx_setall_##short_typ(typ v) { return prefix##_setall_##short_typ(v); } \
-    inline vtyp vx_setzero_##short_typ() { return prefix##_setzero_##short_typ(); } \
-    inline vtyp vx_##loadsfx(const typ* ptr) { return prefix##_##loadsfx(ptr); } \
-    inline vtyp vx_##loadsfx##_aligned(const typ* ptr) { return prefix##_##loadsfx##_aligned(ptr); } \
-    inline vtyp vx_##loadsfx##_low(const typ* ptr) { return prefix##_##loadsfx##_low(ptr); } \
-    inline vtyp vx_##loadsfx##_halves(const typ* ptr0, const typ* ptr1) { return prefix##_##loadsfx##_halves(ptr0, ptr1); } \
-    inline void vx_store(typ* ptr, const vtyp& v) { return v_store(ptr, v); } \
-    inline void vx_store_aligned(typ* ptr, const vtyp& v) { return v_store_aligned(ptr, v); } \
-    inline vtyp vx_lut(const typ* ptr, const int* idx) { return prefix##_lut(ptr, idx); } \
-    inline vtyp vx_lut_pairs(const typ* ptr, const int* idx) { return prefix##_lut_pairs(ptr, idx); }
-
-#define CV_INTRIN_DEFINE_WIDE_LUT_QUAD(typ, vtyp, prefix) \
-    inline vtyp vx_lut_quads(const typ* ptr, const int* idx) { return prefix##_lut_quads(ptr, idx); }
-
-#define CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(typ, wtyp, prefix) \
-    inline wtyp vx_load_expand(const typ* ptr) { return prefix##_load_expand(ptr); }
-
-#define CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND_Q(typ, qtyp, prefix) \
-    inline qtyp vx_load_expand_q(const typ* ptr) { return prefix##_load_expand_q(ptr); }
-
-#define CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(typ, vtyp, short_typ, wtyp, qtyp, prefix, loadsfx) \
-    CV_INTRIN_DEFINE_WIDE_INTRIN(typ, vtyp, short_typ, prefix, loadsfx) \
-    CV_INTRIN_DEFINE_WIDE_LUT_QUAD(typ, vtyp, prefix) \
-    CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(typ, wtyp, prefix) \
-    CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND_Q(typ, qtyp, prefix)
-
-#define CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(prefix) \
-    CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(uchar, v_uint8, u8, v_uint16, v_uint32, prefix, load) \
-    CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(schar, v_int8, s8, v_int16, v_int32, prefix, load) \
-    CV_INTRIN_DEFINE_WIDE_INTRIN(ushort, v_uint16, u16, prefix, load) \
-    CV_INTRIN_DEFINE_WIDE_LUT_QUAD(ushort, v_uint16, prefix) \
-    CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(ushort, v_uint32, prefix) \
-    CV_INTRIN_DEFINE_WIDE_INTRIN(short, v_int16, s16, prefix, load) \
-    CV_INTRIN_DEFINE_WIDE_LUT_QUAD(short, v_int16, prefix) \
-    CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(short, v_int32, prefix) \
-    CV_INTRIN_DEFINE_WIDE_INTRIN(int, v_int32, s32, prefix, load) \
-    CV_INTRIN_DEFINE_WIDE_LUT_QUAD(int, v_int32, prefix) \
-    CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(int, v_int64, prefix) \
-    CV_INTRIN_DEFINE_WIDE_INTRIN(unsigned, v_uint32, u32, prefix, load) \
-    CV_INTRIN_DEFINE_WIDE_LUT_QUAD(unsigned, v_uint32, prefix) \
-    CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(unsigned, v_uint64, prefix) \
-    CV_INTRIN_DEFINE_WIDE_INTRIN(float, v_float32, f32, prefix, load) \
-    CV_INTRIN_DEFINE_WIDE_LUT_QUAD(float, v_float32, prefix) \
-    CV_INTRIN_DEFINE_WIDE_INTRIN(int64, v_int64, s64, prefix, load) \
-    CV_INTRIN_DEFINE_WIDE_INTRIN(uint64, v_uint64, u64, prefix, load) \
-    CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(float16_t, v_float32, prefix)
-
 template<typename _Tp> struct V_RegTraits
 {
 };
@ -417,6 +367,7 @@ template<typename _Tp> struct V_RegTraits
    CV_DEF_REG_TRAITS(v512, v_int64x8, int64, s64, v_uint64x8, void, void, v_int64x8, void);
    CV_DEF_REG_TRAITS(v512, v_float64x8, double, f64, v_float64x8, void, void, v_int64x8, v_int32x16);
 #endif
+//! @endcond

 #if CV_SIMD512 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 512)
 #define CV__SIMD_NAMESPACE simd512
@ -425,21 +376,33 @@ namespace CV__SIMD_NAMESPACE {
    #define CV_SIMD_64F CV_SIMD512_64F
    #define CV_SIMD_FP16 CV_SIMD512_FP16
    #define CV_SIMD_WIDTH 64
+//! @addtogroup core_hal_intrin
+//! @{
+    //! @brief Maximum available vector register capacity 8-bit unsigned integer values
    typedef v_uint8x64    v_uint8;
+    //! @brief Maximum available vector register capacity 8-bit signed integer values
    typedef v_int8x64     v_int8;
+    //! @brief Maximum available vector register capacity 16-bit unsigned integer values
    typedef v_uint16x32   v_uint16;
+    //! @brief Maximum available vector register capacity 16-bit signed integer values
    typedef v_int16x32    v_int16;
+    //! @brief Maximum available vector register capacity 32-bit unsigned integer values
    typedef v_uint32x16   v_uint32;
+    //! @brief Maximum available vector register capacity 32-bit signed integer values
    typedef v_int32x16    v_int32;
+    //! @brief Maximum available vector register capacity 64-bit unsigned integer values
    typedef v_uint64x8    v_uint64;
+    //! @brief Maximum available vector register capacity 64-bit signed integer values
    typedef v_int64x8     v_int64;
+    //! @brief Maximum available vector register capacity 32-bit floating point values (single precision)
    typedef v_float32x16  v_float32;
-    CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v512)
-#if CV_SIMD512_64F
+    #if CV_SIMD512_64F
+    //! @brief Maximum available vector register capacity 64-bit floating point values (double precision)
    typedef v_float64x8   v_float64;
-    CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v512, load)
-#endif
-        inline void vx_cleanup() { v512_cleanup(); }
+    #endif
+//! @}
+
+    #define VXPREFIX(func) v512##func
 } // namespace
 using namespace CV__SIMD_NAMESPACE;
 #elif CV_SIMD256 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 256)
@ -449,21 +412,33 @@ namespace CV__SIMD_NAMESPACE {
    #define CV_SIMD_64F CV_SIMD256_64F
    #define CV_SIMD_FP16 CV_SIMD256_FP16
    #define CV_SIMD_WIDTH 32
+//! @addtogroup core_hal_intrin
+//! @{
+    //! @brief Maximum available vector register capacity 8-bit unsigned integer values
    typedef v_uint8x32   v_uint8;
+    //! @brief Maximum available vector register capacity 8-bit signed integer values
    typedef v_int8x32    v_int8;
+    //! @brief Maximum available vector register capacity 16-bit unsigned integer values
    typedef v_uint16x16  v_uint16;
+    //! @brief Maximum available vector register capacity 16-bit signed integer values
    typedef v_int16x16   v_int16;
+    //! @brief Maximum available vector register capacity 32-bit unsigned integer values
    typedef v_uint32x8   v_uint32;
+    //! @brief Maximum available vector register capacity 32-bit signed integer values
    typedef v_int32x8    v_int32;
+    //! @brief Maximum available vector register capacity 64-bit unsigned integer values
    typedef v_uint64x4   v_uint64;
+    //! @brief Maximum available vector register capacity 64-bit signed integer values
    typedef v_int64x4    v_int64;
+    //! @brief Maximum available vector register capacity 32-bit floating point values (single precision)
    typedef v_float32x8  v_float32;
-    CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v256)
    #if CV_SIMD256_64F
+    //! @brief Maximum available vector register capacity 64-bit floating point values (double precision)
    typedef v_float64x4  v_float64;
-    CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v256, load)
    #endif
-    inline void vx_cleanup() { v256_cleanup(); }
+//! @}
+
+    #define VXPREFIX(func) v256##func
 } // namespace
 using namespace CV__SIMD_NAMESPACE;
 #elif (CV_SIMD128 || CV_SIMD128_CPP) && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 128)
@ -476,25 +451,214 @@ namespace CV__SIMD_NAMESPACE {
    #define CV_SIMD CV_SIMD128
    #define CV_SIMD_64F CV_SIMD128_64F
    #define CV_SIMD_WIDTH 16
+//! @addtogroup core_hal_intrin
+//! @{
+    //! @brief Maximum available vector register capacity 8-bit unsigned integer values
    typedef v_uint8x16  v_uint8;
+    //! @brief Maximum available vector register capacity 8-bit signed integer values
    typedef v_int8x16   v_int8;
+    //! @brief Maximum available vector register capacity 16-bit unsigned integer values
    typedef v_uint16x8  v_uint16;
+    //! @brief Maximum available vector register capacity 16-bit signed integer values
    typedef v_int16x8   v_int16;
+    //! @brief Maximum available vector register capacity 32-bit unsigned integer values
    typedef v_uint32x4  v_uint32;
+    //! @brief Maximum available vector register capacity 32-bit signed integer values
    typedef v_int32x4   v_int32;
+    //! @brief Maximum available vector register capacity 64-bit unsigned integer values
    typedef v_uint64x2  v_uint64;
+    //! @brief Maximum available vector register capacity 64-bit signed integer values
    typedef v_int64x2   v_int64;
+    //! @brief Maximum available vector register capacity 32-bit floating point values (single precision)
    typedef v_float32x4 v_float32;
-    CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v)
    #if CV_SIMD128_64F
+    //! @brief Maximum available vector register capacity 64-bit floating point values (double precision)
    typedef v_float64x2 v_float64;
-    CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v, load)
    #endif
-    inline void vx_cleanup() { v_cleanup(); }
+//! @}
+
+    #define VXPREFIX(func) v##func
 } // namespace
 using namespace CV__SIMD_NAMESPACE;
 #endif

+namespace CV__SIMD_NAMESPACE {
+//! @addtogroup core_hal_intrin
+//! @{
+    //! @name Wide init with value
+    //! @{
+    //! @brief Create maximum available capacity vector with elements set to a specific value
+    inline v_uint8 vx_setall_u8(uchar v) { return VXPREFIX(_setall_u8)(v); }
+    inline v_int8 vx_setall_s8(schar v) { return VXPREFIX(_setall_s8)(v); }
+    inline v_uint16 vx_setall_u16(ushort v) { return VXPREFIX(_setall_u16)(v); }
+    inline v_int16 vx_setall_s16(short v) { return VXPREFIX(_setall_s16)(v); }
+    inline v_int32 vx_setall_s32(int v) { return VXPREFIX(_setall_s32)(v); }
+    inline v_uint32 vx_setall_u32(unsigned v) { return VXPREFIX(_setall_u32)(v); }
+    inline v_float32 vx_setall_f32(float v) { return VXPREFIX(_setall_f32)(v); }
+    inline v_int64 vx_setall_s64(int64 v) { return VXPREFIX(_setall_s64)(v); }
+    inline v_uint64 vx_setall_u64(uint64 v) { return VXPREFIX(_setall_u64)(v); }
+#if CV_SIMD_64F
+    inline v_float64 vx_setall_f64(double v) { return VXPREFIX(_setall_f64)(v); }
+#endif
+    //! @}
+
+    //! @name Wide init with zero
+    //! @{
+    //! @brief Create maximum available capacity vector with elements set to zero
+    inline v_uint8 vx_setzero_u8() { return VXPREFIX(_setzero_u8)(); }
+    inline v_int8 vx_setzero_s8() { return VXPREFIX(_setzero_s8)(); }
+    inline v_uint16 vx_setzero_u16() { return VXPREFIX(_setzero_u16)(); }
+    inline v_int16 vx_setzero_s16() { return VXPREFIX(_setzero_s16)(); }
+    inline v_int32 vx_setzero_s32() { return VXPREFIX(_setzero_s32)(); }
+    inline v_uint32 vx_setzero_u32() { return VXPREFIX(_setzero_u32)(); }
+    inline v_float32 vx_setzero_f32() { return VXPREFIX(_setzero_f32)(); }
+    inline v_int64 vx_setzero_s64() { return VXPREFIX(_setzero_s64)(); }
+    inline v_uint64 vx_setzero_u64() { return VXPREFIX(_setzero_u64)(); }
+#if CV_SIMD_64F
+    inline v_float64 vx_setzero_f64() { return VXPREFIX(_setzero_f64)(); }
+#endif
+    //! @}
+
+    //! @name Wide load from memory
+    //! @{
+    //! @brief Load maximum available capacity register contents from memory
+    inline v_uint8 vx_load(const uchar * ptr) { return VXPREFIX(_load)(ptr); }
+    inline v_int8 vx_load(const schar * ptr) { return VXPREFIX(_load)(ptr); }
+    inline v_uint16 vx_load(const ushort * ptr) { return VXPREFIX(_load)(ptr); }
+    inline v_int16 vx_load(const short * ptr) { return VXPREFIX(_load)(ptr); }
+    inline v_int32 vx_load(const int * ptr) { return VXPREFIX(_load)(ptr); }
+    inline v_uint32 vx_load(const unsigned * ptr) { return VXPREFIX(_load)(ptr); }
+    inline v_float32 vx_load(const float * ptr) { return VXPREFIX(_load)(ptr); }
+    inline v_int64 vx_load(const int64 * ptr) { return VXPREFIX(_load)(ptr); }
+    inline v_uint64 vx_load(const uint64 * ptr) { return VXPREFIX(_load)(ptr); }
+#if CV_SIMD_64F
+    inline v_float64 vx_load(const double * ptr) { return VXPREFIX(_load)(ptr); }
+#endif
+    //! @}
+
+    //! @name Wide load from memory(aligned)
+    //! @{
+    //! @brief Load maximum available capacity register contents from memory(aligned)
+    inline v_uint8 vx_load_aligned(const uchar * ptr) { return VXPREFIX(_load_aligned)(ptr); }
+    inline v_int8 vx_load_aligned(const schar * ptr) { return VXPREFIX(_load_aligned)(ptr); }
+    inline v_uint16 vx_load_aligned(const ushort * ptr) { return VXPREFIX(_load_aligned)(ptr); }
+    inline v_int16 vx_load_aligned(const short * ptr) { return VXPREFIX(_load_aligned)(ptr); }
+    inline v_int32 vx_load_aligned(const int * ptr) { return VXPREFIX(_load_aligned)(ptr); }
+    inline v_uint32 vx_load_aligned(const unsigned * ptr) { return VXPREFIX(_load_aligned)(ptr); }
+    inline v_float32 vx_load_aligned(const float * ptr) { return VXPREFIX(_load_aligned)(ptr); }
+    inline v_int64 vx_load_aligned(const int64 * ptr) { return VXPREFIX(_load_aligned)(ptr); }
+    inline v_uint64 vx_load_aligned(const uint64 * ptr) { return VXPREFIX(_load_aligned)(ptr); }
+#if CV_SIMD_64F
+    inline v_float64 vx_load_aligned(const double * ptr) { return VXPREFIX(_load_aligned)(ptr); }
+#endif
+    //! @}
+
+    //! @name Wide load lower half from memory
+    //! @{
+    //! @brief Load lower half of maximum available capacity register from memory
+    inline v_uint8 vx_load_low(const uchar * ptr) { return VXPREFIX(_load_low)(ptr); }
+    inline v_int8 vx_load_low(const schar * ptr) { return VXPREFIX(_load_low)(ptr); }
+    inline v_uint16 vx_load_low(const ushort * ptr) { return VXPREFIX(_load_low)(ptr); }
+    inline v_int16 vx_load_low(const short * ptr) { return VXPREFIX(_load_low)(ptr); }
+    inline v_int32 vx_load_low(const int * ptr) { return VXPREFIX(_load_low)(ptr); }
+    inline v_uint32 vx_load_low(const unsigned * ptr) { return VXPREFIX(_load_low)(ptr); }
+    inline v_float32 vx_load_low(const float * ptr) { return VXPREFIX(_load_low)(ptr); }
+    inline v_int64 vx_load_low(const int64 * ptr) { return VXPREFIX(_load_low)(ptr); }
+    inline v_uint64 vx_load_low(const uint64 * ptr) { return VXPREFIX(_load_low)(ptr); }
+#if CV_SIMD_64F
+    inline v_float64 vx_load_low(const double * ptr) { return VXPREFIX(_load_low)(ptr); }
+#endif
+    //! @}
+
+    //! @name Wide load halfs from memory
+    //! @{
+    //! @brief Load maximum available capacity register contents from two memory blocks
+    inline v_uint8 vx_load_halves(const uchar * ptr0, const uchar * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
+    inline v_int8 vx_load_halves(const schar * ptr0, const schar * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
+    inline v_uint16 vx_load_halves(const ushort * ptr0, const ushort * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
+    inline v_int16 vx_load_halves(const short * ptr0, const short * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
+    inline v_int32 vx_load_halves(const int * ptr0, const int * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
+    inline v_uint32 vx_load_halves(const unsigned * ptr0, const unsigned * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
+    inline v_float32 vx_load_halves(const float * ptr0, const float * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
+    inline v_int64 vx_load_halves(const int64 * ptr0, const int64 * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
+    inline v_uint64 vx_load_halves(const uint64 * ptr0, const uint64 * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
+#if CV_SIMD_64F
+    inline v_float64 vx_load_halves(const double * ptr0, const double * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
+#endif
+    //! @}
+
+    //! @name Wide LUT of elements
+    //! @{
+    //! @brief Load maximum available capacity register contents with array elements by provided indexes
+    inline v_uint8 vx_lut(const uchar * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
+    inline v_int8 vx_lut(const schar * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
+    inline v_uint16 vx_lut(const ushort * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
+    inline v_int16 vx_lut(const short* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
+    inline v_int32 vx_lut(const int* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
+    inline v_uint32 vx_lut(const unsigned* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
+    inline v_float32 vx_lut(const float* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
+    inline v_int64 vx_lut(const int64 * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
+    inline v_uint64 vx_lut(const uint64 * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
+#if CV_SIMD_64F
+    inline v_float64 vx_lut(const double* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
+#endif
+    //! @}
+
+    //! @name Wide LUT of element pairs
+    //! @{
+    //! @brief Load maximum available capacity register contents with array element pairs by provided indexes
+    inline v_uint8 vx_lut_pairs(const uchar * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
+    inline v_int8 vx_lut_pairs(const schar * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
+    inline v_uint16 vx_lut_pairs(const ushort * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
+    inline v_int16 vx_lut_pairs(const short* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
+    inline v_int32 vx_lut_pairs(const int* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
+    inline v_uint32 vx_lut_pairs(const unsigned* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
+    inline v_float32 vx_lut_pairs(const float* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
+    inline v_int64 vx_lut_pairs(const int64 * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
+    inline v_uint64 vx_lut_pairs(const uint64 * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
+#if CV_SIMD_64F
+    inline v_float64 vx_lut_pairs(const double* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
+#endif
+    //! @}
+
+    //! @name Wide LUT of element quads
+    //! @{
+    //! @brief Load maximum available capacity register contents with array element quads by provided indexes
+    inline v_uint8 vx_lut_quads(const uchar* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
+    inline v_int8 vx_lut_quads(const schar* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
+    inline v_uint16 vx_lut_quads(const ushort* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
+    inline v_int16 vx_lut_quads(const short* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
+    inline v_int32 vx_lut_quads(const int* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
+    inline v_uint32 vx_lut_quads(const unsigned* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
+    inline v_float32 vx_lut_quads(const float* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
+    //! @}
+
+    //! @name Wide load with double expansion
+    //! @{
+    //! @brief Load maximum available capacity register contents from memory with double expand
+    inline v_uint16 vx_load_expand(const uchar * ptr) { return VXPREFIX(_load_expand)(ptr); }
+    inline v_int16 vx_load_expand(const schar * ptr) { return VXPREFIX(_load_expand)(ptr); }
+    inline v_uint32 vx_load_expand(const ushort * ptr) { return VXPREFIX(_load_expand)(ptr); }
+    inline v_int32 vx_load_expand(const short* ptr) { return VXPREFIX(_load_expand)(ptr); }
+    inline v_int64 vx_load_expand(const int* ptr) { return VXPREFIX(_load_expand)(ptr); }
+    inline v_uint64 vx_load_expand(const unsigned* ptr) { return VXPREFIX(_load_expand)(ptr); }
+    inline v_float32 vx_load_expand(const float16_t * ptr) { return VXPREFIX(_load_expand)(ptr); }
+    //! @}
+
+    //! @name Wide load with quad expansion
+    //! @{
+    //! @brief Load maximum available capacity register contents from memory with quad expand
+    inline v_uint32 vx_load_expand_q(const uchar * ptr) { return VXPREFIX(_load_expand_q)(ptr); }
+    inline v_int32 vx_load_expand_q(const schar * ptr) { return VXPREFIX(_load_expand_q)(ptr); }
+    //! @}
+
+    /** @brief SIMD processing state cleanup call */
+    inline void vx_cleanup() { VXPREFIX(_cleanup)(); }
+//! @}
+    #undef VXPREFIX
+} // namespace
+
+//! @cond IGNORED
 #ifndef CV_SIMD_64F
 #define CV_SIMD_64F 0
 #endif
--- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
--- a/modules/core/src/matrix_transform.cpp
+++ b/modules/core/src/matrix_transform.cpp
@ -536,8 +536,8 @@ flipVert( const uchar* src0, size_t sstep, uchar* dst0, size_t dstep, Size size,
            {
                v_int32 t0 = vx_load((int*)(src0 + i));
                v_int32 t1 = vx_load((int*)(src1 + i));
-                vx_store((int*)(dst0 + i), t1);
-                vx_store((int*)(dst1 + i), t0);
+                v_store((int*)(dst0 + i), t1);
+                v_store((int*)(dst1 + i), t0);
            }
        }
 #if CV_STRONG_ALIGNMENT
@ -547,8 +547,8 @@ flipVert( const uchar* src0, size_t sstep, uchar* dst0, size_t dstep, Size size,
            {
                v_uint8 t0 = vx_load(src0 + i);
                v_uint8 t1 = vx_load(src1 + i);
-                vx_store(dst0 + i, t1);
-                vx_store(dst1 + i, t0);
+                v_store(dst0 + i, t1);
+                v_store(dst1 + i, t0);
            }
        }
 #endif
--- a/modules/core/test/test_intrin_utils.hpp
+++ b/modules/core/test/test_intrin_utils.hpp
@ -1466,7 +1466,7 @@ template<typename R> struct TheTest
        R r1 = vx_load_expand((const cv::float16_t*)data.a.d);
        R r2(r1);
        EXPECT_EQ(1.0f, r1.get0());
-        vx_store(data_f32.a.d, r2);
+        v_store(data_f32.a.d, r2);
        EXPECT_EQ(-2.0f, data_f32.a.d[R::nlanes - 1]);

        out.a.clear();
--- a/modules/dnn/src/layers/convolution_layer.cpp
+++ b/modules/dnn/src/layers/convolution_layer.cpp
@ -1023,7 +1023,7 @@ public:
                                                             v20*vw20 + v21*vw21 + v22*vw22 + vbias;
                                            if (relu)
                                                vout = v_select(vout > z, vout, vout*vrc);
-                                            vx_store(outptr + out_j, vout);
+                                            v_store(outptr + out_j, vout);
                                        }
                                    }
                                #endif