Merge pull request #20412 from joy2myself:rvv-0.10

Bug fixes for the universal intrinsics of the RISC-V back-end

* Align universal intrinsic comparator behaviour with other platforms

Set all bits to one in the return value of the integer and floating-point comparators.
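
A minimal sketch (plain C++, no RVV intrinsics) of the mask convention the universal intrinsics expect on every backend: a lane that compares true holds an all-ones bit pattern, not the integer 1. For the float comparators the all-ones pattern has to be produced by bit-casting, which is what the union { uint64 u; double d; } in the patch does for f64.

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main()
    {
        // Integer lane: true -> 0xFFFFFFFF, false -> 0.
        int32_t a = 3, b = 7;
        int32_t lane = (a < b) ? -1 : 0;
        assert(static_cast<uint32_t>(lane) == 0xFFFFFFFFu);

        // Float lane: reinterpret the all-ones integer as a double so the
        // stored bit pattern is all ones (the value happens to be a NaN,
        // but only the bits matter for a mask).
        uint64_t ones = ~0ULL;
        double mask;
        std::memcpy(&mask, &ones, sizeof(mask));
        uint64_t back;
        std::memcpy(&back, &mask, sizeof(back));
        assert(back == ~0ULL);
        return 0;
    }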

* Fix v_pack_triplets, v_pack_store and v_pack_u_store
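
For reference, the intended lane selection of v_pack_triplets (drop every fourth lane) and the intended store width of v_pack_store / v_pack_u_store (only the narrowed half register) written out as plain scalar C++; the function names below are illustrative, not OpenCV API.

    #include <algorithm>
    #include <cstdint>

    // v_pack_triplets on 16 lanes keeps {0,1,2, 4,5,6, 8,9,10, 12,13,14}.
    // The old macro read src[4*i+2] for both the second and third output,
    // dropping the 4*i+1 element.
    void pack_triplets_ref(int8_t dst[12], const int8_t src[16])
    {
        for (int i = 0; i < 4; ++i)
        {
            dst[3*i]     = src[4*i];
            dst[3*i + 1] = src[4*i + 1];
            dst[3*i + 2] = src[4*i + 2];
        }
    }

    // v_pack_store narrows 8 x int16 to 8 x int8 and must write exactly
    // 8 bytes; storing a full 16-byte vector (as before the fix) would
    // overwrite the 8 bytes that follow ptr.
    void pack_store_ref(int8_t ptr[8], const int16_t a[8])
    {
        for (int i = 0; i < 8; ++i)
            ptr[i] = static_cast<int8_t>(
                std::min<int16_t>(127, std::max<int16_t>(-128, a[i])));
    }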

* Remove redundant CV_DECL_ALIGNED statements
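
A minimal sketch of why the 32-byte over-alignment was redundant for these stack temporaries, assuming CV_DECL_ALIGNED(n) expands to the usual alignas(n)/__attribute__((aligned(n))): the element-wise vle/vse accesses used in this file only need the natural alignment of the element type, which any plain local array already has.

    #include <cassert>
    #include <cstdint>

    int main()
    {
        int16_t buf[8];                 // naturally aligned to alignof(int16_t)
        assert(reinterpret_cast<uintptr_t>(buf) % alignof(int16_t) == 0);

        alignas(32) int16_t buf32[8];   // what CV_DECL_ALIGNED(32) asked for
        assert(reinterpret_cast<uintptr_t>(buf32) % 32 == 0);
        (void)buf; (void)buf32;
        return 0;
    }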

Co-authored-by: Alexander Smorkalov <alexander.smorkalov@xperience.ai>
Branch: pull/20455/head
Author: ZhangYin (committed 3 years ago via GitHub)
Parent: d29c7e7871
Commit: acc576658a
1 changed file: modules/core/include/opencv2/core/hal/intrin_rvv.hpp (299 changed lines)

@ -737,7 +737,7 @@ OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float64x2, vfloat64m1_t, double, 1, 2, 64, f6
inline v_int8x16 v_load_halves(const schar* ptr0, const schar* ptr1)
{
schar CV_DECL_ALIGNED(32) elems[16] =
schar elems[16] =
{
ptr0[0], ptr0[1], ptr0[2], ptr0[3], ptr0[4], ptr0[5], ptr0[6], ptr0[7],
ptr1[0], ptr1[1], ptr1[2], ptr1[3], ptr1[4], ptr1[5], ptr1[6], ptr1[7]
@ -748,7 +748,7 @@ inline v_uint8x16 v_load_halves(const uchar* ptr0, const uchar* ptr1) { return v
inline v_int16x8 v_load_halves(const short* ptr0, const short* ptr1)
{
short CV_DECL_ALIGNED(32) elems[8] =
short elems[8] =
{
ptr0[0], ptr0[1], ptr0[2], ptr0[3], ptr1[0], ptr1[1], ptr1[2], ptr1[3]
};
@ -758,7 +758,7 @@ inline v_uint16x8 v_load_halves(const ushort* ptr0, const ushort* ptr1) { return
inline v_int32x4 v_load_halves(const int* ptr0, const int* ptr1)
{
int CV_DECL_ALIGNED(32) elems[4] =
int elems[4] =
{
ptr0[0], ptr0[1], ptr1[0], ptr1[1]
};
@ -766,7 +766,7 @@ inline v_int32x4 v_load_halves(const int* ptr0, const int* ptr1)
}
inline v_float32x4 v_load_halves(const float* ptr0, const float* ptr1)
{
float CV_DECL_ALIGNED(32) elems[4] =
float elems[4] =
{
ptr0[0], ptr0[1], ptr1[0], ptr1[1]
};
@ -776,7 +776,7 @@ inline v_uint32x4 v_load_halves(const unsigned* ptr0, const unsigned* ptr1) { re
inline v_int64x2 v_load_halves(const int64* ptr0, const int64* ptr1)
{
int64 CV_DECL_ALIGNED(32) elems[2] =
int64 elems[2] =
{
ptr0[0], ptr1[0]
};
@ -787,7 +787,7 @@ inline v_uint64x2 v_load_halves(const uint64* ptr0, const uint64* ptr1) { return
#if CV_SIMD128_64F
inline v_float64x2 v_load_halves(const double* ptr0, const double* ptr1)
{
double CV_DECL_ALIGNED(32) elems[2] =
double elems[2] =
{
ptr0[0], ptr1[0]
};
@ -800,7 +800,7 @@ inline v_float64x2 v_load_halves(const double* ptr0, const double* ptr1)
inline v_int8x16 v_lut(const schar* tab, const int* idx)
{
schar CV_DECL_ALIGNED(32) elems[16] =
schar elems[16] =
{
tab[idx[ 0]],
tab[idx[ 1]],
@ -823,7 +823,7 @@ inline v_int8x16 v_lut(const schar* tab, const int* idx)
}
inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
{
schar CV_DECL_ALIGNED(32) elems[16] =
schar elems[16] =
{
tab[idx[0]],
tab[idx[0] + 1],
@ -846,7 +846,7 @@ inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
}
inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
{
schar CV_DECL_ALIGNED(32) elems[16] =
schar elems[16] =
{
tab[idx[0]],
tab[idx[0] + 1],
@ -873,7 +873,7 @@ inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reint
inline v_int16x8 v_lut(const short* tab, const int* idx)
{
short CV_DECL_ALIGNED(32) elems[8] =
short elems[8] =
{
tab[idx[0]],
tab[idx[1]],
@ -888,7 +888,7 @@ inline v_int16x8 v_lut(const short* tab, const int* idx)
}
inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
{
short CV_DECL_ALIGNED(32) elems[8] =
short elems[8] =
{
tab[idx[0]],
tab[idx[0] + 1],
@ -903,7 +903,7 @@ inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
}
inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
{
short CV_DECL_ALIGNED(32) elems[8] =
short elems[8] =
{
tab[idx[0]],
tab[idx[0] + 1],
@ -922,7 +922,7 @@ inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_rein
inline v_int32x4 v_lut(const int* tab, const int* idx)
{
int CV_DECL_ALIGNED(32) elems[4] =
int elems[4] =
{
tab[idx[0]],
tab[idx[1]],
@ -933,7 +933,7 @@ inline v_int32x4 v_lut(const int* tab, const int* idx)
}
inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
{
int CV_DECL_ALIGNED(32) elems[4] =
int elems[4] =
{
tab[idx[0]],
tab[idx[0] + 1],
@ -953,7 +953,7 @@ inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_re
inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
{
int64_t CV_DECL_ALIGNED(32) elems[2] =
int64_t elems[2] =
{
tab[idx[0]],
tab[idx[1]]
@ -969,7 +969,7 @@ inline v_uint64x2 v_lut_pairs(const uint64* tab, const int* idx) { return v_rein
inline v_float32x4 v_lut(const float* tab, const int* idx)
{
float CV_DECL_ALIGNED(32) elems[4] =
float elems[4] =
{
tab[idx[0]],
tab[idx[1]],
@ -980,7 +980,7 @@ inline v_float32x4 v_lut(const float* tab, const int* idx)
}
inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
{
float CV_DECL_ALIGNED(32) elems[4] =
float elems[4] =
{
tab[idx[0]],
tab[idx[0] + 1],
@ -996,7 +996,7 @@ inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
{
int CV_DECL_ALIGNED(32) elems[4] =
int elems[4] =
{
tab[v_extract_n<0>(idxvec)],
tab[v_extract_n<1>(idxvec)],
@ -1008,7 +1008,7 @@ inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
{
unsigned CV_DECL_ALIGNED(32) elems[4] =
unsigned elems[4] =
{
tab[v_extract_n<0>(idxvec)],
tab[v_extract_n<1>(idxvec)],
@ -1020,7 +1020,7 @@ inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
{
float CV_DECL_ALIGNED(32) elems[4] =
float elems[4] =
{
tab[v_extract_n<0>(idxvec)],
tab[v_extract_n<1>(idxvec)],
@ -1032,7 +1032,7 @@ inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
{
int CV_DECL_ALIGNED(32) idx[4];
int idx[4];
v_store_aligned(idx, idxvec);
x = v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
@ -1042,7 +1042,7 @@ inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_floa
#if CV_SIMD128_64F
inline v_float64x2 v_lut(const double* tab, const int* idx)
{
double CV_DECL_ALIGNED(32) elems[2] =
double elems[2] =
{
tab[idx[0]],
tab[idx[1]]
@ -1057,7 +1057,7 @@ inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
{
double CV_DECL_ALIGNED(32) elems[2] =
double elems[2] =
{
tab[v_extract_n<0>(idxvec)],
tab[v_extract_n<1>(idxvec)]
@ -1067,7 +1067,7 @@ inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
{
int CV_DECL_ALIGNED(32) idx[4] = {0};
int idx[4] = {0};
v_store_aligned(idx, idxvec);
x = v_float64x2(tab[idx[0]], tab[idx[1]]);
@ -1079,7 +1079,7 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_flo
inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
{
ushort CV_DECL_ALIGNED(32) ptr[16] = {0};
ushort ptr[16] = {0};
v_store(ptr, a);
v_store(ptr + 8, b);
return v_uint8x16(vnsrl_wx_u8m1(vle16_v_u16m2(ptr, 16), 0, 16));
@ -1088,7 +1088,7 @@ inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
const v_uint32x4& c, const v_uint32x4& d)
{
unsigned CV_DECL_ALIGNED(32) ptr[16] = {0};
unsigned ptr[16] = {0};
v_store(ptr, a);
v_store(ptr + 4, b);
v_store(ptr + 8, c);
@ -1100,7 +1100,7 @@ inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uin
const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
const v_uint64x2& g, const v_uint64x2& h)
{
uint64 CV_DECL_ALIGNED(32) ptr[16] = {0};
uint64 ptr[16] = {0};
v_store(ptr, a);
v_store(ptr + 2, b);
v_store(ptr + 4, c);
@ -1279,13 +1279,15 @@ OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int64x2, i64, 2)
#define OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, op, intrin, suffix, vl) \
inline _Tpvec operator op (const _Tpvec& a, const _Tpvec& b) \
{ \
return _Tpvec(vmerge_vxm_##suffix##m1(intrin(a, b, vl), vmv_v_x_##suffix##m1(0, vl), 1, vl)); \
uint64_t ones = -1; \
return _Tpvec(vmerge_vxm_##suffix##m1(intrin(a, b, vl), vmv_v_x_##suffix##m1(0, vl), ones, vl)); \
}
#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, op, intrin, suffix, vl) \
inline _Tpvec operator op (const _Tpvec& a, const _Tpvec& b) \
{ \
return _Tpvec(vfmerge_vfm_##suffix##m1(intrin(a, b, vl), vfmv_v_f_##suffix##m1(0, vl), 1, vl)); \
union { uint64 u; double d; } ones; ones.u = -1; \
return _Tpvec(vfmerge_vfm_##suffix##m1(intrin(a, b, vl), vfmv_v_f_##suffix##m1(0, vl), ones.d, vl)); \
}
#define OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(_Tpvec, suffix, width, vl) \
@ -1441,7 +1443,7 @@ OPENCV_HAL_IMPL_RVV_REDUCE(v_float32x4, max, float, f32, 4, fredmax)
inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
const v_float32x4& c, const v_float32x4& d)
{
float CV_DECL_ALIGNED(32) elems[4] =
float elems[4] =
{
v_reduce_sum(a),
v_reduce_sum(b),
@ -1746,9 +1748,9 @@ inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{
double CV_DECL_ALIGNED(32) ptr[4] = {0};
double ptr[4] = {0};
vse64_v_f64m2(ptr, vfwcvt_f_x_v_f64m2(a, 4), 4);
double CV_DECL_ALIGNED(32) elems[2] =
double elems[2] =
{
ptr[0], ptr[1]
};
@ -1757,9 +1759,9 @@ inline v_float64x2 v_cvt_f64(const v_int32x4& a)
inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
{
double CV_DECL_ALIGNED(32) ptr[4] = {0};
double ptr[4] = {0};
vse64_v_f64m2(ptr, vfwcvt_f_x_v_f64m2(a, 4), 4);
double CV_DECL_ALIGNED(32) elems[2] =
double elems[2] =
{
ptr[2], ptr[3]
};
@ -1768,9 +1770,9 @@ inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
inline v_float64x2 v_cvt_f64(const v_float32x4& a)
{
double CV_DECL_ALIGNED(32) ptr[4] = {0};
double ptr[4] = {0};
vse64_v_f64m2(ptr, vfwcvt_f_f_v_f64m2(a, 4), 4);
double CV_DECL_ALIGNED(32) elems[2] =
double elems[2] =
{
ptr[0], ptr[1]
};
@ -1779,9 +1781,9 @@ inline v_float64x2 v_cvt_f64(const v_float32x4& a)
inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
{
double CV_DECL_ALIGNED(32) ptr[4] = {0};
double ptr[4] = {0};
vse64_v_f64m2(ptr, vfwcvt_f_f_v_f64m2(a, 4), 4);
double CV_DECL_ALIGNED(32) elems[2] =
double elems[2] =
{
ptr[2], ptr[3]
};
@ -1823,7 +1825,7 @@ inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
v_##_Tpvec& b0, v_##_Tpvec& b1, \
v_##_Tpvec& b2, v_##_Tpvec& b3) \
{ \
_Tp CV_DECL_ALIGNED(32) elems0[4] = \
_Tp elems0[4] = \
{ \
v_extract_n<0>(a0), \
v_extract_n<0>(a1), \
@ -1831,7 +1833,7 @@ inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
v_extract_n<0>(a3) \
}; \
b0 = v_load(elems0); \
_Tp CV_DECL_ALIGNED(32) elems1[4] = \
_Tp elems1[4] = \
{ \
v_extract_n<1>(a0), \
v_extract_n<1>(a1), \
@ -1839,7 +1841,7 @@ inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
v_extract_n<1>(a3) \
}; \
b1 = v_load(elems1); \
_Tp CV_DECL_ALIGNED(32) elems2[4] = \
_Tp elems2[4] = \
{ \
v_extract_n<2>(a0), \
v_extract_n<2>(a1), \
@ -1847,7 +1849,7 @@ inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
v_extract_n<2>(a3) \
}; \
b2 = v_load(elems2); \
_Tp CV_DECL_ALIGNED(32) elems3[4] = \
_Tp elems3[4] = \
{ \
v_extract_n<3>(a0), \
v_extract_n<3>(a1), \
@ -1866,8 +1868,8 @@ OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(float32x4, float, f32)
#define OPENCV_HAL_IMPL_RVV_REVERSE(_Tpvec, _Tp, suffix) \
inline _Tpvec v_reverse(const _Tpvec& a) \
{ \
_Tp CV_DECL_ALIGNED(32) ptr[_Tpvec::nlanes] = {0}; \
_Tp CV_DECL_ALIGNED(32) ptra[_Tpvec::nlanes] = {0}; \
_Tp ptr[_Tpvec::nlanes] = {0}; \
_Tp ptra[_Tpvec::nlanes] = {0}; \
v_store(ptra, a); \
for (int i = 0; i < _Tpvec::nlanes; i++) \
{ \
@ -1894,8 +1896,8 @@ OPENCV_HAL_IMPL_RVV_REVERSE(v_float64x2, double, f64)
#define OPENCV_HAL_IMPL_RVV_EXPAND(_Tpwvec, _Tp, _Tpvec, width, suffix, wcvt, vl) \
inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
{ \
_Tp CV_DECL_ALIGNED(32) lptr[_Tpvec::nlanes/2] = {0}; \
_Tp CV_DECL_ALIGNED(32) hptr[_Tpvec::nlanes/2] = {0}; \
_Tp lptr[_Tpvec::nlanes/2] = {0}; \
_Tp hptr[_Tpvec::nlanes/2] = {0}; \
v_store_low(lptr, a); \
v_store_high(hptr, a); \
b0 = _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(lptr, vl), vl)); \
@ -1903,13 +1905,13 @@ inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
} \
inline _Tpwvec v_expand_low(const _Tpvec& a) \
{ \
_Tp CV_DECL_ALIGNED(32) lptr[_Tpvec::nlanes/2] = {0}; \
_Tp lptr[_Tpvec::nlanes/2] = {0}; \
v_store_low(lptr, a); \
return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(lptr, vl), vl)); \
} \
inline _Tpwvec v_expand_high(const _Tpvec& a) \
{ \
_Tp CV_DECL_ALIGNED(32) hptr[_Tpvec::nlanes/2] = {0}; \
_Tp hptr[_Tpvec::nlanes/2] = {0}; \
v_store_high(hptr, a); \
return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(hptr, vl), vl)); \
} \
@ -1936,25 +1938,25 @@ inline v_int32x4 v_load_expand_q(const schar* ptr)
}
#define OPENCV_HAL_IMPL_RVV_PACK(_Tpvec, _Tp, _wTpvec, _wTp, width, suffix, rshr, shr, hvl, vl) \
#define OPENCV_HAL_IMPL_RVV_PACK(_Tpvec, _Tp, _wTpvec, _wTp, hwidth, width, hsuffix, suffix, rshr, shr, hvl, vl) \
inline _Tpvec v_pack(const _wTpvec& a, const _wTpvec& b) \
{ \
_wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
_wTp arr[_Tpvec::nlanes] = {0}; \
v_store(arr, a); \
v_store(arr + _wTpvec::nlanes, b); \
return _Tpvec(shr(vle##width##_v_##suffix##m2(arr, vl), 0, vl)); \
} \
inline void v_pack_store(_Tp* ptr, const _wTpvec& a) \
{ \
_wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
_wTp arr[_Tpvec::nlanes] = {0}; \
v_store(arr, a); \
v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, hvl))); \
v_store(ptr, _Tpvec(shr(vle##width##_v_##suffix##m2(arr, vl), 0, vl))); \
vse##hwidth##_v_##hsuffix##m1(ptr, shr(vle##width##_v_##suffix##m2(arr, vl), 0, vl), hvl); \
} \
template<int n> inline \
_Tpvec v_rshr_pack(const _wTpvec& a, const _wTpvec& b) \
{ \
_wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
_wTp arr[_Tpvec::nlanes] = {0}; \
v_store(arr, a); \
v_store(arr + _wTpvec::nlanes, b); \
return _Tpvec(rshr(vle##width##_v_##suffix##m2(arr, vl), n, vl)); \
@ -1962,39 +1964,39 @@ _Tpvec v_rshr_pack(const _wTpvec& a, const _wTpvec& b) \
template<int n> inline \
void v_rshr_pack_store(_Tp* ptr, const _wTpvec& a) \
{ \
_wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
_wTp arr[_Tpvec::nlanes] = {0}; \
v_store(arr, a); \
v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, hvl))); \
v_store(ptr, _Tpvec(rshr(vle##width##_v_##suffix##m2(arr, vl), n, vl))); \
}
OPENCV_HAL_IMPL_RVV_PACK(v_uint8x16, uchar, v_uint16x8, ushort, 16, u16, vnclipu_wx_u8m1, vnclipu_wx_u8m1, 8, 16)
OPENCV_HAL_IMPL_RVV_PACK(v_int8x16, schar, v_int16x8, short, 16, i16, vnclip_wx_i8m1, vnclip_wx_i8m1, 8, 16)
OPENCV_HAL_IMPL_RVV_PACK(v_uint16x8, ushort, v_uint32x4, unsigned, 32, u32, vnclipu_wx_u16m1, vnclipu_wx_u16m1, 4, 8)
OPENCV_HAL_IMPL_RVV_PACK(v_int16x8, short, v_int32x4, int, 32, i32, vnclip_wx_i16m1, vnclip_wx_i16m1, 4, 8)
OPENCV_HAL_IMPL_RVV_PACK(v_uint32x4, unsigned, v_uint64x2, uint64, 64, u64, vnclipu_wx_u32m1, vnsrl_wx_u32m1, 2, 4)
OPENCV_HAL_IMPL_RVV_PACK(v_int32x4, int, v_int64x2, int64, 64, i64, vnclip_wx_i32m1, vnsra_wx_i32m1, 2, 4)
OPENCV_HAL_IMPL_RVV_PACK(v_uint8x16, uchar, v_uint16x8, ushort, 8, 16, u8, u16, vnclipu_wx_u8m1, vnclipu_wx_u8m1, 8, 16)
OPENCV_HAL_IMPL_RVV_PACK(v_int8x16, schar, v_int16x8, short, 8, 16, i8, i16, vnclip_wx_i8m1, vnclip_wx_i8m1, 8, 16)
OPENCV_HAL_IMPL_RVV_PACK(v_uint16x8, ushort, v_uint32x4, unsigned, 16, 32, u16, u32, vnclipu_wx_u16m1, vnclipu_wx_u16m1, 4, 8)
OPENCV_HAL_IMPL_RVV_PACK(v_int16x8, short, v_int32x4, int, 16, 32, i16, i32, vnclip_wx_i16m1, vnclip_wx_i16m1, 4, 8)
OPENCV_HAL_IMPL_RVV_PACK(v_uint32x4, unsigned, v_uint64x2, uint64, 32, 64, u32, u64, vnclipu_wx_u32m1, vnsrl_wx_u32m1, 2, 4)
OPENCV_HAL_IMPL_RVV_PACK(v_int32x4, int, v_int64x2, int64, 32, 64, i32, i64, vnclip_wx_i32m1, vnsra_wx_i32m1, 2, 4)
#define OPENCV_HAL_IMPL_RVV_PACK_U(_Tpvec, _Tp, _wTpvec, _wTp, width, suffix, rshr, cast, vl) \
#define OPENCV_HAL_IMPL_RVV_PACK_U(_Tpvec, _Tp, _wTpvec, _wTp, hwidth, width, hsuffix, suffix, rshr, cast, hvl, vl) \
inline _Tpvec v_pack_u(const _wTpvec& a, const _wTpvec& b) \
{ \
_wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
_wTp arr[_Tpvec::nlanes] = {0}; \
v_store(arr, a); \
v_store(arr + _wTpvec::nlanes, b); \
return _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr, vl), 0, vl)), 0, vl)); \
} \
inline void v_pack_u_store(_Tp* ptr, const _wTpvec& a) \
{ \
_wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
_wTp arr[_Tpvec::nlanes] = {0}; \
v_store(arr, a); \
v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, vl))); \
v_store(ptr, _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr, vl), 0, vl)), 0, vl))); \
v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, hvl))); \
vse##hwidth##_v_##hsuffix##m1(ptr, rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr, vl), 0, vl)), 0, vl), hvl); \
} \
template<int n> inline \
_Tpvec v_rshr_pack_u(const _wTpvec& a, const _wTpvec& b) \
{ \
_wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
_wTp arr[_Tpvec::nlanes] = {0}; \
v_store(arr, a); \
v_store(arr + _wTpvec::nlanes, b); \
return _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr, vl), 0, vl)), n, vl)); \
@ -2002,23 +2004,23 @@ _Tpvec v_rshr_pack_u(const _wTpvec& a, const _wTpvec& b) \
template<int n> inline \
void v_rshr_pack_u_store(_Tp* ptr, const _wTpvec& a) \
{ \
_wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
_wTp arr[_Tpvec::nlanes] = {0}; \
v_store(arr, a); \
v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, vl))); \
v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, hvl))); \
v_store(ptr, _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr, vl), 0, vl)), n, vl))); \
}
OPENCV_HAL_IMPL_RVV_PACK_U(v_uint8x16, uchar, v_int16x8, short, 16, i16, vnclipu_wx_u8m1, vreinterpret_v_i16m2_u16m2, 16)
OPENCV_HAL_IMPL_RVV_PACK_U(v_uint16x8, ushort, v_int32x4, int, 32, i32, vnclipu_wx_u16m1, vreinterpret_v_i32m2_u32m2, 8)
OPENCV_HAL_IMPL_RVV_PACK_U(v_uint8x16, uchar, v_int16x8, short, 8, 16, u8, i16, vnclipu_wx_u8m1, vreinterpret_v_i16m2_u16m2, 8, 16)
OPENCV_HAL_IMPL_RVV_PACK_U(v_uint16x8, ushort, v_int32x4, int, 16, 32, u16, i32, vnclipu_wx_u16m1, vreinterpret_v_i32m2_u32m2, 4, 8)
#define OPENCV_HAL_IMPL_RVV_UNPACKS(_Tpvec, _Tp, suffix) \
inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
{ \
_Tp CV_DECL_ALIGNED(32) ptra0[v_##_Tpvec::nlanes] = {0}; \
_Tp CV_DECL_ALIGNED(32) ptra1[v_##_Tpvec::nlanes] = {0}; \
_Tp CV_DECL_ALIGNED(32) ptrb0[v_##_Tpvec::nlanes] = {0}; \
_Tp CV_DECL_ALIGNED(32) ptrb1[v_##_Tpvec::nlanes] = {0}; \
_Tp ptra0[v_##_Tpvec::nlanes] = {0}; \
_Tp ptra1[v_##_Tpvec::nlanes] = {0}; \
_Tp ptrb0[v_##_Tpvec::nlanes] = {0}; \
_Tp ptrb1[v_##_Tpvec::nlanes] = {0}; \
v_store(ptra0, a0); \
v_store(ptra1, a1); \
int i; \
@ -2037,16 +2039,16 @@ inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_
} \
inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
_Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes/2] = {0}; \
_Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes/2] = {0}; \
_Tp ptra[v_##_Tpvec::nlanes/2] = {0}; \
_Tp ptrb[v_##_Tpvec::nlanes/2] = {0}; \
v_store_low(ptra, a); \
v_store_low(ptrb, b); \
return v_load_halves(ptra, ptrb); \
} \
inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
_Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes/2] = {0}; \
_Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes/2] = {0}; \
_Tp ptra[v_##_Tpvec::nlanes/2] = {0}; \
_Tp ptrb[v_##_Tpvec::nlanes/2] = {0}; \
v_store_high(ptra, a); \
v_store_high(ptrb, b); \
return v_load_halves(ptra, ptrb); \
@ -2072,8 +2074,8 @@ OPENCV_HAL_IMPL_RVV_UNPACKS(float64x2, double, f64)
#define OPENCV_HAL_IMPL_RVV_INTERLEAVED(_Tpvec, _Tp) \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
{ \
_Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
_Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
_Tp ptra[v_##_Tpvec::nlanes] = {0}; \
_Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
int i, i2; \
for( i = i2 = 0; i < v_##_Tpvec::nlanes; i++, i2 += 2 ) \
{ \
@ -2085,9 +2087,9 @@ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
{ \
_Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
_Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
_Tp CV_DECL_ALIGNED(32) ptrc[v_##_Tpvec::nlanes] = {0}; \
_Tp ptra[v_##_Tpvec::nlanes] = {0}; \
_Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
_Tp ptrc[v_##_Tpvec::nlanes] = {0}; \
int i, i3; \
for( i = i3 = 0; i < v_##_Tpvec::nlanes; i++, i3 += 3 ) \
{ \
@ -2102,10 +2104,10 @@ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
_Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
_Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
_Tp CV_DECL_ALIGNED(32) ptrc[v_##_Tpvec::nlanes] = {0}; \
_Tp CV_DECL_ALIGNED(32) ptrd[v_##_Tpvec::nlanes] = {0}; \
_Tp ptra[v_##_Tpvec::nlanes] = {0}; \
_Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
_Tp ptrc[v_##_Tpvec::nlanes] = {0}; \
_Tp ptrd[v_##_Tpvec::nlanes] = {0}; \
int i, i4; \
for( i = i4 = 0; i < v_##_Tpvec::nlanes; i++, i4 += 4 ) \
{ \
@ -2123,8 +2125,8 @@ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec&
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
int i, i2; \
_Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
_Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
_Tp ptra[v_##_Tpvec::nlanes] = {0}; \
_Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
v_store(ptra, a); \
v_store(ptrb, b); \
for( i = i2 = 0; i < v_##_Tpvec::nlanes; i++, i2 += 2 ) \
@ -2137,9 +2139,9 @@ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec&
const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
int i, i3; \
_Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
_Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
_Tp CV_DECL_ALIGNED(32) ptrc[v_##_Tpvec::nlanes] = {0}; \
_Tp ptra[v_##_Tpvec::nlanes] = {0}; \
_Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
_Tp ptrc[v_##_Tpvec::nlanes] = {0}; \
v_store(ptra, a); \
v_store(ptrb, b); \
v_store(ptrc, c); \
@ -2155,10 +2157,10 @@ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec&
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
{ \
int i, i4; \
_Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
_Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
_Tp CV_DECL_ALIGNED(32) ptrc[v_##_Tpvec::nlanes] = {0}; \
_Tp CV_DECL_ALIGNED(32) ptrd[v_##_Tpvec::nlanes] = {0}; \
_Tp ptra[v_##_Tpvec::nlanes] = {0}; \
_Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
_Tp ptrc[v_##_Tpvec::nlanes] = {0}; \
_Tp ptrd[v_##_Tpvec::nlanes] = {0}; \
v_store(ptra, a); \
v_store(ptrb, b); \
v_store(ptrc, c); \
@ -2173,8 +2175,8 @@ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec&
} \
inline v_##_Tpvec v_interleave_pairs(const v_##_Tpvec& vec) \
{ \
_Tp CV_DECL_ALIGNED(32) ptr[v_##_Tpvec::nlanes] = {0}; \
_Tp CV_DECL_ALIGNED(32) ptrvec[v_##_Tpvec::nlanes] = {0}; \
_Tp ptr[v_##_Tpvec::nlanes] = {0}; \
_Tp ptrvec[v_##_Tpvec::nlanes] = {0}; \
v_store(ptrvec, vec); \
for (int i = 0; i < v_##_Tpvec::nlanes/4; i++) \
{ \
@ -2187,8 +2189,8 @@ inline v_##_Tpvec v_interleave_pairs(const v_##_Tpvec& vec) \
} \
inline v_##_Tpvec v_interleave_quads(const v_##_Tpvec& vec) \
{ \
_Tp CV_DECL_ALIGNED(32) ptr[v_##_Tpvec::nlanes] = {0}; \
_Tp CV_DECL_ALIGNED(32) ptrvec[v_##_Tpvec::nlanes] = {0}; \
_Tp ptr[v_##_Tpvec::nlanes] = {0}; \
_Tp ptrvec[v_##_Tpvec::nlanes] = {0}; \
v_store(ptrvec, vec); \
for (int i = 0; i < v_##_Tpvec::nlanes/8; i++) \
{ \
@ -2242,9 +2244,9 @@ static const unsigned char popCountTable[] =
#define OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(_rTpvec, _Tpvec, _rTp, _Tp, suffix) \
inline _rTpvec v_popcount(const _Tpvec& a) \
{ \
uchar CV_DECL_ALIGNED(32) ptra[16] = {0}; \
uchar ptra[16] = {0}; \
v_store(ptra, v_reinterpret_as_u8(a)); \
_rTp CV_DECL_ALIGNED(32) ptr[_Tpvec::nlanes] = {0}; \
_rTp ptr[_Tpvec::nlanes] = {0}; \
v_store(ptr, v_setzero_##suffix()); \
for (int i = 0; i < _Tpvec::nlanes*(int)sizeof(_Tp); i++) \
ptr[i/sizeof(_Tp)] += popCountTable[ptra[i]]; \
@ -2298,7 +2300,7 @@ inline int v_signmask(const v_float64x2& a)
#define OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(_Tpvec, _Tp, suffix) \
inline int v_scan_forward(const _Tpvec& a) \
{ \
_Tp CV_DECL_ALIGNED(32) ptr[_Tpvec::nlanes] = {0}; \
_Tp ptr[_Tpvec::nlanes] = {0}; \
v_store(ptr, v_reinterpret_as_##suffix(a)); \
for (int i = 0; i < _Tpvec::nlanes; i++) \
if(int(ptr[i]) < 0) \
@ -2321,28 +2323,29 @@ OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_float64x2, double, f64)
//////////// Pack triplets ////////////
#define OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(_Tpvec, _Tp) \
inline _Tpvec v_pack_triplets(const _Tpvec& vec) \
{ \
_Tp CV_DECL_ALIGNED(32) ptr[_Tpvec::nlanes] = {0}; \
_Tp CV_DECL_ALIGNED(32) ptrvec[_Tpvec::nlanes] = {0}; \
v_store(ptrvec, vec); \
for (int i = 0; i < _Tpvec::nlanes/4; i++) \
{ \
ptr[3*i ] = ptrvec[4*i ]; \
ptr[3*i+1] = ptrvec[4*i+2]; \
ptr[3*i+2] = ptrvec[4*i+2]; \
} \
return v_load(ptr); \
inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
{
uint64 ptr[2] = {0x0908060504020100, 0xFFFFFFFF0E0D0C0A};
return v_int8x16((vint8m1_t)vrgather_vv_u8m1((vuint8m1_t)vint8m1_t(vec), (vuint8m1_t)vle64_v_u64m1(ptr, 2), 16));
}
inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec)
{
return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec)));
}
inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
{
uint64 ptr[2] = {0x0908060504020100, 0xFFFFFFFF0E0D0C0A};
return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)vint16m1_t(vec), (vuint8m1_t)vle64_v_u64m1(ptr, 2), 16));
}
inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec)
{
return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec)));
}
OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint8x16, uchar)
OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int8x16, schar)
OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint16x8, ushort)
OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int16x8, short)
OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint32x4, unsigned)
OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int32x4, int)
OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_float32x4, float)
inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
////// FP16 support ///////
@ -2443,7 +2446,7 @@ inline v_int32x4 v_trunc(const v_float64x2& a)
// 16 >> 32
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
int CV_DECL_ALIGNED(32) ptr[8] = {0};
int ptr[8] = {0};
v_int32x4 t1, t2;
vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b, 8), 8);
v_load_deinterleave(ptr, t1, t2);
@ -2451,7 +2454,7 @@ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
}
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
int CV_DECL_ALIGNED(32) ptr[8] = {0};
int ptr[8] = {0};
v_int32x4 t1, t2;
vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b, 8), 8);
v_load_deinterleave(ptr, t1, t2);
@ -2461,7 +2464,7 @@ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32
// 32 >> 64
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
{
int64 CV_DECL_ALIGNED(32) ptr[4] = {0};
int64 ptr[4] = {0};
v_int64x2 t1, t2;
vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b, 4), 4);
v_load_deinterleave(ptr, t1, t2);
@ -2469,7 +2472,7 @@ inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
}
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{
int64 CV_DECL_ALIGNED(32) ptr[4] = {0};
int64 ptr[4] = {0};
v_int64x2 t1, t2;
vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b, 4), 4);
v_load_deinterleave(ptr, t1, t2);
@ -2479,7 +2482,7 @@ inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64
// 8 >> 32
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
{
unsigned CV_DECL_ALIGNED(32) ptr[16] = {0};
unsigned ptr[16] = {0};
v_uint32x4 t1, t2, t3, t4;
vse32_v_u32m4(ptr, vwcvtu_x_x_v_u32m4(vwmulu_vv_u16m2(a, b, 16), 16), 16);
v_load_deinterleave(ptr, t1, t2, t3, t4);
@ -2488,7 +2491,7 @@ inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
const v_uint32x4& c)
{
unsigned CV_DECL_ALIGNED(32) ptr[16] = {0};
unsigned ptr[16] = {0};
v_uint32x4 t1, t2, t3, t4;
vse32_v_u32m4(ptr, vwcvtu_x_x_v_u32m4(vwmulu_vv_u16m2(a, b, 16), 16), 16);
v_load_deinterleave(ptr, t1, t2, t3, t4);
@ -2497,7 +2500,7 @@ inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
{
int CV_DECL_ALIGNED(32) ptr[16] = {0};
int ptr[16] = {0};
v_int32x4 t1, t2, t3, t4;
vse32_v_i32m4(ptr, vwcvt_x_x_v_i32m4(vwmul_vv_i16m2(a, b, 16), 16), 16);
v_load_deinterleave(ptr, t1, t2, t3, t4);
@ -2506,7 +2509,7 @@ inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
const v_int32x4& c)
{
int CV_DECL_ALIGNED(32) ptr[16] = {0};
int ptr[16] = {0};
v_int32x4 t1, t2, t3, t4;
vse32_v_i32m4(ptr, vwcvt_x_x_v_i32m4(vwmul_vv_i16m2(a, b, 16), 16), 16);
v_load_deinterleave(ptr, t1, t2, t3, t4);
@ -2516,7 +2519,7 @@ inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
// 16 >> 64
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
{
uint64 CV_DECL_ALIGNED(32) ptr[8] = {0};
uint64 ptr[8] = {0};
v_uint64x2 t1, t2, t3, t4;
vse64_v_u64m4(ptr, vwcvtu_x_x_v_u64m4(vwmulu_vv_u32m2(a, b, 8), 8), 8);
v_load_deinterleave(ptr, t1, t2, t3, t4);
@ -2524,7 +2527,7 @@ inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
}
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{
uint64 CV_DECL_ALIGNED(32) ptr[8] = {0};
uint64 ptr[8] = {0};
v_uint64x2 t1, t2, t3, t4;
vse64_v_u64m4(ptr, vwcvtu_x_x_v_u64m4(vwmulu_vv_u32m2(a, b, 8), 8), 8);
v_load_deinterleave(ptr, t1, t2, t3, t4);
@ -2533,7 +2536,7 @@ inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, con
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
{
int64 CV_DECL_ALIGNED(32) ptr[8] = {0};
int64 ptr[8] = {0};
v_int64x2 t1, t2, t3, t4;
vse64_v_i64m4(ptr, vwcvt_x_x_v_i64m4(vwmul_vv_i32m2(a, b, 8), 8), 8);
v_load_deinterleave(ptr, t1, t2, t3, t4);
@ -2542,7 +2545,7 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b,
const v_int64x2& c)
{
int64 CV_DECL_ALIGNED(32) ptr[8] = {0};
int64 ptr[8] = {0};
v_int64x2 t1, t2, t3, t4;
vse64_v_i64m4(ptr, vwcvt_x_x_v_i64m4(vwmul_vv_i32m2(a, b, 8), 8), 8);
v_load_deinterleave(ptr, t1, t2, t3, t4);
@ -2563,7 +2566,7 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b,
// 16 >> 32
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
{
int CV_DECL_ALIGNED(32) ptr[8] = {0};
int ptr[8] = {0};
vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b, 8), 8);
v_int32x4 t1 = v_load(ptr);
v_int32x4 t2 = v_load(ptr+4);
@ -2571,7 +2574,7 @@ inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
}
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
int CV_DECL_ALIGNED(32) ptr[8] = {0};
int ptr[8] = {0};
vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b, 8), 8);
v_int32x4 t1 = v_load(ptr);
v_int32x4 t2 = v_load(ptr+4);
@ -2581,7 +2584,7 @@ inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_
// 32 >> 64
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
{
int64 CV_DECL_ALIGNED(32) ptr[4] = {0};
int64 ptr[4] = {0};
vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b, 4), 4);
v_int64x2 t1 = v_load(ptr);
v_int64x2 t2 = v_load(ptr+2);
@ -2589,7 +2592,7 @@ inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
}
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{
int64 CV_DECL_ALIGNED(32) ptr[4] = {0};
int64 ptr[4] = {0};
vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b, 4), 4);
v_int64x2 t1 = v_load(ptr);
v_int64x2 t2 = v_load(ptr+2);
@ -2600,7 +2603,7 @@ inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_
// 8 >> 32
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
{
unsigned CV_DECL_ALIGNED(32) ptr[16] = {0};
unsigned ptr[16] = {0};
vse32_v_u32m4(ptr, vwcvtu_x_x_v_u32m4(vwmulu_vv_u16m2(a, b, 16), 16), 16);
v_uint32x4 t1 = v_load(ptr);
v_uint32x4 t2 = v_load(ptr+4);
@ -2610,7 +2613,7 @@ inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b
}
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{
unsigned CV_DECL_ALIGNED(32) ptr[16] = {0};
unsigned ptr[16] = {0};
vse32_v_u32m4(ptr, vwcvtu_x_x_v_u32m4(vwmulu_vv_u16m2(a, b, 16), 16), 16);
v_uint32x4 t1 = v_load(ptr);
v_uint32x4 t2 = v_load(ptr+4);
@ -2620,7 +2623,7 @@ inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b
}
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
{
int CV_DECL_ALIGNED(32) ptr[16] = {0};
int ptr[16] = {0};
vse32_v_i32m4(ptr, vwcvt_x_x_v_i32m4(vwmul_vv_i16m2(a, b, 16), 16), 16);
v_int32x4 t1 = v_load(ptr);
v_int32x4 t2 = v_load(ptr+4);
@ -2630,7 +2633,7 @@ inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
}
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{
int CV_DECL_ALIGNED(32) ptr[16] = {0};
int ptr[16] = {0};
vse32_v_i32m4(ptr, vwcvt_x_x_v_i32m4(vwmul_vv_i16m2(a, b, 16), 16), 16);
v_int32x4 t1 = v_load(ptr);
v_int32x4 t2 = v_load(ptr+4);
@ -2642,7 +2645,7 @@ inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, c
// 16 >> 64
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
{
uint64 CV_DECL_ALIGNED(32) ptr[8] = {0};
uint64 ptr[8] = {0};
vse64_v_u64m4(ptr, vwcvtu_x_x_v_u64m4(vwmulu_vv_u32m2(a, b, 8), 8), 8);
v_uint64x2 t1 = v_load(ptr);
v_uint64x2 t2 = v_load(ptr+2);
@ -2652,7 +2655,7 @@ inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b
}
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{
uint64 CV_DECL_ALIGNED(32) ptr[8] = {0};
uint64 ptr[8] = {0};
vse64_v_u64m4(ptr, vwcvtu_x_x_v_u64m4(vwmulu_vv_u32m2(a, b, 8), 8), 8);
v_uint64x2 t1 = v_load(ptr);
v_uint64x2 t2 = v_load(ptr+2);
@ -2662,7 +2665,7 @@ inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b
}
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
{
int64 CV_DECL_ALIGNED(32) ptr[8] = {0};
int64 ptr[8] = {0};
vse64_v_i64m4(ptr, vwcvt_x_x_v_i64m4(vwmul_vv_i32m2(a, b, 8), 8), 8);
v_int64x2 t1 = v_load(ptr);
v_int64x2 t2 = v_load(ptr+2);
@ -2672,7 +2675,7 @@ inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
}
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{
int64 CV_DECL_ALIGNED(32) ptr[8] = {0};
int64 ptr[8] = {0};
vse64_v_i64m4(ptr, vwcvt_x_x_v_i64m4(vwmul_vv_i32m2(a, b, 8), 8), 8);
v_int64x2 t1 = v_load(ptr);
v_int64x2 t2 = v_load(ptr+2);
@ -2714,7 +2717,7 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
#define OPENCV_HAL_IMPL_RVV_MUL_EXPAND(_Tpvec, _Tpwvec, _Tpw, suffix, wmul, width, vl, hvl) \
inline void v_mul_expand(const _Tpvec& a, const _Tpvec& b, _Tpwvec& c, _Tpwvec& d) \
{ \
_Tpw CV_DECL_ALIGNED(32) ptr[_Tpwvec::nlanes*2] = {0}; \
_Tpw ptr[_Tpwvec::nlanes*2] = {0}; \
vse##width##_v_##suffix##m2(ptr, wmul(a, b, vl), vl); \
c = _Tpwvec(vle##width##_v_##suffix##m1(ptr, hvl)); \
d = _Tpwvec(vle##width##_v_##suffix##m1(ptr+_Tpwvec::nlanes, hvl)); \
