@@ -737,7 +737,7 @@ OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float64x2, vfloat64m1_t, double, 1, 2, 64, f6
inline v_int8x16 v_load_halves(const schar* ptr0, const schar* ptr1)
{
-    schar CV_DECL_ALIGNED(32) elems[16] =
+    schar elems[16] =
{
ptr0 [ 0 ] , ptr0 [ 1 ] , ptr0 [ 2 ] , ptr0 [ 3 ] , ptr0 [ 4 ] , ptr0 [ 5 ] , ptr0 [ 6 ] , ptr0 [ 7 ] ,
ptr1 [ 0 ] , ptr1 [ 1 ] , ptr1 [ 2 ] , ptr1 [ 3 ] , ptr1 [ 4 ] , ptr1 [ 5 ] , ptr1 [ 6 ] , ptr1 [ 7 ]
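
The change repeated throughout this patch is dropping CV_DECL_ALIGNED(32) from local scratch buffers that are only accessed through v_load/v_store. In the universal-intrinsics API those calls require only element alignment (the v_load_aligned/v_store_aligned variants are the ones that demand vector-width alignment), so the attribute buys nothing here. A minimal sketch of that contract, assuming CV_SIMD128 and the cv:: universal intrinsics; the helper name is invented for illustration:

// Sketch only: plain v_load/v_store are fine on an element-aligned scratch buffer.
inline void copy16_u8(const uchar* src, uchar* dst)
{
    uchar tmp[16];               // no CV_DECL_ALIGNED needed
    v_store(tmp, v_load(src));   // unaligned load/store variants
    v_store(dst, v_load(tmp));
}
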
@@ -748,7 +748,7 @@ inline v_uint8x16 v_load_halves(const uchar* ptr0, const uchar* ptr1) { return v
inline v_int16x8 v_load_halves(const short* ptr0, const short* ptr1)
{
-    short CV_DECL_ALIGNED(32) elems[8] =
+    short elems[8] =
{
ptr0 [ 0 ] , ptr0 [ 1 ] , ptr0 [ 2 ] , ptr0 [ 3 ] , ptr1 [ 0 ] , ptr1 [ 1 ] , ptr1 [ 2 ] , ptr1 [ 3 ]
} ;
@@ -758,7 +758,7 @@ inline v_uint16x8 v_load_halves(const ushort* ptr0, const ushort* ptr1) { return
inline v_int32x4 v_load_halves(const int* ptr0, const int* ptr1)
{
-    int CV_DECL_ALIGNED(32) elems[4] =
+    int elems[4] =
{
ptr0 [ 0 ] , ptr0 [ 1 ] , ptr1 [ 0 ] , ptr1 [ 1 ]
} ;
@@ -766,7 +766,7 @@ inline v_int32x4 v_load_halves(const int* ptr0, const int* ptr1)
}
inline v_float32x4 v_load_halves(const float* ptr0, const float* ptr1)
{
-    float CV_DECL_ALIGNED(32) elems[4] =
+    float elems[4] =
{
ptr0 [ 0 ] , ptr0 [ 1 ] , ptr1 [ 0 ] , ptr1 [ 1 ]
} ;
@@ -776,7 +776,7 @@ inline v_uint32x4 v_load_halves(const unsigned* ptr0, const unsigned* ptr1) { re
inline v_int64x2 v_load_halves(const int64* ptr0, const int64* ptr1)
{
-    int64 CV_DECL_ALIGNED(32) elems[2] =
+    int64 elems[2] =
{
ptr0 [ 0 ] , ptr1 [ 0 ]
} ;
@@ -787,7 +787,7 @@ inline v_uint64x2 v_load_halves(const uint64* ptr0, const uint64* ptr1) { return
#if CV_SIMD128_64F
inline v_float64x2 v_load_halves(const double* ptr0, const double* ptr1)
{
-    double CV_DECL_ALIGNED(32) elems[2] =
+    double elems[2] =
{
ptr0 [ 0 ] , ptr1 [ 0 ]
} ;
@@ -800,7 +800,7 @@ inline v_float64x2 v_load_halves(const double* ptr0, const double* ptr1)
inline v_int8x16 v_lut(const schar* tab, const int* idx)
{
-    schar CV_DECL_ALIGNED(32) elems[16] =
+    schar elems[16] =
{
tab [ idx [ 0 ] ] ,
tab [ idx [ 1 ] ] ,
@@ -823,7 +823,7 @@ inline v_int8x16 v_lut(const schar* tab, const int* idx)
}
inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
{
-    schar CV_DECL_ALIGNED(32) elems[16] =
+    schar elems[16] =
{
tab [ idx [ 0 ] ] ,
tab [ idx [ 0 ] + 1 ] ,
@@ -846,7 +846,7 @@ inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
}
inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
{
-    schar CV_DECL_ALIGNED(32) elems[16] =
+    schar elems[16] =
{
tab [ idx [ 0 ] ] ,
tab [ idx [ 0 ] + 1 ] ,
@@ -873,7 +873,7 @@ inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reint
inline v_int16x8 v_lut(const short* tab, const int* idx)
{
-    short CV_DECL_ALIGNED(32) elems[8] =
+    short elems[8] =
{
tab [ idx [ 0 ] ] ,
tab [ idx [ 1 ] ] ,
@@ -888,7 +888,7 @@ inline v_int16x8 v_lut(const short* tab, const int* idx)
}
inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
{
-    short CV_DECL_ALIGNED(32) elems[8] =
+    short elems[8] =
{
tab [ idx [ 0 ] ] ,
tab [ idx [ 0 ] + 1 ] ,
@@ -903,7 +903,7 @@ inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
}
inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
{
-    short CV_DECL_ALIGNED(32) elems[8] =
+    short elems[8] =
{
tab [ idx [ 0 ] ] ,
tab [ idx [ 0 ] + 1 ] ,
@@ -922,7 +922,7 @@ inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_rein
inline v_int32x4 v_lut(const int* tab, const int* idx)
{
-    int CV_DECL_ALIGNED(32) elems[4] =
+    int elems[4] =
{
tab [ idx [ 0 ] ] ,
tab [ idx [ 1 ] ] ,
@@ -933,7 +933,7 @@ inline v_int32x4 v_lut(const int* tab, const int* idx)
}
inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
{
-    int CV_DECL_ALIGNED(32) elems[4] =
+    int elems[4] =
{
tab [ idx [ 0 ] ] ,
tab [ idx [ 0 ] + 1 ] ,
@@ -953,7 +953,7 @@ inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_re
inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
{
-    int64_t CV_DECL_ALIGNED(32) elems[2] =
+    int64_t elems[2] =
{
tab [ idx [ 0 ] ] ,
tab [ idx [ 1 ] ]
@@ -969,7 +969,7 @@ inline v_uint64x2 v_lut_pairs(const uint64* tab, const int* idx) { return v_rein
inline v_float32x4 v_lut(const float* tab, const int* idx)
{
-    float CV_DECL_ALIGNED(32) elems[4] =
+    float elems[4] =
{
tab [ idx [ 0 ] ] ,
tab [ idx [ 1 ] ] ,
@@ -980,7 +980,7 @@ inline v_float32x4 v_lut(const float* tab, const int* idx)
}
inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
{
-    float CV_DECL_ALIGNED(32) elems[4] =
+    float elems[4] =
{
tab [ idx [ 0 ] ] ,
tab [ idx [ 0 ] + 1 ] ,
@@ -996,7 +996,7 @@ inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
{
-    int CV_DECL_ALIGNED(32) elems[4] =
+    int elems[4] =
{
tab [ v_extract_n < 0 > ( idxvec ) ] ,
tab [ v_extract_n < 1 > ( idxvec ) ] ,
@@ -1008,7 +1008,7 @@ inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
{
-    unsigned CV_DECL_ALIGNED(32) elems[4] =
+    unsigned elems[4] =
{
tab [ v_extract_n < 0 > ( idxvec ) ] ,
tab [ v_extract_n < 1 > ( idxvec ) ] ,
@@ -1020,7 +1020,7 @@ inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
{
-    float CV_DECL_ALIGNED(32) elems[4] =
+    float elems[4] =
{
tab [ v_extract_n < 0 > ( idxvec ) ] ,
tab [ v_extract_n < 1 > ( idxvec ) ] ,
@@ -1032,7 +1032,7 @@ inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
{
-    int CV_DECL_ALIGNED(32) idx[4];
+    int idx[4];
v_store_aligned ( idx , idxvec ) ;
x = v_float32x4 ( tab [ idx [ 0 ] ] , tab [ idx [ 1 ] ] , tab [ idx [ 2 ] ] , tab [ idx [ 3 ] ] ) ;
@@ -1042,7 +1042,7 @@ inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_floa
#if CV_SIMD128_64F
inline v_float64x2 v_lut(const double* tab, const int* idx)
{
-    double CV_DECL_ALIGNED(32) elems[2] =
+    double elems[2] =
{
tab [ idx [ 0 ] ] ,
tab [ idx [ 1 ] ]
@@ -1057,7 +1057,7 @@ inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
{
-    double CV_DECL_ALIGNED(32) elems[2] =
+    double elems[2] =
{
tab [ v_extract_n < 0 > ( idxvec ) ] ,
tab [ v_extract_n < 1 > ( idxvec ) ]
@@ -1067,7 +1067,7 @@ inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
{
-    int CV_DECL_ALIGNED(32) idx[4] = {0};
+    int idx[4] = {0};
v_store_aligned ( idx , idxvec ) ;
x = v_float64x2 ( tab [ idx [ 0 ] ] , tab [ idx [ 1 ] ] ) ;
@@ -1079,7 +1079,7 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_flo
inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
{
-    ushort CV_DECL_ALIGNED(32) ptr[16] = {0};
+    ushort ptr[16] = {0};
v_store ( ptr , a ) ;
v_store ( ptr + 8 , b ) ;
return v_uint8x16 ( vnsrl_wx_u8m1 ( vle16_v_u16m2 ( ptr , 16 ) , 0 , 16 ) ) ;
@@ -1088,7 +1088,7 @@ inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
                           const v_uint32x4& c, const v_uint32x4& d)
{
-    unsigned CV_DECL_ALIGNED(32) ptr[16] = {0};
+    unsigned ptr[16] = {0};
v_store ( ptr , a ) ;
v_store ( ptr + 4 , b ) ;
v_store ( ptr + 8 , c ) ;
@@ -1100,7 +1100,7 @@ inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uin
                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
                           const v_uint64x2& g, const v_uint64x2& h)
{
-    uint64 CV_DECL_ALIGNED(32) ptr[16] = {0};
+    uint64 ptr[16] = {0};
v_store ( ptr , a ) ;
v_store ( ptr + 2 , b ) ;
v_store ( ptr + 4 , c ) ;
@@ -1279,13 +1279,15 @@ OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int64x2, i64, 2)
#define OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, op, intrin, suffix, vl) \
inline _Tpvec operator op (const _Tpvec& a, const _Tpvec& b) \
{ \
-    return _Tpvec(vmerge_vxm_##suffix##m1(intrin(a, b, vl), vmv_v_x_##suffix##m1(0, vl), 1, vl)); \
+    uint64_t ones = -1; \
+    return _Tpvec(vmerge_vxm_##suffix##m1(intrin(a, b, vl), vmv_v_x_##suffix##m1(0, vl), ones, vl)); \
}
#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, op, intrin, suffix, vl) \
inline _Tpvec operator op (const _Tpvec& a, const _Tpvec& b) \
{ \
-    return _Tpvec(vfmerge_vfm_##suffix##m1(intrin(a, b, vl), vfmv_v_f_##suffix##m1(0, vl), 1, vl)); \
+    union { uint64 u; double d; } ones; ones.u = -1; \
+    return _Tpvec(vfmerge_vfm_##suffix##m1(intrin(a, b, vl), vfmv_v_f_##suffix##m1(0, vl), ones.d, vl)); \
}
# define OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(_Tpvec, suffix, width, vl) \
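
With this change a comparison no longer writes the value 1 into lanes where the predicate holds; it merges in an all-ones pattern (ones / ones.d), so every true lane comes back with all bits set. That is the mask convention the rest of the universal-intrinsics API expects. A small usage sketch under that assumption (function name invented for illustration):

// Sketch only: all-ones lanes drive v_select as a per-lane blend.
inline v_uint8x16 min_with_limit(const v_uint8x16& a, const v_uint8x16& limit)
{
    v_uint8x16 mask = a > limit;        // true lanes are 0xFF, false lanes are 0x00
    return v_select(mask, limit, a);    // pick 'limit' where a > limit, else 'a'
}
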
@@ -1441,7 +1443,7 @@ OPENCV_HAL_IMPL_RVV_REDUCE(v_float32x4, max, float, f32, 4, fredmax)
inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                 const v_float32x4& c, const v_float32x4& d)
{
-    float CV_DECL_ALIGNED(32) elems[4] =
+    float elems[4] =
{
v_reduce_sum ( a ) ,
v_reduce_sum ( b ) ,
@@ -1746,9 +1748,9 @@ inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{
-    double CV_DECL_ALIGNED(32) ptr[4] = {0};
+    double ptr[4] = {0};
    vse64_v_f64m2(ptr, vfwcvt_f_x_v_f64m2(a, 4), 4);
-    double CV_DECL_ALIGNED(32) elems[2] =
+    double elems[2] =
{
ptr [ 0 ] , ptr [ 1 ]
} ;
@@ -1757,9 +1759,9 @@ inline v_float64x2 v_cvt_f64(const v_int32x4& a)
inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
{
-    double CV_DECL_ALIGNED(32) ptr[4] = {0};
+    double ptr[4] = {0};
    vse64_v_f64m2(ptr, vfwcvt_f_x_v_f64m2(a, 4), 4);
-    double CV_DECL_ALIGNED(32) elems[2] =
+    double elems[2] =
{
ptr [ 2 ] , ptr [ 3 ]
} ;
@@ -1768,9 +1770,9 @@ inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
inline v_float64x2 v_cvt_f64(const v_float32x4& a)
{
-    double CV_DECL_ALIGNED(32) ptr[4] = {0};
+    double ptr[4] = {0};
    vse64_v_f64m2(ptr, vfwcvt_f_f_v_f64m2(a, 4), 4);
-    double CV_DECL_ALIGNED(32) elems[2] =
+    double elems[2] =
{
ptr [ 0 ] , ptr [ 1 ]
} ;
@@ -1779,9 +1781,9 @@ inline v_float64x2 v_cvt_f64(const v_float32x4& a)
inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
{
-    double CV_DECL_ALIGNED(32) ptr[4] = {0};
+    double ptr[4] = {0};
    vse64_v_f64m2(ptr, vfwcvt_f_f_v_f64m2(a, 4), 4);
-    double CV_DECL_ALIGNED(32) elems[2] =
+    double elems[2] =
{
ptr [ 2 ] , ptr [ 3 ]
} ;
@@ -1823,7 +1825,7 @@ inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
                           v_##_Tpvec& b0, v_##_Tpvec& b1, \
                           v_##_Tpvec& b2, v_##_Tpvec& b3) \
{ \
-    _Tp CV_DECL_ALIGNED(32) elems0[4] = \
+    _Tp elems0[4] = \
{ \
v_extract_n < 0 > ( a0 ) , \
v_extract_n < 0 > ( a1 ) , \
@@ -1831,7 +1833,7 @@ inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
        v_extract_n<0>(a3) \
    }; \
    b0 = v_load(elems0); \
-    _Tp CV_DECL_ALIGNED(32) elems1[4] = \
+    _Tp elems1[4] = \
{ \
v_extract_n < 1 > ( a0 ) , \
v_extract_n < 1 > ( a1 ) , \
@@ -1839,7 +1841,7 @@ inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
        v_extract_n<1>(a3) \
    }; \
    b1 = v_load(elems1); \
-    _Tp CV_DECL_ALIGNED(32) elems2[4] = \
+    _Tp elems2[4] = \
{ \
v_extract_n < 2 > ( a0 ) , \
v_extract_n < 2 > ( a1 ) , \
@@ -1847,7 +1849,7 @@ inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
        v_extract_n<2>(a3) \
    }; \
    b2 = v_load(elems2); \
-    _Tp CV_DECL_ALIGNED(32) elems3[4] = \
+    _Tp elems3[4] = \
{ \
v_extract_n < 3 > ( a0 ) , \
v_extract_n < 3 > ( a1 ) , \
@@ -1866,8 +1868,8 @@ OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(float32x4, float, f32)
#define OPENCV_HAL_IMPL_RVV_REVERSE(_Tpvec, _Tp, suffix) \
inline _Tpvec v_reverse(const _Tpvec& a) \
{ \
-    _Tp CV_DECL_ALIGNED(32) ptr[_Tpvec::nlanes] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptra[_Tpvec::nlanes] = {0}; \
+    _Tp ptr[_Tpvec::nlanes] = {0}; \
+    _Tp ptra[_Tpvec::nlanes] = {0}; \
v_store ( ptra , a ) ; \
for ( int i = 0 ; i < _Tpvec : : nlanes ; i + + ) \
{ \
@@ -1894,8 +1896,8 @@ OPENCV_HAL_IMPL_RVV_REVERSE(v_float64x2, double, f64)
#define OPENCV_HAL_IMPL_RVV_EXPAND(_Tpwvec, _Tp, _Tpvec, width, suffix, wcvt, vl) \
inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
{ \
-    _Tp CV_DECL_ALIGNED(32) lptr[_Tpvec::nlanes/2] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) hptr[_Tpvec::nlanes/2] = {0}; \
+    _Tp lptr[_Tpvec::nlanes/2] = {0}; \
+    _Tp hptr[_Tpvec::nlanes/2] = {0}; \
v_store_low ( lptr , a ) ; \
v_store_high ( hptr , a ) ; \
b0 = _Tpwvec ( wcvt ( vle # # width # # _v_ # # suffix # # mf2 ( lptr , vl ) , vl ) ) ; \
@@ -1903,13 +1905,13 @@ inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
} \
inline _Tpwvec v_expand_low(const _Tpvec& a) \
{ \
-    _Tp CV_DECL_ALIGNED(32) lptr[_Tpvec::nlanes/2] = {0}; \
+    _Tp lptr[_Tpvec::nlanes/2] = {0}; \
    v_store_low(lptr, a); \
    return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(lptr, vl), vl)); \
} \
inline _Tpwvec v_expand_high(const _Tpvec& a) \
{ \
-    _Tp CV_DECL_ALIGNED(32) hptr[_Tpvec::nlanes/2] = {0}; \
+    _Tp hptr[_Tpvec::nlanes/2] = {0}; \
v_store_high ( hptr , a ) ; \
return _Tpwvec ( wcvt ( vle # # width # # _v_ # # suffix # # mf2 ( hptr , vl ) , vl ) ) ; \
} \
@@ -1936,25 +1938,25 @@ inline v_int32x4 v_load_expand_q(const schar* ptr)
}
-#define OPENCV_HAL_IMPL_RVV_PACK(_Tpvec, _Tp, _wTpvec, _wTp, width, suffix, rshr, shr, hvl, vl) \
+#define OPENCV_HAL_IMPL_RVV_PACK(_Tpvec, _Tp, _wTpvec, _wTp, hwidth, width, hsuffix, suffix, rshr, shr, hvl, vl) \
inline _Tpvec v_pack(const _wTpvec& a, const _wTpvec& b) \
{ \
-    _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
+    _wTp arr[_Tpvec::nlanes] = {0}; \
    v_store(arr, a); \
    v_store(arr + _wTpvec::nlanes, b); \
    return _Tpvec(shr(vle##width##_v_##suffix##m2(arr, vl), 0, vl)); \
} \
inline void v_pack_store(_Tp* ptr, const _wTpvec& a) \
{ \
-    _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
+    _wTp arr[_Tpvec::nlanes] = {0}; \
    v_store(arr, a); \
    v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, hvl))); \
-    v_store(ptr, _Tpvec(shr(vle##width##_v_##suffix##m2(arr, vl), 0, vl))); \
+    vse##hwidth##_v_##hsuffix##m1(ptr, shr(vle##width##_v_##suffix##m2(arr, vl), 0, vl), hvl); \
} \
template<int n> inline \
_Tpvec v_rshr_pack(const _wTpvec& a, const _wTpvec& b) \
{ \
-    _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
+    _wTp arr[_Tpvec::nlanes] = {0}; \
v_store ( arr , a ) ; \
v_store ( arr + _wTpvec : : nlanes , b ) ; \
return _Tpvec ( rshr ( vle # # width # # _v_ # # suffix # # m2 ( arr , vl ) , n , vl ) ) ; \
@@ -1962,39 +1964,39 @@ _Tpvec v_rshr_pack(const _wTpvec& a, const _wTpvec& b) \
template<int n> inline \
void v_rshr_pack_store(_Tp* ptr, const _wTpvec& a) \
{ \
-    _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
+    _wTp arr[_Tpvec::nlanes] = {0}; \
    v_store(arr, a); \
    v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, hvl))); \
    v_store(ptr, _Tpvec(rshr(vle##width##_v_##suffix##m2(arr, vl), n, vl))); \
}
-OPENCV_HAL_IMPL_RVV_PACK(v_uint8x16, uchar, v_uint16x8, ushort, 16, u16, vnclipu_wx_u8m1, vnclipu_wx_u8m1, 8, 16)
-OPENCV_HAL_IMPL_RVV_PACK(v_int8x16, schar, v_int16x8, short, 16, i16, vnclip_wx_i8m1, vnclip_wx_i8m1, 8, 16)
-OPENCV_HAL_IMPL_RVV_PACK(v_uint16x8, ushort, v_uint32x4, unsigned, 32, u32, vnclipu_wx_u16m1, vnclipu_wx_u16m1, 4, 8)
-OPENCV_HAL_IMPL_RVV_PACK(v_int16x8, short, v_int32x4, int, 32, i32, vnclip_wx_i16m1, vnclip_wx_i16m1, 4, 8)
-OPENCV_HAL_IMPL_RVV_PACK(v_uint32x4, unsigned, v_uint64x2, uint64, 64, u64, vnclipu_wx_u32m1, vnsrl_wx_u32m1, 2, 4)
-OPENCV_HAL_IMPL_RVV_PACK(v_int32x4, int, v_int64x2, int64, 64, i64, vnclip_wx_i32m1, vnsra_wx_i32m1, 2, 4)
+OPENCV_HAL_IMPL_RVV_PACK(v_uint8x16, uchar, v_uint16x8, ushort, 8, 16, u8, u16, vnclipu_wx_u8m1, vnclipu_wx_u8m1, 8, 16)
+OPENCV_HAL_IMPL_RVV_PACK(v_int8x16, schar, v_int16x8, short, 8, 16, i8, i16, vnclip_wx_i8m1, vnclip_wx_i8m1, 8, 16)
+OPENCV_HAL_IMPL_RVV_PACK(v_uint16x8, ushort, v_uint32x4, unsigned, 16, 32, u16, u32, vnclipu_wx_u16m1, vnclipu_wx_u16m1, 4, 8)
+OPENCV_HAL_IMPL_RVV_PACK(v_int16x8, short, v_int32x4, int, 16, 32, i16, i32, vnclip_wx_i16m1, vnclip_wx_i16m1, 4, 8)
+OPENCV_HAL_IMPL_RVV_PACK(v_uint32x4, unsigned, v_uint64x2, uint64, 32, 64, u32, u64, vnclipu_wx_u32m1, vnsrl_wx_u32m1, 2, 4)
+OPENCV_HAL_IMPL_RVV_PACK(v_int32x4, int, v_int64x2, int64, 32, 64, i32, i64, vnclip_wx_i32m1, vnsra_wx_i32m1, 2, 4)
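
The extra hwidth/hsuffix parameters let v_pack_store write its result with vse##hwidth##_v_##hsuffix##m1 over hvl lanes. Before this change it went through v_store(ptr, ...), which writes a full narrow vector (e.g. 16 uchars for the u16 -> u8 case) even though only the first _wTpvec::nlanes results are meaningful, clobbering whatever follows the destination. A test-style sketch of the intended behaviour (illustrative only, not from the patch):

// v_pack_store for v_uint16x8 -> uchar should touch exactly 8 bytes of 'dst'.
uchar dst[9];
dst[8] = 0xAA;                       // guard byte just past the real destination
v_uint16x8 a = v_setall_u16(300);    // 300 saturates to 255 when packed to u8
v_pack_store(dst, a);
CV_Assert(dst[0] == 255 && dst[8] == 0xAA);  // results written, guard untouched
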
-#define OPENCV_HAL_IMPL_RVV_PACK_U(_Tpvec, _Tp, _wTpvec, _wTp, width, suffix, rshr, cast, vl) \
+#define OPENCV_HAL_IMPL_RVV_PACK_U(_Tpvec, _Tp, _wTpvec, _wTp, hwidth, width, hsuffix, suffix, rshr, cast, hvl, vl) \
inline _Tpvec v_pack_u(const _wTpvec& a, const _wTpvec& b) \
{ \
-    _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
+    _wTp arr[_Tpvec::nlanes] = {0}; \
    v_store(arr, a); \
    v_store(arr + _wTpvec::nlanes, b); \
    return _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr, vl), 0, vl)), 0, vl)); \
} \
inline void v_pack_u_store(_Tp* ptr, const _wTpvec& a) \
{ \
-    _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
+    _wTp arr[_Tpvec::nlanes] = {0}; \
    v_store(arr, a); \
-    v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, vl))); \
-    v_store(ptr, _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr, vl), 0, vl)), 0, vl))); \
+    v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, hvl))); \
+    vse##hwidth##_v_##hsuffix##m1(ptr, rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr, vl), 0, vl)), 0, vl), hvl); \
} \
template<int n> inline \
_Tpvec v_rshr_pack_u(const _wTpvec& a, const _wTpvec& b) \
{ \
-    _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
+    _wTp arr[_Tpvec::nlanes] = {0}; \
v_store ( arr , a ) ; \
v_store ( arr + _wTpvec : : nlanes , b ) ; \
return _Tpvec ( rshr ( cast ( vmax_vx_ # # suffix # # m2 ( vle # # width # # _v_ # # suffix # # m2 ( arr , vl ) , 0 , vl ) ) , n , vl ) ) ; \
@@ -2002,23 +2004,23 @@ _Tpvec v_rshr_pack_u(const _wTpvec& a, const _wTpvec& b) \
template<int n> inline \
void v_rshr_pack_u_store(_Tp* ptr, const _wTpvec& a) \
{ \
-    _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
+    _wTp arr[_Tpvec::nlanes] = {0}; \
    v_store(arr, a); \
-    v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, vl))); \
+    v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, hvl))); \
    v_store(ptr, _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr, vl), 0, vl)), n, vl))); \
}
-OPENCV_HAL_IMPL_RVV_PACK_U(v_uint8x16, uchar, v_int16x8, short, 16, i16, vnclipu_wx_u8m1, vreinterpret_v_i16m2_u16m2, 16)
-OPENCV_HAL_IMPL_RVV_PACK_U(v_uint16x8, ushort, v_int32x4, int, 32, i32, vnclipu_wx_u16m1, vreinterpret_v_i32m2_u32m2, 8)
+OPENCV_HAL_IMPL_RVV_PACK_U(v_uint8x16, uchar, v_int16x8, short, 8, 16, u8, i16, vnclipu_wx_u8m1, vreinterpret_v_i16m2_u16m2, 8, 16)
+OPENCV_HAL_IMPL_RVV_PACK_U(v_uint16x8, ushort, v_int32x4, int, 16, 32, u16, i32, vnclipu_wx_u16m1, vreinterpret_v_i32m2_u32m2, 4, 8)
#define OPENCV_HAL_IMPL_RVV_UNPACKS(_Tpvec, _Tp, suffix) \
inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
{ \
-    _Tp CV_DECL_ALIGNED(32) ptra0[v_##_Tpvec::nlanes] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptra1[v_##_Tpvec::nlanes] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptrb0[v_##_Tpvec::nlanes] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptrb1[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptra0[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptra1[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrb0[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrb1[v_##_Tpvec::nlanes] = {0}; \
v_store ( ptra0 , a0 ) ; \
v_store ( ptra1 , a1 ) ; \
int i ; \
@@ -2037,16 +2039,16 @@ inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_
} \
inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
-    _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes/2] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes/2] = {0}; \
+    _Tp ptra[v_##_Tpvec::nlanes/2] = {0}; \
+    _Tp ptrb[v_##_Tpvec::nlanes/2] = {0}; \
    v_store_low(ptra, a); \
    v_store_low(ptrb, b); \
    return v_load_halves(ptra, ptrb); \
} \
inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
-    _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes/2] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes/2] = {0}; \
+    _Tp ptra[v_##_Tpvec::nlanes/2] = {0}; \
+    _Tp ptrb[v_##_Tpvec::nlanes/2] = {0}; \
v_store_high ( ptra , a ) ; \
v_store_high ( ptrb , b ) ; \
return v_load_halves ( ptra , ptrb ) ; \
@@ -2072,8 +2074,8 @@ OPENCV_HAL_IMPL_RVV_UNPACKS(float64x2, double, f64)
#define OPENCV_HAL_IMPL_RVV_INTERLEAVED(_Tpvec, _Tp) \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
{ \
-    _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
int i , i2 ; \
for ( i = i2 = 0 ; i < v_ # # _Tpvec : : nlanes ; i + + , i2 + = 2 ) \
{ \
@@ -2085,9 +2087,9 @@ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
{ \
-    _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptrc[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrc[v_##_Tpvec::nlanes] = {0}; \
int i , i3 ; \
for ( i = i3 = 0 ; i < v_ # # _Tpvec : : nlanes ; i + + , i3 + = 3 ) \
{ \
@@ -2102,10 +2104,10 @@ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
                                v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
-    _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptrc[v_##_Tpvec::nlanes] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptrd[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrc[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrd[v_##_Tpvec::nlanes] = {0}; \
int i , i4 ; \
for ( i = i4 = 0 ; i < v_ # # _Tpvec : : nlanes ; i + + , i4 + = 4 ) \
{ \
@@ -2123,8 +2125,8 @@ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec&
                                hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) \
{ \
    int i, i2; \
-    _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
v_store ( ptra , a ) ; \
v_store ( ptrb , b ) ; \
for ( i = i2 = 0 ; i < v_ # # _Tpvec : : nlanes ; i + + , i2 + = 2 ) \
@@ -2137,9 +2139,9 @@ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec&
                                const v_##_Tpvec& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) \
{ \
    int i, i3; \
-    _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptrc[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrc[v_##_Tpvec::nlanes] = {0}; \
v_store ( ptra , a ) ; \
v_store ( ptrb , b ) ; \
v_store ( ptrc , c ) ; \
@@ -2155,10 +2157,10 @@ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec&
                                hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) \
{ \
    int i, i4; \
-    _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptrc[v_##_Tpvec::nlanes] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptrd[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrc[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrd[v_##_Tpvec::nlanes] = {0}; \
v_store ( ptra , a ) ; \
v_store ( ptrb , b ) ; \
v_store ( ptrc , c ) ; \
@@ -2173,8 +2175,8 @@ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec&
} \
inline v_##_Tpvec v_interleave_pairs(const v_##_Tpvec& vec) \
{ \
-    _Tp CV_DECL_ALIGNED(32) ptr[v_##_Tpvec::nlanes] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptrvec[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptr[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrvec[v_##_Tpvec::nlanes] = {0}; \
v_store ( ptrvec , vec ) ; \
for ( int i = 0 ; i < v_ # # _Tpvec : : nlanes / 4 ; i + + ) \
{ \
@@ -2187,8 +2189,8 @@ inline v_##_Tpvec v_interleave_pairs(const v_##_Tpvec& vec) \
} \
inline v_##_Tpvec v_interleave_quads(const v_##_Tpvec& vec) \
{ \
-    _Tp CV_DECL_ALIGNED(32) ptr[v_##_Tpvec::nlanes] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptrvec[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptr[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrvec[v_##_Tpvec::nlanes] = {0}; \
v_store ( ptrvec , vec ) ; \
for ( int i = 0 ; i < v_ # # _Tpvec : : nlanes / 8 ; i + + ) \
{ \
@@ -2242,9 +2244,9 @@ static const unsigned char popCountTable[] =
#define OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(_rTpvec, _Tpvec, _rTp, _Tp, suffix) \
inline _rTpvec v_popcount(const _Tpvec& a) \
{ \
-    uchar CV_DECL_ALIGNED(32) ptra[16] = {0}; \
+    uchar ptra[16] = {0}; \
    v_store(ptra, v_reinterpret_as_u8(a)); \
-    _rTp CV_DECL_ALIGNED(32) ptr[_Tpvec::nlanes] = {0}; \
+    _rTp ptr[_Tpvec::nlanes] = {0}; \
v_store ( ptr , v_setzero_ # # suffix ( ) ) ; \
for ( int i = 0 ; i < _Tpvec : : nlanes * ( int ) sizeof ( _Tp ) ; i + + ) \
ptr [ i / sizeof ( _Tp ) ] + = popCountTable [ ptra [ i ] ] ; \
@@ -2298,7 +2300,7 @@ inline int v_signmask(const v_float64x2& a)
#define OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(_Tpvec, _Tp, suffix) \
inline int v_scan_forward(const _Tpvec& a) \
{ \
-    _Tp CV_DECL_ALIGNED(32) ptr[_Tpvec::nlanes] = {0}; \
+    _Tp ptr[_Tpvec::nlanes] = {0}; \
v_store ( ptr , v_reinterpret_as_ # # suffix ( a ) ) ; \
for ( int i = 0 ; i < _Tpvec : : nlanes ; i + + ) \
if ( int ( ptr [ i ] ) < 0 ) \
@@ -2321,28 +2323,29 @@ OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_float64x2, double, f64)
//////////// Pack triplets ////////////
-#define OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(_Tpvec, _Tp) \
-inline _Tpvec v_pack_triplets(const _Tpvec& vec) \
-{ \
-    _Tp CV_DECL_ALIGNED(32) ptr[_Tpvec::nlanes] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptrvec[_Tpvec::nlanes] = {0}; \
-    v_store(ptrvec, vec); \
-    for (int i = 0; i < _Tpvec::nlanes/4; i++) \
-    { \
-        ptr[3*i    ] = ptrvec[4*i    ]; \
-        ptr[3*i + 1] = ptrvec[4*i + 2]; \
-        ptr[3*i + 2] = ptrvec[4*i + 2]; \
-    } \
-    return v_load(ptr); \
+inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
+{
+    uint64 ptr[2] = {0x0908060504020100, 0xFFFFFFFF0E0D0C0A};
+    return v_int8x16((vint8m1_t)vrgather_vv_u8m1((vuint8m1_t)vint8m1_t(vec), (vuint8m1_t)vle64_v_u64m1(ptr, 2), 16));
+}
+inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec)
+{
+    return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec)));
+}
+inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
+{
+    uint64 ptr[2] = {0x0908050403020100, 0xFFFFFFFF0D0C0B0A};
+    return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)vint16m1_t(vec), (vuint8m1_t)vle64_v_u64m1(ptr, 2), 16));
+}
+inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec)
+{
+    return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec)));
+}
-OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint8x16, uchar)
-OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int8x16, schar)
-OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint16x8, ushort)
-OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int16x8, short)
-OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint32x4, unsigned)
-OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int32x4, int)
-OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_float32x4, float)
+inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
+inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
+inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
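
The new 8- and 16-bit v_pack_triplets implementations replace the scalar copy loop with a single vrgather byte shuffle. Read little-endian, the 8-bit index constant is the byte sequence 0,1,2, 4,5,6, 8,9,10, 12,13,14 followed by four 0xFF entries, i.e. it keeps the first three lanes of every group of four and pushes the dropped lanes into the unspecified tail; the 16-bit variant uses the analogous byte-pair table. A scalar model of the 8-bit gather, for reference only:

// Reference-only model of the vrgather index table above (8-bit case).
static void pack_triplets_u8_ref(const uchar src[16], uchar dst[16])
{
    static const uchar idx[16] = { 0,1,2, 4,5,6, 8,9,10, 12,13,14, 255,255,255,255 };
    for (int i = 0; i < 16; i++)
        dst[i] = (idx[i] < 16) ? src[idx[i]] : 0;  // vrgather yields 0 for out-of-range indices
}
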
////// FP16 support ///////
@@ -2443,7 +2446,7 @@ inline v_int32x4 v_trunc(const v_float64x2& a)
// 16 >> 32
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
-    int CV_DECL_ALIGNED(32) ptr[8] = {0};
+    int ptr[8] = {0};
v_int32x4 t1 , t2 ;
vse32_v_i32m2 ( ptr , vwmul_vv_i32m2 ( a , b , 8 ) , 8 ) ;
v_load_deinterleave ( ptr , t1 , t2 ) ;
@@ -2451,7 +2454,7 @@ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
}
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
-    int CV_DECL_ALIGNED(32) ptr[8] = {0};
+    int ptr[8] = {0};
v_int32x4 t1 , t2 ;
vse32_v_i32m2 ( ptr , vwmul_vv_i32m2 ( a , b , 8 ) , 8 ) ;
v_load_deinterleave ( ptr , t1 , t2 ) ;
@@ -2461,7 +2464,7 @@ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32
// 32 >> 64
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
{
-    int64 CV_DECL_ALIGNED(32) ptr[4] = {0};
+    int64 ptr[4] = {0};
v_int64x2 t1 , t2 ;
vse64_v_i64m2 ( ptr , vwmul_vv_i64m2 ( a , b , 4 ) , 4 ) ;
v_load_deinterleave ( ptr , t1 , t2 ) ;
@@ -2469,7 +2472,7 @@ inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
}
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{
-    int64 CV_DECL_ALIGNED(32) ptr[4] = {0};
+    int64 ptr[4] = {0};
v_int64x2 t1 , t2 ;
vse64_v_i64m2 ( ptr , vwmul_vv_i64m2 ( a , b , 4 ) , 4 ) ;
v_load_deinterleave ( ptr , t1 , t2 ) ;
@@ -2479,7 +2482,7 @@ inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64
// 8 >> 32
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
{
-    unsigned CV_DECL_ALIGNED(32) ptr[16] = {0};
+    unsigned ptr[16] = {0};
v_uint32x4 t1 , t2 , t3 , t4 ;
vse32_v_u32m4 ( ptr , vwcvtu_x_x_v_u32m4 ( vwmulu_vv_u16m2 ( a , b , 16 ) , 16 ) , 16 ) ;
v_load_deinterleave ( ptr , t1 , t2 , t3 , t4 ) ;
@@ -2488,7 +2491,7 @@ inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
                                   const v_uint32x4& c)
{
-    unsigned CV_DECL_ALIGNED(32) ptr[16] = {0};
+    unsigned ptr[16] = {0};
v_uint32x4 t1 , t2 , t3 , t4 ;
vse32_v_u32m4 ( ptr , vwcvtu_x_x_v_u32m4 ( vwmulu_vv_u16m2 ( a , b , 16 ) , 16 ) , 16 ) ;
v_load_deinterleave ( ptr , t1 , t2 , t3 , t4 ) ;
@@ -2497,7 +2500,7 @@ inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
{
-    int CV_DECL_ALIGNED(32) ptr[16] = {0};
+    int ptr[16] = {0};
v_int32x4 t1 , t2 , t3 , t4 ;
vse32_v_i32m4 ( ptr , vwcvt_x_x_v_i32m4 ( vwmul_vv_i16m2 ( a , b , 16 ) , 16 ) , 16 ) ;
v_load_deinterleave ( ptr , t1 , t2 , t3 , t4 ) ;
@@ -2506,7 +2509,7 @@ inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
                                  const v_int32x4& c)
{
-    int CV_DECL_ALIGNED(32) ptr[16] = {0};
+    int ptr[16] = {0};
v_int32x4 t1 , t2 , t3 , t4 ;
vse32_v_i32m4 ( ptr , vwcvt_x_x_v_i32m4 ( vwmul_vv_i16m2 ( a , b , 16 ) , 16 ) , 16 ) ;
v_load_deinterleave ( ptr , t1 , t2 , t3 , t4 ) ;
@@ -2516,7 +2519,7 @@ inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
// 16 >> 64
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
{
-    uint64 CV_DECL_ALIGNED(32) ptr[8] = {0};
+    uint64 ptr[8] = {0};
v_uint64x2 t1 , t2 , t3 , t4 ;
vse64_v_u64m4 ( ptr , vwcvtu_x_x_v_u64m4 ( vwmulu_vv_u32m2 ( a , b , 8 ) , 8 ) , 8 ) ;
v_load_deinterleave ( ptr , t1 , t2 , t3 , t4 ) ;
@@ -2524,7 +2527,7 @@ inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
}
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{
-    uint64 CV_DECL_ALIGNED(32) ptr[8] = {0};
+    uint64 ptr[8] = {0};
v_uint64x2 t1 , t2 , t3 , t4 ;
vse64_v_u64m4 ( ptr , vwcvtu_x_x_v_u64m4 ( vwmulu_vv_u32m2 ( a , b , 8 ) , 8 ) , 8 ) ;
v_load_deinterleave ( ptr , t1 , t2 , t3 , t4 ) ;
@@ -2533,7 +2536,7 @@ inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, con
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
{
-    int64 CV_DECL_ALIGNED(32) ptr[8] = {0};
+    int64 ptr[8] = {0};
v_int64x2 t1 , t2 , t3 , t4 ;
vse64_v_i64m4 ( ptr , vwcvt_x_x_v_i64m4 ( vwmul_vv_i32m2 ( a , b , 8 ) , 8 ) , 8 ) ;
v_load_deinterleave ( ptr , t1 , t2 , t3 , t4 ) ;
@@ -2542,7 +2545,7 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b,
                                  const v_int64x2& c)
{
-    int64 CV_DECL_ALIGNED(32) ptr[8] = {0};
+    int64 ptr[8] = {0};
v_int64x2 t1 , t2 , t3 , t4 ;
vse64_v_i64m4 ( ptr , vwcvt_x_x_v_i64m4 ( vwmul_vv_i32m2 ( a , b , 8 ) , 8 ) , 8 ) ;
v_load_deinterleave ( ptr , t1 , t2 , t3 , t4 ) ;
@@ -2563,7 +2566,7 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b,
// 16 >> 32
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
{
-    int CV_DECL_ALIGNED(32) ptr[8] = {0};
+    int ptr[8] = {0};
vse32_v_i32m2 ( ptr , vwmul_vv_i32m2 ( a , b , 8 ) , 8 ) ;
v_int32x4 t1 = v_load ( ptr ) ;
v_int32x4 t2 = v_load ( ptr + 4 ) ;
@@ -2571,7 +2574,7 @@ inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
}
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
-    int CV_DECL_ALIGNED(32) ptr[8] = {0};
+    int ptr[8] = {0};
vse32_v_i32m2 ( ptr , vwmul_vv_i32m2 ( a , b , 8 ) , 8 ) ;
v_int32x4 t1 = v_load ( ptr ) ;
v_int32x4 t2 = v_load ( ptr + 4 ) ;
@@ -2581,7 +2584,7 @@ inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_
// 32 >> 64
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
{
-    int64 CV_DECL_ALIGNED(32) ptr[4] = {0};
+    int64 ptr[4] = {0};
vse64_v_i64m2 ( ptr , vwmul_vv_i64m2 ( a , b , 4 ) , 4 ) ;
v_int64x2 t1 = v_load ( ptr ) ;
v_int64x2 t2 = v_load ( ptr + 2 ) ;
@@ -2589,7 +2592,7 @@ inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
}
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{
-    int64 CV_DECL_ALIGNED(32) ptr[4] = {0};
+    int64 ptr[4] = {0};
vse64_v_i64m2 ( ptr , vwmul_vv_i64m2 ( a , b , 4 ) , 4 ) ;
v_int64x2 t1 = v_load ( ptr ) ;
v_int64x2 t2 = v_load ( ptr + 2 ) ;
@@ -2600,7 +2603,7 @@ inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_
// 8 >> 32
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
{
-    unsigned CV_DECL_ALIGNED(32) ptr[16] = {0};
+    unsigned ptr[16] = {0};
vse32_v_u32m4 ( ptr , vwcvtu_x_x_v_u32m4 ( vwmulu_vv_u16m2 ( a , b , 16 ) , 16 ) , 16 ) ;
v_uint32x4 t1 = v_load ( ptr ) ;
v_uint32x4 t2 = v_load ( ptr + 4 ) ;
@@ -2610,7 +2613,7 @@ inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b
}
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{
-    unsigned CV_DECL_ALIGNED(32) ptr[16] = {0};
+    unsigned ptr[16] = {0};
vse32_v_u32m4 ( ptr , vwcvtu_x_x_v_u32m4 ( vwmulu_vv_u16m2 ( a , b , 16 ) , 16 ) , 16 ) ;
v_uint32x4 t1 = v_load ( ptr ) ;
v_uint32x4 t2 = v_load ( ptr + 4 ) ;
@@ -2620,7 +2623,7 @@ inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b
}
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
{
-    int CV_DECL_ALIGNED(32) ptr[16] = {0};
+    int ptr[16] = {0};
vse32_v_i32m4 ( ptr , vwcvt_x_x_v_i32m4 ( vwmul_vv_i16m2 ( a , b , 16 ) , 16 ) , 16 ) ;
v_int32x4 t1 = v_load ( ptr ) ;
v_int32x4 t2 = v_load ( ptr + 4 ) ;
@@ -2630,7 +2633,7 @@ inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
}
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{
-    int CV_DECL_ALIGNED(32) ptr[16] = {0};
+    int ptr[16] = {0};
vse32_v_i32m4 ( ptr , vwcvt_x_x_v_i32m4 ( vwmul_vv_i16m2 ( a , b , 16 ) , 16 ) , 16 ) ;
v_int32x4 t1 = v_load ( ptr ) ;
v_int32x4 t2 = v_load ( ptr + 4 ) ;
@@ -2642,7 +2645,7 @@ inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, c
// 16 >> 64
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
{
-    uint64 CV_DECL_ALIGNED(32) ptr[8] = {0};
+    uint64 ptr[8] = {0};
vse64_v_u64m4 ( ptr , vwcvtu_x_x_v_u64m4 ( vwmulu_vv_u32m2 ( a , b , 8 ) , 8 ) , 8 ) ;
v_uint64x2 t1 = v_load ( ptr ) ;
v_uint64x2 t2 = v_load ( ptr + 2 ) ;
@@ -2652,7 +2655,7 @@ inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b
}
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{
-    uint64 CV_DECL_ALIGNED(32) ptr[8] = {0};
+    uint64 ptr[8] = {0};
vse64_v_u64m4 ( ptr , vwcvtu_x_x_v_u64m4 ( vwmulu_vv_u32m2 ( a , b , 8 ) , 8 ) , 8 ) ;
v_uint64x2 t1 = v_load ( ptr ) ;
v_uint64x2 t2 = v_load ( ptr + 2 ) ;
@@ -2662,7 +2665,7 @@ inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b
}
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
{
-    int64 CV_DECL_ALIGNED(32) ptr[8] = {0};
+    int64 ptr[8] = {0};
vse64_v_i64m4 ( ptr , vwcvt_x_x_v_i64m4 ( vwmul_vv_i32m2 ( a , b , 8 ) , 8 ) , 8 ) ;
v_int64x2 t1 = v_load ( ptr ) ;
v_int64x2 t2 = v_load ( ptr + 2 ) ;
@@ -2672,7 +2675,7 @@ inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
}
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{
-    int64 CV_DECL_ALIGNED(32) ptr[8] = {0};
+    int64 ptr[8] = {0};
vse64_v_i64m4 ( ptr , vwcvt_x_x_v_i64m4 ( vwmul_vv_i32m2 ( a , b , 8 ) , 8 ) , 8 ) ;
v_int64x2 t1 = v_load ( ptr ) ;
v_int64x2 t2 = v_load ( ptr + 2 ) ;
@@ -2714,7 +2717,7 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
#define OPENCV_HAL_IMPL_RVV_MUL_EXPAND(_Tpvec, _Tpwvec, _Tpw, suffix, wmul, width, vl, hvl) \
inline void v_mul_expand(const _Tpvec& a, const _Tpvec& b, _Tpwvec& c, _Tpwvec& d) \
{ \
-    _Tpw CV_DECL_ALIGNED(32) ptr[_Tpwvec::nlanes*2] = {0}; \
+    _Tpw ptr[_Tpwvec::nlanes*2] = {0}; \
vse # # width # # _v_ # # suffix # # m2 ( ptr , wmul ( a , b , vl ) , vl ) ; \
c = _Tpwvec ( vle # # width # # _v_ # # suffix # # m1 ( ptr , hvl ) ) ; \
d = _Tpwvec ( vle # # width # # _v_ # # suffix # # m1 ( ptr + _Tpwvec : : nlanes , hvl ) ) ; \