|
|
|
@ -448,29 +448,7 @@ OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float64, vfloat64m1_t, double, VTraits<v_floa |
|
|
|
|
#define OPENCV_HAL_IMPL_RVV_LUT(_Tpvec, _Tp, suffix) \ |
|
|
|
|
inline _Tpvec v_lut(const _Tp* tab, const int* idx) \
|
|
|
|
|
{ \
|
|
|
|
|
vuint32##suffix##_t vidx = vmul(vreinterpret_u32##suffix(vle32_v_i32##suffix(idx, VTraits<_Tpvec>::vlanes())), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
|
|
|
|
|
return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
|
|
|
|
|
} \
|
|
|
|
|
inline _Tpvec v_lut_pairs(const _Tp* tab, const int* idx) \
|
|
|
|
|
{ \
|
|
|
|
|
std::vector<uint> idx_; \
|
|
|
|
|
for (int i = 0; i < VTraits<v_int16>::vlanes(); ++i) { \
|
|
|
|
|
idx_.push_back(idx[i]); \
|
|
|
|
|
idx_.push_back(idx[i]+1); \
|
|
|
|
|
} \
|
|
|
|
|
vuint32##suffix##_t vidx = vmul(vle32_v_u32##suffix(idx_.data(), VTraits<_Tpvec>::vlanes()), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
|
|
|
|
|
return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
|
|
|
|
|
} \
|
|
|
|
|
inline _Tpvec v_lut_quads(const _Tp* tab, const int* idx) \
|
|
|
|
|
{ \
|
|
|
|
|
std::vector<uint> idx_; \
|
|
|
|
|
for (int i = 0; i < VTraits<v_int32>::vlanes(); ++i) { \
|
|
|
|
|
idx_.push_back(idx[i]); \
|
|
|
|
|
idx_.push_back(idx[i]+1); \
|
|
|
|
|
idx_.push_back(idx[i]+2); \
|
|
|
|
|
idx_.push_back(idx[i]+3); \
|
|
|
|
|
} \
|
|
|
|
|
vuint32##suffix##_t vidx = vmul(vle32_v_u32##suffix(idx_.data(), VTraits<_Tpvec>::vlanes()), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
|
|
|
|
|
auto vidx = vmul(vreinterpret_u32##suffix(vle32_v_i32##suffix(idx, VTraits<_Tpvec>::vlanes())), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
|
|
|
|
|
return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
|
|
|
|
|
} |
|
|
|
|
OPENCV_HAL_IMPL_RVV_LUT(v_int8, schar, m4) |
|
|
|
@ -482,6 +460,55 @@ OPENCV_HAL_IMPL_RVV_LUT(v_float32, float, m1) |
|
|
|
|
OPENCV_HAL_IMPL_RVV_LUT(v_float64, double, mf2) |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
#define OPENCV_HAL_IMPL_RVV_LUT_PAIRS(_Tpvec, _Tp, suffix1, suffix2, v_trunc) \ |
|
|
|
|
inline _Tpvec v_lut_pairs(const _Tp* tab, const int* idx) \
|
|
|
|
|
{ \
|
|
|
|
|
auto v0 = vle32_v_u32##suffix1((unsigned*)idx, VTraits<_Tpvec>::vlanes()/2); \
|
|
|
|
|
auto v1 = vadd(v0, 1, VTraits<_Tpvec>::vlanes()/2); \
|
|
|
|
|
auto w0 = vwcvtu_x(v0, VTraits<_Tpvec>::vlanes()/2); \
|
|
|
|
|
auto w1 = vwcvtu_x(v1, VTraits<_Tpvec>::vlanes()/2); \
|
|
|
|
|
auto sh1 = vslide1up(v_trunc(vreinterpret_u32##suffix2(w1)),0, VTraits<_Tpvec>::vlanes()); \
|
|
|
|
|
auto vid = vor(sh1, v_trunc(vreinterpret_u32##suffix2(w0)), VTraits<_Tpvec>::vlanes()); \
|
|
|
|
|
auto vidx = vmul(vid, sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
|
|
|
|
|
return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
|
|
|
|
|
} |
|
|
|
|
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int8, schar, m2, m4, OPENCV_HAL_NOP) |
|
|
|
|
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int16, short, m1, m2, OPENCV_HAL_NOP) |
|
|
|
|
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int32, int, mf2, m1, OPENCV_HAL_NOP) |
|
|
|
|
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_float32, float, mf2, m1, OPENCV_HAL_NOP) |
|
|
|
|
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int64, int64_t, mf2, m1, vlmul_trunc_u32mf2) |
|
|
|
|
#if CV_SIMD_SCALABLE_64F |
|
|
|
|
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_float64, double, mf2, m1, vlmul_trunc_u32mf2) |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#define OPENCV_HAL_IMPL_RVV_LUT_QUADS(_Tpvec, _Tp, suffix0, suffix1, suffix2, v_trunc) \ |
|
|
|
|
inline _Tpvec v_lut_quads(const _Tp* tab, const int* idx) \
|
|
|
|
|
{ \
|
|
|
|
|
auto v0 = vle32_v_u32##suffix0((unsigned*)idx, VTraits<_Tpvec>::vlanes()/4); \
|
|
|
|
|
auto v1 = vadd(v0, 1, VTraits<_Tpvec>::vlanes()/4); \
|
|
|
|
|
auto v2 = vadd(v0, 2, VTraits<_Tpvec>::vlanes()/4); \
|
|
|
|
|
auto v3 = vadd(v0, 3, VTraits<_Tpvec>::vlanes()/4); \
|
|
|
|
|
auto w0 = vwcvtu_x(v0, VTraits<_Tpvec>::vlanes()/4); \
|
|
|
|
|
auto w1 = vwcvtu_x(v1, VTraits<_Tpvec>::vlanes()/4); \
|
|
|
|
|
auto w2 = vwcvtu_x(v2, VTraits<_Tpvec>::vlanes()/4); \
|
|
|
|
|
auto w3 = vwcvtu_x(v3, VTraits<_Tpvec>::vlanes()/4); \
|
|
|
|
|
auto sh2 = vslide1up(vreinterpret_u32##suffix1(w2),0, VTraits<_Tpvec>::vlanes()/2); \
|
|
|
|
|
auto sh3 = vslide1up(vreinterpret_u32##suffix1(w3),0, VTraits<_Tpvec>::vlanes()/2); \
|
|
|
|
|
auto vid0 = vor(sh2, vreinterpret_u32##suffix1(w0), VTraits<_Tpvec>::vlanes()/2); \
|
|
|
|
|
auto vid1 = vor(sh3, vreinterpret_u32##suffix1(w1), VTraits<_Tpvec>::vlanes()/2); \
|
|
|
|
|
auto wid0 = vwcvtu_x(v_trunc(vid0), VTraits<_Tpvec>::vlanes()/2); \
|
|
|
|
|
auto wid1 = vwcvtu_x(v_trunc(vid1), VTraits<_Tpvec>::vlanes()/2); \
|
|
|
|
|
auto shwid1 = vslide1up(vreinterpret_u32##suffix2(wid1),0, VTraits<_Tpvec>::vlanes()); \
|
|
|
|
|
auto vid = vor(shwid1, vreinterpret_u32##suffix2(wid0), VTraits<_Tpvec>::vlanes()); \
|
|
|
|
|
auto vidx = vmul(vid, sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
|
|
|
|
|
return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
|
|
|
|
|
} |
|
|
|
|
OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int8, schar, m1, m2, m4, OPENCV_HAL_NOP) |
|
|
|
|
OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int16, short, mf2 , m1, m2, OPENCV_HAL_NOP) |
|
|
|
|
OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int32, int, mf2, m1, m1, vlmul_trunc_u32mf2) |
|
|
|
|
OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_float32, float, mf2, m1, m1, vlmul_trunc_u32mf2) |
|
|
|
|
|
|
|
|
|
#define OPENCV_HAL_IMPL_RVV_LUT_VEC(_Tpvec, _Tp) \ |
|
|
|
|
inline _Tpvec v_lut(const _Tp* tab, const v_int32& vidx) \
|
|
|
|
|
{ \
|
|
|
|
@ -512,7 +539,6 @@ inline v_uint32 v_lut_pairs(const unsigned* tab, const int* idx) { return v_rein |
|
|
|
|
// Unsigned 32-bit v_lut_quads: views the table as int, forwards to the
// v_lut_quads overload taking an int table, and reinterprets the result as
// v_uint32.
inline v_uint32 v_lut_quads(const unsigned* tab, const int* idx)
{
    const int* signed_tab = reinterpret_cast<const int*>(tab);
    return v_reinterpret_as_u32(v_lut_quads(signed_tab, idx));
}
|
|
|
|
// Unsigned 64-bit v_lut: views the table as int64_t, forwards to the v_lut
// overload taking an int64_t table, and reinterprets the result as v_uint64.
inline v_uint64 v_lut(const uint64* tab, const int* idx)
{
    const int64_t* signed_tab = reinterpret_cast<const int64_t*>(tab);
    return v_reinterpret_as_u64(v_lut(signed_tab, idx));
}
|
|
|
|
// Unsigned 64-bit v_lut_pairs: views the table as int64_t, forwards to the
// v_lut_pairs overload taking an int64_t table, and reinterprets the result as
// v_uint64.
inline v_uint64 v_lut_pairs(const uint64* tab, const int* idx)
{
    const int64_t* signed_tab = reinterpret_cast<const int64_t*>(tab);
    return v_reinterpret_as_u64(v_lut_pairs(signed_tab, idx));
}
|
|
|
|
// Unsigned 64-bit v_lut_quads: views the table as int64_t, forwards to the
// v_lut_quads overload taking an int64_t table, and reinterprets the result as
// v_uint64.
// NOTE(review): the OPENCV_HAL_IMPL_RVV_LUT_QUADS uses visible above do not
// cover v_int64 - confirm an int64_t v_lut_quads overload is still generated.
inline v_uint64 v_lut_quads(const uint64* tab, const int* idx)
{
    const int64_t* signed_tab = reinterpret_cast<const int64_t*>(tab);
    return v_reinterpret_as_u64(v_lut_quads(signed_tab, idx));
}
|
|
|
|
|
|
|
|
|
////////////// Pack boolean ////////////////////
|
|
|
|
|
inline v_uint8 v_pack_b(const v_uint16& a, const v_uint16& b) |
|
|
|
|