Merge pull request #26454 from fengyuentau/imgproc:update_warp_c4_kernels

imgproc: fix perf regressions on the c4 kernels of warpAffine / warpPerspective / remap #26454

## Performance

Previous performance regressions on c4 kernels were observed mainly on A311D: https://github.com/opencv/opencv/pull/26348.
Regressions on c3 kernels on the Intel platform will be fixed in a separate pull request.

M2:

```
Geometric mean (ms)

                                      Name of Test                                        base  patch   patch
                                                                                                          vs
                                                                                                         base
                                                                                                      (x-factor)
WarpAffine::TestWarpAffine::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC4)               0.338 0.163    2.08
WarpAffine::TestWarpAffine::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC4)               0.310 0.107    2.90
WarpAffine::TestWarpAffine::(640x480, INTER_LINEAR, BORDER_REPLICATE, 16UC4)              0.344 0.162    2.13
WarpAffine::TestWarpAffine::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC4)              0.313 0.111    2.83
WarpAffine::TestWarpAffine::(1280x720, INTER_LINEAR, BORDER_CONSTANT, 16UC4)              0.676 0.333    2.03
WarpAffine::TestWarpAffine::(1280x720, INTER_LINEAR, BORDER_CONSTANT, 32FC4)              0.640 0.240    2.66
WarpAffine::TestWarpAffine::(1280x720, INTER_LINEAR, BORDER_REPLICATE, 16UC4)             1.212 0.885    1.37
WarpAffine::TestWarpAffine::(1280x720, INTER_LINEAR, BORDER_REPLICATE, 32FC4)             1.153 0.756    1.53
WarpAffine::TestWarpAffine::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC4)             0.950 0.475    2.00
WarpAffine::TestWarpAffine::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC4)             1.158 0.500    2.32
WarpAffine::TestWarpAffine::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC4)            3.441 3.106    1.11
WarpAffine::TestWarpAffine::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 32FC4)            3.351 2.837    1.18
WarpPerspective::TestWarpPerspective::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC4)     0.336 0.163    2.07
WarpPerspective::TestWarpPerspective::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC4)     0.314 0.124    2.54
WarpPerspective::TestWarpPerspective::(640x480, INTER_LINEAR, BORDER_REPLICATE, 16UC4)    0.385 0.226    1.70
WarpPerspective::TestWarpPerspective::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC4)    0.364 0.183    1.99
WarpPerspective::TestWarpPerspective::(1280x720, INTER_LINEAR, BORDER_CONSTANT, 16UC4)    0.541 0.290    1.87
WarpPerspective::TestWarpPerspective::(1280x720, INTER_LINEAR, BORDER_CONSTANT, 32FC4)    0.523 0.243    2.16
WarpPerspective::TestWarpPerspective::(1280x720, INTER_LINEAR, BORDER_REPLICATE, 16UC4)   1.540 1.239    1.24
WarpPerspective::TestWarpPerspective::(1280x720, INTER_LINEAR, BORDER_REPLICATE, 32FC4)   1.504 1.134    1.33
WarpPerspective::TestWarpPerspective::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC4)   0.751 0.465    1.62
WarpPerspective::TestWarpPerspective::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC4)   0.958 0.507    1.89
WarpPerspective::TestWarpPerspective::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC4)  3.785 3.487    1.09
WarpPerspective::TestWarpPerspective::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 32FC4)  3.602 3.280    1.10
map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC4)                    0.331 0.153    2.16
map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC4)                    0.304 0.128    2.37
map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 16UC4)                   0.329 0.156    2.11
map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC4)                   0.306 0.121    2.53
map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC4)                  2.046 0.930    2.20
map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC4)                  2.122 1.391    1.53
map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC4)                 2.035 0.954    2.13
map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 32FC4)                 2.127 1.410    1.51
map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC4)                    0.329 0.157    2.09
map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC4)                    0.306 0.124    2.47
map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 16UC4)                   0.327 0.158    2.08
map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC4)                   0.308 0.127    2.43
map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC4)                  2.039 0.948    2.15
map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC4)                  2.175 1.373    1.58
map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC4)                 2.065 0.956    2.16
map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 32FC4)                 2.158 1.372    1.57
```

Intel i7-12700K:

```
Geometric mean (ms)

                                      Name of Test                                        base  patch   patch   
                                                                                                          vs    
                                                                                                         base   
                                                                                                      (x-factor)
WarpAffine::TestWarpAffine::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC4)               0.140 0.051    2.77   
WarpAffine::TestWarpAffine::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC4)               0.140 0.054    2.57   
WarpAffine::TestWarpAffine::(640x480, INTER_LINEAR, BORDER_REPLICATE, 16UC4)              0.140 0.050    2.78   
WarpAffine::TestWarpAffine::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC4)              0.143 0.054    2.64   
WarpAffine::TestWarpAffine::(1280x720, INTER_LINEAR, BORDER_CONSTANT, 16UC4)              0.297 0.118    2.51   
WarpAffine::TestWarpAffine::(1280x720, INTER_LINEAR, BORDER_CONSTANT, 32FC4)              0.296 0.130    2.28   
WarpAffine::TestWarpAffine::(1280x720, INTER_LINEAR, BORDER_REPLICATE, 16UC4)             0.481 0.304    1.58   
WarpAffine::TestWarpAffine::(1280x720, INTER_LINEAR, BORDER_REPLICATE, 32FC4)             0.470 0.309    1.52   
WarpAffine::TestWarpAffine::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC4)             0.381 0.184    2.07   
WarpAffine::TestWarpAffine::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC4)             0.811 0.781    1.04   
WarpAffine::TestWarpAffine::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC4)            1.297 1.063    1.22   
WarpAffine::TestWarpAffine::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 32FC4)            1.275 1.171    1.09   
WarpPerspective::TestWarpPerspective::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC4)     0.135 0.057    2.36   
WarpPerspective::TestWarpPerspective::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC4)     0.134 0.062    2.16   
WarpPerspective::TestWarpPerspective::(640x480, INTER_LINEAR, BORDER_REPLICATE, 16UC4)    0.155 0.076    2.04   
WarpPerspective::TestWarpPerspective::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC4)    0.150 0.079    1.90   
WarpPerspective::TestWarpPerspective::(1280x720, INTER_LINEAR, BORDER_CONSTANT, 16UC4)    0.229 0.114    2.02   
WarpPerspective::TestWarpPerspective::(1280x720, INTER_LINEAR, BORDER_CONSTANT, 32FC4)    0.227 0.120    1.89   
WarpPerspective::TestWarpPerspective::(1280x720, INTER_LINEAR, BORDER_REPLICATE, 16UC4)   0.560 0.444    1.26   
WarpPerspective::TestWarpPerspective::(1280x720, INTER_LINEAR, BORDER_REPLICATE, 32FC4)   0.529 0.442    1.20   
WarpPerspective::TestWarpPerspective::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC4)   0.326 0.192    1.70   
WarpPerspective::TestWarpPerspective::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC4)   0.805 0.762    1.06   
WarpPerspective::TestWarpPerspective::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC4)  1.395 1.255    1.11   
WarpPerspective::TestWarpPerspective::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 32FC4)  1.381 1.306    1.06   
map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC4)                    0.138 0.049    2.81   
map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC4)                    0.134 0.053    2.53   
map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 16UC4)                   0.137 0.049    2.79   
map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC4)                   0.134 0.053    2.51   
map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC4)                  1.362 1.352    1.01   
map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC4)                  3.124 3.038    1.03   
map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC4)                 1.354 1.351    1.00   
map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 32FC4)                 3.142 3.049    1.03   
map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC4)                    0.140 0.052    2.70   
map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC4)                    0.136 0.056    2.43   
map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 16UC4)                   0.139 0.051    2.70   
map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC4)                   0.135 0.056    2.41   
map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC4)                  1.335 1.345    0.99   
map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC4)                  3.117 3.024    1.03   
map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC4)                 1.327 1.319    1.01   
map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 32FC4)                 3.126 3.026    1.03   
```

A311D:

```
Geometric mean (ms)

                                      Name of Test                                         base  patch    patch
                                                                                                            vs
                                                                                                           base
                                                                                                        (x-factor)
WarpAffine::TestWarpAffine::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC4)               1.762  1.361     1.29
WarpAffine::TestWarpAffine::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC4)               2.390  2.005     1.19
WarpAffine::TestWarpAffine::(640x480, INTER_LINEAR, BORDER_REPLICATE, 16UC4)              1.747  1.238     1.41
WarpAffine::TestWarpAffine::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC4)              2.399  2.016     1.19
WarpAffine::TestWarpAffine::(1280x720, INTER_LINEAR, BORDER_CONSTANT, 16UC4)              3.917  3.104     1.26
WarpAffine::TestWarpAffine::(1280x720, INTER_LINEAR, BORDER_CONSTANT, 32FC4)              5.995  5.172     1.16
WarpAffine::TestWarpAffine::(1280x720, INTER_LINEAR, BORDER_REPLICATE, 16UC4)             6.711  5.460     1.23
WarpAffine::TestWarpAffine::(1280x720, INTER_LINEAR, BORDER_REPLICATE, 32FC4)             8.017  6.890     1.16
WarpAffine::TestWarpAffine::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC4)             6.269  5.596     1.12
WarpAffine::TestWarpAffine::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC4)             10.301 9.507     1.08
WarpAffine::TestWarpAffine::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC4)            18.871 17.375    1.09
WarpAffine::TestWarpAffine::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 32FC4)            20.365 18.227    1.12
WarpPerspective::TestWarpPerspective::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC4)     2.083  1.514     1.38
WarpPerspective::TestWarpPerspective::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC4)     2.966  2.309     1.28
WarpPerspective::TestWarpPerspective::(640x480, INTER_LINEAR, BORDER_REPLICATE, 16UC4)    2.358  1.715     1.37
WarpPerspective::TestWarpPerspective::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC4)    3.220  2.464     1.31
WarpPerspective::TestWarpPerspective::(1280x720, INTER_LINEAR, BORDER_CONSTANT, 16UC4)    3.763  3.014     1.25
WarpPerspective::TestWarpPerspective::(1280x720, INTER_LINEAR, BORDER_CONSTANT, 32FC4)    5.777  4.940     1.17
WarpPerspective::TestWarpPerspective::(1280x720, INTER_LINEAR, BORDER_REPLICATE, 16UC4)   8.791  7.819     1.12
WarpPerspective::TestWarpPerspective::(1280x720, INTER_LINEAR, BORDER_REPLICATE, 32FC4)   10.165 8.426     1.21
WarpPerspective::TestWarpPerspective::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC4)   6.047  5.293     1.14
WarpPerspective::TestWarpPerspective::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC4)   9.851  9.023     1.09
WarpPerspective::TestWarpPerspective::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC4)  31.739 29.323    1.08
WarpPerspective::TestWarpPerspective::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 32FC4)  32.439 29.236    1.11
map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC4)                    1.759  1.441     1.22
map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC4)                    2.681  2.270     1.18
map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 16UC4)                   1.774  1.425     1.24
map1_32fc1::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC4)                   2.672  2.252     1.19
map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC4)                  14.079 9.334     1.51
map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC4)                  17.770 16.155    1.10
map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC4)                 15.872 11.192    1.42
map1_32fc1::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 32FC4)                 19.167 15.342    1.25
map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 16UC4)                    2.284  1.545     1.48
map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_CONSTANT, 32FC4)                    3.040  2.231     1.36
map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 16UC4)                   2.280  1.380     1.65
map1_32fc2::TestRemap::(640x480, INTER_LINEAR, BORDER_REPLICATE, 32FC4)                   2.882  2.185     1.32
map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 16UC4)                  15.877 11.381    1.40
map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_CONSTANT, 32FC4)                  19.521 16.106    1.21
map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 16UC4)                 15.950 11.532    1.38
map1_32fc2::TestRemap::(1920x1080, INTER_LINEAR, BORDER_REPLICATE, 32FC4)                 19.699 16.276    1.21
```

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
pull/26403/merge
Yuantao Feng 1 week ago committed by GitHub
parent 8b84fcb376
commit ea0f9336e2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 115
      modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp
  2. 176
      modules/imgproc/src/warp_common.vector.hpp
  3. 243
      modules/imgproc/src/warp_kernels.simd.hpp

@ -543,6 +543,40 @@ OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float32, vfloat32m2_t, float, VTraits<v_float
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float64, vfloat64m2_t, double, VTraits<v_float64>::vlanes() / 2, VTraits<v_float64>::vlanes(), 64, f64)
#endif
// Partial store: write only the first N lanes of `a` to `ptr`.
// RVV scalable vectors may carry more lanes than the fixed-width warp
// kernels need, so the vector is spilled to a stack buffer first and
// copied out lane by lane.
template <int N = VTraits<v_uint16>::max_nlanes>
inline void v_store(ushort* ptr, const v_uint16& a)
{
    ushort lanes[VTraits<v_uint16>::max_nlanes];
    v_store(lanes, a);  // full-width store resolves to the non-template overload
    for (int lane = 0; lane < N; ++lane)
        ptr[lane] = lanes[lane];
}
// 8-lane specialization of the partial ushort store (the original is
// manually unrolled; a fixed-trip-count loop is trivially unrollable
// by the compiler and behaves identically).
template <> inline void v_store<8>(ushort* ptr, const v_uint16& a)
{
    ushort lanes[VTraits<v_uint16>::max_nlanes];
    v_store(lanes, a);
    for (int lane = 0; lane < 8; ++lane)
        ptr[lane] = lanes[lane];
}
// Partial store: write only the first N lanes of `a` to `ptr`
// (float variant; see the ushort overload above for rationale).
template <int N = VTraits<v_float32>::max_nlanes>
inline void v_store(float* ptr, const v_float32& a)
{
    float lanes[VTraits<v_float32>::max_nlanes];
    v_store(lanes, a);  // full-width store via the non-template overload
    for (int lane = 0; lane < N; ++lane)
        ptr[lane] = lanes[lane];
}
// 4-lane specialization of the partial float store; equivalent to the
// original pairwise-unrolled copies.
template <> inline void v_store<4>(float* ptr, const v_float32& a)
{
    float lanes[VTraits<v_float32>::max_nlanes];
    v_store(lanes, a);
    for (int lane = 0; lane < 4; ++lane)
        ptr[lane] = lanes[lane];
}
////////////// Lookup table access ////////////////////
#define OPENCV_HAL_IMPL_RVV_LUT(_Tpvec, _Tp, suffix) \
inline _Tpvec v_lut(const _Tp* tab, const int* idx) \
@ -1616,6 +1650,42 @@ OPENCV_HAL_IMPL_RVV_EXPAND(short, v_int32, vint32m4_t, v_int16, 16, i32, i16, __
OPENCV_HAL_IMPL_RVV_EXPAND(uint, v_uint64, vuint64m4_t, v_uint32, 32, u64, u32, __riscv_vwcvtu_x)
OPENCV_HAL_IMPL_RVV_EXPAND(int, v_int64, vint64m4_t, v_int32, 32, i64, i32, __riscv_vwcvt_x)
// Partial load: read N floats from `ptr` into the low lanes of a
// v_float32; the remaining lanes are zero-filled (the staging buffer is
// cleared with a zero vector before the copy).
template <int N = VTraits<v_float32>::max_nlanes>
inline v_float32 v_load(const float* ptr)
{
    float staging[VTraits<v_float32>::max_nlanes];
    v_store(staging, v_setzero_f32());
    for (int lane = 0; lane < N; ++lane)
        staging[lane] = ptr[lane];
    return v_load(staging);  // full-width load via the non-template overload
}
// 4-lane specialization of the partial float load; identical semantics
// to the original element-by-element copy.
template <> inline v_float32 v_load<4>(const float* ptr)
{
    float staging[VTraits<v_float32>::max_nlanes];
    v_store(staging, v_setzero_f32());
    for (int lane = 0; lane < 4; ++lane)
        staging[lane] = ptr[lane];
    return v_load(staging);
}
// Partial widening load: read N ushorts from `ptr` and expand them to
// uint32 lanes; lanes beyond N come from the zero-initialized staging
// buffer.
template <int N = VTraits<v_uint32>::max_nlanes>
inline v_uint32 v_load_expand(const ushort* ptr)
{
    ushort staging[VTraits<v_uint16>::max_nlanes];
    v_store(staging, v_setzero_u16());
    for (int lane = 0; lane < N; ++lane)
        staging[lane] = ptr[lane];
    return v_load_expand(staging);  // full-width expand on the padded buffer
}
// 4-lane specialization of the partial widening load.
template <> inline v_uint32 v_load_expand<4>(const ushort* ptr)
{
    ushort staging[VTraits<v_uint16>::max_nlanes];
    v_store(staging, v_setzero_u16());
    for (int lane = 0; lane < 4; ++lane)
        staging[lane] = ptr[lane];
    return v_load_expand(staging);
}
inline v_uint32 v_load_expand_q(const uchar* ptr)
{
return __riscv_vwcvtu_x(__riscv_vwcvtu_x(__riscv_vle8_v_u8mf2(ptr, VTraits<v_uint32>::vlanes()), VTraits<v_uint32>::vlanes()), VTraits<v_uint32>::vlanes());
@ -1627,16 +1697,16 @@ inline v_int32 v_load_expand_q(const schar* ptr)
}
template <int N = VTraits<v_uint32>::max_nlanes>
inline v_uint32 v_load_expand_q(const uchar* ptr, int n = N)
inline v_uint32 v_load_expand_q(const uchar* ptr)
{
uchar buf[VTraits<v_uint8>::max_nlanes];
v_store(buf, v_setzero_u8());
for (int i = 0; i < n; i++) {
for (int i = 0; i < N; i++) {
buf[i] = ptr[i];
}
return v_load_expand_q(buf);
}
template <> inline v_uint32 v_load_expand_q<4>(const uchar* ptr, int n)
template <> inline v_uint32 v_load_expand_q<4>(const uchar* ptr)
{
uchar buf[VTraits<v_uint8>::max_nlanes];
v_store(buf, v_setzero_u8());
@ -1714,19 +1784,48 @@ void v_rshr_pack_u_store(_Tp* ptr, const _wTpvec& a, int n = N) \
OPENCV_HAL_IMPL_RVV_PACK_U(v_uint8, uchar, v_int16, short, 8, 16, u8, i16, __riscv_vreinterpret_v_i16m4_u16m4, VTraits<v_int16>::vlanes(), VTraits<v_uint8>::vlanes())
OPENCV_HAL_IMPL_RVV_PACK_U(v_uint16, ushort, v_int32, int, 16, 32, u16, i32, __riscv_vreinterpret_v_i32m4_u32m4, VTraits<v_int32>::vlanes(), VTraits<v_uint16>::vlanes())
// Pack two int32 vectors into one uint16 vector with unsigned
// saturation, interleaving the low N/2 lanes of each input:
// result = [ a0..a(N/2-1), b0..b(N/2-1) ].
// Matches the unrolled v_pack_u<4> specialization below (4 lanes from
// `a`, then 4 from `b`, for N == 8).
template <int N = VTraits<v_uint16>::max_nlanes>
inline v_uint16 v_pack_u(const v_int32& a, const v_int32& b)
{
    ushort bufa[N];
    ushort bufb[N];
    v_pack_u_store(bufa, a);
    v_pack_u_store(bufb, b);
    ushort buf[N];
    // Each input contributes N/2 lanes. Iterating up to N (as before)
    // wrote buf[i + N/2] past the end of `buf` for i >= N/2 and read
    // beyond the packed portion of bufa/bufb — a stack buffer overrun.
    for (int i = 0; i < N/2; i++) {
        buf[i] = bufa[i];
        buf[i+N/2] = bufb[i];
    }
    return v_load(buf);
}
// 4-lane specialization: result holds the 4 saturated lanes of `a`
// followed by the 4 saturated lanes of `b`. Equivalent to the original
// fully-unrolled assignments.
template <> inline v_uint16 v_pack_u<4>(const v_int32& a, const v_int32& b)
{
    constexpr int N = VTraits<v_uint16>::max_nlanes;
    ushort packed_a[N];
    ushort packed_b[N];
    v_pack_u_store(packed_a, a);
    v_pack_u_store(packed_b, b);
    ushort combined[N];
    for (int i = 0; i < 4; ++i) {
        combined[i]     = packed_a[i];
        combined[i + 4] = packed_b[i];
    }
    return v_load(combined);
}
template <int N = VTraits<v_int16>::max_nlanes>
inline void v_pack_u_store(uchar* ptr, const v_int16& a, int n = N)
inline void v_pack_store(uchar* ptr, const v_uint16& a)
{
uchar buf[VTraits<v_uint8>::max_nlanes];
v_pack_u_store(buf, a);
for (int i = 0; i < n; i++) {
v_pack_store(buf, a);
for (int i = 0; i < N; i++) {
ptr[i] = buf[i];
}
}
template <> inline void v_pack_u_store<8>(uchar* ptr, const v_int16& a, int n)
template <> inline void v_pack_store<8>(uchar* ptr, const v_uint16& a)
{
uchar buf[VTraits<v_uint8>::max_nlanes];
v_pack_u_store(buf, a);
v_pack_store(buf, a);
ptr[0] = buf[0]; ptr[1] = buf[1]; ptr[2] = buf[2]; ptr[3] = buf[3];
ptr[4] = buf[4]; ptr[5] = buf[5]; ptr[6] = buf[6]; ptr[7] = buf[7];
}

@ -569,16 +569,50 @@
i##ofs##_pix0 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_pix1, i##ofs##_pix0), i##ofs##_pix0); \
i##ofs##_pix2 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_pix3, i##ofs##_pix2), i##ofs##_pix2); \
i##ofs##_pix0 = v_fma(i##ofs##_beta, v_sub(i##ofs##_pix2, i##ofs##_pix0), i##ofs##_pix0);
#define CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_8UC4() \
#define CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_16UC4_I(ofs) \
const uint16_t *srcptr##ofs = src + addr[i+ofs]; \
v_float32 i##ofs##_pix0 = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(srcptr##ofs))); \
v_float32 i##ofs##_pix1 = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(srcptr##ofs+4))); \
v_float32 i##ofs##_pix2 = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(srcptr##ofs+srcstep))); \
v_float32 i##ofs##_pix3 = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(srcptr##ofs+srcstep+4))); \
v_float32 i##ofs##_alpha = vx_setall_f32(valpha[i+ofs]), \
i##ofs##_beta = vx_setall_f32(vbeta[i+ofs]); \
i##ofs##_pix0 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_pix1, i##ofs##_pix0), i##ofs##_pix0); \
i##ofs##_pix2 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_pix3, i##ofs##_pix2), i##ofs##_pix2); \
i##ofs##_pix0 = v_fma(i##ofs##_beta, v_sub(i##ofs##_pix2, i##ofs##_pix0), i##ofs##_pix0);
#define CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_32FC4_I(ofs) \
const float *srcptr##ofs = src + addr[i+ofs]; \
v_float32 i##ofs##_pix0 = vx_load(srcptr##ofs); \
v_float32 i##ofs##_pix1 = vx_load(srcptr##ofs+4); \
v_float32 i##ofs##_pix2 = vx_load(srcptr##ofs+srcstep); \
v_float32 i##ofs##_pix3 = vx_load(srcptr##ofs+srcstep+4); \
v_float32 i##ofs##_alpha = vx_setall_f32(valpha[i+ofs]), \
i##ofs##_beta = vx_setall_f32(vbeta[i+ofs]); \
i##ofs##_pix0 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_pix1, i##ofs##_pix0), i##ofs##_pix0); \
i##ofs##_pix2 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_pix3, i##ofs##_pix2), i##ofs##_pix2); \
i##ofs##_pix0 = v_fma(i##ofs##_beta, v_sub(i##ofs##_pix2, i##ofs##_pix0), i##ofs##_pix0);
#define CV_WARP_SIMD128_STORE_8UC4_I() \
v_uint16 i01_pix = v_pack_u(v_round(i0_pix0), v_round(i1_pix0)); \
v_uint16 i23_pix = v_pack_u(v_round(i2_pix0), v_round(i3_pix0)); \
v_pack_store(dstptr + 4*(x+i), i01_pix); \
v_pack_store(dstptr + 4*(x+i+2), i23_pix);
#define CV_WARP_SIMD128_STORE_16UC4_I() \
v_uint16 i01_pix = v_pack_u(v_round(i0_pix0), v_round(i1_pix0)); \
v_uint16 i23_pix = v_pack_u(v_round(i2_pix0), v_round(i3_pix0)); \
vx_store(dstptr + 4*(x+i), i01_pix); \
vx_store(dstptr + 4*(x+i+2), i23_pix);
#define CV_WARP_SIMD128_STORE_32FC4_I() \
vx_store(dstptr + 4*(x+i), i0_pix0); \
vx_store(dstptr + 4*(x+i)+4, i1_pix0); \
vx_store(dstptr + 4*(x+i)+8, i2_pix0); \
vx_store(dstptr + 4*(x+i)+12, i3_pix0);
#define CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(DEPTH) \
for (int i = 0; i < uf; i+=vlanes_32) { \
CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_8UC4_I(0); \
CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_8UC4_I(1); \
CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_8UC4_I(2); \
CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_8UC4_I(3); \
auto i01_pix = v_pack_u(v_round(i0_pix0), v_round(i1_pix0)), \
i23_pix = v_pack_u(v_round(i2_pix0), v_round(i3_pix0)); \
v_pack_store(dstptr + 4*(x+i), i01_pix); \
v_pack_store(dstptr + 4*(x+i+2), i23_pix); \
CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(0); \
CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(1); \
CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(2); \
CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(3); \
CV_WARP_SIMD128_STORE_##DEPTH##C4_I(); \
}
#define CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_8UC4_I(ofs0, ofs1) \
const uint8_t *srcptr##ofs0 = src + addr[i+ofs0]; \
@ -602,16 +636,70 @@
i##ofs0##ofs1##_fpix00 = v_fma(i##ofs0##ofs1##_alpha, v_sub(i##ofs0##ofs1##_fpix11, i##ofs0##ofs1##_fpix00), i##ofs0##ofs1##_fpix00); \
i##ofs0##ofs1##_fpix22 = v_fma(i##ofs0##ofs1##_alpha, v_sub(i##ofs0##ofs1##_fpix33, i##ofs0##ofs1##_fpix22), i##ofs0##ofs1##_fpix22); \
i##ofs0##ofs1##_fpix00 = v_fma(i##ofs0##ofs1##_beta, v_sub(i##ofs0##ofs1##_fpix22, i##ofs0##ofs1##_fpix00), i##ofs0##ofs1##_fpix00);
#define CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_8UC4() \
#define CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_16UC4_I(ofs0, ofs1) \
const uint16_t *srcptr##ofs0 = src + addr[i+ofs0]; \
const uint16_t *srcptr##ofs1 = src + addr[i+ofs1]; \
v_int32 i##ofs0##_pix01 = v_reinterpret_as_s32(v256_load_expand(srcptr##ofs0)), \
i##ofs0##_pix23 = v_reinterpret_as_s32(v256_load_expand(srcptr##ofs0+srcstep)); \
v_int32 i##ofs1##_pix01 = v_reinterpret_as_s32(v256_load_expand(srcptr##ofs1)), \
i##ofs1##_pix23 = v_reinterpret_as_s32(v256_load_expand(srcptr##ofs1+srcstep)); \
v_float32 i##ofs0##_fpix01 = v_cvt_f32(i##ofs0##_pix01), i##ofs0##_fpix23 = v_cvt_f32(i##ofs0##_pix23); \
v_float32 i##ofs1##_fpix01 = v_cvt_f32(i##ofs1##_pix01), i##ofs1##_fpix23 = v_cvt_f32(i##ofs1##_pix23); \
v_float32 i##ofs0##ofs1##_fpix00, i##ofs0##ofs1##_fpix11, \
i##ofs0##ofs1##_fpix22, i##ofs0##ofs1##_fpix33; \
v_recombine(i##ofs0##_fpix01, i##ofs1##_fpix01, i##ofs0##ofs1##_fpix00, i##ofs0##ofs1##_fpix11); \
v_recombine(i##ofs0##_fpix23, i##ofs1##_fpix23, i##ofs0##ofs1##_fpix22, i##ofs0##ofs1##_fpix33); \
v_float32 i##ofs0##_alpha = vx_setall_f32(valpha[i+ofs0]), \
i##ofs1##_alpha = vx_setall_f32(valpha[i+ofs1]), \
i##ofs0##_beta = vx_setall_f32(vbeta[i+ofs0]), \
i##ofs1##_beta = vx_setall_f32(vbeta[i+ofs1]); \
v_float32 i##ofs0##ofs1##_alpha = v_combine_low(i##ofs0##_alpha, i##ofs1##_alpha), \
i##ofs0##ofs1##_beta = v_combine_low(i##ofs0##_beta, i##ofs1##_beta); \
i##ofs0##ofs1##_fpix00 = v_fma(i##ofs0##ofs1##_alpha, v_sub(i##ofs0##ofs1##_fpix11, i##ofs0##ofs1##_fpix00), i##ofs0##ofs1##_fpix00); \
i##ofs0##ofs1##_fpix22 = v_fma(i##ofs0##ofs1##_alpha, v_sub(i##ofs0##ofs1##_fpix33, i##ofs0##ofs1##_fpix22), i##ofs0##ofs1##_fpix22); \
i##ofs0##ofs1##_fpix00 = v_fma(i##ofs0##ofs1##_beta, v_sub(i##ofs0##ofs1##_fpix22, i##ofs0##ofs1##_fpix00), i##ofs0##ofs1##_fpix00);
#define CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_32FC4_I(ofs0, ofs1) \
const float *srcptr##ofs0 = src + addr[i+ofs0]; \
const float *srcptr##ofs1 = src + addr[i+ofs1]; \
v_float32 i##ofs0##_fpix01 = v256_load(srcptr##ofs0), \
i##ofs0##_fpix23 = v256_load(srcptr##ofs0+srcstep); \
v_float32 i##ofs1##_fpix01 = v256_load(srcptr##ofs1), \
i##ofs1##_fpix23 = v256_load(srcptr##ofs1+srcstep); \
v_float32 i##ofs0##ofs1##_fpix00, i##ofs0##ofs1##_fpix11, \
i##ofs0##ofs1##_fpix22, i##ofs0##ofs1##_fpix33; \
v_recombine(i##ofs0##_fpix01, i##ofs1##_fpix01, i##ofs0##ofs1##_fpix00, i##ofs0##ofs1##_fpix11); \
v_recombine(i##ofs0##_fpix23, i##ofs1##_fpix23, i##ofs0##ofs1##_fpix22, i##ofs0##ofs1##_fpix33); \
v_float32 i##ofs0##_alpha = vx_setall_f32(valpha[i+ofs0]), \
i##ofs1##_alpha = vx_setall_f32(valpha[i+ofs1]), \
i##ofs0##_beta = vx_setall_f32(vbeta[i+ofs0]), \
i##ofs1##_beta = vx_setall_f32(vbeta[i+ofs1]); \
v_float32 i##ofs0##ofs1##_alpha = v_combine_low(i##ofs0##_alpha, i##ofs1##_alpha), \
i##ofs0##ofs1##_beta = v_combine_low(i##ofs0##_beta, i##ofs1##_beta); \
i##ofs0##ofs1##_fpix00 = v_fma(i##ofs0##ofs1##_alpha, v_sub(i##ofs0##ofs1##_fpix11, i##ofs0##ofs1##_fpix00), i##ofs0##ofs1##_fpix00); \
i##ofs0##ofs1##_fpix22 = v_fma(i##ofs0##ofs1##_alpha, v_sub(i##ofs0##ofs1##_fpix33, i##ofs0##ofs1##_fpix22), i##ofs0##ofs1##_fpix22); \
i##ofs0##ofs1##_fpix00 = v_fma(i##ofs0##ofs1##_beta, v_sub(i##ofs0##ofs1##_fpix22, i##ofs0##ofs1##_fpix00), i##ofs0##ofs1##_fpix00);
#define CV_WARP_SIMD256_STORE_8UC4_I() \
auto i01_pix = v_round(i01_fpix00), i23_pix = v_round(i23_fpix00); \
v_pack_store(dstptr + 4*(x+i), v_pack_u(i01_pix, i23_pix)); \
auto i45_pix = v_round(i45_fpix00), i67_pix = v_round(i67_fpix00); \
v_pack_store(dstptr + 4*(x+i+4), v_pack_u(i45_pix, i67_pix));
#define CV_WARP_SIMD256_STORE_16UC4_I() \
auto i01_pix = v_round(i01_fpix00), i23_pix = v_round(i23_fpix00); \
vx_store(dstptr + 4*(x+i), v_pack_u(i01_pix, i23_pix)); \
auto i45_pix = v_round(i45_fpix00), i67_pix = v_round(i67_fpix00); \
vx_store(dstptr + 4*(x+i+4), v_pack_u(i45_pix, i67_pix));
#define CV_WARP_SIMD256_STORE_32FC4_I() \
vx_store(dstptr + 4*(x+i), i01_fpix00); \
vx_store(dstptr + 4*(x+i)+8, i23_fpix00); \
vx_store(dstptr + 4*(x+i)+16, i45_fpix00); \
vx_store(dstptr + 4*(x+i)+24, i67_fpix00);
#define CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_C4(DEPTH) \
for (int i = 0; i < uf; i+=vlanes_32) { \
CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_8UC4_I(0, 1); \
CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_8UC4_I(2, 3); \
auto i01_pix = v_round(i01_fpix00), i23_pix = v_round(i23_fpix00); \
v_pack_store(dstptr + 4*(x+i), v_pack_u(i01_pix, i23_pix)); \
CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_8UC4_I(4, 5); \
CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_8UC4_I(6, 7); \
auto i45_pix = v_round(i45_fpix00), i67_pix = v_round(i67_fpix00); \
v_pack_store(dstptr + 4*(x+i+4), v_pack_u(i45_pix, i67_pix)); \
CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(0, 1); \
CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(2, 3); \
CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(4, 5); \
CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(6, 7); \
CV_WARP_SIMD256_STORE_##DEPTH##C4_I(); \
}
#define CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_8UC4_I(ofs) \
const uint8_t *srcptr##ofs = src + addr[i+ofs]; \
@ -624,14 +712,48 @@
i##ofs##_fpix0 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_fpix1, i##ofs##_fpix0), i##ofs##_fpix0); \
i##ofs##_fpix2 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_fpix3, i##ofs##_fpix2), i##ofs##_fpix2); \
i##ofs##_fpix0 = v_fma(i##ofs##_beta, v_sub(i##ofs##_fpix2, i##ofs##_fpix0), i##ofs##_fpix0);
#define CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_8UC4() \
#define CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_16UC4_I(ofs) \
const uint16_t *srcptr##ofs = src + addr[i+ofs]; \
v_float32 i##ofs##_fpix0 = v_cvt_f32(v_reinterpret_as_s32(v_load_expand<4>(srcptr##ofs))), \
i##ofs##_fpix1 = v_cvt_f32(v_reinterpret_as_s32(v_load_expand<4>(srcptr##ofs+4))), \
i##ofs##_fpix2 = v_cvt_f32(v_reinterpret_as_s32(v_load_expand<4>(srcptr##ofs+srcstep))), \
i##ofs##_fpix3 = v_cvt_f32(v_reinterpret_as_s32(v_load_expand<4>(srcptr##ofs+srcstep+4))); \
v_float32 i##ofs##_alpha = vx_setall_f32(valpha[i+ofs]), \
i##ofs##_beta = vx_setall_f32(vbeta[i+ofs]); \
i##ofs##_fpix0 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_fpix1, i##ofs##_fpix0), i##ofs##_fpix0); \
i##ofs##_fpix2 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_fpix3, i##ofs##_fpix2), i##ofs##_fpix2); \
i##ofs##_fpix0 = v_fma(i##ofs##_beta, v_sub(i##ofs##_fpix2, i##ofs##_fpix0), i##ofs##_fpix0);
#define CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_32FC4_I(ofs) \
const float *srcptr##ofs = src + addr[i+ofs]; \
v_float32 i##ofs##_fpix0 = v_load<4>(srcptr##ofs), \
i##ofs##_fpix1 = v_load<4>(srcptr##ofs+4), \
i##ofs##_fpix2 = v_load<4>(srcptr##ofs+srcstep), \
i##ofs##_fpix3 = v_load<4>(srcptr##ofs+srcstep+4); \
v_float32 i##ofs##_alpha = vx_setall_f32(valpha[i+ofs]), \
i##ofs##_beta = vx_setall_f32(vbeta[i+ofs]); \
i##ofs##_fpix0 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_fpix1, i##ofs##_fpix0), i##ofs##_fpix0); \
i##ofs##_fpix2 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_fpix3, i##ofs##_fpix2), i##ofs##_fpix2); \
i##ofs##_fpix0 = v_fma(i##ofs##_beta, v_sub(i##ofs##_fpix2, i##ofs##_fpix0), i##ofs##_fpix0);
/* Store four interpolated 4-channel pixels (i0..i3, produced by the _I load
 * macros above as i<k>_fpix0) to a uint8 destination: round f32 to s32,
 * saturating-pack pairs of pixels to u16 (v_pack_u), then narrow-and-store
 * as u8 via v_pack_store, two pixels (8 bytes) per store. */
#define CV_WARP_SIMDX_STORE_8UC4_I() \
auto i01_pix = v_pack_u<4>(v_round(i0_fpix0), v_round(i1_fpix0)), \
i23_pix = v_pack_u<4>(v_round(i2_fpix0), v_round(i3_fpix0)); \
v_pack_store<8>(dstptr + 4*(x+i), i01_pix); \
v_pack_store<8>(dstptr + 4*(x+i+2), i23_pix);
/* Store four interpolated 4-channel pixels (i0..i3, in i<k>_fpix0) to a
 * uint16 destination: round f32 to s32, saturating-pack pairs of pixels to
 * u16 (v_pack_u), then store two pixels (8 u16 elements) per v_store. */
#define CV_WARP_SIMDX_STORE_16UC4_I() \
auto i01_pix = v_pack_u<4>(v_round(i0_fpix0), v_round(i1_fpix0)), \
i23_pix = v_pack_u<4>(v_round(i2_fpix0), v_round(i3_fpix0)); \
v_store<8>(dstptr + 4*(x+i), i01_pix); \
v_store<8>(dstptr + 4*(x+i+2), i23_pix);
/* Store four interpolated 4-channel pixels (i0..i3, in i<k>_fpix0) to a
 * float32 destination: no rounding/packing needed, one 4-element store per
 * pixel at consecutive 4-float offsets. */
#define CV_WARP_SIMDX_STORE_32FC4_I() \
v_store<4>(dstptr + 4*(x+i), i0_fpix0); \
v_store<4>(dstptr + 4*(x+i)+4, i1_fpix0); \
v_store<4>(dstptr + 4*(x+i)+8, i2_fpix0); \
v_store<4>(dstptr + 4*(x+i)+12, i3_fpix0);
/* Depth-generic C4 interpolation driver for the scalable-SIMD (SIMDX) path.
 * Processes uf lanes four pixels at a time: for each group, expand to the
 * four per-pixel LOAD_SHUFFLE_INTER_<DEPTH>C4_I(0..3) interpolators (which
 * leave results in i0..i3_fpix0), then emit the matching depth-specific
 * STORE_<DEPTH>C4_I.  DEPTH is the token 8U, 16U or 32F.
 *
 * Defect fixed: the span as captured interleaved the OLD hard-coded 8UC4
 * body (four _8UC4_I calls plus an inline v_pack/v_pack_u_store epilogue)
 * with the NEW depth-generic body — a diff-merge artifact that would have
 * loaded/stored twice and stored before the generic loads.  Only the
 * depth-generic body is kept. */
#define CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_C4(DEPTH) \
for (int i = 0; i < uf; i+=4) { \
    CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(0); \
    CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(1); \
    CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(2); \
    CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(3); \
    CV_WARP_SIMDX_STORE_##DEPTH##C4_I(); \
}

@ -298,7 +298,7 @@ void warpAffineLinearInvoker_8UC1(const uint8_t *src_data, size_t src_step, int
std::array<float, max_vlanes_32> start_indices;
std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
inner_scols = vx_setall_u32((unsigned)srccols - 1),
outer_srows = vx_setall_u32((unsigned)srcrows + 1),
outer_scols = vx_setall_u32((unsigned)srccols + 1);
@ -418,7 +418,7 @@ void warpAffineLinearInvoker_8UC3(const uint8_t *src_data, size_t src_step, int
std::array<float, max_vlanes_32> start_indices;
std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
inner_scols = vx_setall_u32((unsigned)srccols - 1),
outer_srows = vx_setall_u32((unsigned)srcrows + 1),
outer_scols = vx_setall_u32((unsigned)srccols + 1);
@ -547,7 +547,7 @@ void warpAffineLinearInvoker_8UC4(const uint8_t *src_data, size_t src_step, int
std::array<float, max_vlanes_32> start_indices;
std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
inner_scols = vx_setall_u32((unsigned)srccols - 1),
outer_srows = vx_setall_u32((unsigned)srcrows + 1),
outer_scols = vx_setall_u32((unsigned)srccols + 1);
@ -590,11 +590,11 @@ void warpAffineLinearInvoker_8UC4(const uint8_t *src_data, size_t src_step, int
vx_store(vbeta, src_y0);
vx_store(vbeta+vlanes_32, src_y1);
#if CV_SIMD256
CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_8UC4();
CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_C4(8U);
#elif CV_SIMD128
CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_8UC4();
CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(8U);
#elif CV_SIMD_SCALABLE
CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_8UC4();
CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_C4(8U);
#endif
} else {
uint8_t pixbuf[max_uf*4*4];
@ -660,7 +660,7 @@ void warpAffineLinearInvoker_16UC1(const uint16_t *src_data, size_t src_step, in
std::array<float, max_vlanes_32> start_indices;
std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
inner_scols = vx_setall_u32((unsigned)srccols - 1),
outer_srows = vx_setall_u32((unsigned)srcrows + 1),
outer_scols = vx_setall_u32((unsigned)srccols + 1);
@ -761,7 +761,7 @@ void warpAffineLinearInvoker_16UC3(const uint16_t *src_data, size_t src_step, in
std::array<float, max_vlanes_32> start_indices;
std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
inner_scols = vx_setall_u32((unsigned)srccols - 1),
outer_srows = vx_setall_u32((unsigned)srcrows + 1),
outer_scols = vx_setall_u32((unsigned)srccols + 1);
@ -866,7 +866,7 @@ void warpAffineLinearInvoker_16UC4(const uint16_t *src_data, size_t src_step, in
std::array<float, max_vlanes_32> start_indices;
std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
inner_scols = vx_setall_u32((unsigned)srccols - 1),
outer_srows = vx_setall_u32((unsigned)srcrows + 1),
outer_scols = vx_setall_u32((unsigned)srccols + 1);
@ -876,7 +876,6 @@ void warpAffineLinearInvoker_16UC4(const uint16_t *src_data, size_t src_step, in
int32_t addr[max_uf],
src_ix[max_uf],
src_iy[max_uf];
uint16_t pixbuf[max_uf*4*4];
uint16_t bvalbuf[max_uf*4];
for (int i = 0; i < uf; i++) {
@ -904,18 +903,26 @@ void warpAffineLinearInvoker_16UC4(const uint16_t *src_data, size_t src_step, in
CV_WARPAFFINE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C4);
if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image
CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN(C4, 16U);
float valpha[max_uf], vbeta[max_uf];
vx_store(valpha, src_x0);
vx_store(valpha+vlanes_32, src_x1);
vx_store(vbeta, src_y0);
vx_store(vbeta+vlanes_32, src_y1);
#if CV_SIMD256
CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_C4(16U);
#elif CV_SIMD128
CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(16U);
#elif CV_SIMD_SCALABLE
CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_C4(16U);
#endif
} else {
uint16_t pixbuf[max_uf*4*4];
CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C4, 16U);
CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16(C4);
CV_WARP_LINEAR_VECTOR_INTER_CONVERT_U16F32(C4);
CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4);
CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U16(C4);
}
CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16(C4);
CV_WARP_LINEAR_VECTOR_INTER_CONVERT_U16F32(C4);
CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4);
CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U16(C4);
}
#endif // (CV_SIMD || CV_SIMD_SCALABLE)
@ -972,7 +979,7 @@ void warpAffineLinearInvoker_32FC1(const float *src_data, size_t src_step, int s
std::array<float, max_vlanes_32> start_indices;
std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
inner_scols = vx_setall_u32((unsigned)srccols - 1),
outer_srows = vx_setall_u32((unsigned)srcrows + 1),
outer_scols = vx_setall_u32((unsigned)srccols + 1);
@ -1071,7 +1078,7 @@ void warpAffineLinearInvoker_32FC3(const float *src_data, size_t src_step, int s
std::array<float, max_vlanes_32> start_indices;
std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
inner_scols = vx_setall_u32((unsigned)srccols - 1),
outer_srows = vx_setall_u32((unsigned)srcrows + 1),
outer_scols = vx_setall_u32((unsigned)srccols + 1);
@ -1176,7 +1183,7 @@ void warpAffineLinearInvoker_32FC4(const float *src_data, size_t src_step, int s
std::array<float, max_vlanes_32> start_indices;
std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
inner_scols = vx_setall_u32((unsigned)srccols - 1),
outer_srows = vx_setall_u32((unsigned)srcrows + 1),
outer_scols = vx_setall_u32((unsigned)srccols + 1);
@ -1186,7 +1193,6 @@ void warpAffineLinearInvoker_32FC4(const float *src_data, size_t src_step, int s
int32_t addr[max_uf],
src_ix[max_uf],
src_iy[max_uf];
float pixbuf[max_uf*4*4];
float bvalbuf[max_uf*4];
for (int i = 0; i < uf; i++) {
@ -1218,16 +1224,25 @@ void warpAffineLinearInvoker_32FC4(const float *src_data, size_t src_step, int s
CV_WARPAFFINE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C4);
if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image
CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN(C4, 32F);
float valpha[max_uf], vbeta[max_uf];
vx_store(valpha, src_x0);
vx_store(valpha+vlanes_32, src_x1);
vx_store(vbeta, src_y0);
vx_store(vbeta+vlanes_32, src_y1);
#if CV_SIMD256
CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_C4(32F);
#elif CV_SIMD128
CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(32F);
#elif CV_SIMD_SCALABLE
CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_C4(32F);
#endif
} else {
float pixbuf[max_uf*4*4];
CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C4, 32F);
CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32(C4);
CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4);
CV_WARP_LINEAR_VECTOR_INTER_STORE_F32F32(C4);
}
CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32(C4);
CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4);
CV_WARP_LINEAR_VECTOR_INTER_STORE_F32F32(C4);
}
#endif // (CV_SIMD || CV_SIMD_SCALABLE)
@ -1284,7 +1299,7 @@ void warpAffineLinearApproxInvoker_8UC1(const uint8_t *src_data, size_t src_step
std::array<float, max_vlanes_32> start_indices;
std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
inner_scols = vx_setall_u32((unsigned)srccols - 1),
outer_srows = vx_setall_u32((unsigned)srcrows + 1),
outer_scols = vx_setall_u32((unsigned)srccols + 1);
@ -1391,7 +1406,7 @@ void warpAffineLinearApproxInvoker_8UC3(const uint8_t *src_data, size_t src_step
std::array<float, max_vlanes_32> start_indices;
std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
inner_scols = vx_setall_u32((unsigned)srccols - 1),
outer_srows = vx_setall_u32((unsigned)srcrows + 1),
outer_scols = vx_setall_u32((unsigned)srccols + 1);
@ -1505,7 +1520,7 @@ void warpAffineLinearApproxInvoker_8UC4(const uint8_t *src_data, size_t src_step
std::array<float, max_vlanes_32> start_indices;
std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
inner_scols = vx_setall_u32((unsigned)srccols - 1),
outer_srows = vx_setall_u32((unsigned)srcrows + 1),
outer_scols = vx_setall_u32((unsigned)srccols + 1);
@ -1622,7 +1637,7 @@ void warpPerspectiveLinearInvoker_8UC1(const uint8_t *src_data, size_t src_step,
std::array<float, max_vlanes_32> start_indices;
std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
inner_scols = vx_setall_u32((unsigned)srccols - 1),
outer_srows = vx_setall_u32((unsigned)srcrows + 1),
outer_scols = vx_setall_u32((unsigned)srccols + 1);
@ -1744,7 +1759,7 @@ void warpPerspectiveLinearInvoker_8UC3(const uint8_t *src_data, size_t src_step,
std::array<float, max_vlanes_32> start_indices;
std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
inner_scols = vx_setall_u32((unsigned)srccols - 1),
outer_srows = vx_setall_u32((unsigned)srcrows + 1),
outer_scols = vx_setall_u32((unsigned)srccols + 1);
@ -1874,7 +1889,7 @@ void warpPerspectiveLinearInvoker_8UC4(const uint8_t *src_data, size_t src_step,
std::array<float, max_vlanes_32> start_indices;
std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
inner_scols = vx_setall_u32((unsigned)srccols - 1),
outer_srows = vx_setall_u32((unsigned)srcrows + 1),
outer_scols = vx_setall_u32((unsigned)srccols + 1);
@ -1917,11 +1932,11 @@ void warpPerspectiveLinearInvoker_8UC4(const uint8_t *src_data, size_t src_step,
vx_store(vbeta, src_y0);
vx_store(vbeta+vlanes_32, src_y1);
#if CV_SIMD256
CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_8UC4();
CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_C4(8U);
#elif CV_SIMD128
CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_8UC4();
CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(8U);
#elif CV_SIMD_SCALABLE
CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_8UC4();
CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_C4(8U);
#endif
} else {
uint8_t pixbuf[max_uf*4*4];
@ -1988,7 +2003,7 @@ void warpPerspectiveLinearInvoker_16UC1(const uint16_t *src_data, size_t src_ste
std::array<float, max_vlanes_32> start_indices;
std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
inner_scols = vx_setall_u32((unsigned)srccols - 1),
outer_srows = vx_setall_u32((unsigned)srcrows + 1),
outer_scols = vx_setall_u32((unsigned)srccols + 1);
@ -2089,7 +2104,7 @@ void warpPerspectiveLinearInvoker_16UC3(const uint16_t *src_data, size_t src_ste
std::array<float, max_vlanes_32> start_indices;
std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
inner_scols = vx_setall_u32((unsigned)srccols - 1),
outer_srows = vx_setall_u32((unsigned)srcrows + 1),
outer_scols = vx_setall_u32((unsigned)srccols + 1);
@ -2194,7 +2209,7 @@ void warpPerspectiveLinearInvoker_16UC4(const uint16_t *src_data, size_t src_ste
std::array<float, max_vlanes_32> start_indices;
std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
inner_scols = vx_setall_u32((unsigned)srccols - 1),
outer_srows = vx_setall_u32((unsigned)srcrows + 1),
outer_scols = vx_setall_u32((unsigned)srccols + 1);
@ -2204,7 +2219,6 @@ void warpPerspectiveLinearInvoker_16UC4(const uint16_t *src_data, size_t src_ste
int32_t addr[max_uf],
src_ix[max_uf],
src_iy[max_uf];
uint16_t pixbuf[max_uf*4*4];
uint16_t bvalbuf[max_uf*4];
for (int i = 0; i < uf; i++) {
@ -2232,18 +2246,26 @@ void warpPerspectiveLinearInvoker_16UC4(const uint16_t *src_data, size_t src_ste
CV_WARPPERSPECTIVE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C4);
if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image
CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN(C4, 16U);
float valpha[max_uf], vbeta[max_uf];
vx_store(valpha, src_x0);
vx_store(valpha+vlanes_32, src_x1);
vx_store(vbeta, src_y0);
vx_store(vbeta+vlanes_32, src_y1);
#if CV_SIMD256
CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_C4(16U);
#elif CV_SIMD128
CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(16U);
#elif CV_SIMD_SCALABLE
CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_C4(16U);
#endif
} else {
uint16_t pixbuf[max_uf*4*4];
CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C4, 16U);
CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16(C4);
CV_WARP_LINEAR_VECTOR_INTER_CONVERT_U16F32(C4);
CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4);
CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U16(C4);
}
CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16(C4);
CV_WARP_LINEAR_VECTOR_INTER_CONVERT_U16F32(C4);
CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4);
CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U16(C4);
}
#endif // (CV_SIMD || CV_SIMD_SCALABLE)
@ -2301,7 +2323,7 @@ void warpPerspectiveLinearInvoker_32FC1(const float *src_data, size_t src_step,
std::array<float, max_vlanes_32> start_indices;
std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
inner_scols = vx_setall_u32((unsigned)srccols - 1),
outer_srows = vx_setall_u32((unsigned)srcrows + 1),
outer_scols = vx_setall_u32((unsigned)srccols + 1);
@ -2401,7 +2423,7 @@ void warpPerspectiveLinearInvoker_32FC3(const float *src_data, size_t src_step,
std::array<float, max_vlanes_32> start_indices;
std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
inner_scols = vx_setall_u32((unsigned)srccols - 1),
outer_srows = vx_setall_u32((unsigned)srcrows + 1),
outer_scols = vx_setall_u32((unsigned)srccols + 1);
@ -2507,7 +2529,7 @@ void warpPerspectiveLinearInvoker_32FC4(const float *src_data, size_t src_step,
std::array<float, max_vlanes_32> start_indices;
std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
inner_scols = vx_setall_u32((unsigned)srccols - 1),
outer_srows = vx_setall_u32((unsigned)srcrows + 1),
outer_scols = vx_setall_u32((unsigned)srccols + 1);
@ -2517,7 +2539,6 @@ void warpPerspectiveLinearInvoker_32FC4(const float *src_data, size_t src_step,
int32_t addr[max_uf],
src_ix[max_uf],
src_iy[max_uf];
float pixbuf[max_uf*4*4];
float bvalbuf[max_uf*4];
for (int i = 0; i < uf; i++) {
@ -2549,16 +2570,25 @@ void warpPerspectiveLinearInvoker_32FC4(const float *src_data, size_t src_step,
CV_WARPPERSPECTIVE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C4);
if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image
CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN(C4, 32F);
float valpha[max_uf], vbeta[max_uf];
vx_store(valpha, src_x0);
vx_store(valpha+vlanes_32, src_x1);
vx_store(vbeta, src_y0);
vx_store(vbeta+vlanes_32, src_y1);
#if CV_SIMD256
CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_C4(32F);
#elif CV_SIMD128
CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(32F);
#elif CV_SIMD_SCALABLE
CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_C4(32F);
#endif
} else {
float pixbuf[max_uf*4*4];
CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C4, 32F);
CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32(C4);
CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4);
CV_WARP_LINEAR_VECTOR_INTER_STORE_F32F32(C4);
}
CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32(C4);
CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4);
CV_WARP_LINEAR_VECTOR_INTER_STORE_F32F32(C4);
}
#endif // (CV_SIMD || CV_SIMD_SCALABLE)
@ -2616,7 +2646,7 @@ void warpPerspectiveLinearApproxInvoker_8UC1(const uint8_t *src_data, size_t src
std::array<float, max_vlanes_32> start_indices;
std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
inner_scols = vx_setall_u32((unsigned)srccols - 1),
outer_srows = vx_setall_u32((unsigned)srcrows + 1),
outer_scols = vx_setall_u32((unsigned)srccols + 1);
@ -2724,7 +2754,7 @@ void warpPerspectiveLinearApproxInvoker_8UC3(const uint8_t *src_data, size_t src
std::array<float, max_vlanes_32> start_indices;
std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
inner_scols = vx_setall_u32((unsigned)srccols - 1),
outer_srows = vx_setall_u32((unsigned)srcrows + 1),
outer_scols = vx_setall_u32((unsigned)srccols + 1);
@ -2838,7 +2868,7 @@ void warpPerspectiveLinearApproxInvoker_8UC4(const uint8_t *src_data, size_t src
std::array<float, max_vlanes_32> start_indices;
std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
inner_scols = vx_setall_u32((unsigned)srccols - 1),
outer_srows = vx_setall_u32((unsigned)srcrows + 1),
outer_scols = vx_setall_u32((unsigned)srccols + 1);
@ -2961,7 +2991,7 @@ void remapLinearInvoker_8UC1(const uint8_t *src_data, size_t src_step, int src_r
std::array<float, max_vlanes_32> start_indices;
std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
inner_scols = vx_setall_u32((unsigned)srccols - 1),
outer_srows = vx_setall_u32((unsigned)srcrows + 1),
outer_scols = vx_setall_u32((unsigned)srccols + 1);
@ -3100,7 +3130,7 @@ void remapLinearInvoker_8UC3(const uint8_t *src_data, size_t src_step, int src_r
std::array<float, max_vlanes_32> start_indices;
std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
inner_scols = vx_setall_u32((unsigned)srccols - 1),
outer_srows = vx_setall_u32((unsigned)srcrows + 1),
outer_scols = vx_setall_u32((unsigned)srccols + 1);
@ -3247,7 +3277,7 @@ void remapLinearInvoker_8UC4(const uint8_t *src_data, size_t src_step, int src_r
std::array<float, max_vlanes_32> start_indices;
std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
inner_scols = vx_setall_u32((unsigned)srccols - 1),
outer_srows = vx_setall_u32((unsigned)srcrows + 1),
outer_scols = vx_setall_u32((unsigned)srccols + 1);
@ -3292,11 +3322,11 @@ void remapLinearInvoker_8UC4(const uint8_t *src_data, size_t src_step, int src_r
vx_store(vbeta, src_y0);
vx_store(vbeta+vlanes_32, src_y1);
#if CV_SIMD256
CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_8UC4();
CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_C4(8U);
#elif CV_SIMD128
CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_8UC4();
CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(8U);
#elif CV_SIMD_SCALABLE
CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_8UC4();
CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_C4(8U);
#endif
} else {
uint8_t pixbuf[max_uf*4*4];
@ -3378,7 +3408,7 @@ void remapLinearInvoker_16UC1(const uint16_t *src_data, size_t src_step, int src
std::array<float, max_vlanes_32> start_indices;
std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
inner_scols = vx_setall_u32((unsigned)srccols - 1),
outer_srows = vx_setall_u32((unsigned)srcrows + 1),
outer_scols = vx_setall_u32((unsigned)srccols + 1);
@ -3496,7 +3526,7 @@ void remapLinearInvoker_16UC3(const uint16_t *src_data, size_t src_step, int src
std::array<float, max_vlanes_32> start_indices;
std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
inner_scols = vx_setall_u32((unsigned)srccols - 1),
outer_srows = vx_setall_u32((unsigned)srcrows + 1),
outer_scols = vx_setall_u32((unsigned)srccols + 1);
@ -3618,7 +3648,7 @@ void remapLinearInvoker_16UC4(const uint16_t *src_data, size_t src_step, int src
std::array<float, max_vlanes_32> start_indices;
std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
inner_scols = vx_setall_u32((unsigned)srccols - 1),
outer_srows = vx_setall_u32((unsigned)srcrows + 1),
outer_scols = vx_setall_u32((unsigned)srccols + 1);
@ -3628,7 +3658,6 @@ void remapLinearInvoker_16UC4(const uint16_t *src_data, size_t src_step, int src
int32_t addr[max_uf],
src_ix[max_uf],
src_iy[max_uf];
uint16_t pixbuf[max_uf*4*4];
uint16_t bvalbuf[max_uf*4];
for (int i = 0; i < uf; i++) {
@ -3658,18 +3687,26 @@ void remapLinearInvoker_16UC4(const uint16_t *src_data, size_t src_step, int src
CV_REMAP_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C4);
if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image
CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN(C4, 16U);
float valpha[max_uf], vbeta[max_uf];
vx_store(valpha, src_x0);
vx_store(valpha+vlanes_32, src_x1);
vx_store(vbeta, src_y0);
vx_store(vbeta+vlanes_32, src_y1);
#if CV_SIMD256
CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_C4(16U);
#elif CV_SIMD128
CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(16U);
#elif CV_SIMD_SCALABLE
CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_C4(16U);
#endif
} else {
uint16_t pixbuf[max_uf*4*4];
CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C4, 16U);
CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16(C4);
CV_WARP_LINEAR_VECTOR_INTER_CONVERT_U16F32(C4);
CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4);
CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U16(C4);
}
CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16(C4);
CV_WARP_LINEAR_VECTOR_INTER_CONVERT_U16F32(C4);
CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4);
CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U16(C4);
}
#endif // (CV_SIMD || CV_SIMD_SCALABLE)
@ -3742,7 +3779,7 @@ void remapLinearInvoker_32FC1(const float *src_data, size_t src_step, int src_ro
std::array<float, max_vlanes_32> start_indices;
std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
inner_scols = vx_setall_u32((unsigned)srccols - 1),
outer_srows = vx_setall_u32((unsigned)srcrows + 1),
outer_scols = vx_setall_u32((unsigned)srccols + 1);
@ -3859,7 +3896,7 @@ void remapLinearInvoker_32FC3(const float *src_data, size_t src_step, int src_ro
std::array<float, max_vlanes_32> start_indices;
std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
inner_scols = vx_setall_u32((unsigned)srccols - 1),
outer_srows = vx_setall_u32((unsigned)srcrows + 1),
outer_scols = vx_setall_u32((unsigned)srccols + 1);
@ -3982,7 +4019,7 @@ void remapLinearInvoker_32FC4(const float *src_data, size_t src_step, int src_ro
std::array<float, max_vlanes_32> start_indices;
std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
inner_scols = vx_setall_u32((unsigned)srccols - 1),
outer_srows = vx_setall_u32((unsigned)srcrows + 1),
outer_scols = vx_setall_u32((unsigned)srccols + 1);
@ -3992,7 +4029,6 @@ void remapLinearInvoker_32FC4(const float *src_data, size_t src_step, int src_ro
int32_t addr[max_uf],
src_ix[max_uf],
src_iy[max_uf];
float pixbuf[max_uf*4*4];
float bvalbuf[max_uf*4];
for (int i = 0; i < uf; i++) {
@ -4026,16 +4062,25 @@ void remapLinearInvoker_32FC4(const float *src_data, size_t src_step, int src_ro
CV_REMAP_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C4);
if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image
CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN(C4, 32F);
float valpha[max_uf], vbeta[max_uf];
vx_store(valpha, src_x0);
vx_store(valpha+vlanes_32, src_x1);
vx_store(vbeta, src_y0);
vx_store(vbeta+vlanes_32, src_y1);
#if CV_SIMD256
CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_C4(32F);
#elif CV_SIMD128
CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(32F);
#elif CV_SIMD_SCALABLE
CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_C4(32F);
#endif
} else {
float pixbuf[max_uf*4*4];
CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C4, 32F);
CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32(C4);
CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4);
CV_WARP_LINEAR_VECTOR_INTER_STORE_F32F32(C4);
}
CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32(C4);
CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4);
CV_WARP_LINEAR_VECTOR_INTER_STORE_F32F32(C4);
}
#endif // (CV_SIMD || CV_SIMD_SCALABLE)
@ -4107,7 +4152,7 @@ void remapLinearApproxInvoker_8UC1(const uint8_t *src_data, size_t src_step, int
std::array<float, max_vlanes_32> start_indices;
std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
inner_scols = vx_setall_u32((unsigned)srccols - 1),
outer_srows = vx_setall_u32((unsigned)srcrows + 1),
outer_scols = vx_setall_u32((unsigned)srccols + 1);
@ -4229,7 +4274,7 @@ void remapLinearApproxInvoker_8UC3(const uint8_t *src_data, size_t src_step, int
std::array<float, max_vlanes_32> start_indices;
std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
inner_scols = vx_setall_u32((unsigned)srccols - 1),
outer_srows = vx_setall_u32((unsigned)srcrows + 1),
outer_scols = vx_setall_u32((unsigned)srccols + 1);
@ -4359,7 +4404,7 @@ void remapLinearApproxInvoker_8UC4(const uint8_t *src_data, size_t src_step, int
std::array<float, max_vlanes_32> start_indices;
std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
inner_scols = vx_setall_u32((unsigned)srccols - 1),
outer_srows = vx_setall_u32((unsigned)srcrows + 1),
outer_scols = vx_setall_u32((unsigned)srccols + 1);

Loading…
Cancel
Save