Merge pull request #24166 from hanliutong:rewrite-remaining

Rewrite Universal Intrinsic code: ImgProc (CV_SIMD_WIDTH related Part) #24166

Related PRs: #24058, #24132. The goal of this series of PRs is to rewrite the SIMD code blocks in the opencv/modules/imgproc folder using the new Universal Intrinsic API.

This PR mainly focuses on code that uses the `CV_SIMD_WIDTH` macro. This macro is sometimes used for loop tail processing, for example in `box_filter.simd.hpp` and `morph.simd.hpp`:

```cpp
#if CV_SIMD
int i = 0;
for (; i <= n - v_uint16::nlanes; i += v_uint16::nlanes) {
    // some universal intrinsic code
    // e.g. v_uint16
}
#if CV_SIMD_WIDTH > 16
for (; i <= n - v_uint16x8::nlanes; i += v_uint16x8::nlanes) {
    // handle the loop tail with 128-bit SIMD
    // e.g. v_uint16x8
}
#endif // CV_SIMD_WIDTH
#endif // CV_SIMD
```
The main difficulty is that the variable-length (scalable) Universal Intrinsic backends cannot use 128-bit fixed-length types such as `v_uint16x8`, so the fixed-width tail loop above cannot be kept. Therefore, this PR uses a scalar loop to handle the loop tail instead.
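For illustration, a minimal sketch of the rewritten pattern (the bound `n` and the loop bodies are placeholders, not code from this patch):

```cpp
int i = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
// The lane count is queried through VTraits at run time, so this loop
// also works for variable-length backends such as RVV.
for (; i <= n - VTraits<v_uint16>::vlanes(); i += VTraits<v_uint16>::vlanes()) {
    // same universal intrinsic body, with operators spelled as functions,
    // e.g. v_add(a, b) instead of a + b
}
#endif // CV_SIMD || CV_SIMD_SCALABLE
// The former 128-bit fixed-width tail loop is gone; remaining elements
// fall through to the scalar tail.
for (; i < n; i++) {
    // scalar code
}
```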

This PR is marked as draft because the changes to `box_filter.simd.hpp` trigger a compilation failure. The failure appears to be an internal compiler error (ICE) in GCC's RISC-V `vsetvl` pass:

```bash
box_filter.simd.hpp:1162:5: internal compiler error: Segmentation fault
 1162 |     }
      |     ^
0xe03883 crash_signal
        /wafer/share/gcc/gcc/toplev.cc:314
0x7ff261c4251f ???
        ./signal/../sysdeps/unix/sysv/linux/x86_64/libc_sigaction.c:0
0x6bde48 hash_set<rtl_ssa::set_info*, false, default_hash_traits<rtl_ssa::set_info*> >::iterator::operator*()
        /wafer/share/gcc/gcc/hash-set.h:125
0x6bde48 extract_single_source
        /wafer/share/gcc/gcc/config/riscv/riscv-vsetvl.cc:1184
0x6bde48 extract_single_source
        /wafer/share/gcc/gcc/config/riscv/riscv-vsetvl.cc:1174
0x119ad9e pass_vsetvl::propagate_avl() const
        /wafer/share/gcc/gcc/config/riscv/riscv-vsetvl.cc:4087
0x119ceaf pass_vsetvl::execute(function*)
        /wafer/share/gcc/gcc/config/riscv/riscv-vsetvl.cc:4344
0x119ceaf pass_vsetvl::execute(function*)
        /wafer/share/gcc/gcc/config/riscv/riscv-vsetvl.cc:4325
Please submit a full bug report, with preprocessed source (by using -freport-bug).
Please include the complete backtrace with any bug report.
```

This PR compiles with Clang 16, and `opencv_test_imgproc` passes on QEMU.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
Files changed:

1. `modules/imgproc/src/box_filter.simd.hpp` (316 changed lines)
2. `modules/imgproc/src/morph.simd.hpp` (170 changed lines)
3. `modules/imgproc/src/shapedescr.cpp` (52 changed lines)
**modules/imgproc/src/box_filter.simd.hpp**

```diff
@@ -309,15 +309,15 @@ struct ColumnSum<int, uchar> :
 {
     const int* Sp = (const int*)src[0];
     int i = 0;
-#if CV_SIMD
-    for (; i <= width - v_int32::nlanes; i += v_int32::nlanes)
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    for (; i <= width - VTraits<v_int32>::vlanes(); i += VTraits<v_int32>::vlanes())
     {
-        v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+        v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
     }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
     for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes)
     {
-        v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+        v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
     }
 #endif
 #endif
@@ -339,37 +339,37 @@ struct ColumnSum<int, uchar> :
 if( haveScale )
 {
     int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_float32 _v_scale = vx_setall_f32((float)_scale);
-    for( ; i <= width - v_uint16::nlanes; i += v_uint16::nlanes )
+    for( ; i <= width - VTraits<v_uint16>::vlanes(); i += VTraits<v_uint16>::vlanes() )
     {
-        v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-        v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+        v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+        v_int32 v_s01 = v_add(vx_load(SUM + i + VTraits<v_int32>::vlanes()), vx_load(Sp + i + VTraits<v_int32>::vlanes()));
-        v_uint32 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * _v_scale));
-        v_uint32 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * _v_scale));
+        v_uint32 v_s0d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s0), _v_scale)));
+        v_uint32 v_s01d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s01), _v_scale)));
         v_uint16 v_dst = v_pack(v_s0d, v_s01d);
         v_pack_store(D + i, v_dst);
-        v_store(SUM + i, v_s0 - vx_load(Sm + i));
-        v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+        v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
+        v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
     }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
     v_float32x4 v_scale = v_setall_f32((float)_scale);
     for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes )
     {
-        v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-        v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+        v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+        v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));
-        v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * v_scale));
-        v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * v_scale));
+        v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s0), v_scale)));
+        v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s01), v_scale)));
         v_uint16x8 v_dst = v_pack(v_s0d, v_s01d);
         v_pack_store(D + i, v_dst);
-        v_store(SUM + i, v_s0 - v_load(Sm + i));
-        v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+        v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
+        v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
     }
 #endif
 #endif
@@ -383,29 +383,29 @@ struct ColumnSum<int, uchar> :
 else
 {
     int i = 0;
-#if CV_SIMD
-    for( ; i <= width-v_uint16::nlanes; i+=v_uint16::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    for( ; i <= width-VTraits<v_uint16>::vlanes(); i+=VTraits<v_uint16>::vlanes() )
     {
-        v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-        v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+        v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+        v_int32 v_s01 = v_add(vx_load(SUM + i + VTraits<v_int32>::vlanes()), vx_load(Sp + i + VTraits<v_int32>::vlanes()));
         v_uint16 v_dst = v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01));
         v_pack_store(D + i, v_dst);
-        v_store(SUM + i, v_s0 - vx_load(Sm + i));
-        v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+        v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
+        v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
     }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
     for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes )
     {
-        v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-        v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+        v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+        v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));
         v_uint16x8 v_dst = v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01));
         v_pack_store(D + i, v_dst);
-        v_store(SUM + i, v_s0 - v_load(Sm + i));
-        v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+        v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
+        v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
     }
 #endif
 #endif
@@ -480,15 +480,15 @@ public BaseColumnFilter
 {
     const ushort* Sp = (const ushort*)src[0];
     int i = 0;
-#if CV_SIMD
-    for( ; i <= width - v_uint16::nlanes; i += v_uint16::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    for( ; i <= width - VTraits<v_uint16>::vlanes(); i += VTraits<v_uint16>::vlanes() )
     {
-        v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+        v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
     }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
     for( ; i <= width - v_uint16x8::nlanes; i += v_uint16x8::nlanes )
     {
-        v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+        v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
     }
 #endif
 #endif
@@ -510,27 +510,27 @@ public BaseColumnFilter
 if( haveScale )
 {
     int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_uint32 _ds4 = vx_setall_u32((unsigned)ds);
     v_uint16 _dd8 = vx_setall_u16((ushort)dd);
-    for( ; i <= width-v_uint8::nlanes; i+=v_uint8::nlanes )
+    for( ; i <= width-VTraits<v_uint8>::vlanes(); i+=VTraits<v_uint8>::vlanes() )
     {
         v_uint16 _sm0 = vx_load(Sm + i);
-        v_uint16 _sm1 = vx_load(Sm + i + v_uint16::nlanes);
+        v_uint16 _sm1 = vx_load(Sm + i + VTraits<v_uint16>::vlanes());
         v_uint16 _s0 = v_add_wrap(vx_load(SUM + i), vx_load(Sp + i));
-        v_uint16 _s1 = v_add_wrap(vx_load(SUM + i + v_uint16::nlanes), vx_load(Sp + i + v_uint16::nlanes));
+        v_uint16 _s1 = v_add_wrap(vx_load(SUM + i + VTraits<v_uint16>::vlanes()), vx_load(Sp + i + VTraits<v_uint16>::vlanes()));
         v_uint32 _s00, _s01, _s10, _s11;
-        v_expand(_s0 + _dd8, _s00, _s01);
-        v_expand(_s1 + _dd8, _s10, _s11);
+        v_expand(v_add(_s0, _dd8), _s00, _s01);
+        v_expand(v_add(_s1, _dd8), _s10, _s11);
-        _s00 = v_shr<SHIFT>(_s00*_ds4);
-        _s01 = v_shr<SHIFT>(_s01*_ds4);
-        _s10 = v_shr<SHIFT>(_s10*_ds4);
-        _s11 = v_shr<SHIFT>(_s11*_ds4);
+        _s00 = v_shr<SHIFT>(v_mul(_s00, _ds4));
+        _s01 = v_shr<SHIFT>(v_mul(_s01, _ds4));
+        _s10 = v_shr<SHIFT>(v_mul(_s10, _ds4));
+        _s11 = v_shr<SHIFT>(v_mul(_s11, _ds4));
         v_int16 r0 = v_pack(v_reinterpret_as_s32(_s00), v_reinterpret_as_s32(_s01));
         v_int16 r1 = v_pack(v_reinterpret_as_s32(_s10), v_reinterpret_as_s32(_s11));
@@ -540,9 +540,9 @@ public BaseColumnFilter
         v_store(D + i, v_pack_u(r0, r1));
         v_store(SUM + i, _s0);
-        v_store(SUM + i + v_uint16::nlanes, _s1);
+        v_store(SUM + i + VTraits<v_uint16>::vlanes(), _s1);
     }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
     v_uint32x4 ds4 = v_setall_u32((unsigned)ds);
     v_uint16x8 dd8 = v_setall_u16((ushort)dd);
@@ -556,13 +556,13 @@ public BaseColumnFilter
         v_uint32x4 _s00, _s01, _s10, _s11;
-        v_expand(_s0 + dd8, _s00, _s01);
-        v_expand(_s1 + dd8, _s10, _s11);
+        v_expand(v_add(_s0, dd8), _s00, _s01);
+        v_expand(v_add(_s1, dd8), _s10, _s11);
-        _s00 = v_shr<SHIFT>(_s00*ds4);
-        _s01 = v_shr<SHIFT>(_s01*ds4);
-        _s10 = v_shr<SHIFT>(_s10*ds4);
-        _s11 = v_shr<SHIFT>(_s11*ds4);
+        _s00 = v_shr<SHIFT>(v_mul(_s00, ds4));
+        _s01 = v_shr<SHIFT>(v_mul(_s01, ds4));
+        _s10 = v_shr<SHIFT>(v_mul(_s10, ds4));
+        _s11 = v_shr<SHIFT>(v_mul(_s11, ds4));
         v_int16x8 r0 = v_pack(v_reinterpret_as_s32(_s00), v_reinterpret_as_s32(_s01));
         v_int16x8 r1 = v_pack(v_reinterpret_as_s32(_s10), v_reinterpret_as_s32(_s11));
@@ -643,15 +643,15 @@ struct ColumnSum<int, short> :
 {
     const int* Sp = (const int*)src[0];
     i = 0;
-#if CV_SIMD
-    for( ; i <= width - v_int32::nlanes; i+=v_int32::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    for( ; i <= width - VTraits<v_int32>::vlanes(); i+=VTraits<v_int32>::vlanes() )
     {
-        v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+        v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
     }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
     for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes )
     {
-        v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+        v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
     }
 #endif
 #endif
@@ -673,33 +673,33 @@ struct ColumnSum<int, short> :
 if( haveScale )
 {
     i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_float32 _v_scale = vx_setall_f32((float)_scale);
-    for( ; i <= width-v_int16::nlanes; i+=v_int16::nlanes )
+    for( ; i <= width-VTraits<v_int16>::vlanes(); i+=VTraits<v_int16>::vlanes() )
     {
-        v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-        v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+        v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+        v_int32 v_s01 = v_add(vx_load(SUM + i + VTraits<v_int32>::vlanes()), vx_load(Sp + i + VTraits<v_int32>::vlanes()));
-        v_int32 v_s0d = v_round(v_cvt_f32(v_s0) * _v_scale);
-        v_int32 v_s01d = v_round(v_cvt_f32(v_s01) * _v_scale);
+        v_int32 v_s0d = v_round(v_mul(v_cvt_f32(v_s0), _v_scale));
+        v_int32 v_s01d = v_round(v_mul(v_cvt_f32(v_s01), _v_scale));
         v_store(D + i, v_pack(v_s0d, v_s01d));
-        v_store(SUM + i, v_s0 - vx_load(Sm + i));
-        v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+        v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
+        v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
     }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
     v_float32x4 v_scale = v_setall_f32((float)_scale);
    for( ; i <= width-v_int16x8::nlanes; i+=v_int16x8::nlanes )
     {
-        v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-        v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+        v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+        v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));
-        v_int32x4 v_s0d = v_round(v_cvt_f32(v_s0) * v_scale);
-        v_int32x4 v_s01d = v_round(v_cvt_f32(v_s01) * v_scale);
+        v_int32x4 v_s0d = v_round(v_mul(v_cvt_f32(v_s0), v_scale));
+        v_int32x4 v_s01d = v_round(v_mul(v_cvt_f32(v_s01), v_scale));
         v_store(D + i, v_pack(v_s0d, v_s01d));
-        v_store(SUM + i, v_s0 - v_load(Sm + i));
-        v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+        v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
+        v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
     }
 #endif
 #endif
@@ -713,27 +713,27 @@ struct ColumnSum<int, short> :
 else
 {
     i = 0;
-#if CV_SIMD
-    for( ; i <= width-v_int16::nlanes; i+=v_int16::nlanes )
+#if CV_SIMD // TODO: enable for CV_SIMD_SCALABLE, GCC 13 related
+    for( ; i <= width-VTraits<v_int16>::vlanes(); i+=VTraits<v_int16>::vlanes() )
     {
-        v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-        v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+        v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+        v_int32 v_s01 = v_add(vx_load(SUM + i + VTraits<v_int32>::vlanes()), vx_load(Sp + i + VTraits<v_int32>::vlanes()));
         v_store(D + i, v_pack(v_s0, v_s01));
-        v_store(SUM + i, v_s0 - vx_load(Sm + i));
-        v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+        v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
+        v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
     }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
     for( ; i <= width-v_int16x8::nlanes; i+=v_int16x8::nlanes )
     {
-        v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-        v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+        v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+        v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));
         v_store(D + i, v_pack(v_s0, v_s01));
-        v_store(SUM + i, v_s0 - v_load(Sm + i));
-        v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+        v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
+        v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
     }
 #endif
 #endif
@@ -792,15 +792,15 @@ struct ColumnSum<int, ushort> :
 {
     const int* Sp = (const int*)src[0];
     int i = 0;
-#if CV_SIMD
-    for (; i <= width - v_int32::nlanes; i += v_int32::nlanes)
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    for (; i <= width - VTraits<v_int32>::vlanes(); i += VTraits<v_int32>::vlanes())
     {
-        v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+        v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
     }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
     for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes)
     {
-        v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+        v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
     }
 #endif
 #endif
@@ -822,33 +822,33 @@ struct ColumnSum<int, ushort> :
 if( haveScale )
 {
     int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_float32 _v_scale = vx_setall_f32((float)_scale);
-    for( ; i <= width-v_uint16::nlanes; i+=v_uint16::nlanes )
+    for( ; i <= width-VTraits<v_uint16>::vlanes(); i+=VTraits<v_uint16>::vlanes() )
     {
-        v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-        v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+        v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+        v_int32 v_s01 = v_add(vx_load(SUM + i + VTraits<v_int32>::vlanes()), vx_load(Sp + i + VTraits<v_int32>::vlanes()));
-        v_uint32 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * _v_scale));
-        v_uint32 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * _v_scale));
+        v_uint32 v_s0d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s0), _v_scale)));
+        v_uint32 v_s01d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s01), _v_scale)));
         v_store(D + i, v_pack(v_s0d, v_s01d));
-        v_store(SUM + i, v_s0 - vx_load(Sm + i));
-        v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+        v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
+        v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
     }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
     v_float32x4 v_scale = v_setall_f32((float)_scale);
     for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes )
     {
-        v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-        v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+        v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+        v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));
-        v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * v_scale));
-        v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * v_scale));
+        v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s0), v_scale)));
+        v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s01), v_scale)));
         v_store(D + i, v_pack(v_s0d, v_s01d));
-        v_store(SUM + i, v_s0 - v_load(Sm + i));
-        v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+        v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
+        v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
    }
 #endif
 #endif
@@ -862,27 +862,27 @@ struct ColumnSum<int, ushort> :
 else
 {
     int i = 0;
-#if CV_SIMD
-    for( ; i <= width-v_uint16::nlanes; i+=v_uint16::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    for( ; i <= width-VTraits<v_uint16>::vlanes(); i+=VTraits<v_uint16>::vlanes() )
     {
-        v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-        v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+        v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+        v_int32 v_s01 = v_add(vx_load(SUM + i + VTraits<v_int32>::vlanes()), vx_load(Sp + i + VTraits<v_int32>::vlanes()));
         v_store(D + i, v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01)));
-        v_store(SUM + i, v_s0 - vx_load(Sm + i));
-        v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+        v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
+        v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
     }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
     for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes )
     {
-        v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-        v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+        v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+        v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));
         v_store(D + i, v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01)));
-        v_store(SUM + i, v_s0 - v_load(Sm + i));
-        v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+        v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
+        v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
     }
 #endif
 #endif
@@ -939,15 +939,15 @@ struct ColumnSum<int, int> :
 {
     const int* Sp = (const int*)src[0];
     int i = 0;
-#if CV_SIMD
-    for( ; i <= width - v_int32::nlanes; i+=v_int32::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    for( ; i <= width - VTraits<v_int32>::vlanes(); i+=VTraits<v_int32>::vlanes() )
     {
-        v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+        v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
     }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
    for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes )
     {
-        v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+        v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
     }
 #endif
 #endif
@@ -969,25 +969,25 @@ struct ColumnSum<int, int> :
 if( haveScale )
 {
     int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_float32 _v_scale = vx_setall_f32((float)_scale);
-    for( ; i <= width-v_int32::nlanes; i+=v_int32::nlanes )
+    for( ; i <= width-VTraits<v_int32>::vlanes(); i+=VTraits<v_int32>::vlanes() )
     {
-        v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-        v_int32 v_s0d = v_round(v_cvt_f32(v_s0) * _v_scale);
+        v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+        v_int32 v_s0d = v_round(v_mul(v_cvt_f32(v_s0), _v_scale));
         v_store(D + i, v_s0d);
-        v_store(SUM + i, v_s0 - vx_load(Sm + i));
+        v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
     }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
     v_float32x4 v_scale = v_setall_f32((float)_scale);
     for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes )
     {
-        v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-        v_int32x4 v_s0d = v_round(v_cvt_f32(v_s0) * v_scale);
+        v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+        v_int32x4 v_s0d = v_round(v_mul(v_cvt_f32(v_s0), v_scale));
         v_store(D + i, v_s0d);
-        v_store(SUM + i, v_s0 - v_load(Sm + i));
+        v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
     }
 #endif
 #endif
@@ -1001,21 +1001,21 @@ struct ColumnSum<int, int> :
 else
 {
     int i = 0;
-#if CV_SIMD
-    for( ; i <= width-v_int32::nlanes; i+=v_int32::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    for( ; i <= width-VTraits<v_int32>::vlanes(); i+=VTraits<v_int32>::vlanes() )
     {
-        v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
+        v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
         v_store(D + i, v_s0);
-        v_store(SUM + i, v_s0 - vx_load(Sm + i));
+        v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
     }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
     for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes )
     {
-        v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
+        v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
         v_store(D + i, v_s0);
-        v_store(SUM + i, v_s0 - v_load(Sm + i));
+        v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
     }
 #endif
 #endif
@@ -1073,15 +1073,15 @@ struct ColumnSum<int, float> :
 {
     const int* Sp = (const int*)src[0];
     int i = 0;
-#if CV_SIMD
-    for( ; i <= width - v_int32::nlanes; i+=v_int32::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    for( ; i <= width - VTraits<v_int32>::vlanes(); i+=VTraits<v_int32>::vlanes() )
     {
-        v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+        v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
     }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
     for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes )
     {
-        v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+        v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
    }
 #endif
 #endif
@@ -1105,21 +1105,21 @@ struct ColumnSum<int, float> :
 {
     int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_float32 _v_scale = vx_setall_f32((float)_scale);
-    for (; i <= width - v_int32::nlanes; i += v_int32::nlanes)
+    for (; i <= width - VTraits<v_int32>::vlanes(); i += VTraits<v_int32>::vlanes())
     {
-        v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-        v_store(D + i, v_cvt_f32(v_s0) * _v_scale);
-        v_store(SUM + i, v_s0 - vx_load(Sm + i));
+        v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+        v_store(D + i, v_mul(v_cvt_f32(v_s0), _v_scale));
+        v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
     }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
     v_float32x4 v_scale = v_setall_f32((float)_scale);
     for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes)
     {
-        v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-        v_store(D + i, v_cvt_f32(v_s0) * v_scale);
-        v_store(SUM + i, v_s0 - v_load(Sm + i));
+        v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+        v_store(D + i, v_mul(v_cvt_f32(v_s0), v_scale));
+        v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
    }
 #endif
 #endif
@@ -1134,19 +1134,19 @@ struct ColumnSum<int, float> :
 {
     int i = 0;
-#if CV_SIMD
-    for( ; i <= width-v_int32::nlanes; i+=v_int32::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    for( ; i <= width-VTraits<v_int32>::vlanes(); i+=VTraits<v_int32>::vlanes() )
     {
-        v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
+        v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
         v_store(D + i, v_cvt_f32(v_s0));
-        v_store(SUM + i, v_s0 - vx_load(Sm + i));
+        v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
     }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
     for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes )
     {
-        v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
+        v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
         v_store(D + i, v_cvt_f32(v_s0));
-        v_store(SUM + i, v_s0 - v_load(Sm + i));
+        v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
     }
 #endif
 #endif
```

**modules/imgproc/src/morph.simd.hpp**

```diff
@@ -106,12 +106,12 @@ struct MorphNoVec
     int operator()(uchar**, int, uchar*, int) const { return 0; }
 };
-#if CV_SIMD
+#if CV_SIMD // TODO: enable for CV_SIMD_SCALABLE, GCC 13 related
 template<class VecUpdate> struct MorphRowVec
 {
     typedef typename VecUpdate::vtype vtype;
-    typedef typename vtype::lane_type stype;
+    typedef typename VTraits<vtype>::lane_type stype;
     MorphRowVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {}
     int operator()(const uchar* src, uchar* dst, int width, int cn) const
     {
@@ -121,52 +121,52 @@ template<class VecUpdate> struct MorphRowVec
 width *= cn;
 VecUpdate updateOp;
-for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes )
+for( i = 0; i <= width - 4*VTraits<vtype>::vlanes(); i += 4*VTraits<vtype>::vlanes() )
 {
     vtype s0 = vx_load((const stype*)src + i);
-    vtype s1 = vx_load((const stype*)src + i + vtype::nlanes);
-    vtype s2 = vx_load((const stype*)src + i + 2*vtype::nlanes);
-    vtype s3 = vx_load((const stype*)src + i + 3*vtype::nlanes);
+    vtype s1 = vx_load((const stype*)src + i + VTraits<vtype>::vlanes());
+    vtype s2 = vx_load((const stype*)src + i + 2*VTraits<vtype>::vlanes());
+    vtype s3 = vx_load((const stype*)src + i + 3*VTraits<vtype>::vlanes());
     for (k = cn; k < _ksize; k += cn)
     {
         s0 = updateOp(s0, vx_load((const stype*)src + i + k));
-        s1 = updateOp(s1, vx_load((const stype*)src + i + k + vtype::nlanes));
-        s2 = updateOp(s2, vx_load((const stype*)src + i + k + 2*vtype::nlanes));
-        s3 = updateOp(s3, vx_load((const stype*)src + i + k + 3*vtype::nlanes));
+        s1 = updateOp(s1, vx_load((const stype*)src + i + k + VTraits<vtype>::vlanes()));
+        s2 = updateOp(s2, vx_load((const stype*)src + i + k + 2*VTraits<vtype>::vlanes()));
+        s3 = updateOp(s3, vx_load((const stype*)src + i + k + 3*VTraits<vtype>::vlanes()));
     }
     v_store((stype*)dst + i, s0);
-    v_store((stype*)dst + i + vtype::nlanes, s1);
-    v_store((stype*)dst + i + 2*vtype::nlanes, s2);
-    v_store((stype*)dst + i + 3*vtype::nlanes, s3);
+    v_store((stype*)dst + i + VTraits<vtype>::vlanes(), s1);
+    v_store((stype*)dst + i + 2*VTraits<vtype>::vlanes(), s2);
+    v_store((stype*)dst + i + 3*VTraits<vtype>::vlanes(), s3);
 }
-if( i <= width - 2*vtype::nlanes )
+if( i <= width - 2*VTraits<vtype>::vlanes() )
 {
     vtype s0 = vx_load((const stype*)src + i);
-    vtype s1 = vx_load((const stype*)src + i + vtype::nlanes);
+    vtype s1 = vx_load((const stype*)src + i + VTraits<vtype>::vlanes());
     for( k = cn; k < _ksize; k += cn )
     {
         s0 = updateOp(s0, vx_load((const stype*)src + i + k));
-        s1 = updateOp(s1, vx_load((const stype*)src + i + k + vtype::nlanes));
+        s1 = updateOp(s1, vx_load((const stype*)src + i + k + VTraits<vtype>::vlanes()));
     }
     v_store((stype*)dst + i, s0);
-    v_store((stype*)dst + i + vtype::nlanes, s1);
-    i += 2*vtype::nlanes;
+    v_store((stype*)dst + i + VTraits<vtype>::vlanes(), s1);
+    i += 2*VTraits<vtype>::vlanes();
 }
-if( i <= width - vtype::nlanes )
+if( i <= width - VTraits<vtype>::vlanes() )
 {
     vtype s = vx_load((const stype*)src + i);
     for( k = cn; k < _ksize; k += cn )
         s = updateOp(s, vx_load((const stype*)src + i + k));
     v_store((stype*)dst + i, s);
-    i += vtype::nlanes;
+    i += VTraits<vtype>::vlanes();
 }
-if( i <= width - vtype::nlanes/2 )
+if( i <= width - VTraits<vtype>::vlanes()/2 )
 {
     vtype s = vx_load_low((const stype*)src + i);
     for( k = cn; k < _ksize; k += cn )
         s = updateOp(s, vx_load_low((const stype*)src + i + k));
     v_store_low((stype*)dst + i, s);
-    i += vtype::nlanes/2;
+    i += VTraits<vtype>::vlanes()/2;
 }
 return i - i % cn;
@@ -179,7 +179,7 @@ template<class VecUpdate> struct MorphRowVec
 template<class VecUpdate> struct MorphColumnVec
 {
     typedef typename VecUpdate::vtype vtype;
-    typedef typename vtype::lane_type stype;
+    typedef typename VTraits<vtype>::lane_type stype;
     MorphColumnVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {}
     int operator()(const uchar** _src, uchar* _dst, int dststep, int count, int width) const
     {
@@ -189,7 +189,7 @@ template<class VecUpdate> struct MorphColumnVec
 VecUpdate updateOp;
 for( i = 0; i < count + ksize - 1; i++ )
-    CV_Assert( ((size_t)_src[i] & (CV_SIMD_WIDTH-1)) == 0 );
+    CV_Assert( ((size_t)_src[i] & (VTraits<v_uint8>::vlanes()-1)) == 0 );
 const stype** src = (const stype**)_src;
 stype* dst = (stype*)_dst;
@@ -197,58 +197,58 @@ template<class VecUpdate> struct MorphColumnVec
 for( ; _ksize > 1 && count > 1; count -= 2, dst += dststep*2, src += 2 )
 {
-    for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes)
+    for( i = 0; i <= width - 4*VTraits<vtype>::vlanes(); i += 4*VTraits<vtype>::vlanes())
     {
         const stype* sptr = src[1] + i;
         vtype s0 = vx_load_aligned(sptr);
-        vtype s1 = vx_load_aligned(sptr + vtype::nlanes);
-        vtype s2 = vx_load_aligned(sptr + 2*vtype::nlanes);
-        vtype s3 = vx_load_aligned(sptr + 3*vtype::nlanes);
+        vtype s1 = vx_load_aligned(sptr + VTraits<vtype>::vlanes());
+        vtype s2 = vx_load_aligned(sptr + 2*VTraits<vtype>::vlanes());
+        vtype s3 = vx_load_aligned(sptr + 3*VTraits<vtype>::vlanes());
         for( k = 2; k < _ksize; k++ )
         {
             sptr = src[k] + i;
             s0 = updateOp(s0, vx_load_aligned(sptr));
-            s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes));
-            s2 = updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes));
-            s3 = updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes));
+            s1 = updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes()));
+            s2 = updateOp(s2, vx_load_aligned(sptr + 2*VTraits<vtype>::vlanes()));
+            s3 = updateOp(s3, vx_load_aligned(sptr + 3*VTraits<vtype>::vlanes()));
         }
         sptr = src[0] + i;
         v_store(dst + i, updateOp(s0, vx_load_aligned(sptr)));
-        v_store(dst + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)));
-        v_store(dst + i + 2*vtype::nlanes, updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes)));
-        v_store(dst + i + 3*vtype::nlanes, updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes)));
+        v_store(dst + i + VTraits<vtype>::vlanes(), updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes())));
+        v_store(dst + i + 2*VTraits<vtype>::vlanes(), updateOp(s2, vx_load_aligned(sptr + 2*VTraits<vtype>::vlanes())));
+        v_store(dst + i + 3*VTraits<vtype>::vlanes(), updateOp(s3, vx_load_aligned(sptr + 3*VTraits<vtype>::vlanes())));
         sptr = src[k] + i;
         v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(sptr)));
-        v_store(dst + dststep + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)));
-        v_store(dst + dststep + i + 2*vtype::nlanes, updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes)));
-        v_store(dst + dststep + i + 3*vtype::nlanes, updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes)));
+        v_store(dst + dststep + i + VTraits<vtype>::vlanes(), updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes())));
+        v_store(dst + dststep + i + 2*VTraits<vtype>::vlanes(), updateOp(s2, vx_load_aligned(sptr + 2*VTraits<vtype>::vlanes())));
+        v_store(dst + dststep + i + 3*VTraits<vtype>::vlanes(), updateOp(s3, vx_load_aligned(sptr + 3*VTraits<vtype>::vlanes())));
     }
-    if( i <= width - 2*vtype::nlanes )
+    if( i <= width - 2*VTraits<vtype>::vlanes() )
     {
         const stype* sptr = src[1] + i;
         vtype s0 = vx_load_aligned(sptr);
-        vtype s1 = vx_load_aligned(sptr + vtype::nlanes);
+        vtype s1 = vx_load_aligned(sptr + VTraits<vtype>::vlanes());
         for( k = 2; k < _ksize; k++ )
        {
             sptr = src[k] + i;
             s0 = updateOp(s0, vx_load_aligned(sptr));
-            s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes));
+            s1 = updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes()));
         }
         sptr = src[0] + i;
         v_store(dst + i, updateOp(s0, vx_load_aligned(sptr)));
-        v_store(dst + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)));
+        v_store(dst + i + VTraits<vtype>::vlanes(), updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes())));
         sptr = src[k] + i;
         v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(sptr)));
-        v_store(dst + dststep + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)));
-        i += 2*vtype::nlanes;
+        v_store(dst + dststep + i + VTraits<vtype>::vlanes(), updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes())));
+        i += 2*VTraits<vtype>::vlanes();
     }
-    if( i <= width - vtype::nlanes )
+    if( i <= width - VTraits<vtype>::vlanes() )
     {
         vtype s0 = vx_load_aligned(src[1] + i);
@@ -257,9 +257,9 @@ template<class VecUpdate> struct MorphColumnVec
         v_store(dst + i, updateOp(s0, vx_load_aligned(src[0] + i)));
         v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(src[k] + i)));
-        i += vtype::nlanes;
+        i += VTraits<vtype>::vlanes();
     }
-    if( i <= width - vtype::nlanes/2 )
+    if( i <= width - VTraits<vtype>::vlanes()/2 )
     {
         vtype s0 = vx_load_low(src[1] + i);
@@ -268,66 +268,66 @@ template<class VecUpdate> struct MorphColumnVec
         v_store_low(dst + i, updateOp(s0, vx_load_low(src[0] + i)));
         v_store_low(dst + dststep + i, updateOp(s0, vx_load_low(src[k] + i)));
-        i += vtype::nlanes/2;
+        i += VTraits<vtype>::vlanes()/2;
     }
 }
 for( ; count > 0; count--, dst += dststep, src++ )
 {
-    for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes)
+    for( i = 0; i <= width - 4*VTraits<vtype>::vlanes(); i += 4*VTraits<vtype>::vlanes())
     {
         const stype* sptr = src[0] + i;
         vtype s0 = vx_load_aligned(sptr);
-        vtype s1 = vx_load_aligned(sptr + vtype::nlanes);
-        vtype s2 = vx_load_aligned(sptr + 2*vtype::nlanes);
-        vtype s3 = vx_load_aligned(sptr + 3*vtype::nlanes);
+        vtype s1 = vx_load_aligned(sptr + VTraits<vtype>::vlanes());
+        vtype s2 = vx_load_aligned(sptr + 2*VTraits<vtype>::vlanes());
+        vtype s3 = vx_load_aligned(sptr + 3*VTraits<vtype>::vlanes());
         for( k = 1; k < _ksize; k++ )
         {
            sptr = src[k] + i;
            s0 = updateOp(s0, vx_load_aligned(sptr));
-           s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes));
-           s2 = updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes));
-           s3 = updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes));
+           s1 = updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes()));
+           s2 = updateOp(s2, vx_load_aligned(sptr + 2*VTraits<vtype>::vlanes()));
+           s3 = updateOp(s3, vx_load_aligned(sptr + 3*VTraits<vtype>::vlanes()));
         }
         v_store(dst + i, s0);
-        v_store(dst + i + vtype::nlanes, s1);
-        v_store(dst + i + 2*vtype::nlanes, s2);
-        v_store(dst + i + 3*vtype::nlanes, s3);
+        v_store(dst + i + VTraits<vtype>::vlanes(), s1);
+        v_store(dst + i + 2*VTraits<vtype>::vlanes(), s2);
+        v_store(dst + i + 3*VTraits<vtype>::vlanes(), s3);
     }
-    if( i <= width - 2*vtype::nlanes )
+    if( i <= width - 2*VTraits<vtype>::vlanes() )
     {
         const stype* sptr = src[0] + i;
         vtype s0 = vx_load_aligned(sptr);
-        vtype s1 = vx_load_aligned(sptr + vtype::nlanes);
+        vtype s1 = vx_load_aligned(sptr + VTraits<vtype>::vlanes());
         for( k = 1; k < _ksize; k++ )
         {
            sptr = src[k] + i;
            s0 = updateOp(s0, vx_load_aligned(sptr));
-           s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes));
+           s1 = updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes()));
        }
         v_store(dst + i, s0);
-        v_store(dst + i + vtype::nlanes, s1);
-        i += 2*vtype::nlanes;
+        v_store(dst + i + VTraits<vtype>::vlanes(), s1);
+        i += 2*VTraits<vtype>::vlanes();
     }
-    if( i <= width - vtype::nlanes )
+    if( i <= width - VTraits<vtype>::vlanes() )
     {
         vtype s0 = vx_load_aligned(src[0] + i);
         for( k = 1; k < _ksize; k++ )
             s0 = updateOp(s0, vx_load_aligned(src[k] + i));
         v_store(dst + i, s0);
-        i += vtype::nlanes;
+        i += VTraits<vtype>::vlanes();
     }
-    if( i <= width - vtype::nlanes/2 )
+    if( i <= width - VTraits<vtype>::vlanes()/2 )
    {
         vtype s0 = vx_load_low(src[0] + i);
         for( k = 1; k < _ksize; k++ )
             s0 = updateOp(s0, vx_load_low(src[k] + i));
         v_store_low(dst + i, s0);
-        i += vtype::nlanes/2;
+        i += VTraits<vtype>::vlanes()/2;
     }
 }
@@ -341,7 +341,7 @@ template<class VecUpdate> struct MorphColumnVec
 template<class VecUpdate> struct MorphVec
 {
     typedef typename VecUpdate::vtype vtype;
-    typedef typename vtype::lane_type stype;
+    typedef typename VTraits<vtype>::lane_type stype;
     int operator()(uchar** _src, int nz, uchar* _dst, int width) const
     {
         CV_INSTRUMENT_REGION();
@@ -351,56 +351,56 @@ template<class VecUpdate> struct MorphVec
 int i, k;
 VecUpdate updateOp;
-for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes )
+for( i = 0; i <= width - 4*VTraits<vtype>::vlanes(); i += 4*VTraits<vtype>::vlanes() )
 {
     const stype* sptr = src[0] + i;
     vtype s0 = vx_load(sptr);
-    vtype s1 = vx_load(sptr + vtype::nlanes);
-    vtype s2 = vx_load(sptr + 2*vtype::nlanes);
-    vtype s3 = vx_load(sptr + 3*vtype::nlanes);
+    vtype s1 = vx_load(sptr + VTraits<vtype>::vlanes());
+    vtype s2 = vx_load(sptr + 2*VTraits<vtype>::vlanes());
+    vtype s3 = vx_load(sptr + 3*VTraits<vtype>::vlanes());
     for( k = 1; k < nz; k++ )
     {
         sptr = src[k] + i;
         s0 = updateOp(s0, vx_load(sptr));
-        s1 = updateOp(s1, vx_load(sptr + vtype::nlanes));
-        s2 = updateOp(s2, vx_load(sptr + 2*vtype::nlanes));
-        s3 = updateOp(s3, vx_load(sptr + 3*vtype::nlanes));
+        s1 = updateOp(s1, vx_load(sptr + VTraits<vtype>::vlanes()));
+        s2 = updateOp(s2, vx_load(sptr + 2*VTraits<vtype>::vlanes()));
+        s3 = updateOp(s3, vx_load(sptr + 3*VTraits<vtype>::vlanes()));
     }
     v_store(dst + i, s0);
-    v_store(dst + i + vtype::nlanes, s1);
-    v_store(dst + i + 2*vtype::nlanes, s2);
-    v_store(dst + i + 3*vtype::nlanes, s3);
+    v_store(dst + i + VTraits<vtype>::vlanes(), s1);
+    v_store(dst + i + 2*VTraits<vtype>::vlanes(), s2);
+    v_store(dst + i + 3*VTraits<vtype>::vlanes(), s3);
 }
-if( i <= width - 2*vtype::nlanes )
+if( i <= width - 2*VTraits<vtype>::vlanes() )
 {
     const stype* sptr = src[0] + i;
     vtype s0 = vx_load(sptr);
-    vtype s1 = vx_load(sptr + vtype::nlanes);
+    vtype s1 = vx_load(sptr + VTraits<vtype>::vlanes());
     for( k = 1; k < nz; k++ )
     {
         sptr = src[k] + i;
         s0 = updateOp(s0, vx_load(sptr));
-        s1 = updateOp(s1, vx_load(sptr + vtype::nlanes));
+        s1 = updateOp(s1, vx_load(sptr + VTraits<vtype>::vlanes()));
     }
     v_store(dst + i, s0);
-    v_store(dst + i + vtype::nlanes, s1);
-    i += 2*vtype::nlanes;
+    v_store(dst + i + VTraits<vtype>::vlanes(), s1);
+    i += 2*VTraits<vtype>::vlanes();
 }
-if( i <= width - vtype::nlanes )
+if( i <= width - VTraits<vtype>::vlanes() )
 {
     vtype s0 = vx_load(src[0] + i);
     for( k = 1; k < nz; k++ )
         s0 = updateOp(s0, vx_load(src[k] + i));
     v_store(dst + i, s0);
-    i += vtype::nlanes;
+    i += VTraits<vtype>::vlanes();
 }
-if( i <= width - vtype::nlanes/2 )
+if( i <= width - VTraits<vtype>::vlanes()/2 )
 {
     vtype s0 = vx_load_low(src[0] + i);
     for( k = 1; k < nz; k++ )
         s0 = updateOp(s0, vx_load_low(src[k] + i));
     v_store_low(dst + i, s0);
-    i += vtype::nlanes/2;
+    i += VTraits<vtype>::vlanes()/2;
 }
 return i;
 }
```

**modules/imgproc/src/shapedescr.cpp**

```diff
@@ -879,14 +879,14 @@ static Rect pointSetBoundingRect( const Mat& points )
 if( npoints == 0 )
     return Rect();
-#if CV_SIMD
+#if CV_SIMD // TODO: enable for CV_SIMD_SCALABLE, loop tail related.
 const int64_t* pts = points.ptr<int64_t>();
 if( !is_float )
 {
     v_int32 minval, maxval;
     minval = maxval = v_reinterpret_as_s32(vx_setall_s64(*pts)); //min[0]=pt.x, min[1]=pt.y, min[2]=pt.x, min[3]=pt.y
-    for( i = 1; i <= npoints - v_int32::nlanes/2; i+= v_int32::nlanes/2 )
+    for( i = 1; i <= npoints - VTraits<v_int32>::vlanes()/2; i+= VTraits<v_int32>::vlanes()/2 )
     {
         v_int32 ptXY2 = v_reinterpret_as_s32(vx_load(pts + i));
         minval = v_min(ptXY2, minval);
@@ -894,22 +894,22 @@ static Rect pointSetBoundingRect( const Mat& points )
     }
     minval = v_min(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval))));
     maxval = v_max(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval))));
-    if( i <= npoints - v_int32::nlanes/4 )
+    if( i <= npoints - VTraits<v_int32>::vlanes()/4 )
     {
         v_int32 ptXY = v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(vx_load_low(pts + i))));
         minval = v_min(ptXY, minval);
         maxval = v_max(ptXY, maxval);
-        i += v_int64::nlanes/2;
+        i += VTraits<v_int64>::vlanes()/2;
     }
-    for(int j = 16; j < CV_SIMD_WIDTH; j*=2)
+    for(int j = 16; j < VTraits<v_uint8>::vlanes(); j*=2)
     {
         minval = v_min(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval))));
         maxval = v_max(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval))));
     }
-    xmin = minval.get0();
-    xmax = maxval.get0();
-    ymin = v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval))).get0();
-    ymax = v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval))).get0();
+    xmin = v_get0(minval);
+    xmax = v_get0(maxval);
+    ymin = v_get0(v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval))));
+    ymax = v_get0(v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval))));
 #if CV_SIMD_WIDTH > 16
     if( i < npoints )
     {
@@ -921,18 +921,18 @@ static Rect pointSetBoundingRect( const Mat& points )
             minval2 = v_min(ptXY, minval2);
             maxval2 = v_max(ptXY, maxval2);
         }
-        xmin = min(xmin, minval2.get0());
-        xmax = max(xmax, maxval2.get0());
-        ymin = min(ymin, v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval2))).get0());
-        ymax = max(ymax, v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval2))).get0());
+        xmin = min(xmin, v_get0(minval2));
+        xmax = max(xmax, v_get0(maxval2));
+        ymin = min(ymin, v_get0(v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval2)))));
+        ymax = max(ymax, v_get0(v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval2)))));
     }
-#endif
+#endif // CV_SIMD
 }
 else
 {
     v_float32 minval, maxval;
     minval = maxval = v_reinterpret_as_f32(vx_setall_s64(*pts)); //min[0]=pt.x, min[1]=pt.y, min[2]=pt.x, min[3]=pt.y
-    for( i = 1; i <= npoints - v_float32::nlanes/2; i+= v_float32::nlanes/2 )
+    for( i = 1; i <= npoints - VTraits<v_float32>::vlanes()/2; i+= VTraits<v_float32>::vlanes()/2 )
     {
         v_float32 ptXY2 = v_reinterpret_as_f32(vx_load(pts + i));
         minval = v_min(ptXY2, minval);
@@ -940,22 +940,22 @@ static Rect pointSetBoundingRect( const Mat& points )
     }
     minval = v_min(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval))));
     maxval = v_max(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval))));
-    if( i <= npoints - v_float32::nlanes/4 )
+    if( i <= npoints - VTraits<v_float32>::vlanes()/4 )
     {
         v_float32 ptXY = v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(vx_load_low(pts + i))));
         minval = v_min(ptXY, minval);
         maxval = v_max(ptXY, maxval);
-        i += v_float32::nlanes/4;
+        i += VTraits<v_float32>::vlanes()/4;
     }
-    for(int j = 16; j < CV_SIMD_WIDTH; j*=2)
+    for(int j = 16; j < VTraits<v_uint8>::vlanes(); j*=2)
     {
         minval = v_min(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval))));
         maxval = v_max(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval))));
     }
-    xmin = cvFloor(minval.get0());
-    xmax = cvFloor(maxval.get0());
-    ymin = cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval))).get0());
-    ymax = cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval))).get0());
+    xmin = cvFloor(v_get0(minval));
+    xmax = cvFloor(v_get0(maxval));
+    ymin = cvFloor(v_get0(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval)))));
+    ymax = cvFloor(v_get0(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval)))));
 #if CV_SIMD_WIDTH > 16
     if( i < npoints )
     {
@@ -967,10 +967,10 @@ static Rect pointSetBoundingRect( const Mat& points )
             minval2 = v_min(ptXY, minval2);
             maxval2 = v_max(ptXY, maxval2);
         }
-        xmin = min(xmin, cvFloor(minval2.get0()));
-        xmax = max(xmax, cvFloor(maxval2.get0()));
-        ymin = min(ymin, cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval2))).get0()));
-        ymax = max(ymax, cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval2))).get0()));
+        xmin = min(xmin, cvFloor(v_get0(minval2)));
+        xmax = max(xmax, cvFloor(v_get0(maxval2)));
+        ymin = min(ymin, cvFloor(v_get0(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval2))))));
+        ymax = max(ymax, cvFloor(v_get0(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval2))))));
     }
 #endif
 }
```
