From 320c0bf419043e0179d47742f9f0ebd7885b5457 Mon Sep 17 00:00:00 2001
From: HAN Liutong
Date: Tue, 19 Sep 2023 20:12:52 +0800
Subject: [PATCH] Merge pull request #24166 from hanliutong:rewrite-remaining

Rewrite Universal Intrinsic code: ImgProc (CV_SIMD_WIDTH related Part) #24166

Related PRs: #24058, #24132.

The goal of this series of PRs is to rewrite the SIMD code blocks in the
opencv/modules/imgproc folder using the new Universal Intrinsic API.

This PR mainly modifies the code that uses the `CV_SIMD_WIDTH` macro. This
macro is sometimes used for loop-tail processing, as in `box_filter.simd.hpp`
and `morph.simd.hpp`:

```cpp
#if CV_SIMD
int i = 0;
for (; i <= n - v_uint16::nlanes; i += v_uint16::nlanes)
{
    // some universal intrinsic code
    // e.g. v_uint16
}
#if CV_SIMD_WIDTH > 16
for (; i <= n - v_uint16x8::nlanes; i += v_uint16x8::nlanes)
{
    // handle the loop tail with 128-bit SIMD
    // e.g. v_uint16x8
}
#endif // CV_SIMD_WIDTH
#endif // CV_SIMD
```

The core conflict is that the variable-length Universal Intrinsic backend
cannot use 128-bit fixed-length data structures such as `v_uint16x8`.
Therefore, this PR uses a scalar loop to handle the loop tail (a minimal
sketch of the rewritten pattern appears after the patch below).

This PR is marked as a draft because the modification of
`box_filter.simd.hpp` causes a compilation failure, which initially appears
to be an internal compiler error in GCC:

```bash
box_filter.simd.hpp:1162:5: internal compiler error: Segmentation fault
 1162 | }
      | ^
0xe03883 crash_signal
        /wafer/share/gcc/gcc/toplev.cc:314
0x7ff261c4251f ???
        ./signal/../sysdeps/unix/sysv/linux/x86_64/libc_sigaction.c:0
0x6bde48 hash_set >::iterator::operator*()
        /wafer/share/gcc/gcc/hash-set.h:125
0x6bde48 extract_single_source
        /wafer/share/gcc/gcc/config/riscv/riscv-vsetvl.cc:1184
0x6bde48 extract_single_source
        /wafer/share/gcc/gcc/config/riscv/riscv-vsetvl.cc:1174
0x119ad9e pass_vsetvl::propagate_avl() const
        /wafer/share/gcc/gcc/config/riscv/riscv-vsetvl.cc:4087
0x119ceaf pass_vsetvl::execute(function*)
        /wafer/share/gcc/gcc/config/riscv/riscv-vsetvl.cc:4344
0x119ceaf pass_vsetvl::execute(function*)
        /wafer/share/gcc/gcc/config/riscv/riscv-vsetvl.cc:4325
Please submit a full bug report, with preprocessed source (by using -freport-bug).
Please include the complete backtrace with any bug report.
```

This PR compiles with Clang 16, and `opencv_test_imgproc` passes on QEMU.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
---
 modules/imgproc/src/box_filter.simd.hpp | 316 ++++++++++++------------
 modules/imgproc/src/morph.simd.hpp      | 170 ++++++-------
 modules/imgproc/src/shapedescr.cpp      |  52 ++--
 3 files changed, 269 insertions(+), 269 deletions(-)

diff --git a/modules/imgproc/src/box_filter.simd.hpp b/modules/imgproc/src/box_filter.simd.hpp
index 4a4d205216..735935c04f 100644
--- a/modules/imgproc/src/box_filter.simd.hpp
+++ b/modules/imgproc/src/box_filter.simd.hpp
@@ -309,15 +309,15 @@ struct ColumnSum :
         {
             const int* Sp = (const int*)src[0];
             int i = 0;
-#if CV_SIMD
-            for (; i <= width - v_int32::nlanes; i += v_int32::nlanes)
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            for (; i <= width - VTraits<v_int32>::vlanes(); i += VTraits<v_int32>::vlanes())
             {
-                v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+                v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes)
             {
-                v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+                v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
             }
 #endif
 #endif
@@ -339,37 +339,37 @@ struct ColumnSum :
         if( haveScale )
         {
             int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             v_float32 _v_scale = vx_setall_f32((float)_scale);
-            for( ; i <= width - v_uint16::nlanes; i += v_uint16::nlanes )
+            for( ; i <= width - VTraits<v_uint16>::vlanes(); i += VTraits<v_uint16>::vlanes() )
             {
-                v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+                v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+                v_int32 v_s01 = v_add(vx_load(SUM + i + VTraits<v_int32>::vlanes()), vx_load(Sp + i + VTraits<v_int32>::vlanes()));
 
-                v_uint32 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * _v_scale));
-                v_uint32 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * _v_scale));
+                v_uint32 v_s0d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s0), _v_scale)));
+                v_uint32 v_s01d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s01), _v_scale)));
 
                 v_uint16 v_dst = v_pack(v_s0d, v_s01d);
                 v_pack_store(D + i, v_dst);
 
-                v_store(SUM + i, v_s0 - vx_load(Sm + i));
-                v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+                v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
+                v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             v_float32x4 v_scale = v_setall_f32((float)_scale);
             for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes )
             {
-                v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+                v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+                v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));
 
-                v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * v_scale));
-                v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * v_scale));
+                v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s0), v_scale)));
+                v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s01), v_scale)));
 
                 v_uint16x8 v_dst = v_pack(v_s0d, v_s01d);
                 v_pack_store(D + i, v_dst);
 
-                v_store(SUM + i, v_s0 - v_load(Sm + i));
-                v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+                v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
+                v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
             }
 #endif
 #endif
@@ -383,29 +383,29 @@ struct ColumnSum :
         else
         {
             int i = 0;
-#if CV_SIMD
-            for( ; i <= width-v_uint16::nlanes; i+=v_uint16::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            for( ; i <= width-VTraits<v_uint16>::vlanes(); i+=VTraits<v_uint16>::vlanes() )
             {
-                v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+                v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+                v_int32 v_s01 = v_add(vx_load(SUM + i + VTraits<v_int32>::vlanes()), vx_load(Sp + i + VTraits<v_int32>::vlanes()));
 
                 v_uint16 v_dst = v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01));
                 v_pack_store(D + i, v_dst);
 
-                v_store(SUM + i, v_s0 - vx_load(Sm + i));
-                v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+                v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
+                v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes )
             {
-                v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+                v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+                v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));
 
                 v_uint16x8 v_dst = v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01));
                 v_pack_store(D + i, v_dst);
 
-                v_store(SUM + i, v_s0 - v_load(Sm + i));
-                v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+                v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
+                v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
             }
 #endif
 #endif
@@ -480,15 +480,15 @@ public BaseColumnFilter
         {
             const ushort* Sp = (const ushort*)src[0];
             int i = 0;
-#if CV_SIMD
-            for( ; i <= width - v_uint16::nlanes; i += v_uint16::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            for( ; i <= width - VTraits<v_uint16>::vlanes(); i += VTraits<v_uint16>::vlanes() )
             {
-                v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+                v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             for( ; i <= width - v_uint16x8::nlanes; i += v_uint16x8::nlanes )
             {
-                v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+                v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
             }
 #endif
 #endif
@@ -510,27 +510,27 @@ public BaseColumnFilter
         if( haveScale )
         {
             int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             v_uint32 _ds4 = vx_setall_u32((unsigned)ds);
             v_uint16 _dd8 = vx_setall_u16((ushort)dd);
 
-            for( ; i <= width-v_uint8::nlanes; i+=v_uint8::nlanes )
+            for( ; i <= width-VTraits<v_uint8>::vlanes(); i+=VTraits<v_uint8>::vlanes() )
             {
                 v_uint16 _sm0 = vx_load(Sm + i);
-                v_uint16 _sm1 = vx_load(Sm + i + v_uint16::nlanes);
+                v_uint16 _sm1 = vx_load(Sm + i + VTraits<v_uint16>::vlanes());
 
                 v_uint16 _s0 = v_add_wrap(vx_load(SUM + i), vx_load(Sp + i));
-                v_uint16 _s1 = v_add_wrap(vx_load(SUM + i + v_uint16::nlanes), vx_load(Sp + i + v_uint16::nlanes));
+                v_uint16 _s1 = v_add_wrap(vx_load(SUM + i + VTraits<v_uint16>::vlanes()), vx_load(Sp + i + VTraits<v_uint16>::vlanes()));
 
                 v_uint32 _s00, _s01, _s10, _s11;
 
-                v_expand(_s0 + _dd8, _s00, _s01);
-                v_expand(_s1 + _dd8, _s10, _s11);
+                v_expand(v_add(_s0, _dd8), _s00, _s01);
+                v_expand(v_add(_s1, _dd8), _s10, _s11);
 
-                _s00 = v_shr(_s00*_ds4);
-                _s01 = v_shr(_s01*_ds4);
-                _s10 = v_shr(_s10*_ds4);
-                _s11 = v_shr(_s11*_ds4);
+                _s00 = v_shr(v_mul(_s00, _ds4));
+                _s01 = v_shr(v_mul(_s01, _ds4));
+                _s10 = v_shr(v_mul(_s10, _ds4));
+                _s11 = v_shr(v_mul(_s11, _ds4));
 
                 v_int16 r0 = v_pack(v_reinterpret_as_s32(_s00), v_reinterpret_as_s32(_s01));
                 v_int16 r1 = v_pack(v_reinterpret_as_s32(_s10), v_reinterpret_as_s32(_s11));
@@ -540,9 +540,9 @@ public BaseColumnFilter
                 v_store(D + i, v_pack_u(r0, r1));
 
                 v_store(SUM + i, _s0);
-                v_store(SUM + i + v_uint16::nlanes, _s1);
+                v_store(SUM + i + VTraits<v_uint16>::vlanes(), _s1);
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
 
             v_uint32x4 ds4 = v_setall_u32((unsigned)ds);
             v_uint16x8 dd8 = v_setall_u16((ushort)dd);
@@ -556,13 +556,13 @@ public BaseColumnFilter
 
                 v_uint32x4 _s00, _s01, _s10, _s11;
 
-                v_expand(_s0 + dd8, _s00, _s01);
-                v_expand(_s1 + dd8, _s10, _s11);
+                v_expand(v_add(_s0, dd8), _s00, _s01);
+                v_expand(v_add(_s1, dd8), _s10, _s11);
 
-                _s00 = v_shr(_s00*ds4);
-                _s01 = v_shr(_s01*ds4);
-                _s10 = v_shr(_s10*ds4);
-                _s11 = v_shr(_s11*ds4);
+                _s00 = v_shr(v_mul(_s00, ds4));
+                _s01 = v_shr(v_mul(_s01, ds4));
+                _s10 = v_shr(v_mul(_s10, ds4));
+                _s11 = v_shr(v_mul(_s11, ds4));
 
                 v_int16x8 r0 = v_pack(v_reinterpret_as_s32(_s00), v_reinterpret_as_s32(_s01));
                 v_int16x8 r1 = v_pack(v_reinterpret_as_s32(_s10), v_reinterpret_as_s32(_s11));
@@ -643,15 +643,15 @@ struct ColumnSum :
         {
             const int* Sp = (const int*)src[0];
             i = 0;
-#if CV_SIMD
-            for( ; i <= width - v_int32::nlanes; i+=v_int32::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            for( ; i <= width - VTraits<v_int32>::vlanes(); i+=VTraits<v_int32>::vlanes() )
             {
-                v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+                v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
            for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes )
            {
-                v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+                v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
            }
 #endif
 #endif
@@ -673,33 +673,33 @@ struct ColumnSum :
         if( haveScale )
         {
             i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             v_float32 _v_scale = vx_setall_f32((float)_scale);
-            for( ; i <= width-v_int16::nlanes; i+=v_int16::nlanes )
+            for( ; i <= width-VTraits<v_int16>::vlanes(); i+=VTraits<v_int16>::vlanes() )
             {
-                v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+                v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+                v_int32 v_s01 = v_add(vx_load(SUM + i + VTraits<v_int32>::vlanes()), vx_load(Sp + i + VTraits<v_int32>::vlanes()));
 
-                v_int32 v_s0d = v_round(v_cvt_f32(v_s0) * _v_scale);
-                v_int32 v_s01d = v_round(v_cvt_f32(v_s01) * _v_scale);
+                v_int32 v_s0d = v_round(v_mul(v_cvt_f32(v_s0), _v_scale));
+                v_int32 v_s01d = v_round(v_mul(v_cvt_f32(v_s01), _v_scale));
 
                 v_store(D + i, v_pack(v_s0d, v_s01d));
 
-                v_store(SUM + i, v_s0 - vx_load(Sm + i));
-                v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+                v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
+                v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             v_float32x4 v_scale = v_setall_f32((float)_scale);
             for( ; i <= width-v_int16x8::nlanes; i+=v_int16x8::nlanes )
             {
-                v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+                v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+                v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));
 
-                v_int32x4 v_s0d = v_round(v_cvt_f32(v_s0) * v_scale);
-                v_int32x4 v_s01d = v_round(v_cvt_f32(v_s01) * v_scale);
+                v_int32x4 v_s0d = v_round(v_mul(v_cvt_f32(v_s0), v_scale));
+                v_int32x4 v_s01d = v_round(v_mul(v_cvt_f32(v_s01), v_scale));
 
                 v_store(D + i, v_pack(v_s0d, v_s01d));
 
-                v_store(SUM + i, v_s0 - v_load(Sm + i));
-                v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+                v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
+                v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
             }
 #endif
 #endif
@@ -713,27 +713,27 @@ struct ColumnSum :
         else
         {
             i = 0;
-#if CV_SIMD
-            for( ; i <= width-v_int16::nlanes; i+=v_int16::nlanes )
+#if CV_SIMD // TODO: enable for CV_SIMD_SCALABLE, GCC 13 related
+            for( ; i <= width-VTraits<v_int16>::vlanes(); i+=VTraits<v_int16>::vlanes() )
             {
-                v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+                v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+                v_int32 v_s01 = v_add(vx_load(SUM + i + VTraits<v_int32>::vlanes()), vx_load(Sp + i + VTraits<v_int32>::vlanes()));
 
                 v_store(D + i, v_pack(v_s0, v_s01));
 
-                v_store(SUM + i, v_s0 - vx_load(Sm + i));
-                v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+                v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
+                v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             for( ; i <= width-v_int16x8::nlanes; i+=v_int16x8::nlanes )
             {
-                v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+                v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+                v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));
 
                 v_store(D + i, v_pack(v_s0, v_s01));
 
-                v_store(SUM + i, v_s0 - v_load(Sm + i));
-                v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+                v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
+                v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
             }
 #endif
 #endif
@@ -792,15 +792,15 @@ struct ColumnSum :
         {
             const int* Sp = (const int*)src[0];
             int i = 0;
-#if CV_SIMD
-            for (; i <= width - v_int32::nlanes; i += v_int32::nlanes)
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            for (; i <= width - VTraits<v_int32>::vlanes(); i += VTraits<v_int32>::vlanes())
            {
-                v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+                v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
            }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes)
            {
-                v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+                v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
            }
 #endif
 #endif
@@ -822,33 +822,33 @@ struct ColumnSum :
         if( haveScale )
         {
             int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             v_float32 _v_scale = vx_setall_f32((float)_scale);
-            for( ; i <= width-v_uint16::nlanes; i+=v_uint16::nlanes )
+            for( ; i <= width-VTraits<v_uint16>::vlanes(); i+=VTraits<v_uint16>::vlanes() )
             {
-                v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+                v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+                v_int32 v_s01 = v_add(vx_load(SUM + i + VTraits<v_int32>::vlanes()), vx_load(Sp + i + VTraits<v_int32>::vlanes()));
 
-                v_uint32 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * _v_scale));
-                v_uint32 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * _v_scale));
+                v_uint32 v_s0d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s0), _v_scale)));
+                v_uint32 v_s01d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s01), _v_scale)));
 
                 v_store(D + i, v_pack(v_s0d, v_s01d));
-                v_store(SUM + i, v_s0 - vx_load(Sm + i));
-                v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+                v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
+                v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             v_float32x4 v_scale = v_setall_f32((float)_scale);
             for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes )
             {
-                v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+                v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+                v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));
 
-                v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * v_scale));
-                v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * v_scale));
+                v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s0), v_scale)));
+                v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s01), v_scale)));
 
                 v_store(D + i, v_pack(v_s0d, v_s01d));
-                v_store(SUM + i, v_s0 - v_load(Sm + i));
-                v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+                v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
+                v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
             }
 #endif
 #endif
@@ -862,27 +862,27 @@ struct ColumnSum :
         else
         {
             int i = 0;
-#if CV_SIMD
-            for( ; i <= width-v_uint16::nlanes; i+=v_uint16::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            for( ; i <= width-VTraits<v_uint16>::vlanes(); i+=VTraits<v_uint16>::vlanes() )
             {
-                v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+                v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+                v_int32 v_s01 = v_add(vx_load(SUM + i + VTraits<v_int32>::vlanes()), vx_load(Sp + i + VTraits<v_int32>::vlanes()));
 
                 v_store(D + i, v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01)));
 
-                v_store(SUM + i, v_s0 - vx_load(Sm + i));
-                v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+                v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
+                v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes )
             {
-                v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+                v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+                v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));
 
                 v_store(D + i, v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01)));
 
-                v_store(SUM + i, v_s0 - v_load(Sm + i));
-                v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+                v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
+                v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
             }
 #endif
 #endif
@@ -939,15 +939,15 @@ struct ColumnSum :
         {
             const int* Sp = (const int*)src[0];
             int i = 0;
-#if CV_SIMD
-            for( ; i <= width - v_int32::nlanes; i+=v_int32::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            for( ; i <= width - VTraits<v_int32>::vlanes(); i+=VTraits<v_int32>::vlanes() )
             {
-                v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+                v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes )
             {
-                v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+                v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
             }
 #endif
 #endif
@@ -969,25 +969,25 @@ struct ColumnSum :
         if( haveScale )
         {
             int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             v_float32 _v_scale = vx_setall_f32((float)_scale);
-            for( ; i <= width-v_int32::nlanes; i+=v_int32::nlanes )
+            for( ; i <= width-VTraits<v_int32>::vlanes(); i+=VTraits<v_int32>::vlanes() )
             {
-                v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                v_int32 v_s0d = v_round(v_cvt_f32(v_s0) * _v_scale);
+                v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+                v_int32 v_s0d = v_round(v_mul(v_cvt_f32(v_s0), _v_scale));
 
                 v_store(D + i, v_s0d);
-                v_store(SUM + i, v_s0 - vx_load(Sm + i));
+                v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             v_float32x4 v_scale = v_setall_f32((float)_scale);
             for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes )
             {
-                v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                v_int32x4 v_s0d = v_round(v_cvt_f32(v_s0) * v_scale);
+                v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+                v_int32x4 v_s0d = v_round(v_mul(v_cvt_f32(v_s0), v_scale));
 
                 v_store(D + i, v_s0d);
-                v_store(SUM + i, v_s0 - v_load(Sm + i));
+                v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
             }
 #endif
 #endif
@@ -1001,21 +1001,21 @@ struct ColumnSum :
         else
         {
             int i = 0;
-#if CV_SIMD
-            for( ; i <= width-v_int32::nlanes; i+=v_int32::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            for( ; i <= width-VTraits<v_int32>::vlanes(); i+=VTraits<v_int32>::vlanes() )
             {
-                v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
+                v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
 
                 v_store(D + i, v_s0);
-                v_store(SUM + i, v_s0 - vx_load(Sm + i));
+                v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes )
             {
-                v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
+                v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
 
                 v_store(D + i, v_s0);
-                v_store(SUM + i, v_s0 - v_load(Sm + i));
+                v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
             }
 #endif
 #endif
@@ -1073,15 +1073,15 @@ struct ColumnSum :
         {
             const int* Sp = (const int*)src[0];
             int i = 0;
-#if CV_SIMD
-            for( ; i <= width - v_int32::nlanes; i+=v_int32::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            for( ; i <= width - VTraits<v_int32>::vlanes(); i+=VTraits<v_int32>::vlanes() )
            {
-                v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+                v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
            }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes )
            {
-                v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+                v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
            }
 #endif
 #endif
@@ -1105,21 +1105,21 @@ struct ColumnSum :
         {
             int i = 0;
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             v_float32 _v_scale = vx_setall_f32((float)_scale);
-            for (; i <= width - v_int32::nlanes; i += v_int32::nlanes)
+            for (; i <= width - VTraits<v_int32>::vlanes(); i += VTraits<v_int32>::vlanes())
             {
-                v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                v_store(D + i, v_cvt_f32(v_s0) * _v_scale);
-                v_store(SUM + i, v_s0 - vx_load(Sm + i));
+                v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+                v_store(D + i, v_mul(v_cvt_f32(v_s0), _v_scale));
+                v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             v_float32x4 v_scale = v_setall_f32((float)_scale);
             for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes)
             {
-                v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                v_store(D + i, v_cvt_f32(v_s0) * v_scale);
-                v_store(SUM + i, v_s0 - v_load(Sm + i));
+                v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+                v_store(D + i, v_mul(v_cvt_f32(v_s0), v_scale));
+                v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
             }
 #endif
 #endif
@@ -1134,19 +1134,19 @@ struct ColumnSum :
         {
             int i = 0;
 
-#if CV_SIMD
-            for( ; i <= width-v_int32::nlanes; i+=v_int32::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            for( ; i <= width-VTraits<v_int32>::vlanes(); i+=VTraits<v_int32>::vlanes() )
             {
-                v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
+                v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
                 v_store(D + i, v_cvt_f32(v_s0));
-                v_store(SUM + i, v_s0 - vx_load(Sm + i));
+                v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes )
             {
-                v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
+                v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
                 v_store(D + i, v_cvt_f32(v_s0));
-                v_store(SUM + i, v_s0 - v_load(Sm + i));
+                v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
             }
 #endif
 #endif
diff --git a/modules/imgproc/src/morph.simd.hpp b/modules/imgproc/src/morph.simd.hpp
index 9b3023f8f0..ef813dccec 100644
--- a/modules/imgproc/src/morph.simd.hpp
+++ b/modules/imgproc/src/morph.simd.hpp
@@ -106,12 +106,12 @@ struct MorphNoVec
     int operator()(uchar**, int, uchar*, int) const { return 0; }
 };
 
-#if CV_SIMD
+#if CV_SIMD // TODO: enable for CV_SIMD_SCALABLE, GCC 13 related
 
 template<class VecUpdate> struct MorphRowVec
 {
     typedef typename VecUpdate::vtype vtype;
-    typedef typename vtype::lane_type stype;
+    typedef typename VTraits<vtype>::lane_type stype;
     MorphRowVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {}
     int operator()(const uchar* src, uchar* dst, int width, int cn) const
     {
@@ -121,52 +121,52 @@ template<class VecUpdate> struct MorphRowVec
         width *= cn;
         VecUpdate updateOp;
 
-        for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes )
+        for( i = 0; i <= width - 4*VTraits<vtype>::vlanes(); i += 4*VTraits<vtype>::vlanes() )
         {
             vtype s0 = vx_load((const stype*)src + i);
-            vtype s1 = vx_load((const stype*)src + i + vtype::nlanes);
-            vtype s2 = vx_load((const stype*)src + i + 2*vtype::nlanes);
-            vtype s3 = vx_load((const stype*)src + i + 3*vtype::nlanes);
+            vtype s1 = vx_load((const stype*)src + i + VTraits<vtype>::vlanes());
+            vtype s2 = vx_load((const stype*)src + i + 2*VTraits<vtype>::vlanes());
+            vtype s3 = vx_load((const stype*)src + i + 3*VTraits<vtype>::vlanes());
             for (k = cn; k < _ksize; k += cn)
             {
                 s0 = updateOp(s0, vx_load((const stype*)src + i + k));
-                s1 = updateOp(s1, vx_load((const stype*)src + i + k + vtype::nlanes));
-                s2 = updateOp(s2, vx_load((const stype*)src + i + k + 2*vtype::nlanes));
-                s3 = updateOp(s3, vx_load((const stype*)src + i + k + 3*vtype::nlanes));
+                s1 = updateOp(s1, vx_load((const stype*)src + i + k + VTraits<vtype>::vlanes()));
+                s2 = updateOp(s2, vx_load((const stype*)src + i + k + 2*VTraits<vtype>::vlanes()));
+                s3 = updateOp(s3, vx_load((const stype*)src + i + k + 3*VTraits<vtype>::vlanes()));
             }
             v_store((stype*)dst + i, s0);
-            v_store((stype*)dst + i + vtype::nlanes, s1);
-            v_store((stype*)dst + i + 2*vtype::nlanes, s2);
-            v_store((stype*)dst + i + 3*vtype::nlanes, s3);
+            v_store((stype*)dst + i + VTraits<vtype>::vlanes(), s1);
+            v_store((stype*)dst + i + 2*VTraits<vtype>::vlanes(), s2);
+            v_store((stype*)dst + i + 3*VTraits<vtype>::vlanes(), s3);
         }
-        if( i <= width - 2*vtype::nlanes )
+        if( i <= width - 2*VTraits<vtype>::vlanes() )
         {
             vtype s0 = vx_load((const stype*)src + i);
-            vtype s1 = vx_load((const stype*)src + i + vtype::nlanes);
+            vtype s1 = vx_load((const stype*)src + i + VTraits<vtype>::vlanes());
             for( k = cn; k < _ksize; k += cn )
             {
                 s0 = updateOp(s0, vx_load((const stype*)src + i + k));
-                s1 = updateOp(s1, vx_load((const stype*)src + i + k + vtype::nlanes));
+                s1 = updateOp(s1, vx_load((const stype*)src + i + k + VTraits<vtype>::vlanes()));
             }
             v_store((stype*)dst + i, s0);
-            v_store((stype*)dst + i + vtype::nlanes, s1);
-            i += 2*vtype::nlanes;
+            v_store((stype*)dst + i + VTraits<vtype>::vlanes(), s1);
+            i += 2*VTraits<vtype>::vlanes();
         }
-        if( i <= width - vtype::nlanes )
+        if( i <= width - VTraits<vtype>::vlanes() )
         {
             vtype s = vx_load((const stype*)src + i);
             for( k = cn; k < _ksize; k += cn )
                 s = updateOp(s, vx_load((const stype*)src + i + k));
             v_store((stype*)dst + i, s);
-            i += vtype::nlanes;
+            i += VTraits<vtype>::vlanes();
         }
-        if( i <= width - vtype::nlanes/2 )
+        if( i <= width - VTraits<vtype>::vlanes()/2 )
         {
             vtype s = vx_load_low((const stype*)src + i);
             for( k = cn; k < _ksize; k += cn )
                 s = updateOp(s, vx_load_low((const stype*)src + i + k));
             v_store_low((stype*)dst + i, s);
-            i += vtype::nlanes/2;
+            i += VTraits<vtype>::vlanes()/2;
         }
 
         return i - i % cn;
@@ -179,7 +179,7 @@ template<class VecUpdate> struct MorphRowVec
 template<class VecUpdate> struct MorphColumnVec
 {
     typedef typename VecUpdate::vtype vtype;
-    typedef typename vtype::lane_type stype;
+    typedef typename VTraits<vtype>::lane_type stype;
     MorphColumnVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {}
     int operator()(const uchar** _src, uchar* _dst, int dststep, int count, int width) const
     {
@@ -189,7 +189,7 @@ template<class VecUpdate> struct MorphColumnVec
         VecUpdate updateOp;
 
         for( i = 0; i < count + ksize - 1; i++ )
-            CV_Assert( ((size_t)_src[i] & (CV_SIMD_WIDTH-1)) == 0 );
+            CV_Assert( ((size_t)_src[i] & (VTraits<vtype>::vlanes()-1)) == 0 );
 
         const stype** src = (const stype**)_src;
         stype* dst = (stype*)_dst;
@@ -197,58 +197,58 @@ template<class VecUpdate> struct MorphColumnVec
 
         for( ; _ksize > 1 && count > 1; count -= 2, dst += dststep*2, src += 2 )
         {
-            for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes)
+            for( i = 0; i <= width - 4*VTraits<vtype>::vlanes(); i += 4*VTraits<vtype>::vlanes())
             {
                 const stype* sptr = src[1] + i;
                 vtype s0 = vx_load_aligned(sptr);
-                vtype s1 = vx_load_aligned(sptr + vtype::nlanes);
-                vtype s2 = vx_load_aligned(sptr + 2*vtype::nlanes);
-                vtype s3 = vx_load_aligned(sptr + 3*vtype::nlanes);
+                vtype s1 = vx_load_aligned(sptr + VTraits<vtype>::vlanes());
+                vtype s2 = vx_load_aligned(sptr + 2*VTraits<vtype>::vlanes());
+                vtype s3 = vx_load_aligned(sptr + 3*VTraits<vtype>::vlanes());
 
                 for( k = 2; k < _ksize; k++ )
                 {
                     sptr = src[k] + i;
                     s0 = updateOp(s0, vx_load_aligned(sptr));
-                    s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes));
-                    s2 = updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes));
-                    s3 = updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes));
+                    s1 = updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes()));
+                    s2 = updateOp(s2, vx_load_aligned(sptr + 2*VTraits<vtype>::vlanes()));
+                    s3 = updateOp(s3, vx_load_aligned(sptr + 3*VTraits<vtype>::vlanes()));
                 }
 
                 sptr = src[0] + i;
                 v_store(dst + i, updateOp(s0, vx_load_aligned(sptr)));
-                v_store(dst + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)));
-                v_store(dst + i + 2*vtype::nlanes, updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes)));
-                v_store(dst + i + 3*vtype::nlanes, updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes)));
+                v_store(dst + i + VTraits<vtype>::vlanes(), updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes())));
+                v_store(dst + i + 2*VTraits<vtype>::vlanes(), updateOp(s2, vx_load_aligned(sptr + 2*VTraits<vtype>::vlanes())));
+                v_store(dst + i + 3*VTraits<vtype>::vlanes(), updateOp(s3, vx_load_aligned(sptr + 3*VTraits<vtype>::vlanes())));
 
                 sptr = src[k] + i;
                 v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(sptr)));
-                v_store(dst + dststep + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)));
-                v_store(dst + dststep + i + 2*vtype::nlanes, updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes)));
-                v_store(dst + dststep + i + 3*vtype::nlanes, updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes)));
+                v_store(dst + dststep + i + VTraits<vtype>::vlanes(), updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes())));
+                v_store(dst + dststep + i + 2*VTraits<vtype>::vlanes(), updateOp(s2, vx_load_aligned(sptr + 2*VTraits<vtype>::vlanes())));
+                v_store(dst + dststep + i + 3*VTraits<vtype>::vlanes(), updateOp(s3, vx_load_aligned(sptr + 3*VTraits<vtype>::vlanes())));
             }
-            if( i <= width - 2*vtype::nlanes )
+            if( i <= width - 2*VTraits<vtype>::vlanes() )
             {
                 const stype* sptr = src[1] + i;
                 vtype s0 = vx_load_aligned(sptr);
-                vtype s1 = vx_load_aligned(sptr + vtype::nlanes);
+                vtype s1 = vx_load_aligned(sptr + VTraits<vtype>::vlanes());
 
                 for( k = 2; k < _ksize; k++ )
                 {
                     sptr = src[k] + i;
                     s0 = updateOp(s0, vx_load_aligned(sptr));
-                    s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes));
+                    s1 = updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes()));
                 }
 
                 sptr = src[0] + i;
                 v_store(dst + i, updateOp(s0, vx_load_aligned(sptr)));
-                v_store(dst + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)));
+                v_store(dst + i + VTraits<vtype>::vlanes(), updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes())));
 
                 sptr = src[k] + i;
                 v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(sptr)));
-                v_store(dst + dststep + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)));
-                i += 2*vtype::nlanes;
+                v_store(dst + dststep + i + VTraits<vtype>::vlanes(), updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes())));
+                i += 2*VTraits<vtype>::vlanes();
             }
-            if( i <= width - vtype::nlanes )
+            if( i <= width - VTraits<vtype>::vlanes() )
             {
                 vtype s0 = vx_load_aligned(src[1] + i);
@@ -257,9 +257,9 @@ template<class VecUpdate> struct MorphColumnVec
 
                 v_store(dst + i, updateOp(s0, vx_load_aligned(src[0] + i)));
                 v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(src[k] + i)));
-                i += vtype::nlanes;
+                i += VTraits<vtype>::vlanes();
             }
-            if( i <= width - vtype::nlanes/2 )
+            if( i <= width - VTraits<vtype>::vlanes()/2 )
             {
                 vtype s0 = vx_load_low(src[1] + i);
@@ -268,66 +268,66 @@ template<class VecUpdate> struct MorphColumnVec
 
                 v_store_low(dst + i, updateOp(s0, vx_load_low(src[0] + i)));
                 v_store_low(dst + dststep + i, updateOp(s0, vx_load_low(src[k] + i)));
-                i += vtype::nlanes/2;
+                i += VTraits<vtype>::vlanes()/2;
             }
         }
 
         for( ; count > 0; count--, dst += dststep, src++ )
        {
-            for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes)
+            for( i = 0; i <= width - 4*VTraits<vtype>::vlanes(); i += 4*VTraits<vtype>::vlanes())
             {
                 const stype* sptr = src[0] + i;
                 vtype s0 = vx_load_aligned(sptr);
-                vtype s1 = vx_load_aligned(sptr + vtype::nlanes);
-                vtype s2 = vx_load_aligned(sptr + 2*vtype::nlanes);
-                vtype s3 = vx_load_aligned(sptr + 3*vtype::nlanes);
+                vtype s1 = vx_load_aligned(sptr + VTraits<vtype>::vlanes());
+                vtype s2 = vx_load_aligned(sptr + 2*VTraits<vtype>::vlanes());
+                vtype s3 = vx_load_aligned(sptr + 3*VTraits<vtype>::vlanes());
 
                 for( k = 1; k < _ksize; k++ )
                 {
                     sptr = src[k] + i;
                     s0 = updateOp(s0, vx_load_aligned(sptr));
-                    s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes));
-                    s2 = updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes));
-                    s3 = updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes));
+                    s1 = updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes()));
+                    s2 = updateOp(s2, vx_load_aligned(sptr + 2*VTraits<vtype>::vlanes()));
+                    s3 = updateOp(s3, vx_load_aligned(sptr + 3*VTraits<vtype>::vlanes()));
                 }
 
                 v_store(dst + i, s0);
-                v_store(dst + i + vtype::nlanes, s1);
-                v_store(dst + i + 2*vtype::nlanes, s2);
-                v_store(dst + i + 3*vtype::nlanes, s3);
+                v_store(dst + i + VTraits<vtype>::vlanes(), s1);
+                v_store(dst + i + 2*VTraits<vtype>::vlanes(), s2);
+                v_store(dst + i + 3*VTraits<vtype>::vlanes(), s3);
             }
-            if( i <= width - 2*vtype::nlanes )
+            if( i <= width - 2*VTraits<vtype>::vlanes() )
             {
                 const stype* sptr = src[0] + i;
                 vtype s0 = vx_load_aligned(sptr);
-                vtype s1 = vx_load_aligned(sptr + vtype::nlanes);
+                vtype s1 = vx_load_aligned(sptr + VTraits<vtype>::vlanes());
 
                 for( k = 1; k < _ksize; k++ )
                 {
                     sptr = src[k] + i;
                     s0 = updateOp(s0, vx_load_aligned(sptr));
-                    s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes));
+                    s1 = updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes()));
                 }
 
                 v_store(dst + i, s0);
-                v_store(dst + i + vtype::nlanes, s1);
-                i += 2*vtype::nlanes;
+                v_store(dst + i + VTraits<vtype>::vlanes(), s1);
+                i += 2*VTraits<vtype>::vlanes();
             }
-            if( i <= width - vtype::nlanes )
+            if( i <= width - VTraits<vtype>::vlanes() )
             {
                 vtype s0 = vx_load_aligned(src[0] + i);
                 for( k = 1; k < _ksize; k++ )
                     s0 = updateOp(s0, vx_load_aligned(src[k] + i));
                 v_store(dst + i, s0);
-                i += vtype::nlanes;
+                i += VTraits<vtype>::vlanes();
             }
-            if( i <= width - vtype::nlanes/2 )
+            if( i <= width - VTraits<vtype>::vlanes()/2 )
             {
                 vtype s0 = vx_load_low(src[0] + i);
                 for( k = 1; k < _ksize; k++ )
                     s0 = updateOp(s0, vx_load_low(src[k] + i));
                 v_store_low(dst + i, s0);
-                i += vtype::nlanes/2;
+                i += VTraits<vtype>::vlanes()/2;
             }
         }
 
@@ -341,7 +341,7 @@ template<class VecUpdate> struct MorphColumnVec
 template<class VecUpdate> struct MorphVec
 {
     typedef typename VecUpdate::vtype vtype;
-    typedef typename vtype::lane_type stype;
+    typedef typename VTraits<vtype>::lane_type stype;
     int operator()(uchar** _src, int nz, uchar* _dst, int width) const
     {
         CV_INSTRUMENT_REGION();
@@ -351,56 +351,56 @@ template<class VecUpdate> struct MorphVec
         int i, k;
         VecUpdate updateOp;
 
-        for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes )
+        for( i = 0; i <= width - 4*VTraits<vtype>::vlanes(); i += 4*VTraits<vtype>::vlanes() )
         {
             const stype* sptr = src[0] + i;
             vtype s0 = vx_load(sptr);
-            vtype s1 = vx_load(sptr + vtype::nlanes);
-            vtype s2 = vx_load(sptr + 2*vtype::nlanes);
-            vtype s3 = vx_load(sptr + 3*vtype::nlanes);
+            vtype s1 = vx_load(sptr + VTraits<vtype>::vlanes());
+            vtype s2 = vx_load(sptr + 2*VTraits<vtype>::vlanes());
+            vtype s3 = vx_load(sptr + 3*VTraits<vtype>::vlanes());
 
             for( k = 1; k < nz; k++ )
             {
                 sptr = src[k] + i;
                 s0 = updateOp(s0, vx_load(sptr));
-                s1 = updateOp(s1, vx_load(sptr + vtype::nlanes));
-                s2 = updateOp(s2, vx_load(sptr + 2*vtype::nlanes));
-                s3 = updateOp(s3, vx_load(sptr + 3*vtype::nlanes));
+                s1 = updateOp(s1, vx_load(sptr + VTraits<vtype>::vlanes()));
+                s2 = updateOp(s2, vx_load(sptr + 2*VTraits<vtype>::vlanes()));
+                s3 = updateOp(s3, vx_load(sptr + 3*VTraits<vtype>::vlanes()));
            }
             v_store(dst + i, s0);
-            v_store(dst + i + vtype::nlanes, s1);
-            v_store(dst + i + 2*vtype::nlanes, s2);
-            v_store(dst + i + 3*vtype::nlanes, s3);
+            v_store(dst + i + VTraits<vtype>::vlanes(), s1);
+            v_store(dst + i + 2*VTraits<vtype>::vlanes(), s2);
+            v_store(dst + i + 3*VTraits<vtype>::vlanes(), s3);
         }
-        if( i <= width - 2*vtype::nlanes )
+        if( i <= width - 2*VTraits<vtype>::vlanes() )
        {
            const stype* sptr = src[0] + i;
            vtype s0 = vx_load(sptr);
-            vtype s1 = vx_load(sptr + vtype::nlanes);
+            vtype s1 = vx_load(sptr + VTraits<vtype>::vlanes());
            for( k = 1; k < nz; k++ )
            {
                sptr = src[k] + i;
                s0 = updateOp(s0, vx_load(sptr));
-                s1 = updateOp(s1, vx_load(sptr + vtype::nlanes));
+                s1 = updateOp(s1, vx_load(sptr + VTraits<vtype>::vlanes()));
            }
            v_store(dst + i, s0);
-            v_store(dst + i + vtype::nlanes, s1);
-            i += 2*vtype::nlanes;
+            v_store(dst + i + VTraits<vtype>::vlanes(), s1);
+            i += 2*VTraits<vtype>::vlanes();
        }
-        if( i <= width - vtype::nlanes )
+        if( i <= width - VTraits<vtype>::vlanes() )
        {
            vtype s0 = vx_load(src[0] + i);
            for( k = 1; k < nz; k++ )
                s0 = updateOp(s0, vx_load(src[k] + i));
            v_store(dst + i, s0);
-            i += vtype::nlanes;
+            i += VTraits<vtype>::vlanes();
        }
-        if( i <= width - vtype::nlanes/2 )
+        if( i <= width - VTraits<vtype>::vlanes()/2 )
        {
            vtype s0 = vx_load_low(src[0] + i);
            for( k = 1; k < nz; k++ )
                s0 = updateOp(s0, vx_load_low(src[k] + i));
            v_store_low(dst + i, s0);
-            i += vtype::nlanes/2;
+            i += VTraits<vtype>::vlanes()/2;
        }
         return i;
     }
diff --git a/modules/imgproc/src/shapedescr.cpp b/modules/imgproc/src/shapedescr.cpp
index 4c73910e27..194e5a0862 100644
--- a/modules/imgproc/src/shapedescr.cpp
+++ b/modules/imgproc/src/shapedescr.cpp
@@ -879,14 +879,14 @@ static Rect pointSetBoundingRect( const Mat& points )
     if( npoints == 0 )
         return Rect();
 
-#if CV_SIMD
+#if CV_SIMD // TODO: enable for CV_SIMD_SCALABLE, loop tail related.
     const int64_t* pts = points.ptr<int64_t>();
 
     if( !is_float )
     {
        v_int32 minval, maxval;
        minval = maxval = v_reinterpret_as_s32(vx_setall_s64(*pts)); //min[0]=pt.x, min[1]=pt.y, min[2]=pt.x, min[3]=pt.y
-        for( i = 1; i <= npoints - v_int32::nlanes/2; i+= v_int32::nlanes/2 )
+        for( i = 1; i <= npoints - VTraits<v_int32>::vlanes()/2; i+= VTraits<v_int32>::vlanes()/2 )
        {
            v_int32 ptXY2 = v_reinterpret_as_s32(vx_load(pts + i));
            minval = v_min(ptXY2, minval);
@@ -894,22 +894,22 @@ static Rect pointSetBoundingRect( const Mat& points )
        }
        minval = v_min(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval))));
        maxval = v_max(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval))));
-        if( i <= npoints - v_int32::nlanes/4 )
+        if( i <= npoints - VTraits<v_int32>::vlanes()/4 )
        {
            v_int32 ptXY = v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(vx_load_low(pts + i))));
            minval = v_min(ptXY, minval);
            maxval = v_max(ptXY, maxval);
-            i += v_int64::nlanes/2;
+            i += VTraits<v_int64>::vlanes()/2;
        }
-        for(int j = 16; j < CV_SIMD_WIDTH; j*=2)
+        for(int j = 16; j < VTraits::vlanes(); j*=2)
        {
            minval = v_min(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval))));
            maxval = v_max(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval))));
        }
-        xmin = minval.get0();
-        xmax = maxval.get0();
-        ymin = v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval))).get0();
-        ymax = v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval))).get0();
+        xmin = v_get0(minval);
+        xmax = v_get0(maxval);
+        ymin = v_get0(v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval))));
+        ymax = v_get0(v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval))));
 #if CV_SIMD_WIDTH > 16
        if( i < npoints )
        {
@@ -921,18 +921,18 @@ static Rect pointSetBoundingRect( const Mat& points )
                minval2 = v_min(ptXY, minval2);
                maxval2 = v_max(ptXY, maxval2);
            }
-            xmin = min(xmin, minval2.get0());
-            xmax = max(xmax, maxval2.get0());
-            ymin = min(ymin, v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval2))).get0());
-            ymax = max(ymax, v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval2))).get0());
+            xmin = min(xmin, v_get0(minval2));
+            xmax = max(xmax, v_get0(maxval2));
+            ymin = min(ymin, v_get0(v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval2)))));
+            ymax = max(ymax, v_get0(v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval2)))));
        }
-#endif
+#endif // CV_SIMD
    }
    else
    {
        v_float32 minval, maxval;
        minval = maxval = v_reinterpret_as_f32(vx_setall_s64(*pts)); //min[0]=pt.x, min[1]=pt.y, min[2]=pt.x, min[3]=pt.y
-        for( i = 1; i <= npoints - v_float32::nlanes/2; i+= v_float32::nlanes/2 )
+        for( i = 1; i <= npoints - VTraits<v_float32>::vlanes()/2; i+= VTraits<v_float32>::vlanes()/2 )
        {
            v_float32 ptXY2 = v_reinterpret_as_f32(vx_load(pts + i));
            minval = v_min(ptXY2, minval);
@@ -940,22 +940,22 @@ static Rect pointSetBoundingRect( const Mat& points )
        }
        minval = v_min(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval))));
        maxval = v_max(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval))));
-        if( i <= npoints - v_float32::nlanes/4 )
+        if( i <= npoints - VTraits<v_float32>::vlanes()/4 )
        {
            v_float32 ptXY = v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(vx_load_low(pts + i))));
            minval = v_min(ptXY, minval);
            maxval = v_max(ptXY, maxval);
-            i += v_float32::nlanes/4;
+            i += VTraits<v_float32>::vlanes()/4;
        }
-        for(int j = 16; j < CV_SIMD_WIDTH; j*=2)
+        for(int j = 16; j < VTraits::vlanes(); j*=2)
        {
            minval = v_min(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval))));
            maxval = v_max(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval))));
        }
-        xmin = cvFloor(minval.get0());
-        xmax = cvFloor(maxval.get0());
-        ymin = cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval))).get0());
-        ymax = cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval))).get0());
+        xmin = cvFloor(v_get0(minval));
+        xmax = cvFloor(v_get0(maxval));
+        ymin = cvFloor(v_get0(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval)))));
+        ymax = cvFloor(v_get0(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval)))));
 #if CV_SIMD_WIDTH > 16
        if( i < npoints )
        {
@@ -967,10 +967,10 @@ static Rect pointSetBoundingRect( const Mat& points )
                minval2 = v_min(ptXY, minval2);
                maxval2 = v_max(ptXY, maxval2);
            }
-            xmin = min(xmin, cvFloor(minval2.get0()));
-            xmax = max(xmax, cvFloor(maxval2.get0()));
-            ymin = min(ymin, cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval2))).get0()));
-            ymax = max(ymax, cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval2))).get0()));
+            xmin = min(xmin, cvFloor(v_get0(minval2)));
+            xmax = max(xmax, cvFloor(v_get0(maxval2)));
+            ymin = min(ymin, cvFloor(v_get0(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval2))))));
+            ymax = max(ymax, cvFloor(v_get0(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval2))))));
        }
 #endif
    }
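
For reference, here is the minimal sketch of the rewritten loop-tail pattern mentioned above. This is illustrative only: the function `add_rows` and its variable names are assumed, not taken from the patch. Lane counts come from `VTraits<T>::vlanes()` at run time, and the old 128-bit fixed-size tail under `#if CV_SIMD_WIDTH > 16` becomes a plain scalar loop, which is the only form a variable-length backend (such as RVV) allows:

```cpp
// Illustrative sketch of the new-style loop-tail pattern (assumed names).
#include <opencv2/core/hal/intrin.hpp>

using namespace cv;

void add_rows(const int* a, const int* b, int* dst, int width)
{
    int i = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
    // VTraits<v_int32>::vlanes() is the runtime lane count; a scalable
    // backend has no compile-time v_int32::nlanes constant to use here.
    const int step = VTraits<v_int32>::vlanes();
    for (; i <= width - step; i += step)
    {
        // v_add replaces operator+, which the scalable backends do not
        // overload for vector types.
        v_store(dst + i, v_add(vx_load(a + i), vx_load(b + i)));
    }
    // No "#if CV_SIMD_WIDTH > 16" block with v_int32x4 here: fixed
    // 128-bit types cannot be mixed with a variable-length backend.
#endif
    for (; i < width; i++)   // scalar loop tail
        dst[i] = a[i] + b[i];
}
```

With a fixed-width backend this should compile to essentially the same code as before; with a scalable backend the same source builds unchanged because nothing in it names a fixed-width vector type.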