diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp index 9569e6127e..263659d302 100644 --- a/modules/core/include/opencv2/core/hal/intrin.hpp +++ b/modules/core/include/opencv2/core/hal/intrin.hpp @@ -165,7 +165,7 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE; // but some of AVX2 intrinsics get v256_ prefix instead of v_, e.g. v256_load() vs v_load(). // Correspondingly, the wide intrinsics (which are mapped to the "widest" // available instruction set) will get vx_ prefix -// (and will be mapped to v256_ counterparts) (e.g. vx_load() => v245_load()) +// (and will be mapped to v256_ counterparts) (e.g. vx_load() => v256_load()) #if CV_AVX2 #include "opencv2/core/hal/intrin_avx.hpp" @@ -225,14 +225,16 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN inline vtyp vx_setzero_##short_typ() { return prefix##_setzero_##short_typ(); } \ inline vtyp vx_##loadsfx(const typ* ptr) { return prefix##_##loadsfx(ptr); } \ inline vtyp vx_##loadsfx##_aligned(const typ* ptr) { return prefix##_##loadsfx##_aligned(ptr); } \ + inline vtyp vx_##loadsfx##_low(const typ* ptr) { return prefix##_##loadsfx##_low(ptr); } \ + inline vtyp vx_##loadsfx##_halves(const typ* ptr0, const typ* ptr1) { return prefix##_##loadsfx##_halves(ptr0, ptr1); } \ inline void vx_store(typ* ptr, const vtyp& v) { return v_store(ptr, v); } \ inline void vx_store_aligned(typ* ptr, const vtyp& v) { return v_store_aligned(ptr, v); } #define CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(typ, wtyp, prefix) \ -inline wtyp vx_load_expand(const typ* ptr) { return prefix##_load_expand(ptr); } + inline wtyp vx_load_expand(const typ* ptr) { return prefix##_load_expand(ptr); } #define CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND_Q(typ, qtyp, prefix) \ -inline qtyp vx_load_expand_q(const typ* ptr) { return prefix##_load_expand_q(ptr); } + inline qtyp vx_load_expand_q(const typ* ptr) { return prefix##_load_expand_q(ptr); } #define CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(typ, vtyp, short_typ, wtyp, qtyp, prefix, loadsfx) \ CV_INTRIN_DEFINE_WIDE_INTRIN(typ, vtyp, short_typ, prefix, loadsfx) \ @@ -327,7 +329,7 @@ template struct V_RegTraits CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v256) CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v256, load) inline void vx_cleanup() { v256_cleanup(); } -#elif CV_SIMD128 +#elif CV_SIMD128 || CV_SIMD128_CPP typedef v_uint8x16 v_uint8; typedef v_int8x16 v_int8; typedef v_uint16x8 v_uint16; diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp index fc2fd7cee1..c64ff99f75 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp @@ -429,6 +429,11 @@ inline v_float16x16 v256_load_f16(const short* ptr) inline v_float16x16 v256_load_f16_aligned(const short* ptr) { return v_float16x16(_mm256_load_si256((const __m256i*)ptr)); } +inline v_float16x16 v256_load_f16_low(const short* ptr) +{ return v_float16x16(v256_load_low(ptr).val); } +inline v_float16x16 v256_load_f16_halves(const short* ptr0, const short* ptr1) +{ return v_float16x16(v256_load_halves(ptr0, ptr1).val); } + inline void v_store(short* ptr, const v_float16x16& a) { _mm256_storeu_si256((__m256i*)ptr, a.val); } inline void v_store_aligned(short* ptr, const v_float16x16& a) @@ -841,94 +846,80 @@ OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_float64x4, _mm256_max_pd) template inline v_uint8x32 v_rotate_left(const v_uint8x32& a, const v_uint8x32& b) { - __m256i swap = _mm256_permute2x128_si256(a.val, b.val, 0x03); - - switch(imm) - { - case 0: return a; - case 32: return b; - case 16: return v_uint8x32(swap); - } + enum {IMM_R = (16 - imm) & 0xFF}; + enum {IMM_R2 = (32 - imm) & 0xFF}; - if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(a.val, swap, 16 - imm)); - if (imm < 32) return v_uint8x32(_mm256_alignr_epi8(swap, b.val, 32 - imm)); + if (imm == 0) return a; + if (imm == 32) return b; + if (imm > 32) return v_uint8x32(); - return v_uint8x32(); + __m256i swap = _mm256_permute2x128_si256(a.val, b.val, 0x03); + if (imm == 16) return v_uint8x32(swap); + if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(a.val, swap, IMM_R)); + return v_uint8x32(_mm256_alignr_epi8(swap, b.val, IMM_R2)); // imm < 32 } template inline v_uint8x32 v_rotate_right(const v_uint8x32& a, const v_uint8x32& b) { - __m256i swap = _mm256_permute2x128_si256(a.val, b.val, 0x21); - - switch(imm) - { - case 0: return a; - case 32: return b; - case 16: return v_uint8x32(swap); - } + enum {IMM_L = (imm - 16) & 0xFF}; - if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(swap, a.val, imm)); - if (imm < 32) return v_uint8x32(_mm256_alignr_epi8(b.val, swap, imm - 16)); + if (imm == 0) return a; + if (imm == 32) return b; + if (imm > 32) return v_uint8x32(); - return v_uint8x32(); + __m256i swap = _mm256_permute2x128_si256(a.val, b.val, 0x21); + if (imm == 16) return v_uint8x32(swap); + if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(swap, a.val, imm)); + return v_uint8x32(_mm256_alignr_epi8(b.val, swap, IMM_L)); } template inline v_uint8x32 v_rotate_left(const v_uint8x32& a) { - v_uint8x32 res; + enum {IMM_L = (imm - 16) & 0xFF}; + enum {IMM_R = (16 - imm) & 0xFF}; + + if (imm == 0) return a; + if (imm > 32) return v_uint8x32(); + // ESAC control[3] ? [127:0] = 0 __m256i swapz = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(0, 0, 2, 0)); - - if (imm == 0) - return a; - if (imm == 16) - res.val = swapz; - else if (imm < 16) - res.val = _mm256_alignr_epi8(a.val, swapz, 16 - imm); - else if (imm < 32) - res.val = _mm256_slli_si256(swapz, imm - 16); - else - return v_uint8x32(); - return res; + if (imm == 16) return v_uint8x32(swapz); + if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(a.val, swapz, IMM_R)); + return v_uint8x32(_mm256_slli_si256(swapz, IMM_L)); } template inline v_uint8x32 v_rotate_right(const v_uint8x32& a) { - v_uint8x32 res; + enum {IMM_L = (imm - 16) & 0xFF}; + + if (imm == 0) return a; + if (imm > 32) return v_uint8x32(); + // ESAC control[3] ? [127:0] = 0 __m256i swapz = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(2, 0, 0, 1)); - - if (imm == 0) - return a; - if (imm == 16) - res.val = swapz; - else if (imm < 16) - res.val = _mm256_alignr_epi8(swapz, a.val, imm); - else if (imm < 32) - res.val = _mm256_srli_si256(swapz, imm - 16); - else - return v_uint8x32(); - return res; -} - -#define OPENCV_HAL_IMPL_AVX_ROTATE_CAST(intrin, _Tpvec, cast) \ - template \ - inline _Tpvec intrin(const _Tpvec& a, const _Tpvec& b) \ - { \ - const int w = sizeof(typename _Tpvec::lane_type); \ - v_uint8x32 ret = intrin(v_reinterpret_as_u8(a), \ - v_reinterpret_as_u8(b)); \ - return _Tpvec(cast(ret.val)); \ - } \ - template \ - inline _Tpvec intrin(const _Tpvec& a) \ - { \ - const int w = sizeof(typename _Tpvec::lane_type); \ - v_uint8x32 ret = intrin(v_reinterpret_as_u8(a)); \ - return _Tpvec(cast(ret.val)); \ + if (imm == 16) return v_uint8x32(swapz); + if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(swapz, a.val, imm)); + return v_uint8x32(_mm256_srli_si256(swapz, IMM_L)); +} + +#define OPENCV_HAL_IMPL_AVX_ROTATE_CAST(intrin, _Tpvec, cast) \ + template \ + inline _Tpvec intrin(const _Tpvec& a, const _Tpvec& b) \ + { \ + enum {IMMxW = imm * sizeof(typename _Tpvec::lane_type)}; \ + v_uint8x32 ret = intrin(v_reinterpret_as_u8(a), \ + v_reinterpret_as_u8(b)); \ + return _Tpvec(cast(ret.val)); \ + } \ + template \ + inline _Tpvec intrin(const _Tpvec& a) \ + { \ + enum {IMMxW = imm * sizeof(typename _Tpvec::lane_type)}; \ + v_uint8x32 ret = intrin(v_reinterpret_as_u8(a)); \ + return _Tpvec(cast(ret.val)); \ } #define OPENCV_HAL_IMPL_AVX_ROTATE(_Tpvec) \ diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index b601e3e820..73ca948e24 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -319,6 +319,9 @@ static inline void cv_vst1_f16(void* ptr, float16x4_t a) #endif } +#ifndef vdup_n_f16 + #define vdup_n_f16(v) (float16x4_t){v, v, v, v} +#endif struct v_float16x8 { @@ -893,6 +896,11 @@ inline v_float16x8 v_load_f16(const short* ptr) inline v_float16x8 v_load_f16_aligned(const short* ptr) { return v_float16x8(cv_vld1q_f16(ptr)); } +inline v_float16x8 v_load_f16_low(const short* ptr) +{ return v_float16x8(vcombine_f16(cv_vld1_f16(ptr), vdup_n_f16((float16_t)0))); } +inline v_float16x8 v_load_f16_halves(const short* ptr0, const short* ptr1) +{ return v_float16x8(vcombine_f16(cv_vld1_f16(ptr0), cv_vld1_f16(ptr1))); } + inline void v_store(short* ptr, const v_float16x8& a) { cv_vst1q_f16(ptr, a.val); } inline void v_store_aligned(short* ptr, const v_float16x8& a) diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index 6e07940042..d1f24d17b5 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -1330,6 +1330,11 @@ inline v_float16x8 v_load_f16(const short* ptr) inline v_float16x8 v_load_f16_aligned(const short* ptr) { return v_float16x8(_mm_load_si128((const __m128i*)ptr)); } +inline v_float16x8 v_load_f16_low(const short* ptr) +{ return v_float16x8(v_load_low(ptr).val); } +inline v_float16x8 v_load_f16_halves(const short* ptr0, const short* ptr1) +{ return v_float16x8(v_load_halves(ptr0, ptr1).val); } + inline void v_store(short* ptr, const v_float16x8& a) { _mm_storeu_si128((__m128i*)ptr, a.val); } inline void v_store_aligned(short* ptr, const v_float16x8& a) diff --git a/modules/core/test/test_intrin.avx2.cpp b/modules/core/test/test_intrin.avx2.cpp new file mode 100644 index 0000000000..9ebfcdf542 --- /dev/null +++ b/modules/core/test/test_intrin.avx2.cpp @@ -0,0 +1,5 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +#include "test_precomp.hpp" +#include "test_intrin.simd.hpp" \ No newline at end of file diff --git a/modules/core/test/test_intrin.cpp b/modules/core/test/test_intrin.cpp index 9a1130fe96..6610e332de 100644 --- a/modules/core/test/test_intrin.cpp +++ b/modules/core/test/test_intrin.cpp @@ -2,249 +2,101 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. #include "test_precomp.hpp" +#include "test_intrin.simd.hpp" -#include "test_intrin_utils.hpp" - -#define CV_CPU_SIMD_FILENAME "test_intrin_utils.hpp" +#define CV_CPU_SIMD_FILENAME "test_intrin.simd.hpp" #define CV_CPU_DISPATCH_MODE FP16 #include "opencv2/core/private/cv_cpu_include_simd_declarations.hpp" - -using namespace cv; +#define CV_CPU_DISPATCH_MODE AVX2 +#include "opencv2/core/private/cv_cpu_include_simd_declarations.hpp" namespace opencv_test { namespace hal { using namespace CV_CPU_OPTIMIZATION_NAMESPACE; -//============= 8-bit integer ===================================================================== - -TEST(hal_intrin, uint8x16) { - TheTest() - .test_loadstore() - .test_interleave() - .test_expand() - .test_expand_q() - .test_addsub() - .test_addsub_wrap() - .test_cmp() - .test_logic() - .test_min_max() - .test_absdiff() - .test_mask() - .test_popcount() - .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>() - .test_pack_u<1>().test_pack_u<2>().test_pack_u<3>().test_pack_u<8>() - .test_unpack() - .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>() - .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>() - ; -} +TEST(hal_intrin, uint8x16) +{ test_hal_intrin_uint8(); } -TEST(hal_intrin, int8x16) { - TheTest() - .test_loadstore() - .test_interleave() - .test_expand() - .test_expand_q() - .test_addsub() - .test_addsub_wrap() - .test_cmp() - .test_logic() - .test_min_max() - .test_absdiff() - .test_abs() - .test_mask() - .test_popcount() - .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>() - .test_unpack() - .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>() - .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>() - ; -} +TEST(hal_intrin, int8x16) +{ test_hal_intrin_int8(); } -//============= 16-bit integer ===================================================================== - -TEST(hal_intrin, uint16x8) { - TheTest() - .test_loadstore() - .test_interleave() - .test_expand() - .test_addsub() - .test_addsub_wrap() - .test_mul() - .test_mul_expand() - .test_cmp() - .test_shift<1>() - .test_shift<8>() - .test_logic() - .test_min_max() - .test_absdiff() - .test_reduce() - .test_mask() - .test_popcount() - .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>() - .test_pack_u<1>().test_pack_u<2>().test_pack_u<7>().test_pack_u<16>() - .test_unpack() - .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>() - .test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>() - ; -} +TEST(hal_intrin, uint16x8) +{ test_hal_intrin_uint16(); } -TEST(hal_intrin, int16x8) { - TheTest() - .test_loadstore() - .test_interleave() - .test_expand() - .test_addsub() - .test_addsub_wrap() - .test_mul() - .test_mul_expand() - .test_cmp() - .test_shift<1>() - .test_shift<8>() - .test_dot_prod() - .test_logic() - .test_min_max() - .test_absdiff() - .test_abs() - .test_reduce() - .test_mask() - .test_popcount() - .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>() - .test_unpack() - .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>() - .test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>() - ; -} +TEST(hal_intrin, int16x8) +{ test_hal_intrin_int16(); } -//============= 32-bit integer ===================================================================== - -TEST(hal_intrin, uint32x4) { - TheTest() - .test_loadstore() - .test_interleave() - .test_expand() - .test_addsub() - .test_mul() - .test_mul_expand() - .test_cmp() - .test_shift<1>() - .test_shift<8>() - .test_logic() - .test_min_max() - .test_absdiff() - .test_reduce() - .test_mask() - .test_popcount() - .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>() - .test_unpack() - .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>() - .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>() - .test_transpose() - ; -} +TEST(hal_intrin, int32x4) +{ test_hal_intrin_int32(); } -TEST(hal_intrin, int32x4) { - TheTest() - .test_loadstore() - .test_interleave() - .test_expand() - .test_addsub() - .test_mul() - .test_abs() - .test_cmp() - .test_popcount() - .test_shift<1>().test_shift<8>() - .test_logic() - .test_min_max() - .test_absdiff() - .test_reduce() - .test_mask() - .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>() - .test_unpack() - .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>() - .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>() - .test_float_cvt32() - .test_float_cvt64() - .test_transpose() - ; -} +TEST(hal_intrin, uint32x4) +{ test_hal_intrin_uint32(); } -//============= 64-bit integer ===================================================================== - -TEST(hal_intrin, uint64x2) { - TheTest() - .test_loadstore() - .test_addsub() - .test_shift<1>().test_shift<8>() - .test_logic() - .test_extract<0>().test_extract<1>() - .test_rotate<0>().test_rotate<1>() - ; -} +TEST(hal_intrin, uint64x2) +{ test_hal_intrin_uint64(); } -TEST(hal_intrin, int64x2) { - TheTest() - .test_loadstore() - .test_addsub() - .test_shift<1>().test_shift<8>() - .test_logic() - .test_extract<0>().test_extract<1>() - .test_rotate<0>().test_rotate<1>() - ; -} +TEST(hal_intrin, int64x2) +{ test_hal_intrin_int64(); } -//============= Floating point ===================================================================== - -TEST(hal_intrin, float32x4) { - TheTest() - .test_loadstore() - .test_interleave() - .test_interleave_2channel() - .test_addsub() - .test_mul() - .test_div() - .test_cmp() - .test_sqrt_abs() - .test_min_max() - .test_float_absdiff() - .test_reduce() - .test_mask() - .test_unpack() - .test_float_math() - .test_float_cvt64() - .test_matmul() - .test_transpose() - .test_reduce_sum4() - .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>() - .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>() - ; -} +TEST(hal_intrin, float32x4) +{ test_hal_intrin_float32(); } -#if CV_SIMD128_64F -TEST(hal_intrin, float64x2) { - TheTest() - .test_loadstore() - .test_addsub() - .test_mul() - .test_div() - .test_cmp() - .test_sqrt_abs() - .test_min_max() - .test_float_absdiff() - .test_mask() - .test_unpack() - .test_float_math() - .test_float_cvt32() - .test_extract<0>().test_extract<1>() - .test_rotate<0>().test_rotate<1>() - ; -} -#endif +TEST(hal_intrin, float64x2) +{ test_hal_intrin_float64(); } -TEST(hal_intrin,float16) +TEST(hal_intrin, float16x8) { CV_CPU_CALL_FP16_(test_hal_intrin_float16, ()); throw SkipTestException("Unsupported hardware: FP16 is not available"); } -}} +#define DISPATCH_SIMD_MODES AVX2 +#define DISPATCH_SIMD_NAME "SIMD256" +#define DISPATCH_SIMD(fun) \ + do { \ + CV_CPU_DISPATCH(fun, (), DISPATCH_SIMD_MODES); \ + throw SkipTestException( \ + "Unsupported hardware: " \ + DISPATCH_SIMD_NAME \ + " is not available" \ + ); \ + } while(0) + +TEST(hal_intrin256, uint8x32) +{ DISPATCH_SIMD(test_hal_intrin_uint8); } + +TEST(hal_intrin256, int8x32) +{ DISPATCH_SIMD(test_hal_intrin_int8); } + +TEST(hal_intrin256, uint16x16) +{ DISPATCH_SIMD(test_hal_intrin_uint16); } + +TEST(hal_intrin256, int16x16) +{ DISPATCH_SIMD(test_hal_intrin_int16); } + +TEST(hal_intrin256, uint32x8) +{ DISPATCH_SIMD(test_hal_intrin_uint32); } + +TEST(hal_intrin256, int32x8) +{ DISPATCH_SIMD(test_hal_intrin_int32); } + +TEST(hal_intrin256, uint64x4) +{ DISPATCH_SIMD(test_hal_intrin_uint64); } + +TEST(hal_intrin256, int64x4) +{ DISPATCH_SIMD(test_hal_intrin_int64); } + +TEST(hal_intrin256, float32x8) +{ DISPATCH_SIMD(test_hal_intrin_float32); } + +TEST(hal_intrin256, float64x4) +{ DISPATCH_SIMD(test_hal_intrin_float64); } + +TEST(hal_intrin256, float16x16) +{ + if (!CV_CPU_HAS_SUPPORT_FP16) + throw SkipTestException("Unsupported hardware: FP16 is not available"); + DISPATCH_SIMD(test_hal_intrin_float16); +} + +}} // namespace \ No newline at end of file diff --git a/modules/core/test/test_intrin.simd.hpp b/modules/core/test/test_intrin.simd.hpp new file mode 100644 index 0000000000..4e0d3a073f --- /dev/null +++ b/modules/core/test/test_intrin.simd.hpp @@ -0,0 +1,296 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +#include "test_precomp.hpp" +#include "test_intrin_utils.hpp" + +namespace opencv_test { namespace hal { +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN + +void test_hal_intrin_uint8(); +void test_hal_intrin_int8(); +void test_hal_intrin_uint16(); +void test_hal_intrin_int16(); +void test_hal_intrin_uint32(); +void test_hal_intrin_int32(); +void test_hal_intrin_uint64(); +void test_hal_intrin_int64(); +void test_hal_intrin_float32(); +void test_hal_intrin_float64(); + +#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY + +//============= 8-bit integer ===================================================================== + +void test_hal_intrin_uint8() +{ + TheTest() + .test_loadstore() + .test_interleave() + .test_expand() + .test_expand_q() + .test_addsub() + .test_addsub_wrap() + .test_cmp() + .test_logic() + .test_min_max() + .test_absdiff() + .test_mask() + .test_popcount() + .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>() + .test_pack_u<1>().test_pack_u<2>().test_pack_u<3>().test_pack_u<8>() + .test_unpack() + .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>() + .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>() + ; + +#if CV_SIMD256 + TheTest() + .test_pack<9>().test_pack<10>().test_pack<13>().test_pack<15>() + .test_pack_u<9>().test_pack_u<10>().test_pack_u<13>().test_pack_u<15>() + .test_extract<16>().test_extract<17>().test_extract<23>().test_extract<31>() + .test_rotate<16>().test_rotate<17>().test_rotate<23>().test_rotate<31>() + ; +#endif +} + +void test_hal_intrin_int8() +{ + TheTest() + .test_loadstore() + .test_interleave() + .test_expand() + .test_expand_q() + .test_addsub() + .test_addsub_wrap() + .test_cmp() + .test_logic() + .test_min_max() + .test_absdiff() + .test_abs() + .test_mask() + .test_popcount() + .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>() + .test_unpack() + .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>() + .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>() + ; +} + +//============= 16-bit integer ===================================================================== + +void test_hal_intrin_uint16() +{ + TheTest() + .test_loadstore() + .test_interleave() + .test_expand() + .test_addsub() + .test_addsub_wrap() + .test_mul() + .test_mul_expand() + .test_cmp() + .test_shift<1>() + .test_shift<8>() + .test_logic() + .test_min_max() + .test_absdiff() + .test_reduce() + .test_mask() + .test_popcount() + .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>() + .test_pack_u<1>().test_pack_u<2>().test_pack_u<7>().test_pack_u<16>() + .test_unpack() + .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>() + .test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>() + ; +} + +void test_hal_intrin_int16() +{ + TheTest() + .test_loadstore() + .test_interleave() + .test_expand() + .test_addsub() + .test_addsub_wrap() + .test_mul() + .test_mul_expand() + .test_cmp() + .test_shift<1>() + .test_shift<8>() + .test_dot_prod() + .test_logic() + .test_min_max() + .test_absdiff() + .test_abs() + .test_reduce() + .test_mask() + .test_popcount() + .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>() + .test_unpack() + .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>() + .test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>() + ; +} + +//============= 32-bit integer ===================================================================== + +void test_hal_intrin_uint32() +{ + TheTest() + .test_loadstore() + .test_interleave() + .test_expand() + .test_addsub() + .test_mul() + .test_mul_expand() + .test_cmp() + .test_shift<1>() + .test_shift<8>() + .test_logic() + .test_min_max() + .test_absdiff() + .test_reduce() + .test_mask() + .test_popcount() + .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>() + .test_unpack() + .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>() + .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>() + .test_transpose() + ; +} + +void test_hal_intrin_int32() +{ + TheTest() + .test_loadstore() + .test_interleave() + .test_expand() + .test_addsub() + .test_mul() + .test_abs() + .test_cmp() + .test_popcount() + .test_shift<1>().test_shift<8>() + .test_logic() + .test_min_max() + .test_absdiff() + .test_reduce() + .test_mask() + .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>() + .test_unpack() + .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>() + .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>() + .test_float_cvt32() + .test_float_cvt64() + .test_transpose() + ; +} + +//============= 64-bit integer ===================================================================== + +void test_hal_intrin_uint64() +{ + TheTest() + .test_loadstore() + .test_addsub() + .test_shift<1>().test_shift<8>() + .test_logic() + .test_extract<0>().test_extract<1>() + .test_rotate<0>().test_rotate<1>() + ; +} + +void test_hal_intrin_int64() +{ + TheTest() + .test_loadstore() + .test_addsub() + .test_shift<1>().test_shift<8>() + .test_logic() + .test_extract<0>().test_extract<1>() + .test_rotate<0>().test_rotate<1>() + ; +} + +//============= Floating point ===================================================================== +void test_hal_intrin_float32() +{ + TheTest() + .test_loadstore() + .test_interleave() + .test_interleave_2channel() + .test_addsub() + .test_mul() + .test_div() + .test_cmp() + .test_sqrt_abs() + .test_min_max() + .test_float_absdiff() + .test_reduce() + .test_mask() + .test_unpack() + .test_float_math() + .test_float_cvt64() + .test_matmul() + .test_transpose() + .test_reduce_sum4() + .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>() + .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>() + ; + +#if CV_SIMD256 + TheTest() + .test_extract<4>().test_extract<5>().test_extract<6>().test_extract<7>() + .test_rotate<4>().test_rotate<5>().test_rotate<6>().test_rotate<7>() + ; +#endif +} + +void test_hal_intrin_float64() +{ +#if CV_SIMD_64F + TheTest() + .test_loadstore() + .test_addsub() + .test_mul() + .test_div() + .test_cmp() + .test_sqrt_abs() + .test_min_max() + .test_float_absdiff() + .test_mask() + .test_unpack() + .test_float_math() + .test_float_cvt32() + .test_extract<0>().test_extract<1>() + .test_rotate<0>().test_rotate<1>() + ; + +#if CV_SIMD256 + TheTest() + .test_extract<2>().test_extract<3>() + .test_rotate<2>().test_rotate<3>() + ; +#endif //CV_SIMD256 + +#endif +} + +#if CV_FP16 && CV_SIMD_WIDTH > 16 +void test_hal_intrin_float16() +{ + TheTest() + .test_loadstore_fp16() + .test_float_cvt_fp16() + ; +} +#endif + +#endif //CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY + +CV_CPU_OPTIMIZATION_NAMESPACE_END + +}} //namespace \ No newline at end of file diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index 2f8c1cf0b7..5f3175bc6c 100644 --- a/modules/core/test/test_intrin_utils.hpp +++ b/modules/core/test/test_intrin_utils.hpp @@ -13,6 +13,27 @@ void test_hal_intrin_float16(); template struct Data; template struct initializer; +template <> struct initializer<64> +{ + template static R init(const Data & d) + { + return R(d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15], + d[16], d[17], d[18], d[19], d[20], d[21], d[22], d[23], d[24], d[25], d[26], d[27], d[28], d[29], d[30], d[31], + d[32], d[33], d[34], d[35], d[36], d[37], d[38], d[39], d[40], d[41], d[42], d[43], d[44], d[45], d[46], d[47], + d[48], d[49], d[50], d[51], d[52], d[53], d[54], d[55], d[56], d[57], d[58], d[59], d[50], d[51], d[52], d[53], + d[54], d[55], d[56], d[57], d[58], d[59], d[60], d[61], d[62], d[63]); + } +}; + +template <> struct initializer<32> +{ + template static R init(const Data & d) + { + return R(d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15], + d[16], d[17], d[18], d[19], d[20], d[21], d[22], d[23], d[24], d[25], d[26], d[27], d[28], d[29], d[30], d[31]); + } +}; + template <> struct initializer<16> { template static R init(const Data & d) @@ -125,6 +146,17 @@ template struct Data { return d + R::nlanes / 2; } + LaneType sum(int s, int c) + { + LaneType res = 0; + for (int i = s; i < s + c; ++i) + res += d[i]; + return res; + } + LaneType sum() + { + return sum(0, R::nlanes); + } bool operator==(const Data & other) const { for (int i = 0; i < R::nlanes; ++i) @@ -147,13 +179,12 @@ template struct Data return false; return true; } - LaneType d[R::nlanes]; }; template struct AlignedData { - Data CV_DECL_ALIGNED(16) a; // aligned + Data CV_DECL_ALIGNED(CV_SIMD_WIDTH) a; // aligned char dummy; Data u; // unaligned }; @@ -207,22 +238,22 @@ template struct TheTest AlignedData out; // check if addresses are aligned and unaligned respectively - EXPECT_EQ((size_t)0, (size_t)&data.a.d % 16); - EXPECT_NE((size_t)0, (size_t)&data.u.d % 16); - EXPECT_EQ((size_t)0, (size_t)&out.a.d % 16); - EXPECT_NE((size_t)0, (size_t)&out.u.d % 16); + EXPECT_EQ((size_t)0, (size_t)&data.a.d % CV_SIMD_WIDTH); + EXPECT_NE((size_t)0, (size_t)&data.u.d % CV_SIMD_WIDTH); + EXPECT_EQ((size_t)0, (size_t)&out.a.d % CV_SIMD_WIDTH); + EXPECT_NE((size_t)0, (size_t)&out.u.d % CV_SIMD_WIDTH); // check some initialization methods R r1 = data.a; - R r2 = v_load(data.u.d); - R r3 = v_load_aligned(data.a.d); + R r2 = vx_load(data.u.d); + R r3 = vx_load_aligned(data.a.d); R r4(r2); EXPECT_EQ(data.a[0], r1.get0()); EXPECT_EQ(data.u[0], r2.get0()); EXPECT_EQ(data.a[0], r3.get0()); EXPECT_EQ(data.u[0], r4.get0()); - R r_low = v_load_low((LaneType*)data.u.d); + R r_low = vx_load_low((LaneType*)data.u.d); EXPECT_EQ(data.u[0], r_low.get0()); v_store(out.u.d, r_low); for (int i = 0; i < R::nlanes/2; ++i) @@ -230,7 +261,7 @@ template struct TheTest EXPECT_EQ((LaneType)data.u[i], (LaneType)out.u[i]); } - R r_low_align8byte = v_load_low((LaneType*)((char*)data.u.d + 8)); + R r_low_align8byte = vx_load_low((LaneType*)((char*)data.u.d + (CV_SIMD_WIDTH / 2))); EXPECT_EQ(data.u[R::nlanes/2], r_low_align8byte.get0()); v_store(out.u.d, r_low_align8byte); for (int i = 0; i < R::nlanes/2; ++i) @@ -255,7 +286,7 @@ template struct TheTest // check halves load correctness res.clear(); - R r6 = v_load_halves(d.d, d.mid()); + R r6 = vx_load_halves(d.d, d.mid()); v_store(res.d, r6); EXPECT_EQ(d, res); @@ -270,17 +301,17 @@ template struct TheTest } // reinterpret_as - v_uint8x16 vu8 = v_reinterpret_as_u8(r1); out.a.clear(); v_store((uchar*)out.a.d, vu8); EXPECT_EQ(data.a, out.a); - v_int8x16 vs8 = v_reinterpret_as_s8(r1); out.a.clear(); v_store((schar*)out.a.d, vs8); EXPECT_EQ(data.a, out.a); - v_uint16x8 vu16 = v_reinterpret_as_u16(r1); out.a.clear(); v_store((ushort*)out.a.d, vu16); EXPECT_EQ(data.a, out.a); - v_int16x8 vs16 = v_reinterpret_as_s16(r1); out.a.clear(); v_store((short*)out.a.d, vs16); EXPECT_EQ(data.a, out.a); - v_uint32x4 vu32 = v_reinterpret_as_u32(r1); out.a.clear(); v_store((unsigned*)out.a.d, vu32); EXPECT_EQ(data.a, out.a); - v_int32x4 vs32 = v_reinterpret_as_s32(r1); out.a.clear(); v_store((int*)out.a.d, vs32); EXPECT_EQ(data.a, out.a); - v_uint64x2 vu64 = v_reinterpret_as_u64(r1); out.a.clear(); v_store((uint64*)out.a.d, vu64); EXPECT_EQ(data.a, out.a); - v_int64x2 vs64 = v_reinterpret_as_s64(r1); out.a.clear(); v_store((int64*)out.a.d, vs64); EXPECT_EQ(data.a, out.a); - v_float32x4 vf32 = v_reinterpret_as_f32(r1); out.a.clear(); v_store((float*)out.a.d, vf32); EXPECT_EQ(data.a, out.a); -#if CV_SIMD128_64F - v_float64x2 vf64 = v_reinterpret_as_f64(r1); out.a.clear(); v_store((double*)out.a.d, vf64); EXPECT_EQ(data.a, out.a); + v_uint8 vu8 = v_reinterpret_as_u8(r1); out.a.clear(); v_store((uchar*)out.a.d, vu8); EXPECT_EQ(data.a, out.a); + v_int8 vs8 = v_reinterpret_as_s8(r1); out.a.clear(); v_store((schar*)out.a.d, vs8); EXPECT_EQ(data.a, out.a); + v_uint16 vu16 = v_reinterpret_as_u16(r1); out.a.clear(); v_store((ushort*)out.a.d, vu16); EXPECT_EQ(data.a, out.a); + v_int16 vs16 = v_reinterpret_as_s16(r1); out.a.clear(); v_store((short*)out.a.d, vs16); EXPECT_EQ(data.a, out.a); + v_uint32 vu32 = v_reinterpret_as_u32(r1); out.a.clear(); v_store((unsigned*)out.a.d, vu32); EXPECT_EQ(data.a, out.a); + v_int32 vs32 = v_reinterpret_as_s32(r1); out.a.clear(); v_store((int*)out.a.d, vs32); EXPECT_EQ(data.a, out.a); + v_uint64 vu64 = v_reinterpret_as_u64(r1); out.a.clear(); v_store((uint64*)out.a.d, vu64); EXPECT_EQ(data.a, out.a); + v_int64 vs64 = v_reinterpret_as_s64(r1); out.a.clear(); v_store((int64*)out.a.d, vs64); EXPECT_EQ(data.a, out.a); + v_float32 vf32 = v_reinterpret_as_f32(r1); out.a.clear(); v_store((float*)out.a.d, vf32); EXPECT_EQ(data.a, out.a); +#if CV_SIMD_64F + v_float64 vf64 = v_reinterpret_as_f64(r1); out.a.clear(); v_store((double*)out.a.d, vf64); EXPECT_EQ(data.a, out.a); #endif return *this; @@ -357,7 +388,7 @@ template struct TheTest Data dataA; R a = dataA; - Data resB = v_load_expand(dataA.d); + Data resB = vx_load_expand(dataA.d); Rx2 c, d; v_expand(a, c, d); @@ -378,7 +409,7 @@ template struct TheTest { typedef typename V_RegTraits::q_reg Rx4; Data data; - Data out = v_load_expand_q(data.d); + Data out = vx_load_expand_q(data.d); const int n = Rx4::nlanes; for (int i = 0; i < n; ++i) EXPECT_EQ(data[i], out[i]); @@ -610,7 +641,13 @@ template struct TheTest TheTest & test_popcount() { - static unsigned popcountTable[] = {0, 1, 2, 4, 5, 7, 9, 12, 13, 15, 17, 20, 22, 25, 28, 32, 33}; + static unsigned popcountTable[] = { + 0, 1, 2, 4, 5, 7, 9, 12, 13, 15, 17, 20, 22, 25, 28, 32, 33, + 35, 37, 40, 42, 45, 48, 52, 54, 57, 60, 64, 67, 71, 75, 80, 81, + 83, 85, 88, 90, 93, 96, 100, 102, 105, 108, 112, 115, 119, 123, + 128, 130, 133, 136, 140, 143, 147, 151, 156, 159, 163, 167, 172, + 176, 181, 186, 192, 193 + }; Data dataA; R a = dataA; @@ -918,7 +955,7 @@ template struct TheTest TheTest & test_float_cvt32() { - typedef v_float32x4 Rt; + typedef v_float32 Rt; Data dataA; dataA *= 1.1; R a = dataA; @@ -934,8 +971,8 @@ template struct TheTest TheTest & test_float_cvt64() { -#if CV_SIMD128_64F - typedef v_float64x2 Rt; +#if CV_SIMD_64F + typedef v_float64 Rt; Data dataA; dataA *= 1.1; R a = dataA; @@ -965,23 +1002,29 @@ template struct TheTest R v = dataV, a = dataA, b = dataB, c = dataC, d = dataD; Data res = v_matmul(v, a, b, c, d); - for (int i = 0; i < R::nlanes; ++i) + for (int i = 0; i < R::nlanes; i += 4) { - LaneType val = dataV[0] * dataA[i] - + dataV[1] * dataB[i] - + dataV[2] * dataC[i] - + dataV[3] * dataD[i]; - EXPECT_DOUBLE_EQ(val, res[i]); + for (int j = i; j < i + 4; ++j) + { + LaneType val = dataV[i] * dataA[j] + + dataV[i + 1] * dataB[j] + + dataV[i + 2] * dataC[j] + + dataV[i + 3] * dataD[j]; + EXPECT_COMPARE_EQ(val, res[j]); + } } Data resAdd = v_matmuladd(v, a, b, c, d); - for (int i = 0; i < R::nlanes; ++i) + for (int i = 0; i < R::nlanes; i += 4) { - LaneType val = dataV[0] * dataA[i] - + dataV[1] * dataB[i] - + dataV[2] * dataC[i] - + dataD[i]; - EXPECT_DOUBLE_EQ(val, resAdd[i]); + for (int j = i; j < i + 4; ++j) + { + LaneType val = dataV[i] * dataA[j] + + dataV[i + 1] * dataB[j] + + dataV[i + 2] * dataC[j] + + dataD[j]; + EXPECT_COMPARE_EQ(val, resAdd[j]); + } } return *this; } @@ -998,30 +1041,36 @@ template struct TheTest e, f, g, h); Data res[4] = {e, f, g, h}; - for (int i = 0; i < R::nlanes; ++i) + for (int i = 0; i < R::nlanes; i += 4) { - EXPECT_EQ(dataA[i], res[i][0]); - EXPECT_EQ(dataB[i], res[i][1]); - EXPECT_EQ(dataC[i], res[i][2]); - EXPECT_EQ(dataD[i], res[i][3]); + for (int j = 0; j < 4; ++j) + { + EXPECT_EQ(dataA[i + j], res[j][i]); + EXPECT_EQ(dataB[i + j], res[j][i + 1]); + EXPECT_EQ(dataC[i + j], res[j][i + 2]); + EXPECT_EQ(dataD[i + j], res[j][i + 3]); + } } return *this; } TheTest & test_reduce_sum4() { - R a(0.1f, 0.02f, 0.003f, 0.0004f); - R b(1, 20, 300, 4000); - R c(10, 2, 0.3f, 0.04f); - R d(1, 2, 3, 4); - - R sum = v_reduce_sum4(a, b, c, d); - - Data res = sum; - EXPECT_EQ(0.1234f, res[0]); - EXPECT_EQ(4321.0f, res[1]); - EXPECT_EQ(12.34f, res[2]); - EXPECT_EQ(10.0f, res[3]); + Data dataA, dataB, dataC, dataD; + dataB *= 0.01f; + dataC *= 0.001f; + dataD *= 0.002f; + + R a = dataA, b = dataB, c = dataC, d = dataD; + Data res = v_reduce_sum4(a, b, c, d); + + for (int i = 0; i < R::nlanes; i += 4) + { + EXPECT_COMPARE_EQ(dataA.sum(i, 4), res[i]); + EXPECT_COMPARE_EQ(dataB.sum(i, 4), res[i + 1]); + EXPECT_COMPARE_EQ(dataC.sum(i, 4), res[i + 2]); + EXPECT_COMPARE_EQ(dataD.sum(i, 4), res[i + 3]); + } return *this; } @@ -1032,14 +1081,14 @@ template struct TheTest AlignedData out; // check if addresses are aligned and unaligned respectively - EXPECT_EQ((size_t)0, (size_t)&data.a.d % 16); - EXPECT_NE((size_t)0, (size_t)&data.u.d % 16); - EXPECT_EQ((size_t)0, (size_t)&out.a.d % 16); - EXPECT_NE((size_t)0, (size_t)&out.u.d % 16); + EXPECT_EQ((size_t)0, (size_t)&data.a.d % CV_SIMD_WIDTH); + EXPECT_NE((size_t)0, (size_t)&data.u.d % CV_SIMD_WIDTH); + EXPECT_EQ((size_t)0, (size_t)&out.a.d % CV_SIMD_WIDTH); + EXPECT_NE((size_t)0, (size_t)&out.u.d % CV_SIMD_WIDTH); // check some initialization methods R r1 = data.u; - R r2 = v_load_f16(data.a.d); + R r2 = vx_load_f16(data.a.d); R r3(r2); EXPECT_EQ(data.u[0], r1.get0()); EXPECT_EQ(data.a[0], r2.get0());