diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
index 1599bf3d91..30619a5af0 100644
--- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
@@ -549,6 +549,13 @@ inline v_float32x4 v_invsqrt(const v_float32x4& x)
 }
 #endif
 
+#define OPENCV_HAL_IMPL_NEON_ABS(_Tpuvec, _Tpsvec, usuffix, ssuffix) \
+inline _Tpuvec v_abs(const _Tpsvec& a) { return v_reinterpret_as_##usuffix(_Tpsvec(vabsq_##ssuffix(a.val))); }
+
+OPENCV_HAL_IMPL_NEON_ABS(v_uint8x16, v_int8x16, u8, s8)
+OPENCV_HAL_IMPL_NEON_ABS(v_uint16x8, v_int16x8, u16, s16)
+OPENCV_HAL_IMPL_NEON_ABS(v_uint32x4, v_int32x4, u32, s32)
+
 inline v_float32x4 v_abs(v_float32x4 x)
 { return v_float32x4(vabsq_f32(x.val)); }
 
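Note (illustration, not part of the patch): vabsq_s8/s16/s32 wrap on the most negative lane, e.g. vabsq_s8(-128) yields -128, and it is the v_reinterpret_as_u* on the result that turns this back into the exact magnitude, since the same bit pattern read as u8 is 128. A scalar sketch of that argument:

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        int8_t x = INT8_MIN;               // -128
        int8_t wrapped = (int8_t)-x;       // scalar analogue of vabsq_s8: wraps back to -128
        uint8_t exact = (uint8_t)wrapped;  // same bits read as unsigned: 128 == |-128|
        printf("%d -> %d\n", wrapped, (int)exact);
        return 0;
    }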
diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
index 674d985507..9f4ec66ae3 100644
--- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
@@ -739,6 +739,18 @@ inline v_float64x2 v_invsqrt(const v_float64x2& x)
     return v_float64x2(_mm_div_pd(v_1, _mm_sqrt_pd(x.val)));
 }
 
+#define OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(_Tpuvec, _Tpsvec, func, suffix, subWidth) \
+inline _Tpuvec v_abs(const _Tpsvec& x) \
+{ return _Tpuvec(_mm_##func##_ep##suffix(x.val, _mm_sub_ep##subWidth(_mm_setzero_si128(), x.val))); }
+
+OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(v_uint8x16, v_int8x16, min, u8, i8)
+OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(v_uint16x8, v_int16x8, max, i16, i16)
+inline v_uint32x4 v_abs(const v_int32x4& x)
+{
+    __m128i s = _mm_srli_epi32(x.val, 31);
+    __m128i f = _mm_srai_epi32(x.val, 31);
+    return v_uint32x4(_mm_add_epi32(_mm_xor_si128(x.val, f), s));
+}
 inline v_float32x4 v_abs(const v_float32x4& x)
 { return v_float32x4(_mm_and_ps(x.val, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); }
 inline v_float64x2 v_abs(const v_float64x2& x)
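Note: SSE2 has no integer absolute-value instruction (_mm_abs_epi8/16/32 only arrive with SSSE3), hence the emulation above: an unsigned min(x, 0 - x) for 8-bit lanes, a signed max(x, 0 - x) for 16-bit lanes, and the branchless xor/shift form for 32-bit lanes. A scalar analogue of the 32-bit variant (illustration only; the intrinsic version makes the arithmetic shift explicit via _mm_srai_epi32):

    #include <cstdint>

    static inline uint32_t abs32(int32_t x)
    {
        uint32_t s = (uint32_t)x >> 31;  // sign bit, like _mm_srli_epi32(x.val, 31)
        int32_t  f = x >> 31;            // sign mask, like _mm_srai_epi32(x.val, 31)
        return (uint32_t)(x ^ f) + s;    // x if x >= 0, two's-complement -x otherwise;
    }                                    // INT32_MIN maps to 0x80000000u, its true magnitude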
diff --git a/modules/core/test/test_intrin.cpp b/modules/core/test/test_intrin.cpp
index 7ebd8e979c..a5e0c327e3 100644
--- a/modules/core/test/test_intrin.cpp
+++ b/modules/core/test/test_intrin.cpp
@@ -277,6 +277,24 @@ template <typename R> struct TheTest
         return *this;
     }
 
+    TheTest & test_abs()
+    {
+        typedef typename V_RegTrait128<LaneType>::u_reg Ru;
+        typedef typename Ru::lane_type u_type;
+        Data<R> dataA, dataB(10);
+        R a = dataA, b = dataB;
+        a = a - b;
+
+        Data<Ru> resC = v_abs(a);
+
+        for (int i = 0; i < Ru::nlanes; ++i)
+        {
+            EXPECT_EQ((u_type)std::abs(dataA[i] - dataB[i]), resC[i]);
+        }
+
+        return *this;
+    }
+
     template <int s>
     TheTest & test_shift()
     {
@@ -799,6 +817,7 @@ TEST(hal_intrin, int8x16)
         .test_logic()
         .test_min_max()
         .test_absdiff()
+        .test_abs()
         .test_mask()
         .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
         .test_unpack()
@@ -847,6 +866,7 @@ TEST(hal_intrin, int16x8)
         .test_logic()
         .test_min_max()
         .test_absdiff()
+        .test_abs()
         .test_mask()
         .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
         .test_unpack()
@@ -886,6 +906,7 @@ TEST(hal_intrin, int32x4)
         .test_expand()
         .test_addsub()
         .test_mul()
+        .test_abs()
         .test_cmp()
         .test_shift<1>().test_shift<8>()
         .test_logic()
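Note: Data<R> lanes start at small increasing values while dataB is filled with the constant 10 (per the Data helper in this test file), so a = dataA - dataB drives the leading lanes negative and both signs get checked against scalar std::abs in the unsigned lane type. A minimal stand-alone usage sketch of the new intrinsic (values illustrative):

    #include "opencv2/core/hal/intrin.hpp"

    void demo()
    {
        cv::v_int16x8 a = cv::v_setall_s16(-7);
        cv::v_uint16x8 r = cv::v_abs(a);   // every lane becomes 7, in the unsigned type
        unsigned short buf[8];
        cv::v_store(buf, r);               // buf[i] == 7 for all i
    }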
diff --git a/modules/imgproc/src/canny.cpp b/modules/imgproc/src/canny.cpp
index 22ec1f39c8..19fc13b380 100644
--- a/modules/imgproc/src/canny.cpp
+++ b/modules/imgproc/src/canny.cpp
@@ -42,6 +42,7 @@
 
 #include "precomp.hpp"
 #include "opencl_kernels_imgproc.hpp"
+#include "opencv2/core/hal/intrin.hpp"
 #include <queue>
 
 #ifdef _MSC_VER
@@ -299,8 +300,8 @@ public:
 
     void operator()(const Range &boundaries) const
     {
-#if CV_SSE2
-        bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
+#if CV_SIMD128
+        bool haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON);
 #endif
 
         const int type = src.type(), cn = CV_MAT_CN(type);
@@ -409,39 +410,28 @@ public:
         if (!L2gradient)
         {
             int j = 0, width = src.cols * cn;
-#if CV_SSE2
-            if (haveSSE2)
+#if CV_SIMD128
+            if (haveSIMD)
             {
-                __m128i v_zero = _mm_setzero_si128();
                 for ( ; j <= width - 8; j += 8)
                 {
-                    __m128i v_dx = _mm_loadu_si128((const __m128i *)(_dx + j));
-                    __m128i v_dy = _mm_loadu_si128((const __m128i *)(_dy + j));
+                    v_int16x8 v_dx = v_load((const short *)(_dx + j));
+                    v_int16x8 v_dy = v_load((const short *)(_dy + j));
 
-                    __m128i v_dx_abs = _mm_max_epi16(v_dx, _mm_sub_epi16(v_zero, v_dx));
-                    __m128i v_dy_abs = _mm_max_epi16(v_dy, _mm_sub_epi16(v_zero, v_dy));
+                    v_dx = v_reinterpret_as_s16(v_abs(v_dx));
+                    v_dy = v_reinterpret_as_s16(v_abs(v_dy));
 
-                    __m128i v_dx_ml = _mm_unpacklo_epi16(v_dx_abs, v_zero);
-                    __m128i v_dy_ml = _mm_unpacklo_epi16(v_dy_abs, v_zero);
-                    __m128i v_dx_mh = _mm_unpackhi_epi16(v_dx_abs, v_zero);
-                    __m128i v_dy_mh = _mm_unpackhi_epi16(v_dy_abs, v_zero);
+                    v_int32x4 v_dx_ml;
+                    v_int32x4 v_dy_ml;
+                    v_int32x4 v_dx_mh;
+                    v_int32x4 v_dy_mh;
+                    v_expand(v_dx, v_dx_ml, v_dx_mh);
+                    v_expand(v_dy, v_dy_ml, v_dy_mh);
 
-                    __m128i v_norm_ml = _mm_add_epi32(v_dx_ml, v_dy_ml);
-                    __m128i v_norm_mh = _mm_add_epi32(v_dx_mh, v_dy_mh);
-
-                    _mm_storeu_si128((__m128i *)(_norm + j), v_norm_ml);
-                    _mm_storeu_si128((__m128i *)(_norm + j + 4), v_norm_mh);
+                    v_store((int *)(_norm + j), v_dx_ml + v_dy_ml);
+                    v_store((int *)(_norm + j + 4), v_dx_mh + v_dy_mh);
                 }
             }
-#elif CV_NEON
-            for ( ; j <= width - 8; j += 8)
-            {
-                int16x8_t v_dx = vld1q_s16(_dx + j), v_dy = vld1q_s16(_dy + j);
-                vst1q_s32(_norm + j, vaddq_s32(vabsq_s32(vmovl_s16(vget_low_s16(v_dx))),
-                                               vabsq_s32(vmovl_s16(vget_low_s16(v_dy)))));
-                vst1q_s32(_norm + j + 4, vaddq_s32(vabsq_s32(vmovl_s16(vget_high_s16(v_dx))),
-                                                   vabsq_s32(vmovl_s16(vget_high_s16(v_dy)))));
-            }
 #endif
             for ( ; j < width; ++j)
                 _norm[j] = std::abs(int(_dx[j])) + std::abs(int(_dy[j]));
@@ -449,36 +439,23 @@ public:
         else
         {
             int j = 0, width = src.cols * cn;
-#if CV_SSE2
-            if (haveSSE2)
+#if CV_SIMD128
+            if (haveSIMD)
             {
-                for ( ; j <= width - 8; j += 8)
+                for ( ; j <= width - 8; j += 8)
                 {
-                    __m128i v_dx = _mm_loadu_si128((const __m128i *)(_dx + j));
-                    __m128i v_dy = _mm_loadu_si128((const __m128i *)(_dy + j));
+                    v_int16x8 v_dx = v_load((const short*)(_dx + j));
+                    v_int16x8 v_dy = v_load((const short*)(_dy + j));
 
-                    __m128i v_dx_dy_ml = _mm_unpacklo_epi16(v_dx, v_dy);
-                    __m128i v_dx_dy_mh = _mm_unpackhi_epi16(v_dx, v_dy);
+                    v_int32x4 v_dxp_low, v_dxp_high;
+                    v_int32x4 v_dyp_low, v_dyp_high;
+                    v_expand(v_dx, v_dxp_low, v_dxp_high);
+                    v_expand(v_dy, v_dyp_low, v_dyp_high);
 
-                    __m128i v_norm_ml = _mm_madd_epi16(v_dx_dy_ml, v_dx_dy_ml);
-                    __m128i v_norm_mh = _mm_madd_epi16(v_dx_dy_mh, v_dx_dy_mh);
-
-                    _mm_storeu_si128((__m128i *)(_norm + j), v_norm_ml);
-                    _mm_storeu_si128((__m128i *)(_norm + j + 4), v_norm_mh);
+                    v_store((int *)(_norm + j), v_dxp_low*v_dxp_low+v_dyp_low*v_dyp_low);
+                    v_store((int *)(_norm + j + 4), v_dxp_high*v_dxp_high+v_dyp_high*v_dyp_high);
                 }
             }
-#elif CV_NEON
-            for ( ; j <= width - 8; j += 8)
-            {
-                int16x8_t v_dx = vld1q_s16(_dx + j), v_dy = vld1q_s16(_dy + j);
-                int16x4_t v_dxp = vget_low_s16(v_dx), v_dyp = vget_low_s16(v_dy);
-                int32x4_t v_dst = vmlal_s16(vmull_s16(v_dxp, v_dxp), v_dyp, v_dyp);
-                vst1q_s32(_norm + j, v_dst);
-
-                v_dxp = vget_high_s16(v_dx), v_dyp = vget_high_s16(v_dy);
-                v_dst = vmlal_s16(vmull_s16(v_dxp, v_dxp), v_dyp, v_dyp);
-                vst1q_s32(_norm + j + 4, v_dst);
-            }
 #endif
             for ( ; j < width; ++j)
                 _norm[j] = int(_dx[j])*_dx[j] + int(_dy[j])*_dy[j];
@@ -529,30 +506,31 @@ public:
         const int TG22 = (int)(0.4142135623730950488016887242097*(1 << CANNY_SHIFT) + 0.5);
 
         int prev_flag = 0, j = 0;
-#if CV_SSE2
-        if (checkHardwareSupport(CPU_SSE2))
+#if CV_SIMD128
+        if (haveSIMD)
         {
-            __m128i v_low = _mm_set1_epi32(low), v_one = _mm_set1_epi8(1);
+            v_int32x4 v_low = v_setall_s32(low);
+            v_int8x16 v_one = v_setall_s8(1);
 
             for (; j <= src.cols - 16; j += 16)
             {
-                __m128i v_m1 = _mm_loadu_si128((const __m128i*)(_mag + j));
-                __m128i v_m2 = _mm_loadu_si128((const __m128i*)(_mag + j + 4));
-                __m128i v_m3 = _mm_loadu_si128((const __m128i*)(_mag + j + 8));
-                __m128i v_m4 = _mm_loadu_si128((const __m128i*)(_mag + j + 12));
+                v_int32x4 v_m1 = v_load((const int*)(_mag + j));
+                v_int32x4 v_m2 = v_load((const int*)(_mag + j + 4));
+                v_int32x4 v_m3 = v_load((const int*)(_mag + j + 8));
+                v_int32x4 v_m4 = v_load((const int*)(_mag + j + 12));
 
-                _mm_storeu_si128((__m128i*)(_map + j), v_one);
+                v_store((signed char*)(_map + j), v_one);
 
-                __m128i v_cmp1 = _mm_cmpgt_epi32(v_m1, v_low);
-                __m128i v_cmp2 = _mm_cmpgt_epi32(v_m2, v_low);
-                __m128i v_cmp3 = _mm_cmpgt_epi32(v_m3, v_low);
-                __m128i v_cmp4 = _mm_cmpgt_epi32(v_m4, v_low);
+                v_int32x4 v_cmp1 = v_m1 > v_low;
+                v_int32x4 v_cmp2 = v_m2 > v_low;
+                v_int32x4 v_cmp3 = v_m3 > v_low;
+                v_int32x4 v_cmp4 = v_m4 > v_low;
 
-                v_cmp1 = _mm_packs_epi32(v_cmp1, v_cmp2);
-                v_cmp2 = _mm_packs_epi32(v_cmp3, v_cmp4);
+                v_int16x8 v_cmp80 = v_pack(v_cmp1, v_cmp2);
+                v_int16x8 v_cmp81 = v_pack(v_cmp3, v_cmp4);
 
-                v_cmp1 = _mm_packs_epi16(v_cmp1, v_cmp2);
-                unsigned int mask = _mm_movemask_epi8(v_cmp1);
+                v_int8x16 v_cmp = v_pack(v_cmp80, v_cmp81);
+                unsigned int mask = v_signmask(v_cmp);
 
                 if (mask)
                 {
@@ -730,54 +708,57 @@ public:
         const uchar* pmap = map + mapstep + 1 + (ptrdiff_t)(mapstep * boundaries.start);
         uchar* pdst = dst.ptr() + (ptrdiff_t)(dst.step * boundaries.start);
 
-#if CV_SSE2
-        bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
+#if CV_SIMD128
+        bool haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON);
 #endif
 
         for (int i = boundaries.start; i < boundaries.end; i++, pmap += mapstep, pdst += dst.step)
         {
             int j = 0;
-#if CV_SSE2
-            if(haveSSE2) {
-                const __m128i v_zero = _mm_setzero_si128();
+#if CV_SIMD128
+            if(haveSIMD) {
+                const v_int8x16 v_zero = v_setzero_s8();
 
                 for(; j <= dst.cols - 32; j += 32) {
-                    __m128i v_pmap1 = _mm_loadu_si128((const __m128i*)(pmap + j));
-                    __m128i v_pmap2 = _mm_loadu_si128((const __m128i*)(pmap + j + 16));
+                    v_uint8x16 v_pmap1 = v_load((const unsigned char*)(pmap + j));
+                    v_uint8x16 v_pmap2 = v_load((const unsigned char*)(pmap + j + 16));
 
-                    __m128i v_pmaplo1 = _mm_unpacklo_epi8(v_pmap1, v_zero);
-                    __m128i v_pmaphi1 = _mm_unpackhi_epi8(v_pmap1, v_zero);
-                    __m128i v_pmaplo2 = _mm_unpacklo_epi8(v_pmap2, v_zero);
-                    __m128i v_pmaphi2 = _mm_unpackhi_epi8(v_pmap2, v_zero);
+                    v_uint16x8 v_pmaplo1;
+                    v_uint16x8 v_pmaphi1;
+                    v_uint16x8 v_pmaplo2;
+                    v_uint16x8 v_pmaphi2;
+                    v_expand(v_pmap1, v_pmaplo1, v_pmaphi1);
+                    v_expand(v_pmap2, v_pmaplo2, v_pmaphi2);
 
-                    v_pmaplo1 = _mm_srli_epi16(v_pmaplo1, 1);
-                    v_pmaphi1 = _mm_srli_epi16(v_pmaphi1, 1);
-                    v_pmaplo2 = _mm_srli_epi16(v_pmaplo2, 1);
-                    v_pmaphi2 = _mm_srli_epi16(v_pmaphi2, 1);
+                    v_pmaplo1 = v_pmaplo1 >> 1;
+                    v_pmaphi1 = v_pmaphi1 >> 1;
+                    v_pmaplo2 = v_pmaplo2 >> 1;
+                    v_pmaphi2 = v_pmaphi2 >> 1;
 
-                    v_pmap1 = _mm_packus_epi16(v_pmaplo1, v_pmaphi1);
-                    v_pmap2 = _mm_packus_epi16(v_pmaplo2, v_pmaphi2);
+                    v_pmap1 = v_pack(v_pmaplo1, v_pmaphi1);
+                    v_pmap2 = v_pack(v_pmaplo2, v_pmaphi2);
 
-                    v_pmap1 = _mm_sub_epi8(v_zero, v_pmap1);
-                    v_pmap2 = _mm_sub_epi8(v_zero, v_pmap2);
+                    v_pmap1 = v_reinterpret_as_u8(v_zero - v_reinterpret_as_s8(v_pmap1));
+                    v_pmap2 = v_reinterpret_as_u8(v_zero - v_reinterpret_as_s8(v_pmap2));
 
-                    _mm_storeu_si128((__m128i*)(pdst + j), v_pmap1);
-                    _mm_storeu_si128((__m128i*)(pdst + j + 16), v_pmap2);
+                    v_store((pdst + j), v_pmap1);
+                    v_store((pdst + j + 16), v_pmap2);
                 }
 
                 for(; j <= dst.cols - 16; j += 16) {
-                    __m128i v_pmap = _mm_loadu_si128((const __m128i*)(pmap + j));
+                    v_uint8x16 v_pmap = v_load((const unsigned char*)(pmap + j));
 
-                    __m128i v_pmaplo = _mm_unpacklo_epi8(v_pmap, v_zero);
-                    __m128i v_pmaphi = _mm_unpackhi_epi8(v_pmap, v_zero);
+                    v_uint16x8 v_pmaplo;
+                    v_uint16x8 v_pmaphi;
+                    v_expand(v_pmap, v_pmaplo, v_pmaphi);
 
-                    v_pmaplo = _mm_srli_epi16(v_pmaplo, 1);
-                    v_pmaphi = _mm_srli_epi16(v_pmaphi, 1);
+                    v_pmaplo = v_pmaplo >> 1;
+                    v_pmaphi = v_pmaphi >> 1;
 
-                    v_pmap = _mm_packus_epi16(v_pmaplo, v_pmaphi);
-                    v_pmap = _mm_sub_epi8(v_zero, v_pmap);
+                    v_pmap = v_pack(v_pmaplo, v_pmaphi);
+                    v_pmap = v_reinterpret_as_u8(v_zero - v_reinterpret_as_s8(v_pmap));
 
-                    _mm_storeu_si128((__m128i*)(pdst + j), v_pmap);
+                    v_store((pdst + j), v_pmap);
                 }
             }
 #endif
@@ -980,8 +961,8 @@ static void CannyImpl(Mat& dx, Mat& dy, Mat& dst,
 #define CANNY_PUSH(d)    *(d) = uchar(2), *stack_top++ = (d)
 #define CANNY_POP(d)     (d) = *--stack_top
 
-#if CV_SSE2
-    bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
+#if CV_SIMD128
+    bool haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON);
 #endif
 
     // calculate magnitude and angle of gradient, perform non-maxima suppression.
@@ -1000,33 +981,27 @@ static void CannyImpl(Mat& dx, Mat& dy, Mat& dst,
         if (!L2gradient)
        {
            int j = 0, width = cols * cn;
-#if CV_SSE2
-            if (haveSSE2)
+#if CV_SIMD128
+            if (haveSIMD)
             {
-                __m128i v_zero = _mm_setzero_si128();
                 for ( ; j <= width - 8; j += 8)
                 {
-                    __m128i v_dx = _mm_loadu_si128((const __m128i *)(_dx + j));
-                    __m128i v_dy = _mm_loadu_si128((const __m128i *)(_dy + j));
-                    v_dx = _mm_max_epi16(v_dx, _mm_sub_epi16(v_zero, v_dx));
-                    v_dy = _mm_max_epi16(v_dy, _mm_sub_epi16(v_zero, v_dy));
+                    v_int16x8 v_dx = v_load((const short*)(_dx + j));
+                    v_int16x8 v_dy = v_load((const short*)(_dy + j));
+
+                    v_int32x4 v_dx0, v_dx1, v_dy0, v_dy1;
+                    v_expand(v_dx, v_dx0, v_dx1);
+                    v_expand(v_dy, v_dy0, v_dy1);
 
-                    __m128i v_norm = _mm_add_epi32(_mm_unpacklo_epi16(v_dx, v_zero), _mm_unpacklo_epi16(v_dy, v_zero));
-                    _mm_storeu_si128((__m128i *)(_norm + j), v_norm);
+                    v_dx0 = v_reinterpret_as_s32(v_abs(v_dx0));
+                    v_dx1 = v_reinterpret_as_s32(v_abs(v_dx1));
+                    v_dy0 = v_reinterpret_as_s32(v_abs(v_dy0));
+                    v_dy1 = v_reinterpret_as_s32(v_abs(v_dy1));
 
-                    v_norm = _mm_add_epi32(_mm_unpackhi_epi16(v_dx, v_zero), _mm_unpackhi_epi16(v_dy, v_zero));
-                    _mm_storeu_si128((__m128i *)(_norm + j + 4), v_norm);
+                    v_store(_norm + j, v_dx0 + v_dy0);
+                    v_store(_norm + j + 4, v_dx1 + v_dy1);
                 }
             }
-#elif CV_NEON
-            for ( ; j <= width - 8; j += 8)
-            {
-                int16x8_t v_dx = vld1q_s16(_dx + j), v_dy = vld1q_s16(_dy + j);
-                vst1q_s32(_norm + j, vaddq_s32(vabsq_s32(vmovl_s16(vget_low_s16(v_dx))),
-                                               vabsq_s32(vmovl_s16(vget_low_s16(v_dy)))));
-                vst1q_s32(_norm + j + 4, vaddq_s32(vabsq_s32(vmovl_s16(vget_high_s16(v_dx))),
-                                                   vabsq_s32(vmovl_s16(vget_high_s16(v_dy)))));
-            }
 #endif
             for ( ; j < width; ++j)
                 _norm[j] = std::abs(int(_dx[j])) + std::abs(int(_dy[j]));
@@ -1034,34 +1009,24 @@ static void CannyImpl(Mat& dx, Mat& dy, Mat& dst,
         else
         {
             int j = 0, width = cols * cn;
-#if CV_SSE2
-            if (haveSSE2)
+#if CV_SIMD128
+            if (haveSIMD)
             {
                 for ( ; j <= width - 8; j += 8)
                 {
-                    __m128i v_dx = _mm_loadu_si128((const __m128i *)(_dx + j));
-                    __m128i v_dy = _mm_loadu_si128((const __m128i *)(_dy + j));
-
-                    __m128i v_dx_dy_ml = _mm_unpacklo_epi16(v_dx, v_dy);
-                    __m128i v_dx_dy_mh = _mm_unpackhi_epi16(v_dx, v_dy);
-                    __m128i v_norm_ml = _mm_madd_epi16(v_dx_dy_ml, v_dx_dy_ml);
-                    __m128i v_norm_mh = _mm_madd_epi16(v_dx_dy_mh, v_dx_dy_mh);
-                    _mm_storeu_si128((__m128i *)(_norm + j), v_norm_ml);
-                    _mm_storeu_si128((__m128i *)(_norm + j + 4), v_norm_mh);
+                    v_int16x8 v_dx = v_load((const short*)(_dx + j));
+                    v_int16x8 v_dy = v_load((const short*)(_dy + j));
+
+                    v_int16x8 v_dx_dy0, v_dx_dy1;
+                    v_zip(v_dx, v_dy, v_dx_dy0, v_dx_dy1);
+
+                    v_int32x4 v_dst0 = v_dotprod(v_dx_dy0, v_dx_dy0);
+                    v_int32x4 v_dst1 = v_dotprod(v_dx_dy1, v_dx_dy1);
+
+                    v_store(_norm + j, v_dst0);
+                    v_store(_norm + j + 4, v_dst1);
                 }
             }
-#elif CV_NEON
-            for ( ; j <= width - 8; j += 8)
-            {
-                int16x8_t v_dx = vld1q_s16(_dx + j), v_dy = vld1q_s16(_dy + j);
-                int16x4_t v_dxp = vget_low_s16(v_dx), v_dyp = vget_low_s16(v_dy);
-                int32x4_t v_dst = vmlal_s16(vmull_s16(v_dxp, v_dxp), v_dyp, v_dyp);
-                vst1q_s32(_norm + j, v_dst);
-
-                v_dxp = vget_high_s16(v_dx), v_dyp = vget_high_s16(v_dy);
-                v_dst = vmlal_s16(vmull_s16(v_dxp, v_dxp), v_dyp, v_dyp);
-                vst1q_s32(_norm + j + 4, v_dst);
-            }
 #endif
             for ( ; j < width; ++j)
                 _norm[j] = int(_dx[j])*_dx[j] + int(_dy[j])*_dy[j];
@@ -1112,30 +1077,31 @@ static void CannyImpl(Mat& dx, Mat& dy, Mat& dst,
         const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);
 
         int prev_flag = 0, j = 0;
-#if CV_SSE2
-        if (haveSSE2)
+#if CV_SIMD128
+        if (haveSIMD)
         {
-            __m128i v_low = _mm_set1_epi32(low), v_one = _mm_set1_epi8(1);
+            v_int32x4 v_low = v_setall_s32(low);
+            v_int8x16 v_one = v_setall_s8(1);
 
             for (; j <= cols - 16; j += 16)
             {
-                __m128i v_m1 = _mm_loadu_si128((const __m128i*)(_mag + j));
-                __m128i v_m2 = _mm_loadu_si128((const __m128i*)(_mag + j + 4));
-                __m128i v_m3 = _mm_loadu_si128((const __m128i*)(_mag + j + 8));
-                __m128i v_m4 = _mm_loadu_si128((const __m128i*)(_mag + j + 12));
+                v_int32x4 v_m1 = v_load((const int*)(_mag + j));
+                v_int32x4 v_m2 = v_load((const int*)(_mag + j + 4));
+                v_int32x4 v_m3 = v_load((const int*)(_mag + j + 8));
+                v_int32x4 v_m4 = v_load((const int*)(_mag + j + 12));
 
-                _mm_storeu_si128((__m128i*)(_map + j), v_one);
+                v_store((signed char*)(_map + j), v_one);
 
-                __m128i v_cmp1 = _mm_cmpgt_epi32(v_m1, v_low);
-                __m128i v_cmp2 = _mm_cmpgt_epi32(v_m2, v_low);
-                __m128i v_cmp3 = _mm_cmpgt_epi32(v_m3, v_low);
-                __m128i v_cmp4 = _mm_cmpgt_epi32(v_m4, v_low);
+                v_int32x4 v_cmp1 = v_m1 > v_low;
+                v_int32x4 v_cmp2 = v_m2 > v_low;
+                v_int32x4 v_cmp3 = v_m3 > v_low;
+                v_int32x4 v_cmp4 = v_m4 > v_low;
 
-                v_cmp1 = _mm_packs_epi32(v_cmp1, v_cmp2);
-                v_cmp2 = _mm_packs_epi32(v_cmp3, v_cmp4);
+                v_int16x8 v_cmp80 = v_pack(v_cmp1, v_cmp2);
+                v_int16x8 v_cmp81 = v_pack(v_cmp3, v_cmp4);
 
-                v_cmp1 = _mm_packs_epi16(v_cmp1, v_cmp2);
-                unsigned int mask = _mm_movemask_epi8(v_cmp1);
+                v_int8x16 v_cmp = v_pack(v_cmp80, v_cmp81);
+                unsigned int mask = v_signmask(v_cmp);
 
                 if (mask)
                 {
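Note: in the L2gradient paths above, v_zip + v_dotprod reproduces what _mm_madd_epi16 computed: once dx and dy are interleaved, each 32-bit lane of v_dotprod(p, p) is dx*dx + dy*dy. Likewise v_signmask stands in for _mm_movemask_epi8 (emulated on NEON), so the mask-driven scalar branch after each hunk keeps working on both back ends. A condensed sketch of the squared-magnitude identity (mirrors the hunks, not the patch itself):

    #include "opencv2/core/hal/intrin.hpp"

    // norm[k] = dx[k]*dx[k] + dy[k]*dy[k] for 8 consecutive elements
    void squared_norms8(const short* dx, const short* dy, int* norm)
    {
        cv::v_int16x8 vdx = cv::v_load(dx), vdy = cv::v_load(dy);
        cv::v_int16x8 p0, p1;
        cv::v_zip(vdx, vdy, p0, p1);                  // (dx0,dy0,dx1,dy1,...)
        cv::v_store(norm,     cv::v_dotprod(p0, p0)); // lanes 0..3
        cv::v_store(norm + 4, cv::v_dotprod(p1, p1)); // lanes 4..7
    }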