diff --git a/modules/core/include/opencv2/core/sse_utils.hpp b/modules/core/include/opencv2/core/sse_utils.hpp
index 0667ae9210..7af6d84f2d 100644
--- a/modules/core/include/opencv2/core/sse_utils.hpp
+++ b/modules/core/include/opencv2/core/sse_utils.hpp
@@ -164,8 +164,38 @@ inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0
     v_a1 = _mm_unpackhi_epi8(layer4_chunk3, layer4_chunk7);
 }
 
-inline void _mm_interleavee_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
-                                 __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
+inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
+{
+    __m128i v_mask = _mm_set1_epi16(0x00ff);
+
+    __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
+    __m128i layer4_chunk2 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
+    __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
+    __m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
+
+    __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
+    __m128i layer3_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
+    __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
+    __m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
+
+    __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
+    __m128i layer2_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
+    __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
+    __m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
+
+    __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
+    __m128i layer1_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
+    __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
+    __m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
+
+    v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
+    v_g0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
+    v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
+    v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
+}
+
+inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
+                                __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
 {
     __m128i v_mask = _mm_set1_epi16(0x00ff);
 
@@ -205,8 +235,8 @@ inline void _mm_interleavee_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
     v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8));
 }
 
-inline void _mm_interleavee_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
-                                 __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
+inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
+                                __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
 {
     __m128i v_mask = _mm_set1_epi16(0x00ff);
 
@@ -353,6 +383,31 @@ inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g
 
 #if CV_SSE4_1
 
+inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
+{
+    __m128i v_mask = _mm_set1_epi32(0x0000ffff);
+
+    __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
+    __m128i layer3_chunk2 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
+    __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
+    __m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
+
+    __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
+    __m128i layer2_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
+    __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
+    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
+
+    __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
+    __m128i layer1_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
+    __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
+    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
+
+    v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
+    v_g0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
+    v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
+    v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
+}
+
 inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                  __m128i & v_b0, __m128i & v_b1)
 {
@@ -505,6 +560,26 @@ inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m
     v_a1 = _mm_unpackhi_ps(layer2_chunk3, layer2_chunk7);
 }
 
+inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
+{
+    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
+
+    __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
+    __m128 layer2_chunk2 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
+    __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
+    __m128 layer2_chunk3 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
+
+    __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
+    __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
+    __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
+    __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
+
+    v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
+    v_g0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
+    v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
+    v_g1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
+}
+
 inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
                               __m128 & v_b0, __m128 & v_b1)
 {
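Each new function above is its matching _mm_deinterleave_* routine run in reverse: every stage masks out the even elements (_mm_and_si128) and shifts down the odd ones (_mm_srli_epi16 / _mm_srli_epi32), then packs the halves back together, so the cascade of pack stages performs the perfect shuffle that turns planar registers into one interleaved stream. The operands are pre-masked into range, so the saturating packs never actually clip; the epi16 variant additionally needs _mm_packus_epi32, which is why it sits under the existing #if CV_SSE4_1 guard, while the float variant gets by with _mm_shuffle_ps alone. Below is a minimal standalone sanity check of the two-channel byte variant, assuming the patched sse_utils.hpp is on the include path; the harness itself is hypothetical and not part of the patch.

    #include <cstdio>
    #include <cstring>
    #include <emmintrin.h>
    #include "opencv2/core/sse_utils.hpp"

    int main()
    {
        unsigned char r[32], g[32], out[64], ref[64];
        for (int i = 0; i < 32; ++i)
        {
            r[i] = (unsigned char)i;          // first plane
            g[i] = (unsigned char)(i + 100);  // second plane
        }

        __m128i v_r0 = _mm_loadu_si128((const __m128i *)(r));
        __m128i v_r1 = _mm_loadu_si128((const __m128i *)(r + 16));
        __m128i v_g0 = _mm_loadu_si128((const __m128i *)(g));
        __m128i v_g1 = _mm_loadu_si128((const __m128i *)(g + 16));

        // In-place: afterwards the four registers hold the interleaved
        // stream in memory order v_r0, v_r1, v_g0, v_g1.
        _mm_interleave_epi8(v_r0, v_r1, v_g0, v_g1);

        _mm_storeu_si128((__m128i *)(out),      v_r0);
        _mm_storeu_si128((__m128i *)(out + 16), v_r1);
        _mm_storeu_si128((__m128i *)(out + 32), v_g0);
        _mm_storeu_si128((__m128i *)(out + 48), v_g1);

        for (int i = 0; i < 32; ++i)
        {
            ref[2 * i]     = r[i];            // expected: r0 g0 r1 g1 ...
            ref[2 * i + 1] = g[i];
        }
        std::printf("%s\n", std::memcmp(out, ref, 64) == 0 ? "OK" : "MISMATCH");
        return 0;
    }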
diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp
index 5c792f379d..5cb2a15fd4 100644
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@@ -472,6 +472,162 @@ MERGE4_KERNEL_TEMPLATE(VMerge4, uchar , uint8x16x4_t, vld1q_u8 , vst4q_u8 );
 MERGE4_KERNEL_TEMPLATE(VMerge4, ushort, uint16x8x4_t, vld1q_u16, vst4q_u16);
 MERGE4_KERNEL_TEMPLATE(VMerge4, int   , int32x4x4_t, vld1q_s32, vst4q_s32);
 MERGE4_KERNEL_TEMPLATE(VMerge4, int64 , int64x1x4_t, vld1_s64 , vst4_s64 );
+
+#elif CV_SSE2
+
+template <typename T>
+struct VMerge2
+{
+    VMerge2() : support(false) { }
+    void operator()(const T *, const T *, T *) const { }
+
+    bool support;
+};
+
+template <typename T>
+struct VMerge3
+{
+    VMerge3() : support(false) { }
+    void operator()(const T *, const T *, const T *, T *) const { }
+
+    bool support;
+};
+
+template <typename T>
+struct VMerge4
+{
+    VMerge4() : support(false) { }
+    void operator()(const T *, const T *, const T *, const T *, T *) const { }
+
+    bool support;
+};
+
+#define MERGE2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor)  \
+template <>                                                                             \
+struct VMerge2<data_type>                                                               \
+{                                                                                       \
+    enum                                                                                \
+    {                                                                                   \
+        ELEMS_IN_VEC = 16 / sizeof(data_type)                                           \
+    };                                                                                  \
+                                                                                        \
+    VMerge2()                                                                           \
+    {                                                                                   \
+        support = true;                                                                 \
+    }                                                                                   \
+                                                                                        \
+    void operator()(const data_type * src0, const data_type * src1,                     \
+                    data_type * dst) const                                              \
+    {                                                                                   \
+        reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0));                \
+        reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \
+        reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1));                \
+        reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \
+                                                                                        \
+        _mm_interleave(v_src0, v_src1, v_src2, v_src3);                                 \
+                                                                                        \
+        _mm_storeu_##flavor((cast_type *)(dst), v_src0);                                \
+        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1);                 \
+        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2);             \
+        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3);             \
+    }                                                                                   \
+                                                                                        \
+    bool support;                                                                       \
+}
+
+#define MERGE3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor)  \
+template <>                                                                             \
+struct VMerge3<data_type>                                                               \
+{                                                                                       \
+    enum                                                                                \
+    {                                                                                   \
+        ELEMS_IN_VEC = 16 / sizeof(data_type)                                           \
+    };                                                                                  \
+                                                                                        \
+    VMerge3()                                                                           \
+    {                                                                                   \
+        support = true;                                                                 \
+    }                                                                                   \
+                                                                                        \
+    void operator()(const data_type * src0, const data_type * src1, const data_type * src2,\
+                    data_type * dst) const                                              \
+    {                                                                                   \
+        reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0));                \
+        reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \
+        reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1));                \
+        reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \
+        reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2));                \
+        reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \
+                                                                                        \
+        _mm_interleave(v_src0, v_src1, v_src2,                                          \
+                       v_src3, v_src4, v_src5);                                         \
+                                                                                        \
+        _mm_storeu_##flavor((cast_type *)(dst), v_src0);                                \
+        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1);                 \
+        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2);             \
+        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3);             \
+        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4);             \
+        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5);             \
+    }                                                                                   \
+                                                                                        \
+    bool support;                                                                       \
+}
+
+#define MERGE4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor)  \
+template <>                                                                             \
+struct VMerge4<data_type>                                                               \
+{                                                                                       \
+    enum                                                                                \
+    {                                                                                   \
+        ELEMS_IN_VEC = 16 / sizeof(data_type)                                           \
+    };                                                                                  \
+                                                                                        \
+    VMerge4()                                                                           \
+    {                                                                                   \
+        support = true;                                                                 \
+    }                                                                                   \
+                                                                                        \
+    void operator()(const data_type * src0, const data_type * src1,                     \
+                    const data_type * src2, const data_type * src3,                     \
+                    data_type * dst) const                                              \
+    {                                                                                   \
+        reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0));                \
+        reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \
+        reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1));                \
+        reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \
+        reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2));                \
+        reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \
+        reg_type v_src6 = _mm_loadu_##flavor((const cast_type *)(src3));                \
+        reg_type v_src7 = _mm_loadu_##flavor((const cast_type *)(src3 + ELEMS_IN_VEC)); \
+                                                                                        \
+        _mm_interleave(v_src0, v_src1, v_src2, v_src3,                                  \
+                       v_src4, v_src5, v_src6, v_src7);                                 \
+                                                                                        \
+        _mm_storeu_##flavor((cast_type *)(dst), v_src0);                                \
+        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1);                 \
+        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2);             \
+        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3);             \
+        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4);             \
+        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5);             \
+        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 6), v_src6);             \
+        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 7), v_src7);             \
+    }                                                                                   \
+                                                                                        \
+    bool support;                                                                       \
+}
+
+MERGE2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128);
+MERGE2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128);
+MERGE2_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_interleave_ps, ps);
+
+MERGE3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128);
+MERGE3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128);
+MERGE3_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_interleave_ps, ps);
+
+MERGE4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128);
+MERGE4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128);
+MERGE4_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_interleave_ps, ps);
+
 #endif
 
 template<typename T> static void
@@ -499,6 +655,17 @@ merge_( const T** src, T* dst, int len, int cn )
             for( ; i < len - inc_i; i += inc_i, j += inc_j)
                 vmerge(src0 + i, src1 + i, dst + j);
         }
+#elif CV_SSE2
+        if(cn == 2)
+        {
+            int inc_i = 32/sizeof(T);
+            int inc_j = 2 * inc_i;
+
+            VMerge2<T> vmerge;
+            if (vmerge.support)
+                for( ; i < len - inc_i; i += inc_i, j += inc_j)
+                    vmerge(src0 + i, src1 + i, dst + j);
+        }
 #endif
         for( ; i < len; i++, j += cn )
         {
@@ -520,6 +687,17 @@ merge_( const T** src, T* dst, int len, int cn )
             for( ; i < len - inc_i; i += inc_i, j += inc_j)
                 vmerge(src0 + i, src1 + i, src2 + i, dst + j);
         }
+#elif CV_SSE2
+        if(cn == 3)
+        {
+            int inc_i = 32/sizeof(T);
+            int inc_j = 3 * inc_i;
+
+            VMerge3<T> vmerge;
+            if (vmerge.support)
+                for( ; i < len - inc_i; i += inc_i, j += inc_j)
+                    vmerge(src0 + i, src1 + i, src2 + i, dst + j);
+        }
 #endif
         for( ; i < len; i++, j += cn )
         {
@@ -542,6 +720,17 @@ merge_( const T** src, T* dst, int len, int cn )
             for( ; i < len - inc_i; i += inc_i, j += inc_j)
                 vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j);
         }
+#elif CV_SSE2
+        if(cn == 4)
+        {
+            int inc_i = 32/sizeof(T);
+            int inc_j = 4 * inc_i;
+
+            VMerge4<T> vmerge;
+            if (vmerge.support)
+                for( ; i < len - inc_i; i += inc_i, j += inc_j)
+                    vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j);
+        }
 #endif
         for( ; i < len; i++, j += cn )
         {
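In merge_ the new branches mirror the existing CV_NEON dispatch: each SSE2 iteration consumes 32/sizeof(T) elements per source plane (two 16-byte registers per channel), the VMerge kernels interleave them entirely in registers, and the scalar loop after #endif handles the remaining tail. Nothing changes at the API level; the following usage sketch (not part of the patch) simply exercises the new VMerge2<uchar> path on an SSE2-capable build.

    #include <vector>
    #include <opencv2/core.hpp>

    int main()
    {
        // Two single-channel planes to be merged into one 2-channel image.
        cv::Mat r(480, 640, CV_8UC1, cv::Scalar(10));
        cv::Mat g(480, 640, CV_8UC1, cv::Scalar(200));

        std::vector<cv::Mat> planes;
        planes.push_back(r);
        planes.push_back(g);

        cv::Mat rg;
        cv::merge(planes, rg);   // each pixel of rg is now (10, 200)
        CV_Assert(rg.type() == CV_8UC2);
        return 0;
    }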
diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp
index 574ff16279..19cf1357b6 100644
--- a/modules/imgproc/src/color.cpp
+++ b/modules/imgproc/src/color.cpp
@@ -2219,7 +2219,7 @@ struct RGB2YCrCb_i<uchar>
         __m128i v_cr_1 = _mm_packus_epi16(v_cr0, v_cr1);
         __m128i v_cb_1 = _mm_packus_epi16(v_cb0, v_cb1);
 
-        _mm_interleavee_epi8(v_y_0, v_y_1, v_cr_0, v_cr_1, v_cb_0, v_cb_1);
+        _mm_interleave_epi8(v_y_0, v_y_1, v_cr_0, v_cr_1, v_cb_0, v_cb_1);
 
         _mm_storeu_si128((__m128i *)(dst + i), v_y_0);
         _mm_storeu_si128((__m128i *)(dst + i + 16), v_y_1);
@@ -2988,7 +2988,7 @@ struct YCrCb2RGB_i<uchar>
             std::swap(v_r1, v_b1);
         }
 
-        _mm_interleavee_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
+        _mm_interleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
 
         _mm_storeu_si128((__m128i *)(dst), v_r0);
         _mm_storeu_si128((__m128i *)(dst + 16), v_r1);
@@ -4585,7 +4585,7 @@ struct RGB2HLS_b
            __m128i v_l1 = _mm_packus_epi16(v_l_0, v_l_1);
            __m128i v_s1 = _mm_packus_epi16(v_s_0, v_s_1);
 
-           _mm_interleavee_epi8(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1);
+           _mm_interleave_epi8(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1);
 
           _mm_storeu_si128((__m128i *)(dst + j), v_h0);
           _mm_storeu_si128((__m128i *)(dst + j + 16), v_h1);
@@ -5695,7 +5695,7 @@ struct RGB2Luv_b
            __m128i v_u1 = _mm_packus_epi16(v_u_0, v_u_1);
            __m128i v_v1 = _mm_packus_epi16(v_v_0, v_v_1);
 
-           _mm_interleavee_epi8(v_l0, v_l1, v_u0, v_u1, v_v0, v_v1);
+           _mm_interleave_epi8(v_l0, v_l1, v_u0, v_u1, v_v0, v_v1);
 
           _mm_storeu_si128((__m128i *)(dst + j), v_l0);
          _mm_storeu_si128((__m128i *)(dst + j + 16), v_l1);
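The color.cpp hunks are pure call-site updates for the rename: the helper was originally spelled _mm_interleavee_epi8 (with a doubled 'e'), and the four conversion kernels now call the corrected _mm_interleave_epi8 with unchanged arguments. Behaviorally, every interleave variant in this patch implements the same planar-to-packed transform; a scalar reference model (a hypothetical helper, not part of the patch) that any of the SIMD paths can be checked against:

    // cn planar source rows in, one interleaved row out.
    static void interleave_ref(const unsigned char * const * src,
                               unsigned char * dst, int len, int cn)
    {
        for (int i = 0; i < len; ++i)       // pixel index
            for (int c = 0; c < cn; ++c)    // channel index
                dst[i * cn + c] = src[c][i];
    }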