diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp
index af75445e61..6331eab79b 100644
--- a/modules/imgproc/src/color.cpp
+++ b/modules/imgproc/src/color.cpp
@@ -2987,6 +2987,72 @@ struct YCrCb2RGB_i
         haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
     }
 
+#if CV_SSE4_1
+    // 16s x 8
+    void process(__m128i* v_src, __m128i* v_shuffle,
+                 __m128i* v_coeffs) const
+    {
+        __m128i v_ycrcb[3];
+        v_ycrcb[0] = _mm_shuffle_epi8(v_src[0], v_shuffle[0]);
+        v_ycrcb[1] = _mm_shuffle_epi8(_mm_alignr_epi8(v_src[1], v_src[0], 8), v_shuffle[0]);
+        v_ycrcb[2] = _mm_shuffle_epi8(v_src[1], v_shuffle[0]);
+
+        __m128i v_y[3];
+        v_y[1] = _mm_shuffle_epi8(v_src[0], v_shuffle[1]);
+        v_y[2] = _mm_srli_si128(_mm_shuffle_epi8(_mm_alignr_epi8(v_src[1], v_src[0], 15), v_shuffle[1]), 1);
+        v_y[0] = _mm_unpacklo_epi8(v_y[1], v_zero);
+        v_y[1] = _mm_unpackhi_epi8(v_y[1], v_zero);
+        v_y[2] = _mm_unpacklo_epi8(v_y[2], v_zero);
+
+        __m128i v_rgb[6];
+        v_rgb[0] = _mm_unpacklo_epi8(v_ycrcb[0], v_zero);
+        v_rgb[1] = _mm_unpackhi_epi8(v_ycrcb[0], v_zero);
+        v_rgb[2] = _mm_unpacklo_epi8(v_ycrcb[1], v_zero);
+        v_rgb[3] = _mm_unpackhi_epi8(v_ycrcb[1], v_zero);
+        v_rgb[4] = _mm_unpacklo_epi8(v_ycrcb[2], v_zero);
+        v_rgb[5] = _mm_unpackhi_epi8(v_ycrcb[2], v_zero);
+
+        v_rgb[0] = _mm_sub_epi16(v_rgb[0], v_delta);
+        v_rgb[1] = _mm_sub_epi16(v_rgb[1], v_delta);
+        v_rgb[2] = _mm_sub_epi16(v_rgb[2], v_delta);
+        v_rgb[3] = _mm_sub_epi16(v_rgb[3], v_delta);
+        v_rgb[4] = _mm_sub_epi16(v_rgb[4], v_delta);
+        v_rgb[5] = _mm_sub_epi16(v_rgb[5], v_delta);
+
+        v_rgb[0] = _mm_madd_epi16(v_rgb[0], v_coeffs[0]);
+        v_rgb[1] = _mm_madd_epi16(v_rgb[1], v_coeffs[1]);
+        v_rgb[2] = _mm_madd_epi16(v_rgb[2], v_coeffs[2]);
+        v_rgb[3] = _mm_madd_epi16(v_rgb[3], v_coeffs[0]);
+        v_rgb[4] = _mm_madd_epi16(v_rgb[4], v_coeffs[1]);
+        v_rgb[5] = _mm_madd_epi16(v_rgb[5], v_coeffs[2]);
+
+        v_rgb[0] = _mm_add_epi32(v_rgb[0], v_delta2);
+        v_rgb[1] = _mm_add_epi32(v_rgb[1], v_delta2);
+        v_rgb[2] = _mm_add_epi32(v_rgb[2], v_delta2);
+        v_rgb[3] = _mm_add_epi32(v_rgb[3], v_delta2);
+        v_rgb[4] = _mm_add_epi32(v_rgb[4], v_delta2);
+        v_rgb[5] = _mm_add_epi32(v_rgb[5], v_delta2);
+
+        v_rgb[0] = _mm_srai_epi32(v_rgb[0], yuv_shift);
+        v_rgb[1] = _mm_srai_epi32(v_rgb[1], yuv_shift);
+        v_rgb[2] = _mm_srai_epi32(v_rgb[2], yuv_shift);
+        v_rgb[3] = _mm_srai_epi32(v_rgb[3], yuv_shift);
+        v_rgb[4] = _mm_srai_epi32(v_rgb[4], yuv_shift);
+        v_rgb[5] = _mm_srai_epi32(v_rgb[5], yuv_shift);
+
+        v_rgb[0] = _mm_packs_epi32(v_rgb[0], v_rgb[1]);
+        v_rgb[2] = _mm_packs_epi32(v_rgb[2], v_rgb[3]);
+        v_rgb[4] = _mm_packs_epi32(v_rgb[4], v_rgb[5]);
+
+        v_rgb[0] = _mm_add_epi16(v_rgb[0], v_y[0]);
+        v_rgb[2] = _mm_add_epi16(v_rgb[2], v_y[1]);
+        v_rgb[4] = _mm_add_epi16(v_rgb[4], v_y[2]);
+
+        v_src[0] = _mm_packus_epi16(v_rgb[0], v_rgb[2]);
+        v_src[1] = _mm_packus_epi16(v_rgb[4], v_rgb[4]);
+    }
+#endif // CV_SSE4_1
+
     // 16s x 8
     void process(__m128i v_y, __m128i v_cr, __m128i v_cb,
                  __m128i & v_r, __m128i & v_g, __m128i & v_b) const
@@ -3040,6 +3106,91 @@ struct YCrCb2RGB_i
         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
         n *= 3;
 
+#if CV_SSE4_1
+        if (checkHardwareSupport(CV_CPU_SSE4_1) && useSSE)
+        {
+            __m128i v_shuffle[2];
+            v_shuffle[0] = _mm_set_epi8(0x8, 0x7, 0x7, 0x6, 0x6, 0x5, 0x5, 0x4, 0x4, 0x3, 0x3, 0x2, 0x2, 0x1, 0x1, 0x0);
+            v_shuffle[1] = _mm_set_epi8(0xf, 0xc, 0xc, 0xc, 0x9, 0x9, 0x9, 0x6, 0x6, 0x6, 0x3, 0x3, 0x3, 0x0, 0x0, 0x0);
+            __m128i v_coeffs[3];
+            v_coeffs[0] = _mm_set_epi16((short)C0, 0, 0, (short)C3, (short)C2, (short)C1, (short)C0, 0);
+            v_coeffs[1] = _mm_set_epi16((short)C2, (short)C1, (short)C0, 0, 0, (short)C3, (short)C2, (short)C1);
+            v_coeffs[2] = _mm_set_epi16(0, (short)C3, (short)C2, (short)C1, (short)C0, 0, 0, (short)C3);
+
+            if (dcn == 3)
+            {
+                if (bidx == 0)
+                {
+                    __m128i v_shuffle_dst = _mm_set_epi8(0xf, 0xc, 0xd, 0xe, 0x9, 0xa, 0xb, 0x6, 0x7, 0x8, 0x3, 0x4, 0x5, 0x0, 0x1, 0x2);
+                    for ( ; i <= n - 24; i += 24, dst += dcn * 8)
+                    {
+                        __m128i v_src[2];
+                        v_src[0] = _mm_loadu_si128((__m128i const *)(src + i));
+                        v_src[1] = _mm_loadl_epi64((__m128i const *)(src + i + 16));
+
+                        process(v_src, v_shuffle, v_coeffs);
+
+                        __m128i v_dst[2];
+                        v_dst[0] = _mm_shuffle_epi8(v_src[0], v_shuffle_dst);
+                        v_dst[1] = _mm_shuffle_epi8(_mm_alignr_epi8(v_src[1], v_src[0], 15), v_shuffle_dst);
+
+                        _mm_storeu_si128((__m128i *)(dst), _mm_alignr_epi8(v_dst[1], _mm_slli_si128(v_dst[0], 1), 1));
+                        _mm_storel_epi64((__m128i *)(dst + 16), _mm_srli_si128(v_dst[1], 1));
+                    }
+                }
+                else
+                {
+                    for ( ; i <= n - 24; i += 24, dst += dcn * 8)
+                    {
+                        __m128i v_src[2];
+                        v_src[0] = _mm_loadu_si128((__m128i const *)(src + i));
+                        v_src[1] = _mm_loadl_epi64((__m128i const *)(src + i + 16));
+
+                        process(v_src, v_shuffle, v_coeffs);
+
+                        _mm_storeu_si128((__m128i *)(dst), v_src[0]);
+                        _mm_storel_epi64((__m128i *)(dst + 16), v_src[1]);
+                    }
+                }
+            }
+            else
+            {
+                if (bidx == 0)
+                {
+                    __m128i v_shuffle_dst = _mm_set_epi8(0x0, 0xa, 0xb, 0xc, 0x0, 0x7, 0x8, 0x9, 0x0, 0x4, 0x5, 0x6, 0x0, 0x1, 0x2, 0x3);
+
+                    for ( ; i <= n - 24; i += 24, dst += dcn * 8)
+                    {
+                        __m128i v_src[2];
+                        v_src[0] = _mm_loadu_si128((__m128i const *)(src + i));
+                        v_src[1] = _mm_loadl_epi64((__m128i const *)(src + i + 16));
+
+                        process(v_src, v_shuffle, v_coeffs);
+
+                        _mm_storeu_si128((__m128i *)(dst), _mm_shuffle_epi8(_mm_alignr_epi8(v_src[0], v_alpha, 15), v_shuffle_dst));
+                        _mm_storeu_si128((__m128i *)(dst + 16), _mm_shuffle_epi8(_mm_alignr_epi8(_mm_alignr_epi8(v_src[1], v_src[0], 12), v_alpha, 15), v_shuffle_dst));
+                    }
+                }
+                else
+                {
+                    __m128i v_shuffle_dst = _mm_set_epi8(0x0, 0xc, 0xb, 0xa, 0x0, 0x9, 0x8, 0x7, 0x0, 0x6, 0x5, 0x4, 0x0, 0x3, 0x2, 0x1);
+
+                    for ( ; i <= n - 24; i += 24, dst += dcn * 8)
+                    {
+                        __m128i v_src[2];
+                        v_src[0] = _mm_loadu_si128((__m128i const *)(src + i));
+                        v_src[1] = _mm_loadl_epi64((__m128i const *)(src + i + 16));
+
+                        process(v_src, v_shuffle, v_coeffs);
+
+                        _mm_storeu_si128((__m128i *)(dst), _mm_shuffle_epi8(_mm_alignr_epi8(v_src[0], v_alpha, 15), v_shuffle_dst));
+                        _mm_storeu_si128((__m128i *)(dst + 16), _mm_shuffle_epi8(_mm_alignr_epi8(_mm_alignr_epi8(v_src[1], v_src[0], 12), v_alpha, 15), v_shuffle_dst));
+                    }
+                }
+            }
+        }
+        else
+#endif // CV_SSE4_1
         if (haveSIMD && useSSE)
         {
             for ( ; i <= n - 96; i += 96, dst += dcn * 32)
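
Note on the arithmetic: per pixel, the SSE4.1 path above performs the same fixed-point conversion as the scalar fallback in this file, i.e. subtract the chroma offset, multiply-accumulate with C0..C3 (_mm_madd_epi16), round via the v_delta2 add plus arithmetic shift by yuv_shift, add Y back, and saturate on the final pack (_mm_packus_epi16). Below is a minimal scalar sketch for one pixel, assuming the 8-bit constants used elsewhere in color.cpp (yuv_shift == 14, chroma offset 128); the helpers descale and saturate_u8 are illustrative stand-ins for OpenCV's CV_DESCALE and saturate_cast<uchar>, not part of the patch:

    #include <algorithm>
    #include <cstdint>

    static const int yuv_shift = 14; // fixed-point fraction bits
    static const int delta = 128;    // chroma zero point (the v_delta the SIMD code subtracts)

    // Round-to-nearest right shift: the v_delta2 add followed by _mm_srai_epi32.
    static inline int descale(int x, int n) { return (x + (1 << (n - 1))) >> n; }

    // Clamp to [0, 255]: the scalar analogue of _mm_packus_epi16.
    static inline uint8_t saturate_u8(int v) { return (uint8_t)std::min(std::max(v, 0), 255); }

    // Convert one YCrCb pixel to RGB; the process() overload above does eight of these per call.
    static void ycrcb2rgb_pixel(const uint8_t ycrcb[3], uint8_t rgb[3],
                                int C0, int C1, int C2, int C3)
    {
        int Y = ycrcb[0], Cr = ycrcb[1], Cb = ycrcb[2];
        rgb[0] = saturate_u8(Y + descale((Cr - delta) * C0, yuv_shift));                     // R
        rgb[1] = saturate_u8(Y + descale((Cr - delta) * C1 + (Cb - delta) * C2, yuv_shift)); // G
        rgb[2] = saturate_u8(Y + descale((Cb - delta) * C3, yuv_shift));                     // B
    }

Each output channel needs at most two chroma products, which is why the vector code duplicates every source byte with v_shuffle[0] and staggers C0..C3 across the three v_coeffs layouts: one _mm_madd_epi16 then yields one channel's chroma term per 32-bit lane, while v_shuffle[1] replicates each Y three times so it can be added after the shift.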