From edee922b59d3bc227905e0a2b68f86324a228650 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Mon, 12 Jan 2015 10:59:29 +0300
Subject: [PATCH] cvtColor YCrCb 2 RGB

---
Review notes (commentary only; this region between the three-dash line and
the diffstat is not part of the commit):

* This copy was rebuilt from a rendering in which all line breaks had been
  collapsed. Line structure is restored so that every hunk matches its
  header counts, but indentation and the spacing before trailing
  backslashes are reconstructed and may not match upstream byte-for-byte;
  apply with whitespace tolerance or verify against the commit above.
* Template argument lists that had been stripped are restored as <uchar>
  (YCrCb2RGB_i<uchar>, ColorChannel<uchar>, saturate_cast<uchar>). This is
  inferred from "typedef uchar channel_type" and the uchar operands --
  TODO confirm against upstream. The author address missing from the
  From: header was not reconstructed.
* Hunk 1 changes which output registers the last stage of
  _MM_DEINTERLIV_EPI8 writes to (v_g1 / v_r1 / v_b0 / v_g0 are reassigned).
* Hunk 2 appends "&& false" to the "scn == 3" test in RGB2YCrCb_i, so the
  branch guarded by it can never be taken. This looks like a temporary
  debugging switch -- confirm it is intended before merging.
* Hunk 3 adds an SSE2 specialization. Its vector loop runs only when
  dcn == 3 and consumes 96 source bytes per iteration; leftover pixels and
  the dcn == 4 case go through the scalar loop at the end of operator().
* In hunk 3 the coefficients are narrowed with (short) for the vector
  path, while the scalar loop uses the full int values. Caller-supplied
  _coeffs outside the 16-bit signed range would therefore produce
  different results on the two paths.
* In hunk 3 the member v_alpha is declared but never assigned or read.

 modules/imgproc/src/color.cpp | 170 +++++++++++++++++++++++++++++++++-
 1 file changed, 165 insertions(+), 5 deletions(-)

diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp
index 55b915b5bb..f80b80fe97 100644
--- a/modules/imgproc/src/color.cpp
+++ b/modules/imgproc/src/color.cpp
@@ -176,10 +176,10 @@ static IppStatus sts = ippInit();
     __m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8)); \
     \
     v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask)); \
-    v_r1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8)); \
-    v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask)); \
-    v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8)); \
-    v_b0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask)); \
+    v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8)); \
+    v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask)); \
+    v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8)); \
+    v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask)); \
     v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8)); \
 }
 
@@ -1852,7 +1852,7 @@ struct RGB2YCrCb_i
         int delta = ColorChannel<uchar>::half()*(1 << yuv_shift);
         n *= 3;
 
-        if (scn == 3)
+        if (scn == 3 && false)
         {
             for ( ; i <= n - 96; i += 96, src += scn * 32)
             {
@@ -2321,6 +2321,166 @@ struct YCrCb2RGB_i
     uint16x4_t v_alpha2;
 };
 
+#elif CV_SSE2
+
+template <>
+struct YCrCb2RGB_i<uchar>
+{
+    typedef uchar channel_type;
+
+    YCrCb2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
+        : dstcn(_dstcn), blueIdx(_blueIdx)
+    {
+        static const int coeffs0[] = {22987, -11698, -5636, 29049};
+        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
+
+        v_c0 = _mm_set1_epi16((short)coeffs[0]);
+        v_c1 = _mm_set1_epi16((short)coeffs[1]);
+        v_c2 = _mm_set1_epi16((short)coeffs[2]);
+        v_c3 = _mm_set1_epi16((short)coeffs[3]);
+        v_delta = _mm_set1_epi16(ColorChannel<uchar>::half());
+        v_delta2 = _mm_set1_epi32(1 << (yuv_shift - 1));
+        v_zero = _mm_setzero_si128();
+    }
+
+    // 16s x 8
+    void process(__m128i v_y, __m128i v_cr, __m128i v_cb,
+                 __m128i & v_r, __m128i & v_g, __m128i & v_b) const
+    {
+        v_cr = _mm_sub_epi16(v_cr, v_delta);
+        v_cb = _mm_sub_epi16(v_cb, v_delta);
+
+        __m128i v_y_p = _mm_unpacklo_epi16(v_y, v_zero);
+
+        __m128i v_mullo_3 = _mm_mullo_epi16(v_cb, v_c3);
+        __m128i v_mullo_2 = _mm_mullo_epi16(v_cb, v_c2);
+        __m128i v_mullo_1 = _mm_mullo_epi16(v_cr, v_c1);
+        __m128i v_mullo_0 = _mm_mullo_epi16(v_cr, v_c0);
+
+        __m128i v_mulhi_3 = _mm_mulhi_epi16(v_cb, v_c3);
+        __m128i v_mulhi_2 = _mm_mulhi_epi16(v_cb, v_c2);
+        __m128i v_mulhi_1 = _mm_mulhi_epi16(v_cr, v_c1);
+        __m128i v_mulhi_0 = _mm_mulhi_epi16(v_cr, v_c0);
+
+        __m128i v_b0 = _mm_srai_epi32(_mm_add_epi32(_mm_unpacklo_epi16(v_mullo_3, v_mulhi_3), v_delta2), yuv_shift);
+        __m128i v_g0 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi16(v_mullo_2, v_mulhi_2),
+                                                                  _mm_unpacklo_epi16(v_mullo_1, v_mulhi_1)), v_delta2),
+                                      yuv_shift);
+        __m128i v_r0 = _mm_srai_epi32(_mm_add_epi32(_mm_unpacklo_epi16(v_mullo_0, v_mulhi_0), v_delta2), yuv_shift);
+
+        v_r0 = _mm_add_epi32(v_r0, v_y_p);
+        v_g0 = _mm_add_epi32(v_g0, v_y_p);
+        v_b0 = _mm_add_epi32(v_b0, v_y_p);
+
+        v_y_p = _mm_unpackhi_epi16(v_y, v_zero);
+
+        __m128i v_b1 = _mm_srai_epi32(_mm_add_epi32(_mm_unpackhi_epi16(v_mullo_3, v_mulhi_3), v_delta2), yuv_shift);
+        __m128i v_g1 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_unpackhi_epi16(v_mullo_2, v_mulhi_2),
+                                                                  _mm_unpackhi_epi16(v_mullo_1, v_mulhi_1)), v_delta2),
+                                      yuv_shift);
+        __m128i v_r1 = _mm_srai_epi32(_mm_add_epi32(_mm_unpackhi_epi16(v_mullo_0, v_mulhi_0), v_delta2), yuv_shift);
+
+        v_r1 = _mm_add_epi32(v_r1, v_y_p);
+        v_g1 = _mm_add_epi32(v_g1, v_y_p);
+        v_b1 = _mm_add_epi32(v_b1, v_y_p);
+
+        v_r = _mm_packs_epi32(v_r0, v_r1);
+        v_g = _mm_packs_epi32(v_g0, v_g1);
+        v_b = _mm_packs_epi32(v_b0, v_b1);
+    }
+
+    void operator()(const uchar* src, uchar* dst, int n) const
+    {
+        int dcn = dstcn, bidx = blueIdx, i = 0;
+        const uchar delta = ColorChannel<uchar>::half(), alpha = ColorChannel<uchar>::max();
+        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
+        n *= 3;
+
+        if (dcn == 3)
+        {
+            for ( ; i <= n - 96; i += 96, dst += dcn * 32)
+            {
+                __m128i v_y0 = _mm_loadu_si128((__m128i const *)(src + i));
+                __m128i v_y1 = _mm_loadu_si128((__m128i const *)(src + i + 16));
+                __m128i v_cr0 = _mm_loadu_si128((__m128i const *)(src + i + 32));
+                __m128i v_cr1 = _mm_loadu_si128((__m128i const *)(src + i + 48));
+                __m128i v_cb0 = _mm_loadu_si128((__m128i const *)(src + i + 64));
+                __m128i v_cb1 = _mm_loadu_si128((__m128i const *)(src + i + 80));
+
+                _MM_DEINTERLIV_EPI8(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1)
+
+                __m128i v_r_0 = v_zero, v_g_0 = v_zero, v_b_0 = v_zero;
+                process(_mm_unpacklo_epi8(v_y0, v_zero),
+                        _mm_unpacklo_epi8(v_cr0, v_zero),
+                        _mm_unpacklo_epi8(v_cb0, v_zero),
+                        v_r_0, v_g_0, v_b_0);
+
+                __m128i v_r_1 = v_zero, v_g_1 = v_zero, v_b_1 = v_zero;
+                process(_mm_unpackhi_epi8(v_y0, v_zero),
+                        _mm_unpackhi_epi8(v_cr0, v_zero),
+                        _mm_unpackhi_epi8(v_cb0, v_zero),
+                        v_r_1, v_g_1, v_b_1);
+
+                __m128i v_r0 = _mm_packus_epi16(v_r_0, v_r_1);
+                __m128i v_g0 = _mm_packus_epi16(v_g_0, v_g_1);
+                __m128i v_b0 = _mm_packus_epi16(v_b_0, v_b_1);
+
+                process(_mm_unpacklo_epi8(v_y1, v_zero),
+                        _mm_unpacklo_epi8(v_cr1, v_zero),
+                        _mm_unpacklo_epi8(v_cb1, v_zero),
+                        v_r_0, v_g_0, v_b_0);
+
+                process(_mm_unpackhi_epi8(v_y1, v_zero),
+                        _mm_unpackhi_epi8(v_cr1, v_zero),
+                        _mm_unpackhi_epi8(v_cb1, v_zero),
+                        v_r_1, v_g_1, v_b_1);
+
+                __m128i v_r1 = _mm_packus_epi16(v_r_0, v_r_1);
+                __m128i v_g1 = _mm_packus_epi16(v_g_0, v_g_1);
+                __m128i v_b1 = _mm_packus_epi16(v_b_0, v_b_1);
+
+                if (bidx == 0)
+                {
+                    std::swap(v_r0, v_b0);
+                    std::swap(v_r1, v_b1);
+                }
+
+                _MM_INTERLIV_EPI8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1)
+
+                _mm_storeu_si128((__m128i *)(dst), v_r0);
+                _mm_storeu_si128((__m128i *)(dst + 16), v_r1);
+                _mm_storeu_si128((__m128i *)(dst + 32), v_g0);
+                _mm_storeu_si128((__m128i *)(dst + 48), v_g1);
+                _mm_storeu_si128((__m128i *)(dst + 64), v_b0);
+                _mm_storeu_si128((__m128i *)(dst + 80), v_b1);
+            }
+        }
+
+
+        for ( ; i < n; i += 3, dst += dcn)
+        {
+            uchar Y = src[i];
+            uchar Cr = src[i+1];
+            uchar Cb = src[i+2];
+
+            int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift);
+            int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift);
+            int r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift);
+
+            dst[bidx] = saturate_cast<uchar>(b);
+            dst[1] = saturate_cast<uchar>(g);
+            dst[bidx^2] = saturate_cast<uchar>(r);
+            if( dcn == 4 )
+                dst[3] = alpha;
+        }
+    }
+    int dstcn, blueIdx;
+    int coeffs[4];
+
+    __m128i v_c0, v_c1, v_c2, v_c3, v_delta2;
+    __m128i v_delta, v_alpha, v_zero;
+};
+
 #endif
 
 ////////////////////////////////////// RGB <-> XYZ ///////////////////////////////////////