From 020b47c41abedcdb38e3990f11fd510f775099ff Mon Sep 17 00:00:00 2001 From: k-shinotsuka Date: Wed, 3 Aug 2016 23:42:24 +0900 Subject: [PATCH] improve RGB2Gray() --- modules/imgproc/src/color.cpp | 120 ++++++++++++++++++---------------- 1 file changed, 63 insertions(+), 57 deletions(-) diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index b08c608220..cc88c7c203 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -1492,36 +1492,47 @@ struct RGB2Gray if( blueIdx == 0 ) std::swap(coeffs[0], coeffs[2]); - v_cb = _mm_set1_epi16((short)coeffs[0]); - v_cg = _mm_set1_epi16((short)coeffs[1]); - v_cr = _mm_set1_epi16((short)coeffs[2]); v_delta = _mm_set1_epi32(1 << (yuv_shift - 1)); + v_zero = _mm_setzero_si128(); haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); } // 16s x 8 - void process(__m128i v_b, __m128i v_g, __m128i v_r, + void process(__m128i* v_rgb, __m128i* v_coeffs, __m128i & v_gray) const { - __m128i v_mullo_r = _mm_mullo_epi16(v_r, v_cr); - __m128i v_mullo_g = _mm_mullo_epi16(v_g, v_cg); - __m128i v_mullo_b = _mm_mullo_epi16(v_b, v_cb); - __m128i v_mulhi_r = _mm_mulhi_epu16(v_r, v_cr); - __m128i v_mulhi_g = _mm_mulhi_epu16(v_g, v_cg); - __m128i v_mulhi_b = _mm_mulhi_epu16(v_b, v_cb); + __m128i v_rgb_hi[4]; + v_rgb_hi[0] = _mm_cmplt_epi16(v_rgb[0], v_zero); + v_rgb_hi[1] = _mm_cmplt_epi16(v_rgb[1], v_zero); + v_rgb_hi[2] = _mm_cmplt_epi16(v_rgb[2], v_zero); + v_rgb_hi[3] = _mm_cmplt_epi16(v_rgb[3], v_zero); + + v_rgb_hi[0] = _mm_and_si128(v_rgb_hi[0], v_coeffs[1]); + v_rgb_hi[1] = _mm_and_si128(v_rgb_hi[1], v_coeffs[1]); + v_rgb_hi[2] = _mm_and_si128(v_rgb_hi[2], v_coeffs[1]); + v_rgb_hi[3] = _mm_and_si128(v_rgb_hi[3], v_coeffs[1]); + + v_rgb_hi[0] = _mm_hadd_epi16(v_rgb_hi[0], v_rgb_hi[1]); + v_rgb_hi[2] = _mm_hadd_epi16(v_rgb_hi[2], v_rgb_hi[3]); + v_rgb_hi[0] = _mm_hadd_epi16(v_rgb_hi[0], v_rgb_hi[2]); + + v_rgb[0] = _mm_madd_epi16(v_rgb[0], v_coeffs[0]); + v_rgb[1] = _mm_madd_epi16(v_rgb[1], v_coeffs[0]); + v_rgb[2] = _mm_madd_epi16(v_rgb[2], v_coeffs[0]); + v_rgb[3] = _mm_madd_epi16(v_rgb[3], v_coeffs[0]); + + v_rgb[0] = _mm_hadd_epi32(v_rgb[0], v_rgb[1]); + v_rgb[2] = _mm_hadd_epi32(v_rgb[2], v_rgb[3]); - __m128i v_gray0 = _mm_add_epi32(_mm_unpacklo_epi16(v_mullo_r, v_mulhi_r), - _mm_unpacklo_epi16(v_mullo_g, v_mulhi_g)); - v_gray0 = _mm_add_epi32(_mm_unpacklo_epi16(v_mullo_b, v_mulhi_b), v_gray0); - v_gray0 = _mm_srli_epi32(_mm_add_epi32(v_gray0, v_delta), yuv_shift); + v_rgb[0] = _mm_add_epi32(v_rgb[0], v_delta); + v_rgb[2] = _mm_add_epi32(v_rgb[2], v_delta); - __m128i v_gray1 = _mm_add_epi32(_mm_unpackhi_epi16(v_mullo_r, v_mulhi_r), - _mm_unpackhi_epi16(v_mullo_g, v_mulhi_g)); - v_gray1 = _mm_add_epi32(_mm_unpackhi_epi16(v_mullo_b, v_mulhi_b), v_gray1); - v_gray1 = _mm_srli_epi32(_mm_add_epi32(v_gray1, v_delta), yuv_shift); + v_rgb[0] = _mm_srai_epi32(v_rgb[0], yuv_shift); + v_rgb[2] = _mm_srai_epi32(v_rgb[2], yuv_shift); - v_gray = _mm_packus_epi32(v_gray0, v_gray1); + v_gray = _mm_packs_epi32(v_rgb[0], v_rgb[2]); + v_gray = _mm_add_epi16(v_gray, v_rgb_hi[0]); } void operator()(const ushort* src, ushort* dst, int n) const @@ -1530,54 +1541,49 @@ struct RGB2Gray if (scn == 3 && haveSIMD) { - for ( ; i <= n - 16; i += 16, src += scn * 16) + __m128i v_coeffs[2]; + v_coeffs[0] = _mm_set_epi16(0, (short)coeffs[2], (short)coeffs[1], (short)coeffs[0], (short)coeffs[2], (short)coeffs[1], (short)coeffs[0], 0); + v_coeffs[1] = _mm_slli_epi16(v_coeffs[0], 2); + + for ( ; i <= n - 8; i += 8, src += scn * 8) { - __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src)); - __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + 8)); - __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + 16)); - __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + 24)); - __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 32)); - __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 40)); + __m128i v_src[2]; + v_src[0] = _mm_loadu_si128((__m128i const *)(src)); + v_src[1] = _mm_loadu_si128((__m128i const *)(src + 8)); + v_src[2] = _mm_loadu_si128((__m128i const *)(src + 16)); - _mm_deinterleave_epi16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + __m128i v_rgb[4]; + v_rgb[0] = _mm_slli_si128(v_src[0], 2); + v_rgb[1] = _mm_alignr_epi8(v_src[1], v_src[0], 10); + v_rgb[2] = _mm_alignr_epi8(v_src[2], v_src[1], 6); + v_rgb[3] = _mm_srli_si128(v_src[2], 2); - __m128i v_gray0; - process(v_r0, v_g0, v_b0, - v_gray0); + __m128i v_gray; + process(v_rgb, v_coeffs, + v_gray); - __m128i v_gray1; - process(v_r1, v_g1, v_b1, - v_gray1); - - _mm_storeu_si128((__m128i *)(dst + i), v_gray0); - _mm_storeu_si128((__m128i *)(dst + i + 8), v_gray1); + _mm_storeu_si128((__m128i *)(dst + i), v_gray); } } else if (scn == 4 && haveSIMD) { - for ( ; i <= n - 16; i += 16, src += scn * 16) - { - __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src)); - __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + 8)); - __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + 16)); - __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + 24)); - __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 32)); - __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 40)); - __m128i v_a0 = _mm_loadu_si128((__m128i const *)(src + 48)); - __m128i v_a1 = _mm_loadu_si128((__m128i const *)(src + 56)); + __m128i v_coeffs[2]; + v_coeffs[0] = _mm_set_epi16(0, (short)coeffs[2], (short)coeffs[1], (short)coeffs[0], 0, (short)coeffs[2], (short)coeffs[1], (short)coeffs[0]); + v_coeffs[1] = _mm_slli_epi16(v_coeffs[0], 2); - _mm_deinterleave_epi16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1); - - __m128i v_gray0; - process(v_r0, v_g0, v_b0, - v_gray0); + for ( ; i <= n - 8; i += 8, src += scn * 8) + { + __m128i v_rgb[4]; + v_rgb[0] = _mm_loadu_si128((__m128i const *)(src)); + v_rgb[1] = _mm_loadu_si128((__m128i const *)(src + 8)); + v_rgb[2] = _mm_loadu_si128((__m128i const *)(src + 16)); + v_rgb[3] = _mm_loadu_si128((__m128i const *)(src + 24)); - __m128i v_gray1; - process(v_r1, v_g1, v_b1, - v_gray1); + __m128i v_gray; + process(v_rgb, v_coeffs, + v_gray); - _mm_storeu_si128((__m128i *)(dst + i), v_gray0); - _mm_storeu_si128((__m128i *)(dst + i + 8), v_gray1); + _mm_storeu_si128((__m128i *)(dst + i), v_gray); } } @@ -1586,8 +1592,8 @@ struct RGB2Gray } int srccn, coeffs[3]; - __m128i v_cb, v_cg, v_cr; __m128i v_delta; + __m128i v_zero; bool haveSIMD; };