diff --git a/modules/imgproc/src/demosaicing.cpp b/modules/imgproc/src/demosaicing.cpp
index a14b6d7905..3062023ea7 100644
--- a/modules/imgproc/src/demosaicing.cpp
+++ b/modules/imgproc/src/demosaicing.cpp
@@ -86,6 +86,7 @@
 
 #include "precomp.hpp"
+#include "opencv2/core/hal/intrin.hpp"
 
 #include <limits>
 
@@ -111,7 +112,7 @@ public:
         return 0;
     }
 
-    int bayer2RGBA(const T*, int, T*, int, int) const
+    int bayer2RGBA(const T*, int, T*, int, int, const T) const
     {
         return 0;
     }
@@ -122,279 +123,14 @@ public:
     }
 };
 
-#if CV_SSE2
+#if CV_SIMD128
 class SIMDBayerInterpolator_8u
 {
 public:
-    SIMDBayerInterpolator_8u()
-    {
-        use_simd = checkHardwareSupport(CV_CPU_SSE2);
-    }
-
     int bayer2Gray(const uchar* bayer, int bayer_step, uchar* dst,
                    int width, int bcoeff, int gcoeff, int rcoeff) const
     {
-        if( !use_simd )
-            return 0;
-
-        __m128i _b2y = _mm_set1_epi16((short)(rcoeff*2));
-        __m128i _g2y = _mm_set1_epi16((short)(gcoeff*2));
-        __m128i _r2y = _mm_set1_epi16((short)(bcoeff*2));
-        const uchar* bayer_end = bayer + width;
-
-        for( ; bayer <= bayer_end - 18; bayer += 14, dst += 14 )
-        {
-            __m128i r0 = _mm_loadu_si128((const __m128i*)bayer);
-            __m128i r1 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step));
-            __m128i r2 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step*2));
-
-            __m128i b1 = _mm_add_epi16(_mm_srli_epi16(_mm_slli_epi16(r0, 8), 7),
-                                       _mm_srli_epi16(_mm_slli_epi16(r2, 8), 7));
-            __m128i b0 = _mm_add_epi16(b1, _mm_srli_si128(b1, 2));
-            b1 = _mm_slli_epi16(_mm_srli_si128(b1, 2), 1);
-
-            __m128i g0 = _mm_add_epi16(_mm_srli_epi16(r0, 7), _mm_srli_epi16(r2, 7));
-            __m128i g1 = _mm_srli_epi16(_mm_slli_epi16(r1, 8), 7);
-            g0 = _mm_add_epi16(g0, _mm_add_epi16(g1, _mm_srli_si128(g1, 2)));
-            g1 = _mm_slli_epi16(_mm_srli_si128(g1, 2), 2);
-
-            r0 = _mm_srli_epi16(r1, 8);
-            r1 = _mm_slli_epi16(_mm_add_epi16(r0, _mm_srli_si128(r0, 2)), 2);
-            r0 = _mm_slli_epi16(r0, 3);
-
-            g0 = _mm_add_epi16(_mm_mulhi_epi16(b0, _b2y), _mm_mulhi_epi16(g0, _g2y));
-            g1 = _mm_add_epi16(_mm_mulhi_epi16(b1, _b2y), _mm_mulhi_epi16(g1, _g2y));
-            g0 = _mm_add_epi16(g0, _mm_mulhi_epi16(r0, _r2y));
-            g1 = _mm_add_epi16(g1, _mm_mulhi_epi16(r1, _r2y));
-            g0 = _mm_srli_epi16(g0, 2);
-            g1 = _mm_srli_epi16(g1, 2);
-            g0 = _mm_packus_epi16(g0, g0);
-            g1 = _mm_packus_epi16(g1, g1);
-            g0 = _mm_unpacklo_epi8(g0, g1);
-            _mm_storeu_si128((__m128i*)dst, g0);
-        }
-
-        return (int)(bayer - (bayer_end - width));
-    }
-
-    int bayer2RGB(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const
-    {
-        if( !use_simd )
-            return 0;
-        /*
-         B G B G | B G B G | B G B G | B G B G
-         G R G R | G R G R | G R G R | G R G R
-         B G B G | B G B G | B G B G | B G B G
-        */
-
-        __m128i delta1 = _mm_set1_epi16(1), delta2 = _mm_set1_epi16(2);
-        __m128i mask = _mm_set1_epi16(blue < 0 ? -1 : 0), z = _mm_setzero_si128();
-        __m128i masklo = _mm_set1_epi16(0x00ff);
-        const uchar* bayer_end = bayer + width;
-
-        for( ; bayer <= bayer_end - 18; bayer += 14, dst += 42 )
-        {
-            __m128i r0 = _mm_loadu_si128((const __m128i*)bayer);
-            __m128i r1 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step));
-            __m128i r2 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step*2));
-
-            __m128i b1 = _mm_add_epi16(_mm_and_si128(r0, masklo), _mm_and_si128(r2, masklo));
-            __m128i nextb1 = _mm_srli_si128(b1, 2);
-            __m128i b0 = _mm_add_epi16(b1, nextb1);
-            b1 = _mm_srli_epi16(_mm_add_epi16(nextb1, delta1), 1);
-            b0 = _mm_srli_epi16(_mm_add_epi16(b0, delta2), 2);
-            // b0 b2 ... b14 b1 b3 ... b15
-            b0 = _mm_packus_epi16(b0, b1);
-
-            __m128i g0 = _mm_add_epi16(_mm_srli_epi16(r0, 8), _mm_srli_epi16(r2, 8));
-            __m128i g1 = _mm_and_si128(r1, masklo);
-            g0 = _mm_add_epi16(g0, _mm_add_epi16(g1, _mm_srli_si128(g1, 2)));
-            g1 = _mm_srli_si128(g1, 2);
-            g0 = _mm_srli_epi16(_mm_add_epi16(g0, delta2), 2);
-            // g0 g2 ... g14 g1 g3 ... g15
-            g0 = _mm_packus_epi16(g0, g1);
-
-            r0 = _mm_srli_epi16(r1, 8);
-            r1 = _mm_add_epi16(r0, _mm_srli_si128(r0, 2));
-            r1 = _mm_srli_epi16(_mm_add_epi16(r1, delta1), 1);
-            // r0 r2 ... r14 r1 r3 ... r15
-            r0 = _mm_packus_epi16(r0, r1);
-
-            b1 = _mm_and_si128(_mm_xor_si128(b0, r0), mask);
-            b0 = _mm_xor_si128(b0, b1);
-            r0 = _mm_xor_si128(r0, b1);
-
-            // b1 g1 b3 g3 b5 g5...
-            b1 = _mm_unpackhi_epi8(b0, g0);
-            // b0 g0 b2 g2 b4 g4 ....
-            b0 = _mm_unpacklo_epi8(b0, g0);
-
-            // r1 0 r3 0 r5 0 ...
-            r1 = _mm_unpackhi_epi8(r0, z);
-            // r0 0 r2 0 r4 0 ...
-            r0 = _mm_unpacklo_epi8(r0, z);
-
-            // 0 b0 g0 r0 0 b2 g2 r2 ...
-            g0 = _mm_slli_si128(_mm_unpacklo_epi16(b0, r0), 1);
-            // 0 b8 g8 r8 0 b10 g10 r10 ...
-            g1 = _mm_slli_si128(_mm_unpackhi_epi16(b0, r0), 1);
-
-            // b1 g1 r1 0 b3 g3 r3 0 ...
-            r0 = _mm_unpacklo_epi16(b1, r1);
-            // b9 g9 r9 0 b11 g11 r11 0 ...
-            r1 = _mm_unpackhi_epi16(b1, r1);
-
-            // 0 b0 g0 r0 b1 g1 r1 0 ...
-            b0 = _mm_srli_si128(_mm_unpacklo_epi32(g0, r0), 1);
-            // 0 b4 g4 r4 b5 g5 r5 0 ...
-            b1 = _mm_srli_si128(_mm_unpackhi_epi32(g0, r0), 1);
-
-            _mm_storel_epi64((__m128i*)(dst-1+0), b0);
-            _mm_storel_epi64((__m128i*)(dst-1+6*1), _mm_srli_si128(b0, 8));
-            _mm_storel_epi64((__m128i*)(dst-1+6*2), b1);
-            _mm_storel_epi64((__m128i*)(dst-1+6*3), _mm_srli_si128(b1, 8));
-
-            // 0 b8 g8 r8 b9 g9 r9 0 ...
-            g0 = _mm_srli_si128(_mm_unpacklo_epi32(g1, r1), 1);
-            // 0 b12 g12 r12 b13 g13 r13 0 ...
-            g1 = _mm_srli_si128(_mm_unpackhi_epi32(g1, r1), 1);
-
-            _mm_storel_epi64((__m128i*)(dst-1+6*4), g0);
-            _mm_storel_epi64((__m128i*)(dst-1+6*5), _mm_srli_si128(g0, 8));
-
-            _mm_storel_epi64((__m128i*)(dst-1+6*6), g1);
-        }
-
-        return (int)(bayer - (bayer_end - width));
-    }
-
-    int bayer2RGBA(const uchar*, int, uchar*, int, int) const
-    {
-        return 0;
-    }
-
-    int bayer2RGB_EA(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const
-    {
-        if (!use_simd)
-            return 0;
-
-        const uchar* bayer_end = bayer + width;
-        __m128i masklow = _mm_set1_epi16(0x00ff);
-        __m128i delta1 = _mm_set1_epi16(1), delta2 = _mm_set1_epi16(2);
-        __m128i full = _mm_set1_epi16(-1), z = _mm_setzero_si128();
-        __m128i mask = _mm_set1_epi16(blue > 0 ? -1 : 0);
-
-        for ( ; bayer <= bayer_end - 18; bayer += 14, dst += 42)
-        {
-            /*
-             B G B G | B G B G | B G B G | B G B G
-             G R G R | G R G R | G R G R | G R G R
-             B G B G | B G B G | B G B G | B G B G
-            */
-
-            __m128i r0 = _mm_loadu_si128((const __m128i*)bayer);
-            __m128i r1 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step));
-            __m128i r2 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step*2));
-
-            __m128i b1 = _mm_add_epi16(_mm_and_si128(r0, masklow), _mm_and_si128(r2, masklow));
-            __m128i nextb1 = _mm_srli_si128(b1, 2);
-            __m128i b0 = _mm_add_epi16(b1, nextb1);
-            b1 = _mm_srli_epi16(_mm_add_epi16(nextb1, delta1), 1);
-            b0 = _mm_srli_epi16(_mm_add_epi16(b0, delta2), 2);
-            // b0 b2 ... b14 b1 b3 ... b15
-            b0 = _mm_packus_epi16(b0, b1);
-
-            // vertical sum
-            __m128i r0g = _mm_srli_epi16(r0, 8);
-            __m128i r2g = _mm_srli_epi16(r2, 8);
-            __m128i sumv = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(r0g, r2g), delta1), 1);
-            // gorizontal sum
-            __m128i g1 = _mm_and_si128(masklow, r1);
-            __m128i nextg1 = _mm_srli_si128(g1, 2);
-            __m128i sumg = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(g1, nextg1), delta1), 1);
-
-            // gradients
-            __m128i gradv = _mm_adds_epi16(_mm_subs_epu16(r0g, r2g), _mm_subs_epu16(r2g, r0g));
-            __m128i gradg = _mm_adds_epi16(_mm_subs_epu16(nextg1, g1), _mm_subs_epu16(g1, nextg1));
-            __m128i gmask = _mm_cmpgt_epi16(gradg, gradv);
-
-            __m128i g0 = _mm_add_epi16(_mm_and_si128(gmask, sumv), _mm_and_si128(sumg, _mm_xor_si128(gmask, full)));
-            // g0 g2 ... g14 g1 g3 ...
-            g0 = _mm_packus_epi16(g0, nextg1);
-
-            r0 = _mm_srli_epi16(r1, 8);
-            r1 = _mm_add_epi16(r0, _mm_srli_si128(r0, 2));
-            r1 = _mm_srli_epi16(_mm_add_epi16(r1, delta1), 1);
-            // r0 r2 ... r14 r1 r3 ... r15
-            r0 = _mm_packus_epi16(r0, r1);
-
-            b1 = _mm_and_si128(_mm_xor_si128(b0, r0), mask);
-            b0 = _mm_xor_si128(b0, b1);
-            r0 = _mm_xor_si128(r0, b1);
-
-            // b1 g1 b3 g3 b5 g5...
-            b1 = _mm_unpackhi_epi8(b0, g0);
-            // b0 g0 b2 g2 b4 g4 ....
-            b0 = _mm_unpacklo_epi8(b0, g0);
-
-            // r1 0 r3 0 r5 0 ...
-            r1 = _mm_unpackhi_epi8(r0, z);
-            // r0 0 r2 0 r4 0 ...
-            r0 = _mm_unpacklo_epi8(r0, z);
-
-            // 0 b0 g0 r0 0 b2 g2 r2 ...
-            g0 = _mm_slli_si128(_mm_unpacklo_epi16(b0, r0), 1);
-            // 0 b8 g8 r8 0 b10 g10 r10 ...
-            g1 = _mm_slli_si128(_mm_unpackhi_epi16(b0, r0), 1);
-
-            // b1 g1 r1 0 b3 g3 r3 0 ...
-            r0 = _mm_unpacklo_epi16(b1, r1);
-            // b9 g9 r9 0 b11 g11 r11 0 ...
-            r1 = _mm_unpackhi_epi16(b1, r1);
-
-            // 0 b0 g0 r0 b1 g1 r1 0 ...
-            b0 = _mm_srli_si128(_mm_unpacklo_epi32(g0, r0), 1);
-            // 0 b4 g4 r4 b5 g5 r5 0 ...
-            b1 = _mm_srli_si128(_mm_unpackhi_epi32(g0, r0), 1);
-
-            _mm_storel_epi64((__m128i*)(dst+0), b0);
-            _mm_storel_epi64((__m128i*)(dst+6*1), _mm_srli_si128(b0, 8));
-            _mm_storel_epi64((__m128i*)(dst+6*2), b1);
-            _mm_storel_epi64((__m128i*)(dst+6*3), _mm_srli_si128(b1, 8));
-
-            // 0 b8 g8 r8 b9 g9 r9 0 ...
-            g0 = _mm_srli_si128(_mm_unpacklo_epi32(g1, r1), 1);
-            // 0 b12 g12 r12 b13 g13 r13 0 ...
-            g1 = _mm_srli_si128(_mm_unpackhi_epi32(g1, r1), 1);
-
-            _mm_storel_epi64((__m128i*)(dst+6*4), g0);
-            _mm_storel_epi64((__m128i*)(dst+6*5), _mm_srli_si128(g0, 8));
-
-            _mm_storel_epi64((__m128i*)(dst+6*6), g1);
-        }
-
-        return int(bayer - (bayer_end - width));
-    }
-
-    bool use_simd;
-};
-#elif CV_NEON
-class SIMDBayerInterpolator_8u
-{
-public:
-    SIMDBayerInterpolator_8u()
-    {
-    }
-
-    int bayer2Gray(const uchar* bayer, int bayer_step, uchar* dst,
-                   int width, int bcoeff, int gcoeff, int rcoeff) const
-    {
-        /*
-         B G B G | B G B G | B G B G | B G B G
-         G R G R | G R G R | G R G R | G R G R
-         B G B G | B G B G | B G B G | B G B G
-        */
-
+#if CV_NEON
         uint16x8_t masklo = vdupq_n_u16(255);
         const uchar* bayer_end = bayer + width;
 
@@ -440,6 +176,40 @@ public:
             vst1_u8(dst, p.val[0]);
             vst1_u8(dst + 8, p.val[1]);
         }
+#else
+        v_uint16x8 _b2y = v_setall_u16((ushort)(rcoeff*2));
+        v_uint16x8 _g2y = v_setall_u16((ushort)(gcoeff*2));
+        v_uint16x8 _r2y = v_setall_u16((ushort)(bcoeff*2));
+        const uchar* bayer_end = bayer + width;
+
+        for( ; bayer <= bayer_end - 18; bayer += 14, dst += 14 )
+        {
+            v_uint16x8 r0 = v_load((ushort*)bayer);
+            v_uint16x8 r1 = v_load((ushort*)(bayer+bayer_step));
+            v_uint16x8 r2 = v_load((ushort*)(bayer+bayer_step*2));
+
+            v_uint16x8 b1 = ((r0 << 8) >> 7) + ((r2 << 8) >> 7);
+            v_uint16x8 b0 = v_rotate_right<1>(b1) + b1;
+            b1 = v_rotate_right<1>(b1) << 1;
+
+            v_uint16x8 g0 = (r0 >> 7) + (r2 >> 7);
+            v_uint16x8 g1 = (r1 << 8) >> 7;
+            g0 += v_rotate_right<1>(g1) + g1;
+            g1 = v_rotate_right<1>(g1) << 2;
+
+            r0 = r1 >> 8;
+            r1 = (v_rotate_right<1>(r0) + r0) << 2;
+            r0 = r0 << 3;
+
+            g0 = (v_mul_hi(b0, _b2y) + v_mul_hi(g0, _g2y) + v_mul_hi(r0, _r2y)) >> 2;
+            g1 = (v_mul_hi(b1, _b2y) + v_mul_hi(g1, _g2y) + v_mul_hi(r1, _r2y)) >> 2;
+            v_uint8x16 pack_lo, pack_hi;
+            v_zip(v_pack_u(v_reinterpret_as_s16(g0), v_reinterpret_as_s16(g0)),
+                  v_pack_u(v_reinterpret_as_s16(g1), v_reinterpret_as_s16(g1)),
+                  pack_lo, pack_hi);
+            v_store(dst, pack_lo);
+        }
+#endif
 
         return (int)(bayer - (bayer_end - width));
     }
@@ -451,6 +221,8 @@ public:
          G R G R | G R G R | G R G R | G R G R
          B G B G | B G B G | B G B G | B G B G
         */
+
+#if CV_NEON
         uint16x8_t masklo = vdupq_n_u16(255);
         uint8x16x3_t pix;
         const uchar* bayer_end = bayer + width;
@@ -484,21 +256,109 @@ public:
 
             vst3q_u8(dst-1, pix);
         }
+#else
+        v_uint16x8 delta1 = v_setall_u16(1), delta2 = v_setall_u16(2);
+        v_uint16x8 mask = v_setall_u16(blue < 0 ? (ushort)(-1) : 0);
+        v_uint16x8 masklo = v_setall_u16(0x00ff);
+        v_uint8x16 z = v_setzero_u8();
+        const uchar* bayer_end = bayer + width;
+
+        for( ; bayer <= bayer_end - 18; bayer += 14, dst += 42 )
+        {
+            v_uint16x8 r0 = v_load((ushort*)bayer);
+            v_uint16x8 r1 = v_load((ushort*)(bayer+bayer_step));
+            v_uint16x8 r2 = v_load((ushort*)(bayer+bayer_step*2));
+
+            v_uint16x8 b1 = (r0 & masklo) + (r2 & masklo);
+            v_uint16x8 nextb1 = v_rotate_right<1>(b1);
+            v_uint16x8 b0 = b1 + nextb1;
+            b1 = (nextb1 + delta1) >> 1;
+            b0 = (b0 + delta2) >> 2;
+            // b0 b2 ... b14 b1 b3 ... b15
+            b0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(b0), v_reinterpret_as_s16(b1)));
+
+            v_uint16x8 g0 = (r0 >> 8) + (r2 >> 8);
+            v_uint16x8 g1 = r1 & masklo;
+            g0 += v_rotate_right<1>(g1) + g1;
+            g1 = v_rotate_right<1>(g1);
+            g0 = (g0 + delta2) >> 2;
+            // g0 g2 ... g14 g1 g3 ... g15
+            g0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(g0), v_reinterpret_as_s16(g1)));
+
+            r0 = r1 >> 8;
+            r1 = v_rotate_right<1>(r0) + r0;
+            r1 = (r1 + delta1) >> 1;
+            // r0 r2 ... r14 r1 r3 ... r15
+            r0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(r0), v_reinterpret_as_s16(r1)));
+
+            b1 = (b0 ^ r0) & mask;
+            b0 = b0 ^ b1;
+            r0 = r0 ^ b1;
+
+            // b1 g1 b3 g3 b5 g5...
+            v_uint8x16 pack_lo, pack_hi;
+            v_zip(v_reinterpret_as_u8(b0), v_reinterpret_as_u8(g0), pack_lo, pack_hi);
+            b1 = v_reinterpret_as_u16(pack_hi);
+            // b0 g0 b2 g2 b4 g4 ....
+            b0 = v_reinterpret_as_u16(pack_lo);
+
+            // r1 0 r3 0 r5 0 ...
+            v_zip(v_reinterpret_as_u8(r0), z, pack_lo, pack_hi);
+            r1 = v_reinterpret_as_u16(pack_hi);
+            // r0 0 r2 0 r4 0 ...
+            r0 = v_reinterpret_as_u16(pack_lo);
+
+            // 0 b0 g0 r0 0 b2 g2 r2 ...
+            v_zip(b0, r0, g0, g1);
+            g0 = v_reinterpret_as_u16(v_rotate_left<1>(v_reinterpret_as_u8(g0)));
+            // 0 b8 g8 r8 0 b10 g10 r10 ...
+            g1 = v_reinterpret_as_u16(v_rotate_left<1>(v_reinterpret_as_u8(g1)));
+
+            // b1 g1 r1 0 b3 g3 r3 0 ...
+            v_zip(b1, r1, r0, r1);
+            // b9 g9 r9 0 b11 g11 r11 0 ...
+
+            // 0 b0 g0 r0 b1 g1 r1 0 ...
+            v_uint32x4 pack32_lo, pack32_hi;
+            v_zip(v_reinterpret_as_u32(g0), v_reinterpret_as_u32(r0), pack32_lo, pack32_hi);
+            b0 = v_reinterpret_as_u16(v_rotate_right<1>(v_reinterpret_as_u8(pack32_lo)));
+            // 0 b4 g4 r4 b5 g5 r5 0 ...
+            b1 = v_reinterpret_as_u16(v_rotate_right<1>(v_reinterpret_as_u8(pack32_hi)));
+
+            v_store_low(dst-1+0, v_reinterpret_as_u8(b0));
+            v_store_high(dst-1+6*1, v_reinterpret_as_u8(b0));
+            v_store_low(dst-1+6*2, v_reinterpret_as_u8(b1));
+            v_store_high(dst-1+6*3, v_reinterpret_as_u8(b1));
+
+            // 0 b8 g8 r8 b9 g9 r9 0 ...
+            v_zip(v_reinterpret_as_u32(g1), v_reinterpret_as_u32(r1), pack32_lo, pack32_hi);
+            g0 = v_reinterpret_as_u16(v_rotate_right<1>(v_reinterpret_as_u8(pack32_lo)));
+            // 0 b12 g12 r12 b13 g13 r13 0 ...
+            g1 = v_reinterpret_as_u16(v_rotate_right<1>(v_reinterpret_as_u8(pack32_hi)));
+
+            v_store_low(dst-1+6*4, v_reinterpret_as_u8(g0));
+            v_store_high(dst-1+6*5, v_reinterpret_as_u8(g0));
+
+            v_store_low(dst-1+6*6, v_reinterpret_as_u8(g1));
+        }
+#endif
 
         return (int)(bayer - (bayer_end - width));
     }
 
-    int bayer2RGBA(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const
+    int bayer2RGBA(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue, const uchar alpha) const
     {
         /*
          B G B G | B G B G | B G B G | B G B G
          G R G R | G R G R | G R G R | G R G R
         B G B G | B G B G | B G B G | B G B G
         */
+
+#if CV_NEON
         uint16x8_t masklo = vdupq_n_u16(255);
         uint8x16x4_t pix;
         const uchar* bayer_end = bayer + width;
-        pix.val[3] = vdupq_n_u8(255);
+        pix.val[3] = vdupq_n_u8(alpha);
 
         for( ; bayer <= bayer_end - 18; bayer += 14, dst += 56 )
         {
@@ -529,13 +389,198 @@ public:
 
             vst4q_u8(dst-1, pix);
         }
+#else
+        v_uint16x8 delta1 = v_setall_u16(1), delta2 = v_setall_u16(2);
+        v_uint16x8 mask = v_setall_u16(blue < 0 ? (ushort)(-1) : 0);
+        v_uint16x8 masklo = v_setall_u16(0x00ff);
+        v_uint8x16 a = v_setall_u8(alpha);
+        const uchar* bayer_end = bayer + width;
+
+        for( ; bayer <= bayer_end - 18; bayer += 14, dst += 56 )
+        {
+            v_uint16x8 r0 = v_load((ushort*)bayer);
+            v_uint16x8 r1 = v_load((ushort*)(bayer+bayer_step));
+            v_uint16x8 r2 = v_load((ushort*)(bayer+bayer_step*2));
+
+            v_uint16x8 b1 = (r0 & masklo) + (r2 & masklo);
+            v_uint16x8 nextb1 = v_rotate_right<1>(b1);
+            v_uint16x8 b0 = b1 + nextb1;
+            b1 = (nextb1 + delta1) >> 1;
+            b0 = (b0 + delta2) >> 2;
+            // b0 b2 ... b14 b1 b3 ... b15
+            b0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(b0), v_reinterpret_as_s16(b1)));
+
+            v_uint16x8 g0 = (r0 >> 8) + (r2 >> 8);
+            v_uint16x8 g1 = r1 & masklo;
+            g0 += v_rotate_right<1>(g1) + g1;
+            g1 = v_rotate_right<1>(g1);
+            g0 = (g0 + delta2) >> 2;
+            // g0 g2 ... g14 g1 g3 ... g15
+            g0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(g0), v_reinterpret_as_s16(g1)));
+
+            r0 = r1 >> 8;
+            r1 = v_rotate_right<1>(r0) + r0;
+            r1 = (r1 + delta1) >> 1;
+            // r0 r2 ... r14 r1 r3 ... r15
+            r0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(r0), v_reinterpret_as_s16(r1)));
+
+            b1 = (b0 ^ r0) & mask;
+            b0 = b0 ^ b1;
+            r0 = r0 ^ b1;
+
+            // b1 g1 b3 g3 b5 g5...
+            v_uint8x16 pack_lo, pack_hi;
+            v_zip(v_reinterpret_as_u8(b0), v_reinterpret_as_u8(g0), pack_lo, pack_hi);
+            b1 = v_reinterpret_as_u16(pack_hi);
+            // b0 g0 b2 g2 b4 g4 ....
+            b0 = v_reinterpret_as_u16(pack_lo);
+
+            // r1 a r3 a r5 a ...
+            v_zip(v_reinterpret_as_u8(r0), a, pack_lo, pack_hi);
+            r1 = v_reinterpret_as_u16(pack_hi);
+            // r0 a r2 a r4 a ...
+            r0 = v_reinterpret_as_u16(pack_lo);
+
+            // a b0 g0 r0 a b2 g2 r2 ...
+            v_zip(b0, r0, g0, g1);
+            // a b8 g8 r8 a b10 g10 r10 ...
+
+            // b1 g1 r1 a b3 g3 r3 a ...
+            v_zip(b1, r1, r0, r1);
+            // b9 g9 r9 a b11 g11 r11 a ...
+
+            // a b0 g0 r0 b1 g1 r1 a ...
+            v_uint32x4 pack32_lo, pack32_hi;
+            v_zip(v_reinterpret_as_u32(g0), v_reinterpret_as_u32(r0), pack32_lo, pack32_hi);
+            b0 = v_reinterpret_as_u16(pack32_lo);
+            // a b4 g4 r4 b5 g5 r5 a ...
+            b1 = v_reinterpret_as_u16(pack32_hi);
+
+            v_store_low(dst-1+0, v_reinterpret_as_u8(b0));
+            v_store_high(dst-1+8*1, v_reinterpret_as_u8(b0));
+            v_store_low(dst-1+8*2, v_reinterpret_as_u8(b1));
+            v_store_high(dst-1+8*3, v_reinterpret_as_u8(b1));
+
+            // a b8 g8 r8 b9 g9 r9 a ...
+            v_zip(v_reinterpret_as_u32(g1), v_reinterpret_as_u32(r1), pack32_lo, pack32_hi);
+            g0 = v_reinterpret_as_u16(pack32_lo);
+            // a b12 g12 r12 b13 g13 r13 a ...
+            g1 = v_reinterpret_as_u16(pack32_hi);
+
+            v_store_low(dst-1+8*4, v_reinterpret_as_u8(g0));
+            v_store_high(dst-1+8*5, v_reinterpret_as_u8(g0));
+
+            v_store_low(dst-1+8*6, v_reinterpret_as_u8(g1));
+        }
+#endif
 
         return (int)(bayer - (bayer_end - width));
     }
 
-    int bayer2RGB_EA(const uchar*, int, uchar*, int, int) const
+    int bayer2RGB_EA(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const
     {
-        return 0;
+        const uchar* bayer_end = bayer + width;
+        v_uint16x8 masklow = v_setall_u16(0x00ff);
+        v_uint16x8 delta1 = v_setall_u16(1), delta2 = v_setall_u16(2);
+        v_uint16x8 full = v_setall_u16((ushort)(-1));
+        v_uint8x16 z = v_setzero_u8();
+        v_uint16x8 mask = v_setall_u16(blue > 0 ? (ushort)(-1) : 0);
+
+        for ( ; bayer <= bayer_end - 18; bayer += 14, dst += 42)
+        {
+            /*
+             B G B G | B G B G | B G B G | B G B G
+             G R G R | G R G R | G R G R | G R G R
+             B G B G | B G B G | B G B G | B G B G
+            */
+
+            v_uint16x8 r0 = v_load((ushort*)bayer);
+            v_uint16x8 r1 = v_load((ushort*)(bayer+bayer_step));
+            v_uint16x8 r2 = v_load((ushort*)(bayer+bayer_step*2));
+
+            v_uint16x8 b1 = (r0 & masklow) + (r2 & masklow);
+            v_uint16x8 nextb1 = v_rotate_right<1>(b1);
+            v_uint16x8 b0 = b1 + nextb1;
+            b1 = (nextb1 + delta1) >> 1;
+            b0 = (b0 + delta2) >> 2;
+            // b0 b2 ... b14 b1 b3 ... b15
+            b0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(b0), v_reinterpret_as_s16(b1)));
+
+            // vertical sum
+            v_uint16x8 r0g = r0 >> 8;
+            v_uint16x8 r2g = r2 >> 8;
+            v_uint16x8 sumv = ((r0g + r2g) + delta1) >> 1;
+            // horizontal sum
+            v_uint16x8 g1 = r1 & masklow;
+            v_uint16x8 nextg1 = v_rotate_right<1>(g1);
+            v_uint16x8 sumg = (g1 + nextg1 + delta1) >> 1;
+
+            // gradients
+            v_uint16x8 gradv = (r0g - r2g) + (r2g - r0g);
+            v_uint16x8 gradg = (nextg1 - g1) + (g1 - nextg1);
+            v_uint16x8 gmask = gradg > gradv;
+            v_uint16x8 g0 = (gmask & sumv) + (sumg & (gmask ^ full));
+            // g0 g2 ... g14 g1 g3 ...
+            g0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(g0), v_reinterpret_as_s16(nextg1)));
+
+            r0 = r1 >> 8;
+            r1 = v_rotate_right<1>(r0) + r0;
+            r1 = (r1 + delta1) >> 1;
+            // r0 r2 ... r14 r1 r3 ... r15
+            r0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(r0), v_reinterpret_as_s16(r1)));
+
+            b1 = (b0 ^ r0) & mask;
+            b0 = b0 ^ b1;
+            r0 = r0 ^ b1;
+
+            // b1 g1 b3 g3 b5 g5...
+            v_uint8x16 pack_lo, pack_hi;
+            v_zip(v_reinterpret_as_u8(b0), v_reinterpret_as_u8(g0), pack_lo, pack_hi);
+            b1 = v_reinterpret_as_u16(pack_hi);
+            // b0 g0 b2 g2 b4 g4 ....
+            b0 = v_reinterpret_as_u16(pack_lo);
+
+            // r1 0 r3 0 r5 0 ...
+            v_zip(v_reinterpret_as_u8(r0), z, pack_lo, pack_hi);
+            r1 = v_reinterpret_as_u16(pack_hi);
+            // r0 0 r2 0 r4 0 ...
+            r0 = v_reinterpret_as_u16(pack_lo);
+
+            // 0 b0 g0 r0 0 b2 g2 r2 ...
+            v_zip(b0, r0, g0, g1);
+            g0 = v_reinterpret_as_u16(v_rotate_left<1>(v_reinterpret_as_u8(g0)));
+            // 0 b8 g8 r8 0 b10 g10 r10 ...
+            g1 = v_reinterpret_as_u16(v_rotate_left<1>(v_reinterpret_as_u8(g1)));
+
+            // b1 g1 r1 0 b3 g3 r3 0 ...
+            v_zip(b1, r1, r0, r1);
+            // b9 g9 r9 0 b11 g11 r11 0 ...
+
+            // 0 b0 g0 r0 b1 g1 r1 0 ...
+            v_uint32x4 pack32_lo, pack32_hi;
+            v_zip(v_reinterpret_as_u32(g0), v_reinterpret_as_u32(r0), pack32_lo, pack32_hi);
+            b0 = v_reinterpret_as_u16(v_rotate_right<1>(v_reinterpret_as_u8(pack32_lo)));
+            // 0 b4 g4 r4 b5 g5 r5 0 ...
+            b1 = v_reinterpret_as_u16(v_rotate_right<1>(v_reinterpret_as_u8(pack32_hi)));
+
+            v_store_low(dst+0, v_reinterpret_as_u8(b0));
+            v_store_high(dst+6*1, v_reinterpret_as_u8(b0));
+            v_store_low(dst+6*2, v_reinterpret_as_u8(b1));
+            v_store_high(dst+6*3, v_reinterpret_as_u8(b1));
+
+            // 0 b8 g8 r8 b9 g9 r9 0 ...
+            v_zip(v_reinterpret_as_u32(g1), v_reinterpret_as_u32(r1), pack32_lo, pack32_hi);
+            g0 = v_reinterpret_as_u16(v_rotate_right<1>(v_reinterpret_as_u8(pack32_lo)));
+            // 0 b12 g12 r12 b13 g13 r13 0 ...
+            g1 = v_reinterpret_as_u16(v_rotate_right<1>(v_reinterpret_as_u8(pack32_hi)));
+
+            v_store_low(dst+6*4, v_reinterpret_as_u8(g0));
+            v_store_high(dst+6*5, v_reinterpret_as_u8(g0));
+
+            v_store_low(dst+6*6, v_reinterpret_as_u8(g1));
+        }
+
+        return int(bayer - (bayer_end - width));
     }
 };
 #else
@@ -775,7 +820,7 @@ public:
 
             // simd optimization only for dcn == 3
             int delta = dcn == 4 ?
-                vecOp.bayer2RGBA(bayer, bayer_step, dst, size.width, blue) :
+                vecOp.bayer2RGBA(bayer, bayer_step, dst, size.width, blue, alpha) :
                 vecOp.bayer2RGB(bayer, bayer_step, dst, size.width, blue);
            bayer += delta;
            dst += delta*dcn;
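Reviewer note (illustration, not part of the patch): the rewrite maps byte-granularity SSE2 shuffles onto lane-granularity universal intrinsics, e.g. _mm_srli_si128(x, 2) on 16-bit data becomes v_rotate_right<1>() on a v_uint16x8, and _mm_storel_epi64 plus an 8-byte shift becomes a v_store_low/v_store_high pair, so a single source path now compiles for SSE2, NEON, VSX, etc. The sketch below isolates the rounded adjacent-lane average that the interpolation builds on. It assumes only opencv2/core/hal/intrin.hpp and the fixed-width API with vector operators that this patch itself uses (very recent OpenCV releases prefer named forms such as v_add); the sample values are made up for illustration.

#include <cstdio>
#include "opencv2/core/hal/intrin.hpp"

int main()
{
#if CV_SIMD128
    ushort src[8] = { 10, 20, 30, 40, 50, 60, 70, 80 };      // illustrative samples
    cv::v_uint16x8 a = cv::v_load(src);                       // lanes 0..7
    cv::v_uint16x8 b = cv::v_rotate_right<1>(a);              // lanes 1..7, top lane becomes 0
    cv::v_uint16x8 avg = (a + b + cv::v_setall_u16(1)) >> 1;  // (a[i] + a[i+1] + 1) >> 1
    ushort out[8];
    cv::v_store(out, avg);
    for (int i = 0; i < 8; i++)
        printf("%d ", (int)out[i]);                           // 15 25 35 45 55 65 75 40
    printf("\n");
#endif
    return 0;
}

The demosaicing loops sidestep the zeroed top lane the same way the SSE2 code did: they advance by 14 pixels per iteration (bayer <= bayer_end - 18), so only the first 14 of 16 interpolated values are ever consumed. To exercise the new alpha parameter end to end, a conversion such as cv::cvtColor(bayer, dst, cv::COLOR_BayerBG2BGRA) drives the dcn == 4 branch that now forwards alpha into bayer2RGBA.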