Convert demosaicing with variable number of gradients to HAL - 5.5x faster

pull/15917/head
ChipKerchner 5 years ago
parent 2112aa31e6
commit 1d33335e33
      modules/imgproc/src/demosaicing.cpp
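The change replaces hand-written SSE2 intrinsics in the VNG (variable number of gradients) Bayer path with OpenCV's universal intrinsics (CV_SIMD128), so the same source vectorizes on SSE2, NEON, VSX and other 128-bit ISAs. As a rough illustration of the mapping (not part of the patch; the helper absdiff_rows_u16 is hypothetical and only assumes the opencv2/core/hal/intrin.hpp API):

#include <opencv2/core/hal/intrin.hpp>

#if CV_SSE2
// Raw SSE2 has no unsigned 16-bit absolute difference, so the old code emulated it:
#define _mm_absdiff_epu16(a,b) _mm_adds_epu16(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a))
#endif

// Hypothetical helper: per-element |a - b| over two rows of 16-bit values.
static void absdiff_rows_u16(const unsigned short* a, const unsigned short* b,
                             unsigned short* out, int n)
{
    int i = 0;
#if CV_SIMD128
    // Universal-intrinsic version: a single v_absdiff call, portable across ISAs.
    for( ; i <= n - 8; i += 8 )
    {
        cv::v_uint16x8 va = cv::v_load(a + i), vb = cv::v_load(b + i);
        cv::v_store(out + i, cv::v_absdiff(va, vb));
    }
#endif
    // Scalar tail (and the fallback when no SIMD support is available).
    for( ; i < n; i++ )
        out[i] = a[i] > b[i] ? (unsigned short)(a[i] - b[i]) : (unsigned short)(b[i] - a[i]);
}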

@@ -1027,11 +1027,6 @@ static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code )
bayer += bstep*2;
#if CV_SSE2
bool haveSSE = cv::checkHardwareSupport(CV_CPU_SSE2);
#define _mm_absdiff_epu16(a,b) _mm_adds_epu16(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a))
#endif
for( int y = 2; y < size.height - 4; y++ )
{
uchar* dstrow = dst + dststep*y + 6;
@@ -1047,52 +1042,41 @@ static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code )
i = 1;
#if CV_SSE2
if( haveSSE )
#if CV_SIMD128
for( ; i <= N-9; i += 8, srow += 8, brow += 8 )
{
__m128i z = _mm_setzero_si128();
for( ; i <= N-9; i += 8, srow += 8, brow += 8 )
{
__m128i s1, s2, s3, s4, s6, s7, s8, s9;
s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-1-bstep)),z);
s2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep)),z);
s3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+1-bstep)),z);
s4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-1)),z);
s6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+1)),z);
s7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-1+bstep)),z);
s8 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep)),z);
s9 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+1+bstep)),z);
__m128i b0, b1, b2, b3, b4, b5, b6;
b0 = _mm_adds_epu16(_mm_slli_epi16(_mm_absdiff_epu16(s2,s8),1),
_mm_adds_epu16(_mm_absdiff_epu16(s1, s7),
_mm_absdiff_epu16(s3, s9)));
b1 = _mm_adds_epu16(_mm_slli_epi16(_mm_absdiff_epu16(s4,s6),1),
_mm_adds_epu16(_mm_absdiff_epu16(s1, s3),
_mm_absdiff_epu16(s7, s9)));
b2 = _mm_slli_epi16(_mm_absdiff_epu16(s3,s7),1);
b3 = _mm_slli_epi16(_mm_absdiff_epu16(s1,s9),1);
_mm_storeu_si128((__m128i*)brow, b0);
_mm_storeu_si128((__m128i*)(brow + N), b1);
_mm_storeu_si128((__m128i*)(brow + N2), b2);
_mm_storeu_si128((__m128i*)(brow + N3), b3);
b4 = _mm_adds_epu16(b2,_mm_adds_epu16(_mm_absdiff_epu16(s2, s4),
_mm_absdiff_epu16(s6, s8)));
b5 = _mm_adds_epu16(b3,_mm_adds_epu16(_mm_absdiff_epu16(s2, s6),
_mm_absdiff_epu16(s4, s8)));
b6 = _mm_adds_epu16(_mm_adds_epu16(s2, s4), _mm_adds_epu16(s6, s8));
b6 = _mm_srli_epi16(b6, 1);
_mm_storeu_si128((__m128i*)(brow + N4), b4);
_mm_storeu_si128((__m128i*)(brow + N5), b5);
_mm_storeu_si128((__m128i*)(brow + N6), b6);
}
v_uint16x8 s1, s2, s3, s4, s6, s7, s8, s9;
s1 = v_load_expand(srow-1-bstep);
s2 = v_load_expand(srow-bstep);
s3 = v_load_expand(srow+1-bstep);
s4 = v_load_expand(srow-1);
s6 = v_load_expand(srow+1);
s7 = v_load_expand(srow-1+bstep);
s8 = v_load_expand(srow+bstep);
s9 = v_load_expand(srow+1+bstep);
v_uint16x8 b0, b1, b2, b3, b4, b5, b6;
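// The cached per-row terms roughly correspond to the scalar VNG gradients:
// b0 ~ vertical (N-S), b1 ~ horizontal (E-W), b2/b3 ~ the doubled
// anti-diagonal and diagonal differences, b4/b5 ~ the NE-SW and NW-SE
// gradients built on top of them, and b6 ~ half the sum of the four
// direct neighbours, reused when interpolating the following rows.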
b0 = (v_absdiff(s2, s8)<<1) + v_absdiff(s1, s7) + v_absdiff(s3, s9);
b1 = (v_absdiff(s4, s6)<<1) + v_absdiff(s1, s3) + v_absdiff(s7, s9);
b2 = v_absdiff(s3, s7)<<1;
b3 = v_absdiff(s1, s9)<<1;
v_store(brow, b0);
v_store(brow + N, b1);
v_store(brow + N2, b2);
v_store(brow + N3, b3);
b4 = b2 + v_absdiff(s2, s4) + v_absdiff(s6, s8);
b5 = b3 + v_absdiff(s2, s6) + v_absdiff(s4, s8);
b6 = (s2 + s4 + s6 + s8)>>1;
v_store(brow + N4, b4);
v_store(brow + N5, b5);
v_store(brow + N6, b6);
}
#endif
@@ -1122,8 +1106,8 @@ static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code )
bool greenCell = greenCell0;
i = 2;
#if CV_SSE2
int limit = !haveSSE ? N-2 : greenCell ? std::min(3, N-2) : 2;
#if CV_SIMD128
int limit = greenCell ? std::min(3, N-2) : 2;
#else
int limit = N - 2;
#endif
@@ -1290,237 +1274,229 @@ static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code )
greenCell = !greenCell;
}
#if CV_SSE2
if( !haveSSE )
break;
__m128i emask = _mm_set1_epi32(0x0000ffff),
omask = _mm_set1_epi32(0xffff0000),
z = _mm_setzero_si128(),
one = _mm_set1_epi16(1);
__m128 _0_5 = _mm_set1_ps(0.5f);
#if CV_SIMD128
v_uint32x4 emask = v_setall_u32(0x0000ffff), omask = v_setall_u32(0xffff0000);
v_uint16x8 one = v_setall_u16(1), z = v_setzero_u16();
v_float32x4 _0_5 = v_setall_f32(0.5f);
#define _mm_merge_epi16(a, b) _mm_or_si128(_mm_and_si128(a, emask), _mm_and_si128(b, omask)) //(aA_aA_aA_aA) * (bB_bB_bB_bB) => (bA_bA_bA_bA)
#define _mm_cvtloepi16_ps(a) _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(a,a), 16)) //(1,2,3,4,5,6,7,8) => (1f,2f,3f,4f)
#define _mm_cvthiepi16_ps(a) _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(a,a), 16)) //(1,2,3,4,5,6,7,8) => (5f,6f,7f,8f)
#define _mm_loadl_u8_s16(ptr, offset) _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)((ptr) + (offset))), z) //load 8 uchars to 8 shorts
#define v_merge_u16(a, b) (((a) & v_reinterpret_as_u16(emask)) | ((b) & v_reinterpret_as_u16(omask))) //(aA_aA_aA_aA) * (bB_bB_bB_bB) => (bA_bA_bA_bA)
#define v_cvt_s16f32_lo(a) v_cvt_f32(v_expand_low(v_reinterpret_as_s16(a))) //(1,2,3,4,5,6,7,8) => (1f,2f,3f,4f)
#define v_cvt_s16f32_hi(a) v_cvt_f32(v_expand_high(v_reinterpret_as_s16(a))) //(1,2,3,4,5,6,7,8) => (5f,6f,7f,8f)
// process 8 pixels at once
for( ; i <= N - 10; i += 8, srow += 8, brow0 += 8, brow1 += 8, brow2 += 8 )
{
//int gradN = brow0[0] + brow1[0];
__m128i gradN = _mm_adds_epi16(_mm_loadu_si128((__m128i*)brow0), _mm_loadu_si128((__m128i*)brow1));
v_uint16x8 gradN = v_load(brow0) + v_load(brow1);
//int gradS = brow1[0] + brow2[0];
__m128i gradS = _mm_adds_epi16(_mm_loadu_si128((__m128i*)brow1), _mm_loadu_si128((__m128i*)brow2));
v_uint16x8 gradS = v_load(brow1) + v_load(brow2);
//int gradW = brow1[N-1] + brow1[N];
__m128i gradW = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow1+N-1)), _mm_loadu_si128((__m128i*)(brow1+N)));
v_uint16x8 gradW = v_load(brow1+N-1) + v_load(brow1+N);
//int gradE = brow1[N+1] + brow1[N];
__m128i gradE = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow1+N+1)), _mm_loadu_si128((__m128i*)(brow1+N)));
v_uint16x8 gradE = v_load(brow1+N+1) + v_load(brow1+N);
//int minGrad = std::min(std::min(std::min(gradN, gradS), gradW), gradE);
//int maxGrad = std::max(std::max(std::max(gradN, gradS), gradW), gradE);
__m128i minGrad = _mm_min_epi16(_mm_min_epi16(gradN, gradS), _mm_min_epi16(gradW, gradE));
__m128i maxGrad = _mm_max_epi16(_mm_max_epi16(gradN, gradS), _mm_max_epi16(gradW, gradE));
v_uint16x8 minGrad = v_min(v_min(gradN, gradS), v_min(gradW, gradE));
v_uint16x8 maxGrad = v_max(v_max(gradN, gradS), v_max(gradW, gradE));
__m128i grad0, grad1;
v_uint16x8 grad0, grad1;
//int gradNE = brow0[N4+1] + brow1[N4];
//int gradNE = brow0[N2] + brow0[N2+1] + brow1[N2] + brow1[N2+1];
grad0 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow0+N4+1)), _mm_loadu_si128((__m128i*)(brow1+N4)));
grad1 = _mm_adds_epi16( _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow0+N2)), _mm_loadu_si128((__m128i*)(brow0+N2+1))),
_mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow1+N2)), _mm_loadu_si128((__m128i*)(brow1+N2+1))));
__m128i gradNE = _mm_merge_epi16(grad0, grad1);
grad0 = v_load(brow0+N4+1) + v_load(brow1+N4);
grad1 = v_load(brow0+N2) + v_load(brow0+N2+1) + v_load(brow1+N2) + v_load(brow1+N2+1);
v_uint16x8 gradNE = v_merge_u16(grad0, grad1);
//int gradSW = brow1[N4] + brow2[N4-1];
//int gradSW = brow1[N2] + brow1[N2-1] + brow2[N2] + brow2[N2-1];
grad0 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow2+N4-1)), _mm_loadu_si128((__m128i*)(brow1+N4)));
grad1 = _mm_adds_epi16(_mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow2+N2)), _mm_loadu_si128((__m128i*)(brow2+N2-1))),
_mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow1+N2)), _mm_loadu_si128((__m128i*)(brow1+N2-1))));
__m128i gradSW = _mm_merge_epi16(grad0, grad1);
grad0 = v_load(brow2+N4-1) + v_load(brow1+N4);
grad1 = v_load(brow2+N2) + v_load(brow2+N2-1) + v_load(brow1+N2) + v_load(brow1+N2-1);
v_uint16x8 gradSW = v_merge_u16(grad0, grad1);
minGrad = _mm_min_epi16(_mm_min_epi16(minGrad, gradNE), gradSW);
maxGrad = _mm_max_epi16(_mm_max_epi16(maxGrad, gradNE), gradSW);
minGrad = v_min(v_min(minGrad, gradNE), gradSW);
maxGrad = v_max(v_max(maxGrad, gradNE), gradSW);
//int gradNW = brow0[N5-1] + brow1[N5];
//int gradNW = brow0[N3] + brow0[N3-1] + brow1[N3] + brow1[N3-1];
grad0 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow0+N5-1)), _mm_loadu_si128((__m128i*)(brow1+N5)));
grad1 = _mm_adds_epi16(_mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow0+N3)), _mm_loadu_si128((__m128i*)(brow0+N3-1))),
_mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow1+N3)), _mm_loadu_si128((__m128i*)(brow1+N3-1))));
__m128i gradNW = _mm_merge_epi16(grad0, grad1);
grad0 = v_load(brow0+N5-1) + v_load(brow1+N5);
grad1 = v_load(brow0+N3) + v_load(brow0+N3-1) + v_load(brow1+N3) + v_load(brow1+N3-1);
v_uint16x8 gradNW = v_merge_u16(grad0, grad1);
//int gradSE = brow1[N5] + brow2[N5+1];
//int gradSE = brow1[N3] + brow1[N3+1] + brow2[N3] + brow2[N3+1];
grad0 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow2+N5+1)), _mm_loadu_si128((__m128i*)(brow1+N5)));
grad1 = _mm_adds_epi16(_mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow2+N3)), _mm_loadu_si128((__m128i*)(brow2+N3+1))),
_mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow1+N3)), _mm_loadu_si128((__m128i*)(brow1+N3+1))));
__m128i gradSE = _mm_merge_epi16(grad0, grad1);
grad0 = v_load(brow2+N5+1) + v_load(brow1+N5);
grad1 = v_load(brow2+N3) + v_load(brow2+N3+1) + v_load(brow1+N3) + v_load(brow1+N3+1);
v_uint16x8 gradSE = v_merge_u16(grad0, grad1);
minGrad = _mm_min_epi16(_mm_min_epi16(minGrad, gradNW), gradSE);
maxGrad = _mm_max_epi16(_mm_max_epi16(maxGrad, gradNW), gradSE);
minGrad = v_min(v_min(minGrad, gradNW), gradSE);
maxGrad = v_max(v_max(maxGrad, gradNW), gradSE);
//int T = minGrad + maxGrad/2;
__m128i T = _mm_adds_epi16(_mm_max_epi16(_mm_srli_epi16(maxGrad, 1), one), minGrad);
__m128i RGs = z, GRs = z, Bs = z, ng = z;
__m128i x0 = _mm_loadl_u8_s16(srow, +0 );
__m128i x1 = _mm_loadl_u8_s16(srow, -1 - bstep );
__m128i x2 = _mm_loadl_u8_s16(srow, -1 - bstep*2);
__m128i x3 = _mm_loadl_u8_s16(srow, - bstep );
__m128i x4 = _mm_loadl_u8_s16(srow, +1 - bstep*2);
__m128i x5 = _mm_loadl_u8_s16(srow, +1 - bstep );
__m128i x6 = _mm_loadl_u8_s16(srow, +2 - bstep );
__m128i x7 = _mm_loadl_u8_s16(srow, +1 );
__m128i x8 = _mm_loadl_u8_s16(srow, +2 + bstep );
__m128i x9 = _mm_loadl_u8_s16(srow, +1 + bstep );
__m128i x10 = _mm_loadl_u8_s16(srow, +1 + bstep*2);
__m128i x11 = _mm_loadl_u8_s16(srow, + bstep );
__m128i x12 = _mm_loadl_u8_s16(srow, -1 + bstep*2);
__m128i x13 = _mm_loadl_u8_s16(srow, -1 + bstep );
__m128i x14 = _mm_loadl_u8_s16(srow, -2 + bstep );
__m128i x15 = _mm_loadl_u8_s16(srow, -1 );
__m128i x16 = _mm_loadl_u8_s16(srow, -2 - bstep );
__m128i t0, t1, mask;
v_uint16x8 T = v_max((maxGrad >> 1), one) + minGrad;
v_uint16x8 RGs = z, GRs = z, Bs = z, ng = z;
v_uint16x8 x0 = v_load_expand(srow +0);
v_uint16x8 x1 = v_load_expand(srow -1 - bstep);
v_uint16x8 x2 = v_load_expand(srow -1 - bstep*2);
v_uint16x8 x3 = v_load_expand(srow - bstep);
v_uint16x8 x4 = v_load_expand(srow +1 - bstep*2);
v_uint16x8 x5 = v_load_expand(srow +1 - bstep);
v_uint16x8 x6 = v_load_expand(srow +2 - bstep);
v_uint16x8 x7 = v_load_expand(srow +1);
v_uint16x8 x8 = v_load_expand(srow +2 + bstep);
v_uint16x8 x9 = v_load_expand(srow +1 + bstep);
v_uint16x8 x10 = v_load_expand(srow +1 + bstep*2);
v_uint16x8 x11 = v_load_expand(srow + bstep);
v_uint16x8 x12 = v_load_expand(srow -1 + bstep*2);
v_uint16x8 x13 = v_load_expand(srow -1 + bstep);
v_uint16x8 x14 = v_load_expand(srow -2 + bstep);
v_uint16x8 x15 = v_load_expand(srow -1);
v_uint16x8 x16 = v_load_expand(srow -2 - bstep);
v_uint16x8 t0, t1, mask;
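// Branchless accumulation used for every direction below: (T > grad)
// produces an all-ones (0xffff) lane where the test holds, so
// subtracting the mask from ng counts the passing directions (all-ones
// is -1 as signed), and AND-ing the mask into a term adds that term
// only in the lanes where this direction contributes.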
// gradN ***********************************************
mask = _mm_cmpgt_epi16(T, gradN); // mask = T>gradN
ng = _mm_sub_epi16(ng, mask); // ng += (T>gradN)
mask = (T > gradN); // mask = T>gradN
ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask)); // ng += (T>gradN)
t0 = _mm_slli_epi16(x3, 1); // srow[-bstep]*2
t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow, -bstep*2), x0); // srow[-bstep*2] + srow[0]
t0 = (x3 << 1); // srow[-bstep]*2
t1 = v_load_expand(srow - bstep*2) + x0; // srow[-bstep*2] + srow[0]
// RGs += (srow[-bstep*2] + srow[0]) * (T>gradN)
RGs = _mm_adds_epi16(RGs, _mm_and_si128(t1, mask));
RGs += (t1 & mask);
// GRs += {srow[-bstep]*2; (srow[-bstep*2-1] + srow[-bstep*2+1])} * (T>gradN)
GRs = _mm_adds_epi16(GRs, _mm_and_si128(_mm_merge_epi16(t0, _mm_adds_epi16(x2,x4)), mask));
GRs += (v_merge_u16(t0, x2 + x4) & mask);
// Bs += {(srow[-bstep-1]+srow[-bstep+1]); srow[-bstep]*2 } * (T>gradN)
Bs = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epi16(x1,x5), t0), mask));
Bs += (v_merge_u16(x1 + x5, t0) & mask);
// gradNE **********************************************
mask = _mm_cmpgt_epi16(T, gradNE); // mask = T>gradNE
ng = _mm_sub_epi16(ng, mask); // ng += (T>gradNE)
mask = (T > gradNE); // mask = T>gradNE
ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask)); // ng += (T>gradNE)
t0 = _mm_slli_epi16(x5, 1); // srow[-bstep+1]*2
t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow, -bstep*2+2), x0); // srow[-bstep*2+2] + srow[0]
t0 = (x5 << 1); // srow[-bstep+1]*2
t1 = v_load_expand(srow - bstep*2+2) + x0; // srow[-bstep*2+2] + srow[0]
// RGs += {(srow[-bstep*2+2] + srow[0]); srow[-bstep+1]*2} * (T>gradNE)
RGs = _mm_adds_epi16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask));
RGs += (v_merge_u16(t1, t0) & mask);
// GRs += {brow0[N6+1]; (srow[-bstep*2+1] + srow[1])} * (T>gradNE)
GRs = _mm_adds_epi16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow0+N6+1)), _mm_adds_epi16(x4,x7)), mask));
GRs += (v_merge_u16(v_load(brow0+N6+1), x4 + x7) & mask);
// Bs += {srow[-bstep+1]*2; (srow[-bstep] + srow[-bstep+2])} * (T>gradNE)
Bs = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(t0,_mm_adds_epi16(x3,x6)), mask));
Bs += (v_merge_u16(t0, x3 + x6) & mask);
// gradE ***********************************************
mask = _mm_cmpgt_epi16(T, gradE); // mask = T>gradE
ng = _mm_sub_epi16(ng, mask); // ng += (T>gradE)
mask = (T > gradE); // mask = T>gradE
ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask)); // ng += (T>gradE)
t0 = _mm_slli_epi16(x7, 1); // srow[1]*2
t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow, 2), x0); // srow[2] + srow[0]
t0 = (x7 << 1); // srow[1]*2
t1 = v_load_expand(srow +2) + x0; // srow[2] + srow[0]
// RGs += (srow[2] + srow[0]) * (T>gradE)
RGs = _mm_adds_epi16(RGs, _mm_and_si128(t1, mask));
RGs += (t1 & mask);
// GRs += (srow[1]*2) * (T>gradE)
GRs = _mm_adds_epi16(GRs, _mm_and_si128(t0, mask));
GRs += (t0 & mask);
// Bs += {(srow[-bstep+1]+srow[bstep+1]); (srow[-bstep+2]+srow[bstep+2])} * (T>gradE)
Bs = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epi16(x5,x9), _mm_adds_epi16(x6,x8)), mask));
Bs += (v_merge_u16(x5 + x9, x6 + x8) & mask);
// gradSE **********************************************
mask = _mm_cmpgt_epi16(T, gradSE); // mask = T>gradSE
ng = _mm_sub_epi16(ng, mask); // ng += (T>gradSE)
mask = (T > gradSE); // mask = T>gradSE
ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask)); // ng += (T>gradSE)
t0 = _mm_slli_epi16(x9, 1); // srow[bstep+1]*2
t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow, bstep*2+2), x0); // srow[bstep*2+2] + srow[0]
t0 = (x9 << 1); // srow[bstep+1]*2
t1 = v_load_expand(srow + bstep*2+2) + x0; // srow[bstep*2+2] + srow[0]
// RGs += {(srow[bstep*2+2] + srow[0]); srow[bstep+1]*2} * (T>gradSE)
RGs = _mm_adds_epi16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask));
RGs += (v_merge_u16(t1, t0) & mask);
// GRs += {brow2[N6+1]; (srow[1]+srow[bstep*2+1])} * (T>gradSE)
GRs = _mm_adds_epi16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow2+N6+1)), _mm_adds_epi16(x7,x10)), mask));
GRs += (v_merge_u16(v_load(brow2+N6+1), x7 + x10) & mask);
// Bs += {srow[bstep+1]*2; (srow[bstep+2]+srow[bstep])} * (T>gradSE)
Bs = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_slli_epi16(x9, 1), _mm_adds_epi16(x8,x11)), mask));
Bs += (v_merge_u16((x9 << 1), x8 + x11) & mask);
// gradS ***********************************************
mask = _mm_cmpgt_epi16(T, gradS); // mask = T>gradS
ng = _mm_sub_epi16(ng, mask); // ng += (T>gradS)
mask = (T > gradS); // mask = T>gradS
ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask)); // ng += (T>gradS)
t0 = _mm_slli_epi16(x11, 1); // srow[bstep]*2
t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow,bstep*2), x0); // srow[bstep*2]+srow[0]
t0 = (x11 << 1); // srow[bstep]*2
t1 = v_load_expand(srow + bstep*2) + x0; // srow[bstep*2]+srow[0]
// RGs += (srow[bstep*2]+srow[0]) * (T>gradS)
RGs = _mm_adds_epi16(RGs, _mm_and_si128(t1, mask));
RGs += (t1 & mask);
// GRs += {srow[bstep]*2; (srow[bstep*2+1]+srow[bstep*2-1])} * (T>gradS)
GRs = _mm_adds_epi16(GRs, _mm_and_si128(_mm_merge_epi16(t0, _mm_adds_epi16(x10,x12)), mask));
GRs += (v_merge_u16(t0, x10 + x12) & mask);
// Bs += {(srow[bstep+1]+srow[bstep-1]); srow[bstep]*2} * (T>gradS)
Bs = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epi16(x9,x13), t0), mask));
Bs += (v_merge_u16(x9 + x13, t0) & mask);
// gradSW **********************************************
mask = _mm_cmpgt_epi16(T, gradSW); // mask = T>gradSW
ng = _mm_sub_epi16(ng, mask); // ng += (T>gradSW)
mask = (T > gradSW); // mask = T>gradSW
ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask)); // ng += (T>gradSW)
t0 = _mm_slli_epi16(x13, 1); // srow[bstep-1]*2
t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow, bstep*2-2), x0); // srow[bstep*2-2]+srow[0]
t0 = (x13 << 1); // srow[bstep-1]*2
t1 = v_load_expand(srow + bstep*2-2) + x0; // srow[bstep*2-2]+srow[0]
// RGs += {(srow[bstep*2-2]+srow[0]); srow[bstep-1]*2} * (T>gradSW)
RGs = _mm_adds_epi16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask));
RGs += (v_merge_u16(t1, t0) & mask);
// GRs += {brow2[N6-1]; (srow[bstep*2-1]+srow[-1])} * (T>gradSW)
GRs = _mm_adds_epi16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow2+N6-1)), _mm_adds_epi16(x12,x15)), mask));
GRs += (v_merge_u16(v_load(brow2+N6-1), x12 + x15) & mask);
// Bs += {srow[bstep-1]*2; (srow[bstep]+srow[bstep-2])} * (T>gradSW)
Bs = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(t0,_mm_adds_epi16(x11,x14)), mask));
Bs += (v_merge_u16(t0, x11 + x14) & mask);
// gradW ***********************************************
mask = _mm_cmpgt_epi16(T, gradW); // mask = T>gradW
ng = _mm_sub_epi16(ng, mask); // ng += (T>gradW)
mask = (T > gradW); // mask = T>gradW
ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask)); // ng += (T>gradW)
t0 = _mm_slli_epi16(x15, 1); // srow[-1]*2
t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow, -2), x0); // srow[-2]+srow[0]
t0 = (x15 << 1); // srow[-1]*2
t1 = v_load_expand(srow -2) + x0; // srow[-2]+srow[0]
// RGs += (srow[-2]+srow[0]) * (T>gradW)
RGs = _mm_adds_epi16(RGs, _mm_and_si128(t1, mask));
RGs += (t1 & mask);
// GRs += (srow[-1]*2) * (T>gradW)
GRs = _mm_adds_epi16(GRs, _mm_and_si128(t0, mask));
GRs += (t0 & mask);
// Bs += {(srow[-bstep-1]+srow[bstep-1]); (srow[bstep-2]+srow[-bstep-2])} * (T>gradW)
Bs = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epi16(x1,x13), _mm_adds_epi16(x14,x16)), mask));
Bs += (v_merge_u16(x1 + x13, x14 + x16) & mask);
// gradNW **********************************************
mask = _mm_cmpgt_epi16(T, gradNW); // mask = T>gradNW
ng = _mm_sub_epi16(ng, mask); // ng += (T>gradNW)
mask = (T > gradNW); // mask = T>gradNW
ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask)); // ng += (T>gradNW)
t0 = _mm_slli_epi16(x1, 1); // srow[-bstep-1]*2
t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow,-bstep*2-2), x0); // srow[-bstep*2-2]+srow[0]
t0 = (x1 << 1); // srow[-bstep-1]*2
t1 = v_load_expand(srow -bstep*2-2) + x0; // srow[-bstep*2-2]+srow[0]
// RGs += {(srow[-bstep*2-2]+srow[0]); srow[-bstep-1]*2} * (T>gradNW)
RGs = _mm_adds_epi16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask));
RGs += (v_merge_u16(t1, t0) & mask);
// GRs += {brow0[N6-1]; (srow[-bstep*2-1]+srow[-1])} * (T>gradNW)
GRs = _mm_adds_epi16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow0+N6-1)), _mm_adds_epi16(x2,x15)), mask));
GRs += (v_merge_u16(v_load(brow0+N6-1), x2 + x15) & mask);
// Bs += {srow[-bstep-1]*2; (srow[-bstep]+srow[-bstep-2])} * (T>gradNW)
Bs = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_slli_epi16(x1, 1),_mm_adds_epi16(x3,x16)), mask));
Bs += (v_merge_u16((x1 << 1), x3 + x16) & mask);
__m128 ngf0 = _mm_div_ps(_0_5, _mm_cvtloepi16_ps(ng));
__m128 ngf1 = _mm_div_ps(_0_5, _mm_cvthiepi16_ps(ng));
v_float32x4 ngf0 = _0_5 / v_cvt_s16f32_lo(ng);
v_float32x4 ngf1 = _0_5 / v_cvt_s16f32_hi(ng);
// now interpolate r, g & b
t0 = _mm_subs_epi16(GRs, RGs);
t1 = _mm_subs_epi16(Bs, RGs);
t0 = v_reinterpret_as_u16(v_reinterpret_as_s16(GRs) - v_reinterpret_as_s16(RGs));
t1 = v_reinterpret_as_u16(v_reinterpret_as_s16(Bs) - v_reinterpret_as_s16(RGs));
t0 = _mm_add_epi16(x0, _mm_packs_epi32(
_mm_cvtps_epi32(_mm_mul_ps(_mm_cvtloepi16_ps(t0), ngf0)),
_mm_cvtps_epi32(_mm_mul_ps(_mm_cvthiepi16_ps(t0), ngf1))));
t0 = v_reinterpret_as_u16(v_reinterpret_as_s16(x0) +
v_pack(
v_round(v_cvt_s16f32_lo(t0) * ngf0),
v_round(v_cvt_s16f32_hi(t0) * ngf1)));
t1 = _mm_add_epi16(x0, _mm_packs_epi32(
_mm_cvtps_epi32(_mm_mul_ps(_mm_cvtloepi16_ps(t1), ngf0)),
_mm_cvtps_epi32(_mm_mul_ps(_mm_cvthiepi16_ps(t1), ngf1))));
t1 = v_reinterpret_as_u16(v_reinterpret_as_s16(x0) +
v_pack(
v_round(v_cvt_s16f32_lo(t1) * ngf0),
v_round(v_cvt_s16f32_hi(t1) * ngf1)));
x1 = _mm_merge_epi16(x0, t0);
x2 = _mm_merge_epi16(t0, x0);
x1 = v_merge_u16(x0, t0);
x2 = v_merge_u16(t0, x0);
uchar R[8], G[8], B[8];
_mm_storel_epi64(blueIdx ? (__m128i*)B : (__m128i*)R, _mm_packus_epi16(x1, z));
_mm_storel_epi64((__m128i*)G, _mm_packus_epi16(x2, z));
_mm_storel_epi64(blueIdx ? (__m128i*)R : (__m128i*)B, _mm_packus_epi16(t1, z));
v_store_low(blueIdx ? B : R, v_pack_u(v_reinterpret_as_s16(x1), v_reinterpret_as_s16(z)));
v_store_low(G, v_pack_u(v_reinterpret_as_s16(x2), v_reinterpret_as_s16(z)));
v_store_low(blueIdx ? R : B, v_pack_u(v_reinterpret_as_s16(t1), v_reinterpret_as_s16(z)));
for( int j = 0; j < 8; j++, dstrow += 3 )
{
