|
|
|
@ -90,28 +90,27 @@ copyMask_<uchar>(const uchar* _src, size_t sstep, const uchar* mask, size_t mste |
|
|
|
|
const uchar* src = (const uchar*)_src; |
|
|
|
|
uchar* dst = (uchar*)_dst; |
|
|
|
|
int x = 0; |
|
|
|
|
#if CV_SSE4_2 |
|
|
|
|
if(USE_SSE4_2)//
|
|
|
|
|
{ |
|
|
|
|
__m128i zero = _mm_setzero_si128 (); |
|
|
|
|
|
|
|
|
|
for( ; x <= size.width - 16; x += 16 ) |
|
|
|
|
{ |
|
|
|
|
const __m128i rSrc = _mm_lddqu_si128((const __m128i*)(src+x)); |
|
|
|
|
__m128i _mask = _mm_lddqu_si128((const __m128i*)(mask+x)); |
|
|
|
|
__m128i rDst = _mm_lddqu_si128((__m128i*)(dst+x)); |
|
|
|
|
__m128i _negMask = _mm_cmpeq_epi8(_mask, zero); |
|
|
|
|
rDst = _mm_blendv_epi8(rSrc, rDst, _negMask); |
|
|
|
|
_mm_storeu_si128((__m128i*)(dst + x), rDst); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
#elif CV_NEON |
|
|
|
|
uint8x16_t v_one = vdupq_n_u8(1); |
|
|
|
|
for( ; x <= size.width - 16; x += 16 ) |
|
|
|
|
{ |
|
|
|
|
uint8x16_t v_mask = vcgeq_u8(vld1q_u8(mask + x), v_one); |
|
|
|
|
uint8x16_t v_dst = vld1q_u8(dst + x), v_src = vld1q_u8(src + x); |
|
|
|
|
vst1q_u8(dst + x, vbslq_u8(v_mask, v_src, v_dst)); |
|
|
|
|
#if CV_SIMD128 |
|
|
|
|
if( hasSIMD128() |
|
|
|
|
#if CV_SSE4_2 |
|
|
|
|
&& USE_SSE4_2 |
|
|
|
|
#endif |
|
|
|
|
) { |
|
|
|
|
v_uint8x16 v_zero = v_setzero_u8(); |
|
|
|
|
|
|
|
|
|
for( ; x <= size.width - 16; x += 16 ) |
|
|
|
|
{ |
|
|
|
|
v_uint8x16 v_src = v_load(src + x), |
|
|
|
|
v_dst = v_load(dst + x), |
|
|
|
|
v_nmask = v_load(mask + x) == v_zero; |
|
|
|
|
|
|
|
|
|
#if CV_SSE4_2 |
|
|
|
|
v_dst = v_uint8x16(_mm_blendv_epi8(v_src.val, v_dst.val, v_nmask.val)); |
|
|
|
|
#else |
|
|
|
|
v_dst = v_select(v_nmask, v_dst, v_src); |
|
|
|
|
#endif |
|
|
|
|
v_store(dst + x, v_dst); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
#endif |
|
|
|
|
for( ; x < size.width; x++ ) |
|
|
|
@ -130,31 +129,33 @@ copyMask_<ushort>(const uchar* _src, size_t sstep, const uchar* mask, size_t mst |
|
|
|
|
const ushort* src = (const ushort*)_src; |
|
|
|
|
ushort* dst = (ushort*)_dst; |
|
|
|
|
int x = 0; |
|
|
|
|
#if CV_SSE4_2 |
|
|
|
|
if(USE_SSE4_2)//
|
|
|
|
|
{ |
|
|
|
|
__m128i zero = _mm_setzero_si128 (); |
|
|
|
|
for( ; x <= size.width - 8; x += 8 ) |
|
|
|
|
#if CV_SIMD128 |
|
|
|
|
if( hasSIMD128() |
|
|
|
|
#if CV_SSE4_2 |
|
|
|
|
&& USE_SSE4_2 |
|
|
|
|
#endif |
|
|
|
|
) { |
|
|
|
|
v_uint8x16 v_zero = v_setzero_u8(); |
|
|
|
|
|
|
|
|
|
for( ; x <= size.width - 16; x += 16 ) |
|
|
|
|
{ |
|
|
|
|
const __m128i rSrc =_mm_lddqu_si128((const __m128i*)(src+x)); |
|
|
|
|
__m128i _mask = _mm_loadl_epi64((const __m128i*)(mask+x)); |
|
|
|
|
_mask = _mm_unpacklo_epi8(_mask, _mask); |
|
|
|
|
__m128i rDst = _mm_lddqu_si128((const __m128i*)(dst+x)); |
|
|
|
|
__m128i _negMask = _mm_cmpeq_epi8(_mask, zero); |
|
|
|
|
rDst = _mm_blendv_epi8(rSrc, rDst, _negMask); |
|
|
|
|
_mm_storeu_si128((__m128i*)(dst + x), rDst); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
#elif CV_NEON |
|
|
|
|
uint8x8_t v_one = vdup_n_u8(1); |
|
|
|
|
for( ; x <= size.width - 8; x += 8 ) |
|
|
|
|
{ |
|
|
|
|
uint8x8_t v_mask = vcge_u8(vld1_u8(mask + x), v_one); |
|
|
|
|
uint8x8x2_t v_mask2 = vzip_u8(v_mask, v_mask); |
|
|
|
|
uint16x8_t v_mask_res = vreinterpretq_u16_u8(vcombine_u8(v_mask2.val[0], v_mask2.val[1])); |
|
|
|
|
|
|
|
|
|
uint16x8_t v_src = vld1q_u16(src + x), v_dst = vld1q_u16(dst + x); |
|
|
|
|
vst1q_u16(dst + x, vbslq_u16(v_mask_res, v_src, v_dst)); |
|
|
|
|
v_uint16x8 v_src1 = v_load(src + x), v_src2 = v_load(src + x + 8), |
|
|
|
|
v_dst1 = v_load(dst + x), v_dst2 = v_load(dst + x + 8); |
|
|
|
|
|
|
|
|
|
v_uint8x16 v_nmask1, v_nmask2; |
|
|
|
|
v_uint8x16 v_nmask = v_load(mask + x) == v_zero; |
|
|
|
|
v_zip(v_nmask, v_nmask, v_nmask1, v_nmask2); |
|
|
|
|
|
|
|
|
|
#if CV_SSE4_2 |
|
|
|
|
v_dst1 = v_uint16x8(_mm_blendv_epi8(v_src1.val, v_dst1.val, v_nmask1.val)); |
|
|
|
|
v_dst2 = v_uint16x8(_mm_blendv_epi8(v_src2.val, v_dst2.val, v_nmask2.val)); |
|
|
|
|
#else |
|
|
|
|
v_dst1 = v_select(v_reinterpret_as_u16(v_nmask1), v_dst1, v_src1); |
|
|
|
|
v_dst2 = v_select(v_reinterpret_as_u16(v_nmask2), v_dst2, v_src2); |
|
|
|
|
#endif |
|
|
|
|
v_store(dst + x, v_dst1); |
|
|
|
|
v_store(dst + x + 8, v_dst2); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
#endif |
|
|
|
|
for( ; x < size.width; x++ ) |
|
|
|
|