core: replace raw intrinsics with universal intrinsics in copy.cpp

- use universal intrinsics instead of raw intrinsics (see the illustrative mapping below)
- add performance checks for Mat::copyTo/setTo with mask
Branch: pull/10406/head
Author: Sayed Adel
Parent: 1bc1f3d311
Commit: fd0ac962fb
Changed files:
  1. modules/core/perf/perf_mat.cpp (43)
  2. modules/core/src/copy.cpp (93)
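
For context: the "universal intrinsics" referred to above are OpenCV's portable SIMD layer (opencv2/core/hal/intrin.hpp). A single typed API (v_uint8x16, v_uint16x8, v_load, v_store, v_select, ...) compiles down to SSE, NEON, or plain C++, so one loop body replaces the separate CV_SSE4_2 and CV_NEON branches removed below. An illustrative (not exhaustive) mapping from the raw intrinsics this patch deletes to their universal counterparts:

    // raw SSE2/SSE4.2            raw NEON               universal intrinsic
    // _mm_setzero_si128()        vdupq_n_u8(0)      ->  v_setzero_u8()
    // _mm_lddqu_si128(p)         vld1q_u8(p)        ->  v_load(p)
    // _mm_storeu_si128(p, a)     vst1q_u8(p, a)     ->  v_store(p, a)
    // _mm_cmpeq_epi8(a, b)       vceqq_u8(a, b)     ->  a == b
    // _mm_blendv_epi8(b, a, m)   vbslq_u8(m, a, b)  ->  v_select(m, a, b)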

modules/core/perf/perf_mat.cpp:

@@ -57,7 +57,7 @@ PERF_TEST_P(Size_MatType, Mat_Clone,
     Size size = get<0>(GetParam());
     int type = get<1>(GetParam());
     Mat source(size.height, size.width, type);
-    Mat destination(size.height, size.width, type);;
+    Mat destination(size.height, size.width, type);
     declare.in(source, WARMUP_RNG).out(destination);
@@ -97,6 +97,47 @@ PERF_TEST_P(Size_MatType, Mat_Clone_Roi,
     SANITY_CHECK(destination, 1);
 }
 
+PERF_TEST_P(Size_MatType, Mat_CopyToWithMask,
+            testing::Combine(testing::Values(TYPICAL_MAT_SIZES),
+                             testing::Values(CV_8UC1, CV_8UC2))
+            )
+{
+    const Size_MatType_t params = GetParam();
+    const Size size = get<0>(params);
+    const int type = get<1>(params);
+
+    Mat src(size, type), dst(size, type), mask(size, CV_8UC1);
+    declare.in(src, mask, WARMUP_RNG).out(dst);
+
+    TEST_CYCLE()
+    {
+        src.copyTo(dst, mask);
+    }
+
+    SANITY_CHECK(dst);
+}
+
+PERF_TEST_P(Size_MatType, Mat_SetToWithMask,
+            testing::Combine(testing::Values(TYPICAL_MAT_SIZES),
+                             testing::Values(CV_8UC1, CV_8UC2))
+            )
+{
+    const Size_MatType_t params = GetParam();
+    const Size size = get<0>(params);
+    const int type = get<1>(params);
+    const Scalar sc = Scalar::all(27);
+
+    Mat src(size, type), mask(size, CV_8UC1);
+    declare.in(src, mask, WARMUP_RNG).out(src);
+
+    TEST_CYCLE()
+    {
+        src.setTo(sc, mask);
+    }
+
+    SANITY_CHECK(src);
+}
+
 ///////////// Transform ////////////////////////
 
 PERF_TEST_P(Size_MatType, Mat_Transform,

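With a default CMake build, the new checks can be run through the core perf module; the binary name and filter below assume the standard build layout and the Google Test-style filtering used by OpenCV's perf framework:

    ./bin/opencv_perf_core --gtest_filter='*CopyToWithMask*:*SetToWithMask*'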
modules/core/src/copy.cpp:

@@ -90,28 +90,27 @@ copyMask_<uchar>(const uchar* _src, size_t sstep, const uchar* mask, size_t mste
     const uchar* src = (const uchar*)_src;
     uchar* dst = (uchar*)_dst;
     int x = 0;
-#if CV_SSE4_2
-    if(USE_SSE4_2)//
-    {
-        __m128i zero = _mm_setzero_si128 ();
-
-        for( ; x <= size.width - 16; x += 16 )
-        {
-            const __m128i rSrc = _mm_lddqu_si128((const __m128i*)(src+x));
-            __m128i _mask = _mm_lddqu_si128((const __m128i*)(mask+x));
-            __m128i rDst = _mm_lddqu_si128((__m128i*)(dst+x));
-            __m128i _negMask = _mm_cmpeq_epi8(_mask, zero);
-            rDst = _mm_blendv_epi8(rSrc, rDst, _negMask);
-            _mm_storeu_si128((__m128i*)(dst + x), rDst);
-        }
-    }
-#elif CV_NEON
-    uint8x16_t v_one = vdupq_n_u8(1);
-    for( ; x <= size.width - 16; x += 16 )
-    {
-        uint8x16_t v_mask = vcgeq_u8(vld1q_u8(mask + x), v_one);
-        uint8x16_t v_dst = vld1q_u8(dst + x), v_src = vld1q_u8(src + x);
-        vst1q_u8(dst + x, vbslq_u8(v_mask, v_src, v_dst));
-    }
-#endif
+#if CV_SIMD128
+    if( hasSIMD128()
+#if CV_SSE4_2
+        && USE_SSE4_2
+#endif
+    ) {
+        v_uint8x16 v_zero = v_setzero_u8();
+
+        for( ; x <= size.width - 16; x += 16 )
+        {
+            v_uint8x16 v_src = v_load(src + x),
+                       v_dst = v_load(dst + x),
+                       v_nmask = v_load(mask + x) == v_zero;
+
+        #if CV_SSE4_2
+            v_dst = v_uint8x16(_mm_blendv_epi8(v_src.val, v_dst.val, v_nmask.val));
+        #else
+            v_dst = v_select(v_nmask, v_dst, v_src);
+        #endif
+            v_store(dst + x, v_dst);
+        }
+    }
+#endif
     for( ; x < size.width; x++ )
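
As a reading aid, here is a minimal, self-contained sketch of the masked-copy idiom the new uchar kernel uses, keeping only the portable v_select path (the patch additionally keeps an _mm_blendv_epi8 fast path under CV_SSE4_2); the helper name is hypothetical:

    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    // Hypothetical helper: copy src[i] -> dst[i] wherever mask[i] != 0.
    static void copy_with_mask_u8(const uchar* src, const uchar* mask, uchar* dst, int width)
    {
        int x = 0;
    #if CV_SIMD128
        v_uint8x16 v_zero = v_setzero_u8();
        for( ; x <= width - 16; x += 16 )
        {
            // all-ones lanes where mask == 0 ("keep dst"), all-zeros elsewhere
            v_uint8x16 v_nmask = v_load(mask + x) == v_zero;
            // v_select(m, a, b) returns a in lanes where m is set, b elsewhere
            v_store(dst + x, v_select(v_nmask, v_load(dst + x), v_load(src + x)));
        }
    #endif
        for( ; x < width; x++ )  // scalar tail with the same semantics
            if( mask[x] )
                dst[x] = src[x];
    }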
@@ -130,31 +129,33 @@ copyMask_<ushort>(const uchar* _src, size_t sstep, const uchar* mask, size_t mst
     const ushort* src = (const ushort*)_src;
     ushort* dst = (ushort*)_dst;
     int x = 0;
-#if CV_SSE4_2
-    if(USE_SSE4_2)//
-    {
-        __m128i zero = _mm_setzero_si128 ();
-        for( ; x <= size.width - 8; x += 8 )
-        {
-             const __m128i rSrc =_mm_lddqu_si128((const __m128i*)(src+x));
-             __m128i _mask = _mm_loadl_epi64((const __m128i*)(mask+x));
-             _mask = _mm_unpacklo_epi8(_mask, _mask);
-             __m128i rDst = _mm_lddqu_si128((const __m128i*)(dst+x));
-             __m128i _negMask = _mm_cmpeq_epi8(_mask, zero);
-             rDst = _mm_blendv_epi8(rSrc, rDst, _negMask);
-             _mm_storeu_si128((__m128i*)(dst + x), rDst);
-        }
-    }
-#elif CV_NEON
-    uint8x8_t v_one = vdup_n_u8(1);
-    for( ; x <= size.width - 8; x += 8 )
-    {
-        uint8x8_t v_mask = vcge_u8(vld1_u8(mask + x), v_one);
-        uint8x8x2_t v_mask2 = vzip_u8(v_mask, v_mask);
-        uint16x8_t v_mask_res = vreinterpretq_u16_u8(vcombine_u8(v_mask2.val[0], v_mask2.val[1]));
-
-        uint16x8_t v_src = vld1q_u16(src + x), v_dst = vld1q_u16(dst + x);
-        vst1q_u16(dst + x, vbslq_u16(v_mask_res, v_src, v_dst));
-    }
-#endif
+#if CV_SIMD128
+    if( hasSIMD128()
+#if CV_SSE4_2
+        && USE_SSE4_2
+#endif
+    ) {
+        v_uint8x16 v_zero = v_setzero_u8();
+
+        for( ; x <= size.width - 16; x += 16 )
+        {
+            v_uint16x8 v_src1 = v_load(src + x), v_src2 = v_load(src + x + 8),
+                       v_dst1 = v_load(dst + x), v_dst2 = v_load(dst + x + 8);
+
+            v_uint8x16 v_nmask1, v_nmask2;
+            v_uint8x16 v_nmask = v_load(mask + x) == v_zero;
+            v_zip(v_nmask, v_nmask, v_nmask1, v_nmask2);
+
+        #if CV_SSE4_2
+            v_dst1 = v_uint16x8(_mm_blendv_epi8(v_src1.val, v_dst1.val, v_nmask1.val));
+            v_dst2 = v_uint16x8(_mm_blendv_epi8(v_src2.val, v_dst2.val, v_nmask2.val));
+        #else
+            v_dst1 = v_select(v_reinterpret_as_u16(v_nmask1), v_dst1, v_src1);
+            v_dst2 = v_select(v_reinterpret_as_u16(v_nmask2), v_dst2, v_src2);
+        #endif
+            v_store(dst + x, v_dst1);
+            v_store(dst + x + 8, v_dst2);
+        }
+    }
+#endif
     for( ; x < size.width; x++ )
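
The ushort kernel has one extra wrinkle: the mask is 8-bit per pixel, so its lanes must be widened to 16 bits before they can drive a 16-bit select. A minimal sketch of the v_zip widening trick used above (function name hypothetical):

    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    #if CV_SIMD128
    // Hypothetical demo: expand 16 8-bit "not copied" lanes into two 16-bit masks.
    static void widen_nmask_u8_to_u16(const uchar* mask, v_uint16x8& m_lo, v_uint16x8& m_hi)
    {
        v_uint8x16 v_zero  = v_setzero_u8();
        v_uint8x16 v_nmask = v_load(mask) == v_zero;   // 0xFF where mask == 0

        // Interleaving the mask with itself duplicates every byte, so each
        // 8-bit lane becomes a 16-bit lane of 0xFFFF or 0x0000.
        v_uint8x16 v_nmask1, v_nmask2;
        v_zip(v_nmask, v_nmask, v_nmask1, v_nmask2);

        m_lo = v_reinterpret_as_u16(v_nmask1);   // covers elements 0..7
        m_hi = v_reinterpret_as_u16(v_nmask2);   // covers elements 8..15
    }
    #endif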
