From e345cb03d5c93f85673d1df832c69f241de94645 Mon Sep 17 00:00:00 2001 From: Vitaly Tuzov Date: Fri, 31 Aug 2018 16:54:05 +0300 Subject: [PATCH] Bit-exact resize reworked to use wide intrinsics (#12038) * Bit-exact resize reworked to use wide intrinsics * Reworked bit-exact resize row data loading * Added bit-exact resize row data loaders for SIMD256 and SIMD512 * Fixed type punned pointer dereferencing warning * Reworked loading of source data for SIMD256 and SIMD512 bit-exact resize --- modules/imgproc/src/resize.cpp | 399 ++++++++++++++++++++++++--------- 1 file changed, 289 insertions(+), 110 deletions(-) diff --git a/modules/imgproc/src/resize.cpp b/modules/imgproc/src/resize.cpp index 016fff03d9..2ff84753f7 100644 --- a/modules/imgproc/src/resize.cpp +++ b/modules/imgproc/src/resize.cpp @@ -340,51 +340,187 @@ static void hlineResizeCn(ET* src, int cn, int *ofst, FT* m, FT* dst, int dst_mi hline::ResizeCn(src, cn, ofst, m, dst, dst_min, dst_max, dst_width); }; +#if CV_SIMD512 +inline void v_load_indexed1(uint8_t* src, int *ofst, v_uint16 &v_src0, v_uint16 &v_src1) +{ + v_expand(v_reinterpret_as_u8(v_uint16( + *((uint16_t*)(src + ofst[ 0])), *((uint16_t*)(src + ofst[ 1])), *((uint16_t*)(src + ofst[ 2])), *((uint16_t*)(src + ofst[ 3])), + *((uint16_t*)(src + ofst[ 4])), *((uint16_t*)(src + ofst[ 5])), *((uint16_t*)(src + ofst[ 6])), *((uint16_t*)(src + ofst[ 7])), + *((uint16_t*)(src + ofst[ 8])), *((uint16_t*)(src + ofst[ 9])), *((uint16_t*)(src + ofst[10])), *((uint16_t*)(src + ofst[11])), + *((uint16_t*)(src + ofst[12])), *((uint16_t*)(src + ofst[13])), *((uint16_t*)(src + ofst[14])), *((uint16_t*)(src + ofst[15])), + *((uint16_t*)(src + ofst[16])), *((uint16_t*)(src + ofst[17])), *((uint16_t*)(src + ofst[18])), *((uint16_t*)(src + ofst[19])), + *((uint16_t*)(src + ofst[20])), *((uint16_t*)(src + ofst[21])), *((uint16_t*)(src + ofst[22])), *((uint16_t*)(src + ofst[23])), + *((uint16_t*)(src + ofst[24])), *((uint16_t*)(src + ofst[25])), *((uint16_t*)(src + ofst[26])), *((uint16_t*)(src + ofst[27])), + *((uint16_t*)(src + ofst[28])), *((uint16_t*)(src + ofst[29])), *((uint16_t*)(src + ofst[30])), *((uint16_t*)(src + ofst[31])))), + v_src0, v_src1); +} +inline void v_load_indexed2(uint8_t* src, int *ofst, v_uint16 &v_src0, v_uint16 &v_src1) +{ + v_expand(v_reinterpret_as_u8(v_uint32( + *((uint32_t*)(src + 2 * ofst[ 0])), *((uint32_t*)(src + 2 * ofst[ 1])), *((uint32_t*)(src + 2 * ofst[ 2])), *((uint32_t*)(src + 2 * ofst[ 3])), + *((uint32_t*)(src + 2 * ofst[ 4])), *((uint32_t*)(src + 2 * ofst[ 5])), *((uint32_t*)(src + 2 * ofst[ 6])), *((uint32_t*)(src + 2 * ofst[ 7])), + *((uint32_t*)(src + 2 * ofst[ 8])), *((uint32_t*)(src + 2 * ofst[ 9])), *((uint32_t*)(src + 2 * ofst[10])), *((uint32_t*)(src + 2 * ofst[11])), + *((uint32_t*)(src + 2 * ofst[12])), *((uint32_t*)(src + 2 * ofst[13])), *((uint32_t*)(src + 2 * ofst[14])), *((uint32_t*)(src + 2 * ofst[15])))), + v_src0, v_src1); + v_uint32 v_tmp0, v_tmp1, v_tmp2, v_tmp3; + v_zip(v_reinterpret_as_u32(v_src0), v_reinterpret_as_u32(v_src1), v_tmp2, v_tmp3); + v_zip(v_tmp2, v_tmp3, v_tmp0, v_tmp1); + v_zip(v_tmp0, v_tmp1, v_tmp2, v_tmp3); + v_zip(v_tmp2, v_tmp3, v_tmp0, v_tmp1); + v_zip(v_reinterpret_as_u16(v_tmp0), v_reinterpret_as_u16(v_tmp1), v_src0, v_src1); +} +inline void v_load_indexed4(uint8_t* src, int *ofst, v_uint16 &v_src0, v_uint16 &v_src1) +{ + v_expand(v_reinterpret_as_u8(v_uint64( + *((uint64_t*)(src + 4 * ofst[0])), *((uint64_t*)(src + 4 * ofst[1])), *((uint64_t*)(src + 4 * ofst[2])), *((uint64_t*)(src + 4 * ofst[3])), + 
*((uint64_t*)(src + 4 * ofst[4])), *((uint64_t*)(src + 4 * ofst[5])), *((uint64_t*)(src + 4 * ofst[6])), *((uint64_t*)(src + 4 * ofst[7])))), + v_src0, v_src1); + v_uint64 v_tmp0, v_tmp1, v_tmp2, v_tmp3; + v_zip(v_reinterpret_as_u64(v_src0), v_reinterpret_as_u64(v_src1), v_tmp2, v_tmp3); + v_zip(v_tmp2, v_tmp3, v_tmp0, v_tmp1); + v_zip(v_tmp0, v_tmp1, v_tmp2, v_tmp3); + v_zip(v_reinterpret_as_u16(v_tmp2), v_reinterpret_as_u16(v_tmp3), v_src0, v_src1); +} +inline void v_load_indexed1(uint16_t* src, int *ofst, v_uint32 &v_src0, v_uint32 &v_src1) +{ + v_expand(v_reinterpret_as_u16(v_uint32( + *((uint32_t*)(src + ofst[ 0])), *((uint32_t*)(src + ofst[ 1])), *((uint32_t*)(src + ofst[ 2])), *((uint32_t*)(src + ofst[ 3])), + *((uint32_t*)(src + ofst[ 4])), *((uint32_t*)(src + ofst[ 5])), *((uint32_t*)(src + ofst[ 6])), *((uint32_t*)(src + ofst[ 7])), + *((uint32_t*)(src + ofst[ 8])), *((uint32_t*)(src + ofst[ 9])), *((uint32_t*)(src + ofst[10])), *((uint32_t*)(src + ofst[11])), + *((uint32_t*)(src + ofst[12])), *((uint32_t*)(src + ofst[13])), *((uint32_t*)(src + ofst[14])), *((uint32_t*)(src + ofst[15])))), + v_src0, v_src1); +} +#elif CV_SIMD256 +inline void v_load_indexed1(uint8_t* src, int *ofst, v_uint16 &v_src0, v_uint16 &v_src1) +{ + v_expand(v_reinterpret_as_u8(v_uint16( + *((uint16_t*)(src + ofst[ 0])), *((uint16_t*)(src + ofst[ 1])), *((uint16_t*)(src + ofst[ 2])), *((uint16_t*)(src + ofst[ 3])), + *((uint16_t*)(src + ofst[ 4])), *((uint16_t*)(src + ofst[ 5])), *((uint16_t*)(src + ofst[ 6])), *((uint16_t*)(src + ofst[ 7])), + *((uint16_t*)(src + ofst[ 8])), *((uint16_t*)(src + ofst[ 9])), *((uint16_t*)(src + ofst[10])), *((uint16_t*)(src + ofst[11])), + *((uint16_t*)(src + ofst[12])), *((uint16_t*)(src + ofst[13])), *((uint16_t*)(src + ofst[14])), *((uint16_t*)(src + ofst[15])))), + v_src0, v_src1); +} +inline void v_load_indexed2(uint8_t* src, int *ofst, v_uint16 &v_src0, v_uint16 &v_src1) +{ + v_expand(v_reinterpret_as_u8(v_uint32( + *((uint32_t*)(src + 2 * ofst[0])), *((uint32_t*)(src + 2 * ofst[1])), *((uint32_t*)(src + 2 * ofst[2])), *((uint32_t*)(src + 2 * ofst[3])), + *((uint32_t*)(src + 2 * ofst[4])), *((uint32_t*)(src + 2 * ofst[5])), *((uint32_t*)(src + 2 * ofst[6])), *((uint32_t*)(src + 2 * ofst[7])))), + v_src0, v_src1); + v_uint32 v_tmp0, v_tmp1, v_tmp2, v_tmp3; + v_zip(v_reinterpret_as_u32(v_src0), v_reinterpret_as_u32(v_src1), v_tmp2, v_tmp3); + v_zip(v_tmp2, v_tmp3, v_tmp0, v_tmp1); + v_zip(v_tmp0, v_tmp1, v_tmp2, v_tmp3); + v_zip(v_reinterpret_as_u16(v_tmp2), v_reinterpret_as_u16(v_tmp3), v_src0, v_src1); +} +inline void v_load_indexed4(uint8_t* src, int *ofst, v_uint16 &v_src0, v_uint16 &v_src1) +{ + v_expand(v_reinterpret_as_u8(v_uint64( + *((uint64_t*)(src + 4 * ofst[0])), *((uint64_t*)(src + 4 * ofst[1])), *((uint64_t*)(src + 4 * ofst[2])), *((uint64_t*)(src + 4 * ofst[3])))), + v_src0, v_src1); + v_uint64 v_tmp0, v_tmp1, v_tmp2, v_tmp3; + v_zip(v_reinterpret_as_u64(v_src0), v_reinterpret_as_u64(v_src1), v_tmp2, v_tmp3); + v_zip(v_tmp2, v_tmp3, v_tmp0, v_tmp1); + v_zip(v_reinterpret_as_u16(v_tmp0), v_reinterpret_as_u16(v_tmp1), v_src0, v_src1); +} +inline void v_load_indexed1(uint16_t* src, int *ofst, v_uint32 &v_src0, v_uint32 &v_src1) +{ + v_expand(v_reinterpret_as_u16(v_uint32( + *((uint32_t*)(src + ofst[0])), *((uint32_t*)(src + ofst[1])), *((uint32_t*)(src + ofst[2])), *((uint32_t*)(src + ofst[3])), + *((uint32_t*)(src + ofst[4])), *((uint32_t*)(src + ofst[5])), *((uint32_t*)(src + ofst[6])), *((uint32_t*)(src + ofst[7])))), + v_src0, v_src1); +} +#elif 
CV_SIMD128 +inline void v_load_indexed1(uint8_t* src, int *ofst, v_uint16 &v_src0, v_uint16 &v_src1) +{ + uint16_t buf[8]; + buf[0] = *((uint16_t*)(src + ofst[0])); + buf[1] = *((uint16_t*)(src + ofst[1])); + buf[2] = *((uint16_t*)(src + ofst[2])); + buf[3] = *((uint16_t*)(src + ofst[3])); + buf[4] = *((uint16_t*)(src + ofst[4])); + buf[5] = *((uint16_t*)(src + ofst[5])); + buf[6] = *((uint16_t*)(src + ofst[6])); + buf[7] = *((uint16_t*)(src + ofst[7])); + v_src0 = vx_load_expand((uint8_t*)buf); + v_src1 = vx_load_expand((uint8_t*)buf + 8); +} +inline void v_load_indexed2(uint8_t* src, int *ofst, v_uint16 &v_src0, v_uint16 &v_src1) +{ + uint32_t buf[4]; + buf[0] = *((uint32_t*)(src + 2 * ofst[0])); + buf[1] = *((uint32_t*)(src + 2 * ofst[1])); + buf[2] = *((uint32_t*)(src + 2 * ofst[2])); + buf[3] = *((uint32_t*)(src + 2 * ofst[3])); + v_uint32 v_tmp0, v_tmp1, v_tmp2, v_tmp3; + v_tmp0 = v_reinterpret_as_u32(vx_load_expand((uint8_t*)buf)); + v_tmp1 = v_reinterpret_as_u32(vx_load_expand((uint8_t*)buf + 8)); + v_zip(v_tmp0, v_tmp1, v_tmp2, v_tmp3); + v_zip(v_tmp2, v_tmp3, v_tmp0, v_tmp1); + v_zip(v_reinterpret_as_u16(v_tmp0), v_reinterpret_as_u16(v_tmp1), v_src0, v_src1); +} +inline void v_load_indexed4(uint8_t* src, int *ofst, v_uint16 &v_src0, v_uint16 &v_src1) +{ + v_uint16 v_tmp0, v_tmp1; + v_src0 = vx_load_expand(src + 4 * ofst[0]); + v_src1 = vx_load_expand(src + 4 * ofst[1]); + v_recombine(v_src0, v_src1, v_tmp0, v_tmp1); + v_zip(v_tmp0, v_tmp1, v_src0, v_src1); +} +inline void v_load_indexed1(uint16_t* src, int *ofst, v_uint32 &v_src0, v_uint32 &v_src1) +{ + uint32_t buf[4]; + buf[0] = *((uint32_t*)(src + ofst[0])); + buf[1] = *((uint32_t*)(src + ofst[1])); + buf[2] = *((uint32_t*)(src + ofst[2])); + buf[3] = *((uint32_t*)(src + ofst[3])); + v_src0 = vx_load_expand((uint16_t*)buf); + v_src1 = vx_load_expand((uint16_t*)buf + 4); +} +#endif template <> void hlineResizeCn(uint8_t* src, int, int *ofst, ufixedpoint16* m, ufixedpoint16* dst, int dst_min, int dst_max, int dst_width) { int i = 0; ufixedpoint16 src_0(src[0]); - v_uint16x8 v_src_0 = v_setall_u16(*((uint16_t*)&src_0)); - for (; i < dst_min - 7; i += 8, m += 16, dst += 8) // Points that fall left from src image so became equal to leftmost src point +#if CV_SIMD + const int VECSZ = v_uint16::nlanes; + v_uint16 v_src_0 = vx_setall_u16(*((uint16_t*)&src_0)); + for (; i <= dst_min - VECSZ; i += VECSZ, m += 2*VECSZ, dst += VECSZ) // Points that fall left from src image so became equal to leftmost src point { v_store((uint16_t*)dst, v_src_0); } +#endif for (; i < dst_min; i++, m += 2) { *(dst++) = src_0; } - for (; i < dst_max - 7 && ofst[i + 7] + 15 <= ofst[dst_width - 1]; i += 8, m += 16, dst += 8) +#if CV_SIMD + for (; i <= dst_max - VECSZ; i += VECSZ, m += 2*VECSZ, dst += VECSZ) { - v_uint32x4 v_src01 = v_combine_low(v_reinterpret_as_u32(v_load_expand(src + ofst[i ])), v_reinterpret_as_u32(v_load_expand(src + ofst[i + 1]))); - v_uint32x4 v_src23 = v_combine_low(v_reinterpret_as_u32(v_load_expand(src + ofst[i + 2])), v_reinterpret_as_u32(v_load_expand(src + ofst[i + 3]))); - v_uint32x4 v_src45 = v_combine_low(v_reinterpret_as_u32(v_load_expand(src + ofst[i + 4])), v_reinterpret_as_u32(v_load_expand(src + ofst[i + 5]))); - v_uint32x4 v_src67 = v_combine_low(v_reinterpret_as_u32(v_load_expand(src + ofst[i + 6])), v_reinterpret_as_u32(v_load_expand(src + ofst[i + 7]))); + v_uint16 v_src0, v_src1; + v_load_indexed1(src, ofst + i, v_src0, v_src1); - v_uint32x4 v_zip02, v_zip13, v_zip46, v_zip57; - v_zip(v_src01, v_src23, v_zip02, 
v_zip13); - v_zip(v_src45, v_src67, v_zip46, v_zip57); - - v_uint32x4 v_src0, v_src1; - v_zip(v_combine_low(v_zip02, v_zip46), v_combine_low(v_zip13, v_zip57), v_src0, v_src1); - - v_int16x8 v_mul0 = v_load((int16_t*)m); - v_int16x8 v_mul1 = v_load((int16_t*)m + 8); - v_uint32x4 v_res0 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src0), v_mul0)); - v_uint32x4 v_res1 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src1), v_mul1)); + v_int16 v_mul0 = vx_load((int16_t*)m); + v_int16 v_mul1 = vx_load((int16_t*)m + VECSZ); + v_uint32 v_res0 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src0), v_mul0)); + v_uint32 v_res1 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src1), v_mul1)); v_store((uint16_t*)dst, v_pack(v_res0, v_res1)); } +#endif for (; i < dst_max; i += 1, m += 2) { uint8_t* px = src + ofst[i]; *(dst++) = m[0] * px[0] + m[1] * px[1]; } src_0 = (src + ofst[dst_width - 1])[0]; - v_src_0 = v_setall_u16(*((uint16_t*)&src_0)); - for (; i < dst_width - 7; i += 8, dst += 8) // Points that fall left from src image so became equal to leftmost src point +#if CV_SIMD + v_src_0 = vx_setall_u16(*((uint16_t*)&src_0)); + for (; i <= dst_width - VECSZ; i += VECSZ, dst += VECSZ) // Points that fall left from src image so became equal to leftmost src point { v_store((uint16_t*)dst, v_src_0); } + vx_cleanup(); +#endif for (; i < dst_width; i++) { *(dst++) = src_0; @@ -394,87 +530,109 @@ template <> void hlineResizeCn(uint8_t* src, int, int *ofst, ufixedpoint16* m, ufixedpoint16* dst, int dst_min, int dst_max, int dst_width) { int i = 0; - ufixedpoint16 srccn[8] = { src[0], src[1], src[0], src[1], src[0], src[1], src[0], src[1] }; - v_uint16x8 v_srccn = v_load((uint16_t*)srccn); - for (; i < dst_min - 3; i += 4, m += 8, dst += 8) // Points that fall left from src image so became equal to leftmost src point + union { + uint32_t d; + uint16_t w[2]; + } srccn; + ((ufixedpoint16*)(srccn.w))[0] = src[0]; + ((ufixedpoint16*)(srccn.w))[1] = src[1]; +#if CV_SIMD + const int VECSZ = v_uint16::nlanes; + v_uint16 v_srccn = v_reinterpret_as_u16(vx_setall_u32(srccn.d)); + for (; i <= dst_min - VECSZ/2; i += VECSZ/2, m += VECSZ, dst += VECSZ) // Points that fall left from src image so became equal to leftmost src point { v_store((uint16_t*)dst, v_srccn); } +#endif for (; i < dst_min; i++, m += 2) { - *(dst++) = srccn[0]; - *(dst++) = srccn[1]; + *(dst++) = ((ufixedpoint16*)(srccn.w))[0]; + *(dst++) = ((ufixedpoint16*)(srccn.w))[1]; } - for (; i < dst_max - 3 && ofst[i + 3] + 7 <= ofst[dst_width - 1]; i += 4, m += 8, dst += 8) +#if CV_SIMD + for (; i <= dst_max - VECSZ/2; i += VECSZ/2, m += VECSZ, dst += VECSZ) { - v_uint32x4 v_src0 = v_combine_low(v_reinterpret_as_u32(v_load_expand(src + 2 * ofst[i ])), v_reinterpret_as_u32(v_load_expand(src + 2 * ofst[i + 1]))); - v_uint32x4 v_src1 = v_combine_low(v_reinterpret_as_u32(v_load_expand(src + 2 * ofst[i + 2])), v_reinterpret_as_u32(v_load_expand(src + 2 * ofst[i + 3]))); - - v_uint32x4 v_zip0, v_zip1; - v_zip(v_src0, v_src1, v_zip0, v_zip1); - v_zip(v_zip0, v_zip1, v_src0, v_src1); - - v_int16x8 v_src0123, v_src4567; - v_zip(v_reinterpret_as_s16(v_src0), v_reinterpret_as_s16(v_src1), v_src0123, v_src4567); + v_uint16 v_src0, v_src1; + v_load_indexed2(src, ofst + i, v_src0, v_src1); - v_uint32x4 v_mul = v_load((uint32_t*)m);//AaBbCcDd + v_uint32 v_mul = vx_load((uint32_t*)m);//AaBbCcDd + v_uint32 v_zip0, v_zip1; v_zip(v_mul, v_mul, v_zip0, v_zip1);//AaAaBbBb CcCcDdDd - v_uint32x4 v_res0 = v_reinterpret_as_u32(v_dotprod(v_src0123, 
v_reinterpret_as_s16(v_zip0))); - v_uint32x4 v_res1 = v_reinterpret_as_u32(v_dotprod(v_src4567, v_reinterpret_as_s16(v_zip1))); + v_uint32 v_res0 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src0), v_reinterpret_as_s16(v_zip0))); + v_uint32 v_res1 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src1), v_reinterpret_as_s16(v_zip1))); v_store((uint16_t*)dst, v_pack(v_res0, v_res1));//AB1AB2CD1CD2 } +#endif for (; i < dst_max; i += 1, m += 2) { uint8_t* px = src + 2 * ofst[i]; *(dst++) = m[0] * px[0] + m[1] * px[2]; *(dst++) = m[0] * px[1] + m[1] * px[3]; } - srccn[0] = (src + 2 * ofst[dst_width - 1])[0]; srccn[1] = (src + 2 * ofst[dst_width - 1])[1]; srccn[2] = (src + 2 * ofst[dst_width - 1])[0]; srccn[3] = (src + 2 * ofst[dst_width - 1])[1]; - srccn[4] = (src + 2 * ofst[dst_width - 1])[0]; srccn[5] = (src + 2 * ofst[dst_width - 1])[1]; srccn[6] = (src + 2 * ofst[dst_width - 1])[0]; srccn[7] = (src + 2 * ofst[dst_width - 1])[1]; - v_srccn = v_load((uint16_t*)srccn); - for (; i < dst_width - 3; i += 4, dst += 8) // Points that fall left from src image so became equal to leftmost src point + ((ufixedpoint16*)(srccn.w))[0] = (src + 2 * ofst[dst_width - 1])[0]; ((ufixedpoint16*)(srccn.w))[1] = (src + 2 * ofst[dst_width - 1])[1]; +#if CV_SIMD + v_srccn = v_reinterpret_as_u16(vx_setall_u32(srccn.d)); + for (; i <= dst_width - VECSZ/2; i += VECSZ/2, dst += VECSZ) // Points that fall left from src image so became equal to leftmost src point { v_store((uint16_t*)dst, v_srccn); } + vx_cleanup(); +#endif for (; i < dst_width; i++) { - *(dst++) = srccn[0]; - *(dst++) = srccn[1]; + *(dst++) = ((ufixedpoint16*)(srccn.w))[0]; + *(dst++) = ((ufixedpoint16*)(srccn.w))[1]; } } template <> void hlineResizeCn(uint8_t* src, int, int *ofst, ufixedpoint16* m, ufixedpoint16* dst, int dst_min, int dst_max, int dst_width) { int i = 0; - ufixedpoint16 srccn[8] = { src[0], src[1], src[2], src[3], src[0], src[1], src[2], src[3] }; - v_uint16x8 v_srccn = v_load((uint16_t*)srccn); - for (; i < dst_min - 1; i += 2, m += 4, dst += 8) // Points that fall left from src image so became equal to leftmost src point + union { + uint64_t q; + uint16_t w[4]; + } srccn; + ((ufixedpoint16*)(srccn.w))[0] = src[0]; + ((ufixedpoint16*)(srccn.w))[1] = src[1]; + ((ufixedpoint16*)(srccn.w))[2] = src[2]; + ((ufixedpoint16*)(srccn.w))[3] = src[3]; +#if CV_SIMD + const int VECSZ = v_uint16::nlanes; + v_uint16 v_srccn = v_reinterpret_as_u16(vx_setall_u64(srccn.q)); + for (; i <= dst_min - VECSZ/4; i += VECSZ/4, m += VECSZ/2, dst += VECSZ) // Points that fall left from src image so became equal to leftmost src point { v_store((uint16_t*)dst, v_srccn); } +#endif if (i < dst_min) // Points that fall left from src image so became equal to leftmost src point { - *(dst++) = srccn[0]; - *(dst++) = srccn[1]; - *(dst++) = srccn[2]; - *(dst++) = srccn[3]; + *(dst++) = ((ufixedpoint16*)(srccn.w))[0]; + *(dst++) = ((ufixedpoint16*)(srccn.w))[1]; + *(dst++) = ((ufixedpoint16*)(srccn.w))[2]; + *(dst++) = ((ufixedpoint16*)(srccn.w))[3]; i++; m += 2; } - for (; i < dst_max - 1 && ofst[i + 1] + 3 <= ofst[dst_width - 1]; i += 2, m += 4, dst += 8) - { - v_int16x8 v_src01 = v_reinterpret_as_s16(v_load_expand(src + 4 * ofst[i ])); - v_int16x8 v_src23 = v_reinterpret_as_s16(v_load_expand(src + 4 * ofst[i + 1])); - - v_int16x8 v_tmp0, v_tmp1; - v_recombine(v_src01, v_src23, v_tmp0, v_tmp1); - v_zip(v_tmp0, v_tmp1, v_src01, v_src23); - - v_int16x8 v_mul01 = v_reinterpret_as_s16(v_setall_u32(((uint32_t*)m)[0]));//AaAaAaAa - v_int16x8 v_mul23 = 
v_reinterpret_as_s16(v_setall_u32(((uint32_t*)m)[1]));//BbBbBbBb - v_uint32x4 v_res0 = v_reinterpret_as_u32(v_dotprod(v_src01, v_mul01)); - v_uint32x4 v_res1 = v_reinterpret_as_u32(v_dotprod(v_src23, v_mul23)); - v_store((uint16_t*)dst, v_pack(v_res0, v_res1));//AB1AB2CD1CD2 +#if CV_SIMD + for (; i <= dst_max - VECSZ/2; i += VECSZ/2, m += VECSZ, dst += 2*VECSZ) + { + v_uint16 v_src0, v_src1, v_src2, v_src3; + v_load_indexed4(src, ofst + i, v_src0, v_src1); + v_load_indexed4(src, ofst + i + 2, v_src2, v_src3); + + v_uint32 v_mul0, v_mul1, v_mul2, v_mul3, v_tmp; + v_mul0 = vx_load((uint32_t*)m);//AaBbCcDd + v_zip(v_mul0, v_mul0, v_mul3, v_tmp );//AaAaBbBb CcCcDdDd + v_zip(v_mul3, v_mul3, v_mul0, v_mul1);//AaAaAaAa BbBbBbBb + v_zip(v_tmp , v_tmp , v_mul2, v_mul3);//CcCcCcCc DdDdDdDd + + v_uint32 v_res0 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src0), v_reinterpret_as_s16(v_mul0))); + v_uint32 v_res1 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src1), v_reinterpret_as_s16(v_mul1))); + v_uint32 v_res2 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src2), v_reinterpret_as_s16(v_mul2))); + v_uint32 v_res3 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src3), v_reinterpret_as_s16(v_mul3))); + v_store((uint16_t*)dst , v_pack(v_res0, v_res1)); + v_store((uint16_t*)dst + VECSZ, v_pack(v_res2, v_res3)); } +#endif for (; i < dst_max; i += 1, m += 2) { uint8_t* px = src + 4 * ofst[i]; @@ -483,19 +641,22 @@ void hlineResizeCn(uint8_t* src, int, int *o *(dst++) = m[0] * px[2] + m[1] * px[6]; *(dst++) = m[0] * px[3] + m[1] * px[7]; } - srccn[0] = (src + 4 * ofst[dst_width - 1])[0]; srccn[1] = (src + 4 * ofst[dst_width - 1])[1]; srccn[2] = (src + 4 * ofst[dst_width - 1])[2]; srccn[3] = (src + 4 * ofst[dst_width - 1])[3]; - srccn[4] = (src + 4 * ofst[dst_width - 1])[0]; srccn[5] = (src + 4 * ofst[dst_width - 1])[1]; srccn[6] = (src + 4 * ofst[dst_width - 1])[2]; srccn[7] = (src + 4 * ofst[dst_width - 1])[3]; - v_srccn = v_load((uint16_t*)srccn); - for (; i < dst_width - 1; i += 2, dst += 8) // Points that fall right from src image so became equal to rightmost src point + ((ufixedpoint16*)(srccn.w))[0] = (src + 4 * ofst[dst_width - 1])[0]; ((ufixedpoint16*)(srccn.w))[1] = (src + 4 * ofst[dst_width - 1])[1]; + ((ufixedpoint16*)(srccn.w))[2] = (src + 4 * ofst[dst_width - 1])[2]; ((ufixedpoint16*)(srccn.w))[3] = (src + 4 * ofst[dst_width - 1])[3]; +#if CV_SIMD + v_srccn = v_reinterpret_as_u16(vx_setall_u64(srccn.q)); + for (; i <= dst_width - VECSZ/4; i += VECSZ/4, dst += VECSZ) // Points that fall right from src image so became equal to rightmost src point { v_store((uint16_t*)dst, v_srccn); } + vx_cleanup(); +#endif if (i < dst_width) { - *(dst++) = srccn[0]; - *(dst++) = srccn[1]; - *(dst++) = srccn[2]; - *(dst++) = srccn[3]; + *(dst++) = ((ufixedpoint16*)(srccn.w))[0]; + *(dst++) = ((ufixedpoint16*)(srccn.w))[1]; + *(dst++) = ((ufixedpoint16*)(srccn.w))[2]; + *(dst++) = ((ufixedpoint16*)(srccn.w))[3]; } } template <> @@ -503,40 +664,50 @@ void hlineResizeCn(uint16_t* src, int, int { int i = 0; ufixedpoint32 src_0(src[0]); - v_uint32x4 v_src_0 = v_setall_u32(*((uint32_t*)&src_0)); - for (; i < dst_min - 3; i += 4, m += 8, dst += 4) // Points that fall left from src image so became equal to leftmost src point +#if CV_SIMD + const int VECSZ = v_uint32::nlanes; + v_uint32 v_src_0 = vx_setall_u32(*((uint32_t*)&src_0)); + for (; i <= dst_min - VECSZ; i += VECSZ, m += 2*VECSZ, dst += VECSZ) // Points that fall left from src image so became equal to leftmost src point { 
v_store((uint32_t*)dst, v_src_0); } +#endif for (; i < dst_min; i++, m += 2) { *(dst++) = src_0; } - for (; i < dst_max - 3 && ofst[i + 3] + 8 <= ofst[dst_width - 1]; i += 4, m += 8, dst += 4) +#if CV_SIMD + for (; i <= dst_max - VECSZ; i += VECSZ, m += 2*VECSZ, dst += VECSZ) { - v_uint32x4 v_src0 = v_combine_low(v_load_expand(src + ofst[i]), v_load_expand(src + ofst[i + 1])); - v_uint32x4 v_mul0 = v_load((uint32_t*)m); - v_uint32x4 v_src1 = v_combine_low(v_load_expand(src + ofst[i + 2]), v_load_expand(src + ofst[i + 3])); - v_uint32x4 v_mul1 = v_load((uint32_t*)m + 4); - v_uint32x4 v_res0 = v_src0 * v_mul0;//a1a2b1b2 - v_uint32x4 v_res1 = v_src1 * v_mul1;//c1c2d1d2 - v_uint32x4 v_tmp0, v_tmp1; + v_uint32 v_src0, v_src1; + v_load_indexed1(src, ofst + i, v_src0, v_src1); + + v_uint32 v_mul0 = vx_load((uint32_t*)m); + v_uint32 v_mul1 = vx_load((uint32_t*)m + VECSZ); + + v_uint32 v_res0 = v_src0 * v_mul0;//a1a2b1b2 + v_uint32 v_res1 = v_src1 * v_mul1;//c1c2d1d2 + v_uint32 v_tmp0, v_tmp1; v_recombine(v_res0, v_res1, v_tmp0, v_tmp1);//a1a2c1c2 b1b2d1d2 v_zip(v_tmp0, v_tmp1, v_res0, v_res1);//a1b1a2b2 c1d1c2d2 v_recombine(v_res0, v_res1, v_tmp0, v_tmp1);//a1b1c1d1 a2b2c2d2 v_store((uint32_t*)dst, v_tmp0 + v_tmp1);//abcd } +#endif for (; i < dst_max; i += 1, m += 2) { uint16_t* px = src + ofst[i]; *(dst++) = m[0] * px[0] + m[1] * px[1]; } src_0 = (src + ofst[dst_width - 1])[0]; - v_src_0 = v_setall_u32(*((uint32_t*)&src_0)); +#if CV_SIMD + v_src_0 = vx_setall_u32(*((uint32_t*)&src_0)); for (; i < dst_width - 3; i += 4, dst += 4) { v_store((uint32_t*)dst, v_src_0); } + vx_cleanup(); +#endif for (; i < dst_width; i++) { *(dst++) = src_0; @@ -552,18 +723,22 @@ void vlineSet(FT* src, ET* dst, int dst_width) template <> void vlineSet(ufixedpoint16* src, uint8_t* dst, int dst_width) { - static const v_uint16x8 v_fixedRound = v_setall_u16((uint16_t)((1U << 8) >> 1)); int i = 0; - for (; i < dst_width - 15; i += 16, src += 16, dst += 16) +#if CV_SIMD + const int VECSZ = v_uint8::nlanes; + static const v_uint16 v_fixedRound = vx_setall_u16((uint16_t)((1U << 8) >> 1)); + for (; i <= dst_width - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) { - v_uint16x8 v_src0 = v_load((uint16_t*)src); - v_uint16x8 v_src1 = v_load((uint16_t*)src + 8); + v_uint16 v_src0 = vx_load((uint16_t*)src); + v_uint16 v_src1 = vx_load((uint16_t*)src + VECSZ/2); - v_uint16x8 v_res0 = (v_src0 + v_fixedRound) >> 8; - v_uint16x8 v_res1 = (v_src1 + v_fixedRound) >> 8; + v_uint16 v_res0 = (v_src0 + v_fixedRound) >> 8; + v_uint16 v_res1 = (v_src1 + v_fixedRound) >> 8; v_store(dst, v_pack(v_res0, v_res1)); } + vx_cleanup(); +#endif for (; i < dst_width; i++) *(dst++) = *(src++); } @@ -582,36 +757,40 @@ void vlineResize(FT* src, size_t src_step, FT* m, ET* dst, int dst_width) template <> void vlineResize(ufixedpoint16* src, size_t src_step, ufixedpoint16* m, uint8_t* dst, int dst_width) { - static const v_int32x4 v_fixedRound = v_setall_s32((int32_t)((1 << 16) >> 1)); - static const v_int16x8 v_128 = v_reinterpret_as_s16(v_setall_u16((uint16_t)1<<15)); - static const v_int8x16 v_128_16 = v_reinterpret_as_s8 (v_setall_u8 ((uint8_t) 1<<7)); - int i = 0; ufixedpoint16* src1 = src + src_step; - v_int16x8 v_mul = v_reinterpret_as_s16(v_setall_u32(((uint32_t*)m)[0])); - for (; i < dst_width - 15; i += 16, src += 16, src1 += 16, dst += 16) - { - v_int16x8 v_src00 = v_load((int16_t*)src); - v_int16x8 v_src10 = v_load((int16_t*)src1); - v_int16x8 v_tmp0, v_tmp1; +#if CV_SIMD + const int VECSZ = v_uint8::nlanes; + static const v_int32 v_fixedRound = 
vx_setall_s32((int32_t)((1 << 16) >> 1)); + static const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1<<15)); + static const v_int8 v_128_16 = v_reinterpret_as_s8 (vx_setall_u8 ((uint8_t) 1<<7)); + + v_int16 v_mul = v_reinterpret_as_s16(vx_setall_u32(((uint32_t*)m)[0])); + for (; i <= dst_width - VECSZ; i += VECSZ, src += VECSZ, src1 += VECSZ, dst += VECSZ) + { + v_int16 v_src00 = vx_load((int16_t*)src); + v_int16 v_src10 = vx_load((int16_t*)src1); + v_int16 v_tmp0, v_tmp1; v_zip(v_add_wrap(v_src00,v_128), v_add_wrap(v_src10,v_128), v_tmp0, v_tmp1); - v_int32x4 v_res0 = v_dotprod(v_tmp0, v_mul); - v_int32x4 v_res1 = v_dotprod(v_tmp1, v_mul); + v_int32 v_res0 = v_dotprod(v_tmp0, v_mul); + v_int32 v_res1 = v_dotprod(v_tmp1, v_mul); - v_int16x8 v_src01 = v_load((int16_t*)src + 8); - v_int16x8 v_src11 = v_load((int16_t*)src1 + 8); + v_int16 v_src01 = vx_load((int16_t*)src + VECSZ/2); + v_int16 v_src11 = vx_load((int16_t*)src1 + VECSZ/2); v_zip(v_add_wrap(v_src01,v_128), v_add_wrap(v_src11,v_128), v_tmp0, v_tmp1); - v_int32x4 v_res2 = v_dotprod(v_tmp0, v_mul); - v_int32x4 v_res3 = v_dotprod(v_tmp1, v_mul); + v_int32 v_res2 = v_dotprod(v_tmp0, v_mul); + v_int32 v_res3 = v_dotprod(v_tmp1, v_mul); - v_int8x16 v_res = v_pack(v_pack((v_res0 + v_fixedRound) >> 16, - (v_res1 + v_fixedRound) >> 16), - v_pack((v_res2 + v_fixedRound) >> 16, - (v_res3 + v_fixedRound) >> 16)); + v_int8 v_res = v_pack(v_pack((v_res0 + v_fixedRound) >> 16, + (v_res1 + v_fixedRound) >> 16), + v_pack((v_res2 + v_fixedRound) >> 16, + (v_res3 + v_fixedRound) >> 16)); v_store(dst, v_reinterpret_as_u8(v_sub_wrap(v_res, v_128_16))); } + vx_cleanup(); +#endif for (; i < dst_width; i++) { *(dst++) = (uint8_t)(*(src++) * m[0] + *(src1++) * m[1]);
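
The same width-agnostic pattern recurs in every rewritten function: the lane count VECSZ is taken from the wide vector type (v_uint16::nlanes, v_uint8::nlanes, ...), the main loop runs while i <= n - VECSZ, a scalar tail handles the remainder, and vx_cleanup() follows the last vector store. A minimal standalone sketch of that pattern, shown on the same Q8.8-to-uint8 rounding that vlineSet<> performs (it assumes the universal-intrinsics header from this source tree; the function name and the test data are illustrative only, not part of the patch):

    #include <opencv2/core/hal/intrin.hpp>  // universal intrinsics: v_uint16, vx_load, ...
    #include <cstdint>
    #include <cstdio>

    using namespace cv;

    static void q8_8_to_u8(const uint16_t* src, uint8_t* dst, int width)
    {
        int i = 0;
    #if CV_SIMD
        const int VECSZ = v_uint8::nlanes;              // pixels written per vector store
        const v_uint16 v_half = vx_setall_u16(1 << 7);  // 0.5 in Q8.8, for rounding
        for (; i <= width - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ)
        {
            v_uint16 v0 = vx_load(src);                 // first half of the pixels
            v_uint16 v1 = vx_load(src + VECSZ / 2);     // second half
            v_store(dst, v_pack((v0 + v_half) >> 8, (v1 + v_half) >> 8));
        }
        vx_cleanup();
    #endif
        for (; i < width; i++)                          // scalar tail, same rounding
            *(dst++) = (uint8_t)((*(src++) + (1 << 7)) >> 8);
    }

    int main()
    {
        uint16_t row[37];
        uint8_t out[37];
        for (int k = 0; k < 37; k++) row[k] = (uint16_t)(k * 1773); // arbitrary Q8.8 samples
        q8_8_to_u8(row, out, 37);
        for (int k = 0; k < 37; k++) printf("%d ", out[k]);
        printf("\n");
        return 0;
    }

Depending on whether the build enables CV_SIMD128, CV_SIMD256 or CV_SIMD512, the same source compiles for 128-, 256- or 512-bit registers, which is why the explicit v_uint16x8 types and the hard-coded strides of 8 and 16 disappear from the diff.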
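
The v_load_indexedN helpers replace the v_combine_low / v_zip sequences of the old fixed 128-bit code: each one gathers the two source pixels that every output point needs and shuffles them so that the two taps of one channel land in adjacent 16-bit lanes, which is the pair layout v_dotprod consumes. A scalar model of the layout the 2-channel loader is meant to produce, based on my reading of the zip cascade (names and data are illustrative, not part of the patch):

    #include <cstdint>
    #include <cstdio>

    // For output point k the row filter needs the 2-channel pixels at ofst[k] and
    // ofst[k] + 1.  The SIMD loader gathers those 4 bytes per point and reorders
    // them as ch0-left, ch0-right, ch1-left, ch1-right, so a single v_dotprod with
    // the weight pair (m[2k], m[2k+1]) repeated twice yields both channels of point k.
    static void load_indexed2_model(const uint8_t* src, const int* ofst,
                                    uint16_t* lanes, int points)
    {
        for (int k = 0; k < points; k++)
        {
            const uint8_t* px = src + 2 * ofst[k];
            lanes[4 * k + 0] = px[0];   // channel 0 of pixel ofst[k]
            lanes[4 * k + 1] = px[2];   // channel 0 of pixel ofst[k] + 1
            lanes[4 * k + 2] = px[1];   // channel 1 of pixel ofst[k]
            lanes[4 * k + 3] = px[3];   // channel 1 of pixel ofst[k] + 1
        }
    }

    int main()
    {
        uint8_t row[] = { 10, 11, 20, 21, 30, 31 };   // three 2-channel pixels
        int ofst[2] = { 0, 1 };
        uint16_t lanes[8];
        load_indexed2_model(row, ofst, lanes, 2);
        for (int k = 0; k < 8; k++) printf("%d ", lanes[k]);   // 10 20 11 21 20 30 21 31
        printf("\n");
        return 0;
    }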
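
vlineResize<> feeds unsigned Q8.8 rows to the signed 16-bit v_dotprod by adding 0x8000 with v_add_wrap before the products and adding 128 back (the v_sub_wrap of v_128_16) after the final pack. Because the two vertical weights sum to 1.0, i.e. to 256 if ufixedpoint16 keeps 8 fractional bits as the rest of the fixed-point code suggests, the bias only shifts the rounded result by 128, so it stays inside the signed 8-bit range and the saturating packs lose nothing. A scalar check of that identity under those assumptions (function names are mine):

    #include <cstdint>
    #include <cstdio>

    static uint8_t blend_direct(uint16_t s0, uint16_t s1, uint16_t m0, uint16_t m1)
    {
        uint32_t acc = (uint32_t)s0 * m0 + (uint32_t)s1 * m1;  // Q16.16 accumulator
        return (uint8_t)((acc + (1u << 15)) >> 16);            // round to uint8
    }

    static uint8_t blend_biased(uint16_t s0, uint16_t s1, uint16_t m0, uint16_t m1)
    {
        int32_t b0 = (int16_t)(uint16_t)(s0 + 0x8000);      // v_add_wrap(src, v_128): s - 32768
        int32_t b1 = (int16_t)(uint16_t)(s1 + 0x8000);
        int32_t acc = b0 * (int32_t)m0 + b1 * (int32_t)m1;  // what v_dotprod accumulates
        int32_t r = (acc + (1 << 15)) >> 16;                // arithmetic shift; r stays in [-128, 127]
        return (uint8_t)(r + 128);                          // v_sub_wrap(v_res, v_128_16): undo the bias
    }

    int main()
    {
        const uint16_t m0 = 77, m1 = 179;                // Q8.8 weights, m0 + m1 == 256
        for (uint32_t s0 = 0; s0 <= 0xFF00; s0 += 257)   // 0xFF00 is 255.0, the largest row value
            for (uint32_t s1 = 0; s1 <= 0xFF00; s1 += 509)
                if (blend_direct((uint16_t)s0, (uint16_t)s1, m0, m1) !=
                    blend_biased((uint16_t)s0, (uint16_t)s1, m0, m1))
                    printf("mismatch at %u, %u\n", s0, s1);
        printf("check finished\n");
        return 0;
    }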