diff --git a/modules/imgproc/src/smooth.simd.hpp b/modules/imgproc/src/smooth.simd.hpp index 2a7a8e72bb..6c41b45e9f 100644 --- a/modules/imgproc/src/smooth.simd.hpp +++ b/modules/imgproc/src/smooth.simd.hpp @@ -1197,6 +1197,78 @@ void hlineSmoothONa_yzy_a(const uint8_t* src, int cn, co } } } +template <> // uint16 horizontal pass of the symmetric ("y z y") fixed-point smoothing kernel +void hlineSmoothONa_yzy_a(const uint16_t* src, int cn, const ufixedpoint32* m, int n, ufixedpoint32* dst, int len, int borderType) +{ + int pre_shift = n / 2; // taps strictly left of the anchor + int post_shift = n - pre_shift; // anchor tap plus taps to its right + int i = 0; + for (; i < min(pre_shift, len); i++, dst += cn) // Points that fall left from border + { + for (int k = 0; k < cn; k++) + dst[k] = m[pre_shift - i] * src[k]; + if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped + for (int j = i - pre_shift, mid = 0; j < 0; j++, mid++) + { + int src_idx = borderInterpolate(j, len, borderType); + for (int k = 0; k < cn; k++) + dst[k] = dst[k] + m[mid] * src[src_idx*cn + k]; + } + int j, mid; + for (j = 1, mid = pre_shift - i + 1; j < min(i + post_shift, len); j++, mid++) + for (int k = 0; k < cn; k++) + dst[k] = dst[k] + m[mid] * src[j*cn + k]; + if (borderType != BORDER_CONSTANT) + for (; j < i + post_shift; j++, mid++) + { + int src_idx = borderInterpolate(j, len, borderType); + for (int k = 0; k < cn; k++) + dst[k] = dst[k] + m[mid] * src[src_idx*cn + k]; + } + } + i *= cn; // switch from pixel index to interleaved-element index + int lencn = (len - post_shift + 1)*cn; // elements that need no right-border handling +#if CV_SIMD + const int VECSZ = v_uint32::nlanes; + for (; i <= lencn - VECSZ * 2; i += VECSZ * 2, src += VECSZ * 2, dst += VECSZ * 2) + { + v_uint32 v_res0, v_res1; + v_mul_expand(vx_load(src + pre_shift * cn), vx_setall_u16((uint16_t) *((uint32_t*)(m + pre_shift))), v_res0, v_res1); // NOTE(review): (uint16_t) keeps only the low half of the raw coefficient - assumes it fits in 16 bits, confirm ufixedpoint32 layout + for (int j = 0; j < pre_shift; j ++) + { + v_uint32 v_add0, v_add1; + v_mul_expand(vx_load(src + j * cn) + vx_load(src + (n - 1 - j)*cn), vx_setall_u16((uint16_t) *((uint32_t*)(m + j))), v_add0, v_add1); // symmetric kernel: mirrored taps j and n-1-j share coefficient m[j] + v_res0 += v_add0; + v_res1 += v_add1; + } + 
v_store((uint32_t*)dst, v_res0); + v_store((uint32_t*)dst + VECSZ, v_res1); + } +#endif + for (; i < lencn; i++, src++, dst++) // scalar tail of the main (no-border) section + { + *dst = m[pre_shift] * src[pre_shift*cn]; + for (int j = 0; j < pre_shift; j++) + *dst = *dst + m[j] * src[j*cn] + m[j] * src[(n - 1 - j)*cn]; + } + i /= cn; // back to pixel index + for (i -= pre_shift; i < len - pre_shift; i++, src += cn, dst += cn) // Points that fall right from border + { + for (int k = 0; k < cn; k++) + dst[k] = m[0] * src[k]; + int j = 1; + for (; j < len - i; j++) + for (int k = 0; k < cn; k++) + dst[k] = dst[k] + m[j] * src[j*cn + k]; + if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped + for (; j < n; j++) + { + int src_idx = borderInterpolate(i + j, len, borderType) - i; + for (int k = 0; k < cn; k++) + dst[k] = dst[k] + m[j] * src[src_idx*cn + k]; + } + } +} template void vlineSmooth1N(const FT* const * src, const FT* m, int, ET* dst, int len) { @@ -1788,6 +1860,62 @@ void vlineSmoothONa_yzy_a(const ufixedpoint16* const * s dst[i] = val; } } +template <> // uint16 vertical pass: accumulate ufixedpoint32 rows in 64-bit, then pack back to uint16 +void vlineSmoothONa_yzy_a(const ufixedpoint32* const * src, const ufixedpoint32* m, int n, uint16_t* dst, int len) +{ + int i = 0; +#if CV_SIMD + int pre_shift = n / 2; + const int VECSZ = v_uint32::nlanes; + for (; i <= len - 2*VECSZ; i += 2*VECSZ) + { + v_uint32 v_src00, v_src10, v_src01, v_src11; + v_uint64 v_res0, v_res1, v_res2, v_res3; + v_uint64 v_tmp0, v_tmp1, v_tmp2, v_tmp3, v_tmp4, v_tmp5, v_tmp6, v_tmp7; + + v_uint32 v_mul = vx_setall_u32(*((uint32_t*)(m + pre_shift))); // broadcast raw fixed-point anchor coefficient + const uint32_t* srcp = (const uint32_t*)src[pre_shift] + i; + v_src00 = vx_load(srcp); + v_src10 = vx_load(srcp + VECSZ); + v_mul_expand(v_src00, v_mul, v_res0, v_res1); + v_mul_expand(v_src10, v_mul, v_res2, v_res3); + + int j = 0; + for (; j < pre_shift; j++) + { + v_mul = vx_setall_u32(*((uint32_t*)(m + j))); + + const uint32_t* srcj0 = (const uint32_t*)src[j] + i; + const uint32_t* srcj1 = (const uint32_t*)src[n - 1 - j] + i; // mirrored row shares the same coefficient (symmetric kernel) + 
v_src00 = vx_load(srcj0); + v_src01 = vx_load(srcj1); + v_mul_expand(v_src00, v_mul, v_tmp0, v_tmp1); + v_mul_expand(v_src01, v_mul, v_tmp2, v_tmp3); + v_res0 += v_tmp0 + v_tmp2; + v_res1 += v_tmp1 + v_tmp3; + + v_src10 = vx_load(srcj0 + VECSZ); + v_src11 = vx_load(srcj1 + VECSZ); + v_mul_expand(v_src10, v_mul, v_tmp4, v_tmp5); + v_mul_expand(v_src11, v_mul, v_tmp6, v_tmp7); + v_res2 += v_tmp4 + v_tmp6; + v_res3 += v_tmp5 + v_tmp7; + } + + v_store(dst + i, v_pack(v_rshr_pack<32>(v_res0, v_res1), + v_rshr_pack<32>(v_res2, v_res3))); // rounding shift by 32 then saturating pack - assumes 32 fractional bits in the 64-bit product, TODO confirm + } +#endif + for (; i < len; i++) // scalar tail + { + ufixedpoint64 val = m[0] * src[0][i]; + for (int j = 1; j < n; j++) + { + val = val + m[j] * src[j][i]; + } + dst[i] = (uint16_t)val; + } +} template class fixedSmoothInvoker : public ParallelLoopBody {