From a2a131799fe9a7d3ba21dab00bb1056c51095a5d Mon Sep 17 00:00:00 2001
From: orestis
Date: Fri, 19 Dec 2014 22:42:31 +0200
Subject: [PATCH] SymmColumnVec_32f16s NEON speedup: 8.64x Auto-vect speedup: 1x

Test kernel: [0.1, 0.2408, 0.3184, 0.2408, 0.1]
---
 modules/imgproc/src/filter.cpp | 123 ++++++++++++++++++++++++++++++++-
 1 file changed, 122 insertions(+), 1 deletion(-)

diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp
index f60558c6d7..6991a448cc 100644
--- a/modules/imgproc/src/filter.cpp
+++ b/modules/imgproc/src/filter.cpp
@@ -2779,11 +2779,132 @@ struct SymmColumnSmallVec_32s16s
 };
 
 
+/**
+ * NEON column filter for symmetric float kernels producing 16-bit signed output.
+ *
+ * operator() processes a row in groups of eight elements and returns the number
+ * of elements it wrote. Asymmetrical and single-tap kernels are not handled
+ * here; operator() returns 0 for them.
+ */
+struct SymmColumnVec_32f16s
+{
+    // Set every member so that the functor can be copied safely.
+    SymmColumnVec_32f16s() { symmetryType=0; delta = 0; neon_supported = false; }
+    SymmColumnVec_32f16s(const Mat& _kernel, int _symmetryType, int, double _delta)
+    {
+        symmetryType = _symmetryType;
+        kernel = _kernel;
+        delta = (float)_delta;
+        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
+        // operator() uses NEON unconditionally for now, so record it as available.
+        neon_supported = true;
+        //Uncomment the following line when runtime support for neon is implemented.
+        // neon_supported = checkHardwareSupport(CV_CPU_NEON);
+    }
+
+    // _src:  row pointers; src[-ksize/2] .. src[ksize/2] are read as float rows.
+    // _dst:  output row, written as short.
+    // width: number of elements in the row.
+    // Returns the count of leading elements written (a multiple of 8), or 0
+    // when the kernel is not handled by this path.
+    int operator()(const uchar** _src, uchar* _dst, int width) const
+    {
+        //Uncomment the two following lines when runtime support for neon is implemented.
+        // if( !neon_supported )
+        //     return 0;
+
+        int _ksize = kernel.rows + kernel.cols - 1;
+        int ksize2 = _ksize / 2;
+        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
+
+        if( !symmetrical || _ksize == 1 )
+            return 0;
+
+        // ky points at the centre tap: ky[k] weights both src[k] and src[-k].
+        const float* ky = (const float*)kernel.ptr() + ksize2;
+        const float** src = (const float**)_src;
+        const float *S, *S2;
+        short* dst = (short*)_dst;
+        int i = 0, k;
+
+        float32x4_t d4 = vdupq_n_f32(delta);
+
+        // Lane 0 holds the centre tap, lane 1 the first off-centre tap.
+        float32x2_t k32;
+        k32 = vdup_n_f32(0);
+        k32 = vld1_lane_f32(ky, k32, 0);
+        k32 = vld1_lane_f32(ky + 1, k32, 1);
+
+        for( ; i <= width - 8; i += 8 )
+        {
+            float32x4_t x0l, x0h, x1l, x1h, x2l, x2h;
+            float32x4_t accl, acch;
+
+            S = src[0] + i;
+
+            x0l = vld1q_f32(S);
+            x0h = vld1q_f32(S + 4);
+
+            S = src[1] + i;
+            S2 = src[-1] + i;
+
+            x1l = vld1q_f32(S);
+            x1h = vld1q_f32(S + 4);
+            x2l = vld1q_f32(S2);
+            x2h = vld1q_f32(S2 + 4);
+
+            // acc = delta + ky[0]*row[0] + ky[1]*(row[1] + row[-1])
+            accl = acch = d4;
+            accl = vmlaq_lane_f32(accl, x0l, k32, 0);
+            acch = vmlaq_lane_f32(acch, x0h, k32, 0);
+            accl = vmlaq_lane_f32(accl, vaddq_f32(x1l, x2l), k32, 1);
+            acch = vmlaq_lane_f32(acch, vaddq_f32(x1h, x2h), k32, 1);
+
+            // Remaining taps: acc += ky[k]*(row[k] + row[-k])
+            for( k = 2; k <= ksize2; k++ )
+            {
+                S = src[k] + i;
+                S2 = src[-k] + i;
+
+                float32x4_t x3l, x3h, x4l, x4h;
+                x3l = vld1q_f32(S);
+                x3h = vld1q_f32(S + 4);
+                x4l = vld1q_f32(S2);
+                x4h = vld1q_f32(S2 + 4);
+
+                accl = vmlaq_n_f32(accl, vaddq_f32(x3l, x4l), ky[k]);
+                acch = vmlaq_n_f32(acch, vaddq_f32(x3h, x4h), ky[k]);
+            }
+
+            // NOTE(review): vcvtq_s32_f32 truncates toward zero; confirm this
+            // matches the rounding of the scalar code that handles the tail.
+            int32x4_t s32l, s32h;
+            s32l = vcvtq_s32_f32(accl);
+            s32h = vcvtq_s32_f32(acch);
+
+            // Narrow to 16 bits with saturation.
+            int16x4_t s16l, s16h;
+            s16l = vqmovn_s32(s32l);
+            s16h = vqmovn_s32(s32h);
+
+            vst1_s16((int16_t *)(dst + i), s16l);
+            vst1_s16((int16_t *)(dst + i + 4), s16h);
+        }
+
+        return i;
+    }
+
+    int symmetryType;
+    float delta;
+    Mat kernel;
+    bool neon_supported;
+};
+
+
 typedef RowNoVec RowVec_8u32s;
 typedef RowNoVec RowVec_16s32f;
 typedef RowNoVec RowVec_32f;
 typedef SymmRowSmallNoVec SymmRowSmallVec_32f;
-typedef ColumnNoVec SymmColumnVec_32f16s;
 typedef ColumnNoVec SymmColumnVec_32f;
 typedef SymmColumnSmallNoVec SymmColumnSmallVec_32f;
 typedef FilterNoVec FilterVec_8u;