From 02b2fdeda980651f6e27e4fc6417becab1e93c6e Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Tue, 30 Dec 2014 00:19:05 +0300
Subject: [PATCH] SSE resize Lanczos

---
/*
 * Review notes. They sit after the separator line above and before the
 * diffstat, the free-form area of a mailed patch, so the hunk below and its
 * line counts are untouched.
 *
 * What the hunk does to modules/imgproc/src/imgwarp.cpp:
 *   - VResizeLanczos4Vec_32f16u becomes a real functor under #if CV_SSE4_1;
 *     the existing VResizeNoVec typedef is kept as the #else fallback.
 *   - The VResizeNoVec typedefs for VResizeLanczos4Vec_32f16s and
 *     VResizeLanczos4Vec_32f are removed and replaced by functors placed
 *     after the #endif, i.e. outside the CV_SSE4_1 guard. The trailing
 *     #elif CV_NEON context line shows they are still inside an enclosing
 *     conditional whose opening #if is not visible in this hunk.
 *
 * Shared shape of the three operator() bodies:
 *   - _src is cast to an array of float row pointers; rows src[0]..src[7]
 *     are read as S0..S7.
 *   - _beta is cast to float weights; beta[0]..beta[7] are each broadcast
 *     into a vector with _mm_set1_ps before the loop.
 *   - Every output vector is beta[0]*S0 + beta[1]*S1 + ... + beta[7]*S7 for
 *     four consecutive x positions, read with unaligned loads (_mm_loadu_ps).
 *   - The return value is x, the first index the vector loop did not handle.
 *     Presumably the caller finishes the remaining width - x elements; the
 *     caller is not part of this hunk, TODO confirm.
 *
 * Differences between the three:
 *   - _32f16u: 8 outputs per iteration; the two float vectors are converted
 *     with _mm_cvtps_epi32 and narrowed with _mm_packus_epi32, then written
 *     with _mm_storeu_si128. _mm_packus_epi32 is an SSE4.1 intrinsic, which
 *     is presumably why this struct alone is guarded.
 *     NOTE(review): dst is declared as short* here even though the pack is
 *     the unsigned one; dst is only used for address arithmetic in the store.
 *   - _32f16s: same loop, narrowed with _mm_packs_epi32 instead.
 *   - _32f: 4 outputs per iteration, written as floats with _mm_storeu_ps
 *     and no integer conversion.
 */
 modules/imgproc/src/imgwarp.cpp | 134 +++++++++++++++++++++++++++++++-
 1 file changed, 132 insertions(+), 2 deletions(-)

diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp
index d952b60ea8..8a581fc498 100644
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@@ -894,9 +894,139 @@ struct VResizeCubicVec_32f
     }
 };
 
+#if CV_SSE4_1
+
+struct VResizeLanczos4Vec_32f16u
+{
+    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
+    {
+        const float** src = (const float**)_src;
+        const float* beta = (const float*)_beta;
+        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
+                    *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
+        short * dst = (short*)_dst;
+        int x = 0;
+        __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]),
+               v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]),
+               v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]),
+               v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]);
+
+        for( ; x <= width - 8; x += 8 )
+        {
+            __m128 v_dst0 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x));
+            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x)));
+            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x)));
+            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x)));
+            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x)));
+            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x)));
+            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x)));
+            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x)));
+
+            __m128 v_dst1 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x + 4));
+            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x + 4)));
+            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x + 4)));
+            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x + 4)));
+            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x + 4)));
+            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x + 4)));
+            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x + 4)));
+            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x + 4)));
+
+            __m128i v_dsti0 = _mm_cvtps_epi32(v_dst0);
+            __m128i v_dsti1 = _mm_cvtps_epi32(v_dst1);
+
+            _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(v_dsti0, v_dsti1));
+        }
+
+        return x;
+    }
+};
+
+#else
+
 typedef VResizeNoVec VResizeLanczos4Vec_32f16u;
-typedef VResizeNoVec VResizeLanczos4Vec_32f16s;
-typedef VResizeNoVec VResizeLanczos4Vec_32f;
+
+#endif
+
+struct VResizeLanczos4Vec_32f16s
+{
+    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
+    {
+        const float** src = (const float**)_src;
+        const float* beta = (const float*)_beta;
+        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
+                    *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
+        short * dst = (short*)_dst;
+        int x = 0;
+        __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]),
+               v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]),
+               v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]),
+               v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]);
+
+        for( ; x <= width - 8; x += 8 )
+        {
+            __m128 v_dst0 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x));
+            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x)));
+            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x)));
+            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x)));
+            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x)));
+            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x)));
+            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x)));
+            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x)));
+
+            __m128 v_dst1 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x + 4));
+            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x + 4)));
+            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x + 4)));
+            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x + 4)));
+            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x + 4)));
+            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x + 4)));
+            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x + 4)));
+            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x + 4)));
+
+            __m128i v_dsti0 = _mm_cvtps_epi32(v_dst0);
+            __m128i v_dsti1 = _mm_cvtps_epi32(v_dst1);
+
+            _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(v_dsti0, v_dsti1));
+        }
+
+        return x;
+    }
+};
+
+
+struct VResizeLanczos4Vec_32f
+{
+    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
+    {
+        const float** src = (const float**)_src;
+        const float* beta = (const float*)_beta;
+        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
+                    *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
+        float* dst = (float*)_dst;
+        int x = 0;
+
+        __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]),
+               v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]),
+               v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]),
+               v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]);
+
+        for( ; x <= width - 4; x += 4 )
+        {
+            __m128 v_dst = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x));
+            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x)));
+            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x)));
+            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x)));
+            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x)));
+            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x)));
+            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x)));
+            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x)));
+
+            _mm_storeu_ps(dst + x, v_dst);
+        }
+
+        return x;
+    }
+};
+
 
 #elif CV_NEON
 