From 654bdde8ed5ca4fb27de61d63344055958e24ed2 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 30 Jun 2014 01:47:51 +0400 Subject: [PATCH] SSE2 optimization of cv::preCornerDetect --- modules/imgproc/src/corner.cpp | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/modules/imgproc/src/corner.cpp b/modules/imgproc/src/corner.cpp index eeb20fbc16..923d78b30f 100644 --- a/modules/imgproc/src/corner.cpp +++ b/modules/imgproc/src/corner.cpp @@ -608,6 +608,11 @@ void cv::preCornerDetect( InputArray _src, OutputArray _dst, int ksize, int bord factor *= 255; factor = 1./(factor * factor * factor); +#if CV_SSE2 + volatile bool haveSSE2 = cv::checkHardwareSupport(CV_CPU_SSE2); + __m128 v_factor = _mm_set1_ps((float)factor), v_m2 = _mm_set1_ps(-2.0f); +#endif + Size size = src.size(); int i, j; for( i = 0; i < size.height; i++ ) @@ -619,7 +624,26 @@ void cv::preCornerDetect( InputArray _src, OutputArray _dst, int ksize, int bord const float* d2ydata = (const float*)(D2y.data + i*D2y.step); const float* dxydata = (const float*)(Dxy.data + i*Dxy.step); - for( j = 0; j < size.width; j++ ) + j = 0; + +#if CV_SSE2 + if (haveSSE2) + { + for( ; j <= size.width - 4; j += 4 ) + { + __m128 v_dx = _mm_loadu_ps((const float *)(dxdata + j)); + __m128 v_dy = _mm_loadu_ps((const float *)(dydata + j)); + + __m128 v_s1 = _mm_mul_ps(_mm_mul_ps(v_dx, v_dx), _mm_loadu_ps((const float *)(d2ydata + j))); + __m128 v_s2 = _mm_mul_ps(_mm_mul_ps(v_dy, v_dy), _mm_loadu_ps((const float *)(d2xdata + j))); + __m128 v_s3 = _mm_mul_ps(_mm_mul_ps(v_dx, v_dy), _mm_loadu_ps((const float *)(dxydata + j))); + v_s1 = _mm_mul_ps(v_factor, _mm_add_ps(v_s1, _mm_add_ps(v_s2, _mm_mul_ps(v_s3, v_m2)))); + _mm_storeu_ps(dstdata + j, v_s1); + } + } +#endif + + for( ; j < size.width; j++ ) { float dx = dxdata[j]; float dy = dydata[j];