From 04c0783b2fb966321e5d1c7453d104bdccde7ed1 Mon Sep 17 00:00:00 2001
From: Vadim Pisarevsky
Date: Mon, 23 Jan 2012 12:14:46 +0000
Subject: [PATCH] applied patch by Will Lucas; improved separable filter
 performance (in particular cv::GaussianBlur) on 16s images (for [much]
 faster SIFT)

---
 .../include/opencv2/imgproc/imgproc.hpp |   8 +
 modules/imgproc/src/filter.cpp          | 207 +++++++++++++++++-
 modules/imgproc/src/phasecorr.cpp       |  95 +++++---
 3 files changed, 271 insertions(+), 39 deletions(-)

diff --git a/modules/imgproc/include/opencv2/imgproc/imgproc.hpp b/modules/imgproc/include/opencv2/imgproc/imgproc.hpp
index 656e0a6906..6d790d7cde 100644
--- a/modules/imgproc/include/opencv2/imgproc/imgproc.hpp
+++ b/modules/imgproc/include/opencv2/imgproc/imgproc.hpp
@@ -694,6 +694,14 @@ CV_EXPORTS_W void calcBackProject( InputArrayOfArrays images, const vector<int>&
                                    const vector<float>& ranges,
                                    double scale );
 
+/*CV_EXPORTS void calcBackProjectPatch( const Mat* images, int nimages, const int* channels,
+                                      InputArray hist, OutputArray dst, Size patchSize,
+                                      int method, double factor=1 );
+
+CV_EXPORTS_W void calcBackProjectPatch( InputArrayOfArrays images, const vector<int>& channels,
+                                      InputArray hist, OutputArray dst, Size patchSize,
+                                      int method, double factor=1 );*/
+
 //! compares two histograms stored in dense arrays
 CV_EXPORTS_W double compareHist( InputArray H1, InputArray H2, int method );
 
diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp
index 0bb577beaf..ba10588736 100644
--- a/modules/imgproc/src/filter.cpp
+++ b/modules/imgproc/src/filter.cpp
@@ -1236,6 +1236,205 @@ struct SymmColumnSmallVec_32s16s
     Mat kernel;
 };
 
+
+/////////////////////////////////////// 16s //////////////////////////////////
+
+struct RowVec_16s32f
+{
+    RowVec_16s32f() {}
+    RowVec_16s32f( const Mat& _kernel )
+    {
+        kernel = _kernel;
+        sse2_supported = checkHardwareSupport(CV_CPU_SSE2);
+    }
+
+    int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
+    {
+        if( !sse2_supported )
+            return 0;
+
+        int i = 0, k, _ksize = kernel.rows + kernel.cols - 1;
+        float* dst = (float*)_dst;
+        const float* _kx = (const float*)kernel.data;
+        width *= cn;
+
+        for( ; i <= width - 8; i += 8 )
+        {
+            const short* src = (const short*)_src + i;
+            __m128 f, s0 = _mm_setzero_ps(), s1 = s0, x0, x1;
+            for( k = 0; k < _ksize; k++, src += cn )
+            {
+                f = _mm_load_ss(_kx+k);
+                f = _mm_shuffle_ps(f, f, 0);
+
+                __m128i x0i = _mm_loadu_si128((const __m128i*)src);
+                __m128i x1i = _mm_srai_epi32(_mm_unpackhi_epi16(x0i, x0i), 16);
+                x0i = _mm_srai_epi32(_mm_unpacklo_epi16(x0i, x0i), 16);
+                x0 = _mm_cvtepi32_ps(x0i);
+                x1 = _mm_cvtepi32_ps(x1i);
+                s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
+                s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
+            }
+            _mm_store_ps(dst + i, s0);
+            _mm_store_ps(dst + i + 4, s1);
+        }
+        return i;
+    }
+
+    Mat kernel;
+    bool sse2_supported;
+};
+
+
+struct SymmColumnVec_32f16s
+{
+    SymmColumnVec_32f16s() { symmetryType = 0; }
+    SymmColumnVec_32f16s(const Mat& _kernel, int _symmetryType, int, double _delta)
+    {
+        symmetryType = _symmetryType;
+        kernel = _kernel;
+        delta = (float)_delta;
+        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
+        sse2_supported = checkHardwareSupport(CV_CPU_SSE2);
+    }
+
+    int operator()(const uchar** _src, uchar* _dst, int width) const
+    {
+        if( !sse2_supported )
+            return 0;
+
+        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
+        const float* ky = (const float*)kernel.data + ksize2;
+        int i = 0, k;
+        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
+        const float** src = (const float**)_src;
+        const float *S, *S2;
+        short* dst = (short*)_dst;
+        __m128 d4 = _mm_set1_ps(delta);
+
+        if( symmetrical )
+        {
+            for( ; i <= width - 16; i += 16 )
+            {
+                __m128 f = _mm_load_ss(ky);
+                f = _mm_shuffle_ps(f, f, 0);
+                __m128 s0, s1, s2, s3;
+                __m128 x0, x1;
+                S = src[0] + i;
+                s0 = _mm_load_ps(S);
+                s1 = _mm_load_ps(S+4);
+                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
+                s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
+                s2 = _mm_load_ps(S+8);
+                s3 = _mm_load_ps(S+12);
+                s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
+                s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);
+
+                for( k = 1; k <= ksize2; k++ )
+                {
+                    S = src[k] + i;
+                    S2 = src[-k] + i;
+                    f = _mm_load_ss(ky+k);
+                    f = _mm_shuffle_ps(f, f, 0);
+                    x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2));
+                    x1 = _mm_add_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
+                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
+                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
+                    x0 = _mm_add_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
+                    x1 = _mm_add_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
+                    s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
+                    s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
+                }
+
+                __m128i s0i = _mm_cvtps_epi32(s0);
+                __m128i s1i = _mm_cvtps_epi32(s1);
+                __m128i s2i = _mm_cvtps_epi32(s2);
+                __m128i s3i = _mm_cvtps_epi32(s3);
+
+                _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0i, s1i));
+                _mm_storeu_si128((__m128i*)(dst + i + 8), _mm_packs_epi32(s2i, s3i));
+            }
+
+            for( ; i <= width - 4; i += 4 )
+            {
+                __m128 f = _mm_load_ss(ky);
+                f = _mm_shuffle_ps(f, f, 0);
+                __m128 x0, s0 = _mm_load_ps(src[0] + i);
+                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
+
+                for( k = 1; k <= ksize2; k++ )
+                {
+                    f = _mm_load_ss(ky+k);
+                    f = _mm_shuffle_ps(f, f, 0);
+                    x0 = _mm_add_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
+                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
+                }
+
+                __m128i s0i = _mm_cvtps_epi32(s0);
+                _mm_storel_epi64((__m128i*)(dst + i), _mm_packs_epi32(s0i, s0i));
+            }
+        }
+        else
+        {
+            for( ; i <= width - 16; i += 16 )
+            {
+                __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
+                __m128 x0, x1;
+
+                for( k = 1; k <= ksize2; k++ )
+                {
+                    S = src[k] + i;
+                    S2 = src[-k] + i;
+                    f = _mm_load_ss(ky+k);
+                    f = _mm_shuffle_ps(f, f, 0);
+                    x0 = _mm_sub_ps(_mm_load_ps(S), _mm_load_ps(S2));
+                    x1 = _mm_sub_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
+                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
+                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
+                    x0 = _mm_sub_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
+                    x1 = _mm_sub_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
+                    s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
+                    s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
+                }
+
+                __m128i s0i = _mm_cvtps_epi32(s0);
+                __m128i s1i = _mm_cvtps_epi32(s1);
+                __m128i s2i = _mm_cvtps_epi32(s2);
+                __m128i s3i = _mm_cvtps_epi32(s3);
+
+                _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0i, s1i));
+                _mm_storeu_si128((__m128i*)(dst + i + 8), _mm_packs_epi32(s2i, s3i));
+            }
+
+            for( ; i <= width - 4; i += 4 )
+            {
+                __m128 f, x0, s0 = d4;
+
+                for( k = 1; k <= ksize2; k++ )
+                {
+                    f = _mm_load_ss(ky+k);
+                    f = _mm_shuffle_ps(f, f, 0);
+                    x0 = _mm_sub_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
+                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
+                }
+
+                __m128i s0i = _mm_cvtps_epi32(s0);
+                _mm_storel_epi64((__m128i*)(dst + i), _mm_packs_epi32(s0i, s0i));
+            }
+        }
+
+        return i;
+    }
+
+    int symmetryType;
+    float delta;
+    Mat kernel;
+    bool sse2_supported;
+};
+
 
 /////////////////////////////////////// 32f //////////////////////////////////
 
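NOTE (editorial, not part of the patch): plain SSE2 has no sign-extending
16 -> 32-bit conversion (_mm_cvtepi16_epi32 only appears with SSE4.1), which is
why RowVec_16s32f widens each block of shorts by unpacking it against itself
and then shifting right arithmetically by 16. A minimal standalone sketch of
just that step:

// Demonstrates the SSE2 widening idiom used above: unpack-with-self puts
// (v << 16) | v into each 32-bit lane, and the arithmetic >> 16 recovers v
// with its sign extended.
#include <emmintrin.h>
#include <cstdio>

int main()
{
    short src[8] = { -3, 7, -32768, 32767, 0, -1, 100, -100 };
    float dst[8];

    __m128i v  = _mm_loadu_si128((const __m128i*)src);
    __m128i lo = _mm_srai_epi32(_mm_unpacklo_epi16(v, v), 16);
    __m128i hi = _mm_srai_epi32(_mm_unpackhi_epi16(v, v), 16);
    _mm_storeu_ps(dst,     _mm_cvtepi32_ps(lo));
    _mm_storeu_ps(dst + 4, _mm_cvtepi32_ps(hi));

    for( int i = 0; i < 8; i++ )
        std::printf("%d -> %.0f\n", src[i], dst[i]);
    return 0;
}

Every short round-trips exactly (including -32768 and 32767), so the CV_32F
row buffer holds a bit-exact copy of the CV_16S input before the convolution.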
@@ -2564,7 +2763,8 @@ cv::Ptr<cv::BaseRowFilter> cv::getLinearRowFilter( int srcType, int bufType,
     if( sdepth == CV_16U && ddepth == CV_64F )
         return Ptr<BaseRowFilter>(new RowFilter<ushort, double, RowNoVec>(kernel, anchor));
     if( sdepth == CV_16S && ddepth == CV_32F )
-        return Ptr<BaseRowFilter>(new RowFilter<short, float, RowNoVec>(kernel, anchor));
+        return Ptr<BaseRowFilter>(new RowFilter<short, float, RowVec_16s32f>
+                                  (kernel, anchor, RowVec_16s32f(kernel)));
     if( sdepth == CV_16S && ddepth == CV_64F )
         return Ptr<BaseRowFilter>(new RowFilter<short, double, RowNoVec>(kernel, anchor));
     if( sdepth == CV_32F && ddepth == CV_32F )
@@ -2655,8 +2855,9 @@ cv::Ptr<cv::BaseColumnFilter> cv::getLinearColumnFilter( int bufType, int dstType,
             return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<int, short>, ColumnNoVec>
                 (kernel, anchor, delta, symmetryType));
         if( ddepth == CV_16S && sdepth == CV_32F )
-            return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<float, short>, ColumnNoVec>
-                (kernel, anchor, delta, symmetryType));
+            return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<float, short>, SymmColumnVec_32f16s>
+                (kernel, anchor, delta, symmetryType, Cast<float, short>(),
+                SymmColumnVec_32f16s(kernel, symmetryType, 0, delta)));
         if( ddepth == CV_16S && sdepth == CV_64F )
             return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<double, short>, ColumnNoVec>
                 (kernel, anchor, delta, symmetryType));
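NOTE (editorial, not part of the patch): these two registrations are what
switch CV_16S separable filtering onto the new kernels; nothing else in the
pipeline changes. A usage sketch of the call that now takes the fast path,
assuming the default CV_32F Gaussian kernel (so the row pass widens
short -> float and the column pass narrows back):

#include <opencv2/imgproc/imgproc.hpp>

void blur16s( const cv::Mat& pyrLevel )    // e.g. a CV_16SC1 SIFT octave image
{
    CV_Assert( pyrLevel.depth() == CV_16S );
    cv::Mat dst;
    // Internally: getLinearRowFilter(CV_16S, CV_32F, ...) now returns
    // RowFilter<short, float, RowVec_16s32f>, and getLinearColumnFilter
    // picks SymmColumnFilter<Cast<float, short>, SymmColumnVec_32f16s>.
    cv::GaussianBlur(pyrLevel, dst, cv::Size(5, 5), 1.6, 1.6);
}

This is exactly the convolution that dominates SIFT pyramid construction,
hence the "[much] faster SIFT" in the subject line.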
diff --git a/modules/imgproc/src/phasecorr.cpp b/modules/imgproc/src/phasecorr.cpp
index 70f2d8f9b6..130fe91fa4 100644
--- a/modules/imgproc/src/phasecorr.cpp
+++ b/modules/imgproc/src/phasecorr.cpp
@@ -84,8 +84,8 @@ static void magSpectrums( InputArray _src, OutputArray _dst)
 
             for( j = 1; j <= rows - 2; j += 2 )
             {
-                dataDst[j*stepDst] = (float)((double)dataSrc[j*stepSrc]*dataSrc[j*stepSrc] +
-                                             (double)dataSrc[(j+1)*stepSrc]*dataSrc[(j+1)*stepSrc]);
+                dataDst[j*stepDst] = (float)std::sqrt((double)dataSrc[j*stepSrc]*dataSrc[j*stepSrc] +
+                                                      (double)dataSrc[(j+1)*stepSrc]*dataSrc[(j+1)*stepSrc]);
             }
 
             if( k == 1 )
@@ -104,7 +104,7 @@ static void magSpectrums( InputArray _src, OutputArray _dst)
 
             for( j = j0; j < j1; j += 2 )
             {
-                dataDst[j] = (float)((double)dataSrc[j]*dataSrc[j] + (double)dataSrc[j+1]*dataSrc[j+1]);
+                dataDst[j] = (float)std::sqrt((double)dataSrc[j]*dataSrc[j] + (double)dataSrc[j+1]*dataSrc[j+1]);
             }
         }
     }
@@ -128,8 +128,8 @@ static void magSpectrums( InputArray _src, OutputArray _dst)
 
             for( j = 1; j <= rows - 2; j += 2 )
             {
-                dataDst[j*stepDst] = dataSrc[j*stepSrc]*dataSrc[j*stepSrc] +
-                                     dataSrc[(j+1)*stepSrc]*dataSrc[(j+1)*stepSrc];
+                dataDst[j*stepDst] = std::sqrt(dataSrc[j*stepSrc]*dataSrc[j*stepSrc] +
+                                               dataSrc[(j+1)*stepSrc]*dataSrc[(j+1)*stepSrc]);
             }
 
             if( k == 1 )
@@ -148,13 +148,10 @@ static void magSpectrums( InputArray _src, OutputArray _dst)
 
             for( j = j0; j < j1; j += 2 )
             {
-                dataDst[j] = dataSrc[j]*dataSrc[j] + dataSrc[j+1]*dataSrc[j+1];
+                dataDst[j] = std::sqrt(dataSrc[j]*dataSrc[j] + dataSrc[j+1]*dataSrc[j+1]);
             }
         }
     }
-
-    // do batch sqrt to use SSE optimizations...
-    cv::sqrt(dst, dst);
 }
 
 static void divSpectrums( InputArray _srcA, InputArray _srcB, OutputArray _dst, int flags, bool conjB)
@@ -197,9 +194,9 @@ static void divSpectrums( InputArray _srcA, InputArray _srcB, OutputArray _dst,
         {
             if( k == 1 )
                 dataA += cols - 1, dataB += cols - 1, dataC += cols - 1;
-            dataC[0] = dataA[0] / dataB[0];
+            dataC[0] = dataA[0] / (dataB[0] + eps);
             if( rows % 2 == 0 )
-                dataC[(rows-1)*stepC] = dataA[(rows-1)*stepA] / dataB[(rows-1)*stepB];
+                dataC[(rows-1)*stepC] = dataA[(rows-1)*stepA] / (dataB[(rows-1)*stepB] + eps);
             if( !conjB )
                 for( j = 1; j <= rows - 2; j += 2 )
                 {
@@ -240,9 +237,9 @@ static void divSpectrums( InputArray _srcA, InputArray _srcB, OutputArray _dst,
     {
         if( is_1d && cn == 1 )
         {
-            dataC[0] = dataA[0] / dataB[0];
+            dataC[0] = dataA[0] / (dataB[0] + eps);
             if( cols % 2 == 0 )
-                dataC[j1] = dataA[j1] / dataB[j1];
+                dataC[j1] = dataA[j1] / (dataB[j1] + eps);
         }
 
         if( !conjB )
@@ -282,9 +279,9 @@ static void divSpectrums( InputArray _srcA, InputArray _srcB, OutputArray _dst,
         {
             if( k == 1 )
                 dataA += cols - 1, dataB += cols - 1, dataC += cols - 1;
-            dataC[0] = dataA[0] / dataB[0];
+            dataC[0] = dataA[0] / (dataB[0] + eps);
             if( rows % 2 == 0 )
-                dataC[(rows-1)*stepC] = dataA[(rows-1)*stepA] / dataB[(rows-1)*stepB];
+                dataC[(rows-1)*stepC] = dataA[(rows-1)*stepA] / (dataB[(rows-1)*stepB] + eps);
             if( !conjB )
                 for( j = 1; j <= rows - 2; j += 2 )
                 {
@@ -324,9 +321,9 @@ static void divSpectrums( InputArray _srcA, InputArray _srcB, OutputArray _dst,
     {
         if( is_1d && cn == 1 )
        {
-            dataC[0] = dataA[0] / dataB[0];
+            dataC[0] = dataA[0] / (dataB[0] + eps);
             if( cols % 2 == 0 )
-                dataC[j1] = dataA[j1] / dataB[j1];
+                dataC[j1] = dataA[j1] / (dataB[j1] + eps);
         }
 
         if( !conjB )
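NOTE (editorial, not part of the patch): the two numeric changes in this file
are (1) magSpectrums now takes the square root inline per bin, so the batch
cv::sqrt(dst, dst) pass could be dropped, and (2) divSpectrums regularizes
every division with eps, so an all-zero bin in the denominator spectrum gives
a finite result instead of inf/NaN. A scalar sketch of the per-bin semantics;
only the purely real DC/Nyquist bins are visible in these hunks, so the eps
placement inside divBin's complex denominator is an assumption:

#include <cmath>

struct Complexf { float re, im; };

// magSpectrums bin: magnitude computed in the same pass as the squares.
static inline float magBin( Complexf a )
{
    return (float)std::sqrt((double)a.re*a.re + (double)a.im*a.im);
}

// divSpectrums bin (conjB == false): a / b = a*conj(b) / |b|^2, with the
// |b|^2 denominator shifted away from zero by eps, mirroring the
// dataC[...] = ... / (... + eps) pattern above.
static inline Complexf divBin( Complexf a, Complexf b, float eps )
{
    float denom = b.re*b.re + b.im*b.im + eps;
    Complexf c = { (a.re*b.re + a.im*b.im) / denom,
                   (a.im*b.re - a.re*b.im) / denom };
    return c;
}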
@@ -355,31 +352,57 @@ static void divSpectrums( InputArray _srcA, InputArray _srcB, OutputArray _dst,
 static void fftShift(InputOutputArray _out)
 {
     Mat out = _out.getMat();
-    
+
+    if(out.rows == 1 && out.cols == 1)
+    {
+        // trivially shifted.
+        return;
+    }
+
     vector<Mat> planes;
     split(out, planes);
-    
+
     int xMid = out.cols >> 1;
     int yMid = out.rows >> 1;
-    
-    for(size_t i = 0; i < planes.size(); i++)
+
+    bool is_1d = xMid == 0 || yMid == 0;
+
+    if(is_1d)
     {
-        // perform quadrant swaps...
-        Mat tmp;
-        Mat q0(planes[i], Rect(0, 0, xMid, yMid));
-        Mat q1(planes[i], Rect(xMid, 0, xMid, yMid));
-        Mat q2(planes[i], Rect(0, yMid, xMid, yMid));
-        Mat q3(planes[i], Rect(xMid, yMid, xMid, yMid));
-
-        q0.copyTo(tmp);
-        q3.copyTo(q0);
-        tmp.copyTo(q3);
-
-        q1.copyTo(tmp);
-        q2.copyTo(q1);
-        tmp.copyTo(q2);
+        xMid = xMid + yMid;
+
+        for(size_t i = 0; i < planes.size(); i++)
+        {
+            Mat tmp;
+            Mat half0(planes[i], Rect(0, 0, xMid, 1));
+            Mat half1(planes[i], Rect(xMid, 0, xMid, 1));
+
+            half0.copyTo(tmp);
+            half1.copyTo(half0);
+            tmp.copyTo(half1);
+        }
     }
-    
+    else
+    {
+        for(size_t i = 0; i < planes.size(); i++)
+        {
+            // perform quadrant swaps...
+            Mat tmp;
+            Mat q0(planes[i], Rect(0, 0, xMid, yMid));
+            Mat q1(planes[i], Rect(xMid, 0, xMid, yMid));
+            Mat q2(planes[i], Rect(0, yMid, xMid, yMid));
+            Mat q3(planes[i], Rect(xMid, yMid, xMid, yMid));
+
+            q0.copyTo(tmp);
+            q3.copyTo(q0);
+            tmp.copyTo(q3);
+
+            q1.copyTo(tmp);
+            q2.copyTo(q1);
+            tmp.copyTo(q2);
+        }
+    }
+
     merge(planes, out);
 }
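NOTE (editorial, not part of the patch): fftShift previously assumed a 2-D
spectrum; for a 1xN input yMid is 0, every quadrant Rect is empty, and the
shift silently did nothing. The new branch swaps the two halves instead,
which is the same operation the quadrant swap performs along each axis.
A sketch of the 1-D behaviour on a plain row vector:

#include <opencv2/core/core.hpp>

void fftShift1dDemo()
{
    float v[6] = { 0, 1, 2, 3, 4, 5 };
    cv::Mat row(1, 6, CV_32F, v);          // wraps v, no copy

    int xMid = row.cols >> 1;              // 3
    cv::Mat half0(row, cv::Rect(0, 0, xMid, 1));
    cv::Mat half1(row, cv::Rect(xMid, 0, xMid, 1));

    cv::Mat tmp;                           // swap the halves via a temporary
    half0.copyTo(tmp);
    half1.copyTo(half0);
    tmp.copyTo(half1);                     // row is now 3 4 5 0 1 2
}

Note that the patched 1-D branch builds its Rects with height 1, so it appears
tailored to row vectors; a true Nx1 column input would still fail the Rect
bounds check inside Mat.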