applied patch by Will Lucas; improved separable filter performance (in particular cv::GaussianBlur) on 16s images (for [much] faster SIFT)

pull/13383/head
Vadim Pisarevsky 13 years ago
parent d0981a628a
commit 04c0783b2f
Changed files:
  modules/imgproc/include/opencv2/imgproc/imgproc.hpp (8 lines changed)
  modules/imgproc/src/filter.cpp (207 lines changed)
  modules/imgproc/src/phasecorr.cpp (95 lines changed)
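The workload this change targets is separable filtering of CV_16S images, which dominates the cost of building SIFT's Gaussian/DoG pyramid. A minimal sketch of that use case (sizes, values and sigma are illustrative, not part of the patch):

    #include <opencv2/core/core.hpp>
    #include <opencv2/imgproc/imgproc.hpp>

    int main()
    {
        cv::Mat src(1024, 1024, CV_16SC1), dst;
        cv::randu(src, cv::Scalar(-1000), cv::Scalar(1000));   // arbitrary test data

        // Separable Gaussian on a CV_16S image: the row pass widens 16S -> 32F,
        // the column pass narrows 32F -> 16S with saturation. On SSE2-capable CPUs
        // these are the two paths vectorized by this commit.
        cv::GaussianBlur(src, dst, cv::Size(), 1.6);            // sigma typical for SIFT
        return 0;
    }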

modules/imgproc/include/opencv2/imgproc/imgproc.hpp

@@ -694,6 +694,14 @@ CV_EXPORTS_W void calcBackProject( InputArrayOfArrays images, const vector<int>&
                                  const vector<float>& ranges,
                                  double scale );
 
+/*CV_EXPORTS void calcBackProjectPatch( const Mat* images, int nimages, const int* channels,
+                                      InputArray hist, OutputArray dst, Size patchSize,
+                                      int method, double factor=1 );
+
+CV_EXPORTS_W void calcBackProjectPatch( InputArrayOfArrays images, const vector<int>& channels,
+                                        InputArray hist, OutputArray dst, Size patchSize,
+                                        int method, double factor=1 );*/
+
 //! compares two histograms stored in dense arrays
 CV_EXPORTS_W double compareHist( InputArray H1, InputArray H2, int method );

modules/imgproc/src/filter.cpp

@@ -1236,6 +1236,205 @@ struct SymmColumnSmallVec_32s16s
     Mat kernel;
 };
 
+
+/////////////////////////////////////// 16s //////////////////////////////////
+
+struct RowVec_16s32f
+{
+    RowVec_16s32f() {}
+    RowVec_16s32f( const Mat& _kernel )
+    {
+        kernel = _kernel;
+        sse2_supported = checkHardwareSupport(CV_CPU_SSE2);
+    }
+
+    int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
+    {
+        if( !sse2_supported )
+            return 0;
+
+        int i = 0, k, _ksize = kernel.rows + kernel.cols - 1;
+        float* dst = (float*)_dst;
+        const float* _kx = (const float*)kernel.data;
+        width *= cn;
+
+        for( ; i <= width - 8; i += 8 )
+        {
+            const short* src = (const short*)_src + i;
+            __m128 f, s0 = _mm_setzero_ps(), s1 = s0, x0, x1;
+            for( k = 0; k < _ksize; k++, src += cn )
+            {
+                f = _mm_load_ss(_kx+k);
+                f = _mm_shuffle_ps(f, f, 0);
+
+                __m128i x0i = _mm_loadu_si128((const __m128i*)src);
+                __m128i x1i = _mm_srai_epi32(_mm_unpackhi_epi16(x0i, x0i), 16);
+                x0i = _mm_srai_epi32(_mm_unpacklo_epi16(x0i, x0i), 16);
+                x0 = _mm_cvtepi32_ps(x0i);
+                x1 = _mm_cvtepi32_ps(x1i);
+                s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
+                s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
+            }
+            _mm_store_ps(dst + i, s0);
+            _mm_store_ps(dst + i + 4, s1);
+        }
+        return i;
+    }
+
+    Mat kernel;
+    bool sse2_supported;
+};
+
+struct SymmColumnVec_32f16s
+{
+    SymmColumnVec_32f16s() { symmetryType=0; }
+    SymmColumnVec_32f16s(const Mat& _kernel, int _symmetryType, int, double _delta)
+    {
+        symmetryType = _symmetryType;
+        kernel = _kernel;
+        delta = (float)_delta;
+        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
+        sse2_supported = checkHardwareSupport(CV_CPU_SSE2);
+    }
+
+    int operator()(const uchar** _src, uchar* _dst, int width) const
+    {
+        if( !sse2_supported )
+            return 0;
+
+        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
+        const float* ky = (const float*)kernel.data + ksize2;
+        int i = 0, k;
+        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
+        const float** src = (const float**)_src;
+        const float *S, *S2;
+        short* dst = (short*)_dst;
+        __m128 d4 = _mm_set1_ps(delta);
+
+        if( symmetrical )
+        {
+            for( ; i <= width - 16; i += 16 )
+            {
+                __m128 f = _mm_load_ss(ky);
+                f = _mm_shuffle_ps(f, f, 0);
+                __m128 s0, s1, s2, s3;
+                __m128 x0, x1;
+                S = src[0] + i;
+                s0 = _mm_load_ps(S);
+                s1 = _mm_load_ps(S+4);
+                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
+                s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
+                s2 = _mm_load_ps(S+8);
+                s3 = _mm_load_ps(S+12);
+                s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
+                s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);
+
+                for( k = 1; k <= ksize2; k++ )
+                {
+                    S = src[k] + i;
+                    S2 = src[-k] + i;
+                    f = _mm_load_ss(ky+k);
+                    f = _mm_shuffle_ps(f, f, 0);
+                    x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2));
+                    x1 = _mm_add_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
+                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
+                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
+                    x0 = _mm_add_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
+                    x1 = _mm_add_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
+                    s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
+                    s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
+                }
+
+                __m128i s0i = _mm_cvtps_epi32(s0);
+                __m128i s1i = _mm_cvtps_epi32(s1);
+                __m128i s2i = _mm_cvtps_epi32(s2);
+                __m128i s3i = _mm_cvtps_epi32(s3);
+
+                _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0i, s1i));
+                _mm_storeu_si128((__m128i*)(dst + i + 8), _mm_packs_epi32(s2i, s3i));
+            }
+
+            for( ; i <= width - 4; i += 4 )
+            {
+                __m128 f = _mm_load_ss(ky);
+                f = _mm_shuffle_ps(f, f, 0);
+                __m128 x0, s0 = _mm_load_ps(src[0] + i);
+                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
+
+                for( k = 1; k <= ksize2; k++ )
+                {
+                    f = _mm_load_ss(ky+k);
+                    f = _mm_shuffle_ps(f, f, 0);
+                    S = src[k] + i;
+                    S2 = src[-k] + i;
+                    x0 = _mm_add_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
+                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
+                }
+
+                __m128i s0i = _mm_cvtps_epi32(s0);
+                _mm_storel_epi64((__m128i*)(dst + i), _mm_packs_epi32(s0i, s0i));
+            }
+        }
+        else
+        {
+            for( ; i <= width - 16; i += 16 )
+            {
+                __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
+                __m128 x0, x1;
+                S = src[0] + i;
+
+                for( k = 1; k <= ksize2; k++ )
+                {
+                    S = src[k] + i;
+                    S2 = src[-k] + i;
+                    f = _mm_load_ss(ky+k);
+                    f = _mm_shuffle_ps(f, f, 0);
+                    x0 = _mm_sub_ps(_mm_load_ps(S), _mm_load_ps(S2));
+                    x1 = _mm_sub_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
+                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
+                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
+                    x0 = _mm_sub_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
+                    x1 = _mm_sub_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
+                    s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
+                    s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
+                }
+
+                __m128i s0i = _mm_cvtps_epi32(s0);
+                __m128i s1i = _mm_cvtps_epi32(s1);
+                __m128i s2i = _mm_cvtps_epi32(s2);
+                __m128i s3i = _mm_cvtps_epi32(s3);
+
+                _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0i, s1i));
+                _mm_storeu_si128((__m128i*)(dst + i + 8), _mm_packs_epi32(s2i, s3i));
+            }
+
+            for( ; i <= width - 4; i += 4 )
+            {
+                __m128 f, x0, s0 = d4;
+
+                for( k = 1; k <= ksize2; k++ )
+                {
+                    f = _mm_load_ss(ky+k);
+                    f = _mm_shuffle_ps(f, f, 0);
+                    x0 = _mm_sub_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
+                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
+                }
+
+                __m128i s0i = _mm_cvtps_epi32(s0);
+                _mm_storel_epi64((__m128i*)(dst + i), _mm_packs_epi32(s0i, s0i));
+            }
+        }
+
+        return i;
+    }
+
+    int symmetryType;
+    float delta;
+    Mat kernel;
+    bool sse2_supported;
+};
+
 
 /////////////////////////////////////// 32f //////////////////////////////////
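Both new kernels hinge on two SSE2 conversions: widening signed 16-bit samples to float for the multiply-accumulate (unpack a register against itself, then arithmetic-shift each 32-bit lane right by 16 to sign-extend), and narrowing the filtered floats back to 16S with rounding and saturation (_mm_cvtps_epi32 followed by _mm_packs_epi32). A standalone sketch of just those two steps, outside any filter:

    #include <emmintrin.h>   // SSE2 intrinsics
    #include <cstdio>

    int main()
    {
        short src[8] = { -32768, -1234, -1, 0, 1, 1234, 32767, -7 };

        // widen 8 x int16 -> 2 x (4 x float), as in RowVec_16s32f
        __m128i v  = _mm_loadu_si128((const __m128i*)src);
        __m128i lo = _mm_srai_epi32(_mm_unpacklo_epi16(v, v), 16);  // sign-extend low 4
        __m128i hi = _mm_srai_epi32(_mm_unpackhi_epi16(v, v), 16);  // sign-extend high 4
        __m128 flo = _mm_cvtepi32_ps(lo);
        __m128 fhi = _mm_cvtepi32_ps(hi);

        // stand-in for the kernel multiply-accumulate
        flo = _mm_mul_ps(flo, _mm_set1_ps(1.5f));
        fhi = _mm_mul_ps(fhi, _mm_set1_ps(1.5f));

        // narrow back: round to int32, then pack to int16 with signed saturation,
        // as in the SymmColumnVec_32f16s store path
        short dst[8];
        _mm_storeu_si128((__m128i*)dst,
                         _mm_packs_epi32(_mm_cvtps_epi32(flo), _mm_cvtps_epi32(fhi)));

        for( int i = 0; i < 8; i++ )
            printf("%6d -> %6d\n", src[i], dst[i]);   // note saturation at the int16 limits
        return 0;
    }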
@@ -2564,7 +2763,8 @@ cv::Ptr<cv::BaseRowFilter> cv::getLinearRowFilter( int srcType, int bufType,
     if( sdepth == CV_16U && ddepth == CV_64F )
         return Ptr<BaseRowFilter>(new RowFilter<ushort, double, RowNoVec>(kernel, anchor));
     if( sdepth == CV_16S && ddepth == CV_32F )
-        return Ptr<BaseRowFilter>(new RowFilter<short, float, RowNoVec>(kernel, anchor));
+        return Ptr<BaseRowFilter>(new RowFilter<short, float, RowVec_16s32f>
+            (kernel, anchor, RowVec_16s32f(kernel)));
     if( sdepth == CV_16S && ddepth == CV_64F )
         return Ptr<BaseRowFilter>(new RowFilter<short, double, RowNoVec>(kernel, anchor));
     if( sdepth == CV_32F && ddepth == CV_32F )
@@ -2655,8 +2855,9 @@ cv::Ptr<cv::BaseColumnFilter> cv::getLinearColumnFilter( int bufType, int dstTyp
            return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<int, short>, ColumnNoVec>
                (kernel, anchor, delta, symmetryType));
        if( ddepth == CV_16S && sdepth == CV_32F )
-           return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<float, short>, ColumnNoVec>
-               (kernel, anchor, delta, symmetryType));
+           return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<float, short>, SymmColumnVec_32f16s>
+               (kernel, anchor, delta, symmetryType, Cast<float, short>(),
+               SymmColumnVec_32f16s(kernel, symmetryType, 0, delta)));
        if( ddepth == CV_16S && sdepth == CV_64F )
            return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<double, short>, ColumnNoVec>
                (kernel, anchor, delta, symmetryType));
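These two factory changes are what plug the new kernels in: a CV_16S source with a floating-point kernel typically gets a CV_32F intermediate buffer, so the row filter is created for sdepth CV_16S / ddepth CV_32F and the column filter for sdepth CV_32F / ddepth CV_16S, exactly the depth pairs wired up above. A hedged usage sketch through the public separable-filter API (kernel size and sigma are arbitrary):

    #include <opencv2/imgproc/imgproc.hpp>

    void smooth16s( const cv::Mat& src, cv::Mat& dst )   // src: CV_16SC1
    {
        cv::Mat k = cv::getGaussianKernel(9, 1.6, CV_32F);   // symmetric 1-D kernel
        // sepFilter2D builds the same row/column filter pair as GaussianBlur;
        // with CV_16S input and output it should hit the paths vectorized above.
        cv::sepFilter2D(src, dst, CV_16S, k, k);
    }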

modules/imgproc/src/phasecorr.cpp

@@ -84,8 +84,8 @@ static void magSpectrums( InputArray _src, OutputArray _dst)
 
                 for( j = 1; j <= rows - 2; j += 2 )
                 {
-                    dataDst[j*stepDst] = (float)((double)dataSrc[j*stepSrc]*dataSrc[j*stepSrc] +
-                                                 (double)dataSrc[(j+1)*stepSrc]*dataSrc[(j+1)*stepSrc]);
+                    dataDst[j*stepDst] = (float)std::sqrt((double)dataSrc[j*stepSrc]*dataSrc[j*stepSrc] +
+                                                          (double)dataSrc[(j+1)*stepSrc]*dataSrc[(j+1)*stepSrc]);
                 }
 
                 if( k == 1 )
@@ -104,7 +104,7 @@ static void magSpectrums( InputArray _src, OutputArray _dst)
 
             for( j = j0; j < j1; j += 2 )
             {
-                dataDst[j] = (float)((double)dataSrc[j]*dataSrc[j] + (double)dataSrc[j+1]*dataSrc[j+1]);
+                dataDst[j] = (float)std::sqrt((double)dataSrc[j]*dataSrc[j] + (double)dataSrc[j+1]*dataSrc[j+1]);
             }
         }
     }
@@ -128,8 +128,8 @@ static void magSpectrums( InputArray _src, OutputArray _dst)
 
                 for( j = 1; j <= rows - 2; j += 2 )
                 {
-                    dataDst[j*stepDst] = dataSrc[j*stepSrc]*dataSrc[j*stepSrc] +
-                                         dataSrc[(j+1)*stepSrc]*dataSrc[(j+1)*stepSrc];
+                    dataDst[j*stepDst] = std::sqrt(dataSrc[j*stepSrc]*dataSrc[j*stepSrc] +
+                                                   dataSrc[(j+1)*stepSrc]*dataSrc[(j+1)*stepSrc]);
                 }
 
                 if( k == 1 )
@@ -148,13 +148,10 @@ static void magSpectrums( InputArray _src, OutputArray _dst)
 
             for( j = j0; j < j1; j += 2 )
             {
-                dataDst[j] = dataSrc[j]*dataSrc[j] + dataSrc[j+1]*dataSrc[j+1];
+                dataDst[j] = std::sqrt(dataSrc[j]*dataSrc[j] + dataSrc[j+1]*dataSrc[j+1]);
             }
         }
     }
-
-    // do batch sqrt to use SSE optimizations...
-    cv::sqrt(dst, dst);
 }
 
 static void divSpectrums( InputArray _srcA, InputArray _srcB, OutputArray _dst, int flags, bool conjB)
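magSpectrums() now takes the square root per element inside the loops instead of squaring everything and running one batch cv::sqrt() pass over the result afterwards. The quantity it produces is the ordinary magnitude spectrum; on a full complex spectrum the equivalent with the public API looks roughly like this (a sketch, not the packed-CCS code the function itself operates on):

    #include <opencv2/core/core.hpp>
    #include <vector>

    // src: single-channel CV_32F image; mag receives sqrt(Re^2 + Im^2) per frequency
    static void magnitudeSpectrum( const cv::Mat& src, cv::Mat& mag )
    {
        cv::Mat spec;
        cv::dft(src, spec, cv::DFT_COMPLEX_OUTPUT);
        std::vector<cv::Mat> ch;
        cv::split(spec, ch);
        cv::magnitude(ch[0], ch[1], mag);
    }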
@@ -197,9 +194,9 @@ static void divSpectrums( InputArray _srcA, InputArray _srcB, OutputArray _dst,
         {
             if( k == 1 )
                 dataA += cols - 1, dataB += cols - 1, dataC += cols - 1;
-            dataC[0] = dataA[0] / dataB[0];
+            dataC[0] = dataA[0] / (dataB[0] + eps);
             if( rows % 2 == 0 )
-                dataC[(rows-1)*stepC] = dataA[(rows-1)*stepA] / dataB[(rows-1)*stepB];
+                dataC[(rows-1)*stepC] = dataA[(rows-1)*stepA] / (dataB[(rows-1)*stepB] + eps);
             if( !conjB )
                 for( j = 1; j <= rows - 2; j += 2 )
                 {
@@ -240,9 +237,9 @@ static void divSpectrums( InputArray _srcA, InputArray _srcB, OutputArray _dst,
         {
             if( is_1d && cn == 1 )
             {
-                dataC[0] = dataA[0] / dataB[0];
+                dataC[0] = dataA[0] / (dataB[0] + eps);
                 if( cols % 2 == 0 )
-                    dataC[j1] = dataA[j1] / dataB[j1];
+                    dataC[j1] = dataA[j1] / (dataB[j1] + eps);
             }
 
             if( !conjB )
@@ -282,9 +279,9 @@ static void divSpectrums( InputArray _srcA, InputArray _srcB, OutputArray _dst,
         {
             if( k == 1 )
                 dataA += cols - 1, dataB += cols - 1, dataC += cols - 1;
-            dataC[0] = dataA[0] / dataB[0];
+            dataC[0] = dataA[0] / (dataB[0] + eps);
             if( rows % 2 == 0 )
-                dataC[(rows-1)*stepC] = dataA[(rows-1)*stepA] / dataB[(rows-1)*stepB];
+                dataC[(rows-1)*stepC] = dataA[(rows-1)*stepA] / (dataB[(rows-1)*stepB] + eps);
             if( !conjB )
                 for( j = 1; j <= rows - 2; j += 2 )
                 {
@@ -324,9 +321,9 @@ static void divSpectrums( InputArray _srcA, InputArray _srcB, OutputArray _dst,
         {
             if( is_1d && cn == 1 )
             {
-                dataC[0] = dataA[0] / dataB[0];
+                dataC[0] = dataA[0] / (dataB[0] + eps);
                 if( cols % 2 == 0 )
-                    dataC[j1] = dataA[j1] / dataB[j1];
+                    dataC[j1] = dataA[j1] / (dataB[j1] + eps);
             }
 
             if( !conjB )
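Every division in divSpectrums() now adds a small eps to the denominator, so frequencies where the cross-power spectrum magnitude is exactly zero no longer produce inf/NaN in the phase-correlation surface. The same normalization expressed with the public API on full complex spectra (a sketch; the helper name and the use of DFT_COMPLEX_OUTPUT rather than the packed CCS layout are illustrative):

    #include <opencv2/core/core.hpp>
    #include <cfloat>
    #include <vector>

    // a, b: same-size single-channel CV_32F images; r receives P / (|P| + eps),
    // where P = DFT(a) .* conj(DFT(b)), i.e. the normalized cross-power spectrum.
    static void crossPowerSpectrum( const cv::Mat& a, const cv::Mat& b, cv::Mat& r )
    {
        cv::Mat A, B, P, mag;
        cv::dft(a, A, cv::DFT_COMPLEX_OUTPUT);
        cv::dft(b, B, cv::DFT_COMPLEX_OUTPUT);
        cv::mulSpectrums(A, B, P, 0, true);        // conjB = true
        std::vector<cv::Mat> ch;
        cv::split(P, ch);
        cv::magnitude(ch[0], ch[1], mag);
        mag += FLT_EPSILON;                        // the regularization added above
        cv::divide(ch[0], mag, ch[0]);
        cv::divide(ch[1], mag, ch[1]);
        cv::merge(ch, r);
    }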
@@ -355,31 +352,57 @@ static void divSpectrums( InputArray _srcA, InputArray _srcB, OutputArray _dst,
 
 static void fftShift(InputOutputArray _out)
 {
     Mat out = _out.getMat();
 
+    if(out.rows == 1 && out.cols == 1)
+    {
+        // trivially shifted.
+        return;
+    }
+
     vector<Mat> planes;
     split(out, planes);
 
     int xMid = out.cols >> 1;
     int yMid = out.rows >> 1;
 
-    for(size_t i = 0; i < planes.size(); i++)
+    bool is_1d = xMid == 0 || yMid == 0;
+
+    if(is_1d)
     {
-        // perform quadrant swaps...
-        Mat tmp;
-        Mat q0(planes[i], Rect(0, 0, xMid, yMid));
-        Mat q1(planes[i], Rect(xMid, 0, xMid, yMid));
-        Mat q2(planes[i], Rect(0, yMid, xMid, yMid));
-        Mat q3(planes[i], Rect(xMid, yMid, xMid, yMid));
-
-        q0.copyTo(tmp);
-        q3.copyTo(q0);
-        tmp.copyTo(q3);
-
-        q1.copyTo(tmp);
-        q2.copyTo(q1);
-        tmp.copyTo(q2);
+        xMid = xMid + yMid;
+
+        for(size_t i = 0; i < planes.size(); i++)
+        {
+            Mat tmp;
+            Mat half0(planes[i], Rect(0, 0, xMid, 1));
+            Mat half1(planes[i], Rect(xMid, 0, xMid, 1));
+
+            half0.copyTo(tmp);
+            half1.copyTo(half0);
+            tmp.copyTo(half1);
+        }
+    }
+    else
+    {
+        for(size_t i = 0; i < planes.size(); i++)
+        {
+            // perform quadrant swaps...
+            Mat tmp;
+            Mat q0(planes[i], Rect(0, 0, xMid, yMid));
+            Mat q1(planes[i], Rect(xMid, 0, xMid, yMid));
+            Mat q2(planes[i], Rect(0, yMid, xMid, yMid));
+            Mat q3(planes[i], Rect(xMid, yMid, xMid, yMid));
+
+            q0.copyTo(tmp);
+            q3.copyTo(q0);
+            tmp.copyTo(q3);
+
+            q1.copyTo(tmp);
+            q2.copyTo(q1);
+            tmp.copyTo(q2);
+        }
+    }
 
     merge(planes, out);
 }
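With the early return for 1x1 arrays and the explicit single-row/single-column branch, fftShift() no longer builds zero-height quadrant ROIs for 1-D data; it just swaps the two halves. That makes phase correlation of 1-D signals viable, assuming the rest of phaseCorrelate() accepts single-row input, which the is_1d handling in this file suggests. An illustrative sketch (shift value and sizes are arbitrary):

    #include <opencv2/core/core.hpp>
    #include <opencv2/imgproc/imgproc.hpp>
    #include <cstdio>

    int main()
    {
        cv::Mat a(1, 256, CV_32F), b(1, 256, CV_32F);
        cv::randu(a, cv::Scalar(0), cv::Scalar(1));

        // b = a circularly shifted by 5 samples
        a.colRange(0, 251).copyTo(b.colRange(5, 256));
        a.colRange(251, 256).copyTo(b.colRange(0, 5));

        // phaseCorrelate() uses magSpectrums/divSpectrums/fftShift internally;
        // the recovered shift should be about 5 samples (sign depends on convention)
        cv::Point2d shift = cv::phaseCorrelate(a, b);
        printf("shift = (%.2f, %.2f)\n", shift.x, shift.y);
        return 0;
    }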
