|
|
|
@@ -43,6 +43,7 @@
 #include "precomp.hpp"
 #include "opencl_kernels_imgproc.hpp"
+#include "opencv2/core/hal/intrin.hpp"
 
 namespace cv
 {
 
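The new header pulls in OpenCV's universal intrinsics: one portable register type (v_float32x4 and friends) that compiles down to SSE on x86 and NEON on ARM, which is what lets the per-ISA branches below collapse into a single implementation. A minimal standalone sketch of the idiom (not part of the patch; assumes an OpenCV build where intrin.hpp defines CV_SIMD128):

    #include "opencv2/core/hal/intrin.hpp"

    #if CV_SIMD128
    // Scales n floats by s; the same source compiles to SSE or NEON.
    static void scaleRow(const float* src, float* dst, int n, float s)
    {
        cv::v_float32x4 v_s = cv::v_setall_f32(s);  // broadcast s to all 4 lanes
        int j = 0;
        for( ; j <= n - cv::v_float32x4::nlanes; j += cv::v_float32x4::nlanes )
            cv::v_store(dst + j, cv::v_load(src + j) * v_s);
        for( ; j < n; j++ )                         // scalar tail
            dst[j] = src[j] * s;
    }
    #endif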
|
|
|
@@ -51,8 +52,8 @@ static void calcMinEigenVal( const Mat& _cov, Mat& _dst )
 {
     int i, j;
     Size size = _cov.size();
-#if CV_SSE
-    volatile bool simd = checkHardwareSupport(CV_CPU_SSE);
+#if CV_SIMD128
+    bool simd = hasSIMD128();
 #endif
 
     if( _cov.isContinuous() && _dst.isContinuous() )
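Besides switching the guard, the runtime test changes from the ISA-specific checkHardwareSupport(CV_CPU_SSE) (with its volatile workaround) to the backend-neutral hasSIMD128(). A standalone sketch of the dispatch pattern the file now uses everywhere (illustrative helper, not from the patch):

    #include "opencv2/core/hal/intrin.hpp"

    // Compile-time guard plus runtime check, replacing per-ISA branches.
    static bool useVectorPath()
    {
    #if CV_SIMD128
        return cv::hasSIMD128();  // can be false, e.g. after cv::setUseOptimized(false)
    #else
        return false;             // no 128-bit backend compiled in: scalar loop only
    #endif
    }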
|
|
|
@@ -66,44 +67,21 @@ static void calcMinEigenVal( const Mat& _cov, Mat& _dst )
         const float* cov = _cov.ptr<float>(i);
         float* dst = _dst.ptr<float>(i);
         j = 0;
-    #if CV_SSE
+    #if CV_SIMD128
         if( simd )
         {
-            __m128 half = _mm_set1_ps(0.5f);
-            for( ; j <= size.width - 4; j += 4 )
+            v_float32x4 half = v_setall_f32(0.5f);
+            for( ; j <= size.width - v_float32x4::nlanes; j += v_float32x4::nlanes )
             {
-                __m128 t0 = _mm_loadu_ps(cov + j*3); // a0 b0 c0 x
-                __m128 t1 = _mm_loadu_ps(cov + j*3 + 3); // a1 b1 c1 x
-                __m128 t2 = _mm_loadu_ps(cov + j*3 + 6); // a2 b2 c2 x
-                __m128 t3 = _mm_loadu_ps(cov + j*3 + 9); // a3 b3 c3 x
-                __m128 a, b, c, t;
-                t = _mm_unpacklo_ps(t0, t1); // a0 a1 b0 b1
-                c = _mm_unpackhi_ps(t0, t1); // c0 c1 x x
-                b = _mm_unpacklo_ps(t2, t3); // a2 a3 b2 b3
-                c = _mm_movelh_ps(c, _mm_unpackhi_ps(t2, t3)); // c0 c1 c2 c3
-                a = _mm_movelh_ps(t, b);
-                b = _mm_movehl_ps(b, t);
-                a = _mm_mul_ps(a, half);
-                c = _mm_mul_ps(c, half);
-                t = _mm_sub_ps(a, c);
-                t = _mm_add_ps(_mm_mul_ps(t, t), _mm_mul_ps(b,b));
-                a = _mm_sub_ps(_mm_add_ps(a, c), _mm_sqrt_ps(t));
-                _mm_storeu_ps(dst + j, a);
+                v_float32x4 v_a, v_b, v_c, v_t;
+                v_load_deinterleave(cov + j*3, v_a, v_b, v_c);
+                v_a *= half;
+                v_c *= half;
+                v_t = v_a - v_c;
+                v_t = v_muladd(v_b, v_b, (v_t * v_t));
+                v_store(dst + j, (v_a + v_c) - v_sqrt(v_t));
             }
         }
-    #elif CV_NEON
-        float32x4_t v_half = vdupq_n_f32(0.5f);
-        for( ; j <= size.width - 4; j += 4 )
-        {
-            float32x4x3_t v_src = vld3q_f32(cov + j * 3);
-            float32x4_t v_a = vmulq_f32(v_src.val[0], v_half);
-            float32x4_t v_b = v_src.val[1];
-            float32x4_t v_c = vmulq_f32(v_src.val[2], v_half);
-
-            float32x4_t v_t = vsubq_f32(v_a, v_c);
-            v_t = vmlaq_f32(vmulq_f32(v_t, v_t), v_b, v_b);
-            vst1q_f32(dst + j, vsubq_f32(vaddq_f32(v_a, v_c), cv_vsqrtq_f32(v_t)));
-        }
     #endif
         for( ; j < size.width; j++ )
         {
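cov packs the 2x2 gradient-covariance matrix M = [a b; b c] as interleaved (a, b, c) triples, and the loop computes its smaller eigenvalue, lambda_min = (a + c)/2 - sqrt(((a - c)/2)^2 + b^2). A single v_load_deinterleave replaces both the SSE unpack/movelh/movehl shuffle chain and NEON's vld3q_f32 for splitting the triples into separate a, b, c registers. Scalar reference for what each lane computes (standalone sketch mirroring the scalar tail that follows this loop in the file):

    #include <cmath>

    // One pixel of calcMinEigenVal; a and c are pre-halved exactly as in the
    // vector body, so the result is (a+c)/2 - sqrt(((a-c)/2)^2 + b^2).
    static float minEigenValScalar(const float* cov3)  // cov3 = (a, b, c)
    {
        float a = cov3[0] * 0.5f;
        float b = cov3[1];
        float c = cov3[2] * 0.5f;
        return (a + c) - std::sqrt((a - c)*(a - c) + b*b);
    }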
|
|
|
@@ -120,8 +98,8 @@ static void calcHarris( const Mat& _cov, Mat& _dst, double k )
 {
     int i, j;
     Size size = _cov.size();
-#if CV_SSE
-    volatile bool simd = checkHardwareSupport(CV_CPU_SSE);
+#if CV_SIMD128
+    bool simd = hasSIMD128();
 #endif
 
     if( _cov.isContinuous() && _dst.isContinuous() )
|
|
|
@@ -136,40 +114,21 @@ static void calcHarris( const Mat& _cov, Mat& _dst, double k )
         float* dst = _dst.ptr<float>(i);
         j = 0;
 
-    #if CV_SSE
+    #if CV_SIMD128
         if( simd )
         {
-            __m128 k4 = _mm_set1_ps((float)k);
-            for( ; j <= size.width - 4; j += 4 )
+            v_float32x4 v_k = v_setall_f32((float)k);
+
+            for( ; j <= size.width - v_float32x4::nlanes; j += v_float32x4::nlanes )
             {
-                __m128 t0 = _mm_loadu_ps(cov + j*3); // a0 b0 c0 x
-                __m128 t1 = _mm_loadu_ps(cov + j*3 + 3); // a1 b1 c1 x
-                __m128 t2 = _mm_loadu_ps(cov + j*3 + 6); // a2 b2 c2 x
-                __m128 t3 = _mm_loadu_ps(cov + j*3 + 9); // a3 b3 c3 x
-                __m128 a, b, c, t;
-                t = _mm_unpacklo_ps(t0, t1); // a0 a1 b0 b1
-                c = _mm_unpackhi_ps(t0, t1); // c0 c1 x x
-                b = _mm_unpacklo_ps(t2, t3); // a2 a3 b2 b3
-                c = _mm_movelh_ps(c, _mm_unpackhi_ps(t2, t3)); // c0 c1 c2 c3
-                a = _mm_movelh_ps(t, b);
-                b = _mm_movehl_ps(b, t);
-                t = _mm_add_ps(a, c);
-                a = _mm_sub_ps(_mm_mul_ps(a, c), _mm_mul_ps(b, b));
-                t = _mm_mul_ps(_mm_mul_ps(k4, t), t);
-                a = _mm_sub_ps(a, t);
-                _mm_storeu_ps(dst + j, a);
-            }
-        }
-    #elif CV_NEON
-        float32x4_t v_k = vdupq_n_f32((float)k);
+                v_float32x4 v_a, v_b, v_c;
+                v_load_deinterleave(cov + j * 3, v_a, v_b, v_c);
 
-        for( ; j <= size.width - 4; j += 4 )
-        {
-            float32x4x3_t v_src = vld3q_f32(cov + j * 3);
-            float32x4_t v_a = v_src.val[0], v_b = v_src.val[1], v_c = v_src.val[2];
-            float32x4_t v_ac_bb = vmlsq_f32(vmulq_f32(v_a, v_c), v_b, v_b);
-            float32x4_t v_ac = vaddq_f32(v_a, v_c);
-            vst1q_f32(dst + j, vmlsq_f32(v_ac_bb, v_k, vmulq_f32(v_ac, v_ac)));
+                v_float32x4 v_ac_bb = v_a * v_c - v_b * v_b;
+                v_float32x4 v_ac = v_a + v_c;
+                v_float32x4 v_dst = v_ac_bb - v_k * v_ac * v_ac;
+                v_store(dst + j, v_dst);
+            }
         }
     #endif
 
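Same rewrite for the Harris branch: deinterleave (a, b, c), then evaluate the response det(M) - k*trace(M)^2 with plain vector operators instead of per-ISA intrinsics. Scalar reference for one lane (standalone sketch matching the scalar tail in the file):

    // One pixel of calcHarris: det(M) - k*trace(M)^2 for M = [a b; b c].
    static float harrisScalar(const float* cov3, float k)  // cov3 = (a, b, c)
    {
        float a = cov3[0], b = cov3[1], c = cov3[2];
        return a*c - b*b - k*(a + c)*(a + c);
    }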
|
|
|
|
|
|
|
|
@@ -272,8 +231,8 @@ cornerEigenValsVecs( const Mat& src, Mat& eigenv, int block_size,
     if (tegra::useTegra() && tegra::cornerEigenValsVecs(src, eigenv, block_size, aperture_size, op_type, k, borderType))
         return;
 #endif
-#if CV_SSE2
-    bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
+#if CV_SIMD128
+    bool haveSimd = hasSIMD128();
 #endif
 
     int depth = src.depth();
|
|
|
@@ -309,44 +268,20 @@ cornerEigenValsVecs( const Mat& src, Mat& eigenv, int block_size,
         const float* dydata = Dy.ptr<float>(i);
         j = 0;
 
-#if CV_NEON
-        for( ; j <= size.width - 4; j += 4 )
+#if CV_SIMD128
+        if (haveSimd)
         {
-            float32x4_t v_dx = vld1q_f32(dxdata + j);
-            float32x4_t v_dy = vld1q_f32(dydata + j);
+            for( ; j <= size.width - v_float32x4::nlanes; j += v_float32x4::nlanes )
+            {
+                v_float32x4 v_dx = v_load(dxdata + j);
+                v_float32x4 v_dy = v_load(dydata + j);
 
-            float32x4x3_t v_dst;
-            v_dst.val[0] = vmulq_f32(v_dx, v_dx);
-            v_dst.val[1] = vmulq_f32(v_dx, v_dy);
-            v_dst.val[2] = vmulq_f32(v_dy, v_dy);
+                v_float32x4 v_dst0, v_dst1, v_dst2;
+                v_dst0 = v_dx * v_dx;
+                v_dst1 = v_dx * v_dy;
+                v_dst2 = v_dy * v_dy;
 
-            vst3q_f32(cov_data + j * 3, v_dst);
-        }
-#elif CV_SSE2
-        if (haveSSE2)
-        {
-            for( ; j <= size.width - 8; j += 8 )
-            {
-                __m128 v_dx_0 = _mm_loadu_ps(dxdata + j);
-                __m128 v_dx_1 = _mm_loadu_ps(dxdata + j + 4);
-                __m128 v_dy_0 = _mm_loadu_ps(dydata + j);
-                __m128 v_dy_1 = _mm_loadu_ps(dydata + j + 4);
-
-                __m128 v_dx2_0 = _mm_mul_ps(v_dx_0, v_dx_0);
-                __m128 v_dxy_0 = _mm_mul_ps(v_dx_0, v_dy_0);
-                __m128 v_dy2_0 = _mm_mul_ps(v_dy_0, v_dy_0);
-                __m128 v_dx2_1 = _mm_mul_ps(v_dx_1, v_dx_1);
-                __m128 v_dxy_1 = _mm_mul_ps(v_dx_1, v_dy_1);
-                __m128 v_dy2_1 = _mm_mul_ps(v_dy_1, v_dy_1);
-
-                _mm_interleave_ps(v_dx2_0, v_dx2_1, v_dxy_0, v_dxy_1, v_dy2_0, v_dy2_1);
-
-                _mm_storeu_ps(cov_data + j * 3, v_dx2_0);
-                _mm_storeu_ps(cov_data + j * 3 + 4, v_dx2_1);
-                _mm_storeu_ps(cov_data + j * 3 + 8, v_dxy_0);
-                _mm_storeu_ps(cov_data + j * 3 + 12, v_dxy_1);
-                _mm_storeu_ps(cov_data + j * 3 + 16, v_dy2_0);
-                _mm_storeu_ps(cov_data + j * 3 + 20, v_dy2_1);
+                v_store_interleave(cov_data + j * 3, v_dst0, v_dst1, v_dst2);
             }
         }
 #endif
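v_store_interleave is the write-side counterpart of v_load_deinterleave: one call covers NEON's vst3q_f32 and the SSE2 path's _mm_interleave_ps plus six separate stores. Note the SSE2 path also processed 8 pixels per iteration; the universal version does one register width (nlanes) at a time. Standalone sketch of the pattern (not from the patch):

    #include "opencv2/core/hal/intrin.hpp"

    #if CV_SIMD128
    // Pack per-pixel (dx*dx, dx*dy, dy*dy) triples, as the loop above does.
    static void packCovRow(const float* dx, const float* dy, float* cov, int n)
    {
        int j = 0;
        for( ; j <= n - cv::v_float32x4::nlanes; j += cv::v_float32x4::nlanes )
        {
            cv::v_float32x4 v_dx = cv::v_load(dx + j);
            cv::v_float32x4 v_dy = cv::v_load(dy + j);
            cv::v_store_interleave(cov + j*3, v_dx*v_dx, v_dx*v_dy, v_dy*v_dy);
        }
        for( ; j < n; j++ )   // scalar tail
        {
            cov[j*3]     = dx[j]*dx[j];
            cov[j*3 + 1] = dx[j]*dy[j];
            cov[j*3 + 2] = dy[j]*dy[j];
        }
    }
    #endif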
|
|
|
@@ -751,13 +686,10 @@ void cv::preCornerDetect( InputArray _src, OutputArray _dst, int ksize, int bord
     if( src.depth() == CV_8U )
         factor *= 255;
     factor = 1./(factor * factor * factor);
-#if CV_NEON || CV_SSE2
+#if CV_SIMD128
     float factor_f = (float)factor;
-#endif
-
-#if CV_SSE2
-    volatile bool haveSSE2 = cv::checkHardwareSupport(CV_CPU_SSE2);
-    __m128 v_factor = _mm_set1_ps(factor_f), v_m2 = _mm_set1_ps(-2.0f);
+    bool haveSimd = hasSIMD128();
+    v_float32x4 v_factor = v_setall_f32(factor_f), v_m2 = v_setall_f32(-2.0f);
 #endif
 
     Size size = src.size();
|
|
|
@@ -773,30 +705,21 @@ void cv::preCornerDetect( InputArray _src, OutputArray _dst, int ksize, int bord
 
         j = 0;
 
-#if CV_SSE2
-        if (haveSSE2)
+#if CV_SIMD128
+        if (haveSimd)
         {
-            for( ; j <= size.width - 4; j += 4 )
+            for( ; j <= size.width - v_float32x4::nlanes; j += v_float32x4::nlanes )
             {
-                __m128 v_dx = _mm_loadu_ps((const float *)(dxdata + j));
-                __m128 v_dy = _mm_loadu_ps((const float *)(dydata + j));
-
-                __m128 v_s1 = _mm_mul_ps(_mm_mul_ps(v_dx, v_dx), _mm_loadu_ps((const float *)(d2ydata + j)));
-                __m128 v_s2 = _mm_mul_ps(_mm_mul_ps(v_dy, v_dy), _mm_loadu_ps((const float *)(d2xdata + j)));
-                __m128 v_s3 = _mm_mul_ps(_mm_mul_ps(v_dx, v_dy), _mm_loadu_ps((const float *)(dxydata + j)));
-                v_s1 = _mm_mul_ps(v_factor, _mm_add_ps(v_s1, _mm_add_ps(v_s2, _mm_mul_ps(v_s3, v_m2))));
-                _mm_storeu_ps(dstdata + j, v_s1);
+                v_float32x4 v_dx = v_load(dxdata + j);
+                v_float32x4 v_dy = v_load(dydata + j);
+
+                v_float32x4 v_s1 = (v_dx * v_dx) * v_load(d2ydata + j);
+                v_float32x4 v_s2 = v_muladd((v_dy * v_dy), v_load(d2xdata + j), v_s1);
+                v_float32x4 v_s3 = v_muladd((v_dy * v_dx) * v_load(dxydata + j), v_m2, v_s2);
+
+                v_store(dstdata + j, v_s3 * v_factor);
             }
         }
-#elif CV_NEON
-        for( ; j <= size.width - 4; j += 4 )
-        {
-            float32x4_t v_dx = vld1q_f32(dxdata + j), v_dy = vld1q_f32(dydata + j);
-            float32x4_t v_s = vmulq_f32(v_dx, vmulq_f32(v_dx, vld1q_f32(d2ydata + j)));
-            v_s = vmlaq_f32(v_s, vld1q_f32(d2xdata + j), vmulq_f32(v_dy, v_dy));
-            v_s = vmlaq_f32(v_s, vld1q_f32(dxydata + j), vmulq_n_f32(vmulq_f32(v_dy, v_dx), -2));
-            vst1q_f32(dstdata + j, vmulq_n_f32(v_s, factor_f));
-        }
 #endif
 
         for( ; j < size.width; j++ )
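In the rewritten preCornerDetect loop, v_muladd(a, b, c) computes a*b + c per lane, so the chain accumulates dx*dx*d2y + dy*dy*d2x - 2*dx*dy*dxy (v_m2 carries the -2), and the scale by v_factor moves from the first operand to the final store. Scalar reference for one lane (standalone sketch matching the scalar tail):

    // One pixel of preCornerDetect's vector loop.
    static float preCornerScalar(float dx, float dy, float d2x, float d2y,
                                 float dxy, float factor)
    {
        return factor * (dx*dx*d2y + dy*dy*d2x - 2.f*dx*dy*dxy);
    }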
|
|
|
|