diff --git a/modules/features2d/src/fast_score.cpp b/modules/features2d/src/fast_score.cpp index de697b7c90..9076e0b53a 100644 --- a/modules/features2d/src/fast_score.cpp +++ b/modules/features2d/src/fast_score.cpp @@ -42,7 +42,7 @@ The references are: */ #include "fast_score.hpp" - +#include "opencv2/core/hal/intrin.hpp" #define VERIFY_CORNERS 0 namespace cv { @@ -125,80 +125,83 @@ int cornerScore<16>(const uchar* ptr, const int pixel[], int threshold) for( k = 0; k < N; k++ ) d[k] = (short)(v - ptr[pixel[k]]); -#if CV_SSE2 - __m128i q0 = _mm_set1_epi16(-1000), q1 = _mm_set1_epi16(1000); - for( k = 0; k < 16; k += 8 ) +#if CV_SIMD128 + if (hasSIMD128()) { - __m128i v0 = _mm_loadu_si128((__m128i*)(d+k+1)); - __m128i v1 = _mm_loadu_si128((__m128i*)(d+k+2)); - __m128i a = _mm_min_epi16(v0, v1); - __m128i b = _mm_max_epi16(v0, v1); - v0 = _mm_loadu_si128((__m128i*)(d+k+3)); - a = _mm_min_epi16(a, v0); - b = _mm_max_epi16(b, v0); - v0 = _mm_loadu_si128((__m128i*)(d+k+4)); - a = _mm_min_epi16(a, v0); - b = _mm_max_epi16(b, v0); - v0 = _mm_loadu_si128((__m128i*)(d+k+5)); - a = _mm_min_epi16(a, v0); - b = _mm_max_epi16(b, v0); - v0 = _mm_loadu_si128((__m128i*)(d+k+6)); - a = _mm_min_epi16(a, v0); - b = _mm_max_epi16(b, v0); - v0 = _mm_loadu_si128((__m128i*)(d+k+7)); - a = _mm_min_epi16(a, v0); - b = _mm_max_epi16(b, v0); - v0 = _mm_loadu_si128((__m128i*)(d+k+8)); - a = _mm_min_epi16(a, v0); - b = _mm_max_epi16(b, v0); - v0 = _mm_loadu_si128((__m128i*)(d+k)); - q0 = _mm_max_epi16(q0, _mm_min_epi16(a, v0)); - q1 = _mm_min_epi16(q1, _mm_max_epi16(b, v0)); - v0 = _mm_loadu_si128((__m128i*)(d+k+9)); - q0 = _mm_max_epi16(q0, _mm_min_epi16(a, v0)); - q1 = _mm_min_epi16(q1, _mm_max_epi16(b, v0)); + v_int16x8 q0 = v_setall_s16(-1000), q1 = v_setall_s16(1000); + for (k = 0; k < 16; k += 8) + { + v_int16x8 v0 = v_load(d + k + 1); + v_int16x8 v1 = v_load(d + k + 2); + v_int16x8 a = v_min(v0, v1); + v_int16x8 b = v_max(v0, v1); + v0 = v_load(d + k + 3); + a = v_min(a, v0); + b = v_max(b, v0); + v0 = v_load(d + k + 4); + a = v_min(a, v0); + b = v_max(b, v0); + v0 = v_load(d + k + 5); + a = v_min(a, v0); + b = v_max(b, v0); + v0 = v_load(d + k + 6); + a = v_min(a, v0); + b = v_max(b, v0); + v0 = v_load(d + k + 7); + a = v_min(a, v0); + b = v_max(b, v0); + v0 = v_load(d + k + 8); + a = v_min(a, v0); + b = v_max(b, v0); + v0 = v_load(d + k); + q0 = v_max(q0, v_min(a, v0)); + q1 = v_min(q1, v_max(b, v0)); + v0 = v_load(d + k + 9); + q0 = v_max(q0, v_min(a, v0)); + q1 = v_min(q1, v_max(b, v0)); + } + q0 = v_max(q0, v_setzero_s16() - q1); + threshold = v_reduce_max(q0) - 1; } - q0 = _mm_max_epi16(q0, _mm_sub_epi16(_mm_setzero_si128(), q1)); - q0 = _mm_max_epi16(q0, _mm_unpackhi_epi64(q0, q0)); - q0 = _mm_max_epi16(q0, _mm_srli_si128(q0, 4)); - q0 = _mm_max_epi16(q0, _mm_srli_si128(q0, 2)); - threshold = (short)_mm_cvtsi128_si32(q0) - 1; -#else - int a0 = threshold; - for( k = 0; k < 16; k += 2 ) + else +#endif { - int a = std::min((int)d[k+1], (int)d[k+2]); - a = std::min(a, (int)d[k+3]); - if( a <= a0 ) - continue; - a = std::min(a, (int)d[k+4]); - a = std::min(a, (int)d[k+5]); - a = std::min(a, (int)d[k+6]); - a = std::min(a, (int)d[k+7]); - a = std::min(a, (int)d[k+8]); - a0 = std::max(a0, std::min(a, (int)d[k])); - a0 = std::max(a0, std::min(a, (int)d[k+9])); - } - int b0 = -a0; - for( k = 0; k < 16; k += 2 ) - { - int b = std::max((int)d[k+1], (int)d[k+2]); - b = std::max(b, (int)d[k+3]); - b = std::max(b, (int)d[k+4]); - b = std::max(b, (int)d[k+5]); - if( b >= b0 ) - continue; - b = std::max(b, (int)d[k+6]); - b = std::max(b, (int)d[k+7]); - b = std::max(b, (int)d[k+8]); - - b0 = std::min(b0, std::max(b, (int)d[k])); - b0 = std::min(b0, std::max(b, (int)d[k+9])); - } + int a0 = threshold; + for( k = 0; k < 16; k += 2 ) + { + int a = std::min((int)d[k+1], (int)d[k+2]); + a = std::min(a, (int)d[k+3]); + if( a <= a0 ) + continue; + a = std::min(a, (int)d[k+4]); + a = std::min(a, (int)d[k+5]); + a = std::min(a, (int)d[k+6]); + a = std::min(a, (int)d[k+7]); + a = std::min(a, (int)d[k+8]); + a0 = std::max(a0, std::min(a, (int)d[k])); + a0 = std::max(a0, std::min(a, (int)d[k+9])); + } - threshold = -b0-1; -#endif + int b0 = -a0; + for( k = 0; k < 16; k += 2 ) + { + int b = std::max((int)d[k+1], (int)d[k+2]); + b = std::max(b, (int)d[k+3]); + b = std::max(b, (int)d[k+4]); + b = std::max(b, (int)d[k+5]); + if( b >= b0 ) + continue; + b = std::max(b, (int)d[k+6]); + b = std::max(b, (int)d[k+7]); + b = std::max(b, (int)d[k+8]); + + b0 = std::min(b0, std::max(b, (int)d[k])); + b0 = std::min(b0, std::max(b, (int)d[k+9])); + } + + threshold = -b0 - 1; + } #if VERIFY_CORNERS testCorner(ptr, pixel, K, N, threshold); @@ -214,76 +217,77 @@ int cornerScore<12>(const uchar* ptr, const int pixel[], int threshold) short d[N + 4]; for( k = 0; k < N; k++ ) d[k] = (short)(v - ptr[pixel[k]]); -#if CV_SSE2 +#if CV_SIMD128 for( k = 0; k < 4; k++ ) d[N+k] = d[k]; #endif -#if CV_SSE2 - __m128i q0 = _mm_set1_epi16(-1000), q1 = _mm_set1_epi16(1000); - for( k = 0; k < 16; k += 8 ) +#if CV_SIMD128 + if (hasSIMD128()) { - __m128i v0 = _mm_loadu_si128((__m128i*)(d+k+1)); - __m128i v1 = _mm_loadu_si128((__m128i*)(d+k+2)); - __m128i a = _mm_min_epi16(v0, v1); - __m128i b = _mm_max_epi16(v0, v1); - v0 = _mm_loadu_si128((__m128i*)(d+k+3)); - a = _mm_min_epi16(a, v0); - b = _mm_max_epi16(b, v0); - v0 = _mm_loadu_si128((__m128i*)(d+k+4)); - a = _mm_min_epi16(a, v0); - b = _mm_max_epi16(b, v0); - v0 = _mm_loadu_si128((__m128i*)(d+k+5)); - a = _mm_min_epi16(a, v0); - b = _mm_max_epi16(b, v0); - v0 = _mm_loadu_si128((__m128i*)(d+k+6)); - a = _mm_min_epi16(a, v0); - b = _mm_max_epi16(b, v0); - v0 = _mm_loadu_si128((__m128i*)(d+k)); - q0 = _mm_max_epi16(q0, _mm_min_epi16(a, v0)); - q1 = _mm_min_epi16(q1, _mm_max_epi16(b, v0)); - v0 = _mm_loadu_si128((__m128i*)(d+k+7)); - q0 = _mm_max_epi16(q0, _mm_min_epi16(a, v0)); - q1 = _mm_min_epi16(q1, _mm_max_epi16(b, v0)); - } - q0 = _mm_max_epi16(q0, _mm_sub_epi16(_mm_setzero_si128(), q1)); - q0 = _mm_max_epi16(q0, _mm_unpackhi_epi64(q0, q0)); - q0 = _mm_max_epi16(q0, _mm_srli_si128(q0, 4)); - q0 = _mm_max_epi16(q0, _mm_srli_si128(q0, 2)); - threshold = (short)_mm_cvtsi128_si32(q0) - 1; -#else - int a0 = threshold; - for( k = 0; k < 12; k += 2 ) - { - int a = std::min((int)d[k+1], (int)d[k+2]); - if( a <= a0 ) - continue; - a = std::min(a, (int)d[k+3]); - a = std::min(a, (int)d[k+4]); - a = std::min(a, (int)d[k+5]); - a = std::min(a, (int)d[k+6]); - a0 = std::max(a0, std::min(a, (int)d[k])); - a0 = std::max(a0, std::min(a, (int)d[k+7])); + v_int16x8 q0 = v_setall_s16(-1000), q1 = v_setall_s16(1000); + for (k = 0; k < 16; k += 8) + { + v_int16x8 v0 = v_load(d + k + 1); + v_int16x8 v1 = v_load(d + k + 2); + v_int16x8 a = v_min(v0, v1); + v_int16x8 b = v_max(v0, v1); + v0 = v_load(d + k + 3); + a = v_min(a, v0); + b = v_max(b, v0); + v0 = v_load(d + k + 4); + a = v_min(a, v0); + b = v_max(b, v0); + v0 = v_load(d + k + 5); + a = v_min(a, v0); + b = v_max(b, v0); + v0 = v_load(d + k + 6); + a = v_min(a, v0); + b = v_max(b, v0); + v0 = v_load(d + k); + q0 = v_max(q0, v_min(a, v0)); + q1 = v_min(q1, v_max(b, v0)); + v0 = v_load(d + k + 7); + q0 = v_max(q0, v_min(a, v0)); + q1 = v_min(q1, v_max(b, v0)); + } + q0 = v_max(q0, v_setzero_s16() - q1); + threshold = v_reduce_max(q0) - 1; } - - int b0 = -a0; - for( k = 0; k < 12; k += 2 ) + else +#endif { - int b = std::max((int)d[k+1], (int)d[k+2]); - b = std::max(b, (int)d[k+3]); - b = std::max(b, (int)d[k+4]); - if( b >= b0 ) - continue; - b = std::max(b, (int)d[k+5]); - b = std::max(b, (int)d[k+6]); - - b0 = std::min(b0, std::max(b, (int)d[k])); - b0 = std::min(b0, std::max(b, (int)d[k+7])); - } + int a0 = threshold; + for( k = 0; k < 12; k += 2 ) + { + int a = std::min((int)d[k+1], (int)d[k+2]); + if( a <= a0 ) + continue; + a = std::min(a, (int)d[k+3]); + a = std::min(a, (int)d[k+4]); + a = std::min(a, (int)d[k+5]); + a = std::min(a, (int)d[k+6]); + a0 = std::max(a0, std::min(a, (int)d[k])); + a0 = std::max(a0, std::min(a, (int)d[k+7])); + } - threshold = -b0-1; -#endif + int b0 = -a0; + for( k = 0; k < 12; k += 2 ) + { + int b = std::max((int)d[k+1], (int)d[k+2]); + b = std::max(b, (int)d[k+3]); + b = std::max(b, (int)d[k+4]); + if( b >= b0 ) + continue; + b = std::max(b, (int)d[k+5]); + b = std::max(b, (int)d[k+6]); + + b0 = std::min(b0, std::max(b, (int)d[k])); + b0 = std::min(b0, std::max(b, (int)d[k+7])); + } + threshold = -b0-1; + } #if VERIFY_CORNERS testCorner(ptr, pixel, K, N, threshold); #endif @@ -293,62 +297,64 @@ int cornerScore<12>(const uchar* ptr, const int pixel[], int threshold) template<> int cornerScore<8>(const uchar* ptr, const int pixel[], int threshold) { - const int K = 4, N = K*3 + 1; + const int K = 4, N = K * 3 + 1; int k, v = ptr[0]; short d[N]; - for( k = 0; k < N; k++ ) + for (k = 0; k < N; k++) d[k] = (short)(v - ptr[pixel[k]]); -#if CV_SSE2 - __m128i v0 = _mm_loadu_si128((__m128i*)(d+1)); - __m128i v1 = _mm_loadu_si128((__m128i*)(d+2)); - __m128i a = _mm_min_epi16(v0, v1); - __m128i b = _mm_max_epi16(v0, v1); - v0 = _mm_loadu_si128((__m128i*)(d+3)); - a = _mm_min_epi16(a, v0); - b = _mm_max_epi16(b, v0); - v0 = _mm_loadu_si128((__m128i*)(d+4)); - a = _mm_min_epi16(a, v0); - b = _mm_max_epi16(b, v0); - v0 = _mm_loadu_si128((__m128i*)(d)); - __m128i q0 = _mm_min_epi16(a, v0); - __m128i q1 = _mm_max_epi16(b, v0); - v0 = _mm_loadu_si128((__m128i*)(d+5)); - q0 = _mm_max_epi16(q0, _mm_min_epi16(a, v0)); - q1 = _mm_min_epi16(q1, _mm_max_epi16(b, v0)); - q0 = _mm_max_epi16(q0, _mm_sub_epi16(_mm_setzero_si128(), q1)); - q0 = _mm_max_epi16(q0, _mm_unpackhi_epi64(q0, q0)); - q0 = _mm_max_epi16(q0, _mm_srli_si128(q0, 4)); - q0 = _mm_max_epi16(q0, _mm_srli_si128(q0, 2)); - threshold = (short)_mm_cvtsi128_si32(q0) - 1; -#else - int a0 = threshold; - for( k = 0; k < 8; k += 2 ) +#if CV_SIMD128 + if (hasSIMD128()) { - int a = std::min((int)d[k+1], (int)d[k+2]); - if( a <= a0 ) - continue; - a = std::min(a, (int)d[k+3]); - a = std::min(a, (int)d[k+4]); - a0 = std::max(a0, std::min(a, (int)d[k])); - a0 = std::max(a0, std::min(a, (int)d[k+5])); + v_int16x8 v0 = v_load(d + 1); + v_int16x8 v1 = v_load(d + 2); + v_int16x8 a = v_min(v0, v1); + v_int16x8 b = v_max(v0, v1); + v0 = v_load(d + 3); + a = v_min(a, v0); + b = v_max(b, v0); + v0 = v_load(d + 4); + a = v_min(a, v0); + b = v_max(b, v0); + v0 = v_load(d); + v_int16x8 q0 = v_min(a, v0); + v_int16x8 q1 = v_max(b, v0); + v0 = v_load(d + 5); + q0 = v_max(q0, v_min(a, v0)); + q1 = v_min(q1, v_max(b, v0)); + q0 = v_max(q0, v_setzero_s16() - q1); + threshold = v_reduce_max(q0) - 1; } - - int b0 = -a0; - for( k = 0; k < 8; k += 2 ) + else +#endif { - int b = std::max((int)d[k+1], (int)d[k+2]); - b = std::max(b, (int)d[k+3]); - if( b >= b0 ) - continue; - b = std::max(b, (int)d[k+4]); - - b0 = std::min(b0, std::max(b, (int)d[k])); - b0 = std::min(b0, std::max(b, (int)d[k+5])); - } + int a0 = threshold; + for( k = 0; k < 8; k += 2 ) + { + int a = std::min((int)d[k+1], (int)d[k+2]); + if( a <= a0 ) + continue; + a = std::min(a, (int)d[k+3]); + a = std::min(a, (int)d[k+4]); + a0 = std::max(a0, std::min(a, (int)d[k])); + a0 = std::max(a0, std::min(a, (int)d[k+5])); + } - threshold = -b0-1; -#endif + int b0 = -a0; + for( k = 0; k < 8; k += 2 ) + { + int b = std::max((int)d[k+1], (int)d[k+2]); + b = std::max(b, (int)d[k+3]); + if( b >= b0 ) + continue; + b = std::max(b, (int)d[k+4]); + + b0 = std::min(b0, std::max(b, (int)d[k])); + b0 = std::min(b0, std::max(b, (int)d[k+5])); + } + + threshold = -b0-1; + } #if VERIFY_CORNERS testCorner(ptr, pixel, K, N, threshold);