diff --git a/modules/features2d/src/fast.avx2.cpp b/modules/features2d/src/fast.avx2.cpp new file mode 100644 index 0000000000..b319ae1c9f --- /dev/null +++ b/modules/features2d/src/fast.avx2.cpp @@ -0,0 +1,184 @@ +/* This is FAST corner detector, contributed to OpenCV by the author, Edward Rosten. + Below is the original copyright and the references */ + +/* +Copyright (c) 2006, 2008 Edward Rosten +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + + *Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + *Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + *Neither the name of the University of Cambridge nor the names of + its contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* +The references are: + * Machine learning for high-speed corner detection, + E. Rosten and T. 
Drummond, ECCV 2006 + * Faster and better: A machine learning approach to corner detection + E. Rosten, R. Porter and T. Drummond, PAMI, 2009 +*/ + +#include "precomp.hpp" +#include "fast.hpp" +#include "opencv2/core/hal/intrin.hpp" + +namespace cv +{ +namespace opt_AVX2 +{ + +class FAST_t_patternSize16_AVX2_Impl: public FAST_t_patternSize16_AVX2 +{ +public: + FAST_t_patternSize16_AVX2_Impl(int _cols, int _threshold, bool _nonmax_suppression, const int* _pixel): + cols(_cols), nonmax_suppression(_nonmax_suppression), pixel(_pixel) + { + //patternSize = 16 + t256c = (char)_threshold; + threshold = std::min(std::max(_threshold, 0), 255); + } + + virtual void process(int &j, const uchar* &ptr, uchar* curr, int* cornerpos, int &ncorners) + { + static const __m256i delta256 = _mm256_broadcastsi128_si256(_mm_set1_epi8((char)(-128))), K16_256 = _mm256_broadcastsi128_si256(_mm_set1_epi8((char)8)); + const __m256i t256 = _mm256_broadcastsi128_si256(_mm_set1_epi8(t256c)); + for (; j < cols - 32 - 3; j += 32, ptr += 32) + { + __m256i m0, m1; + __m256i v0 = _mm256_loadu_si256((const __m256i*)ptr); + + __m256i v1 = _mm256_xor_si256(_mm256_subs_epu8(v0, t256), delta256); + v0 = _mm256_xor_si256(_mm256_adds_epu8(v0, t256), delta256); + + __m256i x0 = _mm256_sub_epi8(_mm256_loadu_si256((const __m256i*)(ptr + pixel[0])), delta256); + __m256i x1 = _mm256_sub_epi8(_mm256_loadu_si256((const __m256i*)(ptr + pixel[4])), delta256); + __m256i x2 = _mm256_sub_epi8(_mm256_loadu_si256((const __m256i*)(ptr + pixel[8])), delta256); + __m256i x3 = _mm256_sub_epi8(_mm256_loadu_si256((const __m256i*)(ptr + pixel[12])), delta256); + + m0 = _mm256_and_si256(_mm256_cmpgt_epi8(x0, v0), _mm256_cmpgt_epi8(x1, v0)); + m1 = _mm256_and_si256(_mm256_cmpgt_epi8(v1, x0), _mm256_cmpgt_epi8(v1, x1)); + m0 = _mm256_or_si256(m0, _mm256_and_si256(_mm256_cmpgt_epi8(x1, v0), _mm256_cmpgt_epi8(x2, v0))); + m1 = _mm256_or_si256(m1, _mm256_and_si256(_mm256_cmpgt_epi8(v1, x1), _mm256_cmpgt_epi8(v1, x2))); + m0 = 
_mm256_or_si256(m0, _mm256_and_si256(_mm256_cmpgt_epi8(x2, v0), _mm256_cmpgt_epi8(x3, v0))); + m1 = _mm256_or_si256(m1, _mm256_and_si256(_mm256_cmpgt_epi8(v1, x2), _mm256_cmpgt_epi8(v1, x3))); + m0 = _mm256_or_si256(m0, _mm256_and_si256(_mm256_cmpgt_epi8(x3, v0), _mm256_cmpgt_epi8(x0, v0))); + m1 = _mm256_or_si256(m1, _mm256_and_si256(_mm256_cmpgt_epi8(v1, x3), _mm256_cmpgt_epi8(v1, x0))); + m0 = _mm256_or_si256(m0, m1); + + unsigned int mask = _mm256_movemask_epi8(m0); //unsigned is important! + if (mask == 0){ + continue; + } + if ((mask & 0xffff) == 0) + { + j -= 16; + ptr -= 16; + continue; + } + + __m256i c0 = _mm256_setzero_si256(), c1 = c0, max0 = c0, max1 = c0; + for (int k = 0; k < 25; k++) + { + __m256i x = _mm256_xor_si256(_mm256_loadu_si256((const __m256i*)(ptr + pixel[k])), delta256); + m0 = _mm256_cmpgt_epi8(x, v0); + m1 = _mm256_cmpgt_epi8(v1, x); + + c0 = _mm256_and_si256(_mm256_sub_epi8(c0, m0), m0); + c1 = _mm256_and_si256(_mm256_sub_epi8(c1, m1), m1); + + max0 = _mm256_max_epu8(max0, c0); + max1 = _mm256_max_epu8(max1, c1); + } + + max0 = _mm256_max_epu8(max0, max1); + unsigned int m = _mm256_movemask_epi8(_mm256_cmpgt_epi8(max0, K16_256)); + + for (int k = 0; m > 0 && k < 32; k++, m >>= 1) + if (m & 1) + { + cornerpos[ncorners++] = j + k; + if (nonmax_suppression) + { + short d[25]; + for (int q = 0; q < 25; q++) + d[q] = (short)(ptr[k] - ptr[k + pixel[q]]); + v_int16x8 q0 = v_setall_s16(-1000), q1 = v_setall_s16(1000); + for (int q = 0; q < 16; q += 8) + { + v_int16x8 v0_ = v_load(d + q + 1); + v_int16x8 v1_ = v_load(d + q + 2); + v_int16x8 a = v_min(v0_, v1_); + v_int16x8 b = v_max(v0_, v1_); + v0_ = v_load(d + q + 3); + a = v_min(a, v0_); + b = v_max(b, v0_); + v0_ = v_load(d + q + 4); + a = v_min(a, v0_); + b = v_max(b, v0_); + v0_ = v_load(d + q + 5); + a = v_min(a, v0_); + b = v_max(b, v0_); + v0_ = v_load(d + q + 6); + a = v_min(a, v0_); + b = v_max(b, v0_); + v0_ = v_load(d + q + 7); + a = v_min(a, v0_); + b = v_max(b, v0_); + v0_ = 
v_load(d + q + 8); + a = v_min(a, v0_); + b = v_max(b, v0_); + v0_ = v_load(d + q); + q0 = v_max(q0, v_min(a, v0_)); + q1 = v_min(q1, v_max(b, v0_)); + v0_ = v_load(d + q + 9); + q0 = v_max(q0, v_min(a, v0_)); + q1 = v_min(q1, v_max(b, v0_)); + } + q0 = v_max(q0, v_setzero_s16() - q1); + curr[j + k] = (uchar)(v_reduce_max(q0) - 1); + } + } + } + _mm256_zeroupper(); + } + + virtual ~FAST_t_patternSize16_AVX2_Impl() {}; + +private: + int cols; + char t256c; + int threshold; + bool nonmax_suppression; + const int* pixel; +}; + +Ptr FAST_t_patternSize16_AVX2::getImpl(int _cols, int _threshold, bool _nonmax_suppression, const int* _pixel) +{ + return Ptr(new FAST_t_patternSize16_AVX2_Impl(_cols, _threshold, _nonmax_suppression, _pixel)); +} + +} +} diff --git a/modules/features2d/src/fast.cpp b/modules/features2d/src/fast.cpp index e939f065b1..8607f34a2e 100644 --- a/modules/features2d/src/fast.cpp +++ b/modules/features2d/src/fast.cpp @@ -42,6 +42,7 @@ The references are: */ #include "precomp.hpp" +#include "fast.hpp" #include "fast_score.hpp" #include "opencl_kernels_features2d.hpp" #include "opencv2/core/hal/intrin.hpp" @@ -59,13 +60,20 @@ void FAST_t(InputArray _img, std::vector& keypoints, int threshold, bo { Mat img = _img.getMat(); const int K = patternSize/2, N = patternSize + K + 1; + int i, j, k, pixel[25]; + makeOffsets(pixel, (int)img.step, patternSize); + #if CV_SIMD128 const int quarterPatternSize = patternSize/4; v_uint8x16 delta = v_setall_u8(0x80), t = v_setall_u8((char)threshold), K16 = v_setall_u8((char)K); bool hasSimd = hasSIMD128(); +#if CV_TRY_AVX2 + Ptr fast_t_impl_avx2; + if(CV_CPU_HAS_SUPPORT_AVX2) + fast_t_impl_avx2 = opt_AVX2::FAST_t_patternSize16_AVX2::getImpl(img.cols, threshold, nonmax_suppression, pixel); +#endif + #endif - int i, j, k, pixel[25]; - makeOffsets(pixel, (int)img.step, patternSize); keypoints.clear(); @@ -100,65 +108,72 @@ void FAST_t(InputArray _img, std::vector& keypoints, int threshold, bo { if( patternSize == 16 ) { - 
for(; j < img.cols - 16 - 3; j += 16, ptr += 16) +#if CV_TRY_AVX2 + if (fast_t_impl_avx2) + fast_t_impl_avx2->process(j, ptr, curr, cornerpos, ncorners); +#endif + //vz if (j <= (img.cols - 27)) //it doesn't make sense using vectors for less than 8 elements { - v_uint8x16 v = v_load(ptr); - v_int8x16 v0 = v_reinterpret_as_s8((v + t) ^ delta); - v_int8x16 v1 = v_reinterpret_as_s8((v - t) ^ delta); - - v_int8x16 x0 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[0]), delta)); - v_int8x16 x1 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[quarterPatternSize]), delta)); - v_int8x16 x2 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[2*quarterPatternSize]), delta)); - v_int8x16 x3 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[3*quarterPatternSize]), delta)); - - v_int8x16 m0, m1; - m0 = (v0 < x0) & (v0 < x1); - m1 = (x0 < v1) & (x1 < v1); - m0 = m0 | ((v0 < x1) & (v0 < x2)); - m1 = m1 | ((x1 < v1) & (x2 < v1)); - m0 = m0 | ((v0 < x2) & (v0 < x3)); - m1 = m1 | ((x2 < v1) & (x3 < v1)); - m0 = m0 | ((v0 < x3) & (v0 < x0)); - m1 = m1 | ((x3 < v1) & (x0 < v1)); - m0 = m0 | m1; - - int mask = v_signmask(m0); - if( mask == 0 ) - continue; - if( (mask & 255) == 0 ) + for (; j < img.cols - 16 - 3; j += 16, ptr += 16) { - j -= 8; - ptr -= 8; - continue; - } + v_uint8x16 v = v_load(ptr); + v_int8x16 v0 = v_reinterpret_as_s8((v + t) ^ delta); + v_int8x16 v1 = v_reinterpret_as_s8((v - t) ^ delta); + + v_int8x16 x0 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[0]), delta)); + v_int8x16 x1 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[quarterPatternSize]), delta)); + v_int8x16 x2 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[2*quarterPatternSize]), delta)); + v_int8x16 x3 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[3*quarterPatternSize]), delta)); + + v_int8x16 m0, m1; + m0 = (v0 < x0) & (v0 < x1); + m1 = (x0 < v1) & (x1 < v1); + m0 = m0 | ((v0 < x1) & (v0 < x2)); + m1 = m1 | ((x1 < v1) & (x2 < v1)); + m0 = m0 | ((v0 < x2) & (v0 < x3)); + m1 
= m1 | ((x2 < v1) & (x3 < v1)); + m0 = m0 | ((v0 < x3) & (v0 < x0)); + m1 = m1 | ((x3 < v1) & (x0 < v1)); + m0 = m0 | m1; + + int mask = v_signmask(m0); + if( mask == 0 ) + continue; + if( (mask & 255) == 0 ) + { + j -= 8; + ptr -= 8; + continue; + } - v_int8x16 c0 = v_setzero_s8(); - v_int8x16 c1 = v_setzero_s8(); - v_uint8x16 max0 = v_setzero_u8(); - v_uint8x16 max1 = v_setzero_u8(); - for( k = 0; k < N; k++ ) - { - v_int8x16 x = v_reinterpret_as_s8(v_load((ptr + pixel[k])) ^ delta); - m0 = v0 < x; - m1 = x < v1; + v_int8x16 c0 = v_setzero_s8(); + v_int8x16 c1 = v_setzero_s8(); + v_uint8x16 max0 = v_setzero_u8(); + v_uint8x16 max1 = v_setzero_u8(); + for( k = 0; k < N; k++ ) + { + v_int8x16 x = v_reinterpret_as_s8(v_load((ptr + pixel[k])) ^ delta); + m0 = v0 < x; + m1 = x < v1; - c0 = v_sub_wrap(c0, m0) & m0; - c1 = v_sub_wrap(c1, m1) & m1; + c0 = v_sub_wrap(c0, m0) & m0; + c1 = v_sub_wrap(c1, m1) & m1; - max0 = v_max(max0, v_reinterpret_as_u8(c0)); - max1 = v_max(max1, v_reinterpret_as_u8(c1)); - } + max0 = v_max(max0, v_reinterpret_as_u8(c0)); + max1 = v_max(max1, v_reinterpret_as_u8(c1)); + } - max0 = v_max(max0, max1); - int m = v_signmask(K16 < max0); + max0 = v_max(max0, max1); + int m = v_signmask(K16 < max0); - for( k = 0; m > 0 && k < 16; k++, m >>= 1 ) - { - if(m & 1) + for( k = 0; m > 0 && k < 16; k++, m >>= 1 ) { - cornerpos[ncorners++] = j+k; - if(nonmax_suppression) - curr[j+k] = (uchar)cornerScore(ptr+k, pixel, threshold); + if(m & 1) + { + cornerpos[ncorners++] = j+k; + if(nonmax_suppression) + curr[j+k] = (uchar)cornerScore(ptr+k, pixel, threshold); + } } } } diff --git a/modules/features2d/src/fast.hpp b/modules/features2d/src/fast.hpp new file mode 100644 index 0000000000..6f750fea8d --- /dev/null +++ b/modules/features2d/src/fast.hpp @@ -0,0 +1,62 @@ +/* This is FAST corner detector, contributed to OpenCV by the author, Edward Rosten. 
+ Below is the original copyright and the references */ + +/* +Copyright (c) 2006, 2008 Edward Rosten +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + + *Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + *Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + *Neither the name of the University of Cambridge nor the names of + its contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* +The references are: + * Machine learning for high-speed corner detection, + E. Rosten and T. Drummond, ECCV 2006 + * Faster and better: A machine learning approach to corner detection + E. Rosten, R. Porter and T. 
Drummond, PAMI, 2009
*/

#ifndef OPENCV_FEATURES2D_FAST_HPP
#define OPENCV_FEATURES2D_FAST_HPP

namespace cv
{
namespace opt_AVX2
{
#if CV_TRY_AVX2
/**
 * Interface to the AVX2 row scanner of the FAST detector for patternSize == 16.
 * Only declared when the build enables the AVX2 dispatch path (CV_TRY_AVX2).
 */
class FAST_t_patternSize16_AVX2
{
public:
    /**
     * Creates the implementation object.
     *
     * @param _cols                image width in pixels
     * @param _threshold           FAST intensity threshold
     * @param _nonmax_suppression  whether corner scores are written for detected corners
     * @param _pixel               circle-pixel offsets relative to the centre pixel (not owned)
     */
    static Ptr<FAST_t_patternSize16_AVX2> getImpl(int _cols, int _threshold, bool _nonmax_suppression, const int* _pixel);

    /**
     * Processes part of one image row.
     *
     * @param j          in/out column index, advanced past the columns that were handled
     * @param ptr        in/out pointer to the pixel at column j, advanced together with j
     * @param curr       score row, written at corner columns when non-max suppression is on
     * @param cornerpos  output list of corner columns
     * @param ncorners   in/out number of entries stored in cornerpos
     */
    virtual void process(int &j, const uchar* &ptr, uchar* curr, int* cornerpos, int &ncorners) = 0;

    virtual ~FAST_t_patternSize16_AVX2() {}
};
#endif
}
}
#endif