From 9557b9f70f18785d44e3f21155f4f9f8859e7c0f Mon Sep 17 00:00:00 2001 From: Developer-Ecosystem-Engineering <65677710+Developer-Ecosystem-Engineering@users.noreply.github.com> Date: Thu, 17 Jun 2021 10:14:48 -0700 Subject: [PATCH] Improve SIFT for arm64/Apple silicon - Reduce branch density by collapsing compares. - Fix windows build errors - Use OpenCV universal intrinsics - Use v_check_any and v_signmask as requested --- modules/features2d/src/sift.simd.hpp | 195 ++++++++++++++++++++++++--- 1 file changed, 174 insertions(+), 21 deletions(-) diff --git a/modules/features2d/src/sift.simd.hpp b/modules/features2d/src/sift.simd.hpp index b5033459b9..60129b1535 100644 --- a/modules/features2d/src/sift.simd.hpp +++ b/modules/features2d/src/sift.simd.hpp @@ -450,31 +450,184 @@ public: const sift_wt* currptr = img.ptr(r); const sift_wt* prevptr = prev.ptr(r); const sift_wt* nextptr = next.ptr(r); + int c = SIFT_IMG_BORDER; - for( int c = SIFT_IMG_BORDER; c < cols-SIFT_IMG_BORDER; c++) +#if CV_SIMD && !(DoG_TYPE_SHORT) + const int vecsize = v_float32::nlanes; + for( ; c <= cols-SIFT_IMG_BORDER - vecsize; c += vecsize) + { + v_float32 val = vx_load(&currptr[c]); + v_float32 _00,_01,_02; + v_float32 _10, _12; + v_float32 _20,_21,_22; + + v_float32 vmin,vmax; + + + v_float32 cond = v_abs(val) > vx_setall_f32((float)threshold); + if (!v_check_any(cond)) + { + continue; + } + + _00 = vx_load(&currptr[c-step-1]); _01 = vx_load(&currptr[c-step]); _02 = vx_load(&currptr[c-step+1]); + _10 = vx_load(&currptr[c -1]); _12 = vx_load(&currptr[c +1]); + _20 = vx_load(&currptr[c+step-1]); _21 = vx_load(&currptr[c+step]); _22 = vx_load(&currptr[c+step+1]); + + vmax = v_max(v_max(v_max(_00,_01),v_max(_02,_10)),v_max(v_max(_12,_20),v_max(_21,_22))); + vmin = v_min(v_min(v_min(_00,_01),v_min(_02,_10)),v_min(v_min(_12,_20),v_min(_21,_22))); + + v_float32 condp = cond & (val > vx_setall_f32(0)) & (val >= vmax); + v_float32 condm = cond & (val < vx_setall_f32(0)) & (val <= vmin); + + cond = condp | condm; + if (!v_check_any(cond)) + { + continue; + } + + _00 = vx_load(&prevptr[c-step-1]); _01 = vx_load(&prevptr[c-step]); _02 = vx_load(&prevptr[c-step+1]); + _10 = vx_load(&prevptr[c -1]); _12 = vx_load(&prevptr[c +1]); + _20 = vx_load(&prevptr[c+step-1]); _21 = vx_load(&prevptr[c+step]); _22 = vx_load(&prevptr[c+step+1]); + + vmax = v_max(v_max(v_max(_00,_01),v_max(_02,_10)),v_max(v_max(_12,_20),v_max(_21,_22))); + vmin = v_min(v_min(v_min(_00,_01),v_min(_02,_10)),v_min(v_min(_12,_20),v_min(_21,_22))); + + condp &= (val >= vmax); + condm &= (val <= vmin); + + cond = condp | condm; + if (!v_check_any(cond)) + { + continue; + } + + v_float32 _11p = vx_load(&prevptr[c]); + v_float32 _11n = vx_load(&nextptr[c]); + + v_float32 max_middle = v_max(_11n,_11p); + v_float32 min_middle = v_min(_11n,_11p); + + _00 = vx_load(&nextptr[c-step-1]); _01 = vx_load(&nextptr[c-step]); _02 = vx_load(&nextptr[c-step+1]); + _10 = vx_load(&nextptr[c -1]); _12 = vx_load(&nextptr[c +1]); + _20 = vx_load(&nextptr[c+step-1]); _21 = vx_load(&nextptr[c+step]); _22 = vx_load(&nextptr[c+step+1]); + + vmax = v_max(v_max(v_max(_00,_01),v_max(_02,_10)),v_max(v_max(_12,_20),v_max(_21,_22))); + vmin = v_min(v_min(v_min(_00,_01),v_min(_02,_10)),v_min(v_min(_12,_20),v_min(_21,_22))); + + condp &= (val >= v_max(vmax,max_middle)); + condm &= (val <= v_min(vmin,min_middle)); + + cond = condp | condm; + if (!v_check_any(cond)) + { + continue; + } + + int mask = v_signmask(cond); + for (int k = 0; k 0 ? j - 1 : n - 1; + int r2 = j < n-1 ? j + 1 : 0; + + if( hist[j] > hist[l] && hist[j] > hist[r2] && hist[j] >= mag_thr ) + { + float bin = j + 0.5f * (hist[l]-hist[r2]) / (hist[l] - 2*hist[j] + hist[r2]); + bin = bin < 0 ? n + bin : bin >= n ? bin - n : bin; + kpt.angle = 360.f - (float)((360.f/n) * bin); + if(std::abs(kpt.angle - 360.f) < FLT_EPSILON) + kpt.angle = 0.f; + + kpts_.push_back(kpt); + } + } + } + } + +#endif //CV_SIMD && !(DoG_TYPE_SHORT) + + // vector loop reminder, better predictibility and less branch density + for( ; c < cols-SIFT_IMG_BORDER; c++) { sift_wt val = currptr[c]; + if (std::abs(val) <= threshold) + continue; + + sift_wt _00,_01,_02; + sift_wt _10, _12; + sift_wt _20,_21,_22; + _00 = currptr[c-step-1]; _01 = currptr[c-step]; _02 = currptr[c-step+1]; + _10 = currptr[c -1]; _12 = currptr[c +1]; + _20 = currptr[c+step-1]; _21 = currptr[c+step]; _22 = currptr[c+step+1]; + + bool calculate = false; + if (val > 0) + { + sift_wt vmax = std::max(std::max(std::max(_00,_01),std::max(_02,_10)),std::max(std::max(_12,_20),std::max(_21,_22))); + if (val >= vmax) + { + _00 = prevptr[c-step-1]; _01 = prevptr[c-step]; _02 = prevptr[c-step+1]; + _10 = prevptr[c -1]; _12 = prevptr[c +1]; + _20 = prevptr[c+step-1]; _21 = prevptr[c+step]; _22 = prevptr[c+step+1]; + vmax = std::max(std::max(std::max(_00,_01),std::max(_02,_10)),std::max(std::max(_12,_20),std::max(_21,_22))); + if (val >= vmax) + { + _00 = nextptr[c-step-1]; _01 = nextptr[c-step]; _02 = nextptr[c-step+1]; + _10 = nextptr[c -1]; _12 = nextptr[c +1]; + _20 = nextptr[c+step-1]; _21 = nextptr[c+step]; _22 = nextptr[c+step+1]; + vmax = std::max(std::max(std::max(_00,_01),std::max(_02,_10)),std::max(std::max(_12,_20),std::max(_21,_22))); + if (val >= vmax) + { + sift_wt _11p = prevptr[c], _11n = nextptr[c]; + calculate = (val >= std::max(_11p,_11n)); + } + } + } + + } else { // val cant be zero here (first abs took care of zero), must be negative + sift_wt vmin = std::min(std::min(std::min(_00,_01),std::min(_02,_10)),std::min(std::min(_12,_20),std::min(_21,_22))); + if (val <= vmin) + { + _00 = prevptr[c-step-1]; _01 = prevptr[c-step]; _02 = prevptr[c-step+1]; + _10 = prevptr[c -1]; _12 = prevptr[c +1]; + _20 = prevptr[c+step-1]; _21 = prevptr[c+step]; _22 = prevptr[c+step+1]; + vmin = std::min(std::min(std::min(_00,_01),std::min(_02,_10)),std::min(std::min(_12,_20),std::min(_21,_22))); + if (val <= vmin) + { + _00 = nextptr[c-step-1]; _01 = nextptr[c-step]; _02 = nextptr[c-step+1]; + _10 = nextptr[c -1]; _12 = nextptr[c +1]; + _20 = nextptr[c+step-1]; _21 = nextptr[c+step]; _22 = nextptr[c+step+1]; + vmin = std::min(std::min(std::min(_00,_01),std::min(_02,_10)),std::min(std::min(_12,_20),std::min(_21,_22))); + if (val <= vmin) + { + sift_wt _11p = prevptr[c], _11n = nextptr[c]; + calculate = (val <= std::min(_11p,_11n)); + } + } + } + } - // find local extrema with pixel accuracy - if( std::abs(val) > threshold && - ((val > 0 && val >= currptr[c-1] && val >= currptr[c+1] && - val >= currptr[c-step-1] && val >= currptr[c-step] && val >= currptr[c-step+1] && - val >= currptr[c+step-1] && val >= currptr[c+step] && val >= currptr[c+step+1] && - val >= nextptr[c] && val >= nextptr[c-1] && val >= nextptr[c+1] && - val >= nextptr[c-step-1] && val >= nextptr[c-step] && val >= nextptr[c-step+1] && - val >= nextptr[c+step-1] && val >= nextptr[c+step] && val >= nextptr[c+step+1] && - val >= prevptr[c] && val >= prevptr[c-1] && val >= prevptr[c+1] && - val >= prevptr[c-step-1] && val >= prevptr[c-step] && val >= prevptr[c-step+1] && - val >= prevptr[c+step-1] && val >= prevptr[c+step] && val >= prevptr[c+step+1]) || - (val < 0 && val <= currptr[c-1] && val <= currptr[c+1] && - val <= currptr[c-step-1] && val <= currptr[c-step] && val <= currptr[c-step+1] && - val <= currptr[c+step-1] && val <= currptr[c+step] && val <= currptr[c+step+1] && - val <= nextptr[c] && val <= nextptr[c-1] && val <= nextptr[c+1] && - val <= nextptr[c-step-1] && val <= nextptr[c-step] && val <= nextptr[c-step+1] && - val <= nextptr[c+step-1] && val <= nextptr[c+step] && val <= nextptr[c+step+1] && - val <= prevptr[c] && val <= prevptr[c-1] && val <= prevptr[c+1] && - val <= prevptr[c-step-1] && val <= prevptr[c-step] && val <= prevptr[c-step+1] && - val <= prevptr[c+step-1] && val <= prevptr[c+step] && val <= prevptr[c+step+1]))) + if (calculate) { CV_TRACE_REGION("pixel_candidate");