@@ -42,6 +42,7 @@
 
 #include "precomp.hpp"
 #include "opencl_kernels_imgproc.hpp"
+#include "opencv2/core/hal/intrin.hpp"
 #include <queue>
 
 #ifdef _MSC_VER
@@ -299,8 +300,8 @@ public:
 
     void operator()(const Range &boundaries) const
     {
-#if CV_SSE2
-        bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
+#if CV_SIMD128
+        bool haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON);
 #endif
 
         const int type = src.type(), cn = CV_MAT_CN(type);
@@ -409,39 +410,28 @@ public:
             if (!L2gradient)
             {
                 int j = 0, width = src.cols * cn;
-#if CV_SSE2
-                if (haveSSE2)
+#if CV_SIMD128
+                if (haveSIMD)
                 {
-                    __m128i v_zero = _mm_setzero_si128();
                     for ( ; j <= width - 8; j += 8)
                     {
-                        __m128i v_dx = _mm_loadu_si128((const __m128i *)(_dx + j));
-                        __m128i v_dy = _mm_loadu_si128((const __m128i *)(_dy + j));
+                        v_int16x8 v_dx = v_load((const short *)(_dx + j));
+                        v_int16x8 v_dy = v_load((const short *)(_dy + j));
 
-                        __m128i v_dx_abs = _mm_max_epi16(v_dx, _mm_sub_epi16(v_zero, v_dx));
-                        __m128i v_dy_abs = _mm_max_epi16(v_dy, _mm_sub_epi16(v_zero, v_dy));
+                        v_dx = v_reinterpret_as_s16(v_abs(v_dx));
+                        v_dy = v_reinterpret_as_s16(v_abs(v_dy));
 
-                        __m128i v_dx_ml = _mm_unpacklo_epi16(v_dx_abs, v_zero);
-                        __m128i v_dy_ml = _mm_unpacklo_epi16(v_dy_abs, v_zero);
-                        __m128i v_dx_mh = _mm_unpackhi_epi16(v_dx_abs, v_zero);
-                        __m128i v_dy_mh = _mm_unpackhi_epi16(v_dy_abs, v_zero);
+                        v_int32x4 v_dx_ml;
+                        v_int32x4 v_dy_ml;
+                        v_int32x4 v_dx_mh;
+                        v_int32x4 v_dy_mh;
+                        v_expand(v_dx, v_dx_ml, v_dx_mh);
+                        v_expand(v_dy, v_dy_ml, v_dy_mh);
 
-                        __m128i v_norm_ml = _mm_add_epi32(v_dx_ml, v_dy_ml);
-                        __m128i v_norm_mh = _mm_add_epi32(v_dx_mh, v_dy_mh);
-
-                        _mm_storeu_si128((__m128i *)(_norm + j), v_norm_ml);
-                        _mm_storeu_si128((__m128i *)(_norm + j + 4), v_norm_mh);
+                        v_store((int *)(_norm + j), v_dx_ml + v_dy_ml);
+                        v_store((int *)(_norm + j + 4), v_dx_mh + v_dy_mh);
                     }
                 }
-#elif CV_NEON
-                for ( ; j <= width - 8; j += 8)
-                {
-                    int16x8_t v_dx = vld1q_s16(_dx + j), v_dy = vld1q_s16(_dy + j);
-                    vst1q_s32(_norm + j, vaddq_s32(vabsq_s32(vmovl_s16(vget_low_s16(v_dx))),
-                                                   vabsq_s32(vmovl_s16(vget_low_s16(v_dy)))));
-                    vst1q_s32(_norm + j + 4, vaddq_s32(vabsq_s32(vmovl_s16(vget_high_s16(v_dx))),
-                                                       vabsq_s32(vmovl_s16(vget_high_s16(v_dy)))));
-                }
 #endif
                 for ( ; j < width; ++j)
                     _norm[j] = std::abs(int(_dx[j])) + std::abs(int(_dy[j]));
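
// Illustrative sketch only (not part of the patch): the L1 gradient magnitude
// pattern |dx| + |dy| from the hunk above, written as a self-contained helper
// with OpenCV's universal intrinsics. The function name l1_magnitude and the
// raw-pointer interface are assumptions made for this example.
#include <cstdlib>
#include "opencv2/core.hpp"
#include "opencv2/core/hal/intrin.hpp"

static void l1_magnitude(const short* dx, const short* dy, int* norm, int width)
{
    int j = 0;
#if CV_SIMD128
    if (cv::checkHardwareSupport(CV_CPU_SSE2) || cv::checkHardwareSupport(CV_CPU_NEON))
    {
        for ( ; j <= width - 8; j += 8)
        {
            // load 8 signed 16-bit gradients and take their absolute values
            cv::v_int16x8 v_dx = cv::v_reinterpret_as_s16(cv::v_abs(cv::v_load(dx + j)));
            cv::v_int16x8 v_dy = cv::v_reinterpret_as_s16(cv::v_abs(cv::v_load(dy + j)));

            // widen to 32 bits before adding, as the patch does
            cv::v_int32x4 v_dx_lo, v_dx_hi, v_dy_lo, v_dy_hi;
            cv::v_expand(v_dx, v_dx_lo, v_dx_hi);
            cv::v_expand(v_dy, v_dy_lo, v_dy_hi);

            cv::v_store(norm + j,     v_dx_lo + v_dy_lo);
            cv::v_store(norm + j + 4, v_dx_hi + v_dy_hi);
        }
    }
#endif
    for ( ; j < width; ++j)   // scalar tail
        norm[j] = std::abs(int(dx[j])) + std::abs(int(dy[j]));
}
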
@@ -449,36 +439,23 @@ public:
             else
             {
                 int j = 0, width = src.cols * cn;
-#if CV_SSE2
-                if (haveSSE2)
+#if CV_SIMD128
+                if (haveSIMD)
                 {
                     for ( ; j <= width - 8; j += 8)
                     {
-                        __m128i v_dx = _mm_loadu_si128((const __m128i *)(_dx + j));
-                        __m128i v_dy = _mm_loadu_si128((const __m128i *)(_dy + j));
-
-                        __m128i v_dx_dy_ml = _mm_unpacklo_epi16(v_dx, v_dy);
-                        __m128i v_dx_dy_mh = _mm_unpackhi_epi16(v_dx, v_dy);
+                        v_int16x8 v_dx = v_load((const short*)(_dx + j));
+                        v_int16x8 v_dy = v_load((const short*)(_dy + j));
 
-                        __m128i v_norm_ml = _mm_madd_epi16(v_dx_dy_ml, v_dx_dy_ml);
-                        __m128i v_norm_mh = _mm_madd_epi16(v_dx_dy_mh, v_dx_dy_mh);
+                        v_int32x4 v_dxp_low, v_dxp_high;
+                        v_int32x4 v_dyp_low, v_dyp_high;
+                        v_expand(v_dx, v_dxp_low, v_dxp_high);
+                        v_expand(v_dy, v_dyp_low, v_dyp_high);
 
-                        _mm_storeu_si128((__m128i *)(_norm + j), v_norm_ml);
-                        _mm_storeu_si128((__m128i *)(_norm + j + 4), v_norm_mh);
+                        v_store((int *)(_norm + j), v_dxp_low*v_dxp_low+v_dyp_low*v_dyp_low);
+                        v_store((int *)(_norm + j + 4), v_dxp_high*v_dxp_high+v_dyp_high*v_dyp_high);
                     }
                 }
-#elif CV_NEON
-                for ( ; j <= width - 8; j += 8)
-                {
-                    int16x8_t v_dx = vld1q_s16(_dx + j), v_dy = vld1q_s16(_dy + j);
-                    int16x4_t v_dxp = vget_low_s16(v_dx), v_dyp = vget_low_s16(v_dy);
-                    int32x4_t v_dst = vmlal_s16(vmull_s16(v_dxp, v_dxp), v_dyp, v_dyp);
-                    vst1q_s32(_norm + j, v_dst);
-
-                    v_dxp = vget_high_s16(v_dx), v_dyp = vget_high_s16(v_dy);
-                    v_dst = vmlal_s16(vmull_s16(v_dxp, v_dxp), v_dyp, v_dyp);
-                    vst1q_s32(_norm + j + 4, v_dst);
-                }
 #endif
                 for ( ; j < width; ++j)
                     _norm[j] = int(_dx[j])*_dx[j] + int(_dy[j])*_dy[j];
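
// Illustrative sketch only (not part of the patch): the squared gradient
// magnitude dx*dx + dy*dy using the widen-then-multiply idiom from the hunk
// above. The helper name l2_magnitude is an assumption; the patch additionally
// guards the vector path with a runtime checkHardwareSupport() test.
#include "opencv2/core.hpp"
#include "opencv2/core/hal/intrin.hpp"

static void l2_magnitude(const short* dx, const short* dy, int* norm, int width)
{
    int j = 0;
#if CV_SIMD128
    for ( ; j <= width - 8; j += 8)
    {
        cv::v_int16x8 v_dx = cv::v_load(dx + j);
        cv::v_int16x8 v_dy = cv::v_load(dy + j);

        // widen each 16-bit lane to 32 bits, then square and add in 32 bits
        cv::v_int32x4 dx_lo, dx_hi, dy_lo, dy_hi;
        cv::v_expand(v_dx, dx_lo, dx_hi);
        cv::v_expand(v_dy, dy_lo, dy_hi);

        cv::v_store(norm + j,     dx_lo * dx_lo + dy_lo * dy_lo);
        cv::v_store(norm + j + 4, dx_hi * dx_hi + dy_hi * dy_hi);
    }
#endif
    for ( ; j < width; ++j)   // scalar tail
        norm[j] = int(dx[j]) * dx[j] + int(dy[j]) * dy[j];
}
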
@@ -529,30 +506,31 @@ public:
         const int TG22 = (int)(0.4142135623730950488016887242097*(1 << CANNY_SHIFT) + 0.5);
 
         int prev_flag = 0, j = 0;
-#if CV_SSE2
-        if (checkHardwareSupport(CPU_SSE2))
+#if CV_SIMD128
+        if (haveSIMD)
        {
-            __m128i v_low = _mm_set1_epi32(low), v_one = _mm_set1_epi8(1);
+            v_int32x4 v_low = v_setall_s32(low);
+            v_int8x16 v_one = v_setall_s8(1);
 
            for (; j <= src.cols - 16; j += 16)
            {
-                __m128i v_m1 = _mm_loadu_si128((const __m128i*)(_mag + j));
-                __m128i v_m2 = _mm_loadu_si128((const __m128i*)(_mag + j + 4));
-                __m128i v_m3 = _mm_loadu_si128((const __m128i*)(_mag + j + 8));
-                __m128i v_m4 = _mm_loadu_si128((const __m128i*)(_mag + j + 12));
+                v_int32x4 v_m1 = v_load((const int*)(_mag + j));
+                v_int32x4 v_m2 = v_load((const int*)(_mag + j + 4));
+                v_int32x4 v_m3 = v_load((const int*)(_mag + j + 8));
+                v_int32x4 v_m4 = v_load((const int*)(_mag + j + 12));
 
-                _mm_storeu_si128((__m128i*)(_map + j), v_one);
+                v_store((signed char*)(_map + j), v_one);
 
-                __m128i v_cmp1 = _mm_cmpgt_epi32(v_m1, v_low);
-                __m128i v_cmp2 = _mm_cmpgt_epi32(v_m2, v_low);
-                __m128i v_cmp3 = _mm_cmpgt_epi32(v_m3, v_low);
-                __m128i v_cmp4 = _mm_cmpgt_epi32(v_m4, v_low);
+                v_int32x4 v_cmp1 = v_m1 > v_low;
+                v_int32x4 v_cmp2 = v_m2 > v_low;
+                v_int32x4 v_cmp3 = v_m3 > v_low;
+                v_int32x4 v_cmp4 = v_m4 > v_low;
 
-                v_cmp1 = _mm_packs_epi32(v_cmp1, v_cmp2);
-                v_cmp2 = _mm_packs_epi32(v_cmp3, v_cmp4);
+                v_int16x8 v_cmp80 = v_pack(v_cmp1, v_cmp2);
+                v_int16x8 v_cmp81 = v_pack(v_cmp3, v_cmp4);
 
-                v_cmp1 = _mm_packs_epi16(v_cmp1, v_cmp2);
-                unsigned int mask = _mm_movemask_epi8(v_cmp1);
+                v_int8x16 v_cmp = v_pack(v_cmp80, v_cmp81);
+                unsigned int mask = v_signmask(v_cmp);
 
                 if (mask)
                 {
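
// Illustrative sketch only (not part of the patch): the low-threshold test
// from the hunk above reduced to a standalone helper (hypothetical name).
// 16 int32 magnitudes are compared against `low`, the four lane masks are
// narrowed to one byte vector, and v_signmask yields a 16-bit scan mask,
// one bit per pixel, that the patch then walks with if (mask).
#include "opencv2/core.hpp"
#include "opencv2/core/hal/intrin.hpp"

static unsigned low_threshold_mask16(const int* mag, int low)
{
#if CV_SIMD128
    cv::v_int32x4 v_low = cv::v_setall_s32(low);

    cv::v_int32x4 v_cmp1 = cv::v_load(mag)      > v_low;
    cv::v_int32x4 v_cmp2 = cv::v_load(mag + 4)  > v_low;
    cv::v_int32x4 v_cmp3 = cv::v_load(mag + 8)  > v_low;
    cv::v_int32x4 v_cmp4 = cv::v_load(mag + 12) > v_low;

    // 4x int32 masks -> 2x int16 -> 1x int8 vector, then one bit per lane
    cv::v_int16x8 v_cmp80 = cv::v_pack(v_cmp1, v_cmp2);
    cv::v_int16x8 v_cmp81 = cv::v_pack(v_cmp3, v_cmp4);
    return (unsigned)cv::v_signmask(cv::v_pack(v_cmp80, v_cmp81));
#else
    unsigned mask = 0;
    for (int k = 0; k < 16; ++k)
        mask |= (unsigned)(mag[k] > low) << k;
    return mask;
#endif
}
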
@@ -730,54 +708,57 @@ public:
         const uchar* pmap = map + mapstep + 1 + (ptrdiff_t)(mapstep * boundaries.start);
         uchar* pdst = dst.ptr() + (ptrdiff_t)(dst.step * boundaries.start);
 
-#if CV_SSE2
-        bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
+#if CV_SIMD128
+        bool haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON);
 #endif
 
         for (int i = boundaries.start; i < boundaries.end; i++, pmap += mapstep, pdst += dst.step)
         {
             int j = 0;
-#if CV_SSE2
-            if(haveSSE2) {
-                const __m128i v_zero = _mm_setzero_si128();
+#if CV_SIMD128
+            if(haveSIMD) {
+                const v_int8x16 v_zero = v_setzero_s8();
 
                 for(; j <= dst.cols - 32; j += 32) {
-                    __m128i v_pmap1 = _mm_loadu_si128((const __m128i*)(pmap + j));
-                    __m128i v_pmap2 = _mm_loadu_si128((const __m128i*)(pmap + j + 16));
+                    v_uint8x16 v_pmap1 = v_load((const unsigned char*)(pmap + j));
+                    v_uint8x16 v_pmap2 = v_load((const unsigned char*)(pmap + j + 16));
 
-                    __m128i v_pmaplo1 = _mm_unpacklo_epi8(v_pmap1, v_zero);
-                    __m128i v_pmaphi1 = _mm_unpackhi_epi8(v_pmap1, v_zero);
-                    __m128i v_pmaplo2 = _mm_unpacklo_epi8(v_pmap2, v_zero);
-                    __m128i v_pmaphi2 = _mm_unpackhi_epi8(v_pmap2, v_zero);
+                    v_uint16x8 v_pmaplo1;
+                    v_uint16x8 v_pmaphi1;
+                    v_uint16x8 v_pmaplo2;
+                    v_uint16x8 v_pmaphi2;
+                    v_expand(v_pmap1, v_pmaplo1, v_pmaphi1);
+                    v_expand(v_pmap2, v_pmaplo2, v_pmaphi2);
 
-                    v_pmaplo1 = _mm_srli_epi16(v_pmaplo1, 1);
-                    v_pmaphi1 = _mm_srli_epi16(v_pmaphi1, 1);
-                    v_pmaplo2 = _mm_srli_epi16(v_pmaplo2, 1);
-                    v_pmaphi2 = _mm_srli_epi16(v_pmaphi2, 1);
+                    v_pmaplo1 = v_pmaplo1 >> 1;
+                    v_pmaphi1 = v_pmaphi1 >> 1;
+                    v_pmaplo2 = v_pmaplo2 >> 1;
+                    v_pmaphi2 = v_pmaphi2 >> 1;
 
-                    v_pmap1 = _mm_packus_epi16(v_pmaplo1, v_pmaphi1);
-                    v_pmap2 = _mm_packus_epi16(v_pmaplo2, v_pmaphi2);
+                    v_pmap1 = v_pack(v_pmaplo1, v_pmaphi1);
+                    v_pmap2 = v_pack(v_pmaplo2, v_pmaphi2);
 
-                    v_pmap1 = _mm_sub_epi8(v_zero, v_pmap1);
-                    v_pmap2 = _mm_sub_epi8(v_zero, v_pmap2);
+                    v_pmap1 = v_reinterpret_as_u8(v_zero - v_reinterpret_as_s8(v_pmap1));
+                    v_pmap2 = v_reinterpret_as_u8(v_zero - v_reinterpret_as_s8(v_pmap2));
 
-                    _mm_storeu_si128((__m128i*)(pdst + j), v_pmap1);
-                    _mm_storeu_si128((__m128i*)(pdst + j + 16), v_pmap2);
+                    v_store((pdst + j), v_pmap1);
+                    v_store((pdst + j + 16), v_pmap2);
                 }
 
                 for(; j <= dst.cols - 16; j += 16) {
-                    __m128i v_pmap = _mm_loadu_si128((const __m128i*)(pmap + j));
+                    v_uint8x16 v_pmap = v_load((const unsigned char*)(pmap + j));
 
-                    __m128i v_pmaplo = _mm_unpacklo_epi8(v_pmap, v_zero);
-                    __m128i v_pmaphi = _mm_unpackhi_epi8(v_pmap, v_zero);
+                    v_uint16x8 v_pmaplo;
+                    v_uint16x8 v_pmaphi;
+                    v_expand(v_pmap, v_pmaplo, v_pmaphi);
 
-                    v_pmaplo = _mm_srli_epi16(v_pmaplo, 1);
-                    v_pmaphi = _mm_srli_epi16(v_pmaphi, 1);
+                    v_pmaplo = v_pmaplo >> 1;
+                    v_pmaphi = v_pmaphi >> 1;
 
-                    v_pmap = _mm_packus_epi16(v_pmaplo, v_pmaphi);
-                    v_pmap = _mm_sub_epi8(v_zero, v_pmap);
+                    v_pmap = v_pack(v_pmaplo, v_pmaphi);
+                    v_pmap = v_reinterpret_as_u8(v_zero - v_reinterpret_as_s8(v_pmap));
 
-                    _mm_storeu_si128((__m128i*)(pdst + j), v_pmap);
+                    v_store((pdst + j), v_pmap);
                 }
             }
 #endif
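
// Illustrative sketch only (not part of the patch): the final pass above turns
// map values {0,1,2} into output pixels {0,0,255}. Shifting each byte right by
// one leaves 1 only for a "strong edge" (2), and negating in 8-bit arithmetic
// turns that 1 into 255. The helper name map_to_edges is an assumption.
#include "opencv2/core.hpp"
#include "opencv2/core/hal/intrin.hpp"

static void map_to_edges(const uchar* pmap, uchar* pdst, int cols)
{
    int j = 0;
#if CV_SIMD128
    const cv::v_int8x16 v_zero = cv::v_setzero_s8();
    for ( ; j <= cols - 16; j += 16)
    {
        cv::v_uint8x16 v_pmap = cv::v_load(pmap + j);

        // halve each byte via 16-bit lanes: 0,1 -> 0 and 2 -> 1
        cv::v_uint16x8 v_lo, v_hi;
        cv::v_expand(v_pmap, v_lo, v_hi);
        v_pmap = cv::v_pack(v_lo >> 1, v_hi >> 1);

        // 0 - 1 wraps to 255 in unsigned 8-bit arithmetic
        v_pmap = cv::v_reinterpret_as_u8(v_zero - cv::v_reinterpret_as_s8(v_pmap));
        cv::v_store(pdst + j, v_pmap);
    }
#endif
    for ( ; j < cols; ++j)   // scalar tail
        pdst[j] = (uchar)-(pmap[j] >> 1);
}
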
@@ -980,8 +961,8 @@ static void CannyImpl(Mat& dx, Mat& dy, Mat& dst,
 #define CANNY_PUSH(d) *(d) = uchar(2), *stack_top++ = (d)
 #define CANNY_POP(d) (d) = *--stack_top
 
-#if CV_SSE2
-    bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
+#if CV_SIMD128
+    bool haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON);
 #endif
 
     // calculate magnitude and angle of gradient, perform non-maxima suppression.
@@ -1000,32 +981,26 @@ static void CannyImpl(Mat& dx, Mat& dy, Mat& dst,
     if (!L2gradient)
     {
         int j = 0, width = cols * cn;
-#if CV_SSE2
-        if (haveSSE2)
+#if CV_SIMD128
+        if (haveSIMD)
         {
-            __m128i v_zero = _mm_setzero_si128();
             for ( ; j <= width - 8; j += 8)
             {
-                __m128i v_dx = _mm_loadu_si128((const __m128i *)(_dx + j));
-                __m128i v_dy = _mm_loadu_si128((const __m128i *)(_dy + j));
-                v_dx = _mm_max_epi16(v_dx, _mm_sub_epi16(v_zero, v_dx));
-                v_dy = _mm_max_epi16(v_dy, _mm_sub_epi16(v_zero, v_dy));
+                v_int16x8 v_dx = v_load((const short*)(_dx + j));
+                v_int16x8 v_dy = v_load((const short*)(_dy + j));
 
-                __m128i v_norm = _mm_add_epi32(_mm_unpacklo_epi16(v_dx, v_zero), _mm_unpacklo_epi16(v_dy, v_zero));
-                _mm_storeu_si128((__m128i *)(_norm + j), v_norm);
+                v_int32x4 v_dx0, v_dx1, v_dy0, v_dy1;
+                v_expand(v_dx, v_dx0, v_dx1);
+                v_expand(v_dy, v_dy0, v_dy1);
 
-                v_norm = _mm_add_epi32(_mm_unpackhi_epi16(v_dx, v_zero), _mm_unpackhi_epi16(v_dy, v_zero));
-                _mm_storeu_si128((__m128i *)(_norm + j + 4), v_norm);
-            }
+                v_dx0 = v_reinterpret_as_s32(v_abs(v_dx0));
+                v_dx1 = v_reinterpret_as_s32(v_abs(v_dx1));
+                v_dy0 = v_reinterpret_as_s32(v_abs(v_dy0));
+                v_dy1 = v_reinterpret_as_s32(v_abs(v_dy1));
+
+                v_store(_norm + j, v_dx0 + v_dy0);
+                v_store(_norm + j + 4, v_dx1 + v_dy1);
             }
-#elif CV_NEON
-        for ( ; j <= width - 8; j += 8)
-        {
-            int16x8_t v_dx = vld1q_s16(_dx + j), v_dy = vld1q_s16(_dy + j);
-            vst1q_s32(_norm + j, vaddq_s32(vabsq_s32(vmovl_s16(vget_low_s16(v_dx))),
-                                           vabsq_s32(vmovl_s16(vget_low_s16(v_dy)))));
-            vst1q_s32(_norm + j + 4, vaddq_s32(vabsq_s32(vmovl_s16(vget_high_s16(v_dx))),
-                                               vabsq_s32(vmovl_s16(vget_high_s16(v_dy)))));
         }
 #endif
         for ( ; j < width; ++j)
@@ -1034,33 +1009,23 @@ static void CannyImpl(Mat& dx, Mat& dy, Mat& dst,
     else
     {
         int j = 0, width = cols * cn;
-#if CV_SSE2
-        if (haveSSE2)
+#if CV_SIMD128
+        if (haveSIMD)
         {
            for ( ; j <= width - 8; j += 8)
            {
-                __m128i v_dx = _mm_loadu_si128((const __m128i *)(_dx + j));
-                __m128i v_dy = _mm_loadu_si128((const __m128i *)(_dy + j));
+                v_int16x8 v_dx = v_load((const short*)(_dx + j));
+                v_int16x8 v_dy = v_load((const short*)(_dy + j));
 
-                __m128i v_dx_dy_ml = _mm_unpacklo_epi16(v_dx, v_dy);
-                __m128i v_dx_dy_mh = _mm_unpackhi_epi16(v_dx, v_dy);
-                __m128i v_norm_ml = _mm_madd_epi16(v_dx_dy_ml, v_dx_dy_ml);
-                __m128i v_norm_mh = _mm_madd_epi16(v_dx_dy_mh, v_dx_dy_mh);
-                _mm_storeu_si128((__m128i *)(_norm + j), v_norm_ml);
-                _mm_storeu_si128((__m128i *)(_norm + j + 4), v_norm_mh);
-            }
-        }
-#elif CV_NEON
-        for ( ; j <= width - 8; j += 8)
-        {
-            int16x8_t v_dx = vld1q_s16(_dx + j), v_dy = vld1q_s16(_dy + j);
-            int16x4_t v_dxp = vget_low_s16(v_dx), v_dyp = vget_low_s16(v_dy);
-            int32x4_t v_dst = vmlal_s16(vmull_s16(v_dxp, v_dxp), v_dyp, v_dyp);
-            vst1q_s32(_norm + j, v_dst);
+                v_int16x8 v_dx_dy0, v_dx_dy1;
+                v_zip(v_dx, v_dy, v_dx_dy0, v_dx_dy1);
 
-            v_dxp = vget_high_s16(v_dx), v_dyp = vget_high_s16(v_dy);
-            v_dst = vmlal_s16(vmull_s16(v_dxp, v_dxp), v_dyp, v_dyp);
-            vst1q_s32(_norm + j + 4, v_dst);
+                v_int32x4 v_dst0 = v_dotprod(v_dx_dy0, v_dx_dy0);
+                v_int32x4 v_dst1 = v_dotprod(v_dx_dy1, v_dx_dy1);
+
+                v_store(_norm + j, v_dst0);
+                v_store(_norm + j + 4, v_dst1);
+            }
         }
 #endif
         for ( ; j < width; ++j)
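
// Illustrative sketch only (not part of the patch): the CannyImpl variant above
// computes dx*dx + dy*dy by interleaving dx and dy with v_zip and squaring with
// v_dotprod, which sums adjacent 16-bit products into 32-bit lanes. The helper
// name l2_magnitude_dotprod is an assumption made for this example.
#include "opencv2/core.hpp"
#include "opencv2/core/hal/intrin.hpp"

static void l2_magnitude_dotprod(const short* dx, const short* dy, int* norm, int width)
{
    int j = 0;
#if CV_SIMD128
    for ( ; j <= width - 8; j += 8)
    {
        cv::v_int16x8 v_dx = cv::v_load(dx + j);
        cv::v_int16x8 v_dy = cv::v_load(dy + j);

        // interleave: v_dx_dy0 = dx0,dy0,dx1,dy1,...  v_dx_dy1 = dx4,dy4,...
        cv::v_int16x8 v_dx_dy0, v_dx_dy1;
        cv::v_zip(v_dx, v_dy, v_dx_dy0, v_dx_dy1);

        // dot product of a pair with itself gives dx*dx + dy*dy per 32-bit lane
        cv::v_store(norm + j,     cv::v_dotprod(v_dx_dy0, v_dx_dy0));
        cv::v_store(norm + j + 4, cv::v_dotprod(v_dx_dy1, v_dx_dy1));
    }
#endif
    for ( ; j < width; ++j)   // scalar tail
        norm[j] = int(dx[j]) * dx[j] + int(dy[j]) * dy[j];
}
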
@@ -1112,30 +1077,31 @@ static void CannyImpl(Mat& dx, Mat& dy, Mat& dst,
     const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);
 
     int prev_flag = 0, j = 0;
-#if CV_SSE2
-    if (checkHardwareSupport(CPU_SSE2))
+#if CV_SIMD128
+    if (haveSIMD)
     {
-        __m128i v_low = _mm_set1_epi32(low), v_one = _mm_set1_epi8(1);
+        v_int32x4 v_low = v_setall_s32(low);
+        v_int8x16 v_one = v_setall_s8(1);
 
         for (; j <= cols - 16; j += 16)
         {
-            __m128i v_m1 = _mm_loadu_si128((const __m128i*)(_mag + j));
-            __m128i v_m2 = _mm_loadu_si128((const __m128i*)(_mag + j + 4));
-            __m128i v_m3 = _mm_loadu_si128((const __m128i*)(_mag + j + 8));
-            __m128i v_m4 = _mm_loadu_si128((const __m128i*)(_mag + j + 12));
+            v_int32x4 v_m1 = v_load((const int*)(_mag + j));
+            v_int32x4 v_m2 = v_load((const int*)(_mag + j + 4));
+            v_int32x4 v_m3 = v_load((const int*)(_mag + j + 8));
+            v_int32x4 v_m4 = v_load((const int*)(_mag + j + 12));
 
-            _mm_storeu_si128((__m128i*)(_map + j), v_one);
+            v_store((signed char*)(_map + j), v_one);
 
-            __m128i v_cmp1 = _mm_cmpgt_epi32(v_m1, v_low);
-            __m128i v_cmp2 = _mm_cmpgt_epi32(v_m2, v_low);
-            __m128i v_cmp3 = _mm_cmpgt_epi32(v_m3, v_low);
-            __m128i v_cmp4 = _mm_cmpgt_epi32(v_m4, v_low);
+            v_int32x4 v_cmp1 = v_m1 > v_low;
+            v_int32x4 v_cmp2 = v_m2 > v_low;
+            v_int32x4 v_cmp3 = v_m3 > v_low;
+            v_int32x4 v_cmp4 = v_m4 > v_low;
 
-            v_cmp1 = _mm_packs_epi32(v_cmp1, v_cmp2);
-            v_cmp2 = _mm_packs_epi32(v_cmp3, v_cmp4);
+            v_int16x8 v_cmp80 = v_pack(v_cmp1, v_cmp2);
+            v_int16x8 v_cmp81 = v_pack(v_cmp3, v_cmp4);
 
-            v_cmp1 = _mm_packs_epi16(v_cmp1, v_cmp2);
-            unsigned int mask = _mm_movemask_epi8(v_cmp1);
+            v_int8x16 v_cmp = v_pack(v_cmp80, v_cmp81);
+            unsigned int mask = v_signmask(v_cmp);
 
             if (mask)
             {