diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index 49a9cceaec..fb69cd201a 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -64,7 +64,7 @@ FUNCTOR_TEMPLATE(VLoadStore128); #if CV_SSE2 FUNCTOR_TEMPLATE(VLoadStore64); FUNCTOR_TEMPLATE(VLoadStore128Aligned); -#if CV_AVX +#if CV_AVX2 FUNCTOR_TEMPLATE(VLoadStore256); FUNCTOR_TEMPLATE(VLoadStore256Aligned); #endif @@ -2626,10 +2626,16 @@ struct Div_SIMD template <> struct Div_SIMD { + bool haveSIMD; + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); } + int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, double scale) const { int x = 0; + if (!haveSIMD) + return x; + __m128d v_scale = _mm_set1_pd(scale); __m128i v_zero = _mm_setzero_si128(); @@ -2672,10 +2678,16 @@ struct Div_SIMD template <> struct Div_SIMD { + bool haveSIMD; + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } + int operator() (const schar * src1, const schar * src2, schar * dst, int width, double scale) const { int x = 0; + if (!haveSIMD) + return x; + __m128d v_scale = _mm_set1_pd(scale); __m128i v_zero = _mm_setzero_si128(); @@ -2718,10 +2730,16 @@ struct Div_SIMD template <> struct Div_SIMD { + bool haveSIMD; + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); } + int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, double scale) const { int x = 0; + if (!haveSIMD) + return x; + __m128d v_scale = _mm_set1_pd(scale); __m128i v_zero = _mm_setzero_si128(); @@ -2763,10 +2781,16 @@ struct Div_SIMD template <> struct Div_SIMD { + bool haveSIMD; + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } + int operator() (const short * src1, const short * src2, short * dst, int width, double scale) const { int x = 0; + if (!haveSIMD) + return x; + __m128d v_scale = _mm_set1_pd(scale); __m128i v_zero = _mm_setzero_si128(); @@ -2806,10 +2830,16 @@ struct Div_SIMD template <> struct Div_SIMD { + bool haveSIMD; + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } + int operator() (const int * src1, const int * src2, int * dst, int width, double scale) const { int x = 0; + if (!haveSIMD) + return x; + __m128d v_scale = _mm_set1_pd(scale); __m128i v_zero = _mm_setzero_si128(); @@ -2902,10 +2932,16 @@ struct Recip_SIMD template <> struct Recip_SIMD { + bool haveSIMD; + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); } + int operator() (const uchar * src2, uchar * dst, int width, double scale) const { int x = 0; + if (!haveSIMD) + return x; + __m128d v_scale = _mm_set1_pd(scale); __m128i v_zero = _mm_setzero_si128(); @@ -2941,10 +2977,16 @@ struct Recip_SIMD template <> struct Recip_SIMD { + bool haveSIMD; + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } + int operator() (const schar * src2, schar * dst, int width, double scale) const { int x = 0; + if (!haveSIMD) + return x; + __m128d v_scale = _mm_set1_pd(scale); __m128i v_zero = _mm_setzero_si128(); @@ -2980,10 +3022,16 @@ struct Recip_SIMD template <> struct Recip_SIMD { + bool haveSIMD; + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); } + int operator() (const ushort * src2, ushort * dst, int width, double scale) const { int x = 0; + if (!haveSIMD) + return x; + __m128d v_scale = _mm_set1_pd(scale); __m128i v_zero = _mm_setzero_si128(); @@ -3018,10 +3066,16 @@ struct Recip_SIMD template <> struct Recip_SIMD { + bool haveSIMD; + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } + int operator() (const short * src2, 
short * dst, int width, double scale) const { int x = 0; + if (!haveSIMD) + return x; + __m128d v_scale = _mm_set1_pd(scale); __m128i v_zero = _mm_setzero_si128(); @@ -3054,10 +3108,16 @@ struct Recip_SIMD template <> struct Recip_SIMD { + bool haveSIMD; + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } + int operator() (const int * src2, int * dst, int width, double scale) const { int x = 0; + if (!haveSIMD) + return x; + __m128d v_scale = _mm_set1_pd(scale); __m128i v_zero = _mm_setzero_si128(); @@ -4126,7 +4186,8 @@ static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t ste { int x =0; #if CV_SSE2 - if( USE_SSE2 ){ + if( USE_SSE2 ) + { __m128i m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi8 (-1); __m128i c128 = _mm_set1_epi8 (-128); for( ; x <= size.width - 16; x += 16 ) @@ -4142,7 +4203,7 @@ static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t ste } } - #elif CV_NEON + #elif CV_NEON uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255); for( ; x <= size.width - 16; x += 16 ) @@ -4164,7 +4225,8 @@ static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t ste { int x = 0; #if CV_SSE2 - if( USE_SSE2 ){ + if( USE_SSE2 ) + { __m128i m128 = code == CMP_EQ ? _mm_setzero_si128() : _mm_set1_epi8 (-1); for( ; x <= size.width - 16; x += 16 ) { @@ -4174,7 +4236,7 @@ static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t ste _mm_storeu_si128((__m128i*)(dst + x), r00); } } - #elif CV_NEON + #elif CV_NEON uint8x16_t mask = code == CMP_EQ ? vdupq_n_u8(0) : vdupq_n_u8(255); for( ; x <= size.width - 16; x += 16 ) @@ -4254,7 +4316,8 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st { int x =0; #if CV_SSE2 - if( USE_SSE2){// + if( USE_SSE2) + { __m128i m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi16 (-1); for( ; x <= size.width - 16; x += 16 ) { @@ -4278,7 +4341,7 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st x += 8; } } - #elif CV_NEON + #elif CV_NEON uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255); for( ; x <= size.width - 16; x += 16 ) @@ -4293,8 +4356,7 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask)); } - - #endif + #endif for( ; x < size.width; x++ ){ dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m); @@ -4308,7 +4370,8 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st { int x = 0; #if CV_SSE2 - if( USE_SSE2 ){ + if( USE_SSE2 ) + { __m128i m128 = code == CMP_EQ ? _mm_setzero_si128() : _mm_set1_epi16 (-1); for( ; x <= size.width - 16; x += 16 ) { @@ -4332,7 +4395,7 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st x += 8; } } - #elif CV_NEON + #elif CV_NEON uint8x16_t mask = code == CMP_EQ ? 
vdupq_n_u8(0) : vdupq_n_u8(255); for( ; x <= size.width - 16; x += 16 ) @@ -4347,8 +4410,8 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask)); } - #endif - for( ; x < size.width; x++ ) + #endif + for( ; x < size.width; x++ ) dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m); } } diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp index 626a666a95..090acf5508 100644 --- a/modules/core/src/convert.cpp +++ b/modules/core/src/convert.cpp @@ -158,7 +158,7 @@ struct VSplit2 \ VSplit2() \ { \ - support = true; \ + support = checkHardwareSupport(CV_CPU_SSE2); \ } \ \ void operator()(const data_type * src, \ @@ -191,7 +191,7 @@ struct VSplit3 \ VSplit3() \ { \ - support = true; \ + support = checkHardwareSupport(CV_CPU_SSE2); \ } \ \ void operator()(const data_type * src, \ @@ -229,7 +229,7 @@ struct VSplit4 \ VSplit4() \ { \ - support = true; \ + support = checkHardwareSupport(CV_CPU_SSE2); \ } \ \ void operator()(const data_type * src, data_type * dst0, data_type * dst1, \ @@ -502,7 +502,7 @@ struct VMerge4 bool support; }; -#define MERGE2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor) \ +#define MERGE2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \ template <> \ struct VMerge2 \ { \ @@ -513,7 +513,7 @@ struct VMerge2 \ VMerge2() \ { \ - support = true; \ + support = checkHardwareSupport(se); \ } \ \ void operator()(const data_type * src0, const data_type * src1, \ @@ -535,7 +535,7 @@ struct VMerge2 bool support; \ } -#define MERGE3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor) \ +#define MERGE3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \ template <> \ struct VMerge3 \ { \ @@ -546,7 +546,7 @@ struct VMerge3 \ VMerge3() \ { \ - support = true; \ + support = checkHardwareSupport(se); \ } \ \ void operator()(const data_type * src0, const data_type * src1, const data_type * src2,\ @@ -573,7 +573,7 @@ struct VMerge3 bool support; \ } -#define MERGE4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor) \ +#define MERGE4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \ template <> \ struct VMerge4 \ { \ @@ -584,7 +584,7 @@ struct VMerge4 \ VMerge4() \ { \ - support = true; \ + support = checkHardwareSupport(se); \ } \ \ void operator()(const data_type * src0, const data_type * src1, \ @@ -616,19 +616,19 @@ struct VMerge4 bool support; \ } -MERGE2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128); -MERGE3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128); -MERGE4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128); +MERGE2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2); +MERGE3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2); +MERGE4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2); #if CV_SSE4_1 -MERGE2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128); -MERGE3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128); -MERGE4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128); +MERGE2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1); +MERGE3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1); +MERGE4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, 
_mm_interleave_epi16, si128, CV_CPU_SSE4_1); #endif -MERGE2_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps); -MERGE3_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps); -MERGE4_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps); +MERGE2_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2); +MERGE3_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2); +MERGE4_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2); #endif @@ -4404,6 +4404,9 @@ struct Cvt_SIMD { int x = 0; + if (!USE_SSE2) + return x; + for ( ; x <= width - 8; x += 8) { __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); @@ -4430,6 +4433,9 @@ struct Cvt_SIMD { int x = 0; + if (!USE_SSE2) + return x; + for ( ; x <= width - 8; x += 8) { __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); @@ -4454,10 +4460,16 @@ struct Cvt_SIMD template <> struct Cvt_SIMD { + bool haveSIMD; + Cvt_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); } + int operator() (const double * src, ushort * dst, int width) const { int x = 0; + if (!haveSIMD) + return x; + for ( ; x <= width - 8; x += 8) { __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); @@ -4486,6 +4498,9 @@ struct Cvt_SIMD { int x = 0; + if (!USE_SSE2) + return x; + for ( ; x <= width - 8; x += 8) { __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); @@ -4512,6 +4527,9 @@ struct Cvt_SIMD { int x = 0; + if (!USE_SSE2) + return x; + for ( ; x <= width - 4; x += 4) { __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); @@ -4532,6 +4550,9 @@ struct Cvt_SIMD { int x = 0; + if (!USE_SSE2) + return x; + for ( ; x <= width - 4; x += 4) { __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); @@ -5114,8 +5135,9 @@ cvt_( const float* src, size_t sstep, { int x = 0; #if CV_SSE2 - if(USE_SSE2){ - for( ; x <= size.width - 8; x += 8 ) + if(USE_SSE2) + { + for( ; x <= size.width - 8; x += 8 ) { __m128 src128 = _mm_loadu_ps (src + x); __m128i src_int128 = _mm_cvtps_epi32 (src128); diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp index d7f9dc5379..13ada1d1d6 100644 --- a/modules/core/src/mathfuncs.cpp +++ b/modules/core/src/mathfuncs.cpp @@ -597,15 +597,18 @@ void phase( InputArray src1, InputArray src2, OutputArray dst, bool angleInDegre k = 0; #if CV_SSE2 - for ( ; k <= len - 4; k += 4) + if (USE_SSE2) { - __m128 v_dst0 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(x + k)), - _mm_cvtpd_ps(_mm_loadu_pd(x + k + 2))); - __m128 v_dst1 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(y + k)), - _mm_cvtpd_ps(_mm_loadu_pd(y + k + 2))); + for ( ; k <= len - 4; k += 4) + { + __m128 v_dst0 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(x + k)), + _mm_cvtpd_ps(_mm_loadu_pd(x + k + 2))); + __m128 v_dst1 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(y + k)), + _mm_cvtpd_ps(_mm_loadu_pd(y + k + 2))); - _mm_storeu_ps(buf[0] + k, v_dst0); - _mm_storeu_ps(buf[1] + k, v_dst1); + _mm_storeu_ps(buf[0] + k, v_dst0); + _mm_storeu_ps(buf[1] + k, v_dst1); + } } #endif @@ -619,11 +622,14 @@ void phase( InputArray src1, InputArray src2, OutputArray dst, bool angleInDegre k = 0; #if CV_SSE2 - for ( ; k <= len - 4; k += 4) + if (USE_SSE2) { - __m128 v_src = _mm_loadu_ps(buf[0] + k); - _mm_storeu_pd(angle + k, _mm_cvtps_pd(v_src)); - _mm_storeu_pd(angle + k + 2, _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8)))); + for ( ; k <= len - 4; k += 4) + { + __m128 v_src = _mm_loadu_ps(buf[0] + k); + _mm_storeu_pd(angle + k, _mm_cvtps_pd(v_src)); + _mm_storeu_pd(angle + k + 2, 
_mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8)))); + } } #endif @@ -728,15 +734,18 @@ void cartToPolar( InputArray src1, InputArray src2, k = 0; #if CV_SSE2 - for ( ; k <= len - 4; k += 4) + if (USE_SSE2) { - __m128 v_dst0 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(x + k)), - _mm_cvtpd_ps(_mm_loadu_pd(x + k + 2))); - __m128 v_dst1 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(y + k)), - _mm_cvtpd_ps(_mm_loadu_pd(y + k + 2))); + for ( ; k <= len - 4; k += 4) + { + __m128 v_dst0 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(x + k)), + _mm_cvtpd_ps(_mm_loadu_pd(x + k + 2))); + __m128 v_dst1 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(y + k)), + _mm_cvtpd_ps(_mm_loadu_pd(y + k + 2))); - _mm_storeu_ps(buf[0] + k, v_dst0); - _mm_storeu_ps(buf[1] + k, v_dst1); + _mm_storeu_ps(buf[0] + k, v_dst0); + _mm_storeu_ps(buf[1] + k, v_dst1); + } } #endif @@ -750,11 +759,14 @@ void cartToPolar( InputArray src1, InputArray src2, k = 0; #if CV_SSE2 - for ( ; k <= len - 4; k += 4) + if (USE_SSE2) { - __m128 v_src = _mm_loadu_ps(buf[0] + k); - _mm_storeu_pd(angle + k, _mm_cvtps_pd(v_src)); - _mm_storeu_pd(angle + k + 2, _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8)))); + for ( ; k <= len - 4; k += 4) + { + __m128 v_src = _mm_loadu_ps(buf[0] + k); + _mm_storeu_pd(angle + k, _mm_cvtps_pd(v_src)); + _mm_storeu_pd(angle + k + 2, _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8)))); + } } #endif @@ -832,17 +844,16 @@ static void SinCos_32f( const float *angle, float *sinval, float* cosval, k1 = N/360.; #if CV_AVX2 - __m128d v_i = _mm_set_pd(1, 0); - __m128d v_k1 = _mm_set1_pd(k1); - __m128d v_1 = _mm_set1_pd(1); - __m128i v_N1 = _mm_set1_epi32(N - 1); - __m128i v_N4 = _mm_set1_epi32(N >> 2); - __m128d v_sin_a0 = _mm_set1_pd(sin_a0); - __m128d v_sin_a2 = _mm_set1_pd(sin_a2); - __m128d v_cos_a0 = _mm_set1_pd(cos_a0); - if (USE_AVX2) { + __m128d v_k1 = _mm_set1_pd(k1); + __m128d v_1 = _mm_set1_pd(1); + __m128i v_N1 = _mm_set1_epi32(N - 1); + __m128i v_N4 = _mm_set1_epi32(N >> 2); + __m128d v_sin_a0 = _mm_set1_pd(sin_a0); + __m128d v_sin_a2 = _mm_set1_pd(sin_a2); + __m128d v_cos_a0 = _mm_set1_pd(cos_a0); + for ( ; i <= len - 4; i += 4) { __m128 v_angle = _mm_loadu_ps(angle + i); @@ -859,8 +870,8 @@ static void SinCos_32f( const float *angle, float *sinval, float* cosval, __m128d v_sin_b = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(v_sin_a0, v_t2), v_sin_a2), v_t); __m128d v_cos_b = _mm_add_pd(_mm_mul_pd(v_cos_a0, v_t2), v_1); - __m128d v_sin_a = _mm_i32gather_pd(sin_table, v_sin_idx, 1); - __m128d v_cos_a = _mm_i32gather_pd(sin_table, v_cos_idx, 1); + __m128d v_sin_a = _mm_i32gather_pd(sin_table, v_sin_idx, 8); + __m128d v_cos_a = _mm_i32gather_pd(sin_table, v_cos_idx, 8); __m128d v_sin_val_0 = _mm_add_pd(_mm_mul_pd(v_sin_a, v_cos_b), _mm_mul_pd(v_cos_a, v_sin_b)); @@ -868,7 +879,7 @@ static void SinCos_32f( const float *angle, float *sinval, float* cosval, _mm_mul_pd(v_sin_a, v_sin_b)); // 2-3 - v_t = _mm_mul_pd(_mm_cvtps_pd(_mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(v_angle), 8))), v_k1); + v_t = _mm_mul_pd(_mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_angle), 8))), v_k1); v_it = _mm_cvtpd_epi32(v_t); v_t = _mm_sub_pd(v_t, _mm_cvtepi32_pd(v_it)); @@ -879,8 +890,8 @@ static void SinCos_32f( const float *angle, float *sinval, float* cosval, v_sin_b = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(v_sin_a0, v_t2), v_sin_a2), v_t); v_cos_b = _mm_add_pd(_mm_mul_pd(v_cos_a0, v_t2), v_1); - v_sin_a = _mm_i32gather_pd(sin_table, v_sin_idx, 1); - v_cos_a = 
_mm_i32gather_pd(sin_table, v_cos_idx, 1); + v_sin_a = _mm_i32gather_pd(sin_table, v_sin_idx, 8); + v_cos_a = _mm_i32gather_pd(sin_table, v_cos_idx, 8); __m128d v_sin_val_1 = _mm_add_pd(_mm_mul_pd(v_sin_a, v_cos_b), _mm_mul_pd(v_cos_a, v_sin_b)); @@ -1032,11 +1043,14 @@ void polarToCart( InputArray src1, InputArray src2, vst1q_f32(y + k, vmulq_f32(vld1q_f32(y + k), v_m)); } #elif CV_SSE2 - for( ; k <= len - 4; k += 4 ) + if (USE_SSE2) { - __m128 v_m = _mm_loadu_ps(mag + k); - _mm_storeu_ps(x + k, _mm_mul_ps(_mm_loadu_ps(x + k), v_m)); - _mm_storeu_ps(y + k, _mm_mul_ps(_mm_loadu_ps(y + k), v_m)); + for( ; k <= len - 4; k += 4 ) + { + __m128 v_m = _mm_loadu_ps(mag + k); + _mm_storeu_ps(x + k, _mm_mul_ps(_mm_loadu_ps(x + k), v_m)); + _mm_storeu_ps(y + k, _mm_mul_ps(_mm_loadu_ps(y + k), v_m)); + } } #endif @@ -1063,10 +1077,10 @@ void polarToCart( InputArray src1, InputArray src2, x[k] = buf[0][k]*m; y[k] = buf[1][k]*m; } else - for( k = 0; k < len; k++ ) - { - x[k] = buf[0][k]; y[k] = buf[1][k]; - } + { + std::memcpy(x, buf[0], sizeof(float) * len); + std::memcpy(y, buf[1], sizeof(float) * len); + } } if( ptrs[0] ) diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index 4eb17d6a14..1fcb9b54d1 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp @@ -397,6 +397,8 @@ static int countNonZero_(const T* src, int len ) return nz; } +#if CV_SSE2 + static const uchar * initPopcountTable() { static uchar tab[256]; @@ -425,6 +427,8 @@ static const uchar * initPopcountTable() return tab; } +#endif + static int countNonZero8u( const uchar* src, int len ) { int i=0, nz = 0; @@ -645,7 +649,7 @@ static int countNonZero32f( const float* src, int len ) } static int countNonZero64f( const double* src, int len ) -{ +{ int i = 0, nz = 0; #if CV_SSE2 if (USE_SSE2) diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index 4efbcc5f8b..5ae1170b43 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -967,6 +967,7 @@ struct Gray2RGB5x5 v_n7 = vdup_n_u8(~7); v_n3 = vdup_n_u8(~3); #elif CV_SSE2 + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); v_n7 = _mm_set1_epi16(~7); v_n3 = _mm_set1_epi16(~3); v_zero = _mm_setzero_si128(); @@ -988,21 +989,24 @@ struct Gray2RGB5x5 vst1q_u16((ushort *)dst + i, v_dst); } #elif CV_SSE2 - for ( ; i <= n - 16; i += 16 ) + if (haveSIMD) { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + i)); + for ( ; i <= n - 16; i += 16 ) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + i)); - __m128i v_src_p = _mm_unpacklo_epi8(v_src, v_zero); - __m128i v_dst = _mm_or_si128(_mm_srli_epi16(v_src_p, 3), - _mm_or_si128(_mm_slli_epi16(_mm_and_si128(v_src_p, v_n3), 3), - _mm_slli_epi16(_mm_and_si128(v_src_p, v_n7), 8))); - _mm_storeu_si128((__m128i *)((ushort *)dst + i), v_dst); + __m128i v_src_p = _mm_unpacklo_epi8(v_src, v_zero); + __m128i v_dst = _mm_or_si128(_mm_srli_epi16(v_src_p, 3), + _mm_or_si128(_mm_slli_epi16(_mm_and_si128(v_src_p, v_n3), 3), + _mm_slli_epi16(_mm_and_si128(v_src_p, v_n7), 8))); + _mm_storeu_si128((__m128i *)((ushort *)dst + i), v_dst); - v_src_p = _mm_unpackhi_epi8(v_src, v_zero); - v_dst = _mm_or_si128(_mm_srli_epi16(v_src_p, 3), - _mm_or_si128(_mm_slli_epi16(_mm_and_si128(v_src_p, v_n3), 3), - _mm_slli_epi16(_mm_and_si128(v_src_p, v_n7), 8))); - _mm_storeu_si128((__m128i *)((ushort *)dst + i + 8), v_dst); + v_src_p = _mm_unpackhi_epi8(v_src, v_zero); + v_dst = _mm_or_si128(_mm_srli_epi16(v_src_p, 3), + _mm_or_si128(_mm_slli_epi16(_mm_and_si128(v_src_p, v_n3), 3), + 
_mm_slli_epi16(_mm_and_si128(v_src_p, v_n7), 8))); + _mm_storeu_si128((__m128i *)((ushort *)dst + i + 8), v_dst); + } } #endif for ( ; i < n; i++ ) @@ -1021,21 +1025,24 @@ struct Gray2RGB5x5 vst1q_u16((ushort *)dst + i, v_dst); } #elif CV_SSE2 - for ( ; i <= n - 16; i += 8 ) + if (haveSIMD) { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + i)); - - __m128i v_src_p = _mm_srli_epi16(_mm_unpacklo_epi8(v_src, v_zero), 3); - __m128i v_dst = _mm_or_si128(v_src_p, - _mm_or_si128(_mm_slli_epi32(v_src_p, 5), - _mm_slli_epi16(v_src_p, 10))); - _mm_storeu_si128((__m128i *)((ushort *)dst + i), v_dst); - - v_src_p = _mm_srli_epi16(_mm_unpackhi_epi8(v_src, v_zero), 3); - v_dst = _mm_or_si128(v_src_p, - _mm_or_si128(_mm_slli_epi16(v_src_p, 5), - _mm_slli_epi16(v_src_p, 10))); - _mm_storeu_si128((__m128i *)((ushort *)dst + i + 8), v_dst); + for ( ; i <= n - 16; i += 8 ) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + i)); + + __m128i v_src_p = _mm_srli_epi16(_mm_unpacklo_epi8(v_src, v_zero), 3); + __m128i v_dst = _mm_or_si128(v_src_p, + _mm_or_si128(_mm_slli_epi32(v_src_p, 5), + _mm_slli_epi16(v_src_p, 10))); + _mm_storeu_si128((__m128i *)((ushort *)dst + i), v_dst); + + v_src_p = _mm_srli_epi16(_mm_unpackhi_epi8(v_src, v_zero), 3); + v_dst = _mm_or_si128(v_src_p, + _mm_or_si128(_mm_slli_epi16(v_src_p, 5), + _mm_slli_epi16(v_src_p, 10))); + _mm_storeu_si128((__m128i *)((ushort *)dst + i + 8), v_dst); + } } #endif for( ; i < n; i++ ) @@ -1051,6 +1058,7 @@ struct Gray2RGB5x5 uint8x8_t v_n7, v_n3; #elif CV_SSE2 __m128i v_n7, v_n3, v_zero; + bool haveSIMD; #endif }; @@ -1084,6 +1092,7 @@ struct RGB5x52Gray v_f8 = vdupq_n_u16(0xf8); v_fc = vdupq_n_u16(0xfc); #elif CV_SSE2 + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); v_b2y = _mm_set1_epi16(B2Y); v_g2y = _mm_set1_epi16(G2Y); v_r2y = _mm_set1_epi16(R2Y); @@ -1116,37 +1125,40 @@ struct RGB5x52Gray vst1_u8(dst + i, vmovn_u16(vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1)))); } #elif CV_SSE2 - __m128i v_zero = _mm_setzero_si128(); - - for ( ; i <= n - 8; i += 8) + if (haveSIMD) { - __m128i v_src = _mm_loadu_si128((__m128i const *)((ushort *)src + i)); - __m128i v_t0 = _mm_and_si128(_mm_slli_epi16(v_src, 3), v_f8), - v_t1 = _mm_and_si128(_mm_srli_epi16(v_src, 3), v_fc), - v_t2 = _mm_and_si128(_mm_srli_epi16(v_src, 8), v_f8); - - __m128i v_mullo_b = _mm_mullo_epi16(v_t0, v_b2y); - __m128i v_mullo_g = _mm_mullo_epi16(v_t1, v_g2y); - __m128i v_mullo_r = _mm_mullo_epi16(v_t2, v_r2y); - __m128i v_mulhi_b = _mm_mulhi_epi16(v_t0, v_b2y); - __m128i v_mulhi_g = _mm_mulhi_epi16(v_t1, v_g2y); - __m128i v_mulhi_r = _mm_mulhi_epi16(v_t2, v_r2y); - - __m128i v_dst0 = _mm_add_epi32(_mm_unpacklo_epi16(v_mullo_b, v_mulhi_b), - _mm_unpacklo_epi16(v_mullo_g, v_mulhi_g)); - v_dst0 = _mm_add_epi32(_mm_add_epi32(v_dst0, v_delta), - _mm_unpacklo_epi16(v_mullo_r, v_mulhi_r)); + __m128i v_zero = _mm_setzero_si128(); - __m128i v_dst1 = _mm_add_epi32(_mm_unpackhi_epi16(v_mullo_b, v_mulhi_b), - _mm_unpackhi_epi16(v_mullo_g, v_mulhi_g)); - v_dst1 = _mm_add_epi32(_mm_add_epi32(v_dst1, v_delta), - _mm_unpackhi_epi16(v_mullo_r, v_mulhi_r)); - - v_dst0 = _mm_srli_epi32(v_dst0, yuv_shift); - v_dst1 = _mm_srli_epi32(v_dst1, yuv_shift); - - __m128i v_dst = _mm_packs_epi32(v_dst0, v_dst1); - _mm_storel_epi64((__m128i *)(dst + i), _mm_packus_epi16(v_dst, v_zero)); + for ( ; i <= n - 8; i += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)((ushort *)src + i)); + __m128i v_t0 = _mm_and_si128(_mm_slli_epi16(v_src, 3), v_f8), + v_t1 = 
_mm_and_si128(_mm_srli_epi16(v_src, 3), v_fc), + v_t2 = _mm_and_si128(_mm_srli_epi16(v_src, 8), v_f8); + + __m128i v_mullo_b = _mm_mullo_epi16(v_t0, v_b2y); + __m128i v_mullo_g = _mm_mullo_epi16(v_t1, v_g2y); + __m128i v_mullo_r = _mm_mullo_epi16(v_t2, v_r2y); + __m128i v_mulhi_b = _mm_mulhi_epi16(v_t0, v_b2y); + __m128i v_mulhi_g = _mm_mulhi_epi16(v_t1, v_g2y); + __m128i v_mulhi_r = _mm_mulhi_epi16(v_t2, v_r2y); + + __m128i v_dst0 = _mm_add_epi32(_mm_unpacklo_epi16(v_mullo_b, v_mulhi_b), + _mm_unpacklo_epi16(v_mullo_g, v_mulhi_g)); + v_dst0 = _mm_add_epi32(_mm_add_epi32(v_dst0, v_delta), + _mm_unpacklo_epi16(v_mullo_r, v_mulhi_r)); + + __m128i v_dst1 = _mm_add_epi32(_mm_unpackhi_epi16(v_mullo_b, v_mulhi_b), + _mm_unpackhi_epi16(v_mullo_g, v_mulhi_g)); + v_dst1 = _mm_add_epi32(_mm_add_epi32(v_dst1, v_delta), + _mm_unpackhi_epi16(v_mullo_r, v_mulhi_r)); + + v_dst0 = _mm_srli_epi32(v_dst0, yuv_shift); + v_dst1 = _mm_srli_epi32(v_dst1, yuv_shift); + + __m128i v_dst = _mm_packs_epi32(v_dst0, v_dst1); + _mm_storel_epi64((__m128i *)(dst + i), _mm_packus_epi16(v_dst, v_zero)); + } } #endif for ( ; i < n; i++) @@ -1177,37 +1189,40 @@ struct RGB5x52Gray vst1_u8(dst + i, vmovn_u16(vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1)))); } #elif CV_SSE2 - __m128i v_zero = _mm_setzero_si128(); - - for ( ; i <= n - 8; i += 8) + if (haveSIMD) { - __m128i v_src = _mm_loadu_si128((__m128i const *)((ushort *)src + i)); - __m128i v_t0 = _mm_and_si128(_mm_slli_epi16(v_src, 3), v_f8), - v_t1 = _mm_and_si128(_mm_srli_epi16(v_src, 2), v_f8), - v_t2 = _mm_and_si128(_mm_srli_epi16(v_src, 7), v_f8); - - __m128i v_mullo_b = _mm_mullo_epi16(v_t0, v_b2y); - __m128i v_mullo_g = _mm_mullo_epi16(v_t1, v_g2y); - __m128i v_mullo_r = _mm_mullo_epi16(v_t2, v_r2y); - __m128i v_mulhi_b = _mm_mulhi_epi16(v_t0, v_b2y); - __m128i v_mulhi_g = _mm_mulhi_epi16(v_t1, v_g2y); - __m128i v_mulhi_r = _mm_mulhi_epi16(v_t2, v_r2y); - - __m128i v_dst0 = _mm_add_epi32(_mm_unpacklo_epi16(v_mullo_b, v_mulhi_b), - _mm_unpacklo_epi16(v_mullo_g, v_mulhi_g)); - v_dst0 = _mm_add_epi32(_mm_add_epi32(v_dst0, v_delta), - _mm_unpacklo_epi16(v_mullo_r, v_mulhi_r)); + __m128i v_zero = _mm_setzero_si128(); - __m128i v_dst1 = _mm_add_epi32(_mm_unpackhi_epi16(v_mullo_b, v_mulhi_b), - _mm_unpackhi_epi16(v_mullo_g, v_mulhi_g)); - v_dst1 = _mm_add_epi32(_mm_add_epi32(v_dst1, v_delta), - _mm_unpackhi_epi16(v_mullo_r, v_mulhi_r)); - - v_dst0 = _mm_srli_epi32(v_dst0, yuv_shift); - v_dst1 = _mm_srli_epi32(v_dst1, yuv_shift); - - __m128i v_dst = _mm_packs_epi32(v_dst0, v_dst1); - _mm_storel_epi64((__m128i *)(dst + i), _mm_packus_epi16(v_dst, v_zero)); + for ( ; i <= n - 8; i += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)((ushort *)src + i)); + __m128i v_t0 = _mm_and_si128(_mm_slli_epi16(v_src, 3), v_f8), + v_t1 = _mm_and_si128(_mm_srli_epi16(v_src, 2), v_f8), + v_t2 = _mm_and_si128(_mm_srli_epi16(v_src, 7), v_f8); + + __m128i v_mullo_b = _mm_mullo_epi16(v_t0, v_b2y); + __m128i v_mullo_g = _mm_mullo_epi16(v_t1, v_g2y); + __m128i v_mullo_r = _mm_mullo_epi16(v_t2, v_r2y); + __m128i v_mulhi_b = _mm_mulhi_epi16(v_t0, v_b2y); + __m128i v_mulhi_g = _mm_mulhi_epi16(v_t1, v_g2y); + __m128i v_mulhi_r = _mm_mulhi_epi16(v_t2, v_r2y); + + __m128i v_dst0 = _mm_add_epi32(_mm_unpacklo_epi16(v_mullo_b, v_mulhi_b), + _mm_unpacklo_epi16(v_mullo_g, v_mulhi_g)); + v_dst0 = _mm_add_epi32(_mm_add_epi32(v_dst0, v_delta), + _mm_unpacklo_epi16(v_mullo_r, v_mulhi_r)); + + __m128i v_dst1 = _mm_add_epi32(_mm_unpackhi_epi16(v_mullo_b, v_mulhi_b), + _mm_unpackhi_epi16(v_mullo_g, 
v_mulhi_g)); + v_dst1 = _mm_add_epi32(_mm_add_epi32(v_dst1, v_delta), + _mm_unpackhi_epi16(v_mullo_r, v_mulhi_r)); + + v_dst0 = _mm_srli_epi32(v_dst0, yuv_shift); + v_dst1 = _mm_srli_epi32(v_dst1, yuv_shift); + + __m128i v_dst = _mm_packs_epi32(v_dst0, v_dst1); + _mm_storel_epi64((__m128i *)(dst + i), _mm_packus_epi16(v_dst, v_zero)); + } } #endif for ( ; i < n; i++) @@ -1226,6 +1241,7 @@ struct RGB5x52Gray uint32x4_t v_delta; uint16x8_t v_f8, v_fc; #elif CV_SSE2 + bool haveSIMD; __m128i v_b2y, v_g2y, v_r2y; __m128i v_delta; __m128i v_f8, v_fc; @@ -1445,7 +1461,9 @@ struct RGB2Gray float32x4_t v_cb, v_cg, v_cr; }; -#elif CV_SSE4_1 +#elif CV_SSE2 + +#if CV_SSE4_1 template <> struct RGB2Gray @@ -1464,6 +1482,8 @@ struct RGB2Gray v_cg = _mm_set1_epi16((short)coeffs[1]); v_cr = _mm_set1_epi16((short)coeffs[2]); v_delta = _mm_set1_epi32(1 << (yuv_shift - 1)); + + haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); } // 16s x 8 @@ -1494,7 +1514,7 @@ struct RGB2Gray { int scn = srccn, cb = coeffs[0], cg = coeffs[1], cr = coeffs[2], i = 0; - if (scn == 3) + if (scn == 3 && haveSIMD) { for ( ; i <= n - 16; i += 16, src += scn * 16) { @@ -1519,7 +1539,7 @@ struct RGB2Gray _mm_storeu_si128((__m128i *)(dst + i + 8), v_gray1); } } - else if (scn == 4) + else if (scn == 4 && haveSIMD) { for ( ; i <= n - 16; i += 16, src += scn * 16) { @@ -1554,8 +1574,11 @@ struct RGB2Gray int srccn, coeffs[3]; __m128i v_cb, v_cg, v_cr; __m128i v_delta; + bool haveSIMD; }; +#endif // CV_SSE4_1 + template <> struct RGB2Gray { @@ -1571,6 +1594,8 @@ struct RGB2Gray v_cb = _mm_set1_ps(coeffs[0]); v_cg = _mm_set1_ps(coeffs[1]); v_cr = _mm_set1_ps(coeffs[2]); + + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } void process(__m128 v_r, __m128 v_g, __m128 v_b, @@ -1586,7 +1611,7 @@ struct RGB2Gray int scn = srccn, i = 0; float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2]; - if (scn == 3) + if (scn == 3 && haveSIMD) { for ( ; i <= n - 8; i += 8, src += scn * 8) { @@ -1611,7 +1636,7 @@ struct RGB2Gray _mm_storeu_ps(dst + i + 4, v_gray1); } } - else if (scn == 4) + else if (scn == 4 && haveSIMD) { for ( ; i <= n - 8; i += 8, src += scn * 8) { @@ -1646,6 +1671,7 @@ struct RGB2Gray int srccn; float coeffs[3]; __m128 v_cb, v_cg, v_cr; + bool haveSIMD; }; #else @@ -1791,6 +1817,8 @@ struct RGB2YCrCb_f v_c3 = _mm_set1_ps(coeffs[3]); v_c4 = _mm_set1_ps(coeffs[4]); v_delta = _mm_set1_ps(ColorChannel::half()); + + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } void process(__m128 v_r, __m128 v_g, __m128 v_b, @@ -1811,7 +1839,7 @@ struct RGB2YCrCb_f float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; n *= 3; - if (scn == 3 || scn == 4) + if (haveSIMD) { for ( ; i <= n - 24; i += 24, src += 8 * scn) { @@ -1862,6 +1890,7 @@ struct RGB2YCrCb_f int srccn, blueIdx; float coeffs[5]; __m128 v_c0, v_c1, v_c2, v_c3, v_c4, v_delta; + bool haveSIMD; }; #endif @@ -2138,6 +2167,8 @@ struct RGB2YCrCb_i v_delta = _mm_set1_epi32(ColorChannel::half()*(1 << yuv_shift)); v_delta = _mm_add_epi32(v_delta, v_delta2); v_zero = _mm_setzero_si128(); + + haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); } // 16u x 8 @@ -2184,7 +2215,7 @@ struct RGB2YCrCb_i int delta = ColorChannel::half()*(1 << yuv_shift); n *= 3; - if (scn == 3 || scn == 4) + if (haveSIMD) { for ( ; i <= n - 96; i += 96, src += scn * 32) { @@ -2261,6 +2292,7 @@ struct RGB2YCrCb_i __m128i v_c0, v_c1, v_c2; __m128i v_c3, v_c4, v_delta, v_delta2; __m128i v_zero; + bool haveSIMD; }; template <> @@ -2285,6 +2317,8 @@ struct RGB2YCrCb_i v_delta = 
_mm_set1_epi32(ColorChannel::half()*(1 << yuv_shift)); v_delta = _mm_add_epi32(v_delta, v_delta2); v_zero = _mm_setzero_si128(); + + haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); } // 16u x 8 @@ -2331,7 +2365,7 @@ struct RGB2YCrCb_i int delta = ColorChannel::half()*(1 << yuv_shift); n *= 3; - if (scn == 3 || scn == 4) + if (haveSIMD) { for ( ; i <= n - 48; i += 48, src += scn * 16) { @@ -2387,6 +2421,7 @@ struct RGB2YCrCb_i __m128i v_c0, v_c1, v_c2; __m128i v_c3, v_c4, v_delta, v_delta2; __m128i v_zero; + bool haveSIMD; }; #endif // CV_SSE4_1 @@ -2518,6 +2553,8 @@ struct YCrCb2RGB_f v_c3 = _mm_set1_ps(coeffs[3]); v_delta = _mm_set1_ps(ColorChannel::half()); v_alpha = _mm_set1_ps(ColorChannel::max()); + + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } void process(__m128 v_y, __m128 v_cr, __m128 v_cb, @@ -2545,7 +2582,7 @@ struct YCrCb2RGB_f float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; n *= 3; - if (dcn == 3 || dcn == 4) + if (haveSIMD) { for ( ; i <= n - 24; i += 24, dst += 8 * dcn) { @@ -2606,6 +2643,7 @@ struct YCrCb2RGB_f float coeffs[4]; __m128 v_c0, v_c1, v_c2, v_c3, v_alpha, v_delta; + bool haveSIMD; }; #endif @@ -2920,6 +2958,7 @@ struct YCrCb2RGB_i v_alpha = _mm_set1_epi8(*(char *)&alpha); useSSE = coeffs[0] <= std::numeric_limits::max(); + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } // 16s x 8 @@ -2975,7 +3014,7 @@ struct YCrCb2RGB_i int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; n *= 3; - if ((dcn == 3 || dcn == 4) && useSSE) + if (haveSIMD && useSSE) { for ( ; i <= n - 96; i += 96, dst += dcn * 32) { @@ -3066,7 +3105,7 @@ struct YCrCb2RGB_i } int dstcn, blueIdx; int coeffs[4]; - bool useSSE; + bool useSSE, haveSIMD; __m128i v_c0, v_c1, v_c2, v_c3, v_delta2; __m128i v_delta, v_alpha, v_zero; @@ -3221,6 +3260,8 @@ struct RGB2XYZ_f v_c6 = _mm_set1_ps(coeffs[6]); v_c7 = _mm_set1_ps(coeffs[7]); v_c8 = _mm_set1_ps(coeffs[8]); + + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } void process(__m128 v_r, __m128 v_g, __m128 v_b, @@ -3248,7 +3289,7 @@ struct RGB2XYZ_f n *= 3; - if (scn == 3 || scn == 4) + if (haveSIMD) { for ( ; i <= n - 24; i += 24, src += 8 * scn) { @@ -3301,6 +3342,7 @@ struct RGB2XYZ_f int srccn; float coeffs[9]; __m128 v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8; + bool haveSIMD; }; @@ -3657,6 +3699,8 @@ struct XYZ2RGB_f v_c8 = _mm_set1_ps(coeffs[8]); v_alpha = _mm_set1_ps(ColorChannel::max()); + + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } void process(__m128 v_x, __m128 v_y, __m128 v_z, @@ -3685,7 +3729,7 @@ struct XYZ2RGB_f n *= 3; int i = 0; - if (dcn == 3 || dcn == 4) + if (haveSIMD) { for ( ; i <= n - 24; i += 24, dst += 8 * dcn) { @@ -3745,6 +3789,7 @@ struct XYZ2RGB_f __m128 v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8; __m128 v_alpha; + bool haveSIMD; }; #endif // CV_SSE2 @@ -4267,6 +4312,7 @@ struct HSV2RGB_b v_scale_inv = _mm_set1_ps(1.f/255.f); v_scale = _mm_set1_ps(255.0f); v_zero = _mm_setzero_si128(); + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); #endif } @@ -4331,36 +4377,39 @@ struct HSV2RGB_b vst3q_f32(buf + j + 12, v_dst); } #elif CV_SSE2 - for ( ; j <= (dn - 32) * 3; j += 96) + if (haveSIMD) { - __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src + j)); - __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + j + 16)); - __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + j + 32)); - __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + j + 48)); - __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64)); - __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 
j + 80)); - - _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); - - process(_mm_unpacklo_epi8(v_r0, v_zero), - _mm_unpacklo_epi8(v_g0, v_zero), - _mm_unpacklo_epi8(v_b0, v_zero), - buf + j); - - process(_mm_unpackhi_epi8(v_r0, v_zero), - _mm_unpackhi_epi8(v_g0, v_zero), - _mm_unpackhi_epi8(v_b0, v_zero), - buf + j + 24); + for ( ; j <= (dn - 32) * 3; j += 96) + { + __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src + j)); + __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + j + 16)); + __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + j + 32)); + __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + j + 48)); + __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64)); + __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80)); - process(_mm_unpacklo_epi8(v_r1, v_zero), - _mm_unpacklo_epi8(v_g1, v_zero), - _mm_unpacklo_epi8(v_b1, v_zero), - buf + j + 48); + _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); - process(_mm_unpackhi_epi8(v_r1, v_zero), - _mm_unpackhi_epi8(v_g1, v_zero), - _mm_unpackhi_epi8(v_b1, v_zero), - buf + j + 72); + process(_mm_unpacklo_epi8(v_r0, v_zero), + _mm_unpacklo_epi8(v_g0, v_zero), + _mm_unpacklo_epi8(v_b0, v_zero), + buf + j); + + process(_mm_unpackhi_epi8(v_r0, v_zero), + _mm_unpackhi_epi8(v_g0, v_zero), + _mm_unpackhi_epi8(v_b0, v_zero), + buf + j + 24); + + process(_mm_unpacklo_epi8(v_r1, v_zero), + _mm_unpacklo_epi8(v_g1, v_zero), + _mm_unpacklo_epi8(v_b1, v_zero), + buf + j + 48); + + process(_mm_unpackhi_epi8(v_r1, v_zero), + _mm_unpackhi_epi8(v_g1, v_zero), + _mm_unpackhi_epi8(v_b1, v_zero), + buf + j + 72); + } } #endif @@ -4403,7 +4452,7 @@ struct HSV2RGB_b } } #elif CV_SSE2 - if (dcn == 3) + if (dcn == 3 && haveSIMD) { for ( ; j <= (dn * 3 - 16); j += 16, dst += 16) { @@ -4445,6 +4494,7 @@ struct HSV2RGB_b #elif CV_SSE2 __m128 v_scale_inv, v_scale; __m128i v_zero; + bool haveSIMD; #endif }; @@ -4520,6 +4570,7 @@ struct RGB2HLS_b v_scale_inv = _mm_set1_ps(1.f/255.f); v_scale = _mm_set1_ps(255.f); v_zero = _mm_setzero_si128(); + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); #endif } @@ -4589,7 +4640,7 @@ struct RGB2HLS_b vst3q_f32(buf + j + 12, v_dst); } #elif CV_SSE2 - if (scn == 3) + if (scn == 3 && haveSIMD) { for ( ; j <= (dn * 3 - 16); j += 16, src += 16) { @@ -4633,38 +4684,41 @@ struct RGB2HLS_b vst3_u8(dst + j, v_dst); } #elif CV_SSE2 - for ( ; j <= (dn - 32) * 3; j += 96) + if (haveSIMD) { - __m128i v_h_0, v_l_0, v_s_0; - process(buf + j, - v_h_0, v_l_0, v_s_0); + for ( ; j <= (dn - 32) * 3; j += 96) + { + __m128i v_h_0, v_l_0, v_s_0; + process(buf + j, + v_h_0, v_l_0, v_s_0); - __m128i v_h_1, v_l_1, v_s_1; - process(buf + j + 24, - v_h_1, v_l_1, v_s_1); + __m128i v_h_1, v_l_1, v_s_1; + process(buf + j + 24, + v_h_1, v_l_1, v_s_1); - __m128i v_h0 = _mm_packus_epi16(v_h_0, v_h_1); - __m128i v_l0 = _mm_packus_epi16(v_l_0, v_l_1); - __m128i v_s0 = _mm_packus_epi16(v_s_0, v_s_1); + __m128i v_h0 = _mm_packus_epi16(v_h_0, v_h_1); + __m128i v_l0 = _mm_packus_epi16(v_l_0, v_l_1); + __m128i v_s0 = _mm_packus_epi16(v_s_0, v_s_1); - process(buf + j + 48, - v_h_0, v_l_0, v_s_0); + process(buf + j + 48, + v_h_0, v_l_0, v_s_0); - process(buf + j + 72, - v_h_1, v_l_1, v_s_1); + process(buf + j + 72, + v_h_1, v_l_1, v_s_1); - __m128i v_h1 = _mm_packus_epi16(v_h_0, v_h_1); - __m128i v_l1 = _mm_packus_epi16(v_l_0, v_l_1); - __m128i v_s1 = _mm_packus_epi16(v_s_0, v_s_1); + __m128i v_h1 = _mm_packus_epi16(v_h_0, v_h_1); + __m128i v_l1 = _mm_packus_epi16(v_l_0, v_l_1); + __m128i v_s1 = _mm_packus_epi16(v_s_0, v_s_1); - 
_mm_interleave_epi8(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1); + _mm_interleave_epi8(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1); - _mm_storeu_si128((__m128i *)(dst + j), v_h0); - _mm_storeu_si128((__m128i *)(dst + j + 16), v_h1); - _mm_storeu_si128((__m128i *)(dst + j + 32), v_l0); - _mm_storeu_si128((__m128i *)(dst + j + 48), v_l1); - _mm_storeu_si128((__m128i *)(dst + j + 64), v_s0); - _mm_storeu_si128((__m128i *)(dst + j + 80), v_s1); + _mm_storeu_si128((__m128i *)(dst + j), v_h0); + _mm_storeu_si128((__m128i *)(dst + j + 16), v_h1); + _mm_storeu_si128((__m128i *)(dst + j + 32), v_l0); + _mm_storeu_si128((__m128i *)(dst + j + 48), v_l1); + _mm_storeu_si128((__m128i *)(dst + j + 64), v_s0); + _mm_storeu_si128((__m128i *)(dst + j + 80), v_s1); + } } #endif for( ; j < dn*3; j += 3 ) @@ -4684,6 +4738,7 @@ struct RGB2HLS_b #elif CV_SSE2 __m128 v_scale, v_scale_inv; __m128i v_zero; + bool haveSIMD; #endif }; @@ -4767,6 +4822,7 @@ struct HLS2RGB_b v_scale_inv = _mm_set1_ps(1.f/255.f); v_scale = _mm_set1_ps(255.f); v_zero = _mm_setzero_si128(); + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); #endif } @@ -4831,36 +4887,39 @@ struct HLS2RGB_b vst3q_f32(buf + j + 12, v_dst); } #elif CV_SSE2 - for ( ; j <= (dn - 32) * 3; j += 96) + if (haveSIMD) { - __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src + j)); - __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + j + 16)); - __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + j + 32)); - __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + j + 48)); - __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64)); - __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80)); - - _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); - - process(_mm_unpacklo_epi8(v_r0, v_zero), - _mm_unpacklo_epi8(v_g0, v_zero), - _mm_unpacklo_epi8(v_b0, v_zero), - buf + j); - - process(_mm_unpackhi_epi8(v_r0, v_zero), - _mm_unpackhi_epi8(v_g0, v_zero), - _mm_unpackhi_epi8(v_b0, v_zero), - buf + j + 24); + for ( ; j <= (dn - 32) * 3; j += 96) + { + __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src + j)); + __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + j + 16)); + __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + j + 32)); + __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + j + 48)); + __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64)); + __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80)); - process(_mm_unpacklo_epi8(v_r1, v_zero), - _mm_unpacklo_epi8(v_g1, v_zero), - _mm_unpacklo_epi8(v_b1, v_zero), - buf + j + 48); + _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); - process(_mm_unpackhi_epi8(v_r1, v_zero), - _mm_unpackhi_epi8(v_g1, v_zero), - _mm_unpackhi_epi8(v_b1, v_zero), - buf + j + 72); + process(_mm_unpacklo_epi8(v_r0, v_zero), + _mm_unpacklo_epi8(v_g0, v_zero), + _mm_unpacklo_epi8(v_b0, v_zero), + buf + j); + + process(_mm_unpackhi_epi8(v_r0, v_zero), + _mm_unpackhi_epi8(v_g0, v_zero), + _mm_unpackhi_epi8(v_b0, v_zero), + buf + j + 24); + + process(_mm_unpacklo_epi8(v_r1, v_zero), + _mm_unpacklo_epi8(v_g1, v_zero), + _mm_unpacklo_epi8(v_b1, v_zero), + buf + j + 48); + + process(_mm_unpackhi_epi8(v_r1, v_zero), + _mm_unpackhi_epi8(v_g1, v_zero), + _mm_unpackhi_epi8(v_b1, v_zero), + buf + j + 72); + } } #endif for( ; j < dn*3; j += 3 ) @@ -4902,7 +4961,7 @@ struct HLS2RGB_b } } #elif CV_SSE2 - if (dcn == 3) + if (dcn == 3 && haveSIMD) { for ( ; j <= (dn * 3 - 16); j += 16, dst += 16) { @@ -4944,6 +5003,7 @@ struct HLS2RGB_b #elif CV_SSE2 __m128 v_scale, v_scale_inv; __m128i v_zero; + bool haveSIMD; 
#endif }; @@ -5264,6 +5324,7 @@ struct Lab2RGB_b v_scale = _mm_set1_ps(255.f); v_128 = _mm_set1_ps(128.0f); v_zero = _mm_setzero_si128(); + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); #endif } @@ -5330,36 +5391,39 @@ struct Lab2RGB_b vst3q_f32(buf + j + 12, v_dst); } #elif CV_SSE2 - for ( ; j <= (dn - 32) * 3; j += 96) + if (haveSIMD) { - __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src + j)); - __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + j + 16)); - __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + j + 32)); - __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + j + 48)); - __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64)); - __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80)); - - _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); - - process(_mm_unpacklo_epi8(v_r0, v_zero), - _mm_unpacklo_epi8(v_g0, v_zero), - _mm_unpacklo_epi8(v_b0, v_zero), - buf + j); - - process(_mm_unpackhi_epi8(v_r0, v_zero), - _mm_unpackhi_epi8(v_g0, v_zero), - _mm_unpackhi_epi8(v_b0, v_zero), - buf + j + 24); + for ( ; j <= (dn - 32) * 3; j += 96) + { + __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src + j)); + __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + j + 16)); + __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + j + 32)); + __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + j + 48)); + __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64)); + __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80)); - process(_mm_unpacklo_epi8(v_r1, v_zero), - _mm_unpacklo_epi8(v_g1, v_zero), - _mm_unpacklo_epi8(v_b1, v_zero), - buf + j + 48); + _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); - process(_mm_unpackhi_epi8(v_r1, v_zero), - _mm_unpackhi_epi8(v_g1, v_zero), - _mm_unpackhi_epi8(v_b1, v_zero), - buf + j + 72); + process(_mm_unpacklo_epi8(v_r0, v_zero), + _mm_unpacklo_epi8(v_g0, v_zero), + _mm_unpacklo_epi8(v_b0, v_zero), + buf + j); + + process(_mm_unpackhi_epi8(v_r0, v_zero), + _mm_unpackhi_epi8(v_g0, v_zero), + _mm_unpackhi_epi8(v_b0, v_zero), + buf + j + 24); + + process(_mm_unpacklo_epi8(v_r1, v_zero), + _mm_unpacklo_epi8(v_g1, v_zero), + _mm_unpacklo_epi8(v_b1, v_zero), + buf + j + 48); + + process(_mm_unpackhi_epi8(v_r1, v_zero), + _mm_unpackhi_epi8(v_g1, v_zero), + _mm_unpackhi_epi8(v_b1, v_zero), + buf + j + 72); + } } #endif @@ -5402,7 +5466,7 @@ struct Lab2RGB_b } } #elif CV_SSE2 - if (dcn == 3) + if (dcn == 3 && haveSIMD) { for ( ; j <= (dn * 3 - 16); j += 16, dst += 16) { @@ -5445,6 +5509,7 @@ struct Lab2RGB_b #elif CV_SSE2 __m128 v_scale, v_scale_inv, v_128; __m128i v_zero; + bool haveSIMD; #endif }; @@ -5627,6 +5692,7 @@ struct RGB2Luv_b v_coeff2 = _mm_set1_ps(96.525423728813564f); v_coeff3 = _mm_set1_ps(0.9732824427480916f); v_coeff4 = _mm_set1_ps(136.259541984732824f); + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); #endif } @@ -5698,7 +5764,7 @@ struct RGB2Luv_b vst3q_f32(buf + j + 12, v_dst); } #elif CV_SSE2 - if (scn == 3) + if (scn == 3 && haveSIMD) { for ( ; j <= (dn * 3 - 16); j += 16, src += 16) { @@ -5743,38 +5809,41 @@ struct RGB2Luv_b vst3_u8(dst + j, v_dst); } #elif CV_SSE2 - for ( ; j <= (dn - 32) * 3; j += 96) + if (haveSIMD) { - __m128i v_l_0, v_u_0, v_v_0; - process(buf + j, - v_l_0, v_u_0, v_v_0); + for ( ; j <= (dn - 32) * 3; j += 96) + { + __m128i v_l_0, v_u_0, v_v_0; + process(buf + j, + v_l_0, v_u_0, v_v_0); - __m128i v_l_1, v_u_1, v_v_1; - process(buf + j + 24, - v_l_1, v_u_1, v_v_1); + __m128i v_l_1, v_u_1, v_v_1; + process(buf + j + 24, + v_l_1, v_u_1, v_v_1); - __m128i 
v_l0 = _mm_packus_epi16(v_l_0, v_l_1); - __m128i v_u0 = _mm_packus_epi16(v_u_0, v_u_1); - __m128i v_v0 = _mm_packus_epi16(v_v_0, v_v_1); + __m128i v_l0 = _mm_packus_epi16(v_l_0, v_l_1); + __m128i v_u0 = _mm_packus_epi16(v_u_0, v_u_1); + __m128i v_v0 = _mm_packus_epi16(v_v_0, v_v_1); - process(buf + j + 48, - v_l_0, v_u_0, v_v_0); + process(buf + j + 48, + v_l_0, v_u_0, v_v_0); - process(buf + j + 72, - v_l_1, v_u_1, v_v_1); + process(buf + j + 72, + v_l_1, v_u_1, v_v_1); - __m128i v_l1 = _mm_packus_epi16(v_l_0, v_l_1); - __m128i v_u1 = _mm_packus_epi16(v_u_0, v_u_1); - __m128i v_v1 = _mm_packus_epi16(v_v_0, v_v_1); + __m128i v_l1 = _mm_packus_epi16(v_l_0, v_l_1); + __m128i v_u1 = _mm_packus_epi16(v_u_0, v_u_1); + __m128i v_v1 = _mm_packus_epi16(v_v_0, v_v_1); - _mm_interleave_epi8(v_l0, v_l1, v_u0, v_u1, v_v0, v_v1); + _mm_interleave_epi8(v_l0, v_l1, v_u0, v_u1, v_v0, v_v1); - _mm_storeu_si128((__m128i *)(dst + j), v_l0); - _mm_storeu_si128((__m128i *)(dst + j + 16), v_l1); - _mm_storeu_si128((__m128i *)(dst + j + 32), v_u0); - _mm_storeu_si128((__m128i *)(dst + j + 48), v_u1); - _mm_storeu_si128((__m128i *)(dst + j + 64), v_v0); - _mm_storeu_si128((__m128i *)(dst + j + 80), v_v1); + _mm_storeu_si128((__m128i *)(dst + j), v_l0); + _mm_storeu_si128((__m128i *)(dst + j + 16), v_l1); + _mm_storeu_si128((__m128i *)(dst + j + 32), v_u0); + _mm_storeu_si128((__m128i *)(dst + j + 48), v_u1); + _mm_storeu_si128((__m128i *)(dst + j + 64), v_v0); + _mm_storeu_si128((__m128i *)(dst + j + 80), v_v1); + } } #endif @@ -5796,6 +5865,7 @@ struct RGB2Luv_b #elif CV_SSE2 __m128 v_scale, v_scale_inv, v_coeff1, v_coeff2, v_coeff3, v_coeff4; __m128i v_zero; + bool haveSIMD; #endif }; @@ -5824,6 +5894,7 @@ struct Luv2RGB_b v_140 = _mm_set1_ps(140.f); v_scale = _mm_set1_ps(255.f); v_zero = _mm_setzero_si128(); + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); #endif } @@ -5847,7 +5918,7 @@ struct Luv2RGB_b v_u1 = _mm_sub_ps(_mm_mul_ps(v_u1, v_coeff1), v_134); v_v0 = _mm_sub_ps(_mm_mul_ps(v_v0, v_coeff2), v_140); v_v1 = _mm_sub_ps(_mm_mul_ps(v_v1, v_coeff2), v_140); - + _mm_interleave_ps(v_l0, v_l1, v_u0, v_u1, v_v0, v_v1); _mm_store_ps(buf, v_l0); @@ -5890,36 +5961,39 @@ struct Luv2RGB_b vst3q_f32(buf + j + 12, v_dst); } #elif CV_SSE2 - for ( ; j <= (dn - 32) * 3; j += 96) + if (haveSIMD) { - __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src + j)); - __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + j + 16)); - __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + j + 32)); - __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + j + 48)); - __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64)); - __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80)); - - _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); - - process(_mm_unpacklo_epi8(v_r0, v_zero), - _mm_unpacklo_epi8(v_g0, v_zero), - _mm_unpacklo_epi8(v_b0, v_zero), - buf + j); - - process(_mm_unpackhi_epi8(v_r0, v_zero), - _mm_unpackhi_epi8(v_g0, v_zero), - _mm_unpackhi_epi8(v_b0, v_zero), - buf + j + 24); + for ( ; j <= (dn - 32) * 3; j += 96) + { + __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src + j)); + __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + j + 16)); + __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + j + 32)); + __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + j + 48)); + __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64)); + __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80)); - process(_mm_unpacklo_epi8(v_r1, v_zero), - _mm_unpacklo_epi8(v_g1, v_zero), - 
_mm_unpacklo_epi8(v_b1, v_zero), - buf + j + 48); + _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); - process(_mm_unpackhi_epi8(v_r1, v_zero), - _mm_unpackhi_epi8(v_g1, v_zero), - _mm_unpackhi_epi8(v_b1, v_zero), - buf + j + 72); + process(_mm_unpacklo_epi8(v_r0, v_zero), + _mm_unpacklo_epi8(v_g0, v_zero), + _mm_unpacklo_epi8(v_b0, v_zero), + buf + j); + + process(_mm_unpackhi_epi8(v_r0, v_zero), + _mm_unpackhi_epi8(v_g0, v_zero), + _mm_unpackhi_epi8(v_b0, v_zero), + buf + j + 24); + + process(_mm_unpacklo_epi8(v_r1, v_zero), + _mm_unpacklo_epi8(v_g1, v_zero), + _mm_unpacklo_epi8(v_b1, v_zero), + buf + j + 48); + + process(_mm_unpackhi_epi8(v_r1, v_zero), + _mm_unpackhi_epi8(v_g1, v_zero), + _mm_unpackhi_epi8(v_b1, v_zero), + buf + j + 72); + } } #endif for( ; j < dn*3; j += 3 ) @@ -5961,7 +6035,7 @@ struct Luv2RGB_b } } #elif CV_SSE2 - if (dcn == 3) + if (dcn == 3 && haveSIMD) { for ( ; j <= (dn * 3 - 16); j += 16, dst += 16) { @@ -6004,6 +6078,7 @@ struct Luv2RGB_b #elif CV_SSE2 __m128 v_scale, v_scale_inv, v_coeff1, v_coeff2, v_134, v_140; __m128i v_zero; + bool haveSIMD; #endif }; diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index 4880819b9e..304210f84e 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -1963,9 +1963,9 @@ private: struct ResizeAreaFastVec_SIMD_32f { ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) : - scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step) + cn(_cn), step(_step) { - fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4); + fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4); } int operator() (const float * S, float * D, int w) const @@ -2005,7 +2005,6 @@ struct ResizeAreaFastVec_SIMD_32f } private: - int scale_x, scale_y; int cn; bool fast_mode; int step; @@ -2289,9 +2288,10 @@ private: struct ResizeAreaFastVec_SIMD_32f { ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) : - scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step) + cn(_cn), step(_step) { - fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4); + fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4); + fast_mode = fast_mode && checkHardwareSupport(CV_CPU_SSE2); } int operator() (const float * S, float * D, int w) const @@ -2335,7 +2335,6 @@ struct ResizeAreaFastVec_SIMD_32f } private: - int scale_x, scale_y; int cn; bool fast_mode; int step; @@ -4817,6 +4816,13 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, size.height = 1; } +#if CV_SSE2 + bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2); +#endif +#if CV_SSE4_1 + bool useSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1); +#endif + const float scale = 1.f/INTER_TAB_SIZE; int x, y; for( y = 0; y < size.height; y++ ) @@ -4848,24 +4854,27 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, vst2q_s16(dst1 + (x << 1), v_dst); } #elif CV_SSE4_1 - for( ; x <= size.width - 16; x += 16 ) + if (useSSE4_1) { - __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)), - _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 4))); - __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 8)), - _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 12))); - - __m128i v_dst2 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x)), - _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 4))); - __m128i v_dst3 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 8)), - _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 12))); - - 
_mm_interleave_epi16(v_dst0, v_dst1, v_dst2, v_dst3); - - _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst0); - _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst1); - _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst2); - _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst3); + for( ; x <= size.width - 16; x += 16 ) + { + __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)), + _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 4))); + __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 8)), + _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 12))); + + __m128i v_dst2 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x)), + _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 4))); + __m128i v_dst3 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 8)), + _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 12))); + + _mm_interleave_epi16(v_dst0, v_dst1, v_dst2, v_dst3); + + _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst0); + _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst1); + _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst2); + _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst3); + } } #endif for( ; x < size.width; x++ ) @@ -4902,47 +4911,50 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1)); } #elif CV_SSE4_1 - __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE); - __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1); - - for( ; x <= size.width - 16; x += 16 ) + if (useSSE4_1) { - __m128i v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x), v_its)); - __m128i v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 4), v_its)); - __m128i v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x), v_its)); - __m128i v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 4), v_its)); - - __m128i v_dst10 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS), - _mm_srai_epi32(v_ix1, INTER_BITS)); - __m128i v_dst12 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS), - _mm_srai_epi32(v_iy1, INTER_BITS)); - __m128i v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS), - _mm_and_si128(v_ix0, v_its1)); - __m128i v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS), - _mm_and_si128(v_ix1, v_its1)); - _mm_storeu_si128((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst20, v_dst21)); - - v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 8), v_its)); - v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 12), v_its)); - v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 8), v_its)); - v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 12), v_its)); - - __m128i v_dst11 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS), - _mm_srai_epi32(v_ix1, INTER_BITS)); - __m128i v_dst13 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS), - _mm_srai_epi32(v_iy1, INTER_BITS)); - v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS), - _mm_and_si128(v_ix0, v_its1)); - v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS), - _mm_and_si128(v_ix1, v_its1)); - _mm_storeu_si128((__m128i *)(dst2 + x + 8), _mm_packus_epi32(v_dst20, v_dst21)); - - _mm_interleave_epi16(v_dst10, v_dst11, v_dst12, v_dst13); - - _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst10); - _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst11); - _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst12); - _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst13); + __m128 v_its = 
_mm_set1_ps(INTER_TAB_SIZE); + __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1); + + for( ; x <= size.width - 16; x += 16 ) + { + __m128i v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x), v_its)); + __m128i v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 4), v_its)); + __m128i v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x), v_its)); + __m128i v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 4), v_its)); + + __m128i v_dst10 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS), + _mm_srai_epi32(v_ix1, INTER_BITS)); + __m128i v_dst12 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS), + _mm_srai_epi32(v_iy1, INTER_BITS)); + __m128i v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS), + _mm_and_si128(v_ix0, v_its1)); + __m128i v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS), + _mm_and_si128(v_ix1, v_its1)); + _mm_storeu_si128((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst20, v_dst21)); + + v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 8), v_its)); + v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 12), v_its)); + v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 8), v_its)); + v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 12), v_its)); + + __m128i v_dst11 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS), + _mm_srai_epi32(v_ix1, INTER_BITS)); + __m128i v_dst13 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS), + _mm_srai_epi32(v_iy1, INTER_BITS)); + v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS), + _mm_and_si128(v_ix0, v_its1)); + v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS), + _mm_and_si128(v_ix1, v_its1)); + _mm_storeu_si128((__m128i *)(dst2 + x + 8), _mm_packus_epi32(v_dst20, v_dst21)); + + _mm_interleave_epi16(v_dst10, v_dst11, v_dst12, v_dst13); + + _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst10); + _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst11); + _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst12); + _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst13); + } } #endif for( ; x < size.width; x++ ) @@ -5005,25 +5017,28 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1)); } #elif CV_SSE2 - __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE); - __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1); - __m128i v_y_mask = _mm_set1_epi32((INTER_TAB_SIZE-1) << 16); - - for( ; x <= size.width - 4; x += 4 ) + if (useSSE2) { - __m128i v_src0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2), v_its)); - __m128i v_src1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2 + 4), v_its)); - - __m128i v_dst1 = _mm_packs_epi32(_mm_srai_epi32(v_src0, INTER_BITS), - _mm_srai_epi32(v_src1, INTER_BITS)); - _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst1); - - // x0 y0 x1 y1 . . . - v_src0 = _mm_packs_epi32(_mm_and_si128(v_src0, v_its1), - _mm_and_si128(v_src1, v_its1)); - __m128i v_dst2 = _mm_or_si128(_mm_srli_epi32(_mm_and_si128(v_src0, v_y_mask), 16 - INTER_BITS), // y0 0 y1 0 . . . - _mm_and_si128(v_src0, v_its1)); // 0 x0 0 x1 . . . 
- _mm_storel_epi64((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst2, v_dst2)); + __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE); + __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1); + __m128i v_y_mask = _mm_set1_epi32((INTER_TAB_SIZE-1) << 16); + + for( ; x <= size.width - 4; x += 4 ) + { + __m128i v_src0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2), v_its)); + __m128i v_src1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2 + 4), v_its)); + + __m128i v_dst1 = _mm_packs_epi32(_mm_srai_epi32(v_src0, INTER_BITS), + _mm_srai_epi32(v_src1, INTER_BITS)); + _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst1); + + // x0 y0 x1 y1 . . . + v_src0 = _mm_packs_epi32(_mm_and_si128(v_src0, v_its1), + _mm_and_si128(v_src1, v_its1)); + __m128i v_dst2 = _mm_or_si128(_mm_srli_epi32(_mm_and_si128(v_src0, v_y_mask), 16 - INTER_BITS), // y0 0 y1 0 . . . + _mm_and_si128(v_src0, v_its1)); // 0 x0 0 x1 . . . + _mm_storel_epi64((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst2, v_dst2)); + } } #endif for( ; x < size.width; x++ ) @@ -5150,22 +5165,25 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, vst2q_f32(dst1f + (x << 1) + 8, v_dst); } #elif CV_SSE2 - __m128i v_mask2 = _mm_set1_epi16(INTER_TAB_SIZE2-1); - __m128i v_zero = _mm_set1_epi32(0), v_mask = _mm_set1_epi32(INTER_TAB_SIZE-1); - __m128 v_scale = _mm_set1_ps(scale); - - for ( ; x <= size.width - 8; x += 8) + if (useSSE2) { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src1 + x * 2)); - __m128i v_fxy = src2 ? _mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x)), v_mask2) : v_zero; - __m128i v_fxy1 = _mm_and_si128(v_fxy, v_mask); - __m128i v_fxy2 = _mm_srli_epi16(v_fxy, INTER_BITS); + __m128i v_mask2 = _mm_set1_epi16(INTER_TAB_SIZE2-1); + __m128i v_zero = _mm_set1_epi32(0), v_mask = _mm_set1_epi32(INTER_TAB_SIZE-1); + __m128 v_scale = _mm_set1_ps(scale); + + for ( ; x <= size.width - 8; x += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src1 + x * 2)); + __m128i v_fxy = src2 ? _mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x)), v_mask2) : v_zero; + __m128i v_fxy1 = _mm_and_si128(v_fxy, v_mask); + __m128i v_fxy2 = _mm_srli_epi16(v_fxy, INTER_BITS); - __m128 v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_fxy1, v_fxy2)), v_scale); - _mm_storeu_ps(dst1f + x * 2, _mm_add_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)), v_add)); + __m128 v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_fxy1, v_fxy2)), v_scale); + _mm_storeu_ps(dst1f + x * 2, _mm_add_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)), v_add)); - v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_fxy1, v_fxy2)), v_scale); - _mm_storeu_ps(dst1f + x * 2, _mm_add_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)), v_add)); + v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_fxy1, v_fxy2)), v_scale); + _mm_storeu_ps(dst1f + x * 2, _mm_add_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)), v_add)); + } } #endif for( ; x < size.width; x++ ) @@ -5204,7 +5222,10 @@ public: const int AB_SCALE = 1 << AB_BITS; int round_delta = interpolation == INTER_NEAREST ? 
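The CV_SSE2 loop guarded just above accelerates the reverse transform, CV_16SC2 + CV_16UC1 back to CV_32FC2. Its scalar tail is equivalent to the following sketch (scale = 1/INTER_TAB_SIZE; names are illustrative, and src2 may be NULL when no table indices were stored):

// src1: interleaved integer x,y (CV_16SC2); src2: optional table index (CV_16UC1);
// dst1f: reconstructed float x,y pairs (CV_32FC2).
static void unpackMapRow(const short* src1, const unsigned short* src2,
                         float* dst1f, int width)
{
    const int INTER_BITS_ = 5, INTER_TAB_SIZE_ = 1 << INTER_BITS_;
    const float scale = 1.f / INTER_TAB_SIZE_;
    for (int x = 0; x < width; x++)
    {
        int fxy = src2 ? src2[x] & (INTER_TAB_SIZE_ * INTER_TAB_SIZE_ - 1) : 0;
        dst1f[x*2]   = src1[x*2]   + (fxy & (INTER_TAB_SIZE_ - 1)) * scale; // x + fractional x
        dst1f[x*2+1] = src1[x*2+1] + (fxy >> INTER_BITS_) * scale;          // y + fractional y
    }
}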
AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2, x, y, x1, y1; #if CV_SSE2 - bool useSIMD = checkHardwareSupport(CV_CPU_SSE2); + bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2); + #endif + #if CV_SSE4_1 + bool useSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1); #endif int bh0 = std::min(BLOCK_SZ/2, dst.rows); @@ -5243,26 +5264,29 @@ public: vst2q_s16(xy + (x1 << 1), v_dst); } #elif CV_SSE4_1 - __m128i v_X0 = _mm_set1_epi32(X0); - __m128i v_Y0 = _mm_set1_epi32(Y0); - for ( ; x1 <= bw - 16; x1 += 16) + if (useSSE4_1) { - __m128i v_x0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1))), AB_BITS), - _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 4))), AB_BITS)); - __m128i v_x1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 8))), AB_BITS), - _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 12))), AB_BITS)); - - __m128i v_y0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1))), AB_BITS), - _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 4))), AB_BITS)); - __m128i v_y1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 8))), AB_BITS), - _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 12))), AB_BITS)); - - _mm_interleave_epi16(v_x0, v_x1, v_y0, v_y1); - - _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_x0); - _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_x1); - _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_y0); - _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_y1); + __m128i v_X0 = _mm_set1_epi32(X0); + __m128i v_Y0 = _mm_set1_epi32(Y0); + for ( ; x1 <= bw - 16; x1 += 16) + { + __m128i v_x0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1))), AB_BITS), + _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 4))), AB_BITS)); + __m128i v_x1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 8))), AB_BITS), + _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 12))), AB_BITS)); + + __m128i v_y0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1))), AB_BITS), + _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 4))), AB_BITS)); + __m128i v_y1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 8))), AB_BITS), + _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 12))), AB_BITS)); + + _mm_interleave_epi16(v_x0, v_x1, v_y0, v_y1); + + _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_x0); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_x1); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_y0); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_y1); + } } #endif for( ; x1 < bw; x1++ ) @@ -5278,7 +5302,7 @@ public: short* alpha = A + y1*bw; x1 = 0; #if CV_SSE2 - if( useSIMD ) + if( useSSE2 ) { __m128i fxy_mask = _mm_set1_epi32(INTER_TAB_SIZE - 1); __m128i XX = _mm_set1_epi32(X0), YY = _mm_set1_epi32(Y0); @@ -5672,6 +5696,7 @@ public: bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height); #if CV_SSE4_1 + bool haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1); __m128d v_M0 = _mm_set1_pd(M[0]); __m128d v_M3 
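The warpAffine invoker now carries two flags because it mixes an SSE2 block (the INTER_LINEAR path further down) and the SSE4.1 block above; each must be tested independently at run time. The SSE4.1 loop is a vectorisation of this scalar per-pixel step over the precomputed adelta/bdelta tables (AB_BITS = 10 is assumed, matching MAX(10, INTER_BITS) in imgwarp.cpp; the helper itself is illustrative and expects the tables already offset to the current block):

#include <opencv2/core.hpp>   // cv::saturate_cast

// INTER_NEAREST case: X0/Y0 hold the per-row affine terms in AB_BITS fixed point,
// adelta/bdelta the per-column increments; the map entry is just the shifted sum.
static void affineRowNearest(int X0, int Y0, const int* adelta, const int* bdelta,
                             short* xy, int bw)
{
    const int AB_BITS = 10;
    for (int x1 = 0; x1 < bw; x1++)
    {
        int X = (X0 + adelta[x1]) >> AB_BITS;
        int Y = (Y0 + bdelta[x1]) >> AB_BITS;
        xy[x1*2]     = cv::saturate_cast<short>(X);
        xy[x1*2 + 1] = cv::saturate_cast<short>(Y);
    }
}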
= _mm_set1_pd(M[3]); __m128d v_M6 = _mm_set1_pd(M[6]); @@ -5706,109 +5731,112 @@ public: x1 = 0; #if CV_SSE4_1 - __m128d v_X0d = _mm_set1_pd(X0); - __m128d v_Y0d = _mm_set1_pd(Y0); - __m128d v_W0 = _mm_set1_pd(W0); - __m128d v_x1 = _mm_set_pd(1, 0); - - for( ; x1 <= bw - 16; x1 += 16 ) + if (haveSSE4_1) { - // 0-3 - __m128i v_X0, v_Y0; - { - __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); - __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); - __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); - v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); - } + __m128d v_X0d = _mm_set1_pd(X0); + __m128d v_Y0d = _mm_set1_pd(Y0); + __m128d v_W0 = _mm_set1_pd(W0); + __m128d v_x1 = _mm_set_pd(1, 0); - // 4-8 - __m128i v_X1, v_Y1; + for( ; x1 <= bw - 16; x1 += 16 ) { - __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); - __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); - __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); - v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); - } + // 0-3 + __m128i v_X0, v_Y0; + { + __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); + __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); + __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_X0 = 
_mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); + v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + } - // 8-11 - __m128i v_X2, v_Y2; - { - __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); - __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); - __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); - v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); - } + // 4-8 + __m128i v_X1, v_Y1; + { + __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); + __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); + __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); + v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + } - // 12-15 - __m128i v_X3, v_Y3; - { - __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); - __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); - __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); - v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), - 
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); - } + // 8-11 + __m128i v_X2, v_Y2; + { + __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); + __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); + __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); + v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + } + + // 12-15 + __m128i v_X3, v_Y3; + { + __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); + __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); + __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); + v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + } - // convert to 16s - v_X0 = _mm_packs_epi32(v_X0, v_X1); - v_X1 = _mm_packs_epi32(v_X2, v_X3); - v_Y0 = _mm_packs_epi32(v_Y0, v_Y1); - v_Y1 = _mm_packs_epi32(v_Y2, v_Y3); + // convert to 16s + v_X0 = _mm_packs_epi32(v_X0, v_X1); + v_X1 = _mm_packs_epi32(v_X2, v_X3); + v_Y0 = _mm_packs_epi32(v_Y0, v_Y1); + v_Y1 = _mm_packs_epi32(v_Y2, v_Y3); - _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1); + _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1); - _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0); - _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1); - _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0); - _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1); + _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1); + } } #endif @@ -5831,122 +5859,125 @@ public: x1 = 0; #if CV_SSE4_1 - __m128d v_X0d = _mm_set1_pd(X0); - __m128d v_Y0d = _mm_set1_pd(Y0); - __m128d v_W0 = _mm_set1_pd(W0); - __m128d v_x1 = _mm_set_pd(1, 0); - - for( ; x1 <= bw - 16; x1 += 16 ) + if (haveSSE4_1) { - // 0-3 - __m128i v_X0, v_Y0; - { - __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), 
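Both warpPerspective blocks now wrapped in if (haveSSE4_1) vectorise the same scalar step: evaluate the projective denominator W, turn a division by zero into a zero result (that is what the _mm_cmpeq_pd/_mm_andnot_pd pair does), clamp to the int range, and round. A scalar sketch for one INTER_NEAREST pixel (M is the 3x3 inverse transform; the helper name is illustrative):

#include <algorithm>
#include <climits>
#include <opencv2/core.hpp>   // cv::saturate_cast

static inline void perspectivePointNearest(const double M[9], int x1, int y1,
                                           short* xy, int i)
{
    double W = M[6]*x1 + M[7]*y1 + M[8];
    W = W ? 1.0 / W : 0.0;   // the SIMD code builds this mask-and-divide with cmpeq + andnot
    double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (M[0]*x1 + M[1]*y1 + M[2]) * W));
    double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (M[3]*x1 + M[4]*y1 + M[5]) * W));
    xy[i*2]     = cv::saturate_cast<short>(cv::saturate_cast<int>(fX));
    xy[i*2 + 1] = cv::saturate_cast<short>(cv::saturate_cast<int>(fY));
}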
_mm_div_pd(v_its, v_W)); - __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); - __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); - v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); - } + __m128d v_X0d = _mm_set1_pd(X0); + __m128d v_Y0d = _mm_set1_pd(Y0); + __m128d v_W0 = _mm_set1_pd(W0); + __m128d v_x1 = _mm_set_pd(1, 0); - // 4-8 - __m128i v_X1, v_Y1; + for( ; x1 <= bw - 16; x1 += 16 ) { - __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); - __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); - __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); - v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); - } + // 0-3 + __m128i v_X0, v_Y0; + { + __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); + __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); + __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); + v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + } - // 8-11 - __m128i v_X2, v_Y2; - { - __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); - __m128d v_fX0 = 
_mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); - __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); - v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); - } + // 4-8 + __m128i v_X1, v_Y1; + { + __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); + __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); + __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); + v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + } - // 12-15 - __m128i v_X3, v_Y3; - { - __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); - __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); - __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); - v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); - } + // 8-11 + __m128i v_X2, v_Y2; + { + __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); + __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + 
v_x1 = _mm_add_pd(v_x1, v_2); + + v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); + __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); + v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + } - // store alpha - __m128i v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y0, v_itsi1), INTER_BITS), - _mm_and_si128(v_X0, v_itsi1)); - __m128i v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y1, v_itsi1), INTER_BITS), - _mm_and_si128(v_X1, v_itsi1)); - _mm_storeu_si128((__m128i *)(alpha + x1), _mm_packs_epi32(v_alpha0, v_alpha1)); - - v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y2, v_itsi1), INTER_BITS), - _mm_and_si128(v_X2, v_itsi1)); - v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y3, v_itsi1), INTER_BITS), - _mm_and_si128(v_X3, v_itsi1)); - _mm_storeu_si128((__m128i *)(alpha + x1 + 8), _mm_packs_epi32(v_alpha0, v_alpha1)); - - // convert to 16s - v_X0 = _mm_packs_epi32(_mm_srai_epi32(v_X0, INTER_BITS), _mm_srai_epi32(v_X1, INTER_BITS)); - v_X1 = _mm_packs_epi32(_mm_srai_epi32(v_X2, INTER_BITS), _mm_srai_epi32(v_X3, INTER_BITS)); - v_Y0 = _mm_packs_epi32(_mm_srai_epi32(v_Y0, INTER_BITS), _mm_srai_epi32(v_Y1, INTER_BITS)); - v_Y1 = _mm_packs_epi32(_mm_srai_epi32(v_Y2, INTER_BITS), _mm_srai_epi32(v_Y3, INTER_BITS)); - - _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1); - - _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0); - _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1); - _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0); - _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1); + // 12-15 + __m128i v_X3, v_Y3; + { + __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); + __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); + __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); + v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + } + + // store alpha + __m128i v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y0, v_itsi1), INTER_BITS), + _mm_and_si128(v_X0, v_itsi1)); + __m128i v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y1, v_itsi1), INTER_BITS), + _mm_and_si128(v_X1, v_itsi1)); + _mm_storeu_si128((__m128i *)(alpha + x1), _mm_packs_epi32(v_alpha0, v_alpha1)); + + v_alpha0 = 
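In the INTER_LINEAR branch the same block also derives the bilinear-table index ("alpha") from the low INTER_BITS of the scaled coordinates before shifting them down; that is what the store-alpha section above packs with _mm_packs_epi32. Scalar equivalent for a single pixel, assuming X and Y are already multiplied by INTER_TAB_SIZE and rounded (helper name illustrative):

#include <opencv2/core.hpp>   // cv::saturate_cast

static inline void storeLinearPixel(int X, int Y, short* xy, short* alpha, int x1)
{
    const int INTER_BITS_ = 5, INTER_TAB_SIZE_ = 1 << INTER_BITS_;
    xy[x1*2]     = cv::saturate_cast<short>(X >> INTER_BITS_);
    xy[x1*2 + 1] = cv::saturate_cast<short>(Y >> INTER_BITS_);
    alpha[x1] = (short)((Y & (INTER_TAB_SIZE_ - 1)) * INTER_TAB_SIZE_ +
                        (X & (INTER_TAB_SIZE_ - 1)));
}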
_mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y2, v_itsi1), INTER_BITS), + _mm_and_si128(v_X2, v_itsi1)); + v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y3, v_itsi1), INTER_BITS), + _mm_and_si128(v_X3, v_itsi1)); + _mm_storeu_si128((__m128i *)(alpha + x1 + 8), _mm_packs_epi32(v_alpha0, v_alpha1)); + + // convert to 16s + v_X0 = _mm_packs_epi32(_mm_srai_epi32(v_X0, INTER_BITS), _mm_srai_epi32(v_X1, INTER_BITS)); + v_X1 = _mm_packs_epi32(_mm_srai_epi32(v_X2, INTER_BITS), _mm_srai_epi32(v_X3, INTER_BITS)); + v_Y0 = _mm_packs_epi32(_mm_srai_epi32(v_Y0, INTER_BITS), _mm_srai_epi32(v_Y1, INTER_BITS)); + v_Y1 = _mm_packs_epi32(_mm_srai_epi32(v_Y2, INTER_BITS), _mm_srai_epi32(v_Y3, INTER_BITS)); + + _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1); + + _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1); + } } #endif diff --git a/modules/imgproc/src/pyramids.cpp b/modules/imgproc/src/pyramids.cpp index 93b9bfa166..4271b942ae 100644 --- a/modules/imgproc/src/pyramids.cpp +++ b/modules/imgproc/src/pyramids.cpp @@ -386,10 +386,10 @@ struct PyrUpVec_32s16s __m128i v_dst01 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1)); __m128i v_dst11 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2); - _mm_storeu_si128((__m128i *)(dst0 + x), + _mm_storeu_si128((__m128i *)(dst0 + x), _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst00, v_delta), 6), _mm_srai_epi32(_mm_add_epi32(v_dst01, v_delta), 6))); - _mm_storeu_si128((__m128i *)(dst1 + x), + _mm_storeu_si128((__m128i *)(dst1 + x), _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst10, v_delta), 6), _mm_srai_epi32(_mm_add_epi32(v_dst11, v_delta), 6))); } @@ -446,10 +446,10 @@ struct PyrUpVec_32s16u __m128i v_dst01 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1)); __m128i v_dst11 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2); - _mm_storeu_si128((__m128i *)(dst0 + x), + _mm_storeu_si128((__m128i *)(dst0 + x), _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst00, v_delta), 6), _mm_srli_epi32(_mm_add_epi32(v_dst01, v_delta), 6))); - _mm_storeu_si128((__m128i *)(dst1 + x), + _mm_storeu_si128((__m128i *)(dst1 + x), _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst10, v_delta), 6), _mm_srli_epi32(_mm_add_epi32(v_dst11, v_delta), 6))); } @@ -491,7 +491,7 @@ struct PyrUpVec_32f const float *row0 = src[0], *row1 = src[1], *row2 = src[2]; float *dst0 = dst[0], *dst1 = dst[1]; - __m128 v_6 = _mm_set1_ps(6.0f), v_scale = _mm_set1_ps(1.f/64.0f), + __m128 v_6 = _mm_set1_ps(6.0f), v_scale = _mm_set1_ps(1.f/64.0f), v_scale4 = _mm_mul_ps(v_scale, _mm_set1_ps(4.0f)); for( ; x <= width - 8; x += 8 )
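The pyramids.cpp hunks only strip trailing whitespace, but for context the constants in PyrUpVec_32f (6, 1/64 and 4/64) come from the separable kernel used by pyrUp. The vertical step that these vectorised loops implement reduces to the following scalar sketch (row/dst names mirror the code above; the horizontal pass is assumed to have produced row0..row2 already):

// Vertical step of pyrUp for float data: each triple of intermediate rows
// yields two destination rows.
static void pyrUpVerticalRow32f(const float* row0, const float* row1, const float* row2,
                                float* dst0, float* dst1, int width)
{
    const float scale = 1.f / 64.f;
    for (int x = 0; x < width; x++)
    {
        dst0[x] = (row0[x] + 6.f * row1[x] + row2[x]) * scale;  // even output row: 1-6-1 taps
        dst1[x] = (row1[x] + row2[x]) * (4.f * scale);          // odd output row: 4-4 taps
    }
}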