pull/3591/head
Ilya Lavrenov 10 years ago
parent 1d3c860411
commit 6bce6ee34a
  1. 89
      modules/core/src/arithm.cpp
  2. 62
      modules/core/src/convert.cpp
  3. 102
      modules/core/src/mathfuncs.cpp
  4. 6
      modules/core/src/stat.cpp
  5. 595
      modules/imgproc/src/color.cpp
  6. 671
      modules/imgproc/src/imgwarp.cpp
  7. 10
      modules/imgproc/src/pyramids.cpp

@ -64,7 +64,7 @@ FUNCTOR_TEMPLATE(VLoadStore128);
#if CV_SSE2
FUNCTOR_TEMPLATE(VLoadStore64);
FUNCTOR_TEMPLATE(VLoadStore128Aligned);
#if CV_AVX
#if CV_AVX2
FUNCTOR_TEMPLATE(VLoadStore256);
FUNCTOR_TEMPLATE(VLoadStore256Aligned);
#endif
@ -2626,10 +2626,16 @@ struct Div_SIMD
template <>
struct Div_SIMD<uchar>
{
bool haveSIMD;
Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); }
int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, double scale) const
{
int x = 0;
if (!haveSIMD)
return x;
__m128d v_scale = _mm_set1_pd(scale);
__m128i v_zero = _mm_setzero_si128();
@ -2672,10 +2678,16 @@ struct Div_SIMD<uchar>
template <>
struct Div_SIMD<schar>
{
bool haveSIMD;
Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); }
int operator() (const schar * src1, const schar * src2, schar * dst, int width, double scale) const
{
int x = 0;
if (!haveSIMD)
return x;
__m128d v_scale = _mm_set1_pd(scale);
__m128i v_zero = _mm_setzero_si128();
@ -2718,10 +2730,16 @@ struct Div_SIMD<schar>
template <>
struct Div_SIMD<ushort>
{
bool haveSIMD;
Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); }
int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, double scale) const
{
int x = 0;
if (!haveSIMD)
return x;
__m128d v_scale = _mm_set1_pd(scale);
__m128i v_zero = _mm_setzero_si128();
@ -2763,10 +2781,16 @@ struct Div_SIMD<ushort>
template <>
struct Div_SIMD<short>
{
bool haveSIMD;
Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); }
int operator() (const short * src1, const short * src2, short * dst, int width, double scale) const
{
int x = 0;
if (!haveSIMD)
return x;
__m128d v_scale = _mm_set1_pd(scale);
__m128i v_zero = _mm_setzero_si128();
@ -2806,10 +2830,16 @@ struct Div_SIMD<short>
template <>
struct Div_SIMD<int>
{
bool haveSIMD;
Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); }
int operator() (const int * src1, const int * src2, int * dst, int width, double scale) const
{
int x = 0;
if (!haveSIMD)
return x;
__m128d v_scale = _mm_set1_pd(scale);
__m128i v_zero = _mm_setzero_si128();
@ -2902,10 +2932,16 @@ struct Recip_SIMD
template <>
struct Recip_SIMD<uchar>
{
bool haveSIMD;
Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); }
int operator() (const uchar * src2, uchar * dst, int width, double scale) const
{
int x = 0;
if (!haveSIMD)
return x;
__m128d v_scale = _mm_set1_pd(scale);
__m128i v_zero = _mm_setzero_si128();
@ -2941,10 +2977,16 @@ struct Recip_SIMD<uchar>
template <>
struct Recip_SIMD<schar>
{
bool haveSIMD;
Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); }
int operator() (const schar * src2, schar * dst, int width, double scale) const
{
int x = 0;
if (!haveSIMD)
return x;
__m128d v_scale = _mm_set1_pd(scale);
__m128i v_zero = _mm_setzero_si128();
@ -2980,10 +3022,16 @@ struct Recip_SIMD<schar>
template <>
struct Recip_SIMD<ushort>
{
bool haveSIMD;
Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); }
int operator() (const ushort * src2, ushort * dst, int width, double scale) const
{
int x = 0;
if (!haveSIMD)
return x;
__m128d v_scale = _mm_set1_pd(scale);
__m128i v_zero = _mm_setzero_si128();
@ -3018,10 +3066,16 @@ struct Recip_SIMD<ushort>
template <>
struct Recip_SIMD<short>
{
bool haveSIMD;
Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); }
int operator() (const short * src2, short * dst, int width, double scale) const
{
int x = 0;
if (!haveSIMD)
return x;
__m128d v_scale = _mm_set1_pd(scale);
__m128i v_zero = _mm_setzero_si128();
@ -3054,10 +3108,16 @@ struct Recip_SIMD<short>
template <>
struct Recip_SIMD<int>
{
bool haveSIMD;
Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); }
int operator() (const int * src2, int * dst, int width, double scale) const
{
int x = 0;
if (!haveSIMD)
return x;
__m128d v_scale = _mm_set1_pd(scale);
__m128i v_zero = _mm_setzero_si128();
@ -4126,7 +4186,8 @@ static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t ste
{
int x =0;
#if CV_SSE2
if( USE_SSE2 ){
if( USE_SSE2 )
{
__m128i m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi8 (-1);
__m128i c128 = _mm_set1_epi8 (-128);
for( ; x <= size.width - 16; x += 16 )
@ -4142,7 +4203,7 @@ static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t ste
}
}
#elif CV_NEON
#elif CV_NEON
uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255);
for( ; x <= size.width - 16; x += 16 )
@ -4164,7 +4225,8 @@ static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t ste
{
int x = 0;
#if CV_SSE2
if( USE_SSE2 ){
if( USE_SSE2 )
{
__m128i m128 = code == CMP_EQ ? _mm_setzero_si128() : _mm_set1_epi8 (-1);
for( ; x <= size.width - 16; x += 16 )
{
@ -4174,7 +4236,7 @@ static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t ste
_mm_storeu_si128((__m128i*)(dst + x), r00);
}
}
#elif CV_NEON
#elif CV_NEON
uint8x16_t mask = code == CMP_EQ ? vdupq_n_u8(0) : vdupq_n_u8(255);
for( ; x <= size.width - 16; x += 16 )
@ -4254,7 +4316,8 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st
{
int x =0;
#if CV_SSE2
if( USE_SSE2){//
if( USE_SSE2)
{
__m128i m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi16 (-1);
for( ; x <= size.width - 16; x += 16 )
{
@ -4278,7 +4341,7 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st
x += 8;
}
}
#elif CV_NEON
#elif CV_NEON
uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255);
for( ; x <= size.width - 16; x += 16 )
@ -4293,8 +4356,7 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st
vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask));
}
#endif
#endif
for( ; x < size.width; x++ ){
dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
@ -4308,7 +4370,8 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st
{
int x = 0;
#if CV_SSE2
if( USE_SSE2 ){
if( USE_SSE2 )
{
__m128i m128 = code == CMP_EQ ? _mm_setzero_si128() : _mm_set1_epi16 (-1);
for( ; x <= size.width - 16; x += 16 )
{
@ -4332,7 +4395,7 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st
x += 8;
}
}
#elif CV_NEON
#elif CV_NEON
uint8x16_t mask = code == CMP_EQ ? vdupq_n_u8(0) : vdupq_n_u8(255);
for( ; x <= size.width - 16; x += 16 )
@ -4347,8 +4410,8 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st
vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask));
}
#endif
for( ; x < size.width; x++ )
#endif
for( ; x < size.width; x++ )
dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
}
}

@ -158,7 +158,7 @@ struct VSplit2<data_type>
\
VSplit2() \
{ \
support = true; \
support = checkHardwareSupport(CV_CPU_SSE2); \
} \
\
void operator()(const data_type * src, \
@ -191,7 +191,7 @@ struct VSplit3<data_type>
\
VSplit3() \
{ \
support = true; \
support = checkHardwareSupport(CV_CPU_SSE2); \
} \
\
void operator()(const data_type * src, \
@ -229,7 +229,7 @@ struct VSplit4<data_type>
\
VSplit4() \
{ \
support = true; \
support = checkHardwareSupport(CV_CPU_SSE2); \
} \
\
void operator()(const data_type * src, data_type * dst0, data_type * dst1, \
@ -502,7 +502,7 @@ struct VMerge4
bool support;
};
#define MERGE2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor) \
#define MERGE2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
template <> \
struct VMerge2<data_type> \
{ \
@ -513,7 +513,7 @@ struct VMerge2<data_type>
\
VMerge2() \
{ \
support = true; \
support = checkHardwareSupport(se); \
} \
\
void operator()(const data_type * src0, const data_type * src1, \
@ -535,7 +535,7 @@ struct VMerge2<data_type>
bool support; \
}
#define MERGE3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor) \
#define MERGE3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
template <> \
struct VMerge3<data_type> \
{ \
@ -546,7 +546,7 @@ struct VMerge3<data_type>
\
VMerge3() \
{ \
support = true; \
support = checkHardwareSupport(se); \
} \
\
void operator()(const data_type * src0, const data_type * src1, const data_type * src2,\
@ -573,7 +573,7 @@ struct VMerge3<data_type>
bool support; \
}
#define MERGE4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor) \
#define MERGE4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
template <> \
struct VMerge4<data_type> \
{ \
@ -584,7 +584,7 @@ struct VMerge4<data_type>
\
VMerge4() \
{ \
support = true; \
support = checkHardwareSupport(se); \
} \
\
void operator()(const data_type * src0, const data_type * src1, \
@ -616,19 +616,19 @@ struct VMerge4<data_type>
bool support; \
}
MERGE2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128);
MERGE3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128);
MERGE4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128);
MERGE2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
MERGE3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
MERGE4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
#if CV_SSE4_1
MERGE2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128);
MERGE3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128);
MERGE4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128);
MERGE2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
MERGE3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
MERGE4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
#endif
MERGE2_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps);
MERGE3_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps);
MERGE4_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps);
MERGE2_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2);
MERGE3_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2);
MERGE4_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2);
#endif
@ -4404,6 +4404,9 @@ struct Cvt_SIMD<double, uchar>
{
int x = 0;
if (!USE_SSE2)
return x;
for ( ; x <= width - 8; x += 8)
{
__m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
@ -4430,6 +4433,9 @@ struct Cvt_SIMD<double, schar>
{
int x = 0;
if (!USE_SSE2)
return x;
for ( ; x <= width - 8; x += 8)
{
__m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
@ -4454,10 +4460,16 @@ struct Cvt_SIMD<double, schar>
template <>
struct Cvt_SIMD<double, ushort>
{
bool haveSIMD;
Cvt_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); }
int operator() (const double * src, ushort * dst, int width) const
{
int x = 0;
if (!haveSIMD)
return x;
for ( ; x <= width - 8; x += 8)
{
__m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
@ -4486,6 +4498,9 @@ struct Cvt_SIMD<double, short>
{
int x = 0;
if (!USE_SSE2)
return x;
for ( ; x <= width - 8; x += 8)
{
__m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
@ -4512,6 +4527,9 @@ struct Cvt_SIMD<double, int>
{
int x = 0;
if (!USE_SSE2)
return x;
for ( ; x <= width - 4; x += 4)
{
__m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
@ -4532,6 +4550,9 @@ struct Cvt_SIMD<double, float>
{
int x = 0;
if (!USE_SSE2)
return x;
for ( ; x <= width - 4; x += 4)
{
__m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
@ -5114,8 +5135,9 @@ cvt_<float, short>( const float* src, size_t sstep,
{
int x = 0;
#if CV_SSE2
if(USE_SSE2){
for( ; x <= size.width - 8; x += 8 )
if(USE_SSE2)
{
for( ; x <= size.width - 8; x += 8 )
{
__m128 src128 = _mm_loadu_ps (src + x);
__m128i src_int128 = _mm_cvtps_epi32 (src128);

@ -597,15 +597,18 @@ void phase( InputArray src1, InputArray src2, OutputArray dst, bool angleInDegre
k = 0;
#if CV_SSE2
for ( ; k <= len - 4; k += 4)
if (USE_SSE2)
{
__m128 v_dst0 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(x + k)),
_mm_cvtpd_ps(_mm_loadu_pd(x + k + 2)));
__m128 v_dst1 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(y + k)),
_mm_cvtpd_ps(_mm_loadu_pd(y + k + 2)));
for ( ; k <= len - 4; k += 4)
{
__m128 v_dst0 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(x + k)),
_mm_cvtpd_ps(_mm_loadu_pd(x + k + 2)));
__m128 v_dst1 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(y + k)),
_mm_cvtpd_ps(_mm_loadu_pd(y + k + 2)));
_mm_storeu_ps(buf[0] + k, v_dst0);
_mm_storeu_ps(buf[1] + k, v_dst1);
_mm_storeu_ps(buf[0] + k, v_dst0);
_mm_storeu_ps(buf[1] + k, v_dst1);
}
}
#endif
@ -619,11 +622,14 @@ void phase( InputArray src1, InputArray src2, OutputArray dst, bool angleInDegre
k = 0;
#if CV_SSE2
for ( ; k <= len - 4; k += 4)
if (USE_SSE2)
{
__m128 v_src = _mm_loadu_ps(buf[0] + k);
_mm_storeu_pd(angle + k, _mm_cvtps_pd(v_src));
_mm_storeu_pd(angle + k + 2, _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8))));
for ( ; k <= len - 4; k += 4)
{
__m128 v_src = _mm_loadu_ps(buf[0] + k);
_mm_storeu_pd(angle + k, _mm_cvtps_pd(v_src));
_mm_storeu_pd(angle + k + 2, _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8))));
}
}
#endif
@ -728,15 +734,18 @@ void cartToPolar( InputArray src1, InputArray src2,
k = 0;
#if CV_SSE2
for ( ; k <= len - 4; k += 4)
if (USE_SSE2)
{
__m128 v_dst0 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(x + k)),
_mm_cvtpd_ps(_mm_loadu_pd(x + k + 2)));
__m128 v_dst1 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(y + k)),
_mm_cvtpd_ps(_mm_loadu_pd(y + k + 2)));
for ( ; k <= len - 4; k += 4)
{
__m128 v_dst0 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(x + k)),
_mm_cvtpd_ps(_mm_loadu_pd(x + k + 2)));
__m128 v_dst1 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(y + k)),
_mm_cvtpd_ps(_mm_loadu_pd(y + k + 2)));
_mm_storeu_ps(buf[0] + k, v_dst0);
_mm_storeu_ps(buf[1] + k, v_dst1);
_mm_storeu_ps(buf[0] + k, v_dst0);
_mm_storeu_ps(buf[1] + k, v_dst1);
}
}
#endif
@ -750,11 +759,14 @@ void cartToPolar( InputArray src1, InputArray src2,
k = 0;
#if CV_SSE2
for ( ; k <= len - 4; k += 4)
if (USE_SSE2)
{
__m128 v_src = _mm_loadu_ps(buf[0] + k);
_mm_storeu_pd(angle + k, _mm_cvtps_pd(v_src));
_mm_storeu_pd(angle + k + 2, _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8))));
for ( ; k <= len - 4; k += 4)
{
__m128 v_src = _mm_loadu_ps(buf[0] + k);
_mm_storeu_pd(angle + k, _mm_cvtps_pd(v_src));
_mm_storeu_pd(angle + k + 2, _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8))));
}
}
#endif
@ -832,17 +844,16 @@ static void SinCos_32f( const float *angle, float *sinval, float* cosval,
k1 = N/360.;
#if CV_AVX2
__m128d v_i = _mm_set_pd(1, 0);
__m128d v_k1 = _mm_set1_pd(k1);
__m128d v_1 = _mm_set1_pd(1);
__m128i v_N1 = _mm_set1_epi32(N - 1);
__m128i v_N4 = _mm_set1_epi32(N >> 2);
__m128d v_sin_a0 = _mm_set1_pd(sin_a0);
__m128d v_sin_a2 = _mm_set1_pd(sin_a2);
__m128d v_cos_a0 = _mm_set1_pd(cos_a0);
if (USE_AVX2)
{
__m128d v_k1 = _mm_set1_pd(k1);
__m128d v_1 = _mm_set1_pd(1);
__m128i v_N1 = _mm_set1_epi32(N - 1);
__m128i v_N4 = _mm_set1_epi32(N >> 2);
__m128d v_sin_a0 = _mm_set1_pd(sin_a0);
__m128d v_sin_a2 = _mm_set1_pd(sin_a2);
__m128d v_cos_a0 = _mm_set1_pd(cos_a0);
for ( ; i <= len - 4; i += 4)
{
__m128 v_angle = _mm_loadu_ps(angle + i);
@ -859,8 +870,8 @@ static void SinCos_32f( const float *angle, float *sinval, float* cosval,
__m128d v_sin_b = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(v_sin_a0, v_t2), v_sin_a2), v_t);
__m128d v_cos_b = _mm_add_pd(_mm_mul_pd(v_cos_a0, v_t2), v_1);
__m128d v_sin_a = _mm_i32gather_pd(sin_table, v_sin_idx, 1);
__m128d v_cos_a = _mm_i32gather_pd(sin_table, v_cos_idx, 1);
__m128d v_sin_a = _mm_i32gather_pd(sin_table, v_sin_idx, 8);
__m128d v_cos_a = _mm_i32gather_pd(sin_table, v_cos_idx, 8);
__m128d v_sin_val_0 = _mm_add_pd(_mm_mul_pd(v_sin_a, v_cos_b),
_mm_mul_pd(v_cos_a, v_sin_b));
@ -868,7 +879,7 @@ static void SinCos_32f( const float *angle, float *sinval, float* cosval,
_mm_mul_pd(v_sin_a, v_sin_b));
// 2-3
v_t = _mm_mul_pd(_mm_cvtps_pd(_mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(v_angle), 8))), v_k1);
v_t = _mm_mul_pd(_mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_angle), 8))), v_k1);
v_it = _mm_cvtpd_epi32(v_t);
v_t = _mm_sub_pd(v_t, _mm_cvtepi32_pd(v_it));
@ -879,8 +890,8 @@ static void SinCos_32f( const float *angle, float *sinval, float* cosval,
v_sin_b = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(v_sin_a0, v_t2), v_sin_a2), v_t);
v_cos_b = _mm_add_pd(_mm_mul_pd(v_cos_a0, v_t2), v_1);
v_sin_a = _mm_i32gather_pd(sin_table, v_sin_idx, 1);
v_cos_a = _mm_i32gather_pd(sin_table, v_cos_idx, 1);
v_sin_a = _mm_i32gather_pd(sin_table, v_sin_idx, 8);
v_cos_a = _mm_i32gather_pd(sin_table, v_cos_idx, 8);
__m128d v_sin_val_1 = _mm_add_pd(_mm_mul_pd(v_sin_a, v_cos_b),
_mm_mul_pd(v_cos_a, v_sin_b));
@ -1032,11 +1043,14 @@ void polarToCart( InputArray src1, InputArray src2,
vst1q_f32(y + k, vmulq_f32(vld1q_f32(y + k), v_m));
}
#elif CV_SSE2
for( ; k <= len - 4; k += 4 )
if (USE_SSE2)
{
__m128 v_m = _mm_loadu_ps(mag + k);
_mm_storeu_ps(x + k, _mm_mul_ps(_mm_loadu_ps(x + k), v_m));
_mm_storeu_ps(y + k, _mm_mul_ps(_mm_loadu_ps(y + k), v_m));
for( ; k <= len - 4; k += 4 )
{
__m128 v_m = _mm_loadu_ps(mag + k);
_mm_storeu_ps(x + k, _mm_mul_ps(_mm_loadu_ps(x + k), v_m));
_mm_storeu_ps(y + k, _mm_mul_ps(_mm_loadu_ps(y + k), v_m));
}
}
#endif
@ -1063,10 +1077,10 @@ void polarToCart( InputArray src1, InputArray src2,
x[k] = buf[0][k]*m; y[k] = buf[1][k]*m;
}
else
for( k = 0; k < len; k++ )
{
x[k] = buf[0][k]; y[k] = buf[1][k];
}
{
std::memcpy(x, buf[0], sizeof(float) * len);
std::memcpy(y, buf[1], sizeof(float) * len);
}
}
if( ptrs[0] )

@ -397,6 +397,8 @@ static int countNonZero_(const T* src, int len )
return nz;
}
#if CV_SSE2
static const uchar * initPopcountTable()
{
static uchar tab[256];
@ -425,6 +427,8 @@ static const uchar * initPopcountTable()
return tab;
}
#endif
static int countNonZero8u( const uchar* src, int len )
{
int i=0, nz = 0;
@ -645,7 +649,7 @@ static int countNonZero32f( const float* src, int len )
}
static int countNonZero64f( const double* src, int len )
{
{
int i = 0, nz = 0;
#if CV_SSE2
if (USE_SSE2)

File diff suppressed because it is too large Load Diff

@ -1963,9 +1963,9 @@ private:
struct ResizeAreaFastVec_SIMD_32f
{
ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) :
scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step)
cn(_cn), step(_step)
{
fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4);
fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4);
}
int operator() (const float * S, float * D, int w) const
@ -2005,7 +2005,6 @@ struct ResizeAreaFastVec_SIMD_32f
}
private:
int scale_x, scale_y;
int cn;
bool fast_mode;
int step;
@ -2289,9 +2288,10 @@ private:
struct ResizeAreaFastVec_SIMD_32f
{
ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) :
scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step)
cn(_cn), step(_step)
{
fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4);
fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4);
fast_mode = fast_mode && checkHardwareSupport(CV_CPU_SSE2);
}
int operator() (const float * S, float * D, int w) const
@ -2335,7 +2335,6 @@ struct ResizeAreaFastVec_SIMD_32f
}
private:
int scale_x, scale_y;
int cn;
bool fast_mode;
int step;
@ -4817,6 +4816,13 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
size.height = 1;
}
#if CV_SSE2
bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2);
#endif
#if CV_SSE4_1
bool useSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
#endif
const float scale = 1.f/INTER_TAB_SIZE;
int x, y;
for( y = 0; y < size.height; y++ )
@ -4848,24 +4854,27 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
vst2q_s16(dst1 + (x << 1), v_dst);
}
#elif CV_SSE4_1
for( ; x <= size.width - 16; x += 16 )
if (useSSE4_1)
{
__m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)),
_mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 4)));
__m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 8)),
_mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 12)));
__m128i v_dst2 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x)),
_mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 4)));
__m128i v_dst3 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 8)),
_mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 12)));
_mm_interleave_epi16(v_dst0, v_dst1, v_dst2, v_dst3);
_mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst0);
_mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst1);
_mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst2);
_mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst3);
for( ; x <= size.width - 16; x += 16 )
{
__m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)),
_mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 4)));
__m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 8)),
_mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 12)));
__m128i v_dst2 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x)),
_mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 4)));
__m128i v_dst3 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 8)),
_mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 12)));
_mm_interleave_epi16(v_dst0, v_dst1, v_dst2, v_dst3);
_mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst0);
_mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst1);
_mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst2);
_mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst3);
}
}
#endif
for( ; x < size.width; x++ )
@ -4902,47 +4911,50 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1));
}
#elif CV_SSE4_1
__m128 v_its = _mm_set1_ps(INTER_TAB_SIZE);
__m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1);
for( ; x <= size.width - 16; x += 16 )
if (useSSE4_1)
{
__m128i v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x), v_its));
__m128i v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 4), v_its));
__m128i v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x), v_its));
__m128i v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 4), v_its));
__m128i v_dst10 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS),
_mm_srai_epi32(v_ix1, INTER_BITS));
__m128i v_dst12 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS),
_mm_srai_epi32(v_iy1, INTER_BITS));
__m128i v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS),
_mm_and_si128(v_ix0, v_its1));
__m128i v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS),
_mm_and_si128(v_ix1, v_its1));
_mm_storeu_si128((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst20, v_dst21));
v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 8), v_its));
v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 12), v_its));
v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 8), v_its));
v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 12), v_its));
__m128i v_dst11 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS),
_mm_srai_epi32(v_ix1, INTER_BITS));
__m128i v_dst13 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS),
_mm_srai_epi32(v_iy1, INTER_BITS));
v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS),
_mm_and_si128(v_ix0, v_its1));
v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS),
_mm_and_si128(v_ix1, v_its1));
_mm_storeu_si128((__m128i *)(dst2 + x + 8), _mm_packus_epi32(v_dst20, v_dst21));
_mm_interleave_epi16(v_dst10, v_dst11, v_dst12, v_dst13);
_mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst10);
_mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst11);
_mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst12);
_mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst13);
__m128 v_its = _mm_set1_ps(INTER_TAB_SIZE);
__m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1);
for( ; x <= size.width - 16; x += 16 )
{
__m128i v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x), v_its));
__m128i v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 4), v_its));
__m128i v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x), v_its));
__m128i v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 4), v_its));
__m128i v_dst10 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS),
_mm_srai_epi32(v_ix1, INTER_BITS));
__m128i v_dst12 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS),
_mm_srai_epi32(v_iy1, INTER_BITS));
__m128i v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS),
_mm_and_si128(v_ix0, v_its1));
__m128i v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS),
_mm_and_si128(v_ix1, v_its1));
_mm_storeu_si128((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst20, v_dst21));
v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 8), v_its));
v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 12), v_its));
v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 8), v_its));
v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 12), v_its));
__m128i v_dst11 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS),
_mm_srai_epi32(v_ix1, INTER_BITS));
__m128i v_dst13 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS),
_mm_srai_epi32(v_iy1, INTER_BITS));
v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS),
_mm_and_si128(v_ix0, v_its1));
v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS),
_mm_and_si128(v_ix1, v_its1));
_mm_storeu_si128((__m128i *)(dst2 + x + 8), _mm_packus_epi32(v_dst20, v_dst21));
_mm_interleave_epi16(v_dst10, v_dst11, v_dst12, v_dst13);
_mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst10);
_mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst11);
_mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst12);
_mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst13);
}
}
#endif
for( ; x < size.width; x++ )
@ -5005,25 +5017,28 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1));
}
#elif CV_SSE2
__m128 v_its = _mm_set1_ps(INTER_TAB_SIZE);
__m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1);
__m128i v_y_mask = _mm_set1_epi32((INTER_TAB_SIZE-1) << 16);
for( ; x <= size.width - 4; x += 4 )
if (useSSE2)
{
__m128i v_src0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2), v_its));
__m128i v_src1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2 + 4), v_its));
__m128i v_dst1 = _mm_packs_epi32(_mm_srai_epi32(v_src0, INTER_BITS),
_mm_srai_epi32(v_src1, INTER_BITS));
_mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst1);
// x0 y0 x1 y1 . . .
v_src0 = _mm_packs_epi32(_mm_and_si128(v_src0, v_its1),
_mm_and_si128(v_src1, v_its1));
__m128i v_dst2 = _mm_or_si128(_mm_srli_epi32(_mm_and_si128(v_src0, v_y_mask), 16 - INTER_BITS), // y0 0 y1 0 . . .
_mm_and_si128(v_src0, v_its1)); // 0 x0 0 x1 . . .
_mm_storel_epi64((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst2, v_dst2));
__m128 v_its = _mm_set1_ps(INTER_TAB_SIZE);
__m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1);
__m128i v_y_mask = _mm_set1_epi32((INTER_TAB_SIZE-1) << 16);
for( ; x <= size.width - 4; x += 4 )
{
__m128i v_src0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2), v_its));
__m128i v_src1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2 + 4), v_its));
__m128i v_dst1 = _mm_packs_epi32(_mm_srai_epi32(v_src0, INTER_BITS),
_mm_srai_epi32(v_src1, INTER_BITS));
_mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst1);
// x0 y0 x1 y1 . . .
v_src0 = _mm_packs_epi32(_mm_and_si128(v_src0, v_its1),
_mm_and_si128(v_src1, v_its1));
__m128i v_dst2 = _mm_or_si128(_mm_srli_epi32(_mm_and_si128(v_src0, v_y_mask), 16 - INTER_BITS), // y0 0 y1 0 . . .
_mm_and_si128(v_src0, v_its1)); // 0 x0 0 x1 . . .
_mm_storel_epi64((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst2, v_dst2));
}
}
#endif
for( ; x < size.width; x++ )
@ -5150,22 +5165,25 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
vst2q_f32(dst1f + (x << 1) + 8, v_dst);
}
#elif CV_SSE2
__m128i v_mask2 = _mm_set1_epi16(INTER_TAB_SIZE2-1);
__m128i v_zero = _mm_set1_epi32(0), v_mask = _mm_set1_epi32(INTER_TAB_SIZE-1);
__m128 v_scale = _mm_set1_ps(scale);
for ( ; x <= size.width - 8; x += 8)
if (useSSE2)
{
__m128i v_src = _mm_loadu_si128((__m128i const *)(src1 + x * 2));
__m128i v_fxy = src2 ? _mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x)), v_mask2) : v_zero;
__m128i v_fxy1 = _mm_and_si128(v_fxy, v_mask);
__m128i v_fxy2 = _mm_srli_epi16(v_fxy, INTER_BITS);
__m128i v_mask2 = _mm_set1_epi16(INTER_TAB_SIZE2-1);
__m128i v_zero = _mm_set1_epi32(0), v_mask = _mm_set1_epi32(INTER_TAB_SIZE-1);
__m128 v_scale = _mm_set1_ps(scale);
for ( ; x <= size.width - 8; x += 8)
{
__m128i v_src = _mm_loadu_si128((__m128i const *)(src1 + x * 2));
__m128i v_fxy = src2 ? _mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x)), v_mask2) : v_zero;
__m128i v_fxy1 = _mm_and_si128(v_fxy, v_mask);
__m128i v_fxy2 = _mm_srli_epi16(v_fxy, INTER_BITS);
__m128 v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_fxy1, v_fxy2)), v_scale);
_mm_storeu_ps(dst1f + x * 2, _mm_add_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)), v_add));
__m128 v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_fxy1, v_fxy2)), v_scale);
_mm_storeu_ps(dst1f + x * 2, _mm_add_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)), v_add));
v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_fxy1, v_fxy2)), v_scale);
_mm_storeu_ps(dst1f + x * 2, _mm_add_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)), v_add));
v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_fxy1, v_fxy2)), v_scale);
_mm_storeu_ps(dst1f + x * 2, _mm_add_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)), v_add));
}
}
#endif
for( ; x < size.width; x++ )
@ -5204,7 +5222,10 @@ public:
const int AB_SCALE = 1 << AB_BITS;
int round_delta = interpolation == INTER_NEAREST ? AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2, x, y, x1, y1;
#if CV_SSE2
bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2);
#endif
#if CV_SSE4_1
bool useSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
#endif
int bh0 = std::min(BLOCK_SZ/2, dst.rows);
@ -5243,26 +5264,29 @@ public:
vst2q_s16(xy + (x1 << 1), v_dst);
}
#elif CV_SSE4_1
__m128i v_X0 = _mm_set1_epi32(X0);
__m128i v_Y0 = _mm_set1_epi32(Y0);
for ( ; x1 <= bw - 16; x1 += 16)
if (useSSE4_1)
{
__m128i v_x0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1))), AB_BITS),
_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 4))), AB_BITS));
__m128i v_x1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 8))), AB_BITS),
_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 12))), AB_BITS));
__m128i v_y0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1))), AB_BITS),
_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 4))), AB_BITS));
__m128i v_y1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 8))), AB_BITS),
_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 12))), AB_BITS));
_mm_interleave_epi16(v_x0, v_x1, v_y0, v_y1);
_mm_storeu_si128((__m128i *)(xy + x1 * 2), v_x0);
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_x1);
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_y0);
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_y1);
__m128i v_X0 = _mm_set1_epi32(X0);
__m128i v_Y0 = _mm_set1_epi32(Y0);
for ( ; x1 <= bw - 16; x1 += 16)
{
__m128i v_x0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1))), AB_BITS),
_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 4))), AB_BITS));
__m128i v_x1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 8))), AB_BITS),
_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 12))), AB_BITS));
__m128i v_y0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1))), AB_BITS),
_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 4))), AB_BITS));
__m128i v_y1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 8))), AB_BITS),
_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 12))), AB_BITS));
_mm_interleave_epi16(v_x0, v_x1, v_y0, v_y1);
_mm_storeu_si128((__m128i *)(xy + x1 * 2), v_x0);
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_x1);
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_y0);
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_y1);
}
}
#endif
for( ; x1 < bw; x1++ )
@ -5278,7 +5302,7 @@ public:
short* alpha = A + y1*bw;
x1 = 0;
#if CV_SSE2
if( useSIMD )
if( useSSE2 )
{
__m128i fxy_mask = _mm_set1_epi32(INTER_TAB_SIZE - 1);
__m128i XX = _mm_set1_epi32(X0), YY = _mm_set1_epi32(Y0);
@ -5672,6 +5696,7 @@ public:
bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height);
#if CV_SSE4_1
bool haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
__m128d v_M0 = _mm_set1_pd(M[0]);
__m128d v_M3 = _mm_set1_pd(M[3]);
__m128d v_M6 = _mm_set1_pd(M[6]);
@ -5706,109 +5731,112 @@ public:
x1 = 0;
#if CV_SSE4_1
__m128d v_X0d = _mm_set1_pd(X0);
__m128d v_Y0d = _mm_set1_pd(Y0);
__m128d v_W0 = _mm_set1_pd(W0);
__m128d v_x1 = _mm_set_pd(1, 0);
for( ; x1 <= bw - 16; x1 += 16 )
if (haveSSE4_1)
{
// 0-3
__m128i v_X0, v_Y0;
{
__m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
__m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
__m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
}
__m128d v_X0d = _mm_set1_pd(X0);
__m128d v_Y0d = _mm_set1_pd(Y0);
__m128d v_W0 = _mm_set1_pd(W0);
__m128d v_x1 = _mm_set_pd(1, 0);
// 4-8
__m128i v_X1, v_Y1;
for( ; x1 <= bw - 16; x1 += 16 )
{
__m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
__m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
__m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
}
// 0-3
__m128i v_X0, v_Y0;
{
__m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
__m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
__m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
}
// 8-11
__m128i v_X2, v_Y2;
{
__m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
__m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
__m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
}
// 4-8
__m128i v_X1, v_Y1;
{
__m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
__m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
__m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
}
// 12-15
__m128i v_X3, v_Y3;
{
__m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
__m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
__m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
}
// 8-11
__m128i v_X2, v_Y2;
{
__m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
__m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
__m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
}
// 12-15
__m128i v_X3, v_Y3;
{
__m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
__m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
__m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
}
// convert to 16s
v_X0 = _mm_packs_epi32(v_X0, v_X1);
v_X1 = _mm_packs_epi32(v_X2, v_X3);
v_Y0 = _mm_packs_epi32(v_Y0, v_Y1);
v_Y1 = _mm_packs_epi32(v_Y2, v_Y3);
// convert to 16s
v_X0 = _mm_packs_epi32(v_X0, v_X1);
v_X1 = _mm_packs_epi32(v_X2, v_X3);
v_Y0 = _mm_packs_epi32(v_Y0, v_Y1);
v_Y1 = _mm_packs_epi32(v_Y2, v_Y3);
_mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1);
_mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1);
_mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0);
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1);
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0);
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1);
_mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0);
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1);
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0);
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1);
}
}
#endif
@ -5831,122 +5859,125 @@ public:
x1 = 0;
#if CV_SSE4_1
__m128d v_X0d = _mm_set1_pd(X0);
__m128d v_Y0d = _mm_set1_pd(Y0);
__m128d v_W0 = _mm_set1_pd(W0);
__m128d v_x1 = _mm_set_pd(1, 0);
for( ; x1 <= bw - 16; x1 += 16 )
if (haveSSE4_1)
{
// 0-3
__m128i v_X0, v_Y0;
{
__m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
__m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
__m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
}
__m128d v_X0d = _mm_set1_pd(X0);
__m128d v_Y0d = _mm_set1_pd(Y0);
__m128d v_W0 = _mm_set1_pd(W0);
__m128d v_x1 = _mm_set_pd(1, 0);
// 4-8
__m128i v_X1, v_Y1;
for( ; x1 <= bw - 16; x1 += 16 )
{
__m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
__m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
__m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
}
// 0-3
__m128i v_X0, v_Y0;
{
__m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
__m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
__m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
}
// 8-11
__m128i v_X2, v_Y2;
{
__m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
__m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
__m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
}
// 4-8
__m128i v_X1, v_Y1;
{
__m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
__m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
__m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
}
// 12-15
__m128i v_X3, v_Y3;
{
__m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
__m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
__m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
}
// 8-11
__m128i v_X2, v_Y2;
{
__m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
__m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
__m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
}
// store alpha
__m128i v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y0, v_itsi1), INTER_BITS),
_mm_and_si128(v_X0, v_itsi1));
__m128i v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y1, v_itsi1), INTER_BITS),
_mm_and_si128(v_X1, v_itsi1));
_mm_storeu_si128((__m128i *)(alpha + x1), _mm_packs_epi32(v_alpha0, v_alpha1));
v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y2, v_itsi1), INTER_BITS),
_mm_and_si128(v_X2, v_itsi1));
v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y3, v_itsi1), INTER_BITS),
_mm_and_si128(v_X3, v_itsi1));
_mm_storeu_si128((__m128i *)(alpha + x1 + 8), _mm_packs_epi32(v_alpha0, v_alpha1));
// convert to 16s
v_X0 = _mm_packs_epi32(_mm_srai_epi32(v_X0, INTER_BITS), _mm_srai_epi32(v_X1, INTER_BITS));
v_X1 = _mm_packs_epi32(_mm_srai_epi32(v_X2, INTER_BITS), _mm_srai_epi32(v_X3, INTER_BITS));
v_Y0 = _mm_packs_epi32(_mm_srai_epi32(v_Y0, INTER_BITS), _mm_srai_epi32(v_Y1, INTER_BITS));
v_Y1 = _mm_packs_epi32(_mm_srai_epi32(v_Y2, INTER_BITS), _mm_srai_epi32(v_Y3, INTER_BITS));
_mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1);
_mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0);
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1);
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0);
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1);
// 12-15
__m128i v_X3, v_Y3;
{
__m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
__m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
__m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
__m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
v_x1 = _mm_add_pd(v_x1, v_2);
v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
}
// store alpha
__m128i v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y0, v_itsi1), INTER_BITS),
_mm_and_si128(v_X0, v_itsi1));
__m128i v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y1, v_itsi1), INTER_BITS),
_mm_and_si128(v_X1, v_itsi1));
_mm_storeu_si128((__m128i *)(alpha + x1), _mm_packs_epi32(v_alpha0, v_alpha1));
v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y2, v_itsi1), INTER_BITS),
_mm_and_si128(v_X2, v_itsi1));
v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y3, v_itsi1), INTER_BITS),
_mm_and_si128(v_X3, v_itsi1));
_mm_storeu_si128((__m128i *)(alpha + x1 + 8), _mm_packs_epi32(v_alpha0, v_alpha1));
// convert to 16s
v_X0 = _mm_packs_epi32(_mm_srai_epi32(v_X0, INTER_BITS), _mm_srai_epi32(v_X1, INTER_BITS));
v_X1 = _mm_packs_epi32(_mm_srai_epi32(v_X2, INTER_BITS), _mm_srai_epi32(v_X3, INTER_BITS));
v_Y0 = _mm_packs_epi32(_mm_srai_epi32(v_Y0, INTER_BITS), _mm_srai_epi32(v_Y1, INTER_BITS));
v_Y1 = _mm_packs_epi32(_mm_srai_epi32(v_Y2, INTER_BITS), _mm_srai_epi32(v_Y3, INTER_BITS));
_mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1);
_mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0);
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1);
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0);
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1);
}
}
#endif

@ -386,10 +386,10 @@ struct PyrUpVec_32s16s
__m128i v_dst01 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
__m128i v_dst11 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);
_mm_storeu_si128((__m128i *)(dst0 + x),
_mm_storeu_si128((__m128i *)(dst0 + x),
_mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst00, v_delta), 6),
_mm_srai_epi32(_mm_add_epi32(v_dst01, v_delta), 6)));
_mm_storeu_si128((__m128i *)(dst1 + x),
_mm_storeu_si128((__m128i *)(dst1 + x),
_mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst10, v_delta), 6),
_mm_srai_epi32(_mm_add_epi32(v_dst11, v_delta), 6)));
}
@ -446,10 +446,10 @@ struct PyrUpVec_32s16u
__m128i v_dst01 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
__m128i v_dst11 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);
_mm_storeu_si128((__m128i *)(dst0 + x),
_mm_storeu_si128((__m128i *)(dst0 + x),
_mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst00, v_delta), 6),
_mm_srli_epi32(_mm_add_epi32(v_dst01, v_delta), 6)));
_mm_storeu_si128((__m128i *)(dst1 + x),
_mm_storeu_si128((__m128i *)(dst1 + x),
_mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst10, v_delta), 6),
_mm_srli_epi32(_mm_add_epi32(v_dst11, v_delta), 6)));
}
@ -491,7 +491,7 @@ struct PyrUpVec_32f
const float *row0 = src[0], *row1 = src[1], *row2 = src[2];
float *dst0 = dst[0], *dst1 = dst[1];
__m128 v_6 = _mm_set1_ps(6.0f), v_scale = _mm_set1_ps(1.f/64.0f),
__m128 v_6 = _mm_set1_ps(6.0f), v_scale = _mm_set1_ps(1.f/64.0f),
v_scale4 = _mm_mul_ps(v_scale, _mm_set1_ps(4.0f));
for( ; x <= width - 8; x += 8 )

Loading…
Cancel
Save