remove raw SSE2/NEON implementation from imgwarp.cpp

* use universal intrinsic instead of raw intrinsic
  * add 2 channels de-interleave on x86 platform
  * add v_int32x4 version of v_muladd
  * add accumulate version of v_dotprod based on the commit from seiko2plus on bf1852d
  * remove some verify check in performance test
  * avoid the out of boundary access and keep the performance
pull/11169/head
Tomoaki Teshima 7 years ago
parent fdd83e5027
commit a82e70cd40
  1. 25
      modules/core/include/opencv2/core/hal/intrin_cpp.hpp
  2. 25
      modules/core/include/opencv2/core/hal/intrin_neon.hpp
  3. 35
      modules/core/include/opencv2/core/hal/intrin_sse.hpp
  4. 6
      modules/core/include/opencv2/core/hal/intrin_vsx.hpp
  5. 14
      modules/core/test/test_intrin_utils.hpp
  6. 2
      modules/imgproc/perf/opencl/perf_imgwarp.cpp
  7. 6
      modules/imgproc/perf/perf_warp.cpp
  8. 998
      modules/imgproc/src/imgwarp.cpp

@ -795,7 +795,7 @@ inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>
/** @brief Multiply and add
Returns \f$ a*b + c \f$
For floating point types only. */
For floating point types and signed 32bit int only. */
template<typename _Tp, int n>
inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
const v_reg<_Tp, n>& c)
@ -828,6 +828,29 @@ template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n
return c;
}
/** @brief Dot product of elements
Same as cv::v_dotprod, but add a third element to the sum of adjacent pairs.
Scheme:
@code
{A1 A2 ...} // 16-bit
x {B1 B2 ...} // 16-bit
-------------
{A1B1+A2B2+C1 ...} // 32-bit
@endcode
Implemented only for 16-bit signed source type (v_int16x8).
*/
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, const v_reg<typename V_TypeTraits<_Tp>::w_type, n / 2>& c)
{
typedef typename V_TypeTraits<_Tp>::w_type w_type;
v_reg<w_type, n/2> s;
for( int i = 0; i < (n/2); i++ )
s.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1] + c.s[i];
return s;
}
/** @brief Multiply and expand
Multiply values two registers and store results in two registers with wider pack type.

@ -506,6 +506,12 @@ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
return v_int32x4(vaddq_s32(cd.val[0], cd.val[1]));
}
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
v_int32x4 s = v_dotprod(a, b);
return v_int32x4(vaddq_s32(s.val , c.val));
}
#define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \
@ -730,6 +736,11 @@ inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_
return v_float32x4(vmlaq_f32(c.val, a.val, b.val));
}
inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
return v_int32x4(vmlaq_s32(c.val, a.val, b.val));
}
#if CV_SIMD128_64F
inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
{
@ -1095,6 +1106,18 @@ OPENCV_HAL_IMPL_NEON_EXTRACT(float32x4, f32)
OPENCV_HAL_IMPL_NEON_EXTRACT(float64x2, f64)
#endif
#if CV_SIMD128_64F
inline v_int32x4 v_round(const v_float32x4& a)
{
float32x4_t a_ = a.val;
int32x4_t result;
__asm__ ("fcvtns %0.4s, %1.4s"
: "=w"(result)
: "w"(a_)
: /* No clobbers */);
return v_int32x4(result);
}
#else
inline v_int32x4 v_round(const v_float32x4& a)
{
static const int32x4_t v_sign = vdupq_n_s32(1 << 31),
@ -1103,7 +1126,7 @@ inline v_int32x4 v_round(const v_float32x4& a)
int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val)));
return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition))));
}
#endif
inline v_int32x4 v_floor(const v_float32x4& a)
{
int32x4_t a1 = vcvtq_s32_f32(a.val);

@ -710,6 +710,11 @@ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
return v_int32x4(_mm_madd_epi16(a.val, b.val));
}
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
return v_int32x4(_mm_add_epi32(_mm_madd_epi16(a.val, b.val), c.val));
}
#define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \
OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \
OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \
@ -954,6 +959,10 @@ inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
__m128i m = _mm_cmpgt_epi32(b.val, a.val);
return v_uint32x4(_mm_sub_epi32(_mm_xor_si128(d, m), m));
}
inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
return a * b + c;
}
#define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \
inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
@ -1599,7 +1608,7 @@ inline void v_load_deinterleave(const double *ptr, v_float64x2& a, v_float64x2&
c = v_reinterpret_as_f64(t2);
}
// 2-channel, float only
// 2-channel
inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
{
const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
@ -1611,7 +1620,29 @@ inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b
b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 ab b3
}
inline void v_store_interleave( short* ptr, const v_int16x8& a, const v_int16x8& b )
inline void v_load_deinterleave(const short* ptr, v_int16x8& a, v_int16x8& b)
{
__m128i v0 = _mm_loadu_si128((__m128i*)(ptr)); // a0 b0 a1 b1 a2 b2 a3 b3
__m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8)); // a4 b4 a5 b5 a6 b6 a7 b7
__m128i v2 = _mm_unpacklo_epi16(v0, v1); // a0 a4 b0 b4 a1 a5 b1 b5
__m128i v3 = _mm_unpackhi_epi16(v0, v1); // a2 a6 b2 b6 a3 a7 b3 b7
__m128i v4 = _mm_unpacklo_epi16(v2, v3); // a0 a2 a4 a6 b0 b2 b4 b6
__m128i v5 = _mm_unpackhi_epi16(v2, v3); // a1 a3 a5 a7 b1 b3 b5 b7
a.val = _mm_unpacklo_epi16(v4, v5); // a0 a1 a2 a3 a4 a5 a6 a7
b.val = _mm_unpackhi_epi16(v4, v5); // b0 b1 ab b3 b4 b5 b6 b7
}
inline void v_load_deinterleave(const ushort*ptr, v_uint16x8& a, v_uint16x8& b)
{
v_int16x8 sa, sb;
v_load_deinterleave((const short*)ptr, sa, sb);
a = v_reinterpret_as_u16(sa);
b = v_reinterpret_as_u16(sb);
}
inline void v_store_interleave(short* ptr, const v_int16x8& a, const v_int16x8& b)
{
__m128i t0, t1;
t0 = _mm_unpacklo_epi16(a.val, b.val);

@ -821,6 +821,9 @@ inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
OPENCV_HAL_IMPL_VSX_MULADD(v_float32x4)
OPENCV_HAL_IMPL_VSX_MULADD(v_float64x2)
inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{ return a * b + c; }
// TODO: exp, log, sin, cos
/** Absolute values **/
@ -904,6 +907,9 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{ return v_int32x4(vec_msum(a.val, b.val, vec_int4_z)); }
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{ return v_int32x4(vec_msum(a.val, b.val, c.val)); }
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
const v_float32x4& m1, const v_float32x4& m2,
const v_float32x4& m3)

@ -521,15 +521,25 @@ template<typename R> struct TheTest
TheTest & test_dot_prod()
{
typedef typename V_RegTrait128<LaneType>::w_reg Rx2;
typedef typename Rx2::lane_type w_type;
Data<R> dataA, dataB(2);
R a = dataA, b = dataB;
Data<Rx2> res = v_dotprod(a, b);
Data<Rx2> dataC;
dataC += std::numeric_limits<w_type>::is_signed ?
std::numeric_limits<w_type>::min() :
std::numeric_limits<w_type>::max() - R::nlanes * (dataB[0] + 1);
Rx2 c = dataC;
Data<Rx2> resD = v_dotprod(a, b),
resE = v_dotprod(a, b, c);
const int n = R::nlanes / 2;
for (int i = 0; i < n; ++i)
{
EXPECT_EQ(dataA[i*2] * dataB[i*2] + dataA[i*2 + 1] * dataB[i*2 + 1], res[i]);
EXPECT_EQ(dataA[i*2] * dataB[i*2] + dataA[i*2 + 1] * dataB[i*2 + 1], resD[i]);
EXPECT_EQ(dataA[i*2] * dataB[i*2] + dataA[i*2 + 1] * dataB[i*2 + 1] + dataC[i], resE[i]);
}
return *this;
}

@ -229,7 +229,7 @@ OCL_PERF_TEST_P(RemapFixture, Remap,
OCL_TEST_CYCLE() cv::remap(src, dst, xmap, ymap, interpolation, borderMode);
SANITY_CHECK(dst, eps);
SANITY_CHECK_NOTHING();
}
} } // namespace opencv_test::ocl

@ -202,8 +202,8 @@ PERF_TEST_P( TestWarpPerspectiveNear_t, WarpPerspectiveNear,
PERF_TEST_P( TestRemap, remap,
Combine(
Values( TYPICAL_MAT_TYPES ),
Values( szVGA, sz720p, sz1080p ),
Values( CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1 ),
Values( szVGA, sz1080p ),
InterType::all(),
BorderMode::all(),
RemapMode::all()
@ -231,7 +231,7 @@ PERF_TEST_P( TestRemap, remap,
remap(source, destination, map_x, map_y, interpolationType, borderMode);
}
SANITY_CHECK(destination, 1);
SANITY_CHECK_NOTHING();
}
void update_map(const Mat& src, Mat& map_x, Mat& map_y, const int remapMode )

File diff suppressed because it is too large Load Diff
Loading…
Cancel
Save