Merge pull request #8907 from vpisarev:dnn_fast_conv

pull/8912/head
Vadim Pisarevsky 8 years ago
commit f49f056d29
  1. 21
      modules/core/include/opencv2/core/hal/intrin_cpp.hpp
  2. 15
      modules/core/include/opencv2/core/hal/intrin_neon.hpp
  3. 8
      modules/core/include/opencv2/core/hal/intrin_sse.hpp
  4. 2
      modules/core/src/parallel.cpp

@ -907,6 +907,27 @@ template<typename _Tp, int n> inline typename V_TypeTraits<_Tp>::sum_type v_redu
return c;
}
/** @brief Horizontal sum of four vectors packed into a single vector

Every input vector is reduced to the sum of its four lanes; the four sums
are stored in the result in argument order.

Scheme:
@code
result[0] = a[0] + a[1] + a[2] + a[3]
result[1] = b[0] + b[1] + b[2] + b[3]
result[2] = c[0] + c[1] + c[2] + c[3]
result[3] = d[0] + d[1] + d[2] + d[3]
@endcode
*/
inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                 const v_float32x4& c, const v_float32x4& d)
{
    // Inputs in the order their sums appear in the output lanes.
    const v_float32x4* const inputs[4] = { &a, &b, &c, &d };

    v_float32x4 sums;
    for( int lane = 0; lane < 4; lane++ )
    {
        const v_float32x4& v = *inputs[lane];
        // Lanes are accumulated strictly left to right.
        sums.s[lane] = v.s[0] + v.s[1] + v.s[2] + v.s[3];
    }
    return sums;
}
/** @brief Get negative values mask
Returned value is a bit mask with bits set to 1 on places corresponding to negative packed values indexes.

@ -815,6 +815,21 @@ OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, sum, add, f32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, max, max, f32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, min, min, f32)
// Reduces four vectors at once: lane 0 of the result holds the sum of all
// elements of a, lane 1 of b, lane 2 of c and lane 3 of d.
inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                 const v_float32x4& c, const v_float32x4& d)
{
    // Interleave each pair of inputs so that adjacent elements of one input
    // end up in the same lane of the two halves of the transposed pair.
    const float32x4x2_t trn_ab = vtrnq_f32(a.val, b.val);
    const float32x4x2_t trn_cd = vtrnq_f32(c.val, d.val);

    // pair_ab = a0+a1 b0+b1 a2+a3 b2+b3
    const float32x4_t pair_ab = vaddq_f32(trn_ab.val[0], trn_ab.val[1]);
    // pair_cd = c0+c1 d0+d1 c2+c3 d2+d3
    const float32x4_t pair_cd = vaddq_f32(trn_cd.val[0], trn_cd.val[1]);

    // Gather the "first half" partial sums of a, b, c, d into one vector
    // and the "second half" partial sums into another, then add them.
    const float32x4_t first_halves = vcombine_f32(vget_low_f32(pair_ab), vget_low_f32(pair_cd));
    const float32x4_t second_halves = vcombine_f32(vget_high_f32(pair_ab), vget_high_f32(pair_cd));
    return v_float32x4(vaddq_f32(first_halves, second_halves));
}
#define OPENCV_HAL_IMPL_NEON_POPCOUNT(_Tpvec, cast) \
inline v_uint32x4 v_popcount(const _Tpvec& a) \
{ \

@ -1126,6 +1126,14 @@ OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_uint32x4, unsigned, __m128i, epi32, OPENCV
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_int32x4, int, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_float32x4, float, __m128, ps, _mm_castps_si128, _mm_castsi128_ps, ss_f32)
// Sums all elements of each input vector and returns the vector of sums:
//   result = { sum(a), sum(b), sum(c), sum(d) }
//
// _mm_hadd_ps is an SSE3 instruction, so it must not be used unconditionally
// in code that is also built for plain SSE2 targets. The horizontal-add path
// is therefore taken only when SSE3 is enabled; otherwise the same reduction
// is performed with SSE/SSE2-level unpack and add operations.
// NOTE(review): CV_SSE3 is assumed to be the project's SSE3 availability
// macro (0/undefined when SSE3 is not enabled) - confirm against the build
// configuration headers.
// The fallback adds the lanes in a different order ((x0+x2)+(x1+x3) instead
// of (x0+x1)+(x2+x3)), so results may differ in the last bits of rounding.
inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                 const v_float32x4& c, const v_float32x4& d)
{
#if CV_SSE3
    __m128 ab = _mm_hadd_ps(a.val, b.val); // a0+a1 a2+a3 b0+b1 b2+b3
    __m128 cd = _mm_hadd_ps(c.val, d.val); // c0+c1 c2+c3 d0+d1 d2+d3
    return v_float32x4(_mm_hadd_ps(ab, cd));
#else
    // ac = a0+a2 c0+c2 a1+a3 c1+c3
    __m128 ac = _mm_add_ps(_mm_unpacklo_ps(a.val, c.val), _mm_unpackhi_ps(a.val, c.val));
    // bd = b0+b2 d0+d2 b1+b3 d1+d3
    __m128 bd = _mm_add_ps(_mm_unpacklo_ps(b.val, d.val), _mm_unpackhi_ps(b.val, d.val));
    // unpacklo(ac, bd) = (a0+a2) (b0+b2) (c0+c2) (d0+d2)
    // unpackhi(ac, bd) = (a1+a3) (b1+b3) (c1+c3) (d1+d3)
    return v_float32x4(_mm_add_ps(_mm_unpacklo_ps(ac, bd), _mm_unpackhi_ps(ac, bd)));
#endif
}
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, max, std::max)

@ -425,7 +425,7 @@ int cv::getNumThreads(void)
#elif defined HAVE_GCD
return 512; // the GCD thread pool limit
return cv::getNumberOfCPUs(); // the GCD thread pool limit
#elif defined WINRT

Loading…
Cancel
Save