From fbdcc0e8e4e0ebe2996f32d4cb0716c957eb9bbc Mon Sep 17 00:00:00 2001 From: k-shinotsuka Date: Sun, 8 Apr 2018 01:47:22 +0900 Subject: [PATCH] add universal intrinsics for HSV2RGB_f --- .../include/opencv2/core/hal/intrin_sse.hpp | 70 +++++- modules/imgproc/src/color_hsv.cpp | 209 +++++++----------- 2 files changed, 152 insertions(+), 127 deletions(-) diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index 0e740f6418..9195d7baeb 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -1570,6 +1570,39 @@ inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& v_transpose4x4(u0, u1, u2, u3, a, b, c, d); } +inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c) +{ + __m128 t0 = _mm_loadu_ps(ptr + 0); + __m128 t1 = _mm_loadu_ps(ptr + 4); + __m128 t2 = _mm_loadu_ps(ptr + 8); + + __m128 at12 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(0, 1, 0, 2)); + a.val = _mm_shuffle_ps(t0, at12, _MM_SHUFFLE(2, 0, 3, 0)); + + __m128 bt01 = _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(0, 0, 0, 1)); + __m128 bt12 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(0, 2, 0, 3)); + b.val = _mm_shuffle_ps(bt01, bt12, _MM_SHUFFLE(2, 0, 2, 0)); + + __m128 ct01 = _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(0, 1, 0, 2)); + c.val = _mm_shuffle_ps(ct01, t2, _MM_SHUFFLE(3, 0, 2, 0)); +} + +inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c, v_float32x4& d) +{ + __m128 t0 = _mm_loadu_ps(ptr + 0); + __m128 t1 = _mm_loadu_ps(ptr + 4); + __m128 t2 = _mm_loadu_ps(ptr + 8); + __m128 t3 = _mm_loadu_ps(ptr + 12); + __m128 t02lo = _mm_unpacklo_ps(t0, t2); + __m128 t13lo = _mm_unpacklo_ps(t1, t3); + __m128 t02hi = _mm_unpackhi_ps(t0, t2); + __m128 t13hi = _mm_unpackhi_ps(t1, t3); + a.val = _mm_unpacklo_ps(t02lo, t13lo); + b.val = _mm_unpackhi_ps(t02lo, t13lo); + c.val = _mm_unpacklo_ps(t02hi, t13hi); + d.val = _mm_unpackhi_ps(t02hi, t13hi); +} + inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c) { __m128i t0 = _mm_loadu_si128((const __m128i*)ptr); @@ -1796,6 +1829,41 @@ inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32 _mm_storeu_ps((ptr + 4), u1); } +inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) +{ + __m128 u0 = _mm_shuffle_ps(a.val, b.val, _MM_SHUFFLE(0, 0, 0, 0)); + __m128 u1 = _mm_shuffle_ps(c.val, a.val, _MM_SHUFFLE(1, 1, 0, 0)); + __m128 v0 = _mm_shuffle_ps(u0, u1, _MM_SHUFFLE(2, 0, 2, 0)); + __m128 u2 = _mm_shuffle_ps(b.val, c.val, _MM_SHUFFLE(1, 1, 1, 1)); + __m128 u3 = _mm_shuffle_ps(a.val, b.val, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 v1 = _mm_shuffle_ps(u2, u3, _MM_SHUFFLE(2, 0, 2, 0)); + __m128 u4 = _mm_shuffle_ps(c.val, a.val, _MM_SHUFFLE(3, 3, 2, 2)); + __m128 u5 = _mm_shuffle_ps(b.val, c.val, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 v2 = _mm_shuffle_ps(u4, u5, _MM_SHUFFLE(2, 0, 2, 0)); + + _mm_storeu_ps(ptr + 0, v0); + _mm_storeu_ps(ptr + 4, v1); + _mm_storeu_ps(ptr + 8, v2); +} + +inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b, + const v_float32x4& c, const v_float32x4& d) +{ + __m128 u0 = _mm_unpacklo_ps(a.val, c.val); + __m128 u1 = _mm_unpacklo_ps(b.val, d.val); + __m128 u2 = _mm_unpackhi_ps(a.val, c.val); + __m128 u3 = _mm_unpackhi_ps(b.val, d.val); + __m128 v0 = _mm_unpacklo_ps(u0, u1); + __m128 v2 = _mm_unpacklo_ps(u2, u3); + __m128 
v1 = _mm_unpackhi_ps(u0, u1); + __m128 v3 = _mm_unpackhi_ps(u2, u3); + + _mm_storeu_ps(ptr + 0, v0); + _mm_storeu_ps(ptr + 4, v1); + _mm_storeu_ps(ptr + 8, v2); + _mm_storeu_ps(ptr + 12, v3); +} + inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c) { __m128i t0 = _mm_unpacklo_epi64(a.val, b.val); @@ -1858,7 +1926,7 @@ inline void v_store_interleave( _Tp* ptr, const _Tpvec& a0, const _Tpvec& b0, \ OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8) OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16) OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32) -OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_float32x4, float, f32, v_uint32x4, unsigned, u32) +//OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_float32x4, float, f32, v_uint32x4, unsigned, u32) inline v_float32x4 v_cvt_f32(const v_int32x4& a) { diff --git a/modules/imgproc/src/color_hsv.cpp b/modules/imgproc/src/color_hsv.cpp index 513820c3bc..45f5bdb21d 100644 --- a/modules/imgproc/src/color_hsv.cpp +++ b/modules/imgproc/src/color_hsv.cpp @@ -134,159 +134,116 @@ struct HSV2RGB_f HSV2RGB_f(int _dstcn, int _blueIdx, float _hrange) : dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.f/_hrange) { - #if CV_SSE2 - haveSIMD = checkHardwareSupport(CV_CPU_SSE2); + #if CV_SIMD128 + hasSIMD = hasSIMD128(); #endif } - #if CV_SSE2 - void process(__m128& v_h0, __m128& v_h1, __m128& v_s0, - __m128& v_s1, __m128& v_v0, __m128& v_v1) const + #if CV_SIMD128 + inline void process(v_float32x4& v_h, v_float32x4& v_s, + v_float32x4& v_v, v_float32x4& v_scale) const { - v_h0 = _mm_mul_ps(v_h0, _mm_set1_ps(hscale)); - v_h1 = _mm_mul_ps(v_h1, _mm_set1_ps(hscale)); - - __m128 v_pre_sector0 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_h0)); - __m128 v_pre_sector1 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_h1)); - - v_h0 = _mm_sub_ps(v_h0, v_pre_sector0); - v_h1 = _mm_sub_ps(v_h1, v_pre_sector1); - - __m128 v_tab00 = v_v0; - __m128 v_tab01 = v_v1; - __m128 v_tab10 = _mm_mul_ps(v_v0, _mm_sub_ps(_mm_set1_ps(1.0f), v_s0)); - __m128 v_tab11 = _mm_mul_ps(v_v1, _mm_sub_ps(_mm_set1_ps(1.0f), v_s1)); - __m128 v_tab20 = _mm_mul_ps(v_v0, _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(v_s0, v_h0))); - __m128 v_tab21 = _mm_mul_ps(v_v1, _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(v_s1, v_h1))); - __m128 v_tab30 = _mm_mul_ps(v_v0, _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(v_s0, _mm_sub_ps(_mm_set1_ps(1.0f), v_h0)))); - __m128 v_tab31 = _mm_mul_ps(v_v1, _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(v_s1, _mm_sub_ps(_mm_set1_ps(1.0f), v_h1)))); - - __m128 v_sector0 = _mm_div_ps(v_pre_sector0, _mm_set1_ps(6.0f)); - __m128 v_sector1 = _mm_div_ps(v_pre_sector1, _mm_set1_ps(6.0f)); - v_sector0 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_sector0)); - v_sector1 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_sector1)); - v_sector0 = _mm_mul_ps(v_sector0, _mm_set1_ps(6.0f)); - v_sector1 = _mm_mul_ps(v_sector1, _mm_set1_ps(6.0f)); - v_sector0 = _mm_sub_ps(v_pre_sector0, v_sector0); - v_sector1 = _mm_sub_ps(v_pre_sector1, v_sector1); - - v_h0 = _mm_and_ps(v_tab10, _mm_cmplt_ps(v_sector0, _mm_set1_ps(2.0f))); - v_h1 = _mm_and_ps(v_tab11, _mm_cmplt_ps(v_sector1, _mm_set1_ps(2.0f))); - v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_tab30, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(2.0f)))); - v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_tab31, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(2.0f)))); - v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_tab00, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(3.0f)))); - v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_tab01, 
_mm_cmpeq_ps(v_sector1, _mm_set1_ps(3.0f)))); - v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_tab00, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(4.0f)))); - v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_tab01, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(4.0f)))); - v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_tab20, _mm_cmpgt_ps(v_sector0, _mm_set1_ps(4.0f)))); - v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_tab21, _mm_cmpgt_ps(v_sector1, _mm_set1_ps(4.0f)))); - v_s0 = _mm_and_ps(v_tab30, _mm_cmplt_ps(v_sector0, _mm_set1_ps(1.0f))); - v_s1 = _mm_and_ps(v_tab31, _mm_cmplt_ps(v_sector1, _mm_set1_ps(1.0f))); - v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_tab00, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(1.0f)))); - v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_tab01, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(1.0f)))); - v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_tab00, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(2.0f)))); - v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_tab01, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(2.0f)))); - v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_tab20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(3.0f)))); - v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_tab21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(3.0f)))); - v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_tab10, _mm_cmpgt_ps(v_sector0, _mm_set1_ps(3.0f)))); - v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_tab11, _mm_cmpgt_ps(v_sector1, _mm_set1_ps(3.0f)))); - v_v0 = _mm_and_ps(v_tab00, _mm_cmplt_ps(v_sector0, _mm_set1_ps(1.0f))); - v_v1 = _mm_and_ps(v_tab01, _mm_cmplt_ps(v_sector1, _mm_set1_ps(1.0f))); - v_v0 = _mm_or_ps(v_v0, _mm_and_ps(v_tab20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(1.0f)))); - v_v1 = _mm_or_ps(v_v1, _mm_and_ps(v_tab21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(1.0f)))); - v_v0 = _mm_or_ps(v_v0, _mm_and_ps(v_tab10, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(2.0f)))); - v_v1 = _mm_or_ps(v_v1, _mm_and_ps(v_tab11, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(2.0f)))); - v_v0 = _mm_or_ps(v_v0, _mm_and_ps(v_tab10, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(3.0f)))); - v_v1 = _mm_or_ps(v_v1, _mm_and_ps(v_tab11, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(3.0f)))); - v_v0 = _mm_or_ps(v_v0, _mm_and_ps(v_tab30, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(4.0f)))); - v_v1 = _mm_or_ps(v_v1, _mm_and_ps(v_tab31, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(4.0f)))); - v_v0 = _mm_or_ps(v_v0, _mm_and_ps(v_tab00, _mm_cmpgt_ps(v_sector0, _mm_set1_ps(4.0f)))); - v_v1 = _mm_or_ps(v_v1, _mm_and_ps(v_tab01, _mm_cmpgt_ps(v_sector1, _mm_set1_ps(4.0f)))); + v_h = v_h * v_scale; + v_float32x4 v_pre_sector = v_cvt_f32(v_trunc(v_h)); + v_h = v_h - v_pre_sector; + v_float32x4 v_tab0 = v_v; + v_float32x4 v_one = v_setall_f32(1.0f); + v_float32x4 v_tab1 = v_v * (v_one - v_s); + v_float32x4 v_tab2 = v_v * (v_one - (v_s * v_h)); + v_float32x4 v_tab3 = v_v * (v_one - (v_s * (v_one - v_h))); + + v_float32x4 v_one_sixth = v_setall_f32(1.0f / 6.0f); + v_float32x4 v_sector = v_pre_sector * v_one_sixth; + v_sector = v_cvt_f32(v_trunc(v_sector)); + v_float32x4 v_six = v_setall_f32(6.0f); + v_sector = v_pre_sector - (v_sector * v_six); + + v_float32x4 v_two = v_setall_f32(2.0f); + v_h = v_tab1 & (v_sector < v_two); + v_h = v_h | (v_tab3 & (v_sector == v_two)); + v_float32x4 v_three = v_setall_f32(3.0f); + v_h = v_h | (v_tab0 & (v_sector == v_three)); + v_float32x4 v_four = v_setall_f32(4.0f); + v_h = v_h | (v_tab0 & (v_sector == v_four)); + v_h = v_h | (v_tab2 & (v_sector > v_four)); + + v_s = v_tab3 & (v_sector < v_one); + v_s = v_s | (v_tab0 & (v_sector == v_one)); + v_s = v_s | (v_tab0 & (v_sector == v_two)); + v_s = v_s | (v_tab2 & (v_sector == v_three)); + v_s = v_s | (v_tab1 & (v_sector > v_three)); + + v_v = v_tab0 & (v_sector < v_one); + v_v = v_v | 
(v_tab2 & (v_sector == v_one)); + v_v = v_v | (v_tab1 & (v_sector == v_two)); + v_v = v_v | (v_tab1 & (v_sector == v_three)); + v_v = v_v | (v_tab3 & (v_sector == v_four)); + v_v = v_v | (v_tab0 & (v_sector > v_four)); } #endif void operator()(const float* src, float* dst, int n) const { int i = 0, bidx = blueIdx, dcn = dstcn; - float _hscale = hscale; float alpha = ColorChannel::max(); n *= 3; - #if CV_SSE2 - if (haveSIMD) + #if CV_SIMD128 + if (hasSIMD) { - for( ; i <= n - 24; i += 24, dst += dcn * 8 ) + v_float32x4 v_scale = v_setall_f32(hscale); + if (dcn == 3) { - __m128 v_h0 = _mm_loadu_ps(src + i + 0); - __m128 v_h1 = _mm_loadu_ps(src + i + 4); - __m128 v_s0 = _mm_loadu_ps(src + i + 8); - __m128 v_s1 = _mm_loadu_ps(src + i + 12); - __m128 v_v0 = _mm_loadu_ps(src + i + 16); - __m128 v_v1 = _mm_loadu_ps(src + i + 20); - - _mm_deinterleave_ps(v_h0, v_h1, v_s0, v_s1, v_v0, v_v1); - - process(v_h0, v_h1, v_s0, v_s1, v_v0, v_v1); - - if (dcn == 3) + if (bidx) { - if (bidx) + for (; i <= n - 12; i += 12, dst += dcn * 4) { - _mm_interleave_ps(v_v0, v_v1, v_s0, v_s1, v_h0, v_h1); - - _mm_storeu_ps(dst + 0, v_v0); - _mm_storeu_ps(dst + 4, v_v1); - _mm_storeu_ps(dst + 8, v_s0); - _mm_storeu_ps(dst + 12, v_s1); - _mm_storeu_ps(dst + 16, v_h0); - _mm_storeu_ps(dst + 20, v_h1); + v_float32x4 v_h; + v_float32x4 v_s; + v_float32x4 v_v; + v_load_deinterleave(src + i, v_h, v_s, v_v); + process(v_h, v_s, v_v, v_scale); + v_store_interleave(dst, v_v, v_s, v_h); } - else + } else { + for (; i <= n - 12; i += 12, dst += dcn * 4) { - _mm_interleave_ps(v_h0, v_h1, v_s0, v_s1, v_v0, v_v1); - - _mm_storeu_ps(dst + 0, v_h0); - _mm_storeu_ps(dst + 4, v_h1); - _mm_storeu_ps(dst + 8, v_s0); - _mm_storeu_ps(dst + 12, v_s1); - _mm_storeu_ps(dst + 16, v_v0); - _mm_storeu_ps(dst + 20, v_v1); + v_float32x4 v_h; + v_float32x4 v_s; + v_float32x4 v_v; + v_load_deinterleave(src + i, v_h, v_s, v_v); + process(v_h, v_s, v_v, v_scale); + v_store_interleave(dst, v_h, v_s, v_v); } } - else + } else { // dcn == 4 + v_float32x4 v_a = v_setall_f32(alpha); + if (bidx) { - __m128 v_a0 = _mm_set1_ps(alpha); - __m128 v_a1 = _mm_set1_ps(alpha); - if (bidx) + for (; i <= n - 12; i += 12, dst += dcn * 4) { - _mm_interleave_ps(v_v0, v_v1, v_s0, v_s1, v_h0, v_h1, v_a0, v_a1); - - _mm_storeu_ps(dst + 0, v_v0); - _mm_storeu_ps(dst + 4, v_v1); - _mm_storeu_ps(dst + 8, v_s0); - _mm_storeu_ps(dst + 12, v_s1); - _mm_storeu_ps(dst + 16, v_h0); - _mm_storeu_ps(dst + 20, v_h1); - _mm_storeu_ps(dst + 24, v_a0); - _mm_storeu_ps(dst + 28, v_a1); + v_float32x4 v_h; + v_float32x4 v_s; + v_float32x4 v_v; + v_load_deinterleave(src + i, v_h, v_s, v_v); + process(v_h, v_s, v_v, v_scale); + v_store_interleave(dst, v_v, v_s, v_h, v_a); } - else + } else { + for (; i <= n - 12; i += 12, dst += dcn * 4) { - _mm_interleave_ps(v_h0, v_h1, v_s0, v_s1, v_v0, v_v1, v_a0, v_a1); - - _mm_storeu_ps(dst + 0, v_h0); - _mm_storeu_ps(dst + 4, v_h1); - _mm_storeu_ps(dst + 8, v_s0); - _mm_storeu_ps(dst + 12, v_s1); - _mm_storeu_ps(dst + 16, v_v0); - _mm_storeu_ps(dst + 20, v_v1); - _mm_storeu_ps(dst + 24, v_a0); - _mm_storeu_ps(dst + 28, v_a1); + v_float32x4 v_h; + v_float32x4 v_s; + v_float32x4 v_v; + v_load_deinterleave(src + i, v_h, v_s, v_v); + process(v_h, v_s, v_v, v_scale); + v_store_interleave(dst, v_h, v_s, v_v, v_a); } } } } #endif + for( ; i < n; i += 3, dst += dcn ) { float h = src[i], s = src[i+1], v = src[i+2]; @@ -300,7 +257,7 @@ struct HSV2RGB_f {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}}; float tab[4]; int sector; - h *= _hscale; + h *= hscale; 
if( h < 0 ) do h += 6; while( h < 0 ); else if( h >= 6 ) @@ -333,8 +290,8 @@ struct HSV2RGB_f int dstcn, blueIdx; float hscale; - #if CV_SSE2 - bool haveSIMD; + #if CV_SIMD128 + bool hasSIMD; #endif };
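

For reference, a minimal usage sketch of the new 3- and 4-channel float
overloads, separate from the patch itself. It assumes the patch is applied,
that "opencv2/core/hal/intrin.hpp" is reachable on the include path, and
that a CV_SIMD128 backend is available; the sample values and the main()
harness below are made up for illustration only.

    #include <cstdio>
    #include "opencv2/core/hal/intrin.hpp"

    int main()
    {
        // four packed (h, s, v) triples; channel values are chosen so the
        // lanes are easy to eyeball after deinterleaving
        float src[12] = { 0.f, 10.f, 20.f,  1.f, 11.f, 21.f,
                          2.f, 12.f, 22.f,  3.f, 13.f, 23.f };
        float dst[12];

        cv::v_float32x4 h, s, v;
        cv::v_load_deinterleave(src, h, s, v); // h={0,1,2,3}, s={10,11,12,13}, v={20,21,22,23}
        cv::v_store_interleave(dst, h, s, v);  // packs the lanes back: dst == src

        for (int i = 0; i < 12; i++)
            if (dst[i] != src[i])
            {
                std::printf("3-channel round trip failed at %d\n", i);
                return 1;
            }

        // the 4-register overload transposes via unpacklo/unpackhi and is
        // what the dcn == 4 branch uses to append the alpha plane
        float rgba[16];
        cv::v_float32x4 a = cv::v_setall_f32(1.f);
        cv::v_store_interleave(rgba, h, s, v, a); // 0 10 20 1  1 11 21 1  ...

        std::printf("round trips OK\n");
        return 0;
    }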