add universal intrinsics for HSV2RGB_f

pull/11264/head
k-shinotsuka 7 years ago
parent 099a16bd86
commit fbdcc0e8e4
Changed files (2):
1. modules/core/include/opencv2/core/hal/intrin_sse.hpp (70 lines changed)
2. modules/imgproc/src/color_hsv.cpp (209 lines changed)

modules/core/include/opencv2/core/hal/intrin_sse.hpp:

@@ -1570,6 +1570,39 @@ inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4&
     v_transpose4x4(u0, u1, u2, u3, a, b, c, d);
 }
 
+inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c)
+{
+    __m128 t0 = _mm_loadu_ps(ptr + 0);
+    __m128 t1 = _mm_loadu_ps(ptr + 4);
+    __m128 t2 = _mm_loadu_ps(ptr + 8);
+
+    __m128 at12 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(0, 1, 0, 2));
+    a.val = _mm_shuffle_ps(t0, at12, _MM_SHUFFLE(2, 0, 3, 0));
+
+    __m128 bt01 = _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(0, 0, 0, 1));
+    __m128 bt12 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(0, 2, 0, 3));
+    b.val = _mm_shuffle_ps(bt01, bt12, _MM_SHUFFLE(2, 0, 2, 0));
+
+    __m128 ct01 = _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(0, 1, 0, 2));
+    c.val = _mm_shuffle_ps(ct01, t2, _MM_SHUFFLE(3, 0, 2, 0));
+}
+
+inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c, v_float32x4& d)
+{
+    __m128 t0 = _mm_loadu_ps(ptr + 0);
+    __m128 t1 = _mm_loadu_ps(ptr + 4);
+    __m128 t2 = _mm_loadu_ps(ptr + 8);
+    __m128 t3 = _mm_loadu_ps(ptr + 12);
+    __m128 t02lo = _mm_unpacklo_ps(t0, t2);
+    __m128 t13lo = _mm_unpacklo_ps(t1, t3);
+    __m128 t02hi = _mm_unpackhi_ps(t0, t2);
+    __m128 t13hi = _mm_unpackhi_ps(t1, t3);
+    a.val = _mm_unpacklo_ps(t02lo, t13lo);
+    b.val = _mm_unpackhi_ps(t02lo, t13lo);
+    c.val = _mm_unpacklo_ps(t02hi, t13hi);
+    d.val = _mm_unpackhi_ps(t02hi, t13hi);
+}
+
 inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
 {
     __m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
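
For intuition: the new 3-channel overload splits a packed x,y,z,x,y,z,... stream into three per-channel vectors. A scalar sketch of the same semantics (illustrative only, not part of the patch; load_deinterleave_ref is a made-up name):

// Scalar reference for the 3-channel v_load_deinterleave above.
// The four lanes of a/b/c correspond to elements 0..3 of each channel.
static void load_deinterleave_ref(const float* ptr, float a[4], float b[4], float c[4])
{
    for (int i = 0; i < 4; i++)
    {
        a[i] = ptr[3 * i + 0];  // x0 x1 x2 x3
        b[i] = ptr[3 * i + 1];  // y0 y1 y2 y3
        c[i] = ptr[3 * i + 2];  // z0 z1 z2 z3
    }
}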
@@ -1796,6 +1829,41 @@ inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32
     _mm_storeu_ps((ptr + 4), u1);
 }
 
+inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
+{
+    __m128 u0 = _mm_shuffle_ps(a.val, b.val, _MM_SHUFFLE(0, 0, 0, 0));
+    __m128 u1 = _mm_shuffle_ps(c.val, a.val, _MM_SHUFFLE(1, 1, 0, 0));
+    __m128 v0 = _mm_shuffle_ps(u0, u1, _MM_SHUFFLE(2, 0, 2, 0));
+    __m128 u2 = _mm_shuffle_ps(b.val, c.val, _MM_SHUFFLE(1, 1, 1, 1));
+    __m128 u3 = _mm_shuffle_ps(a.val, b.val, _MM_SHUFFLE(2, 2, 2, 2));
+    __m128 v1 = _mm_shuffle_ps(u2, u3, _MM_SHUFFLE(2, 0, 2, 0));
+    __m128 u4 = _mm_shuffle_ps(c.val, a.val, _MM_SHUFFLE(3, 3, 2, 2));
+    __m128 u5 = _mm_shuffle_ps(b.val, c.val, _MM_SHUFFLE(3, 3, 3, 3));
+    __m128 v2 = _mm_shuffle_ps(u4, u5, _MM_SHUFFLE(2, 0, 2, 0));
+
+    _mm_storeu_ps(ptr + 0, v0);
+    _mm_storeu_ps(ptr + 4, v1);
+    _mm_storeu_ps(ptr + 8, v2);
+}
+
+inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
+                               const v_float32x4& c, const v_float32x4& d)
+{
+    __m128 u0 = _mm_unpacklo_ps(a.val, c.val);
+    __m128 u1 = _mm_unpacklo_ps(b.val, d.val);
+    __m128 u2 = _mm_unpackhi_ps(a.val, c.val);
+    __m128 u3 = _mm_unpackhi_ps(b.val, d.val);
+    __m128 v0 = _mm_unpacklo_ps(u0, u1);
+    __m128 v2 = _mm_unpacklo_ps(u2, u3);
+    __m128 v1 = _mm_unpackhi_ps(u0, u1);
+    __m128 v3 = _mm_unpackhi_ps(u2, u3);
+
+    _mm_storeu_ps(ptr + 0, v0);
+    _mm_storeu_ps(ptr + 4, v1);
+    _mm_storeu_ps(ptr + 8, v2);
+    _mm_storeu_ps(ptr + 12, v3);
+}
+
 inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c)
 {
     __m128i t0 = _mm_unpacklo_epi64(a.val, b.val);
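
The interleave overloads are exact inverses of the deinterleave ones, so a store/load round trip is a convenient sanity check. A hypothetical check (not from the patch), assuming the overloads are reachable through the public cv:: universal-intrinsics API:

#include <opencv2/core/hal/intrin.hpp>
#include <cassert>
#include <cstring>

// Deinterleave then re-interleave must reproduce the original 12 floats.
static void roundtrip_check()
{
    const float src[12] = { 0, 1, 2,  3, 4, 5,  6, 7, 8,  9, 10, 11 };
    cv::v_float32x4 a, b, c;
    cv::v_load_deinterleave(src, a, b, c);

    float dst[12];
    cv::v_store_interleave(dst, a, b, c);
    assert(std::memcmp(src, dst, sizeof(dst)) == 0);
}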
@@ -1858,7 +1926,7 @@ inline void v_store_interleave( _Tp* ptr, const _Tpvec& a0, const _Tpvec& b0, \
 OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)
 OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16)
 OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32)
-OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_float32x4, float, f32, v_uint32x4, unsigned, u32)
+//OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_float32x4, float, f32, v_uint32x4, unsigned, u32)
 
 inline v_float32x4 v_cvt_f32(const v_int32x4& a)
 {

(The generic macro instantiation for v_float32x4 is disabled because the dedicated float overloads added above replace it.)

modules/imgproc/src/color_hsv.cpp:

@@ -134,159 +134,116 @@ struct HSV2RGB_f
     HSV2RGB_f(int _dstcn, int _blueIdx, float _hrange)
     : dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.f/_hrange) {
-        #if CV_SSE2
-        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
+        #if CV_SIMD128
+        hasSIMD = hasSIMD128();
         #endif
     }
 
-    #if CV_SSE2
-    void process(__m128& v_h0, __m128& v_h1, __m128& v_s0,
-                 __m128& v_s1, __m128& v_v0, __m128& v_v1) const
-    {
-        v_h0 = _mm_mul_ps(v_h0, _mm_set1_ps(hscale));
-        v_h1 = _mm_mul_ps(v_h1, _mm_set1_ps(hscale));
-        __m128 v_pre_sector0 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_h0));
-        __m128 v_pre_sector1 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_h1));
-        v_h0 = _mm_sub_ps(v_h0, v_pre_sector0);
-        v_h1 = _mm_sub_ps(v_h1, v_pre_sector1);
-        __m128 v_tab00 = v_v0;
-        __m128 v_tab01 = v_v1;
-        __m128 v_tab10 = _mm_mul_ps(v_v0, _mm_sub_ps(_mm_set1_ps(1.0f), v_s0));
-        __m128 v_tab11 = _mm_mul_ps(v_v1, _mm_sub_ps(_mm_set1_ps(1.0f), v_s1));
-        __m128 v_tab20 = _mm_mul_ps(v_v0, _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(v_s0, v_h0)));
-        __m128 v_tab21 = _mm_mul_ps(v_v1, _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(v_s1, v_h1)));
-        __m128 v_tab30 = _mm_mul_ps(v_v0, _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(v_s0, _mm_sub_ps(_mm_set1_ps(1.0f), v_h0))));
-        __m128 v_tab31 = _mm_mul_ps(v_v1, _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(v_s1, _mm_sub_ps(_mm_set1_ps(1.0f), v_h1))));
-        __m128 v_sector0 = _mm_div_ps(v_pre_sector0, _mm_set1_ps(6.0f));
-        __m128 v_sector1 = _mm_div_ps(v_pre_sector1, _mm_set1_ps(6.0f));
-        v_sector0 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_sector0));
-        v_sector1 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_sector1));
-        v_sector0 = _mm_mul_ps(v_sector0, _mm_set1_ps(6.0f));
-        v_sector1 = _mm_mul_ps(v_sector1, _mm_set1_ps(6.0f));
-        v_sector0 = _mm_sub_ps(v_pre_sector0, v_sector0);
-        v_sector1 = _mm_sub_ps(v_pre_sector1, v_sector1);
-        v_h0 = _mm_and_ps(v_tab10, _mm_cmplt_ps(v_sector0, _mm_set1_ps(2.0f)));
-        v_h1 = _mm_and_ps(v_tab11, _mm_cmplt_ps(v_sector1, _mm_set1_ps(2.0f)));
-        v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_tab30, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(2.0f))));
-        v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_tab31, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(2.0f))));
-        v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_tab00, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(3.0f))));
-        v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_tab01, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(3.0f))));
-        v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_tab00, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(4.0f))));
-        v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_tab01, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(4.0f))));
-        v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_tab20, _mm_cmpgt_ps(v_sector0, _mm_set1_ps(4.0f))));
-        v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_tab21, _mm_cmpgt_ps(v_sector1, _mm_set1_ps(4.0f))));
-        v_s0 = _mm_and_ps(v_tab30, _mm_cmplt_ps(v_sector0, _mm_set1_ps(1.0f)));
-        v_s1 = _mm_and_ps(v_tab31, _mm_cmplt_ps(v_sector1, _mm_set1_ps(1.0f)));
-        v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_tab00, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(1.0f))));
-        v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_tab01, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(1.0f))));
-        v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_tab00, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(2.0f))));
-        v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_tab01, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(2.0f))));
-        v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_tab20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(3.0f))));
-        v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_tab21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(3.0f))));
-        v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_tab10, _mm_cmpgt_ps(v_sector0, _mm_set1_ps(3.0f))));
-        v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_tab11, _mm_cmpgt_ps(v_sector1, _mm_set1_ps(3.0f))));
-        v_v0 = _mm_and_ps(v_tab00, _mm_cmplt_ps(v_sector0, _mm_set1_ps(1.0f)));
-        v_v1 = _mm_and_ps(v_tab01, _mm_cmplt_ps(v_sector1, _mm_set1_ps(1.0f)));
-        v_v0 = _mm_or_ps(v_v0, _mm_and_ps(v_tab20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(1.0f))));
-        v_v1 = _mm_or_ps(v_v1, _mm_and_ps(v_tab21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(1.0f))));
-        v_v0 = _mm_or_ps(v_v0, _mm_and_ps(v_tab10, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(2.0f))));
-        v_v1 = _mm_or_ps(v_v1, _mm_and_ps(v_tab11, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(2.0f))));
-        v_v0 = _mm_or_ps(v_v0, _mm_and_ps(v_tab10, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(3.0f))));
-        v_v1 = _mm_or_ps(v_v1, _mm_and_ps(v_tab11, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(3.0f))));
-        v_v0 = _mm_or_ps(v_v0, _mm_and_ps(v_tab30, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(4.0f))));
-        v_v1 = _mm_or_ps(v_v1, _mm_and_ps(v_tab31, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(4.0f))));
-        v_v0 = _mm_or_ps(v_v0, _mm_and_ps(v_tab00, _mm_cmpgt_ps(v_sector0, _mm_set1_ps(4.0f))));
-        v_v1 = _mm_or_ps(v_v1, _mm_and_ps(v_tab01, _mm_cmpgt_ps(v_sector1, _mm_set1_ps(4.0f))));
+    #if CV_SIMD128
+    inline void process(v_float32x4& v_h, v_float32x4& v_s,
+                        v_float32x4& v_v, v_float32x4& v_scale) const
+    {
+        v_h = v_h * v_scale;
+        v_float32x4 v_pre_sector = v_cvt_f32(v_trunc(v_h));
+        v_h = v_h - v_pre_sector;
+        v_float32x4 v_tab0 = v_v;
+        v_float32x4 v_one = v_setall_f32(1.0f);
+        v_float32x4 v_tab1 = v_v * (v_one - v_s);
+        v_float32x4 v_tab2 = v_v * (v_one - (v_s * v_h));
+        v_float32x4 v_tab3 = v_v * (v_one - (v_s * (v_one - v_h)));
+        v_float32x4 v_one_sixth = v_setall_f32(1.0f / 6.0f);
+        v_float32x4 v_sector = v_pre_sector * v_one_sixth;
+        v_sector = v_cvt_f32(v_trunc(v_sector));
+        v_float32x4 v_six = v_setall_f32(6.0f);
+        v_sector = v_pre_sector - (v_sector * v_six);
+        v_float32x4 v_two = v_setall_f32(2.0f);
+        v_h = v_tab1 & (v_sector < v_two);
+        v_h = v_h | (v_tab3 & (v_sector == v_two));
+        v_float32x4 v_three = v_setall_f32(3.0f);
+        v_h = v_h | (v_tab0 & (v_sector == v_three));
+        v_float32x4 v_four = v_setall_f32(4.0f);
+        v_h = v_h | (v_tab0 & (v_sector == v_four));
+        v_h = v_h | (v_tab2 & (v_sector > v_four));
+        v_s = v_tab3 & (v_sector < v_one);
+        v_s = v_s | (v_tab0 & (v_sector == v_one));
+        v_s = v_s | (v_tab0 & (v_sector == v_two));
+        v_s = v_s | (v_tab2 & (v_sector == v_three));
+        v_s = v_s | (v_tab1 & (v_sector > v_three));
+        v_v = v_tab0 & (v_sector < v_one);
+        v_v = v_v | (v_tab2 & (v_sector == v_one));
+        v_v = v_v | (v_tab1 & (v_sector == v_two));
+        v_v = v_v | (v_tab1 & (v_sector == v_three));
+        v_v = v_v | (v_tab3 & (v_sector == v_four));
+        v_v = v_v | (v_tab0 & (v_sector > v_four));
     }
     #endif
 
     void operator()(const float* src, float* dst, int n) const
     {
         int i = 0, bidx = blueIdx, dcn = dstcn;
-        float _hscale = hscale;
         float alpha = ColorChannel<float>::max();
         n *= 3;
-        #if CV_SSE2
-        if (haveSIMD)
-        {
-            for( ; i <= n - 24; i += 24, dst += dcn * 8 )
-            {
-                __m128 v_h0 = _mm_loadu_ps(src + i + 0);
-                __m128 v_h1 = _mm_loadu_ps(src + i + 4);
-                __m128 v_s0 = _mm_loadu_ps(src + i + 8);
-                __m128 v_s1 = _mm_loadu_ps(src + i + 12);
-                __m128 v_v0 = _mm_loadu_ps(src + i + 16);
-                __m128 v_v1 = _mm_loadu_ps(src + i + 20);
-                _mm_deinterleave_ps(v_h0, v_h1, v_s0, v_s1, v_v0, v_v1);
-                process(v_h0, v_h1, v_s0, v_s1, v_v0, v_v1);
-                if (dcn == 3)
-                {
-                    if (bidx)
-                    {
-                        _mm_interleave_ps(v_v0, v_v1, v_s0, v_s1, v_h0, v_h1);
-                        _mm_storeu_ps(dst + 0, v_v0);
-                        _mm_storeu_ps(dst + 4, v_v1);
-                        _mm_storeu_ps(dst + 8, v_s0);
-                        _mm_storeu_ps(dst + 12, v_s1);
-                        _mm_storeu_ps(dst + 16, v_h0);
-                        _mm_storeu_ps(dst + 20, v_h1);
-                    }
-                    else
-                    {
-                        _mm_interleave_ps(v_h0, v_h1, v_s0, v_s1, v_v0, v_v1);
-                        _mm_storeu_ps(dst + 0, v_h0);
-                        _mm_storeu_ps(dst + 4, v_h1);
-                        _mm_storeu_ps(dst + 8, v_s0);
-                        _mm_storeu_ps(dst + 12, v_s1);
-                        _mm_storeu_ps(dst + 16, v_v0);
-                        _mm_storeu_ps(dst + 20, v_v1);
-                    }
-                }
-                else
-                {
-                    __m128 v_a0 = _mm_set1_ps(alpha);
-                    __m128 v_a1 = _mm_set1_ps(alpha);
-                    if (bidx)
-                    {
-                        _mm_interleave_ps(v_v0, v_v1, v_s0, v_s1, v_h0, v_h1, v_a0, v_a1);
-                        _mm_storeu_ps(dst + 0, v_v0);
-                        _mm_storeu_ps(dst + 4, v_v1);
-                        _mm_storeu_ps(dst + 8, v_s0);
-                        _mm_storeu_ps(dst + 12, v_s1);
-                        _mm_storeu_ps(dst + 16, v_h0);
-                        _mm_storeu_ps(dst + 20, v_h1);
-                        _mm_storeu_ps(dst + 24, v_a0);
-                        _mm_storeu_ps(dst + 28, v_a1);
-                    }
-                    else
-                    {
-                        _mm_interleave_ps(v_h0, v_h1, v_s0, v_s1, v_v0, v_v1, v_a0, v_a1);
-                        _mm_storeu_ps(dst + 0, v_h0);
-                        _mm_storeu_ps(dst + 4, v_h1);
-                        _mm_storeu_ps(dst + 8, v_s0);
-                        _mm_storeu_ps(dst + 12, v_s1);
-                        _mm_storeu_ps(dst + 16, v_v0);
-                        _mm_storeu_ps(dst + 20, v_v1);
-                        _mm_storeu_ps(dst + 24, v_a0);
-                        _mm_storeu_ps(dst + 28, v_a1);
-                    }
-                }
-            }
+        #if CV_SIMD128
+        if (hasSIMD)
+        {
+            v_float32x4 v_scale = v_setall_f32(hscale);
+            if (dcn == 3)
+            {
+                if (bidx)
+                {
+                    for (; i <= n - 12; i += 12, dst += dcn * 4)
+                    {
+                        v_float32x4 v_h;
+                        v_float32x4 v_s;
+                        v_float32x4 v_v;
+                        v_load_deinterleave(src + i, v_h, v_s, v_v);
+                        process(v_h, v_s, v_v, v_scale);
+                        v_store_interleave(dst, v_v, v_s, v_h);
+                    }
+                } else {
+                    for (; i <= n - 12; i += 12, dst += dcn * 4)
+                    {
+                        v_float32x4 v_h;
+                        v_float32x4 v_s;
+                        v_float32x4 v_v;
+                        v_load_deinterleave(src + i, v_h, v_s, v_v);
+                        process(v_h, v_s, v_v, v_scale);
+                        v_store_interleave(dst, v_h, v_s, v_v);
+                    }
+                }
+            } else { // dcn == 4
+                v_float32x4 v_a = v_setall_f32(alpha);
+                if (bidx)
+                {
+                    for (; i <= n - 12; i += 12, dst += dcn * 4)
+                    {
+                        v_float32x4 v_h;
+                        v_float32x4 v_s;
+                        v_float32x4 v_v;
+                        v_load_deinterleave(src + i, v_h, v_s, v_v);
+                        process(v_h, v_s, v_v, v_scale);
+                        v_store_interleave(dst, v_v, v_s, v_h, v_a);
+                    }
+                } else {
+                    for (; i <= n - 12; i += 12, dst += dcn * 4)
+                    {
+                        v_float32x4 v_h;
+                        v_float32x4 v_s;
+                        v_float32x4 v_v;
+                        v_load_deinterleave(src + i, v_h, v_s, v_v);
+                        process(v_h, v_s, v_v, v_scale);
+                        v_store_interleave(dst, v_h, v_s, v_v, v_a);
+                    }
+                }
+            }
         }
         #endif
         for( ; i < n; i += 3, dst += dcn )
         {
             float h = src[i], s = src[i+1], v = src[i+2];
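
A note on the new process(): comparisons such as v_sector < v_two return an all-ones or all-zeros mask per lane, so the & / | chains act as branchless per-lane selects across the six hue sectors, and v_pre_sector - trunc(v_pre_sector * (1/6)) * 6 is just pre_sector mod 6 computed in float arithmetic. A minimal sketch of the select idiom (illustrative, not from the patch; select_demo is a made-up name):

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

// Per lane: pick tab1 where sector < 2, tab3 where sector == 2.
// The comparison masks are disjoint, so OR merges the two cases,
// exactly as in HSV2RGB_f::process().
static v_float32x4 select_demo(const v_float32x4& sector,
                               const v_float32x4& tab1,
                               const v_float32x4& tab3)
{
    v_float32x4 v_two = v_setall_f32(2.0f);
    v_float32x4 r = tab1 & (sector < v_two);
    r = r | (tab3 & (sector == v_two));
    return r;
}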
@@ -300,7 +257,7 @@ struct HSV2RGB_f
                 {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};
             float tab[4];
             int sector;
-            h *= _hscale;
+            h *= hscale;
             if( h < 0 )
                 do h += 6; while( h < 0 );
             else if( h >= 6 )
@@ -333,8 +290,8 @@ struct HSV2RGB_f
     int dstcn, blueIdx;
     float hscale;
-    #if CV_SSE2
-    bool haveSIMD;
+    #if CV_SIMD128
+    bool hasSIMD;
     #endif
 };
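
For context, a hedged sketch of how the converter is driven. HSV2RGB_f is file-local to color_hsv.cpp, so this only illustrates the call shape; it assumes the float hue range of 360 and blueIdx = 0 for BGR output:

// Hypothetical driver: two HSV pixels in, two BGR pixels out.
float hsv[6] = { 120.f, 1.0f, 1.f,    // pure green
                  60.f, 0.5f, 1.f };
float bgr[6];
HSV2RGB_f cvt(3 /*dstcn*/, 0 /*blueIdx*/, 360.f /*hrange*/);
cvt(hsv, bgr, 2);  // n is the pixel count; operator() does n *= 3 internally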
