Merge pull request #11430 from K-Shinotsuka:issue43

commit 566cb5e3d8 by Vadim Pisarevsky, 7 years ago
      modules/imgproc/src/color_hsv.cpp

@@ -855,167 +855,111 @@ struct HLS2RGB_f
     HLS2RGB_f(int _dstcn, int _blueIdx, float _hrange)
     : dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.f/_hrange) {
-        #if CV_SSE2
-        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
+        #if CV_SIMD128
+        hasSIMD = hasSIMD128();
         #endif
     }
-    #if CV_SSE2
-    void process(__m128& v_h0, __m128& v_h1, __m128& v_l0,
-                 __m128& v_l1, __m128& v_s0, __m128& v_s1) const
+    #if CV_SIMD128
+    inline void process(v_float32x4& v_h, v_float32x4& v_l, v_float32x4& v_s) const
     {
-        __m128 v_lel0 = _mm_cmple_ps(v_l0, _mm_set1_ps(0.5f));
-        __m128 v_lel1 = _mm_cmple_ps(v_l1, _mm_set1_ps(0.5f));
-        __m128 v_p20 = _mm_andnot_ps(v_lel0, _mm_sub_ps(_mm_add_ps(v_l0, v_s0), _mm_mul_ps(v_l0, v_s0)));
-        __m128 v_p21 = _mm_andnot_ps(v_lel1, _mm_sub_ps(_mm_add_ps(v_l1, v_s1), _mm_mul_ps(v_l1, v_s1)));
-        v_p20 = _mm_or_ps(v_p20, _mm_and_ps(v_lel0, _mm_mul_ps(v_l0, _mm_add_ps(_mm_set1_ps(1.0f), v_s0))));
-        v_p21 = _mm_or_ps(v_p21, _mm_and_ps(v_lel1, _mm_mul_ps(v_l1, _mm_add_ps(_mm_set1_ps(1.0f), v_s1))));
-        __m128 v_p10 = _mm_sub_ps(_mm_mul_ps(_mm_set1_ps(2.0f), v_l0), v_p20);
-        __m128 v_p11 = _mm_sub_ps(_mm_mul_ps(_mm_set1_ps(2.0f), v_l1), v_p21);
-        v_h0 = _mm_mul_ps(v_h0, _mm_set1_ps(hscale));
-        v_h1 = _mm_mul_ps(v_h1, _mm_set1_ps(hscale));
-        __m128 v_pre_sector0 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_h0));
-        __m128 v_pre_sector1 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_h1));
-        v_h0 = _mm_sub_ps(v_h0, v_pre_sector0);
-        v_h1 = _mm_sub_ps(v_h1, v_pre_sector1);
-        __m128 v_p2_p10 = _mm_sub_ps(v_p20, v_p10);
-        __m128 v_p2_p11 = _mm_sub_ps(v_p21, v_p11);
-        __m128 v_tab20 = _mm_add_ps(v_p10, _mm_mul_ps(v_p2_p10, _mm_sub_ps(_mm_set1_ps(1.0f), v_h0)));
-        __m128 v_tab21 = _mm_add_ps(v_p11, _mm_mul_ps(v_p2_p11, _mm_sub_ps(_mm_set1_ps(1.0f), v_h1)));
-        __m128 v_tab30 = _mm_add_ps(v_p10, _mm_mul_ps(v_p2_p10, v_h0));
-        __m128 v_tab31 = _mm_add_ps(v_p11, _mm_mul_ps(v_p2_p11, v_h1));
-        __m128 v_sector0 = _mm_div_ps(v_pre_sector0, _mm_set1_ps(6.0f));
-        __m128 v_sector1 = _mm_div_ps(v_pre_sector1, _mm_set1_ps(6.0f));
-        v_sector0 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_sector0));
-        v_sector1 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_sector1));
-        v_sector0 = _mm_mul_ps(v_sector0, _mm_set1_ps(6.0f));
-        v_sector1 = _mm_mul_ps(v_sector1, _mm_set1_ps(6.0f));
-        v_sector0 = _mm_sub_ps(v_pre_sector0, v_sector0);
-        v_sector1 = _mm_sub_ps(v_pre_sector1, v_sector1);
-        v_h0 = _mm_and_ps(v_p10, _mm_cmplt_ps(v_sector0, _mm_set1_ps(2.0f)));
-        v_h1 = _mm_and_ps(v_p11, _mm_cmplt_ps(v_sector1, _mm_set1_ps(2.0f)));
-        v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_tab30, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(2.0f))));
-        v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_tab31, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(2.0f))));
-        v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_p20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(3.0f))));
-        v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_p21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(3.0f))));
-        v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_p20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(4.0f))));
-        v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_p21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(4.0f))));
-        v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_tab20, _mm_cmpgt_ps(v_sector0, _mm_set1_ps(4.0f))));
-        v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_tab21, _mm_cmpgt_ps(v_sector1, _mm_set1_ps(4.0f))));
-        v_l0 = _mm_and_ps(v_tab30, _mm_cmplt_ps(v_sector0, _mm_set1_ps(1.0f)));
-        v_l1 = _mm_and_ps(v_tab31, _mm_cmplt_ps(v_sector1, _mm_set1_ps(1.0f)));
-        v_l0 = _mm_or_ps(v_l0, _mm_and_ps(v_p20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(1.0f))));
-        v_l1 = _mm_or_ps(v_l1, _mm_and_ps(v_p21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(1.0f))));
-        v_l0 = _mm_or_ps(v_l0, _mm_and_ps(v_p20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(2.0f))));
-        v_l1 = _mm_or_ps(v_l1, _mm_and_ps(v_p21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(2.0f))));
-        v_l0 = _mm_or_ps(v_l0, _mm_and_ps(v_tab20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(3.0f))));
-        v_l1 = _mm_or_ps(v_l1, _mm_and_ps(v_tab21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(3.0f))));
-        v_l0 = _mm_or_ps(v_l0, _mm_and_ps(v_p10, _mm_cmpgt_ps(v_sector0, _mm_set1_ps(3.0f))));
-        v_l1 = _mm_or_ps(v_l1, _mm_and_ps(v_p11, _mm_cmpgt_ps(v_sector1, _mm_set1_ps(3.0f))));
-        v_s0 = _mm_and_ps(v_p20, _mm_cmplt_ps(v_sector0, _mm_set1_ps(1.0f)));
-        v_s1 = _mm_and_ps(v_p21, _mm_cmplt_ps(v_sector1, _mm_set1_ps(1.0f)));
-        v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_tab20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(1.0f))));
-        v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_tab21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(1.0f))));
-        v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_p10, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(2.0f))));
-        v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_p11, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(2.0f))));
-        v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_p10, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(3.0f))));
-        v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_p11, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(3.0f))));
-        v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_tab30, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(4.0f))));
-        v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_tab31, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(4.0f))));
-        v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_p20, _mm_cmpgt_ps(v_sector0, _mm_set1_ps(4.0f))));
-        v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_p21, _mm_cmpgt_ps(v_sector1, _mm_set1_ps(4.0f))));
+        v_float32x4 v_one = v_setall_f32(1.0f);
+        v_float32x4 v_l_le_half = v_l <= v_setall_f32(0.5f);
+        v_float32x4 v_ls = v_l * v_s;
+        v_float32x4 v_elem0 = v_select(v_l_le_half, v_ls, v_s - v_ls);
+        v_float32x4 v_hs_raw = v_h * v_setall_f32(hscale);
+        v_float32x4 v_pre_hs = v_cvt_f32(v_trunc(v_hs_raw));
+        v_float32x4 v_hs = v_hs_raw - v_pre_hs;
+        v_float32x4 v_sector = v_pre_hs - v_setall_f32(6.0f) * v_cvt_f32(v_trunc(v_hs_raw * v_setall_f32(1.0f / 6.0f)));
+        v_float32x4 v_elem1 = v_hs + v_hs;
+        v_float32x4 v_tab0 = v_l + v_elem0;
+        v_float32x4 v_tab1 = v_l - v_elem0;
+        v_float32x4 v_tab2 = v_l + v_elem0 - v_elem0 * v_elem1;
+        v_float32x4 v_tab3 = v_l - v_elem0 + v_elem0 * v_elem1;
+        v_float32x4 v_two = v_setall_f32(2.0f);
+        v_float32x4 v_four = v_setall_f32(4.0f);
+        v_h = v_select(v_sector < v_two , v_tab1,
+              v_select(v_sector <= v_two , v_tab3,
+              v_select(v_sector <= v_four, v_tab0, v_tab2)));
+        v_l = v_select(v_sector < v_one , v_tab3,
+              v_select(v_sector <= v_two , v_tab0,
+              v_select(v_sector < v_four, v_tab2, v_tab1)));
+        v_s = v_select(v_sector < v_one , v_tab0,
+              v_select(v_sector < v_two , v_tab2,
+              v_select(v_sector < v_four, v_tab1,
+              v_select(v_sector <= v_four, v_tab3, v_tab0))));
     }
     #endif
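Note (not part of the diff): the rewritten `process()` replaces the SSE2 compare/and/andnot/or masking above with `v_select` from OpenCV's universal intrinsics, which picks each lane from one of two vectors according to a comparison mask. A minimal sketch of that primitive, assuming a build where `opencv2/core/hal/intrin.hpp` is available (with `CV_SIMD128` it maps to SSE2/NEON, otherwise to the scalar fallback):

```cpp
#include <cstdio>
#include <opencv2/core/hal/intrin.hpp>

int main()
{
    using namespace cv;
    // Four "sector" values, one per lane.
    v_float32x4 sector(0.f, 1.f, 3.f, 5.f);
    v_float32x4 a = v_setall_f32(10.f), b = v_setall_f32(20.f);
    // Per lane: take a where sector < 2, otherwise b -- the same branchless
    // pattern the new process() chains to pick tab0..tab3 per pixel.
    v_float32x4 r = v_select(sector < v_setall_f32(2.f), a, b);
    float buf[4];
    v_store(buf, r);
    printf("%g %g %g %g\n", buf[0], buf[1], buf[2], buf[3]);  // 10 10 20 20
    return 0;
}
```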
     void operator()(const float* src, float* dst, int n) const
     {
         int i = 0, bidx = blueIdx, dcn = dstcn;
-        float _hscale = hscale;
         float alpha = ColorChannel<float>::max();
         n *= 3;
-        #if CV_SSE2
-        if (haveSIMD)
+        #if CV_SIMD128
+        if (hasSIMD)
         {
-            for( ; i <= n - 24; i += 24, dst += dcn * 8 )
+            if (dcn == 3)
             {
-                __m128 v_h0 = _mm_loadu_ps(src + i + 0);
-                __m128 v_h1 = _mm_loadu_ps(src + i + 4);
-                __m128 v_l0 = _mm_loadu_ps(src + i + 8);
-                __m128 v_l1 = _mm_loadu_ps(src + i + 12);
-                __m128 v_s0 = _mm_loadu_ps(src + i + 16);
-                __m128 v_s1 = _mm_loadu_ps(src + i + 20);
-                _mm_deinterleave_ps(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1);
-                process(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1);
-                if (dcn == 3)
+                if (bidx)
                 {
-                    if (bidx)
+                    for (; i <= n - 12; i += 12, dst += dcn * 4)
                     {
-                        _mm_interleave_ps(v_s0, v_s1, v_l0, v_l1, v_h0, v_h1);
-                        _mm_storeu_ps(dst + 0, v_s0);
-                        _mm_storeu_ps(dst + 4, v_s1);
-                        _mm_storeu_ps(dst + 8, v_l0);
-                        _mm_storeu_ps(dst + 12, v_l1);
-                        _mm_storeu_ps(dst + 16, v_h0);
-                        _mm_storeu_ps(dst + 20, v_h1);
+                        v_float32x4 v_h;
+                        v_float32x4 v_l;
+                        v_float32x4 v_s;
+                        v_load_deinterleave(src + i, v_h, v_l, v_s);
+                        process(v_h, v_l, v_s);
+                        v_store_interleave(dst, v_s, v_l, v_h);
                     }
-                    else
+                } else {
+                    for (; i <= n - 12; i += 12, dst += dcn * 4)
                     {
-                        _mm_interleave_ps(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1);
-                        _mm_storeu_ps(dst + 0, v_h0);
-                        _mm_storeu_ps(dst + 4, v_h1);
-                        _mm_storeu_ps(dst + 8, v_l0);
-                        _mm_storeu_ps(dst + 12, v_l1);
-                        _mm_storeu_ps(dst + 16, v_s0);
-                        _mm_storeu_ps(dst + 20, v_s1);
+                        v_float32x4 v_h;
+                        v_float32x4 v_l;
+                        v_float32x4 v_s;
+                        v_load_deinterleave(src + i, v_h, v_l, v_s);
+                        process(v_h, v_l, v_s);
+                        v_store_interleave(dst, v_h, v_l, v_s);
                     }
                 }
-                else
+            } else { // dcn == 4
+                if (bidx)
                 {
-                    __m128 v_a0 = _mm_set1_ps(alpha);
-                    __m128 v_a1 = _mm_set1_ps(alpha);
-                    if (bidx)
+                    for (; i <= n - 12; i += 12, dst += dcn * 4)
                     {
-                        _mm_interleave_ps(v_s0, v_s1, v_l0, v_l1, v_h0, v_h1, v_a0, v_a1);
-                        _mm_storeu_ps(dst + 0, v_s0);
-                        _mm_storeu_ps(dst + 4, v_s1);
-                        _mm_storeu_ps(dst + 8, v_l0);
-                        _mm_storeu_ps(dst + 12, v_l1);
-                        _mm_storeu_ps(dst + 16, v_h0);
-                        _mm_storeu_ps(dst + 20, v_h1);
-                        _mm_storeu_ps(dst + 24, v_a0);
-                        _mm_storeu_ps(dst + 28, v_a1);
+                        v_float32x4 v_h;
+                        v_float32x4 v_l;
+                        v_float32x4 v_s;
+                        v_load_deinterleave(src + i, v_h, v_l, v_s);
+                        process(v_h, v_l, v_s);
+                        v_float32x4 v_a = v_setall_f32(alpha);
+                        v_store_interleave(dst, v_s, v_l, v_h, v_a);
                     }
-                    else
+                } else {
+                    for (; i <= n - 12; i += 12, dst += dcn * 4)
                     {
-                        _mm_interleave_ps(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1, v_a0, v_a1);
-                        _mm_storeu_ps(dst + 0, v_h0);
-                        _mm_storeu_ps(dst + 4, v_h1);
-                        _mm_storeu_ps(dst + 8, v_l0);
-                        _mm_storeu_ps(dst + 12, v_l1);
-                        _mm_storeu_ps(dst + 16, v_s0);
-                        _mm_storeu_ps(dst + 20, v_s1);
-                        _mm_storeu_ps(dst + 24, v_a0);
-                        _mm_storeu_ps(dst + 28, v_a1);
+                        v_float32x4 v_h;
+                        v_float32x4 v_l;
+                        v_float32x4 v_s;
+                        v_load_deinterleave(src + i, v_h, v_l, v_s);
+                        process(v_h, v_l, v_s);
+                        v_float32x4 v_a = v_setall_f32(alpha);
+                        v_store_interleave(dst, v_h, v_l, v_s, v_a);
                     }
                 }
             }
         }
         #endif
         for( ; i < n; i += 3, dst += dcn )
         {
             float h = src[i], l = src[i+1], s = src[i+2];
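Note (not part of the diff): channel gathering and scattering now go through `v_load_deinterleave` / `v_store_interleave` instead of the manual `_mm_deinterleave_ps` / `_mm_interleave_ps` shuffles, and the loop accordingly steps 4 pixels (12 floats) per iteration instead of 8. A small round-trip sketch of those two calls, under the same assumptions as the note above:

```cpp
#include <cstdio>
#include <opencv2/core/hal/intrin.hpp>

int main()
{
    using namespace cv;
    float src[12], dst[12];
    for (int k = 0; k < 12; ++k) src[k] = (float)k;   // H0 L0 S0 H1 L1 S1 ... (4 pixels)

    v_float32x4 v_h, v_l, v_s;
    v_load_deinterleave(src, v_h, v_l, v_s);   // v_h = {0,3,6,9}, v_l = {1,4,7,10}, v_s = {2,5,8,11}
    v_store_interleave(dst, v_s, v_l, v_h);    // re-pack with channels 0 and 2 swapped (the bidx case)

    for (int k = 0; k < 12; ++k) printf("%g ", dst[k]);
    printf("\n");                              // 2 1 0 5 4 3 8 7 6 11 10 9
    return 0;
}
```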
@@ -1033,7 +977,7 @@ struct HLS2RGB_f
             float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
             float p1 = 2*l - p2;
-            h *= _hscale;
+            h *= hscale;
             if( h < 0 )
                 do h += 6; while( h < 0 );
             else if( h >= 6 )
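Note (not part of the diff): for reference, the per-pixel math that both this scalar tail and the vectorized `process()` implement, written out for one HLS pixel. The 6/360 hue scale applies to the 32-bit float path, and the per-sector channel table is the one the `v_select` chains above encode; treat both as assumptions to the extent this hunk does not show them:

```cpp
#include <cstdio>

// One-pixel HLS -> BGR reference (H in [0,360), L and S in [0,1]).
static void hls2bgr(float h, float l, float s, float bgr[3])
{
    float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
    float p1 = 2*l - p2;
    h *= 6.f/360.f;                          // hscale for float images
    if( h < 0 )
        do h += 6; while( h < 0 );
    else if( h >= 6 )
        do h -= 6; while( h >= 6 );
    int sector = (int)h;                     // h is now in [0, 6), so a plain cast floors it
    h -= sector;
    float tab[4] = { p2, p1, p1 + (p2 - p1)*(1.f - h), p1 + (p2 - p1)*h };
    // Per-sector channel order (B, G, R), matching the v_select chains above.
    static const int sector_data[6][3] = { {1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0} };
    bgr[0] = tab[sector_data[sector][0]];
    bgr[1] = tab[sector_data[sector][1]];
    bgr[2] = tab[sector_data[sector][2]];
}

int main()
{
    float bgr[3];
    hls2bgr(120.f, 0.5f, 1.f, bgr);          // hue 120 degrees, mid lightness, full saturation
    printf("B=%g G=%g R=%g\n", bgr[0], bgr[1], bgr[2]);  // B=0 G=1 R=0 (pure green)
    return 0;
}
```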
@@ -1063,8 +1007,8 @@ struct HLS2RGB_f
     int dstcn, blueIdx;
     float hscale;
-    #if CV_SSE2
-    bool haveSIMD;
+    #if CV_SIMD128
+    bool hasSIMD;
     #endif
 };
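Note (not part of the diff): `HLS2RGB_f` is an internal functor; for float images the public entry point that reaches it is `cv::cvtColor` with `COLOR_HLS2BGR` / `COLOR_HLS2RGB`. A quick smoke test of that path:

```cpp
#include <cstdio>
#include <opencv2/imgproc.hpp>

int main()
{
    // One float HLS pixel: H in degrees [0,360), L and S in [0,1].
    cv::Mat hls(1, 1, CV_32FC3, cv::Scalar(120.f, 0.5f, 1.f));
    cv::Mat bgr;
    cv::cvtColor(hls, bgr, cv::COLOR_HLS2BGR);
    cv::Vec3f px = bgr.at<cv::Vec3f>(0, 0);
    printf("B=%g G=%g R=%g\n", px[0], px[1], px[2]);  // expect roughly 0 1 0 (green)
    return 0;
}
```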
