|
|
|
@ -141,6 +141,39 @@ template<typename _Tp> static inline _Tp splineInterpolate(_Tp x, const _Tp* tab |
|
|
|
|
return ((tab[3]*x + tab[2])*x + tab[1])*x + tab[0]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
#if CV_SSE2 |
|
|
|
|
// SSE2 variant of splineInterpolate: evaluates a piecewise cubic for four
// coordinates at once.
//
// v_x  in:  four table coordinates (already scaled to table units);
//      out: the four interpolated values.
// tab  coefficient table, four consecutive values per segment, used as
//      ((tab[3]*t + tab[2])*t + tab[1])*t + tab[0]; it is read through
//      _mm_loadu_ps, so _Tp has to be float.
// n    upper clamp for the segment index is n - 1; the table must therefore
//      provide at least 4*n readable elements.
template<typename _Tp> static inline void splineInterpolate(__m128& v_x, const _Tp* tab, int n)
{
    // Segment index = coordinate clamped to [0, n-1] and TRUNCATED.
    // _mm_cvtps_epi32 would round to nearest under the default MXCSR mode,
    // which selects the following segment (with a negative offset) for every
    // coordinate whose fractional part is >= 0.5.
    __m128i v_ix = _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(v_x, _mm_setzero_ps()),
                                                _mm_set1_ps(float(n - 1))));

    // Offset inside the selected segment (taken from the unclamped coordinate).
    v_x = _mm_sub_ps(v_x, _mm_cvtepi32_ps(v_ix));

    // Four coefficients per segment -> element offset = index * 4.
    v_ix = _mm_slli_epi32(v_ix, 2);

    // Pull the four offsets out of the vector lane by lane; this needs no
    // aligned scratch buffer.
    const int ofs0 = _mm_cvtsi128_si32(v_ix);
    const int ofs1 = _mm_cvtsi128_si32(_mm_srli_si128(v_ix, 4));
    const int ofs2 = _mm_cvtsi128_si32(_mm_srli_si128(v_ix, 8));
    const int ofs3 = _mm_cvtsi128_si32(_mm_srli_si128(v_ix, 12));

    // One row of coefficients per lane.
    __m128 v_tab0 = _mm_loadu_ps(tab + ofs0);
    __m128 v_tab1 = _mm_loadu_ps(tab + ofs1);
    __m128 v_tab2 = _mm_loadu_ps(tab + ofs2);
    __m128 v_tab3 = _mm_loadu_ps(tab + ofs3);

    // 4x4 transpose: afterwards v_tabK holds coefficient K of all four lanes.
    __m128 v_tmp0 = _mm_unpacklo_ps(v_tab0, v_tab1);
    __m128 v_tmp1 = _mm_unpacklo_ps(v_tab2, v_tab3);
    __m128 v_tmp2 = _mm_unpackhi_ps(v_tab0, v_tab1);
    __m128 v_tmp3 = _mm_unpackhi_ps(v_tab2, v_tab3);

    v_tab0 = _mm_shuffle_ps(v_tmp0, v_tmp1, 0x44);
    v_tab2 = _mm_shuffle_ps(v_tmp2, v_tmp3, 0x44);
    v_tab1 = _mm_shuffle_ps(v_tmp0, v_tmp1, 0xee);
    v_tab3 = _mm_shuffle_ps(v_tmp2, v_tmp3, 0xee);

    // Horner evaluation of the cubic.
    __m128 v_l = _mm_mul_ps(v_x, v_tab3);
    v_l = _mm_add_ps(v_l, v_tab2);
    v_l = _mm_mul_ps(v_l, v_x);
    v_l = _mm_add_ps(v_l, v_tab1);
    v_l = _mm_mul_ps(v_l, v_x);
    v_x = _mm_add_ps(v_l, v_tab0);
}
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
template<typename _Tp> struct ColorChannel |
|
|
|
|
{ |
|
|
|
@ -5766,24 +5799,146 @@ struct RGB2Luv_f |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
float d = 1.f/(whitept[0] + whitept[1]*15 + whitept[2]*3); |
|
|
|
|
un = 4*whitept[0]*d; |
|
|
|
|
vn = 9*whitept[1]*d; |
|
|
|
|
un = 4*whitept[0]*d*13; |
|
|
|
|
vn = 9*whitept[1]*d*13; |
|
|
|
|
|
|
|
|
|
#if CV_SSE2 |
|
|
|
|
haveSIMD = checkHardwareSupport(CV_CPU_SSE2); |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
CV_Assert(whitept[1] == 1.f); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
#if CV_SSE2 |
|
|
|
|
void process(__m128& v_r0, __m128& v_r1, __m128& v_g0, |
|
|
|
|
__m128& v_g1, __m128& v_b0, __m128& v_b1) const |
|
|
|
|
{ |
|
|
|
|
__m128 v_x0 = _mm_mul_ps(v_r0, _mm_set1_ps(coeffs[0])); |
|
|
|
|
__m128 v_x1 = _mm_mul_ps(v_r1, _mm_set1_ps(coeffs[0])); |
|
|
|
|
__m128 v_y0 = _mm_mul_ps(v_r0, _mm_set1_ps(coeffs[3])); |
|
|
|
|
__m128 v_y1 = _mm_mul_ps(v_r1, _mm_set1_ps(coeffs[3])); |
|
|
|
|
__m128 v_z0 = _mm_mul_ps(v_r0, _mm_set1_ps(coeffs[6])); |
|
|
|
|
__m128 v_z1 = _mm_mul_ps(v_r1, _mm_set1_ps(coeffs[6])); |
|
|
|
|
|
|
|
|
|
v_x0 = _mm_add_ps(v_x0, _mm_mul_ps(v_g0, _mm_set1_ps(coeffs[1]))); |
|
|
|
|
v_x1 = _mm_add_ps(v_x1, _mm_mul_ps(v_g1, _mm_set1_ps(coeffs[1]))); |
|
|
|
|
v_y0 = _mm_add_ps(v_y0, _mm_mul_ps(v_g0, _mm_set1_ps(coeffs[4]))); |
|
|
|
|
v_y1 = _mm_add_ps(v_y1, _mm_mul_ps(v_g1, _mm_set1_ps(coeffs[4]))); |
|
|
|
|
v_z0 = _mm_add_ps(v_z0, _mm_mul_ps(v_g0, _mm_set1_ps(coeffs[7]))); |
|
|
|
|
v_z1 = _mm_add_ps(v_z1, _mm_mul_ps(v_g1, _mm_set1_ps(coeffs[7]))); |
|
|
|
|
|
|
|
|
|
v_x0 = _mm_add_ps(v_x0, _mm_mul_ps(v_b0, _mm_set1_ps(coeffs[2]))); |
|
|
|
|
v_x1 = _mm_add_ps(v_x1, _mm_mul_ps(v_b1, _mm_set1_ps(coeffs[2]))); |
|
|
|
|
v_y0 = _mm_add_ps(v_y0, _mm_mul_ps(v_b0, _mm_set1_ps(coeffs[5]))); |
|
|
|
|
v_y1 = _mm_add_ps(v_y1, _mm_mul_ps(v_b1, _mm_set1_ps(coeffs[5]))); |
|
|
|
|
v_z0 = _mm_add_ps(v_z0, _mm_mul_ps(v_b0, _mm_set1_ps(coeffs[8]))); |
|
|
|
|
v_z1 = _mm_add_ps(v_z1, _mm_mul_ps(v_b1, _mm_set1_ps(coeffs[8]))); |
|
|
|
|
|
|
|
|
|
__m128 v_l0 = _mm_mul_ps(v_y0, _mm_set1_ps(LabCbrtTabScale)); |
|
|
|
|
__m128 v_l1 = _mm_mul_ps(v_y1, _mm_set1_ps(LabCbrtTabScale)); |
|
|
|
|
splineInterpolate(v_l0, LabCbrtTab, LAB_CBRT_TAB_SIZE); |
|
|
|
|
splineInterpolate(v_l1, LabCbrtTab, LAB_CBRT_TAB_SIZE); |
|
|
|
|
|
|
|
|
|
v_l0 = _mm_mul_ps(v_l0, _mm_set1_ps(116.0f)); |
|
|
|
|
v_l1 = _mm_mul_ps(v_l1, _mm_set1_ps(116.0f)); |
|
|
|
|
v_r0 = _mm_sub_ps(v_l0, _mm_set1_ps(16.0f)); |
|
|
|
|
v_r1 = _mm_sub_ps(v_l1, _mm_set1_ps(16.0f)); |
|
|
|
|
|
|
|
|
|
v_z0 = _mm_mul_ps(v_z0, _mm_set1_ps(3.0f)); |
|
|
|
|
v_z1 = _mm_mul_ps(v_z1, _mm_set1_ps(3.0f)); |
|
|
|
|
v_z0 = _mm_add_ps(v_z0, v_x0); |
|
|
|
|
v_z1 = _mm_add_ps(v_z1, v_x1); |
|
|
|
|
v_z0 = _mm_add_ps(v_z0, _mm_mul_ps(v_y0, _mm_set1_ps(15.0f))); |
|
|
|
|
v_z1 = _mm_add_ps(v_z1, _mm_mul_ps(v_y1, _mm_set1_ps(15.0f))); |
|
|
|
|
v_z0 = _mm_max_ps(v_z0, _mm_set1_ps(FLT_EPSILON)); |
|
|
|
|
v_z1 = _mm_max_ps(v_z1, _mm_set1_ps(FLT_EPSILON)); |
|
|
|
|
__m128 v_d0 = _mm_div_ps(_mm_set1_ps(52.0f), v_z0); |
|
|
|
|
__m128 v_d1 = _mm_div_ps(_mm_set1_ps(52.0f), v_z1); |
|
|
|
|
|
|
|
|
|
v_x0 = _mm_mul_ps(v_x0, v_d0); |
|
|
|
|
v_x1 = _mm_mul_ps(v_x1, v_d1); |
|
|
|
|
v_x0 = _mm_sub_ps(v_x0, _mm_set1_ps(un)); |
|
|
|
|
v_x1 = _mm_sub_ps(v_x1, _mm_set1_ps(un)); |
|
|
|
|
v_g0 = _mm_mul_ps(v_x0, v_r0); |
|
|
|
|
v_g1 = _mm_mul_ps(v_x1, v_r1); |
|
|
|
|
|
|
|
|
|
v_y0 = _mm_mul_ps(v_y0, v_d0); |
|
|
|
|
v_y1 = _mm_mul_ps(v_y1, v_d1); |
|
|
|
|
v_y0 = _mm_mul_ps(v_y0, _mm_set1_ps(2.25f)); |
|
|
|
|
v_y1 = _mm_mul_ps(v_y1, _mm_set1_ps(2.25f)); |
|
|
|
|
v_y0 = _mm_sub_ps(v_y0, _mm_set1_ps(vn)); |
|
|
|
|
v_y1 = _mm_sub_ps(v_y1, _mm_set1_ps(vn)); |
|
|
|
|
v_b0 = _mm_mul_ps(v_y0, v_r0); |
|
|
|
|
v_b1 = _mm_mul_ps(v_y1, v_r1); |
|
|
|
|
} |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
void operator()(const float* src, float* dst, int n) const |
|
|
|
|
{ |
|
|
|
|
int i, scn = srccn; |
|
|
|
|
int i = 0, scn = srccn; |
|
|
|
|
float gscale = GammaTabScale; |
|
|
|
|
const float* gammaTab = srgb ? sRGBGammaTab : 0; |
|
|
|
|
float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], |
|
|
|
|
C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5], |
|
|
|
|
C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; |
|
|
|
|
float _un = 13*un, _vn = 13*vn; |
|
|
|
|
n *= 3; |
|
|
|
|
|
|
|
|
|
for( i = 0; i < n; i += 3, src += scn ) |
|
|
|
|
#if CV_SSE2 |
|
|
|
|
if (haveSIMD) |
|
|
|
|
{ |
|
|
|
|
for( ; i <= n - 24; i += 24, src += scn * 8 ) |
|
|
|
|
{ |
|
|
|
|
__m128 v_r0 = _mm_loadu_ps(src + 0); |
|
|
|
|
__m128 v_r1 = _mm_loadu_ps(src + 4); |
|
|
|
|
__m128 v_g0 = _mm_loadu_ps(src + 8); |
|
|
|
|
__m128 v_g1 = _mm_loadu_ps(src + 12); |
|
|
|
|
__m128 v_b0 = _mm_loadu_ps(src + 16); |
|
|
|
|
__m128 v_b1 = _mm_loadu_ps(src + 20); |
|
|
|
|
|
|
|
|
|
if (scn == 3) |
|
|
|
|
{ |
|
|
|
|
_mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); |
|
|
|
|
} |
|
|
|
|
else |
|
|
|
|
{ |
|
|
|
|
__m128 v_a0 = _mm_loadu_ps(src + 24); |
|
|
|
|
__m128 v_a1 = _mm_loadu_ps(src + 28); |
|
|
|
|
|
|
|
|
|
_mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if ( gammaTab ) |
|
|
|
|
{ |
|
|
|
|
__m128 v_gscale = _mm_set1_ps(gscale); |
|
|
|
|
v_r0 = _mm_mul_ps(v_r0, v_gscale); |
|
|
|
|
v_r1 = _mm_mul_ps(v_r1, v_gscale); |
|
|
|
|
v_g0 = _mm_mul_ps(v_g0, v_gscale); |
|
|
|
|
v_g1 = _mm_mul_ps(v_g1, v_gscale); |
|
|
|
|
v_b0 = _mm_mul_ps(v_b0, v_gscale); |
|
|
|
|
v_b1 = _mm_mul_ps(v_b1, v_gscale); |
|
|
|
|
|
|
|
|
|
splineInterpolate(v_r0, gammaTab, GAMMA_TAB_SIZE); |
|
|
|
|
splineInterpolate(v_r1, gammaTab, GAMMA_TAB_SIZE); |
|
|
|
|
splineInterpolate(v_g0, gammaTab, GAMMA_TAB_SIZE); |
|
|
|
|
splineInterpolate(v_g1, gammaTab, GAMMA_TAB_SIZE); |
|
|
|
|
splineInterpolate(v_b0, gammaTab, GAMMA_TAB_SIZE); |
|
|
|
|
splineInterpolate(v_b1, gammaTab, GAMMA_TAB_SIZE); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
process(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); |
|
|
|
|
|
|
|
|
|
_mm_interleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); |
|
|
|
|
|
|
|
|
|
_mm_storeu_ps(dst + i + 0, v_r0); |
|
|
|
|
_mm_storeu_ps(dst + i + 4, v_r1); |
|
|
|
|
_mm_storeu_ps(dst + i + 8, v_g0); |
|
|
|
|
_mm_storeu_ps(dst + i + 12, v_g1); |
|
|
|
|
_mm_storeu_ps(dst + i + 16, v_b0); |
|
|
|
|
_mm_storeu_ps(dst + i + 20, v_b1); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
#endif |
|
|
|
|
for( ; i < n; i += 3, src += scn ) |
|
|
|
|
{ |
|
|
|
|
float R = src[0], G = src[1], B = src[2]; |
|
|
|
|
if( gammaTab ) |
|
|
|
@ -5801,8 +5956,8 @@ struct RGB2Luv_f |
|
|
|
|
L = 116.f*L - 16.f; |
|
|
|
|
|
|
|
|
|
float d = (4*13) / std::max(X + 15 * Y + 3 * Z, FLT_EPSILON); |
|
|
|
|
float u = L*(X*d - _un); |
|
|
|
|
float v = L*((9*0.25f)*Y*d - _vn); |
|
|
|
|
float u = L*(X*d - un); |
|
|
|
|
float v = L*((9*0.25f)*Y*d - vn); |
|
|
|
|
|
|
|
|
|
dst[i] = L; dst[i+1] = u; dst[i+2] = v; |
|
|
|
|
} |
|
|
|
@ -5811,6 +5966,9 @@ struct RGB2Luv_f |
|
|
|
|
int srccn; |
|
|
|
|
float coeffs[9], un, vn; |
|
|
|
|
bool srgb; |
|
|
|
|
#if CV_SSE2 |
|
|
|
|
bool haveSIMD; |
|
|
|
|
#endif |
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|