|
|
|
@ -144,7 +144,7 @@ template<typename _Tp> static inline _Tp splineInterpolate(_Tp x, const _Tp* tab |
|
|
|
|
#if CV_SSE2 |
|
|
|
|
template<typename _Tp> static inline void splineInterpolate(__m128& v_x, const _Tp* tab, int n) |
|
|
|
|
{ |
|
|
|
|
__m128i v_ix = _mm_cvtps_epi32(_mm_min_ps(_mm_max_ps(v_x, _mm_setzero_ps()), _mm_set1_ps(float(n - 1)))); |
|
|
|
|
__m128i v_ix = _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(v_x, _mm_setzero_ps()), _mm_set1_ps(float(n - 1)))); |
|
|
|
|
v_x = _mm_sub_ps(v_x, _mm_cvtepi32_ps(v_ix)); |
|
|
|
|
v_ix = _mm_slli_epi32(v_ix, 2); |
|
|
|
|
|
|
|
|
@ -5474,11 +5474,106 @@ struct Lab2RGB_f |
|
|
|
|
// Fold the white point into the XYZ->RGB matrix rows; blueIdx bakes
// the output channel order (RGB vs BGR) into coeffs[] so the per-pixel
// code need not branch on it.
coeffs[i+3] = _coeffs[i+3]*_whitept[i];
coeffs[i+blueIdx*3] = _coeffs[i+6]*_whitept[i];
}

// CIE Lab constants: 903.3 (kappa) and 0.008856 (epsilon).
// lThresh: L* value below which the linear segment of the Lab curve applies.
lThresh = 0.008856f * 903.3f;
// fThresh: matching threshold on f(t) = 7.787*t + 16/116 for the inverse.
fThresh = 7.787f * 0.008856f + 16.0f / 116.0f;
#if CV_SSE2
// Runtime CPU check; the vectorized path in operator() runs only when set.
haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
#endif
}
|
|
|
|
|
|
|
|
|
#if CV_SSE2 |
|
|
|
|
// Vectorized Lab -> linear RGB for 8 pixels (two __m128 lanes per
// channel). Inputs are the deinterleaved L, a, b planes; the results
// (R, G, B clamped to [0, 1]) are written back into the same registers.
void process(__m128& v_li0, __m128& v_li1, __m128& v_ai0,
__m128& v_ai1, __m128& v_bi0, __m128& v_bi1) const
{
// Branch 1 (low L, linear segment): y = L/903.3, fy = 7.787*y + 16/116.
__m128 v_y00 = _mm_mul_ps(v_li0, _mm_set1_ps(1.0f/903.3f));
__m128 v_y01 = _mm_mul_ps(v_li1, _mm_set1_ps(1.0f/903.3f));
__m128 v_fy00 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(7.787f), v_y00), _mm_set1_ps(16.0f/116.0f));
__m128 v_fy01 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(7.787f), v_y01), _mm_set1_ps(16.0f/116.0f));

// Branch 2 (high L, cubic segment): fy = (L + 16)/116, y = fy^3.
__m128 v_fy10 = _mm_mul_ps(_mm_add_ps(v_li0, _mm_set1_ps(16.0f)), _mm_set1_ps(1.0f/116.0f));
__m128 v_fy11 = _mm_mul_ps(_mm_add_ps(v_li1, _mm_set1_ps(16.0f)), _mm_set1_ps(1.0f/116.0f));
__m128 v_y10 = _mm_mul_ps(_mm_mul_ps(v_fy10, v_fy10), v_fy10);
__m128 v_y11 = _mm_mul_ps(_mm_mul_ps(v_fy11, v_fy11), v_fy11);

// Branchless select on L <= lThresh: and/andnot/or keeps branch-1
// lanes where the mask is set and branch-2 lanes elsewhere.
__m128 v_cmpli0 = _mm_cmple_ps(v_li0, _mm_set1_ps(lThresh));
__m128 v_cmpli1 = _mm_cmple_ps(v_li1, _mm_set1_ps(lThresh));
v_y00 = _mm_and_ps(v_cmpli0, v_y00);
v_y01 = _mm_and_ps(v_cmpli1, v_y01);
v_fy00 = _mm_and_ps(v_cmpli0, v_fy00);
v_fy01 = _mm_and_ps(v_cmpli1, v_fy01);
v_y10 = _mm_andnot_ps(v_cmpli0, v_y10);
v_y11 = _mm_andnot_ps(v_cmpli1, v_y11);
v_fy10 = _mm_andnot_ps(v_cmpli0, v_fy10);
v_fy11 = _mm_andnot_ps(v_cmpli1, v_fy11);
__m128 v_y0 = _mm_or_ps(v_y00, v_y10);
__m128 v_y1 = _mm_or_ps(v_y01, v_y11);
__m128 v_fy0 = _mm_or_ps(v_fy00, v_fy10);
__m128 v_fy1 = _mm_or_ps(v_fy01, v_fy11);

// fx = fy + a/500 (0.002*a), fz = fy - b/200 (0.005*b).
__m128 v_fxz00 = _mm_add_ps(v_fy0, _mm_mul_ps(v_ai0, _mm_set1_ps(0.002f)));
__m128 v_fxz01 = _mm_add_ps(v_fy1, _mm_mul_ps(v_ai1, _mm_set1_ps(0.002f)));
__m128 v_fxz10 = _mm_sub_ps(v_fy0, _mm_mul_ps(v_bi0, _mm_set1_ps(0.005f)));
__m128 v_fxz11 = _mm_sub_ps(v_fy1, _mm_mul_ps(v_bi1, _mm_set1_ps(0.005f)));

// Invert f() both ways. Low branch: t = (f - 16/116) / 7.787.
__m128 v_fxz000 = _mm_mul_ps(_mm_sub_ps(v_fxz00, _mm_set1_ps(16.0f/116.0f)), _mm_set1_ps(1.0f/7.787f));
__m128 v_fxz001 = _mm_mul_ps(_mm_sub_ps(v_fxz01, _mm_set1_ps(16.0f/116.0f)), _mm_set1_ps(1.0f/7.787f));
__m128 v_fxz010 = _mm_mul_ps(_mm_sub_ps(v_fxz10, _mm_set1_ps(16.0f/116.0f)), _mm_set1_ps(1.0f/7.787f));
__m128 v_fxz011 = _mm_mul_ps(_mm_sub_ps(v_fxz11, _mm_set1_ps(16.0f/116.0f)), _mm_set1_ps(1.0f/7.787f));

// High branch: t = f^3.
__m128 v_fxz100 = _mm_mul_ps(_mm_mul_ps(v_fxz00, v_fxz00), v_fxz00);
__m128 v_fxz101 = _mm_mul_ps(_mm_mul_ps(v_fxz01, v_fxz01), v_fxz01);
__m128 v_fxz110 = _mm_mul_ps(_mm_mul_ps(v_fxz10, v_fxz10), v_fxz10);
__m128 v_fxz111 = _mm_mul_ps(_mm_mul_ps(v_fxz11, v_fxz11), v_fxz11);

// Select on f <= fThresh; results are the X and Z tristimulus values.
__m128 v_cmpfxz00 = _mm_cmple_ps(v_fxz00, _mm_set1_ps(fThresh));
__m128 v_cmpfxz01 = _mm_cmple_ps(v_fxz01, _mm_set1_ps(fThresh));
__m128 v_cmpfxz10 = _mm_cmple_ps(v_fxz10, _mm_set1_ps(fThresh));
__m128 v_cmpfxz11 = _mm_cmple_ps(v_fxz11, _mm_set1_ps(fThresh));
v_fxz000 = _mm_and_ps(v_cmpfxz00, v_fxz000);
v_fxz001 = _mm_and_ps(v_cmpfxz01, v_fxz001);
v_fxz010 = _mm_and_ps(v_cmpfxz10, v_fxz010);
v_fxz011 = _mm_and_ps(v_cmpfxz11, v_fxz011);
v_fxz100 = _mm_andnot_ps(v_cmpfxz00, v_fxz100);
v_fxz101 = _mm_andnot_ps(v_cmpfxz01, v_fxz101);
v_fxz110 = _mm_andnot_ps(v_cmpfxz10, v_fxz110);
v_fxz111 = _mm_andnot_ps(v_cmpfxz11, v_fxz111);
__m128 v_x0 = _mm_or_ps(v_fxz000, v_fxz100);
__m128 v_x1 = _mm_or_ps(v_fxz001, v_fxz101);
__m128 v_z0 = _mm_or_ps(v_fxz010, v_fxz110);
__m128 v_z1 = _mm_or_ps(v_fxz011, v_fxz111);

// XYZ -> RGB: 3x3 matrix product with the white-point-scaled coeffs[]
// (channel order was baked into coeffs[] by the constructor).
__m128 v_ro0 = _mm_mul_ps(_mm_set1_ps(coeffs[0]), v_x0);
__m128 v_ro1 = _mm_mul_ps(_mm_set1_ps(coeffs[0]), v_x1);
__m128 v_go0 = _mm_mul_ps(_mm_set1_ps(coeffs[3]), v_x0);
__m128 v_go1 = _mm_mul_ps(_mm_set1_ps(coeffs[3]), v_x1);
__m128 v_bo0 = _mm_mul_ps(_mm_set1_ps(coeffs[6]), v_x0);
__m128 v_bo1 = _mm_mul_ps(_mm_set1_ps(coeffs[6]), v_x1);
v_ro0 = _mm_add_ps(v_ro0, _mm_mul_ps(_mm_set1_ps(coeffs[1]), v_y0));
v_ro1 = _mm_add_ps(v_ro1, _mm_mul_ps(_mm_set1_ps(coeffs[1]), v_y1));
v_go0 = _mm_add_ps(v_go0, _mm_mul_ps(_mm_set1_ps(coeffs[4]), v_y0));
v_go1 = _mm_add_ps(v_go1, _mm_mul_ps(_mm_set1_ps(coeffs[4]), v_y1));
v_bo0 = _mm_add_ps(v_bo0, _mm_mul_ps(_mm_set1_ps(coeffs[7]), v_y0));
v_bo1 = _mm_add_ps(v_bo1, _mm_mul_ps(_mm_set1_ps(coeffs[7]), v_y1));
v_ro0 = _mm_add_ps(v_ro0, _mm_mul_ps(_mm_set1_ps(coeffs[2]), v_z0));
v_ro1 = _mm_add_ps(v_ro1, _mm_mul_ps(_mm_set1_ps(coeffs[2]), v_z1));
v_go0 = _mm_add_ps(v_go0, _mm_mul_ps(_mm_set1_ps(coeffs[5]), v_z0));
v_go1 = _mm_add_ps(v_go1, _mm_mul_ps(_mm_set1_ps(coeffs[5]), v_z1));
v_bo0 = _mm_add_ps(v_bo0, _mm_mul_ps(_mm_set1_ps(coeffs[8]), v_z0));
v_bo1 = _mm_add_ps(v_bo1, _mm_mul_ps(_mm_set1_ps(coeffs[8]), v_z1));

// Clamp to [0, 1] and return the results through the input registers.
v_li0 = _mm_min_ps(_mm_max_ps(v_ro0, _mm_setzero_ps()), _mm_set1_ps(1.0f));
v_li1 = _mm_min_ps(_mm_max_ps(v_ro1, _mm_setzero_ps()), _mm_set1_ps(1.0f));
v_ai0 = _mm_min_ps(_mm_max_ps(v_go0, _mm_setzero_ps()), _mm_set1_ps(1.0f));
v_ai1 = _mm_min_ps(_mm_max_ps(v_go1, _mm_setzero_ps()), _mm_set1_ps(1.0f));
v_bi0 = _mm_min_ps(_mm_max_ps(v_bo0, _mm_setzero_ps()), _mm_set1_ps(1.0f));
v_bi1 = _mm_min_ps(_mm_max_ps(v_bo1, _mm_setzero_ps()), _mm_set1_ps(1.0f));
}
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
void operator()(const float* src, float* dst, int n) const |
|
|
|
|
{ |
|
|
|
|
int i, dcn = dstcn; |
|
|
|
|
int i = 0, dcn = dstcn; |
|
|
|
|
const float* gammaTab = srgb ? sRGBInvGammaTab : 0; |
|
|
|
|
float gscale = GammaTabScale; |
|
|
|
|
float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], |
|
|
|
@ -5487,9 +5582,70 @@ struct Lab2RGB_f |
|
|
|
|
float alpha = ColorChannel<float>::max(); |
|
|
|
|
n *= 3; |
|
|
|
|
|
|
|
|
|
static const float lThresh = 0.008856f * 903.3f; |
|
|
|
|
static const float fThresh = 7.787f * 0.008856f + 16.0f / 116.0f; |
|
|
|
|
for (i = 0; i < n; i += 3, dst += dcn) |
|
|
|
|
#if CV_SSE2
if (haveSIMD)
{
// Vectorized main loop: 8 pixels (24 floats) per iteration.
for (; i <= n - 24; i += 24, dst += dcn * 8)
{
__m128 v_li0 = _mm_loadu_ps(src + i + 0);
__m128 v_li1 = _mm_loadu_ps(src + i + 4);
__m128 v_ai0 = _mm_loadu_ps(src + i + 8);
__m128 v_ai1 = _mm_loadu_ps(src + i + 12);
__m128 v_bi0 = _mm_loadu_ps(src + i + 16);
__m128 v_bi1 = _mm_loadu_ps(src + i + 20);

// Split the interleaved L,a,b triples into per-channel registers.
_mm_deinterleave_ps(v_li0, v_li1, v_ai0, v_ai1, v_bi0, v_bi1);

// Lab -> linear RGB; results come back in the same registers.
process(v_li0, v_li1, v_ai0, v_ai1, v_bi0, v_bi1);

// Optional inverse sRGB gamma: scale into table units, then
// spline-interpolate each channel.
if (gammaTab)
{
__m128 v_gscale = _mm_set1_ps(gscale);
v_li0 = _mm_mul_ps(v_li0, v_gscale);
v_li1 = _mm_mul_ps(v_li1, v_gscale);
v_ai0 = _mm_mul_ps(v_ai0, v_gscale);
v_ai1 = _mm_mul_ps(v_ai1, v_gscale);
v_bi0 = _mm_mul_ps(v_bi0, v_gscale);
v_bi1 = _mm_mul_ps(v_bi1, v_gscale);

splineInterpolate(v_li0, gammaTab, GAMMA_TAB_SIZE);
splineInterpolate(v_li1, gammaTab, GAMMA_TAB_SIZE);
splineInterpolate(v_ai0, gammaTab, GAMMA_TAB_SIZE);
splineInterpolate(v_ai1, gammaTab, GAMMA_TAB_SIZE);
splineInterpolate(v_bi0, gammaTab, GAMMA_TAB_SIZE);
splineInterpolate(v_bi1, gammaTab, GAMMA_TAB_SIZE);
}

// Re-interleave to packed output; 4-channel adds constant alpha.
if( dcn == 4 )
{
__m128 v_a0 = _mm_set1_ps(alpha);
__m128 v_a1 = _mm_set1_ps(alpha);
_mm_interleave_ps(v_li0, v_li1, v_ai0, v_ai1, v_bi0, v_bi1, v_a0, v_a1);

_mm_storeu_ps(dst + 0, v_li0);
_mm_storeu_ps(dst + 4, v_li1);
_mm_storeu_ps(dst + 8, v_ai0);
_mm_storeu_ps(dst + 12, v_ai1);
_mm_storeu_ps(dst + 16, v_bi0);
_mm_storeu_ps(dst + 20, v_bi1);
_mm_storeu_ps(dst + 24, v_a0);
_mm_storeu_ps(dst + 28, v_a1);
}
else
{
_mm_interleave_ps(v_li0, v_li1, v_ai0, v_ai1, v_bi0, v_bi1);

_mm_storeu_ps(dst + 0, v_li0);
_mm_storeu_ps(dst + 4, v_li1);
_mm_storeu_ps(dst + 8, v_ai0);
_mm_storeu_ps(dst + 12, v_ai1);
_mm_storeu_ps(dst + 16, v_bi0);
_mm_storeu_ps(dst + 20, v_bi1);
}
}
}
#endif
|
|
|
|
// Scalar tail: finish any pixels the vectorized loop did not cover
// (always all of them when SSE2 is unavailable).
for (; i < n; i += 3, dst += dcn)
{
float li = src[i];
float ai = src[i + 1];
|
|
|
@ -5540,6 +5696,11 @@ struct Lab2RGB_f |
|
|
|
|
int dstcn;       // number of destination channels (3 or 4)
float coeffs[9]; // XYZ->RGB matrix, white-point-scaled, channel order baked in
bool srgb;       // apply inverse sRGB gamma via the shared spline table
float lThresh;   // L* threshold separating linear/cubic Lab segments
float fThresh;   // matching threshold on f(t) for the inverse transform
#if CV_SSE2
bool haveSIMD;   // SSE2 detected at runtime (set in the constructor)
#endif
};
|
|
|
|
|
|
|
|
|
#undef clip |
|
|
|
|