From 82811d070659020b703d02ceeed79540d43a6c82 Mon Sep 17 00:00:00 2001 From: Rostislav Vasilikhin Date: Wed, 5 Jul 2017 20:36:30 +0300 Subject: [PATCH 1/4] Luv: singularities fixed --- modules/imgproc/src/color.cpp | 86 +++++++++++++++++++++++------------ 1 file changed, 56 insertions(+), 30 deletions(-) diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index 488bdc1cc5..4f27671ff3 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -6495,7 +6495,7 @@ struct RGB2Luv_f coeffs[i*3] + coeffs[i*3+1] + coeffs[i*3+2] < 1.5f ); } - float d = 1.f/(whitept[0] + whitept[1]*15 + whitept[2]*3); + float d = 1.f/std::max(whitept[0] + whitept[1]*15 + whitept[2]*3, FLT_EPSILON); un = 4*whitept[0]*d*13; vn = 9*whitept[1]*d*13; @@ -6755,9 +6755,9 @@ struct Luv2RGB_f coeffs[i+blueIdx*3] = _coeffs[i+6]; } - float d = 1.f/(whitept[0] + whitept[1]*15 + whitept[2]*3); - un = 4*whitept[0]*d; - vn = 9*whitept[1]*d; + float d = 1.f/std::max(whitept[0] + whitept[1]*15 + whitept[2]*3, FLT_EPSILON); + un = 4*13*whitept[0]*d; + vn = 9*13*whitept[1]*d; #if CV_SSE2 haveSIMD = checkHardwareSupport(CV_CPU_SSE2); #endif @@ -6769,23 +6769,42 @@ struct Luv2RGB_f void process(__m128& v_l0, __m128& v_l1, __m128& v_u0, __m128& v_u1, __m128& v_v0, __m128& v_v1) const { - __m128 v_y0 = _mm_mul_ps(_mm_add_ps(v_l0, _mm_set1_ps(16.0f)), _mm_set1_ps(1.f/116.f)); - __m128 v_y1 = _mm_mul_ps(_mm_add_ps(v_l1, _mm_set1_ps(16.0f)), _mm_set1_ps(1.f/116.f)); - v_y0 = _mm_mul_ps(_mm_mul_ps(v_y0, v_y0), v_y0); - v_y1 = _mm_mul_ps(_mm_mul_ps(v_y1, v_y1), v_y1); - __m128 v_d0 = _mm_div_ps(_mm_set1_ps(1.f/13.f), v_l0); - __m128 v_d1 = _mm_div_ps(_mm_set1_ps(1.f/13.f), v_l1); - v_u0 = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(v_u0, v_d0), _mm_set1_ps(un)), _mm_set1_ps(3.f)); - v_u1 = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(v_u1, v_d1), _mm_set1_ps(un)), _mm_set1_ps(3.f)); - v_v0 = _mm_add_ps(_mm_mul_ps(v_v0, v_d0), _mm_set1_ps(vn)); - v_v1 = _mm_add_ps(_mm_mul_ps(v_v1, v_d1), _mm_set1_ps(vn)); - __m128 v_iv0 = _mm_div_ps(_mm_set1_ps(0.25f), v_v0); - __m128 v_iv1 = _mm_div_ps(_mm_set1_ps(0.25f), v_v1); - __m128 v_x0 = _mm_mul_ps(_mm_mul_ps(_mm_set1_ps(3.f), v_u0), v_iv0); - __m128 v_x1 = _mm_mul_ps(_mm_mul_ps(_mm_set1_ps(3.f), v_u1), v_iv1); - __m128 v_z0 = _mm_mul_ps(_mm_sub_ps(_mm_sub_ps(_mm_set1_ps(12.f), v_u0), _mm_mul_ps(_mm_set1_ps(20.f), v_v0)), v_iv0); - __m128 v_z1 = _mm_mul_ps(_mm_sub_ps(_mm_sub_ps(_mm_set1_ps(12.f), v_u1), _mm_mul_ps(_mm_set1_ps(20.f), v_v1)), v_iv1); - + // L*(3./29.)^3 + __m128 v_y00 = _mm_mul_ps(v_l0, _mm_set1_ps(1.0f/903.3f)); + __m128 v_y01 = _mm_mul_ps(v_l1, _mm_set1_ps(1.0f/903.3f)); + // ((L + 16)/116)^3 + __m128 v_y10 = _mm_mul_ps(_mm_add_ps(v_l0, _mm_set1_ps(16.0f)), _mm_set1_ps(1.f/116.f)); + __m128 v_y11 = _mm_mul_ps(_mm_add_ps(v_l1, _mm_set1_ps(16.0f)), _mm_set1_ps(1.f/116.f)); + v_y10 = _mm_mul_ps(_mm_mul_ps(v_y10, v_y10), v_y10); + v_y11 = _mm_mul_ps(_mm_mul_ps(v_y11, v_y11), v_y11); + // Y = (L <= 8) ? Y0 : Y1; + __m128 v_cmpl0 = _mm_cmplt_ps(v_l0, _mm_set1_ps(8.f)); + __m128 v_cmpl1 = _mm_cmplt_ps(v_l1, _mm_set1_ps(8.f)); + v_y00 = _mm_and_ps(v_cmpl0, v_y00); + v_y01 = _mm_and_ps(v_cmpl1, v_y01); + v_y10 = _mm_andnot_ps(v_cmpl0, v_y10); + v_y11 = _mm_andnot_ps(v_cmpl1, v_y11); + __m128 v_y0 = _mm_or_ps(v_y00, v_y10); + __m128 v_y1 = _mm_or_ps(v_y01, v_y11); + // up = 3*(u + L*_un); + __m128 v_up0 = _mm_mul_ps(_mm_set1_ps(3.f), _mm_add_ps(v_u0, _mm_mul_ps(v_l0, _mm_set1_ps(un)))); + __m128 v_up1 = _mm_mul_ps(_mm_set1_ps(3.f), _mm_add_ps(v_u1, _mm_mul_ps(v_l1, _mm_set1_ps(un)))); + // vp = 0.25/(v + L*_vn); + __m128 v_vp0 = _mm_div_ps(_mm_set1_ps(0.25f), _mm_add_ps(v_v0, _mm_mul_ps(v_l0, _mm_set1_ps(vn)))); + __m128 v_vp1 = _mm_div_ps(_mm_set1_ps(0.25f), _mm_add_ps(v_v1, _mm_mul_ps(v_l1, _mm_set1_ps(vn)))); + // vp = max(-0.25, min(0.25, vp)); + v_vp0 = _mm_max_ps(v_vp0, _mm_set1_ps(-0.25f)); + v_vp1 = _mm_max_ps(v_vp1, _mm_set1_ps(-0.25f)); + v_vp0 = _mm_min_ps(v_vp0, _mm_set1_ps( 0.25f)); + v_vp1 = _mm_min_ps(v_vp1, _mm_set1_ps( 0.25f)); + //X = 3*up*vp; // (*Y) is done later + __m128 v_x0 = _mm_mul_ps(_mm_set1_ps(3.f), _mm_mul_ps(v_up0, v_vp0)); + __m128 v_x1 = _mm_mul_ps(_mm_set1_ps(3.f), _mm_mul_ps(v_up1, v_vp1)); + //Z = ((12*13*L - up)*vp - 5); // (*Y) is done later + __m128 v_z0 = _mm_sub_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_set1_ps(12.f*13.f), v_l0), v_up0), v_vp0), _mm_set1_ps(5.f)); + __m128 v_z1 = _mm_sub_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_set1_ps(12.f*13.f), v_l1), v_up1), v_vp1), _mm_set1_ps(5.f)); + + // R = (X*C0 + C1 + Z*C2)*Y; // here (*Y) is done v_l0 = _mm_mul_ps(v_x0, _mm_set1_ps(coeffs[0])); v_l1 = _mm_mul_ps(v_x1, _mm_set1_ps(coeffs[0])); v_u0 = _mm_mul_ps(v_x0, _mm_set1_ps(coeffs[3])); @@ -6902,15 +6921,22 @@ struct Luv2RGB_f #endif for( ; i < n; i += 3, dst += dcn ) { - float L = src[i], u = src[i+1], v = src[i+2], d, X, Y, Z; - Y = (L + 16.f) * (1.f/116.f); - Y = Y*Y*Y; - d = (1.f/13.f)/L; - u = u*d + _un; - v = v*d + _vn; - float iv = 1.f/v; - X = 2.25f * u * Y * iv ; - Z = (12 - 3 * u - 20 * v) * Y * 0.25f * iv; + float L = src[i], u = src[i+1], v = src[i+2], X, Y, Z; + if(L >= 8) + { + Y = (L + 16.f) * (1.f/116.f); + Y = Y*Y*Y; + } + else + { + Y = L * (1.0f/903.3f); // L*(3./29.)^3 + } + float up = 3.f*(u + L*_un); + float vp = 0.25f/(v + L*_vn); + if(vp > 0.25f) vp = 0.25f; + if(vp < -0.25f) vp = -0.25f; + X = Y*3.f*up*vp; + Z = Y*(((12.f*13.f)*L - up)*vp - 5.f); float R = X*C0 + Y*C1 + Z*C2; float G = X*C3 + Y*C4 + Z*C5; From 6c71988c54b3531e61aa026c9df7949edf992819 Mon Sep 17 00:00:00 2001 From: Rostislav Vasilikhin Date: Wed, 5 Jul 2017 21:06:14 +0300 Subject: [PATCH 2/4] RGB2Luv_f: R, G, B limited to [0, 1] --- modules/imgproc/src/color.cpp | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index 4f27671ff3..3c2af954f9 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -6607,6 +6607,7 @@ struct RGB2Luv_f for( ; i <= n - 12; i += 12, src += scn * 4 ) { float32x4x3_t v_src = vld3q_f32(src); + if( gammaTab ) { v_src.val[0] = vmulq_f32(v_src.val[0], vdupq_n_f32(gscale)); @@ -6627,6 +6628,15 @@ struct RGB2Luv_f for( ; i <= n - 12; i += 12, src += scn * 4 ) { float32x4x4_t v_src = vld4q_f32(src); + + v_src.val[0] = vmaxq_f32(v_src.val[0], vdupq_n_f32(0)); + v_src.val[1] = vmaxq_f32(v_src.val[1], vdupq_n_f32(0)); + v_src.val[2] = vmaxq_f32(v_src.val[2], vdupq_n_f32(0)); + + v_src.val[0] = vminq_f32(v_src.val[0], vdupq_n_f32(1)); + v_src.val[1] = vminq_f32(v_src.val[1], vdupq_n_f32(1)); + v_src.val[2] = vminq_f32(v_src.val[2], vdupq_n_f32(1)); + if( gammaTab ) { v_src.val[0] = vmulq_f32(v_src.val[0], vdupq_n_f32(gscale)); @@ -6670,6 +6680,20 @@ struct RGB2Luv_f _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1); } + v_r0 = _mm_max_ps(v_r0, _mm_setzero_ps()); + v_r1 = _mm_max_ps(v_r1, _mm_setzero_ps()); + v_g0 = _mm_max_ps(v_g0, _mm_setzero_ps()); + v_g1 = _mm_max_ps(v_g1, _mm_setzero_ps()); + v_b0 = _mm_max_ps(v_b0, _mm_setzero_ps()); + v_b1 = _mm_max_ps(v_b1, _mm_setzero_ps()); + + v_r0 = _mm_min_ps(v_r0, _mm_set1_ps(1.f)); + v_r1 = _mm_min_ps(v_r1, _mm_set1_ps(1.f)); + v_g0 = _mm_min_ps(v_g0, _mm_set1_ps(1.f)); + v_g1 = _mm_min_ps(v_g1, _mm_set1_ps(1.f)); + v_b0 = _mm_min_ps(v_b0, _mm_set1_ps(1.f)); + v_b1 = _mm_min_ps(v_b1, _mm_set1_ps(1.f)); + if ( gammaTab ) { __m128 v_gscale = _mm_set1_ps(gscale); @@ -6704,6 +6728,9 @@ struct RGB2Luv_f for( ; i < n; i += 3, src += scn ) { float R = src[0], G = src[1], B = src[2]; + R = std::min(std::max(R, 0.f), 1.f); + G = std::min(std::max(G, 0.f), 1.f); + B = std::min(std::max(B, 0.f), 1.f); if( gammaTab ) { R = splineInterpolate(R*gscale, gammaTab, GAMMA_TAB_SIZE); From 704c688225283e9148742187ad517738bf77c681 Mon Sep 17 00:00:00 2001 From: Rostislav Vasilikhin Date: Wed, 5 Jul 2017 21:52:13 +0300 Subject: [PATCH 3/4] OCL code fixed, fix for NEON added --- modules/imgproc/src/color.cpp | 16 ++++++--- modules/imgproc/src/opencl/cvtcolor.cl | 50 +++++++++++++++++--------- 2 files changed, 45 insertions(+), 21 deletions(-) diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index 3c2af954f9..f8cd0cb780 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -6608,6 +6608,14 @@ struct RGB2Luv_f { float32x4x3_t v_src = vld3q_f32(src); + v_src.val[0] = vmaxq_f32(v_src.val[0], vdupq_n_f32(0)); + v_src.val[1] = vmaxq_f32(v_src.val[1], vdupq_n_f32(0)); + v_src.val[2] = vmaxq_f32(v_src.val[2], vdupq_n_f32(0)); + + v_src.val[0] = vminq_f32(v_src.val[0], vdupq_n_f32(1)); + v_src.val[1] = vminq_f32(v_src.val[1], vdupq_n_f32(1)); + v_src.val[2] = vminq_f32(v_src.val[2], vdupq_n_f32(1)); + if( gammaTab ) { v_src.val[0] = vmulq_f32(v_src.val[0], vdupq_n_f32(gscale)); @@ -8574,7 +8582,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) coeffs[j] + coeffs[j + 1] + coeffs[j + 2] < 1.5f*(lab ? LabCbrtTabScale : 1) ); } - float d = 1.f/(_whitept[0] + _whitept[1]*15 + _whitept[2]*3); + float d = 1.f/std::max(_whitept[0] + _whitept[1]*15 + _whitept[2]*3, FLT_EPSILON); un = 13*4*_whitept[0]*d; vn = 13*9*_whitept[1]*d; @@ -8641,9 +8649,9 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) coeffs[i+bidx*3] = _coeffs[i+6] * (lab ? _whitept[i] : 1); } - float d = 1.f/(_whitept[0] + _whitept[1]*15 + _whitept[2]*3); - un = 4*_whitept[0]*d; - vn = 9*_whitept[1]*d; + float d = 1.f/std::max(_whitept[0] + _whitept[1]*15 + _whitept[2]*3, FLT_EPSILON); + un = 4*13*_whitept[0]*d; + vn = 9*13*_whitept[1]*d; Mat(1, 9, CV_32FC1, coeffs).copyTo(ucoeffs); } diff --git a/modules/imgproc/src/opencl/cvtcolor.cl b/modules/imgproc/src/opencl/cvtcolor.cl index 2193541aec..7c2e519753 100644 --- a/modules/imgproc/src/opencl/cvtcolor.cl +++ b/modules/imgproc/src/opencl/cvtcolor.cl @@ -1963,6 +1963,10 @@ __kernel void BGR2Luv(__global const uchar * srcptr, int src_step, int src_offse float R = src[0], G = src[1], B = src[2]; + R = clamp(R, 0.f, 1.f); + G = clamp(G, 0.f, 1.f); + B = clamp(B, 0.f, 1.f); + #ifdef SRGB R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE); G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE); @@ -2067,15 +2071,21 @@ __kernel void Luv2BGR(__global const uchar * srcptr, int src_step, int src_offse __global const float * src = (__global const float *)(srcptr + src_index); __global float * dst = (__global float *)(dstptr + dst_index); - float L = src[0], u = src[1], v = src[2], d, X, Y, Z; - Y = (L + 16.f) * (1.f/116.f); - Y = Y*Y*Y; - d = (1.f/13.f)/L; - u = fma(u, d, _un); - v = fma(v, d, _vn); - float iv = 1.f/v; - X = 2.25f * u * Y * iv; - Z = (12 - fma(3.0f, u, 20.0f * v)) * Y * 0.25f * iv; + float L = src[0], u = src[1], v = src[2], X, Y, Z; + if(L >= 8) + { + Y = fma(L, 1.f/116.f, 16.f/116.f); + Y = Y*Y*Y; + } + else + { + Y = L * (1.0f/903.3f); // L*(3./29.)^3 + } + float up = 3.f*fma(L, _un, u); + float vp = 0.25f/fma(L, _vn, v); + vp = clamp(vp, -0.25f, 0.25f); + X = 3.f*Y*up*vp; + Z = Y*fma(fma(12.f*13.f, L, -up), vp, -5.f); float R = fma(X, coeffs[0], fma(Y, coeffs[1], Z * coeffs[2])); float G = fma(X, coeffs[3], fma(Y, coeffs[4], Z * coeffs[5])); @@ -2129,14 +2139,20 @@ __kernel void Luv2BGR(__global const uchar * src, int src_step, int src_offset, float L = src[0]*(100.f/255.f); float u = fma(convert_float(src[1]), 1.388235294117647f, -134.f); float v = fma(convert_float(src[2]), 1.027450980392157f, - 140.f); - Y = (L + 16.f) * (1.f/116.f); - Y = Y*Y*Y; - d = (1.f/13.f)/L; - u = fma(u, d, _un); - v = fma(v, d, _vn); - float iv = 1.f/v; - X = 2.25f * u * Y * iv ; - Z = (12 - fma(3.0f, u, 20.0f * v)) * Y * 0.25f * iv; + if(L >= 8) + { + Y = fma(L, 1.f/116.f, 16.f/116.f); + Y = Y*Y*Y; + } + else + { + Y = L * (1.0f/903.3f); // L*(3./29.)^3 + } + float up = 3.f*fma(L, _un, u); + float vp = 0.25f/fma(L, _vn, v); + vp = clamp(vp, -0.25f, 0.25f); + X = 3.f*Y*up*vp; + Z = Y*fma(fma(12.f*13.f, L, -up), vp, -5.f); float R = fma(X, coeffs[0], fma(Y, coeffs[1], Z * coeffs[2])); float G = fma(X, coeffs[3], fma(Y, coeffs[4], Z * coeffs[5])); From aa621d6f3c3fa89b965ae33d8d84734196de0717 Mon Sep 17 00:00:00 2001 From: Rostislav Vasilikhin Date: Thu, 6 Jul 2017 00:23:58 +0300 Subject: [PATCH 4/4] magic constants explained --- modules/imgproc/src/color.cpp | 4 ++++ modules/imgproc/src/opencl/cvtcolor.cl | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index f8cd0cb780..b40726dd96 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -7011,6 +7011,8 @@ struct RGB2Luv_b const float* _whitept, bool _srgb ) : srccn(_srccn), cvt(3, blueIdx, _coeffs, _whitept, _srgb) { + //0.72033 = 255/(220+134), 96.525 = 134*255/(220+134) + //0.9732 = 255/(140+122), 136.259 = 140*255/(140+122) #if CV_NEON v_scale_inv = vdupq_n_f32(1.f/255.f); v_scale = vdupq_n_f32(2.55f); @@ -7211,6 +7213,8 @@ struct Luv2RGB_b const float* _whitept, bool _srgb ) : dstcn(_dstcn), cvt(3, blueIdx, _coeffs, _whitept, _srgb ) { + // 1.388235294117647 = (220+134)/255 + // 1.027450980392157 = (140+122)/255 #if CV_NEON v_scale_inv = vdupq_n_f32(100.f/255.f); v_coeff1 = vdupq_n_f32(1.388235294117647f); diff --git a/modules/imgproc/src/opencl/cvtcolor.cl b/modules/imgproc/src/opencl/cvtcolor.cl index 7c2e519753..3fd1ad1250 100644 --- a/modules/imgproc/src/opencl/cvtcolor.cl +++ b/modules/imgproc/src/opencl/cvtcolor.cl @@ -2035,7 +2035,9 @@ __kernel void BGR2Luv(__global const uchar * src, int src_step, int src_offset, float v = L*fma(2.25f, Y*d, -_vn); dst[0] = SAT_CAST(L * 2.55f); + //0.72033 = 255/(220+134), 96.525 = 134*255/(220+134) dst[1] = SAT_CAST(fma(u, 0.72033898305084743f, 96.525423728813564f)); + //0.9732 = 255/(140+122), 136.259 = 140*255/(140+122) dst[2] = SAT_CAST(fma(v, 0.9732824427480916f, 136.259541984732824f)); ++y; @@ -2137,7 +2139,9 @@ __kernel void Luv2BGR(__global const uchar * src, int src_step, int src_offset, { float d, X, Y, Z; float L = src[0]*(100.f/255.f); + // 1.388235294117647 = (220+134)/255 float u = fma(convert_float(src[1]), 1.388235294117647f, -134.f); + // 1.027450980392157 = (140+122)/255 float v = fma(convert_float(src[2]), 1.027450980392157f, - 140.f); if(L >= 8) {