G-API Fluid: fixed-point / SIMD RGB<->YUV kernels and Fluid perf tests

What this patch does (everything below is taken from the hunks themselves):

  * perf/common/gapi_imgproc_perf_tests_inl.hpp
      The RGB2YUV, YUV2RGB, BGR2YUV and YUV2BGR perf tests stop passing
      std::move(compile_args) to c.apply() inside TEST_CYCLE().
      (Presumably because TEST_CYCLE() repeats its body, so the same object
      would be moved from more than once -- confirm against the macro.)

  * perf/cpu/gapi_imgproc_perf_tests_fluid.cpp
      Instantiates those four perf tests for the Fluid backend with
      ToleranceColor(1e-3), sizes szVGA / sz720p / sz1080p and
      cv::compile_args(IMGPROC_FLUID).

  * src/backends/fluid/gfluidimgproc.cpp
      run_rgb2yuv() / run_yuv2rgb() drop their scalar float loops and call
      run_rgb2yuv_impl() / run_yuv2rgb_impl() instead.

  * src/backends/fluid/gfluidimgproc_func.{hpp,dispatch.cpp,simd.hpp}
      Declare, CPU-dispatch and implement the two *_impl functions.
      Both convert the float coefficients to 16-bit fixed point once, process
      v_uint8::nlanes pixels per iteration when CV_SIMD is enabled, and finish
      the remaining pixels with a scalar loop that uses the same fixed-point
      formats (see the Q-format comments in the code).

Notes on this copy of the patch:

  * Line structure was rebuilt from a whitespace-collapsed copy; the number of
    context/added/removed lines in every hunk matches its @@ header, but exact
    indentation and column alignment are a best-effort reconstruction.
  * Template argument lists that were missing in that copy (static_cast<...>,
    saturate<...>) were restored from the declared type of each destination.

Review notes (not changed here, to keep the patch content as recorded):

  * "+ 0.5f" followed by a truncating cast rounds to nearest only for
    non-negative products; if callers pass negative coefficients (not visible
    in this patch) the fixed-point constant is rounded towards zero instead.
  * In the scalar tail of run_yuv2rgb_impl(), "(in[...] - 128) << 7" shifts a
    possibly negative int to the left, which is undefined before C++20;
    multiplying by (1 << 7) would express the same scaling without that.

diff --git a/modules/gapi/perf/common/gapi_imgproc_perf_tests_inl.hpp b/modules/gapi/perf/common/gapi_imgproc_perf_tests_inl.hpp
index a3a9605237..f57f84cdeb 100644
--- a/modules/gapi/perf/common/gapi_imgproc_perf_tests_inl.hpp
+++ b/modules/gapi/perf/common/gapi_imgproc_perf_tests_inl.hpp
@@ -678,7 +678,7 @@ PERF_TEST_P_(RGB2YUVPerfTest, TestPerformance)
 
     TEST_CYCLE()
     {
-        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+        c.apply(in_mat1, out_mat_gapi);
     }
 
     // Comparison //////////////////////////////////////////////////////////////
@@ -716,7 +716,7 @@ PERF_TEST_P_(YUV2RGBPerfTest, TestPerformance)
 
     TEST_CYCLE()
     {
-        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+        c.apply(in_mat1, out_mat_gapi);
     }
 
     // Comparison //////////////////////////////////////////////////////////////
@@ -864,7 +864,7 @@ PERF_TEST_P_(BGR2YUVPerfTest, TestPerformance)
 
     TEST_CYCLE()
     {
-        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+        c.apply(in_mat1, out_mat_gapi);
     }
 
     EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
@@ -894,7 +894,7 @@ PERF_TEST_P_(YUV2BGRPerfTest, TestPerformance)
 
     TEST_CYCLE()
     {
-        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+        c.apply(in_mat1, out_mat_gapi);
     }
 
     EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
diff --git a/modules/gapi/perf/cpu/gapi_imgproc_perf_tests_fluid.cpp b/modules/gapi/perf/cpu/gapi_imgproc_perf_tests_fluid.cpp
index 010582bbc6..f0846a7dee 100644
--- a/modules/gapi/perf/cpu/gapi_imgproc_perf_tests_fluid.cpp
+++ b/modules/gapi/perf/cpu/gapi_imgproc_perf_tests_fluid.cpp
@@ -43,4 +43,24 @@ namespace opencv_test
                 Values(szVGA, sz720p, sz1080p),
                 Values(cv::compile_args(IMGPROC_FLUID))));
 
+    INSTANTIATE_TEST_CASE_P(RGB2YUVPerfTestFluid, RGB2YUVPerfTest,
+        Combine(Values(ToleranceColor(1e-3).to_compare_f()),
+                Values(szVGA, sz720p, sz1080p),
+                Values(cv::compile_args(IMGPROC_FLUID))));
+
+    INSTANTIATE_TEST_CASE_P(YUV2RGBPerfTestFluid, YUV2RGBPerfTest,
+        Combine(Values(ToleranceColor(1e-3).to_compare_f()),
+                Values(szVGA, sz720p, sz1080p),
+                Values(cv::compile_args(IMGPROC_FLUID))));
+
+    INSTANTIATE_TEST_CASE_P(BGR2YUVPerfTestFluid, BGR2YUVPerfTest,
+        Combine(Values(ToleranceColor(1e-3).to_compare_f()),
+                Values(szVGA, sz720p, sz1080p),
+                Values(cv::compile_args(IMGPROC_FLUID))));
+
+    INSTANTIATE_TEST_CASE_P(YUV2BGRPerfTestFluid, YUV2BGRPerfTest,
+        Combine(Values(ToleranceColor(1e-3).to_compare_f()),
+                Values(szVGA, sz720p, sz1080p),
+                Values(cv::compile_args(IMGPROC_FLUID))));
+
 }
diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc.cpp b/modules/gapi/src/backends/fluid/gfluidimgproc.cpp
index 8137a84b51..01d166472e 100644
--- a/modules/gapi/src/backends/fluid/gfluidimgproc.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc.cpp
@@ -126,19 +126,7 @@ static void run_rgb2yuv(Buffer &dst, const View &src, const float coef[5])
 
     int width = dst.length();
 
-    // TODO: Vectorize for SIMD
-    for (int w=0; w < width; w++)
-    {
-        uchar r = in[3*w    ];
-        uchar g = in[3*w + 1];
-        uchar b = in[3*w + 2];
-        float y = coef[0]*r + coef[1]*g + coef[2]*b;
-        float u = coef[3]*(b - y) + 128;
-        float v = coef[4]*(r - y) + 128;
-        out[3*w    ] = saturate<uchar>(y, roundf);
-        out[3*w + 1] = saturate<uchar>(u, roundf);
-        out[3*w + 2] = saturate<uchar>(v, roundf);
-    }
+    run_rgb2yuv_impl(out, in, width, coef);
 }
 
 static void run_yuv2rgb(Buffer &dst, const View &src, const float coef[4])
@@ -154,19 +142,7 @@ static void run_yuv2rgb(Buffer &dst, const View &src, const float coef[4])
 
     int width = dst.length();
 
-    // TODO: Vectorize for SIMD
-    for (int w=0; w < width; w++)
-    {
-        uchar y = in[3*w    ];
-        int   u = in[3*w + 1] - 128;
-        int   v = in[3*w + 2] - 128;
-        float r = y             + coef[0]*v;
-        float g = y + coef[1]*u + coef[2]*v;
-        float b = y + coef[3]*u;
-        out[3*w    ] = saturate<uchar>(r, roundf);
-        out[3*w + 1] = saturate<uchar>(g, roundf);
-        out[3*w + 2] = saturate<uchar>(b, roundf);
-    }
+    run_yuv2rgb_impl(out, in, width, coef);
 }
 
 GAPI_FLUID_KERNEL(GFluidRGB2YUV, cv::gapi::imgproc::GRGB2YUV, false)
diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp b/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp
index d933c335e6..9b217903ef 100644
--- a/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp
@@ -41,6 +41,22 @@ void run_rgb2gray_impl(uchar out[], const uchar in[], int width,
         CV_CPU_DISPATCH_MODES_ALL);
 }
 
+//--------------------------------------
+//
+// Fluid kernels: RGB-to-YUV, YUV-to-RGB
+//
+//--------------------------------------
+
+void run_rgb2yuv_impl(uchar out[], const uchar in[], int width, const float coef[5])
+{
+    CV_CPU_DISPATCH(run_rgb2yuv_impl, (out, in, width, coef), CV_CPU_DISPATCH_MODES_ALL);
+}
+
+void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef[4])
+{
+    CV_CPU_DISPATCH(run_yuv2rgb_impl, (out, in, width, coef), CV_CPU_DISPATCH_MODES_ALL);
+}
+
 //---------------------
 //
 // Fluid kernels: Sobel
diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp b/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp
index a3b7413ecb..1b6f1b8c0d 100644
--- a/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp
@@ -23,6 +23,16 @@ namespace fluid {
 void run_rgb2gray_impl(uchar out[], const uchar in[], int width,
                        float coef_r, float coef_g, float coef_b);
 
+//--------------------------------------
+//
+// Fluid kernels: RGB-to-YUV, YUV-to-RGB
+//
+//--------------------------------------
+
+void run_rgb2yuv_impl(uchar out[], const uchar in[], int width, const float coef[5]);
+
+void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef[4]);
+
 //---------------------
 //
 // Fluid kernels: Sobel
diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp b/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp
index 2040e19911..c87be085a3 100644
--- a/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp
@@ -21,6 +21,8 @@
 #  pragma GCC diagnostic ignored "-Wstrict-overflow"
 #endif
 
+using cv::gapi::own::saturate;
+
 namespace cv {
 namespace gapi {
 namespace fluid {
@@ -36,6 +38,16 @@ CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
 void run_rgb2gray_impl(uchar out[], const uchar in[], int width,
                        float coef_r, float coef_g, float coef_b);
 
+//--------------------------------------
+//
+// Fluid kernels: RGB-to-YUV, YUV-to-RGB
+//
+//--------------------------------------
+
+void run_rgb2yuv_impl(uchar out[], const uchar in[], int width, const float coef[5]);
+
+void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef[4]);
+
 //---------------------
 //
 // Fluid kernels: Sobel
@@ -142,6 +154,161 @@ void run_rgb2gray_impl(uchar out[], const uchar in[], int width,
     }
 }
 
+//--------------------------------------
+//
+// Fluid kernels: RGB-to-YUV, YUV-to-RGB
+//
+//--------------------------------------
+
+void run_rgb2yuv_impl(uchar out[], const uchar in[], int width, const float coef[5])
+{
+    ushort c0 = static_cast<ushort>(coef[0]*(1 << 16) + 0.5f);  // Q0.0.16 un-signed
+    ushort c1 = static_cast<ushort>(coef[1]*(1 << 16) + 0.5f);
+    ushort c2 = static_cast<ushort>(coef[2]*(1 << 16) + 0.5f);
+    short  c3 = static_cast<short>(coef[3]*(1 << 12) + 0.5f);   // Q1.0.12 signed
+    short  c4 = static_cast<short>(coef[4]*(1 << 12) + 0.5f);
+
+    int w = 0;
+
+#if CV_SIMD
+    static const int nlanes = v_uint8::nlanes;
+    for ( ; w <= width - nlanes; w += nlanes)
+    {
+        v_uint8 r, g, b;
+        v_load_deinterleave(&in[3*w], r, g, b);
+
+        v_uint16 _r0, _r1, _g0, _g1, _b0, _b1;
+        v_expand(r, _r0, _r1);
+        v_expand(g, _g0, _g1);
+        v_expand(b, _b0, _b1);
+
+        _r0 = _r0 << 7;                         // Q0.9.7 un-signed
+        _r1 = _r1 << 7;
+        _g0 = _g0 << 7;
+        _g1 = _g1 << 7;
+        _b0 = _b0 << 7;
+        _b1 = _b1 << 7;
+
+        v_uint16 _y0, _y1;
+        _y0 = v_mul_hi(vx_setall_u16(c0), _r0)  // Q0.9.7
+            + v_mul_hi(vx_setall_u16(c1), _g0)
+            + v_mul_hi(vx_setall_u16(c2), _b0);
+        _y1 = v_mul_hi(vx_setall_u16(c0), _r1)
+            + v_mul_hi(vx_setall_u16(c1), _g1)
+            + v_mul_hi(vx_setall_u16(c2), _b1);
+
+        v_int16 r0, r1, b0, b1, y0, y1;
+        r0 = v_reinterpret_as_s16(_r0);         // Q1.8.7 signed
+        r1 = v_reinterpret_as_s16(_r1);
+        b0 = v_reinterpret_as_s16(_b0);
+        b1 = v_reinterpret_as_s16(_b1);
+        y0 = v_reinterpret_as_s16(_y0);
+        y1 = v_reinterpret_as_s16(_y1);
+
+        v_int16 u0, u1, v0, v1;
+        u0 = v_mul_hi(vx_setall_s16(c3), b0 - y0);  // Q1.12.3
+        u1 = v_mul_hi(vx_setall_s16(c3), b1 - y1);
+        v0 = v_mul_hi(vx_setall_s16(c4), r0 - y0);
+        v1 = v_mul_hi(vx_setall_s16(c4), r1 - y1);
+
+        v_uint8 y, u, v;
+        y = v_pack((_y0 + vx_setall_u16(1 << 6)) >> 7,
+                   (_y1 + vx_setall_u16(1 << 6)) >> 7);
+        u = v_pack_u((u0 + vx_setall_s16(257 << 2)) >> 3,  // 257 << 2 = 128.5 * (1 << 3)
+                     (u1 + vx_setall_s16(257 << 2)) >> 3);
+        v = v_pack_u((v0 + vx_setall_s16(257 << 2)) >> 3,
+                     (v1 + vx_setall_s16(257 << 2)) >> 3);
+
+        v_store_interleave(&out[3*w], y, u, v);
+    }
+#endif
+
+    for ( ; w < width; w++)
+    {
+        short r = in[3*w    ] << 7;                            // Q1.8.7 signed
+        short g = in[3*w + 1] << 7;
+        short b = in[3*w + 2] << 7;
+        short y = (c0*r + c1*g + c2*b) >> 16;                  // Q1.8.7
+        short u =  c3*(b - y) >> 16;                           // Q1.12.3
+        short v =  c4*(r - y) >> 16;
+        out[3*w    ] = static_cast<uchar>((y              + (1 << 6)) >> 7);
+        out[3*w + 1] =    saturate<uchar>((u + (128 << 3) + (1 << 2)) >> 3);
+        out[3*w + 2] =    saturate<uchar>((v + (128 << 3) + (1 << 2)) >> 3);
+    }
+}
+
+void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef[4])
+{
+    short c0 = static_cast<short>(coef[0] * (1 << 12) + 0.5f);  // Q1.3.12
+    short c1 = static_cast<short>(coef[1] * (1 << 12) + 0.5f);
+    short c2 = static_cast<short>(coef[2] * (1 << 12) + 0.5f);
+    short c3 = static_cast<short>(coef[3] * (1 << 12) + 0.5f);
+
+    int w = 0;
+
+#if CV_SIMD
+    static const int nlanes = v_uint8::nlanes;
+    for ( ; w <= width - nlanes; w += nlanes)
+    {
+        v_uint8 y, u, v;
+        v_load_deinterleave(&in[3*w], y, u, v);
+
+        v_uint16 _y0, _y1, _u0, _u1, _v0, _v1;
+        v_expand(y, _y0, _y1);
+        v_expand(u, _u0, _u1);
+        v_expand(v, _v0, _v1);
+
+        v_int16 y0, y1, u0, u1, v0, v1;
+        y0 = v_reinterpret_as_s16(_y0);
+        y1 = v_reinterpret_as_s16(_y1);
+        u0 = v_reinterpret_as_s16(_u0);
+        u1 = v_reinterpret_as_s16(_u1);
+        v0 = v_reinterpret_as_s16(_v0);
+        v1 = v_reinterpret_as_s16(_v1);
+
+        y0 =  y0 << 3;                          // Q1.12.3
+        y1 =  y1 << 3;
+        u0 = (u0 - vx_setall_s16(128)) << 7;    // Q1.8.7
+        u1 = (u1 - vx_setall_s16(128)) << 7;
+        v0 = (v0 - vx_setall_s16(128)) << 7;
+        v1 = (v1 - vx_setall_s16(128)) << 7;
+
+        v_int16 r0, r1, g0, g1, b0, b1;
+        r0 = y0 + v_mul_hi(vx_setall_s16(c0), v0);  // Q1.12.3
+        r1 = y1 + v_mul_hi(vx_setall_s16(c0), v1);
+        g0 = y0 + v_mul_hi(vx_setall_s16(c1), u0)
+                + v_mul_hi(vx_setall_s16(c2), v0);
+        g1 = y1 + v_mul_hi(vx_setall_s16(c1), u1)
+                + v_mul_hi(vx_setall_s16(c2), v1);
+        b0 = y0 + v_mul_hi(vx_setall_s16(c3), u0);
+        b1 = y1 + v_mul_hi(vx_setall_s16(c3), u1);
+
+        v_uint8 r, g, b;
+        r = v_pack_u((r0 + vx_setall_s16(1 << 2)) >> 3,
+                     (r1 + vx_setall_s16(1 << 2)) >> 3);
+        g = v_pack_u((g0 + vx_setall_s16(1 << 2)) >> 3,
+                     (g1 + vx_setall_s16(1 << 2)) >> 3);
+        b = v_pack_u((b0 + vx_setall_s16(1 << 2)) >> 3,
+                     (b1 + vx_setall_s16(1 << 2)) >> 3);
+
+        v_store_interleave(&out[3*w], r, g, b);
+    }
+#endif
+
+    for ( ; w < width; w++)
+    {
+        short y =  in[3*w    ]        << 3;                    // Q1.12.3
+        short u = (in[3*w + 1] - 128) << 7;                    // Q1.8.7
+        short v = (in[3*w + 2] - 128) << 7;
+        short r = y + (        c0*v  >> 16);                   // Q1.12.3
+        short g = y + ((c1*u + c2*v) >> 16);
+        short b = y + ((c3*u       ) >> 16);
+        out[3*w    ] = saturate<uchar>((r + (1 << 2)) >> 3);
+        out[3*w + 1] = saturate<uchar>((g + (1 << 2)) >> 3);
+        out[3*w + 2] = saturate<uchar>((b + (1 << 2)) >> 3);
+    }
+}
+
 //---------------------
 //
 // Fluid kernels: Sobel