From b7bb79c7c8d20900cce3b1b98a38a3e81dfaec49 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Thu, 13 Dec 2018 14:53:48 +0300 Subject: [PATCH 01/13] videoio(MSMF): backport WITH_MSMF_DXVA flag --- CMakeLists.txt | 6 +++++- cmake/OpenCVFindLibsVideo.cmake | 15 ++++++++------- modules/videoio/CMakeLists.txt | 6 +++--- modules/videoio/src/cap_msmf.cpp | 20 ++++++++++---------- 4 files changed, 26 insertions(+), 21 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3206edb179..fcfef8940b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -357,9 +357,12 @@ OCV_OPTION(WITH_LIBV4L "Use libv4l for Video 4 Linux support" OFF OCV_OPTION(WITH_DSHOW "Build VideoIO with DirectShow support" ON VISIBLE_IF WIN32 AND NOT ARM AND NOT WINRT VERIFY HAVE_DSHOW) -OCV_OPTION(WITH_MSMF "Build VideoIO with Media Foundation support" ON +OCV_OPTION(WITH_MSMF "Build VideoIO with Media Foundation support" NOT MINGW VISIBLE_IF WIN32 VERIFY HAVE_MSMF) +OCV_OPTION(WITH_MSMF_DXVA "Enable hardware acceleration in Media Foundation backend" WITH_MSMF + VISIBLE_IF WIN32 + VERIFY HAVE_MSMF_DXVA) OCV_OPTION(WITH_XIMEA "Include XIMEA cameras support" OFF VISIBLE_IF NOT ANDROID AND NOT WINRT VERIFY HAVE_XIMEA) @@ -1478,6 +1481,7 @@ endif() if(WITH_MSMF OR HAVE_MSMF) status(" Media Foundation:" HAVE_MSMF THEN YES ELSE NO) + status(" DXVA:" HAVE_MSMF_DXVA THEN YES ELSE NO) endif() if(WITH_XIMEA OR HAVE_XIMEA) diff --git a/cmake/OpenCVFindLibsVideo.cmake b/cmake/OpenCVFindLibsVideo.cmake index b9d15c38b8..c9f593fbda 100644 --- a/cmake/OpenCVFindLibsVideo.cmake +++ b/cmake/OpenCVFindLibsVideo.cmake @@ -274,14 +274,15 @@ endif(WITH_DSHOW) ocv_clear_vars(HAVE_MSMF) if(WITH_MSMF) check_include_file(Mfapi.h HAVE_MSMF) - check_include_file(D3D11.h D3D11_found) - check_include_file(D3d11_4.h D3D11_4_found) - if(D3D11_found AND D3D11_4_found) - set(HAVE_DXVA YES) - else() - set(HAVE_DXVA NO) + set(HAVE_MSMF_DXVA "") + if(WITH_MSMF_DXVA) + check_include_file(D3D11.h D3D11_found) + check_include_file(D3d11_4.h D3D11_4_found) + if(D3D11_found AND D3D11_4_found) + set(HAVE_MSMF_DXVA YES) + endif() endif() -endif(WITH_MSMF) +endif() # --- Extra HighGUI and VideoIO libs on Windows --- if(WIN32) diff --git a/modules/videoio/CMakeLists.txt b/modules/videoio/CMakeLists.txt index f5eba046c5..08c3967706 100644 --- a/modules/videoio/CMakeLists.txt +++ b/modules/videoio/CMakeLists.txt @@ -85,11 +85,11 @@ if (WIN32 AND HAVE_DSHOW) endif() endif() -if (WIN32 AND HAVE_MSMF) +if(WIN32 AND HAVE_MSMF) list(APPEND videoio_srcs ${CMAKE_CURRENT_LIST_DIR}/src/cap_msmf.hpp) list(APPEND videoio_srcs ${CMAKE_CURRENT_LIST_DIR}/src/cap_msmf.cpp) - if (HAVE_DXVA) - add_definitions(-DHAVE_DXVA) + if(HAVE_MSMF_DXVA) + add_definitions(-DHAVE_MSMF_DXVA) endif() endif() diff --git a/modules/videoio/src/cap_msmf.cpp b/modules/videoio/src/cap_msmf.cpp index 3eca95bfa6..0aea9d5a58 100644 --- a/modules/videoio/src/cap_msmf.cpp +++ b/modules/videoio/src/cap_msmf.cpp @@ -55,15 +55,15 @@ #include #include #include -#include +#include #include #include #include #include #include -#ifdef HAVE_DXVA -#include -#include +#ifdef HAVE_MSMF_DXVA +#include +#include #endif #include #include @@ -81,7 +81,7 @@ #pragma comment(lib, "mfuuid") #pragma comment(lib, "Strmiids") #pragma comment(lib, "Mfreadwrite") -#ifdef HAVE_DXVA +#ifdef HAVE_MSMF_DXVA #pragma comment(lib, "d3d11") // MFCreateDXGIDeviceManager() is available since Win8 only. // To avoid OpenCV loading failure on Win7 use dynamic detection of this symbol. 
@@ -710,7 +710,7 @@ protected: cv::String filename; int camid; MSMFCapture_Mode captureMode; -#ifdef HAVE_DXVA +#ifdef HAVE_MSMF_DXVA _ComPtr D3DDev; _ComPtr D3DMgr; #endif @@ -735,7 +735,7 @@ CvCapture_MSMF::CvCapture_MSMF(): filename(""), camid(-1), captureMode(MODE_SW), -#ifdef HAVE_DXVA +#ifdef HAVE_MSMF_DXVA D3DDev(NULL), D3DMgr(NULL), #endif @@ -774,7 +774,7 @@ void CvCapture_MSMF::close() bool CvCapture_MSMF::configureHW(bool enable) { -#ifdef HAVE_DXVA +#ifdef HAVE_MSMF_DXVA if ((enable && D3DMgr && D3DDev) || (!enable && !D3DMgr && !D3DDev)) return true; if (!pMFCreateDXGIDeviceManager_initialized) @@ -971,7 +971,7 @@ bool CvCapture_MSMF::open(int _index) SUCCEEDED(srAttr->SetUINT32(MF_SOURCE_READER_ENABLE_VIDEO_PROCESSING, FALSE)) && SUCCEEDED(srAttr->SetUINT32(MF_SOURCE_READER_ENABLE_ADVANCED_VIDEO_PROCESSING, TRUE))) { -#ifdef HAVE_DXVA +#ifdef HAVE_MSMF_DXVA if (D3DMgr) srAttr->SetUnknown(MF_SOURCE_READER_D3D_MANAGER, D3DMgr.Get()); #endif @@ -1022,7 +1022,7 @@ bool CvCapture_MSMF::open(const cv::String& _filename) SUCCEEDED(srAttr->SetUINT32(MF_SOURCE_READER_ENABLE_ADVANCED_VIDEO_PROCESSING, true)) ) { -#ifdef HAVE_DXVA +#ifdef HAVE_MSMF_DXVA if(D3DMgr) srAttr->SetUnknown(MF_SOURCE_READER_D3D_MANAGER, D3DMgr.Get()); #endif From c1e9a7ee4b3631b82f9119c4a936936d186b3f10 Mon Sep 17 00:00:00 2001 From: Madan Ram Date: Fri, 14 Dec 2018 16:34:58 +0530 Subject: [PATCH 02/13] Update template_matching.markdown Replaced CV_TM_SQDIFF to TM_SQDIFF and the rest since methods are renamed in opencv 3.4 --- .../template_matching/template_matching.markdown | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/doc/tutorials/imgproc/histograms/template_matching/template_matching.markdown b/doc/tutorials/imgproc/histograms/template_matching/template_matching.markdown index c5f22330cf..4567775ba6 100644 --- a/doc/tutorials/imgproc/histograms/template_matching/template_matching.markdown +++ b/doc/tutorials/imgproc/histograms/template_matching/template_matching.markdown @@ -65,7 +65,7 @@ that should be used to find the match. -# **Mask image (M):** The mask, a grayscale image that masks the template -- Only two matching methods currently accept a mask: CV_TM_SQDIFF and CV_TM_CCORR_NORMED (see +- Only two matching methods currently accept a mask: TM_SQDIFF and TM_CCORR_NORMED (see below for explanation of all the matching methods available in opencv). @@ -86,23 +86,23 @@ that should be used to find the match. Good question. OpenCV implements Template matching in the function **matchTemplate()**. 
The available methods are 6: --# **method=CV_TM_SQDIFF** +-# **method=TM_SQDIFF** \f[R(x,y)= \sum _{x',y'} (T(x',y')-I(x+x',y+y'))^2\f] --# **method=CV_TM_SQDIFF_NORMED** +-# **method=TM_SQDIFF_NORMED** \f[R(x,y)= \frac{\sum_{x',y'} (T(x',y')-I(x+x',y+y'))^2}{\sqrt{\sum_{x',y'}T(x',y')^2 \cdot \sum_{x',y'} I(x+x',y+y')^2}}\f] --# **method=CV_TM_CCORR** +-# **method=TM_CCORR** \f[R(x,y)= \sum _{x',y'} (T(x',y') \cdot I(x+x',y+y'))\f] --# **method=CV_TM_CCORR_NORMED** +-# **method=TM_CCORR_NORMED** \f[R(x,y)= \frac{\sum_{x',y'} (T(x',y') \cdot I(x+x',y+y'))}{\sqrt{\sum_{x',y'}T(x',y')^2 \cdot \sum_{x',y'} I(x+x',y+y')^2}}\f] --# **method=CV_TM_CCOEFF** +-# **method=TM_CCOEFF** \f[R(x,y)= \sum _{x',y'} (T'(x',y') \cdot I'(x+x',y+y'))\f] @@ -110,7 +110,7 @@ available methods are 6: \f[\begin{array}{l} T'(x',y')=T(x',y') - 1/(w \cdot h) \cdot \sum _{x'',y''} T(x'',y'') \\ I'(x+x',y+y')=I(x+x',y+y') - 1/(w \cdot h) \cdot \sum _{x'',y''} I(x+x'',y+y'') \end{array}\f] --# **method=CV_TM_CCOEFF_NORMED** +-# **method=TM_CCOEFF_NORMED** \f[R(x,y)= \frac{ \sum_{x',y'} (T'(x',y') \cdot I'(x+x',y+y')) }{ \sqrt{\sum_{x',y'}T'(x',y')^2 \cdot \sum_{x',y'} I'(x+x',y+y')^2} }\f] From d99a4af22997aafd3543cb016bfe8a65e78d10fb Mon Sep 17 00:00:00 2001 From: Rostislav Vasilikhin Date: Fri, 14 Dec 2018 17:01:01 +0300 Subject: [PATCH 03/13] Merge pull request #13379 from savuor:color_5x5 RGB to/from Gray rewritten to wide intrinsics (#13379) * 5x5 to RGB added * RGB25x5 added * Gray2RGB added * Gray2RGB5x5 added * vx_set moved out of loops * RGB5x52Gray added * RGB2Gray written * warnings fixed (int -> (u)short conversion) * warning fixed * warning fixed * "i < n-vsize+1" to "i <= n-vsize" * RGBA2mRGBA vectorized * try to fix ARM builds * fixed ARM build for RGB2RGB5x5 * mRGBA2RGBA: saturation, vectorization * fixed CL implementation of mRGBA2RGBA (saturation added) --- modules/imgproc/src/color_rgb.cpp | 1359 +++++++++++------------ modules/imgproc/src/opencl/color_rgb.cl | 7 +- 2 files changed, 639 insertions(+), 727 deletions(-) diff --git a/modules/imgproc/src/color_rgb.cpp b/modules/imgproc/src/color_rgb.cpp index b027e31fc9..9245f26d05 100644 --- a/modules/imgproc/src/color_rgb.cpp +++ b/modules/imgproc/src/color_rgb.cpp @@ -80,7 +80,7 @@ struct RGB2RGB #if CV_SIMD const int vsize = vt::nlanes; - for(; i < n-vsize+1; + for(; i <= n-vsize; i += vsize, src += vsize*scn, dst += vsize*dcn) { vt a, b, c, d; @@ -111,7 +111,7 @@ struct RGB2RGB { _Tp t0 = src[0], t1 = src[1], t2 = src[2]; dst[bi ] = t0; - dst[1] = t1; + dst[1] = t1; dst[bi^2] = t2; if(dcn == 4) { @@ -133,107 +133,101 @@ struct RGB5x52RGB RGB5x52RGB(int _dstcn, int _blueIdx, int _greenBits) : dstcn(_dstcn), blueIdx(_blueIdx), greenBits(_greenBits) - { - #if CV_NEON - v_n3 = vdupq_n_u16(~3); - v_n7 = vdupq_n_u16(~7); - v_255 = vdupq_n_u8(255); - v_0 = vdupq_n_u8(0); - v_mask = vdupq_n_u16(0x8000); - #endif - } + { } void operator()(const uchar* src, uchar* dst, int n) const { - int dcn = dstcn, bidx = blueIdx, i = 0; - if( greenBits == 6 ) + int dcn = dstcn, bidx = blueIdx, gb = greenBits; + int i = 0; + +#if CV_SIMD + const int vsize = v_uint8::nlanes; + v_uint8 vz = vx_setzero_u8(), vn0 = vx_setall_u8(255); + for(; i <= n-vsize; + i += vsize, src += vsize*sizeof(ushort), dst += vsize*dcn) { - #if CV_NEON - for ( ; i <= n - 16; i += 16, dst += dcn * 16) + v_uint16 t0 = v_reinterpret_as_u16(vx_load(src)); + v_uint16 t1 = v_reinterpret_as_u16(vx_load(src + + sizeof(ushort)*v_uint16::nlanes)); + + //TODO: shorten registers use when v_interleave is 
available + v_uint8 r, g, b, a; + v_uint16 b0 = (t0 << 11) >> 8; + v_uint16 b1 = (t1 << 11) >> 8; + b = v_pack(b0, b1); + + v_uint16 g0, g1, r0, r1, a0, a1; + + if( gb == 6 ) + { + g0 = ((t0 >> 5) << 10) >> 8; + g1 = ((t1 >> 5) << 10) >> 8; + + r0 = (t0 >> 11) << 3; + r1 = (t1 >> 11) << 3; + + a = vn0; + } + else + { + g0 = ((t0 >> 5) << 11) >> 8; + g1 = ((t1 >> 5) << 11) >> 8; + + r0 = ((t0 >> 10) << 11) >> 8; + r1 = ((t1 >> 10) << 11) >> 8; + + a0 = t0 >> 15; + a1 = t1 >> 15; + a = v_pack(a0, a1); + a = a != vz; + } + g = v_pack(g0, g1); + r = v_pack(r0, r1); + + if(bidx == 2) + swap(b, r); + + if(dcn == 4) { - uint16x8_t v_src0 = vld1q_u16((const ushort *)src + i), v_src1 = vld1q_u16((const ushort *)src + i + 8); - uint8x16_t v_b = vcombine_u8(vmovn_u16(vshlq_n_u16(v_src0, 3)), vmovn_u16(vshlq_n_u16(v_src1, 3))); - uint8x16_t v_g = vcombine_u8(vmovn_u16(vandq_u16(vshrq_n_u16(v_src0, 3), v_n3)), - vmovn_u16(vandq_u16(vshrq_n_u16(v_src1, 3), v_n3))); - uint8x16_t v_r = vcombine_u8(vmovn_u16(vandq_u16(vshrq_n_u16(v_src0, 8), v_n7)), - vmovn_u16(vandq_u16(vshrq_n_u16(v_src1, 8), v_n7))); - if (dcn == 3) - { - uint8x16x3_t v_dst; - v_dst.val[bidx] = v_b; - v_dst.val[1] = v_g; - v_dst.val[bidx^2] = v_r; - vst3q_u8(dst, v_dst); - } - else - { - uint8x16x4_t v_dst; - v_dst.val[bidx] = v_b; - v_dst.val[1] = v_g; - v_dst.val[bidx^2] = v_r; - v_dst.val[3] = v_255; - vst4q_u8(dst, v_dst); - } + v_store_interleave(dst, b, g, r, a); } - #endif - for( ; i < n; i++, dst += dcn ) + else { - unsigned t = ((const ushort*)src)[i]; - dst[bidx] = (uchar)(t << 3); - dst[1] = (uchar)((t >> 3) & ~3); - dst[bidx ^ 2] = (uchar)((t >> 8) & ~7); - if( dcn == 4 ) - dst[3] = 255; + v_store_interleave(dst, b, g, r); } } - else + vx_cleanup(); +#endif + + for( ; i < n; i++, src += sizeof(ushort), dst += dcn ) { - #if CV_NEON - for ( ; i <= n - 16; i += 16, dst += dcn * 16) + unsigned t = ((const ushort*)src)[0]; + uchar b, g, r, a; + + b = (uchar)(t << 3); + + if( gb == 6 ) { - uint16x8_t v_src0 = vld1q_u16((const ushort *)src + i), v_src1 = vld1q_u16((const ushort *)src + i + 8); - uint8x16_t v_b = vcombine_u8(vmovn_u16(vshlq_n_u16(v_src0, 3)), vmovn_u16(vshlq_n_u16(v_src1, 3))); - uint8x16_t v_g = vcombine_u8(vmovn_u16(vandq_u16(vshrq_n_u16(v_src0, 2), v_n7)), - vmovn_u16(vandq_u16(vshrq_n_u16(v_src1, 2), v_n7))); - uint8x16_t v_r = vcombine_u8(vmovn_u16(vandq_u16(vshrq_n_u16(v_src0, 7), v_n7)), - vmovn_u16(vandq_u16(vshrq_n_u16(v_src1, 7), v_n7))); - if (dcn == 3) - { - uint8x16x3_t v_dst; - v_dst.val[bidx] = v_b; - v_dst.val[1] = v_g; - v_dst.val[bidx^2] = v_r; - vst3q_u8(dst, v_dst); - } - else - { - uint8x16x4_t v_dst; - v_dst.val[bidx] = v_b; - v_dst.val[1] = v_g; - v_dst.val[bidx^2] = v_r; - v_dst.val[3] = vbslq_u8(vcombine_u8(vqmovn_u16(vandq_u16(v_src0, v_mask)), - vqmovn_u16(vandq_u16(v_src1, v_mask))), v_255, v_0); - vst4q_u8(dst, v_dst); - } + g = (uchar)((t >> 3) & ~3); + r = (uchar)((t >> 8) & ~7); + a = 255; } - #endif - for( ; i < n; i++, dst += dcn ) + else { - unsigned t = ((const ushort*)src)[i]; - dst[bidx] = (uchar)(t << 3); - dst[1] = (uchar)((t >> 2) & ~7); - dst[bidx ^ 2] = (uchar)((t >> 7) & ~7); - if( dcn == 4 ) - dst[3] = t & 0x8000 ? 
255 : 0; + g = (uchar)((t >> 2) & ~7); + r = (uchar)((t >> 7) & ~7); + a = (uchar)(((t & 0x8000) >> 15) * 255); } + + dst[bidx] = b; + dst[1] = g; + dst[bidx ^ 2] = r; + if( dcn == 4 ) + dst[3] = a; } } int dstcn, blueIdx, greenBits; - #if CV_NEON - uint16x8_t v_n3, v_n7, v_mask; - uint8x16_t v_255, v_0; - #endif }; @@ -243,117 +237,131 @@ struct RGB2RGB5x5 RGB2RGB5x5(int _srccn, int _blueIdx, int _greenBits) : srccn(_srccn), blueIdx(_blueIdx), greenBits(_greenBits) - { - #if CV_NEON - v_n3 = vdup_n_u8(~3); - v_n7 = vdup_n_u8(~7); - v_mask = vdupq_n_u16(0x8000); - v_0 = vdupq_n_u16(0); - v_full = vdupq_n_u16(0xffff); - #endif - } + { } void operator()(const uchar* src, uchar* dst, int n) const { - int scn = srccn, bidx = blueIdx, i = 0; - if (greenBits == 6) + int scn = srccn, bidx = blueIdx, gb = greenBits; + int i = 0; + +#if CV_SIMD + const int vsize = v_uint8::nlanes; + v_uint16 vn3 = vx_setall_u16((ushort)(~3)); + v_uint16 vn7 = vx_setall_u16((ushort)(~7)); + v_uint16 vz = vx_setzero_u16(); + v_uint8 v7 = vx_setall_u8((uchar)(~7)); + for(; i <= n-vsize; + i += vsize, src += vsize*scn, dst += vsize*sizeof(ushort)) { - if (scn == 3) + v_uint8 r, g, b, a; + if(scn == 3) { - #if CV_NEON - for ( ; i <= n - 8; i += 8, src += 24 ) - { - uint8x8x3_t v_src = vld3_u8(src); - uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src.val[bidx], 3)); - v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[1], v_n3)), 3)); - v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[bidx^2], v_n7)), 8)); - vst1q_u16((ushort *)dst + i, v_dst); - } - #endif - for ( ; i < n; i++, src += 3 ) - ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~3) << 3)|((src[bidx^2]&~7) << 8)); + v_load_deinterleave(src, b, g, r); + a = vx_setzero_u8(); } else { - #if CV_NEON - for ( ; i <= n - 8; i += 8, src += 32 ) - { - uint8x8x4_t v_src = vld4_u8(src); - uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src.val[bidx], 3)); - v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[1], v_n3)), 3)); - v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[bidx^2], v_n7)), 8)); - vst1q_u16((ushort *)dst + i, v_dst); - } - #endif - for ( ; i < n; i++, src += 4 ) - ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~3) << 3)|((src[bidx^2]&~7) << 8)); + v_load_deinterleave(src, b, g, r, a); } - } - else if (scn == 3) - { - #if CV_NEON - for ( ; i <= n - 8; i += 8, src += 24 ) + if(bidx == 2) + swap(b, r); + + r = r & v7; + + //TODO: shorten registers use when v_deinterleave is available + v_uint16 r0, r1, g0, g1, b0, b1, a0, a1; + v_expand(r, r0, r1); + v_expand(g, g0, g1); + v_expand(b, b0, b1); + v_expand(a, a0, a1); + + v_uint16 d0, d1; + + b0 = b0 >> 3; + b1 = b1 >> 3; + a0 = (a0 != vz) << 15; + a1 = (a1 != vz) << 15; + + if(gb == 6) { - uint8x8x3_t v_src = vld3_u8(src); - uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src.val[bidx], 3)); - v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[1], v_n7)), 2)); - v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[bidx^2], v_n7)), 7)); - vst1q_u16((ushort *)dst + i, v_dst); + d0 = b0 | ((g0 & vn3) << 3) | (r0 << 8); + d1 = b1 | ((g1 & vn3) << 3) | (r1 << 8); } - #endif - for ( ; i < n; i++, src += 3 ) - ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~7) << 2)|((src[bidx^2]&~7) << 7)); + else + { + d0 = b0 | ((g0 & vn7) << 2) | (r0 << 7) | a0; + d1 = b1 | ((g1 & vn7) << 2) | (r1 << 7) | a1; + } + + v_store((ushort*)dst, d0); + v_store(((ushort*)dst) + vsize/2, d1); } - else + vx_cleanup(); +#endif + for ( ; i < n; i++, src += scn, 
dst += sizeof(ushort) ) { - #if CV_NEON - for ( ; i <= n - 8; i += 8, src += 32 ) + uchar r = src[bidx^2]; + uchar g = src[1]; + uchar b = src[bidx]; + uchar a = scn == 4 ? src[3] : 0; + + ushort d; + if (gb == 6) + { + d = (ushort)((b >> 3)|((g & ~3) << 3)|((r & ~7) << 8)); + } + else { - uint8x8x4_t v_src = vld4_u8(src); - uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src.val[bidx], 3)); - v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[1], v_n7)), 2)); - v_dst = vorrq_u16(v_dst, vorrq_u16(vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[bidx^2], v_n7)), 7), - vbslq_u16(veorq_u16(vceqq_u16(vmovl_u8(v_src.val[3]), v_0), v_full), v_mask, v_0))); - vst1q_u16((ushort *)dst + i, v_dst); + d = (ushort)((b >> 3)|((g & ~7) << 2)|((r & ~7) << 7)|(a ? 0x8000 : 0)); } - #endif - for ( ; i < n; i++, src += 4 ) - ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~7) << 2)| - ((src[bidx^2]&~7) << 7)|(src[3] ? 0x8000 : 0)); + ((ushort*)dst)[0] = d; } } int srccn, blueIdx, greenBits; - #if CV_NEON - uint8x8_t v_n3, v_n7; - uint16x8_t v_mask, v_0, v_full; - #endif }; + ///////////////////////////////// Color to/from Grayscale //////////////////////////////// template struct Gray2RGB { typedef _Tp channel_type; + typedef typename v_type<_Tp>::t vt; Gray2RGB(int _dstcn) : dstcn(_dstcn) {} void operator()(const _Tp* src, _Tp* dst, int n) const { - if( dstcn == 3 ) - for( int i = 0; i < n; i++, dst += 3 ) + int dcn = dstcn; + int i = 0; + _Tp alpha = ColorChannel<_Tp>::max(); + +#if CV_SIMD + const int vsize = vt::nlanes; + vt valpha = v_set<_Tp>::set(alpha); + for(; i <= n-vsize; + i += vsize, src += vsize, dst += vsize*dcn) + { + vt g = vx_load(src); + + if(dcn == 3) { - dst[0] = dst[1] = dst[2] = src[i]; + v_store_interleave(dst, g, g, g); } - else - { - _Tp alpha = ColorChannel<_Tp>::max(); - for( int i = 0; i < n; i++, dst += 4 ) + else { - dst[0] = dst[1] = dst[2] = src[i]; - dst[3] = alpha; + v_store_interleave(dst, g, g, g, valpha); } } + vx_cleanup(); +#endif + for ( ; i < n; i++, src++, dst += dcn ) + { + dst[0] = dst[1] = dst[2] = src[0]; + if(dcn == 4) + dst[3] = alpha; + } } int dstcn; @@ -365,104 +373,55 @@ struct Gray2RGB5x5 typedef uchar channel_type; Gray2RGB5x5(int _greenBits) : greenBits(_greenBits) - { - #if CV_NEON - v_n7 = vdup_n_u8(~7); - v_n3 = vdup_n_u8(~3); - #elif CV_SSE2 - haveSIMD = checkHardwareSupport(CV_CPU_SSE2); - v_n7 = _mm_set1_epi16(~7); - v_n3 = _mm_set1_epi16(~3); - v_zero = _mm_setzero_si128(); - #endif - } + { } void operator()(const uchar* src, uchar* dst, int n) const { + int gb = greenBits; int i = 0; - if( greenBits == 6 ) +#if CV_SIMD + const int vsize = v_uint16::nlanes; + v_uint16 v3 = vx_setall_u16((ushort)(~3)); + for(; i <= n-vsize; + i += vsize, src += vsize, dst += vsize*sizeof(ushort)) { - #if CV_NEON - for ( ; i <= n - 8; i += 8 ) - { - uint8x8_t v_src = vld1_u8(src + i); - uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src, 3)); - v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src, v_n3)), 3)); - v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src, v_n7)), 8)); - vst1q_u16((ushort *)dst + i, v_dst); - } - #elif CV_SSE2 - if (haveSIMD) + v_uint8 t8 = vx_load_low(src); + v_uint16 t = v_expand_low(t8); + + v_uint16 t3 = t >> 3; + + v_uint16 d = t3; + if(gb == 6) { - for ( ; i <= n - 16; i += 16 ) - { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + i)); - - __m128i v_src_p = _mm_unpacklo_epi8(v_src, v_zero); - __m128i v_dst = _mm_or_si128(_mm_srli_epi16(v_src_p, 3), - _mm_or_si128(_mm_slli_epi16(_mm_and_si128(v_src_p, v_n3), 3), - 
_mm_slli_epi16(_mm_and_si128(v_src_p, v_n7), 8))); - _mm_storeu_si128((__m128i *)((ushort *)dst + i), v_dst); - - v_src_p = _mm_unpackhi_epi8(v_src, v_zero); - v_dst = _mm_or_si128(_mm_srli_epi16(v_src_p, 3), - _mm_or_si128(_mm_slli_epi16(_mm_and_si128(v_src_p, v_n3), 3), - _mm_slli_epi16(_mm_and_si128(v_src_p, v_n7), 8))); - _mm_storeu_si128((__m128i *)((ushort *)dst + i + 8), v_dst); - } + d |= ((t & v3) << 3) | (t3 << 11); } - #endif - for ( ; i < n; i++ ) + else { - int t = src[i]; - ((ushort*)dst)[i] = (ushort)((t >> 3)|((t & ~3) << 3)|((t & ~7) << 8)); + d |= (t3 << 5) | (t3 << 10); } + + v_store((ushort*)dst, d); } - else + vx_cleanup(); +#endif + + for( ; i < n; i++, src++, dst += sizeof(ushort)) { - #if CV_NEON - for ( ; i <= n - 8; i += 8 ) + int t = src[0]; + int t3 = t >> 3; + ushort d; + if( gb == 6 ) { - uint16x8_t v_src = vmovl_u8(vshr_n_u8(vld1_u8(src + i), 3)); - uint16x8_t v_dst = vorrq_u16(vorrq_u16(v_src, vshlq_n_u16(v_src, 5)), vshlq_n_u16(v_src, 10)); - vst1q_u16((ushort *)dst + i, v_dst); + d = (ushort)(t3 |((t & ~3) << 3)|(t3 << 11)); } - #elif CV_SSE2 - if (haveSIMD) - { - for ( ; i <= n - 16; i += 8 ) - { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + i)); - - __m128i v_src_p = _mm_srli_epi16(_mm_unpacklo_epi8(v_src, v_zero), 3); - __m128i v_dst = _mm_or_si128(v_src_p, - _mm_or_si128(_mm_slli_epi32(v_src_p, 5), - _mm_slli_epi16(v_src_p, 10))); - _mm_storeu_si128((__m128i *)((ushort *)dst + i), v_dst); - - v_src_p = _mm_srli_epi16(_mm_unpackhi_epi8(v_src, v_zero), 3); - v_dst = _mm_or_si128(v_src_p, - _mm_or_si128(_mm_slli_epi16(v_src_p, 5), - _mm_slli_epi16(v_src_p, 10))); - _mm_storeu_si128((__m128i *)((ushort *)dst + i + 8), v_dst); - } - } - #endif - for( ; i < n; i++ ) + else { - int t = src[i] >> 3; - ((ushort*)dst)[i] = (ushort)(t|(t << 5)|(t << 10)); + d = (ushort)(t3 |(t3 << 5)|(t3 << 10)); } + ((ushort*)dst)[0] = d; } } int greenBits; - - #if CV_NEON - uint8x8_t v_n7, v_n3; - #elif CV_SSE2 - __m128i v_n7, v_n3, v_zero; - bool haveSIMD; - #endif }; @@ -470,156 +429,96 @@ struct RGB5x52Gray { typedef uchar channel_type; + // can be changed to 15-shift coeffs + static const int BY = B2Y; + static const int GY = G2Y; + static const int RY = R2Y; + static const int shift = yuv_shift; + RGB5x52Gray(int _greenBits) : greenBits(_greenBits) { - #if CV_NEON - v_b2y = vdup_n_u16(B2Y); - v_g2y = vdup_n_u16(G2Y); - v_r2y = vdup_n_u16(R2Y); - v_delta = vdupq_n_u32(1 << (yuv_shift - 1)); - v_f8 = vdupq_n_u16(0xf8); - v_fc = vdupq_n_u16(0xfc); - #elif CV_SSE2 - haveSIMD = checkHardwareSupport(CV_CPU_SSE2); - const __m128i v_b2y = _mm_set1_epi16(B2Y); - const __m128i v_g2y = _mm_set1_epi16(G2Y); - v_bg2y = _mm_unpacklo_epi16(v_b2y, v_g2y); - const __m128i v_r2y = _mm_set1_epi16(R2Y); - const __m128i v_one = _mm_set1_epi16(1); - v_rd2y = _mm_unpacklo_epi16(v_r2y, v_one); - v_delta = _mm_slli_epi16(v_one, yuv_shift - 1); - #endif + CV_Assert(BY + GY + RY == (1 << shift)); } void operator()(const uchar* src, uchar* dst, int n) const { + int gb = greenBits; int i = 0; - if( greenBits == 6 ) +#if CV_SIMD + const int vsize = v_uint16::nlanes; + + v_int16 bg2y; + v_int16 r12y; + v_int16 dummy; + v_zip(vx_setall_s16(BY), vx_setall_s16(GY), bg2y, dummy); + v_zip(vx_setall_s16(RY), vx_setall_s16( 1), r12y, dummy); + v_int16 delta = vx_setall_s16(1 << (shift-1)); + + for(; i <= n-vsize; + i += vsize, src += vsize*sizeof(ushort), dst += vsize) { - #if CV_NEON - for ( ; i <= n - 8; i += 8) - { - uint16x8_t v_src = vld1q_u16((ushort *)src + i); - uint16x8_t v_t0 = 
vandq_u16(vshlq_n_u16(v_src, 3), v_f8), - v_t1 = vandq_u16(vshrq_n_u16(v_src, 3), v_fc), - v_t2 = vandq_u16(vshrq_n_u16(v_src, 8), v_f8); - - uint32x4_t v_dst0 = vmlal_u16(vmlal_u16(vmull_u16(vget_low_u16(v_t0), v_b2y), - vget_low_u16(v_t1), v_g2y), vget_low_u16(v_t2), v_r2y); - uint32x4_t v_dst1 = vmlal_u16(vmlal_u16(vmull_u16(vget_high_u16(v_t0), v_b2y), - vget_high_u16(v_t1), v_g2y), vget_high_u16(v_t2), v_r2y); - v_dst0 = vshrq_n_u32(vaddq_u32(v_dst0, v_delta), yuv_shift); - v_dst1 = vshrq_n_u32(vaddq_u32(v_dst1, v_delta), yuv_shift); - - vst1_u8(dst + i, vmovn_u16(vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1)))); - } - #elif CV_SSE2 - if (haveSIMD) + v_uint16 t = vx_load((ushort*)src); + + v_uint16 r, g, b; + b = (t << 11) >> 8; + + if(gb == 5) { - for ( ; i <= n - 8; i += 8) - { - __m128i v_src = _mm_loadu_si128((__m128i const *)((ushort *)src + i)); - __m128i v_b = _mm_srli_epi16(_mm_slli_epi16(v_src, 11), 8), - v_g = _mm_srli_epi16(_mm_slli_epi16(_mm_srli_epi16(v_src, 5), 10),8), - v_r = _mm_slli_epi16(_mm_srli_epi16(v_src, 11), 3); - - __m128i v_bg_lo = _mm_unpacklo_epi16(v_b, v_g); - __m128i v_rd_lo = _mm_unpacklo_epi16(v_r, v_delta); - __m128i v_bg_hi = _mm_unpackhi_epi16(v_b, v_g); - __m128i v_rd_hi = _mm_unpackhi_epi16(v_r, v_delta); - v_bg_lo = _mm_madd_epi16(v_bg_lo, v_bg2y); - v_rd_lo = _mm_madd_epi16(v_rd_lo, v_rd2y); - v_bg_hi = _mm_madd_epi16(v_bg_hi, v_bg2y); - v_rd_hi = _mm_madd_epi16(v_rd_hi, v_rd2y); - - __m128i v_bgr_lo = _mm_add_epi32(v_bg_lo, v_rd_lo); - __m128i v_bgr_hi = _mm_add_epi32(v_bg_hi, v_rd_hi); - v_bgr_lo = _mm_srli_epi32(v_bgr_lo, yuv_shift); - v_bgr_hi = _mm_srli_epi32(v_bgr_hi, yuv_shift); - - __m128i v_dst = _mm_packs_epi32(v_bgr_lo, v_bgr_hi); - v_dst = _mm_packus_epi16(v_dst, v_dst); - _mm_storel_epi64((__m128i *)(dst + i), v_dst); - } + g = ((t >> 5) << 11) >> 8; + r = ((t >> 10) << 11) >> 8; } - #endif - for ( ; i < n; i++) + else { - int t = ((ushort*)src)[i]; - dst[i] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y + - ((t >> 3) & 0xfc)*G2Y + - ((t >> 8) & 0xf8)*R2Y, yuv_shift); + g = ((t >> 5) << 10) >> 8; + r = (t >> 11) << 3; } + + v_uint8 d; + v_uint16 dx; + + v_int16 sr = v_reinterpret_as_s16(r); + v_int16 sg = v_reinterpret_as_s16(g); + v_int16 sb = v_reinterpret_as_s16(b); + + v_int16 bg0, bg1; + v_int16 rd0, rd1; + v_zip(sb, sg, bg0, bg1); + v_zip(sr, delta, rd0, rd1); + + v_uint32 d0, d1; + d0 = v_reinterpret_as_u32(v_dotprod(bg0, bg2y) + v_dotprod(rd0, r12y)); + d1 = v_reinterpret_as_u32(v_dotprod(bg1, bg2y) + v_dotprod(rd1, r12y)); + + d0 = d0 >> shift; + d1 = d1 >> shift; + + dx = v_pack(d0, d1); + // high part isn't used + d = v_pack(dx, dx); + + v_store_low(dst, d); } - else + vx_cleanup(); +#endif + for( ; i < n; i++, src += sizeof(ushort), dst++) { - #if CV_NEON - for ( ; i <= n - 8; i += 8) + int t = ((ushort*)src)[0]; + uchar r, g, b; + b = (t << 3) & 0xf8; + if( gb == 6 ) { - uint16x8_t v_src = vld1q_u16((ushort *)src + i); - uint16x8_t v_t0 = vandq_u16(vshlq_n_u16(v_src, 3), v_f8), - v_t1 = vandq_u16(vshrq_n_u16(v_src, 2), v_f8), - v_t2 = vandq_u16(vshrq_n_u16(v_src, 7), v_f8); - - uint32x4_t v_dst0 = vmlal_u16(vmlal_u16(vmull_u16(vget_low_u16(v_t0), v_b2y), - vget_low_u16(v_t1), v_g2y), vget_low_u16(v_t2), v_r2y); - uint32x4_t v_dst1 = vmlal_u16(vmlal_u16(vmull_u16(vget_high_u16(v_t0), v_b2y), - vget_high_u16(v_t1), v_g2y), vget_high_u16(v_t2), v_r2y); - v_dst0 = vshrq_n_u32(vaddq_u32(v_dst0, v_delta), yuv_shift); - v_dst1 = vshrq_n_u32(vaddq_u32(v_dst1, v_delta), yuv_shift); - - vst1_u8(dst + i, 
vmovn_u16(vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1)))); + g = (t >> 3) & 0xfc; + r = (t >> 8) & 0xf8; } - #elif CV_SSE2 - if (haveSIMD) - { - for ( ; i <= n - 8; i += 8) - { - __m128i v_src = _mm_loadu_si128((__m128i const *)((ushort *)src + i)); - __m128i v_b = _mm_srli_epi16(_mm_slli_epi16(v_src, 11), 8), - v_g = _mm_srli_epi16(_mm_slli_epi16(_mm_srli_epi16(v_src, 5), 11),8), - v_r = _mm_srli_epi16(_mm_slli_epi16(_mm_srli_epi16(v_src, 10), 11),8); - - __m128i v_bg_lo = _mm_unpacklo_epi16(v_b, v_g); - __m128i v_rd_lo = _mm_unpacklo_epi16(v_r, v_delta); - __m128i v_bg_hi = _mm_unpackhi_epi16(v_b, v_g); - __m128i v_rd_hi = _mm_unpackhi_epi16(v_r, v_delta); - v_bg_lo = _mm_madd_epi16(v_bg_lo, v_bg2y); - v_rd_lo = _mm_madd_epi16(v_rd_lo, v_rd2y); - v_bg_hi = _mm_madd_epi16(v_bg_hi, v_bg2y); - v_rd_hi = _mm_madd_epi16(v_rd_hi, v_rd2y); - - __m128i v_bgr_lo = _mm_add_epi32(v_bg_lo, v_rd_lo); - __m128i v_bgr_hi = _mm_add_epi32(v_bg_hi, v_rd_hi); - v_bgr_lo = _mm_srli_epi32(v_bgr_lo, yuv_shift); - v_bgr_hi = _mm_srli_epi32(v_bgr_hi, yuv_shift); - - __m128i v_dst = _mm_packs_epi32(v_bgr_lo, v_bgr_hi); - v_dst = _mm_packus_epi16(v_dst, v_dst); - _mm_storel_epi64((__m128i *)(dst + i), v_dst); - } - } - #endif - for ( ; i < n; i++) + else { - int t = ((ushort*)src)[i]; - dst[i] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y + - ((t >> 2) & 0xf8)*G2Y + - ((t >> 7) & 0xf8)*R2Y, yuv_shift); + g = (t >> 2) & 0xf8; + r = (t >> 7) & 0xf8; } + dst[0] = (uchar)CV_DESCALE(b*BY + g*GY + r*RY, shift); } } int greenBits; - - #if CV_NEON - uint16x4_t v_b2y, v_g2y, v_r2y; - uint32x4_t v_delta; - uint16x8_t v_f8, v_fc; - #elif CV_SSE2 - bool haveSIMD; - __m128i v_bg2y, v_rd2y; - __m128i v_delta; - #endif }; @@ -646,128 +545,6 @@ template struct RGB2Gray float coeffs[3]; }; -template<> struct RGB2Gray -{ - typedef uchar channel_type; - - RGB2Gray(int _srccn, int blueIdx, const int* coeffs) : srccn(_srccn) - { - const int coeffs0[] = { R2Y, G2Y, B2Y }; - if(!coeffs) coeffs = coeffs0; - - int b = 0, g = 0, r = (1 << (yuv_shift-1)); - int db = coeffs[blueIdx^2], dg = coeffs[1], dr = coeffs[blueIdx]; - - for( int i = 0; i < 256; i++, b += db, g += dg, r += dr ) - { - tab[i] = b; - tab[i+256] = g; - tab[i+512] = r; - } - } - void operator()(const uchar* src, uchar* dst, int n) const - { - int scn = srccn; - const int* _tab = tab; - for(int i = 0; i < n; i++, src += scn) - dst[i] = (uchar)((_tab[src[0]] + _tab[src[1]+256] + _tab[src[2]+512]) >> yuv_shift); - } - int srccn; - int tab[256*3]; -}; - -#if CV_NEON - -template <> -struct RGB2Gray -{ - typedef ushort channel_type; - - RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) : - srccn(_srccn) - { - static const int coeffs0[] = { R2Y, G2Y, B2Y }; - memcpy(coeffs, _coeffs ? 
_coeffs : coeffs0, 3*sizeof(coeffs[0])); - if( blueIdx == 0 ) - std::swap(coeffs[0], coeffs[2]); - - v_cb = vdup_n_u16(coeffs[0]); - v_cg = vdup_n_u16(coeffs[1]); - v_cr = vdup_n_u16(coeffs[2]); - v_delta = vdupq_n_u32(1 << (yuv_shift - 1)); - } - - void operator()(const ushort* src, ushort* dst, int n) const - { - int scn = srccn, cb = coeffs[0], cg = coeffs[1], cr = coeffs[2], i = 0; - - for ( ; i <= n - 8; i += 8, src += scn * 8) - { - uint16x8_t v_b, v_r, v_g; - if (scn == 3) - { - uint16x8x3_t v_src = vld3q_u16(src); - v_b = v_src.val[0]; - v_g = v_src.val[1]; - v_r = v_src.val[2]; - } - else - { - uint16x8x4_t v_src = vld4q_u16(src); - v_b = v_src.val[0]; - v_g = v_src.val[1]; - v_r = v_src.val[2]; - } - - uint32x4_t v_dst0_ = vmlal_u16(vmlal_u16( - vmull_u16(vget_low_u16(v_b), v_cb), - vget_low_u16(v_g), v_cg), - vget_low_u16(v_r), v_cr); - uint32x4_t v_dst1_ = vmlal_u16(vmlal_u16( - vmull_u16(vget_high_u16(v_b), v_cb), - vget_high_u16(v_g), v_cg), - vget_high_u16(v_r), v_cr); - - uint16x4_t v_dst0 = vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst0_, v_delta), yuv_shift)); - uint16x4_t v_dst1 = vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst1_, v_delta), yuv_shift)); - - vst1q_u16(dst + i, vcombine_u16(v_dst0, v_dst1)); - } - - for ( ; i <= n - 4; i += 4, src += scn * 4) - { - uint16x4_t v_b, v_r, v_g; - if (scn == 3) - { - uint16x4x3_t v_src = vld3_u16(src); - v_b = v_src.val[0]; - v_g = v_src.val[1]; - v_r = v_src.val[2]; - } - else - { - uint16x4x4_t v_src = vld4_u16(src); - v_b = v_src.val[0]; - v_g = v_src.val[1]; - v_r = v_src.val[2]; - } - - uint32x4_t v_dst = vmlal_u16(vmlal_u16( - vmull_u16(v_b, v_cb), - v_g, v_cg), - v_r, v_cr); - - vst1_u16(dst + i, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst, v_delta), yuv_shift))); - } - - for( ; i < n; i++, src += scn) - dst[i] = (ushort)CV_DESCALE((unsigned)(src[0]*cb + src[1]*cg + src[2]*cr), yuv_shift); - } - - int srccn, coeffs[3]; - uint16x4_t v_cb, v_cg, v_cr; - uint32x4_t v_delta; -}; template <> struct RGB2Gray @@ -777,13 +554,12 @@ struct RGB2Gray RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn) { static const float coeffs0[] = { R2YF, G2YF, B2YF }; - memcpy( coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]) ); + for(int i = 0; i < 3; i++) + { + coeffs[i] = _coeffs ? 
_coeffs[i] : coeffs0[i]; + } if(blueIdx == 0) std::swap(coeffs[0], coeffs[2]); - - v_cb = vdupq_n_f32(coeffs[0]); - v_cg = vdupq_n_f32(coeffs[1]); - v_cr = vdupq_n_f32(coeffs[2]); } void operator()(const float * src, float * dst, int n) const @@ -791,297 +567,225 @@ struct RGB2Gray int scn = srccn, i = 0; float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2]; - if (scn == 3) +#if CV_SIMD + const int vsize = v_float32::nlanes; + v_float32 rv = vx_setall_f32(cr), gv = vx_setall_f32(cg), bv = vx_setall_f32(cb); + for(; i <= n-vsize; + i += vsize, src += vsize*scn, dst += vsize) { - for ( ; i <= n - 8; i += 8, src += scn * 8) + v_float32 r, g, b, a; + if(scn == 3) { - float32x4x3_t v_src = vld3q_f32(src); - vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr)); - - v_src = vld3q_f32(src + scn * 4); - vst1q_f32(dst + i + 4, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr)); + v_load_deinterleave(src, b, g, r); } - - for ( ; i <= n - 4; i += 4, src += scn * 4) + else { - float32x4x3_t v_src = vld3q_f32(src); - vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr)); + v_load_deinterleave(src, b, g, r, a); } - } - else - { - for ( ; i <= n - 8; i += 8, src += scn * 8) - { - float32x4x4_t v_src = vld4q_f32(src); - vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr)); - v_src = vld4q_f32(src + scn * 4); - vst1q_f32(dst + i + 4, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr)); - } + v_float32 d = v_fma(r, rv, v_fma(g, gv, b*bv)); - for ( ; i <= n - 4; i += 4, src += scn * 4) - { - float32x4x4_t v_src = vld4q_f32(src); - vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr)); - } + v_store(dst, d); } + vx_cleanup(); +#endif - for ( ; i < n; i++, src += scn) - dst[i] = src[0]*cb + src[1]*cg + src[2]*cr; + for ( ; i < n; i++, src += scn, dst++) + dst[0] = src[0]*cb + src[1]*cg + src[2]*cr; } int srccn; float coeffs[3]; - float32x4_t v_cb, v_cg, v_cr; }; -#elif CV_SSE2 - -#if CV_SSE4_1 - -template <> -struct RGB2Gray +template<> +struct RGB2Gray { - typedef ushort channel_type; + typedef uchar channel_type; - RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) : - srccn(_srccn) + // can be changed to 15-shift coeffs + static const int BY = B2Y; + static const int GY = G2Y; + static const int RY = R2Y; + static const int shift = yuv_shift; + + RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) : srccn(_srccn) { - static const int coeffs0[] = { R2Y, G2Y, B2Y }; - memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0])); - if( blueIdx == 0 ) + const int coeffs0[] = { RY, GY, BY }; + for(int i = 0; i < 3; i++) + coeffs[i] = (short)(_coeffs ? 
_coeffs[i] : coeffs0[i]); + if(blueIdx == 0) std::swap(coeffs[0], coeffs[2]); - v_delta = _mm_set1_epi32(1 << (yuv_shift - 1)); - v_zero = _mm_setzero_si128(); - - haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); + CV_Assert(coeffs[0] + coeffs[1] + coeffs[2] == (1 << shift)); } - // 16s x 8 - void process(__m128i* v_rgb, __m128i* v_coeffs, - __m128i & v_gray) const + void operator()(const uchar* src, uchar* dst, int n) const { - __m128i v_rgb_hi[4]; - v_rgb_hi[0] = _mm_cmplt_epi16(v_rgb[0], v_zero); - v_rgb_hi[1] = _mm_cmplt_epi16(v_rgb[1], v_zero); - v_rgb_hi[2] = _mm_cmplt_epi16(v_rgb[2], v_zero); - v_rgb_hi[3] = _mm_cmplt_epi16(v_rgb[3], v_zero); - - v_rgb_hi[0] = _mm_and_si128(v_rgb_hi[0], v_coeffs[1]); - v_rgb_hi[1] = _mm_and_si128(v_rgb_hi[1], v_coeffs[1]); - v_rgb_hi[2] = _mm_and_si128(v_rgb_hi[2], v_coeffs[1]); - v_rgb_hi[3] = _mm_and_si128(v_rgb_hi[3], v_coeffs[1]); - - v_rgb_hi[0] = _mm_hadd_epi16(v_rgb_hi[0], v_rgb_hi[1]); - v_rgb_hi[2] = _mm_hadd_epi16(v_rgb_hi[2], v_rgb_hi[3]); - v_rgb_hi[0] = _mm_hadd_epi16(v_rgb_hi[0], v_rgb_hi[2]); - - v_rgb[0] = _mm_madd_epi16(v_rgb[0], v_coeffs[0]); - v_rgb[1] = _mm_madd_epi16(v_rgb[1], v_coeffs[0]); - v_rgb[2] = _mm_madd_epi16(v_rgb[2], v_coeffs[0]); - v_rgb[3] = _mm_madd_epi16(v_rgb[3], v_coeffs[0]); + int scn = srccn; + short cb = coeffs[0], cg = coeffs[1], cr = coeffs[2]; + int i = 0; - v_rgb[0] = _mm_hadd_epi32(v_rgb[0], v_rgb[1]); - v_rgb[2] = _mm_hadd_epi32(v_rgb[2], v_rgb[3]); +#if CV_SIMD + const int vsize = v_uint8::nlanes; + v_int16 bg2y; + v_int16 r12y; + v_int16 dummy; + v_zip(vx_setall_s16(cb), vx_setall_s16(cg), bg2y, dummy); + v_zip(vx_setall_s16(cr), vx_setall_s16( 1), r12y, dummy); + v_int16 delta = vx_setall_s16(1 << (shift-1)); + + for( ; i <= n-vsize; + i += vsize, src += scn*vsize, dst += vsize) + { + v_uint8 r, g, b, a; + if(scn == 3) + { + v_load_deinterleave(src, b, g, r); + } + else + { + v_load_deinterleave(src, b, g, r, a); + } - v_rgb[0] = _mm_add_epi32(v_rgb[0], v_delta); - v_rgb[2] = _mm_add_epi32(v_rgb[2], v_delta); + //TODO: shorten registers use when v_deinterleave is available - v_rgb[0] = _mm_srai_epi32(v_rgb[0], yuv_shift); - v_rgb[2] = _mm_srai_epi32(v_rgb[2], yuv_shift); + v_uint16 r0, r1, g0, g1, b0, b1; + v_expand(r, r0, r1); + v_expand(g, g0, g1); + v_expand(b, b0, b1); - v_gray = _mm_packs_epi32(v_rgb[0], v_rgb[2]); - v_gray = _mm_add_epi16(v_gray, v_rgb_hi[0]); - } + v_int16 bg00, bg01, bg10, bg11; + v_int16 rd00, rd01, rd10, rd11; + v_zip(v_reinterpret_as_s16(b0), v_reinterpret_as_s16(g0), bg00, bg01); + v_zip(v_reinterpret_as_s16(b1), v_reinterpret_as_s16(g1), bg10, bg11); + v_zip(v_reinterpret_as_s16(r0), delta, rd00, rd01); + v_zip(v_reinterpret_as_s16(r1), delta, rd10, rd11); - void operator()(const ushort* src, ushort* dst, int n) const - { - int scn = srccn, cb = coeffs[0], cg = coeffs[1], cr = coeffs[2], i = 0; + v_uint32 y00, y01, y10, y11; + y00 = v_reinterpret_as_u32(v_dotprod(bg00, bg2y) + v_dotprod(rd00, r12y)) >> shift; + y01 = v_reinterpret_as_u32(v_dotprod(bg01, bg2y) + v_dotprod(rd01, r12y)) >> shift; + y10 = v_reinterpret_as_u32(v_dotprod(bg10, bg2y) + v_dotprod(rd10, r12y)) >> shift; + y11 = v_reinterpret_as_u32(v_dotprod(bg11, bg2y) + v_dotprod(rd11, r12y)) >> shift; - if (scn == 3 && haveSIMD) - { - __m128i v_coeffs[2]; - v_coeffs[0] = _mm_set_epi16(0, (short)coeffs[2], (short)coeffs[1], (short)coeffs[0], (short)coeffs[2], (short)coeffs[1], (short)coeffs[0], 0); - v_coeffs[1] = _mm_slli_epi16(v_coeffs[0], 2); + v_uint16 y0, y1; + y0 = v_pack(y00, y01); + y1 = v_pack(y10, 
y11); - for ( ; i <= n - 8; i += 8, src += scn * 8) - { - __m128i v_src[3]; - v_src[0] = _mm_loadu_si128((__m128i const *)(src)); - v_src[1] = _mm_loadu_si128((__m128i const *)(src + 8)); - v_src[2] = _mm_loadu_si128((__m128i const *)(src + 16)); - - __m128i v_rgb[4]; - v_rgb[0] = _mm_slli_si128(v_src[0], 2); - v_rgb[1] = _mm_alignr_epi8(v_src[1], v_src[0], 10); - v_rgb[2] = _mm_alignr_epi8(v_src[2], v_src[1], 6); - v_rgb[3] = _mm_srli_si128(v_src[2], 2); - - __m128i v_gray; - process(v_rgb, v_coeffs, - v_gray); - - _mm_storeu_si128((__m128i *)(dst + i), v_gray); - } + v_uint8 y = v_pack(y0, y1); + v_store(dst, y); } - else if (scn == 4 && haveSIMD) - { - __m128i v_coeffs[2]; - v_coeffs[0] = _mm_set_epi16(0, (short)coeffs[2], (short)coeffs[1], (short)coeffs[0], 0, (short)coeffs[2], (short)coeffs[1], (short)coeffs[0]); - v_coeffs[1] = _mm_slli_epi16(v_coeffs[0], 2); - - for ( ; i <= n - 8; i += 8, src += scn * 8) - { - __m128i v_rgb[4]; - v_rgb[0] = _mm_loadu_si128((__m128i const *)(src)); - v_rgb[1] = _mm_loadu_si128((__m128i const *)(src + 8)); - v_rgb[2] = _mm_loadu_si128((__m128i const *)(src + 16)); - v_rgb[3] = _mm_loadu_si128((__m128i const *)(src + 24)); - - __m128i v_gray; - process(v_rgb, v_coeffs, - v_gray); + vx_cleanup(); +#endif - _mm_storeu_si128((__m128i *)(dst + i), v_gray); - } + for( ; i < n; i++, src += scn, dst++) + { + int b = src[0], g = src[1], r = src[2]; + uchar y = (uchar)CV_DESCALE(b*cb + g*cg + r*cr, shift); + dst[0] = y; } - - for( ; i < n; i++, src += scn) - dst[i] = (ushort)CV_DESCALE((unsigned)(src[0]*cb + src[1]*cg + src[2]*cr), yuv_shift); } - int srccn, coeffs[3]; - __m128i v_delta; - __m128i v_zero; - bool haveSIMD; + int srccn; + short coeffs[3]; }; -#endif // CV_SSE4_1 -template <> -struct RGB2Gray +template<> +struct RGB2Gray { - typedef float channel_type; + typedef ushort channel_type; - RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn) + // can be changed to 15-shift coeffs + static const int BY = B2Y; + static const int GY = G2Y; + static const int RY = R2Y; + static const int shift = yuv_shift; + static const int fix_shift = (int)(sizeof(short)*8 - shift); + + RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) : srccn(_srccn) { - static const float coeffs0[] = { R2YF, G2YF, B2YF }; - memcpy( coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]) ); + const int coeffs0[] = { RY, GY, BY }; + for(int i = 0; i < 3; i++) + coeffs[i] = (short)(_coeffs ? 
_coeffs[i] : coeffs0[i]); if(blueIdx == 0) std::swap(coeffs[0], coeffs[2]); - v_cb = _mm_set1_ps(coeffs[0]); - v_cg = _mm_set1_ps(coeffs[1]); - v_cr = _mm_set1_ps(coeffs[2]); - - haveSIMD = checkHardwareSupport(CV_CPU_SSE2); + CV_Assert(coeffs[0] + coeffs[1] + coeffs[2] == (1 << shift)); } - void process(__m128 v_b, __m128 v_g, __m128 v_r, - __m128 & v_gray) const - { - v_gray = _mm_mul_ps(v_r, v_cr); - v_gray = _mm_add_ps(v_gray, _mm_mul_ps(v_g, v_cg)); - v_gray = _mm_add_ps(v_gray, _mm_mul_ps(v_b, v_cb)); - } - - void operator()(const float * src, float * dst, int n) const + void operator()(const ushort* src, ushort* dst, int n) const { - int scn = srccn, i = 0; - float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2]; + int scn = srccn; + short cb = coeffs[0], cg = coeffs[1], cr = coeffs[2]; + int i = 0; - if (scn == 3 && haveSIMD) - { - for ( ; i <= n - 8; i += 8, src += scn * 8) - { - __m128 v_r0 = _mm_loadu_ps(src); - __m128 v_r1 = _mm_loadu_ps(src + 4); - __m128 v_g0 = _mm_loadu_ps(src + 8); - __m128 v_g1 = _mm_loadu_ps(src + 12); - __m128 v_b0 = _mm_loadu_ps(src + 16); - __m128 v_b1 = _mm_loadu_ps(src + 20); +#if CV_SIMD + const int vsize = v_uint16::nlanes; - _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + v_int16 b2y = vx_setall_s16(cb); + v_int16 g2y = vx_setall_s16(cg); + v_int16 r2y = vx_setall_s16(cr); + v_int16 one = vx_setall_s16(1); + v_int16 z = vx_setzero_s16(); - __m128 v_gray0; - process(v_r0, v_g0, v_b0, - v_gray0); + v_int16 bg2y, r12y; + v_int16 dummy; + v_zip(b2y, g2y, bg2y, dummy); + v_zip(r2y, one, r12y, dummy); - __m128 v_gray1; - process(v_r1, v_g1, v_b1, - v_gray1); + v_int16 delta = vx_setall_s16(1 << (shift-1)); - _mm_storeu_ps(dst + i, v_gray0); - _mm_storeu_ps(dst + i + 4, v_gray1); - } - } - else if (scn == 4 && haveSIMD) + for( ; i <= n-vsize; + i += vsize, src += scn*vsize, dst += vsize) { - for ( ; i <= n - 8; i += 8, src += scn * 8) + v_uint16 r, g, b, a; + if(scn == 3) { - __m128 v_r0 = _mm_loadu_ps(src); - __m128 v_r1 = _mm_loadu_ps(src + 4); - __m128 v_g0 = _mm_loadu_ps(src + 8); - __m128 v_g1 = _mm_loadu_ps(src + 12); - __m128 v_b0 = _mm_loadu_ps(src + 16); - __m128 v_b1 = _mm_loadu_ps(src + 20); - __m128 v_a0 = _mm_loadu_ps(src + 24); - __m128 v_a1 = _mm_loadu_ps(src + 28); - - _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1); - - __m128 v_gray0; - process(v_r0, v_g0, v_b0, - v_gray0); - - __m128 v_gray1; - process(v_r1, v_g1, v_b1, - v_gray1); - - _mm_storeu_ps(dst + i, v_gray0); - _mm_storeu_ps(dst + i + 4, v_gray1); + v_load_deinterleave(src, b, g, r); + } + else + { + v_load_deinterleave(src, b, g, r, a); } - } - for ( ; i < n; i++, src += scn) - dst[i] = src[0]*cb + src[1]*cg + src[2]*cr; - } + v_int16 sb = v_reinterpret_as_s16(b); + v_int16 sr = v_reinterpret_as_s16(r); + v_int16 sg = v_reinterpret_as_s16(g); - int srccn; - float coeffs[3]; - __m128 v_cb, v_cg, v_cr; - bool haveSIMD; -}; + v_int16 bg0, bg1; + v_int16 rd0, rd1; + v_zip(sb, sg, bg0, bg1); + v_zip(sr, delta, rd0, rd1); -#endif // CV_SSE2 + // fixing 16bit signed multiplication + v_int16 mr, mg, mb; + mr = (sr < z) & r2y; + mg = (sg < z) & g2y; + mb = (sb < z) & b2y; + v_int16 fixmul = v_add_wrap(mr, v_add_wrap(mg, mb)) << fix_shift; -#if !CV_NEON && !CV_SSE4_1 + v_int32 sy0 = (v_dotprod(bg0, bg2y) + v_dotprod(rd0, r12y)) >> shift; + v_int32 sy1 = (v_dotprod(bg1, bg2y) + v_dotprod(rd1, r12y)) >> shift; -template<> struct RGB2Gray -{ - typedef ushort channel_type; + v_int16 y = v_add_wrap(v_pack(sy0, sy1), fixmul); - RGB2Gray(int _srccn, int blueIdx, 
const int* _coeffs) : srccn(_srccn) - { - static const int coeffs0[] = { R2Y, G2Y, B2Y }; - memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0])); - if( blueIdx == 0 ) - std::swap(coeffs[0], coeffs[2]); + v_store((short*)dst, y); + } + vx_cleanup(); +#endif + for( ; i < n; i++, src += scn, dst++) + { + int b = src[0], g = src[1], r = src[2]; + ushort d = (ushort)CV_DESCALE((unsigned)(b*cb + g*cg + r*cr), shift); + dst[0] = d; + } } - void operator()(const ushort* src, ushort* dst, int n) const - { - int scn = srccn, cb = coeffs[0], cg = coeffs[1], cr = coeffs[2]; - for(int i = 0; i < n; i++, src += scn) - dst[i] = (ushort)CV_DESCALE((unsigned)(src[0]*cb + src[1]*cg + src[2]*cr), yuv_shift); - } int srccn; - int coeffs[3]; + short coeffs[3]; }; -#endif // !CV_NEON && !CV_SSE4_1 - /////////////////////////// RGBA <-> mRGBA (alpha premultiplied) ////////////// @@ -1110,6 +814,88 @@ struct RGBA2mRGBA }; +template<> +struct RGBA2mRGBA +{ + typedef uchar channel_type; + + void operator()(const uchar* src, uchar* dst, int n) const + { + const uchar max_val = 255; + const uchar half_val = 128; + + int i = 0; +#if CV_SIMD + const int vsize = v_uint8::nlanes; + v_uint8 amask = v_reinterpret_as_u8(vx_setall_u32(0xFF000000)); + v_uint16 vh = vx_setall_u16(half_val+1); + + // processing 4 registers per loop cycle is about 10% faster + // than processing 1 register + for( ; i <= n-vsize; + i += vsize, src += 4*vsize, dst += 4*vsize) + { + v_uint8 v[4]; + for(int j = 0; j < 4; j++) + v[j] = vx_load(src + j*vsize); + + // r0,g0,b0,a0,r1,g1,b1,a1 => 00,00,00,a0,00,00,00,a1 => + // => 00,00,a0,a0,00,00,a1,a1 + // => a0,a0,a0,a0,a1,a1,a1,a1 + + v_uint16 a16[4]; + for(int j = 0; j < 4; j++) + a16[j] = v_reinterpret_as_u16(v[j] & amask); + + v_uint32 a32[4]; + for(int j = 0; j < 4; j++) + a32[j] = v_reinterpret_as_u32(a16[j] | (a16[j] >> 8)); + + v_uint8 a[4]; + for(int j = 0; j < 4; j++) + a[j] = v_reinterpret_as_u8(a32[j] | (a32[j] >> 16)); + + v_uint16 m[8]; + for(int j = 0; j < 4; j++) + v_mul_expand(v[j], a[j], m[j], m[j+4]); + + for(int j = 0; j < 8; j++) + m[j] += vh; + + // div 255: (v+1+(v>>8))>8 + // +1 is in vh, has no effect on (v>>8) + for(int j = 0; j < 8; j++) + m[j] = (m[j] + (m[j] >> 8)) >> 8; + + v_uint8 d[4]; + for(int j = 0; j < 4; j++) + d[j] = v_pack(m[j], m[j+4]); + + for(int j = 0; j < 4; j++) + d[j] = v_select(amask, a[j], d[j]); + + for(int j = 0; j < 4; j++) + v_store(dst + j*vsize, d[j]); + } + + vx_cleanup(); +#endif + for(; i < n; i++, src += 4, dst += 4 ) + { + uchar v0 = src[0]; + uchar v1 = src[1]; + uchar v2 = src[2]; + uchar v3 = src[3]; + + dst[0] = (v0 * v3 + half_val) / max_val; + dst[1] = (v1 * v3 + half_val) / max_val; + dst[2] = (v2 * v3 + half_val) / max_val; + dst[3] = v3; + } + } +}; + + template struct mRGBA2RGBA { @@ -1126,14 +912,139 @@ struct mRGBA2RGBA _Tp v3 = *src++; _Tp v3_half = v3 / 2; - *dst++ = (v3==0)? 0 : (v0 * max_val + v3_half) / v3; - *dst++ = (v3==0)? 0 : (v1 * max_val + v3_half) / v3; - *dst++ = (v3==0)? 0 : (v2 * max_val + v3_half) / v3; + *dst++ = (v3==0)? 0 : saturate_cast<_Tp>((v0 * max_val + v3_half) / v3); + *dst++ = (v3==0)? 0 : saturate_cast<_Tp>((v1 * max_val + v3_half) / v3); + *dst++ = (v3==0)? 
0 : saturate_cast<_Tp>((v2 * max_val + v3_half) / v3); *dst++ = v3; } } }; + +template<> +struct mRGBA2RGBA +{ + typedef uchar channel_type; + + void operator()(const uchar* src, uchar* dst, int n) const + { + uchar max_val = ColorChannel::max(); + int i = 0; + +#if CV_SIMD + const int vsize = v_uint8::nlanes; + v_uint8 amask = v_reinterpret_as_u8(vx_setall_u32(0xFF000000)); + v_uint8 vmax = vx_setall_u8(max_val); + + for( ; i <= n-vsize/4; + i += vsize/4, src += vsize, dst += vsize) + { + v_uint8 s = vx_load(src + 0*vsize); + + // r0,g0,b0,a0,r1,g1,b1,a1 => 00,00,00,a0,00,00,00,a1 => + // => 00,00,a0,a0,00,00,a1,a1 + // => a0,a0,a0,a0,a1,a1,a1,a1 + v_uint8 a; + v_uint16 a16; + v_uint32 a32; + a16 = v_reinterpret_as_u16(s & amask); + a32 = v_reinterpret_as_u32(a16 | (a16 >> 8)); + a = v_reinterpret_as_u8(a32 | (a32 >> 16)); + + // s *= max_val + v_uint16 s0, s1; + v_mul_expand(s, vmax, s0, s1); + + // s += a/2 + v_uint16 ae0, ae1; + v_expand(a, ae0, ae1); + s0 += ae0 >> 1; s1 += ae1 >> 1; + + // s, a -> u32 -> float + v_uint32 u00, u01, u10, u11; + v_int32 s00, s01, s10, s11; + v_expand(s0, u00, u01); + v_expand(s1, u10, u11); + s00 = v_reinterpret_as_s32(u00); + s01 = v_reinterpret_as_s32(u01); + s10 = v_reinterpret_as_s32(u10); + s11 = v_reinterpret_as_s32(u11); + + v_uint32 ua00, ua01, ua10, ua11; + v_int32 a00, a01, a10, a11; + v_expand(ae0, ua00, ua01); + v_expand(ae1, ua10, ua11); + a00 = v_reinterpret_as_s32(ua00); + a01 = v_reinterpret_as_s32(ua01); + a10 = v_reinterpret_as_s32(ua10); + a11 = v_reinterpret_as_s32(ua11); + + v_float32 fs00, fs01, fs10, fs11; + fs00 = v_cvt_f32(s00); + fs01 = v_cvt_f32(s01); + fs10 = v_cvt_f32(s10); + fs11 = v_cvt_f32(s11); + + v_float32 fa00, fa01, fa10, fa11; + fa00 = v_cvt_f32(a00); + fa01 = v_cvt_f32(a01); + fa10 = v_cvt_f32(a10); + fa11 = v_cvt_f32(a11); + + // float d = (float)s/(float)a + v_float32 fd00, fd01, fd10, fd11; + fd00 = fs00/fa00; + fd01 = fs01/fa01; + fd10 = fs10/fa10; + fd11 = fs11/fa11; + + // d -> u32 -> u8 + v_uint32 ud00, ud01, ud10, ud11; + ud00 = v_reinterpret_as_u32(v_trunc(fd00)); + ud01 = v_reinterpret_as_u32(v_trunc(fd01)); + ud10 = v_reinterpret_as_u32(v_trunc(fd10)); + ud11 = v_reinterpret_as_u32(v_trunc(fd11)); + v_uint16 ud0, ud1; + ud0 = v_pack(ud00, ud01); + ud1 = v_pack(ud10, ud11); + v_uint8 d; + d = v_pack(ud0, ud1); + + // if a == 0 then d = 0 + v_uint8 am; + am = a != vx_setzero_u8(); + d = d & am; + + // put alpha values + d = v_select(amask, a, d); + + v_store(dst, d); + } + + vx_cleanup(); +#endif + for(; i < n; i++, src += 4, dst += 4 ) + { + uchar v0 = src[0]; + uchar v1 = src[1]; + uchar v2 = src[2]; + uchar v3 = src[3]; + + uchar v3_half = v3 / 2; + + dst[0] = (v3==0)? 0 : (v0 * max_val + v3_half) / v3; + dst[1] = (v3==0)? 0 : (v1 * max_val + v3_half) / v3; + dst[2] = (v3==0)? 0 : (v2 * max_val + v3_half) / v3; + dst[3] = v3; + + dst[0] = (v3==0)? 0 : saturate_cast((v0 * max_val + v3_half) / v3); + dst[1] = (v3==0)? 0 : saturate_cast((v1 * max_val + v3_half) / v3); + dst[2] = (v3==0)? 
0 : saturate_cast((v2 * max_val + v3_half) / v3); + dst[3] = v3; + } + } +}; + // // IPP functions // diff --git a/modules/imgproc/src/opencl/color_rgb.cl b/modules/imgproc/src/opencl/color_rgb.cl index b4041f40c8..d57433252b 100644 --- a/modules/imgproc/src/opencl/color_rgb.cl +++ b/modules/imgproc/src/opencl/color_rgb.cl @@ -439,9 +439,10 @@ __kernel void mRGBA2RGBA(__global const uchar* src, int src_step, int src_offset *(__global uchar4 *)(dst + dst_index) = (uchar4)(0, 0, 0, 0); else *(__global uchar4 *)(dst + dst_index) = - (uchar4)(mad24(src_pix.x, MAX_NUM, v3_half) / v3, - mad24(src_pix.y, MAX_NUM, v3_half) / v3, - mad24(src_pix.z, MAX_NUM, v3_half) / v3, v3); + (uchar4)(SAT_CAST(mad24(src_pix.x, MAX_NUM, v3_half) / v3), + SAT_CAST(mad24(src_pix.y, MAX_NUM, v3_half) / v3), + SAT_CAST(mad24(src_pix.z, MAX_NUM, v3_half) / v3), + SAT_CAST(v3)); ++y; dst_index += dst_step; From ab86f15ba0172dbc0b1d34bb813ff301dc4444a2 Mon Sep 17 00:00:00 2001 From: Quentin Chateau Date: Fri, 14 Dec 2018 19:37:00 +0100 Subject: [PATCH 04/13] Merge pull request #13400 from Tytan:optimize_exposure_compensation Optimize exposure compensation (#13400) * Added perf test * Optimized gains computation * Use Eigen for gains calculation --- modules/stitching/perf/perf_stich.cpp | 34 +++++++++ modules/stitching/src/exposure_compensate.cpp | 76 ++++++++++++++++--- 2 files changed, 99 insertions(+), 11 deletions(-) diff --git a/modules/stitching/perf/perf_stich.cpp b/modules/stitching/perf/perf_stich.cpp index 75fb93fea9..22fb3e35e6 100644 --- a/modules/stitching/perf/perf_stich.cpp +++ b/modules/stitching/perf/perf_stich.cpp @@ -13,6 +13,7 @@ using namespace perf; #define WORK_MEGAPIX 0.6 typedef TestBaseWithParam stitch; +typedef TestBaseWithParam stitchExposureCompensation; typedef TestBaseWithParam > stitchDatasets; #ifdef HAVE_OPENCV_XFEATURES2D @@ -20,6 +21,7 @@ typedef TestBaseWithParam > stitchDatasets; #else #define TEST_DETECTORS testing::Values("orb", "akaze") #endif +#define TEST_EXP_COMP_BS testing::Values(32, 16, 12, 10, 8) #define AFFINE_DATASETS testing::Values("s", "budapest", "newspaper", "prague") PERF_TEST_P(stitch, a123, TEST_DETECTORS) @@ -58,6 +60,38 @@ PERF_TEST_P(stitch, a123, TEST_DETECTORS) SANITY_CHECK_NOTHING(); } +PERF_TEST_P(stitchExposureCompensation, a123, TEST_EXP_COMP_BS) +{ + Mat pano; + + vector imgs; + imgs.push_back( imread( getDataPath("stitching/a1.png") ) ); + imgs.push_back( imread( getDataPath("stitching/a2.png") ) ); + imgs.push_back( imread( getDataPath("stitching/a3.png") ) ); + + int bs = GetParam(); + + declare.time(30 * 10).iterations(10); + + while(next()) + { + Ptr stitcher = Stitcher::create(); + stitcher->setWarper(makePtr()); + stitcher->setRegistrationResol(WORK_MEGAPIX); + stitcher->setExposureCompensator( + makePtr(bs, bs)); + + startTimer(); + stitcher->stitch(imgs, pano); + stopTimer(); + } + + EXPECT_NEAR(pano.size().width, 1182, 50); + EXPECT_NEAR(pano.size().height, 682, 30); + + SANITY_CHECK_NOTHING(); +} + PERF_TEST_P(stitch, b12, TEST_DETECTORS) { Mat pano; diff --git a/modules/stitching/src/exposure_compensate.cpp b/modules/stitching/src/exposure_compensate.cpp index 7b72efbd16..2488684912 100644 --- a/modules/stitching/src/exposure_compensate.cpp +++ b/modules/stitching/src/exposure_compensate.cpp @@ -41,6 +41,10 @@ //M*/ #include "precomp.hpp" +#ifdef HAVE_EIGEN +#include +#include +#endif namespace cv { namespace detail { @@ -80,6 +84,7 @@ void GainCompensator::feed(const std::vector &corners, const std::vector< const int num_images = 
static_cast(images.size()); Mat_ N(num_images, num_images); N.setTo(0); Mat_ I(num_images, num_images); I.setTo(0); + Mat_ skip(num_images, 1); skip.setTo(true); //Rect dst_roi = resultRoi(corners, images); Mat subimg1, subimg2; @@ -99,7 +104,19 @@ void GainCompensator::feed(const std::vector &corners, const std::vector< submask2 = masks[j].first(Rect(roi.tl() - corners[j], roi.br() - corners[j])).getMat(ACCESS_READ); intersect = (submask1 == masks[i].second) & (submask2 == masks[j].second); - N(i, j) = N(j, i) = std::max(1, countNonZero(intersect)); + int intersect_count = countNonZero(intersect); + N(i, j) = N(j, i) = std::max(1, intersect_count); + + // Don't compute Isums if subimages do not intersect anyway + if (intersect_count == 0) + continue; + + // Don't skip images that intersect with at least one other image + if (i != j) + { + skip(i, 0) = false; + skip(j, 0) = false; + } double Isum1 = 0, Isum2 = 0; for (int y = 0; y < roi.height; ++y) @@ -123,22 +140,59 @@ void GainCompensator::feed(const std::vector &corners, const std::vector< double alpha = 0.01; double beta = 100; + int num_eq = num_images - countNonZero(skip); - Mat_ A(num_images, num_images); A.setTo(0); - Mat_ b(num_images, 1); b.setTo(0); - for (int i = 0; i < num_images; ++i) + Mat_ A(num_eq, num_eq); A.setTo(0); + Mat_ b(num_eq, 1); b.setTo(0); + for (int i = 0, ki = 0; i < num_images; ++i) { - for (int j = 0; j < num_images; ++j) + if (skip(i, 0)) + continue; + + for (int j = 0, kj = 0; j < num_images; ++j) { - b(i, 0) += beta * N(i, j); - A(i, i) += beta * N(i, j); - if (j == i) continue; - A(i, i) += 2 * alpha * I(i, j) * I(i, j) * N(i, j); - A(i, j) -= 2 * alpha * I(i, j) * I(j, i) * N(i, j); + if (skip(j, 0)) + continue; + + b(ki, 0) += beta * N(i, j); + A(ki, ki) += beta * N(i, j); + if (j != i) + { + A(ki, ki) += 2 * alpha * I(i, j) * I(i, j) * N(i, j); + A(ki, kj) -= 2 * alpha * I(i, j) * I(j, i) * N(i, j); + } + ++kj; } + ++ki; } - solve(A, b, gains_); + Mat_ l_gains; + +#ifdef HAVE_EIGEN + Eigen::MatrixXf eigen_A, eigen_b, eigen_x; + cv2eigen(A, eigen_A); + cv2eigen(b, eigen_b); + + Eigen::LLT solver(eigen_A); +#if ENABLE_LOG + if (solver.info() != Eigen::ComputationInfo::Success) + LOGLN("Failed to solve exposure compensation system"); +#endif + eigen_x = solver.solve(eigen_b); + + eigen2cv(eigen_x, l_gains); +#else + solve(A, b, l_gains); +#endif + + gains_.create(num_images, 1); + for (int i = 0, j = 0; i < num_images; ++i) + { + if (skip(i, 0)) + gains_.at(i, 0) = 1; + else + gains_.at(i, 0) = l_gains(j++, 0); + } LOGLN("Exposure compensation, time: " << ((getTickCount() - t) / getTickFrequency()) << " sec"); } From 4e16ae9a1f1876a755824009614e163b37cdeb08 Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Fri, 14 Dec 2018 19:24:12 +0000 Subject: [PATCH 05/13] core:vsx fix build failure on GCC<=6 due implementation of v_reduce_sum(v_float64x2) --- modules/core/include/opencv2/core/hal/intrin_vsx.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp index 9506adfe7e..fce5c444ed 100644 --- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp @@ -718,7 +718,7 @@ OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, min, vec_min) inline double v_reduce_sum(const v_float64x2& a) { - return vec_extract(vec_add(a.val, vec_sld(a.val, a.val, 8)), 0); + return vec_extract(vec_add(a.val, vec_permi(a.val, a.val, 3)), 0); } 
#define OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(_Tpvec, _Tpvec2, scalartype, suffix, func) \ From eb1f3733eec947b9972fec612863999adb7c6777 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Sat, 15 Dec 2018 07:58:39 +0000 Subject: [PATCH 06/13] videoio(dc1394): use lazy initialization on demand --- modules/videoio/src/cap_dc1394_v2.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/modules/videoio/src/cap_dc1394_v2.cpp b/modules/videoio/src/cap_dc1394_v2.cpp index 45ff438bc1..0d9ffa2569 100644 --- a/modules/videoio/src/cap_dc1394_v2.cpp +++ b/modules/videoio/src/cap_dc1394_v2.cpp @@ -192,7 +192,11 @@ CvDC1394::~CvDC1394() dc = 0; } -static CvDC1394 dc1394; +static CvDC1394& getDC1394() +{ + static CvDC1394 dc1394; + return dc1394; +} class CvCaptureCAM_DC1394_v2_CPP : public CvCapture { @@ -451,7 +455,7 @@ bool CvCaptureCAM_DC1394_v2_CPP::startCapture() code = dc1394_capture_setup(dcCam, nDMABufs, DC1394_CAPTURE_FLAGS_DEFAULT); if (code >= 0) { - FD_SET(dc1394_capture_get_fileno(dcCam), &dc1394.camFds); + FD_SET(dc1394_capture_get_fileno(dcCam), &getDC1394().camFds); dc1394_video_set_transmission(dcCam, DC1394_ON); if (cameraId == VIDERE) { @@ -477,15 +481,15 @@ bool CvCaptureCAM_DC1394_v2_CPP::open(int index) close(); - if (!dc1394.dc) + if (!getDC1394().dc) goto _exit_; - err = dc1394_camera_enumerate(dc1394.dc, &cameraList); + err = dc1394_camera_enumerate(getDC1394().dc, &cameraList); if (err < 0 || !cameraList || (unsigned)index >= (unsigned)cameraList->num) goto _exit_; guid = cameraList->ids[index].guid; - dcCam = dc1394_camera_new(dc1394.dc, guid); + dcCam = dc1394_camera_new(getDC1394().dc, guid); if (!dcCam) goto _exit_; @@ -510,8 +514,8 @@ void CvCaptureCAM_DC1394_v2_CPP::close() // check for fileno valid before using int fileno=dc1394_capture_get_fileno(dcCam); - if (fileno>=0 && FD_ISSET(fileno, &dc1394.camFds)) - FD_CLR(fileno, &dc1394.camFds); + if (fileno>=0 && FD_ISSET(fileno, &getDC1394().camFds)) + FD_CLR(fileno, &getDC1394().camFds); dc1394_video_set_transmission(dcCam, DC1394_OFF); dc1394_capture_stop(dcCam); dc1394_camera_free(dcCam); From 5736bf5dd524d28b1f80696662b725d196084d58 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Sun, 16 Dec 2018 06:25:39 +0000 Subject: [PATCH 07/13] stitching: fix l_gains data type from Eigen solver (float / double) --- modules/stitching/src/exposure_compensate.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/modules/stitching/src/exposure_compensate.cpp b/modules/stitching/src/exposure_compensate.cpp index 2488684912..8ce2dda2d9 100644 --- a/modules/stitching/src/exposure_compensate.cpp +++ b/modules/stitching/src/exposure_compensate.cpp @@ -180,10 +180,13 @@ void GainCompensator::feed(const std::vector &corners, const std::vector< #endif eigen_x = solver.solve(eigen_b); - eigen2cv(eigen_x, l_gains); + Mat_ l_gains_float; + eigen2cv(eigen_x, l_gains_float); + l_gains_float.convertTo(l_gains, CV_64FC1); #else solve(A, b, l_gains); #endif + CV_CheckTypeEQ(l_gains.type(), CV_64FC1, ""); gains_.create(num_images, 1); for (int i = 0, j = 0; i < num_images; ++i) From f605898bae8e2692c059e4199395354ea3555c56 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Sun, 16 Dec 2018 06:43:08 +0000 Subject: [PATCH 08/13] core: fix eigen2cv() - don't change fixed type of 'dst' --- modules/core/include/opencv2/core/eigen.hpp | 2 +- modules/core/test/test_mat.cpp | 24 +++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git 
a/modules/core/include/opencv2/core/eigen.hpp b/modules/core/include/opencv2/core/eigen.hpp index c8603aca97..741648edb8 100644 --- a/modules/core/include/opencv2/core/eigen.hpp +++ b/modules/core/include/opencv2/core/eigen.hpp @@ -60,7 +60,7 @@ namespace cv //! @{ template<typename _Tp, int _rows, int _cols, int _options, int _maxRows, int _maxCols> static inline -void eigen2cv( const Eigen::Matrix<_Tp, _rows, _cols, _options, _maxRows, _maxCols>& src, Mat& dst ) +void eigen2cv( const Eigen::Matrix<_Tp, _rows, _cols, _options, _maxRows, _maxCols>& src, OutputArray dst ) { if( !(src.Flags & Eigen::RowMajorBit) ) { diff --git a/modules/core/test/test_mat.cpp b/modules/core/test/test_mat.cpp index 91a93539fc..f585c4f28f 100644 --- a/modules/core/test/test_mat.cpp +++ b/modules/core/test/test_mat.cpp @@ -3,6 +3,12 @@ // of this distribution and at http://opencv.org/license.html. #include "test_precomp.hpp" +#ifdef HAVE_EIGEN +#include <Eigen/Core> +#include <Eigen/Dense> +#include "opencv2/core/eigen.hpp" +#endif + namespace opencv_test { namespace { class Core_ReduceTest : public cvtest::BaseTest @@ -1972,4 +1978,22 @@ TEST(Core_Vectors, issue_13078_workaround) ASSERT_EQ(7, ints[3]); } + +#ifdef HAVE_EIGEN +TEST(Core_Eigen, eigen2cv_check_Mat_type) +{ + Mat A(4, 4, CV_32FC1, Scalar::all(0)); + Eigen::MatrixXf eigen_A; + cv2eigen(A, eigen_A); + + Mat_<float> f_mat; + EXPECT_NO_THROW(eigen2cv(eigen_A, f_mat)); + EXPECT_EQ(CV_32FC1, f_mat.type()); + + Mat_<double> d_mat; + EXPECT_ANY_THROW(eigen2cv(eigen_A, d_mat)); + //EXPECT_EQ(CV_64FC1, d_mat.type()); +} +#endif // HAVE_EIGEN + }} // namespace From f605898bae8e2692c059e4199395354ea3555c56 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Mon, 17 Dec 2018 13:39:26 +0300 Subject: [PATCH 09/13] cmake: fix python install paths --- CMakeLists.txt | 2 +- cmake/OpenCVGenSetupVars.cmake | 21 ++++++++++++--- modules/python/CMakeLists.txt | 1 + modules/python/common.cmake | 15 +++++++++++ modules/python/python_loader.cmake | 43 +++++++++++++++++++----------- modules/python/standalone.cmake | 8 +++++- 6 files changed, 68 insertions(+), 22 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fcfef8940b..f17220b9e0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -608,7 +608,7 @@ else() endif() endif() ocv_update(OPENCV_INCLUDE_INSTALL_PATH "include") - ocv_update(OPENCV_PYTHON_INSTALL_PATH "python") + #ocv_update(OPENCV_PYTHON_INSTALL_PATH "python") # no default value, see https://github.com/opencv/opencv/issues/13202 endif() ocv_update(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${OPENCV_LIB_INSTALL_PATH}") diff --git a/cmake/OpenCVGenSetupVars.cmake b/cmake/OpenCVGenSetupVars.cmake index 7d49b8c867..61d0088dcc 100644 --- a/cmake/OpenCVGenSetupVars.cmake +++ b/cmake/OpenCVGenSetupVars.cmake @@ -43,11 +43,24 @@ else() endif() file(RELATIVE_PATH OPENCV_PYTHON_DIR_RELATIVE_CMAKECONFIG "${CMAKE_INSTALL_PREFIX}/${OPENCV_SETUPVARS_INSTALL_PATH}/" "${CMAKE_INSTALL_PREFIX}/") -if(IS_ABSOLUTE "${OPENCV_PYTHON_INSTALL_PATH}") - set(OPENCV_PYTHON_DIR_RELATIVE_CMAKECONFIG "${OPENCV_PYTHON_INSTALL_PATH}") - message(WARNING "CONFIGURATION IS NOT SUPPORTED: validate setupvars script in install directory") +if(DEFINED OPENCV_PYTHON_INSTALL_PATH) + set(__python_path "${OPENCV_PYTHON_INSTALL_PATH}") +elseif(DEFINED OPENCV_PYTHON_INSTALL_PATH_SETUPVARS) + set(__python_path "${OPENCV_PYTHON_INSTALL_PATH_SETUPVARS}") +endif() +if(DEFINED __python_path) + if(IS_ABSOLUTE "${__python_path}") + set(OPENCV_PYTHON_DIR_RELATIVE_CMAKECONFIG "${__python_path}") + message(WARNING "CONFIGURATION IS NOT SUPPORTED: validate setupvars script in install directory") + else() +
ocv_path_join(OPENCV_PYTHON_DIR_RELATIVE_CMAKECONFIG "${OPENCV_PYTHON_DIR_RELATIVE_CMAKECONFIG}" "${__python_path}") + endif() else() - ocv_path_join(OPENCV_PYTHON_DIR_RELATIVE_CMAKECONFIG "${OPENCV_PYTHON_DIR_RELATIVE_CMAKECONFIG}" "${OPENCV_PYTHON_INSTALL_PATH}") + if(DEFINED OPENCV_PYTHON3_INSTALL_PATH) + ocv_path_join(OPENCV_PYTHON_DIR_RELATIVE_CMAKECONFIG "${OPENCV_PYTHON_DIR_RELATIVE_CMAKECONFIG}" "${OPENCV_PYTHON3_INSTALL_PATH}") + else() + set(OPENCV_PYTHON_DIR_RELATIVE_CMAKECONFIG "python_loader_is_not_installed") + endif() endif() configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/${OPENCV_SETUPVARS_TEMPLATE}" "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/install/${OPENCV_SETUPVARS_FILENAME}" @ONLY) install(FILES "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/install/${OPENCV_SETUPVARS_FILENAME}" diff --git a/modules/python/CMakeLists.txt b/modules/python/CMakeLists.txt index 27874283e7..fbf01d6e82 100644 --- a/modules/python/CMakeLists.txt +++ b/modules/python/CMakeLists.txt @@ -20,6 +20,7 @@ add_subdirectory(bindings) if(NOT OPENCV_SKIP_PYTHON_LOADER) include("./python_loader.cmake") + message(STATUS "OpenCV Python: during development append to PYTHONPATH: ${CMAKE_BINARY_DIR}/python_loader") endif() if(__disable_python2) diff --git a/modules/python/common.cmake b/modules/python/common.cmake index 4b4eaa6e7f..c65cddcf7f 100644 --- a/modules/python/common.cmake +++ b/modules/python/common.cmake @@ -120,6 +120,21 @@ if(NOT OPENCV_SKIP_PYTHON_LOADER) set(__python_loader_subdir "cv2/") endif() +if(NOT " ${PYTHON}" STREQUAL " PYTHON" + AND NOT DEFINED OPENCV_PYTHON_INSTALL_PATH +) + if(DEFINED OPENCV_${PYTHON}_INSTALL_PATH) + set(OPENCV_PYTHON_INSTALL_PATH "${OPENCV_${PYTHON}_INSTALL_PATH}") + elseif(NOT OPENCV_SKIP_PYTHON_LOADER) + set(OPENCV_PYTHON_INSTALL_PATH "${${PYTHON}_PACKAGES_PATH}") + endif() +endif() + +if(NOT OPENCV_SKIP_PYTHON_LOADER AND DEFINED OPENCV_PYTHON_INSTALL_PATH) + include("${CMAKE_CURRENT_LIST_DIR}/python_loader.cmake") + set(OPENCV_PYTHON_INSTALL_PATH_SETUPVARS "${OPENCV_PYTHON_INSTALL_PATH}" CACHE INTERNAL "") +endif() + if(NOT " ${PYTHON}" STREQUAL " PYTHON" AND DEFINED OPENCV_${PYTHON}_INSTALL_PATH) set(__python_binary_install_path "${OPENCV_${PYTHON}_INSTALL_PATH}") elseif(OPENCV_SKIP_PYTHON_LOADER AND DEFINED ${PYTHON}_PACKAGES_PATH) diff --git a/modules/python/python_loader.cmake b/modules/python/python_loader.cmake index 59ce8e5d69..663be5c824 100644 --- a/modules/python/python_loader.cmake +++ b/modules/python/python_loader.cmake @@ -2,20 +2,24 @@ ocv_assert(NOT OPENCV_SKIP_PYTHON_LOADER) set(PYTHON_SOURCE_DIR "${CMAKE_CURRENT_LIST_DIR}") -ocv_assert(DEFINED OPENCV_PYTHON_INSTALL_PATH) if(OpenCV_FOUND) set(__loader_path "${OpenCV_BINARY_DIR}/python_loader") + message(STATUS "OpenCV Python: during development append to PYTHONPATH: ${__loader_path}") else() set(__loader_path "${CMAKE_BINARY_DIR}/python_loader") endif() set(__python_loader_install_tmp_path "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/install/python_loader/") -if(IS_ABSOLUTE "${OPENCV_PYTHON_INSTALL_PATH}") - set(OpenCV_PYTHON_INSTALL_PATH_RELATIVE_CONFIGCMAKE "${CMAKE_INSTALL_PREFIX}/") - set(CMAKE_PYTHON_EXTENSION_INSTALL_PATH_BASE "'${CMAKE_INSTALL_PREFIX}'") +if(DEFINED OPENCV_PYTHON_INSTALL_PATH) + if(IS_ABSOLUTE "${OPENCV_PYTHON_INSTALL_PATH}") + set(OpenCV_PYTHON_INSTALL_PATH_RELATIVE_CONFIGCMAKE "${CMAKE_INSTALL_PREFIX}/") + set(CMAKE_PYTHON_EXTENSION_INSTALL_PATH_BASE "'${CMAKE_INSTALL_PREFIX}'") + else() + file(RELATIVE_PATH OpenCV_PYTHON_INSTALL_PATH_RELATIVE_CONFIGCMAKE 
"${CMAKE_INSTALL_PREFIX}/${OPENCV_PYTHON_INSTALL_PATH}/cv2" ${CMAKE_INSTALL_PREFIX}) + set(CMAKE_PYTHON_EXTENSION_INSTALL_PATH_BASE "os.path.join(LOADER_DIR, '${OpenCV_PYTHON_INSTALL_PATH_RELATIVE_CONFIGCMAKE}')") + endif() else() - file(RELATIVE_PATH OpenCV_PYTHON_INSTALL_PATH_RELATIVE_CONFIGCMAKE "${CMAKE_INSTALL_PREFIX}/${OPENCV_PYTHON_INSTALL_PATH}/cv2" ${CMAKE_INSTALL_PREFIX}) - set(CMAKE_PYTHON_EXTENSION_INSTALL_PATH_BASE "os.path.join(LOADER_DIR, '${OpenCV_PYTHON_INSTALL_PATH_RELATIVE_CONFIGCMAKE}')") + set(CMAKE_PYTHON_EXTENSION_INSTALL_PATH_BASE "os.path.join(LOADER_DIR, 'not_installed')") endif() set(PYTHON_LOADER_FILES @@ -25,7 +29,13 @@ set(PYTHON_LOADER_FILES foreach(fname ${PYTHON_LOADER_FILES}) get_filename_component(__dir "${fname}" DIRECTORY) file(COPY "${PYTHON_SOURCE_DIR}/package/${fname}" DESTINATION "${__loader_path}/${__dir}") - install(FILES "${PYTHON_SOURCE_DIR}/package/${fname}" DESTINATION "${OPENCV_PYTHON_INSTALL_PATH}/${__dir}" COMPONENT python) + if(fname STREQUAL "setup.py") + if(OPENCV_PYTHON_SETUP_PY_INSTALL_PATH) + install(FILES "${PYTHON_SOURCE_DIR}/package/${fname}" DESTINATION "${OPENCV_PYTHON_SETUP_PY_INSTALL_PATH}" COMPONENT python) + endif() + elseif(DEFINED OPENCV_PYTHON_INSTALL_PATH) + install(FILES "${PYTHON_SOURCE_DIR}/package/${fname}" DESTINATION "${OPENCV_PYTHON_INSTALL_PATH}/${__dir}" COMPONENT python) + endif() endforeach() if(NOT OpenCV_FOUND) # Ignore "standalone" builds of Python bindings @@ -41,14 +51,15 @@ if(NOT OpenCV_FOUND) # Ignore "standalone" builds of Python bindings string(REPLACE ";" ",\n " CMAKE_PYTHON_BINARIES_PATH "${CMAKE_PYTHON_BINARIES_PATH}") configure_file("${PYTHON_SOURCE_DIR}/package/template/config.py.in" "${__loader_path}/cv2/config.py" @ONLY) - if(WIN32) - list(APPEND CMAKE_PYTHON_BINARIES_INSTALL_PATH "os.path.join(${CMAKE_PYTHON_EXTENSION_INSTALL_PATH_BASE}, '${OPENCV_BIN_INSTALL_PATH}')") - else() - list(APPEND CMAKE_PYTHON_BINARIES_INSTALL_PATH "os.path.join(${CMAKE_PYTHON_EXTENSION_INSTALL_PATH_BASE}, '${OPENCV_LIB_INSTALL_PATH}')") + # install + if(DEFINED OPENCV_PYTHON_INSTALL_PATH) + if(WIN32) + list(APPEND CMAKE_PYTHON_BINARIES_INSTALL_PATH "os.path.join(${CMAKE_PYTHON_EXTENSION_INSTALL_PATH_BASE}, '${OPENCV_BIN_INSTALL_PATH}')") + else() + list(APPEND CMAKE_PYTHON_BINARIES_INSTALL_PATH "os.path.join(${CMAKE_PYTHON_EXTENSION_INSTALL_PATH_BASE}, '${OPENCV_LIB_INSTALL_PATH}')") + endif() + string(REPLACE ";" ",\n " CMAKE_PYTHON_BINARIES_PATH "${CMAKE_PYTHON_BINARIES_INSTALL_PATH}") + configure_file("${PYTHON_SOURCE_DIR}/package/template/config.py.in" "${__python_loader_install_tmp_path}/cv2/config.py" @ONLY) + install(FILES "${__python_loader_install_tmp_path}/cv2/config.py" DESTINATION "${OPENCV_PYTHON_INSTALL_PATH}/cv2/" COMPONENT python) endif() - string(REPLACE ";" ",\n " CMAKE_PYTHON_BINARIES_PATH "${CMAKE_PYTHON_BINARIES_INSTALL_PATH}") - configure_file("${PYTHON_SOURCE_DIR}/package/template/config.py.in" "${__python_loader_install_tmp_path}/cv2/config.py" @ONLY) - install(FILES "${__python_loader_install_tmp_path}/cv2/config.py" DESTINATION "${OPENCV_PYTHON_INSTALL_PATH}/cv2/" COMPONENT python) - - message(STATUS "OpenCV Python: during development append to PYTHONPATH: ${__loader_path}") endif() diff --git a/modules/python/standalone.cmake b/modules/python/standalone.cmake index 584815175f..a27bba0b28 100644 --- a/modules/python/standalone.cmake +++ b/modules/python/standalone.cmake @@ -3,7 +3,13 @@ if(NOT DEFINED OpenCV_BINARY_DIR) endif() include("${OpenCV_BINARY_DIR}/opencv_python_config.cmake") 
if(NOT DEFINED OpenCV_SOURCE_DIR) - message(FATAL_ERROR "Missing define of OpenCV_SOURCE_DIR") + message(FATAL_ERROR "Missing OpenCV_SOURCE_DIR") +endif() +if(NOT OPENCV_PYTHON_INSTALL_PATH) + if(NOT DEFINED OPENCV_PYTHON_STANDALONE_INSTALL_PATH) + message(FATAL_ERROR "Missing OPENCV_PYTHON_STANDALONE_INSTALL_PATH / OPENCV_PYTHON_INSTALL_PATH") + endif() + set(OPENCV_PYTHON_INSTALL_PATH "${OPENCV_PYTHON_STANDALONE_INSTALL_PATH}") endif() include("${OpenCV_SOURCE_DIR}/cmake/OpenCVUtils.cmake") From 3eb2c940ded61858b037ffd25d0eaa96a448fbc4 Mon Sep 17 00:00:00 2001 From: vishwesh5 Date: Mon, 17 Dec 2018 20:39:22 +0530 Subject: [PATCH 10/13] Fix Scharr and Sobel functions Resolves #13375 --- modules/imgproc/src/deriv.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/imgproc/src/deriv.cpp b/modules/imgproc/src/deriv.cpp index 5001591989..f4095a1086 100644 --- a/modules/imgproc/src/deriv.cpp +++ b/modules/imgproc/src/deriv.cpp @@ -441,7 +441,7 @@ void cv::Sobel( InputArray _src, OutputArray _dst, int ddepth, int dx, int dy, ocl_sepFilter3x3_8UC1(_src, _dst, ddepth, kx, ky, delta, borderType)); CV_OCL_RUN(ocl::isOpenCLActivated() && _dst.isUMat() && _src.dims() <= 2 && (size_t)_src.rows() > kx.total() && (size_t)_src.cols() > kx.total(), - ocl_sepFilter2D(_src, _dst, ddepth, kx, ky, Point(-1, -1), 0, borderType)) + ocl_sepFilter2D(_src, _dst, ddepth, kx, ky, Point(-1, -1), delta, borderType)) Mat src = _src.getMat(); Mat dst = _dst.getMat(); @@ -494,7 +494,7 @@ void cv::Scharr( InputArray _src, OutputArray _dst, int ddepth, int dx, int dy, CV_OCL_RUN(ocl::isOpenCLActivated() && _dst.isUMat() && _src.dims() <= 2 && (size_t)_src.rows() > kx.total() && (size_t)_src.cols() > kx.total(), - ocl_sepFilter2D(_src, _dst, ddepth, kx, ky, Point(-1, -1), 0, borderType)) + ocl_sepFilter2D(_src, _dst, ddepth, kx, ky, Point(-1, -1), delta, borderType)) Mat src = _src.getMat(); Mat dst = _dst.getMat(); From d9d9b05912e992aa49d773ecace5a8bde958aa86 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Mon, 17 Dec 2018 18:31:49 +0000 Subject: [PATCH 11/13] core(ocl): add parameter to limit device max workgroup size used by OpenCV --- modules/core/src/ocl.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp index 0bf4e07856..ea4954ec33 100644 --- a/modules/core/src/ocl.cpp +++ b/modules/core/src/ocl.cpp @@ -1257,6 +1257,14 @@ struct Device::Impl else vendorID_ = UNKNOWN_VENDOR; + const size_t CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE", 0); + if (CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE > 0) + { + const size_t new_maxWorkGroupSize = std::min(maxWorkGroupSize_, CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE); + if (new_maxWorkGroupSize != maxWorkGroupSize_) + CV_LOG_WARNING(NULL, "OpenCL: using workgroup size: " << new_maxWorkGroupSize << " (was " << maxWorkGroupSize_ << ")"); + maxWorkGroupSize_ = new_maxWorkGroupSize; + } #if 0 if (isExtensionSupported("cl_khr_spir")) { From 715f8fcce091a9d41478f5709486001cca7d9920 Mon Sep 17 00:00:00 2001 From: vishwesh5 Date: Tue, 18 Dec 2018 16:10:04 +0530 Subject: [PATCH 12/13] Merge pull request #13432 from vishwesh5:patch-1 * Create text_detection.py #12270 #13429 **Deep Learning text detection sample (Python)** - Tested on **Ubuntu 18.04** - OpenCV 3.4.3, OpenCV 3.4.4, OpenCV 4.0 (master branch) - Python version supported - Python 2 and Python 3 * Fix trailing whitespaces * Update text_detection.py * Remove whitespace * 
Remove comments * Remove unused packages * Update description --- samples/dnn/text_detection.py | 146 ++++++++++++++++++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100644 samples/dnn/text_detection.py diff --git a/samples/dnn/text_detection.py b/samples/dnn/text_detection.py new file mode 100644 index 0000000000..9f7f159a54 --- /dev/null +++ b/samples/dnn/text_detection.py @@ -0,0 +1,146 @@ +# Import required modules +import cv2 as cv +import math +import argparse + +############ Add argument parser for command line arguments ############ +parser = argparse.ArgumentParser(description='Use this script to run TensorFlow implementation (https://github.com/argman/EAST) of EAST: An Efficient and Accurate Scene Text Detector (https://arxiv.org/abs/1704.03155v2)') +parser.add_argument('--input', help='Path to input image or video file. Skip this argument to capture frames from a camera.') +parser.add_argument('--model', required=True, + help='Path to a binary .pb file of model contains trained weights.') +parser.add_argument('--width', type=int, default=320, + help='Preprocess input image by resizing to a specific width. It should be multiple by 32.') +parser.add_argument('--height',type=int, default=320, + help='Preprocess input image by resizing to a specific height. It should be multiple by 32.') +parser.add_argument('--thr',type=float, default=0.5, + help='Confidence threshold.') +parser.add_argument('--nms',type=float, default=0.4, + help='Non-maximum suppression threshold.') +args = parser.parse_args() + +############ Utility functions ############ +def decode(scores, geometry, scoreThresh): + detections = [] + confidences = [] + + ############ CHECK DIMENSIONS AND SHAPES OF geometry AND scores ############ + assert len(scores.shape) == 4, "Incorrect dimensions of scores" + assert len(geometry.shape) == 4, "Incorrect dimensions of geometry" + assert scores.shape[0] == 1, "Invalid dimensions of scores" + assert geometry.shape[0] == 1, "Invalid dimensions of geometry" + assert scores.shape[1] == 1, "Invalid dimensions of scores" + assert geometry.shape[1] == 5, "Invalid dimensions of geometry" + assert scores.shape[2] == geometry.shape[2], "Invalid dimensions of scores and geometry" + assert scores.shape[3] == geometry.shape[3], "Invalid dimensions of scores and geometry" + height = scores.shape[2] + width = scores.shape[3] + for y in range(0, height): + + # Extract data from scores + scoresData = scores[0][0][y] + x0_data = geometry[0][0][y] + x1_data = geometry[0][1][y] + x2_data = geometry[0][2][y] + x3_data = geometry[0][3][y] + anglesData = geometry[0][4][y] + for x in range(0, width): + score = scoresData[x] + + # If score is lower than threshold score, move to next x + if(score < scoreThresh): + continue + + # Calculate offset + offsetX = x * 4.0 + offsetY = y * 4.0 + angle = anglesData[x] + + # Calculate cos and sin of angle + cosA = math.cos(angle) + sinA = math.sin(angle) + h = x0_data[x] + x2_data[x] + w = x1_data[x] + x3_data[x] + + # Calculate offset + offset = ([offsetX + cosA * x1_data[x] + sinA * x2_data[x], offsetY - sinA * x1_data[x] + cosA * x2_data[x]]) + + # Find points for rectangle + p1 = (-sinA * h + offset[0], -cosA * h + offset[1]) + p3 = (-cosA * w + offset[0], sinA * w + offset[1]) + center = (0.5*(p1[0]+p3[0]), 0.5*(p1[1]+p3[1])) + detections.append((center, (w,h), -1*angle * 180.0 / math.pi)) + confidences.append(float(score)) + + # Return detections and confidences + return [detections, confidences] + +def main(): + # Read and store arguments 
+ confThreshold = args.thr + nmsThreshold = args.nms + inpWidth = args.width + inpHeight = args.height + model = args.model + + # Load network + net = cv.dnn.readNet(model) + + # Create a new named window + kWinName = "EAST: An Efficient and Accurate Scene Text Detector" + cv.namedWindow(kWinName, cv.WINDOW_NORMAL) + outNames = [] + outNames.append("feature_fusion/Conv_7/Sigmoid") + outNames.append("feature_fusion/concat_3") + + # Open a video file or an image file or a camera stream + cap = cv.VideoCapture(args.input if args.input else 0) + + while cv.waitKey(1) < 0: + # Read frame + hasFrame, frame = cap.read() + if not hasFrame: + cv.waitKey() + break + + # Get frame height and width + height_ = frame.shape[0] + width_ = frame.shape[1] + rW = width_ / float(inpWidth) + rH = height_ / float(inpHeight) + + # Create a 4D blob from frame. + blob = cv.dnn.blobFromImage(frame, 1.0, (inpWidth, inpHeight), (123.68, 116.78, 103.94), True, False) + + # Run the model + net.setInput(blob) + outs = net.forward(outNames) + t, _ = net.getPerfProfile() + label = 'Inference time: %.2f ms' % (t * 1000.0 / cv.getTickFrequency()) + + # Get scores and geometry + scores = outs[0] + geometry = outs[1] + [boxes, confidences] = decode(scores, geometry, confThreshold) + + # Apply NMS + indices = cv.dnn.NMSBoxesRotated(boxes, confidences, confThreshold,nmsThreshold) + for i in indices: + # get 4 corners of the rotated rect + vertices = cv.boxPoints(boxes[i[0]]) + # scale the bounding box coordinates based on the respective ratios + for j in range(4): + vertices[j][0] *= rW + vertices[j][1] *= rH + for j in range(4): + p1 = (vertices[j][0], vertices[j][1]) + p2 = (vertices[(j + 1) % 4][0], vertices[(j + 1) % 4][1]) + cv.line(frame, p1, p2, (0, 255, 0), 1); + + # Put efficiency information + cv.putText(frame, label, (0, 15), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0)) + + # Display the frame + cv.imshow(kWinName,frame) + +if __name__ == "__main__": + main() From 50ef9830e29fe06a4a3eb34b845d266047cab1b2 Mon Sep 17 00:00:00 2001 From: "Peter J. Stieber" Date: Mon, 17 Dec 2018 14:47:26 -0800 Subject: [PATCH 13/13] Added Turing to the _generations list. --- cmake/OpenCVDetectCUDA.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/OpenCVDetectCUDA.cmake b/cmake/OpenCVDetectCUDA.cmake index 2d94a2c86b..18ab2f8289 100644 --- a/cmake/OpenCVDetectCUDA.cmake +++ b/cmake/OpenCVDetectCUDA.cmake @@ -52,7 +52,7 @@ if(CUDA_FOUND) message(STATUS "CUDA detected: " ${CUDA_VERSION}) - set(_generations "Fermi" "Kepler" "Maxwell" "Pascal" "Volta") + set(_generations "Fermi" "Kepler" "Maxwell" "Pascal" "Volta" "Turing") if(NOT CMAKE_CROSSCOMPILING) list(APPEND _generations "Auto") endif()
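
Note on [PATCH 04/13] and its follow-up [PATCH 07/13] above: with HAVE_EIGEN the gain system A*g = b assembled by GainCompensator is handed to Eigen's Cholesky (LLT) solver in single precision, and the result is converted back into the CV_64F gains matrix. The sketch below reproduces only that solve-and-convert step outside the stitching module; the 2x2 matrix values are made up for illustration and error handling is reduced to a printout.

// Standalone sketch of the Eigen-based solve used by the patched GainCompensator.
// Assumes OpenCV is built with Eigen support; matrix contents are illustrative only.
#include <Eigen/Core>
#include <Eigen/Dense>
#include <opencv2/core.hpp>
#include <opencv2/core/eigen.hpp>
#include <iostream>

int main()
{
    // Toy symmetric positive-definite 2x2 system (stand-in for the real A and b).
    cv::Mat_<double> A = (cv::Mat_<double>(2, 2) << 4.0, 1.0,
                                                    1.0, 3.0);
    cv::Mat_<double> b = (cv::Mat_<double>(2, 1) << 1.0, 2.0);

    Eigen::MatrixXf eigen_A, eigen_b, eigen_x;
    cv::cv2eigen(A, eigen_A);   // cv2eigen converts the CV_64F data to the float Eigen type
    cv::cv2eigen(b, eigen_b);

    Eigen::LLT<Eigen::MatrixXf> solver(eigen_A);   // Cholesky factorization
    if (solver.info() != Eigen::Success)
        std::cerr << "Failed to factorize the gain system" << std::endl;
    eigen_x = solver.solve(eigen_b);

    // Go through a float Mat_ and convert to double, mirroring [PATCH 07/13],
    // because eigen2cv() keeps the fixed type of its destination.
    cv::Mat_<float> x_float;
    cv::eigen2cv(eigen_x, x_float);
    cv::Mat_<double> gains;
    x_float.convertTo(gains, CV_64FC1);

    std::cout << "gains = " << gains << std::endl;
    return 0;
}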
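
Note on [PATCH 06/13] above: replacing the file-scope "static CvDC1394 dc1394;" with a getDC1394() accessor is the construct-on-first-use (function-local static) idiom, so libdc1394 is initialized only when a capture is actually opened instead of at library load time. A generic sketch of the idiom follows; the class and function names are illustrative, not taken from the OpenCV sources.

#include <cstdio>

// Stand-in for an expensive-to-initialize library context such as CvDC1394.
struct LibraryContext
{
    LibraryContext()  { std::puts("expensive init runs on first use only"); }
    ~LibraryContext() { std::puts("released at process exit"); }
    void use()        { std::puts("using context"); }
};

// Construct-on-first-use: the local static is created on the first call
// (thread-safe in C++11 and later), not when the shared library is loaded.
static LibraryContext& getContext()
{
    static LibraryContext ctx;
    return ctx;
}

int main()
{
    std::puts("before first use");  // no initialization has happened yet
    getContext().use();             // initialization happens here
    getContext().use();             // the same instance is reused
    return 0;
}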
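
Note on [PATCH 11/13] above: the OPENCV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE configuration parameter caps the work-group size that OpenCV assumes for an OpenCL device. A minimal check of what a process ends up using might look like the sketch below; the value 256 in the comment and the program name are illustrative, and the reported number should reflect the cap only when the environment variable is set before OpenCV initializes OpenCL.

#include <opencv2/core.hpp>
#include <opencv2/core/ocl.hpp>
#include <iostream>

int main()
{
    // Run as, for example:
    //   OPENCV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE=256 ./check_wgs
    if (!cv::ocl::haveOpenCL())
    {
        std::cout << "OpenCL is not available" << std::endl;
        return 0;
    }
    cv::ocl::Device dev = cv::ocl::Device::getDefault();
    std::cout << dev.name() << ": work-group size used by OpenCV = "
              << dev.maxWorkGroupSize() << std::endl;
    return 0;
}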